Print this page
Remove the assumption that the zone's root vnode is marked VROOT
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/fs/nfs/nfs_srv.c
+++ new/usr/src/uts/common/fs/nfs/nfs_srv.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
25 25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 26 */
27 27
28 28 /*
29 29 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
30 30 * All rights reserved.
31 31 */
32 32
33 33 /*
34 34 * Copyright 2018 Nexenta Systems, Inc.
35 35 * Copyright (c) 2016 by Delphix. All rights reserved.
36 36 */
37 37
38 38 #include <sys/param.h>
39 39 #include <sys/types.h>
40 40 #include <sys/systm.h>
41 41 #include <sys/cred.h>
42 42 #include <sys/buf.h>
43 43 #include <sys/vfs.h>
44 44 #include <sys/vnode.h>
45 45 #include <sys/uio.h>
46 46 #include <sys/stat.h>
47 47 #include <sys/errno.h>
48 48 #include <sys/sysmacros.h>
49 49 #include <sys/statvfs.h>
50 50 #include <sys/kmem.h>
51 51 #include <sys/kstat.h>
52 52 #include <sys/dirent.h>
53 53 #include <sys/cmn_err.h>
54 54 #include <sys/debug.h>
55 55 #include <sys/vtrace.h>
56 56 #include <sys/mode.h>
57 57 #include <sys/acl.h>
58 58 #include <sys/nbmlock.h>
59 59 #include <sys/policy.h>
60 60 #include <sys/sdt.h>
61 61
62 62 #include <rpc/types.h>
63 63 #include <rpc/auth.h>
64 64 #include <rpc/svc.h>
65 65
66 66 #include <nfs/nfs.h>
67 67 #include <nfs/export.h>
68 68 #include <nfs/nfs_cmd.h>
69 69
70 70 #include <vm/hat.h>
71 71 #include <vm/as.h>
72 72 #include <vm/seg.h>
73 73 #include <vm/seg_map.h>
74 74 #include <vm/seg_kmem.h>
75 75
76 76 #include <sys/strsubr.h>
77 77
78 78 struct rfs_async_write_list;
79 79
80 80 /*
81 81 * Zone globals of NFSv2 server
82 82 */
typedef struct nfs_srv {
	/* presumably serializes access to async_write_head — confirm at use sites */
	kmutex_t async_write_lock;
	/* head of the list of pending clustered (async) write requests */
	struct rfs_async_write_list *async_write_head;

	/*
	 * enables write clustering if == 1
	 */
	int write_async;
} nfs_srv_t;
92 92
93 93 /*
94 94 * These are the interface routines for the server side of the
95 95 * Network File System. See the NFS version 2 protocol specification
96 96 * for a description of this interface.
97 97 */
98 98
99 99 static int sattr_to_vattr(struct nfssattr *, struct vattr *);
100 100 static void acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
101 101 cred_t *);
102 102 static void *rfs_zone_init(zoneid_t zoneid);
103 103 static void rfs_zone_fini(zoneid_t zoneid, void *data);
104 104
105 105
106 106 /*
107 107 * Some "over the wire" UNIX file types. These are encoded
108 108 * into the mode. This needs to be fixed in the next rev.
109 109 */
110 110 #define IFMT 0170000 /* type of file */
111 111 #define IFCHR 0020000 /* character special */
112 112 #define IFBLK 0060000 /* block special */
113 113 #define IFSOCK 0140000 /* socket */
114 114
115 115 u_longlong_t nfs2_srv_caller_id;
116 116 static zone_key_t rfs_zone_key;
117 117
/*
 * Get file attributes.
 * Returns the current attributes of the file with the given fhandle.
 *
 * The result (status and, on success, attributes) is written to @ns.
 * The incoming file handle is translated to a held vnode which is
 * released before returning.
 */
/* ARGSUSED */
void
rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
    struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;
	struct vattr va;

	/* Translate the file handle into a held vnode. */
	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	/*
	 * Do the getattr.
	 */
	va.va_mask = AT_ALL;	/* we want all the attributes */

	error = rfs4_delegated_getattr(vp, &va, 0, cr);

	/* check for overflows */
	if (!error) {
		/* Lie about the object type for a referral */
		if (vn_is_nfs_reparse(vp, cr))
			va.va_type = VLNK;

		acl_perm(vp, exi, &va, cr);
		error = vattr_to_nattr(&va, &ns->ns_attr);
	}

	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}
158 158 void *
159 159 rfs_getattr_getfh(fhandle_t *fhp)
160 160 {
161 161 return (fhp);
162 162 }
163 163
/*
 * Set file attributes.
 * Sets the attributes of the file with the given fhandle.  Returns
 * the new attributes.
 *
 * Status and (on success) the post-operation attributes are written
 * to @ns.  A delegation conflict detected by a monitor causes the
 * reply to be dropped (T_WOULDBLOCK) so the client retries.
 */
/* ARGSUSED */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int flag;
	int in_crit = 0;	/* nonzero while inside the nbmand critical region */
	vnode_t *vp;
	struct vattr va;	/* attributes requested by the client */
	struct vattr bva;	/* attributes of the file before the change */
	struct flock64 bf;
	caller_context_t ct;


	vp = nfs_fhtovp(&args->saa_fh, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		return;
	}

	error = sattr_to_vattr(&args->saa_sa, &va);
	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		return;
	}

	/*
	 * If the client is requesting a change to the mtime,
	 * but the nanosecond field is set to 1 billion, then
	 * this is a flag to the server that it should set the
	 * atime and mtime fields to the server's current time.
	 * The 1 billion number actually came from the client
	 * as 1 million, but the units in the over the wire
	 * request are microseconds instead of nanoseconds.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if (va.va_mask & AT_MTIME) {
		if (va.va_mtime.tv_nsec == 1000000000) {
			gethrestime(&va.va_mtime);
			va.va_atime = va.va_mtime;
			va.va_mask |= AT_ATIME;
			flag = 0;
		} else
			flag = ATTR_UTIME;
	} else
		flag = 0;

	/*
	 * If the filesystem is exported with nosuid, then mask off
	 * the setuid and setgid bits.
	 */
	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
	    (exi->exi_export.ex_flags & EX_NOSUID))
		va.va_mode &= ~(VSUID | VSGID);

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * We need to specially handle size changes because it is
	 * possible for the client to create a file with modes
	 * which indicate read-only, but with the file opened for
	 * writing.  If the client then tries to set the size of
	 * the file, then the normal access checking done in
	 * VOP_SETATTR would prevent the client from doing so,
	 * although it should be legal for it to do so.  To get
	 * around this, we do the access checking for ourselves
	 * and then use VOP_SPACE which doesn't do the access
	 * checking which VOP_SETATTR does.  VOP_SPACE can only
	 * operate on VREG files, let VOP_SETATTR handle the other
	 * extremely rare cases.
	 * Also the client should not be allowed to change the
	 * size of the file if there is a conflicting non-blocking
	 * mandatory lock in the region of change.
	 */
	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}

		bva.va_mask = AT_UID | AT_SIZE;

		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);

		if (error) {
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			return;
		}

		if (in_crit) {
			u_offset_t offset;
			ssize_t length;

			/* The affected range is [min(old,new), |old-new|). */
			if (va.va_size < bva.va_size) {
				offset = va.va_size;
				length = bva.va_size - va.va_size;
			} else {
				offset = bva.va_size;
				length = va.va_size - bva.va_size;
			}
			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
			    NULL)) {
				error = EACCES;
			}
		}

		/*
		 * Owner bypasses mode checks: truncate via VOP_SPACE
		 * and strip AT_SIZE so VOP_SETATTR below skips it.
		 */
		if (crgetuid(cr) == bva.va_uid && !error &&
		    va.va_size != bva.va_size) {
			va.va_mask &= ~AT_SIZE;
			bf.l_type = F_WRLCK;
			bf.l_whence = 0;
			bf.l_start = (off64_t)va.va_size;
			bf.l_len = 0;
			bf.l_sysid = 0;
			bf.l_pid = 0;

			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
			    (offset_t)va.va_size, cr, &ct);
		}
		if (in_crit)
			nbl_end_crit(vp);
	} else
		error = 0;

	/*
	 * Do the setattr.
	 */
	if (!error && va.va_mask) {
		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
	}

	/*
	 * check if the monitor on either vop_space or vop_setattr detected
	 * a delegation conflict and if so, mark the thread flag as
	 * wouldblock so that the response is dropped and the client will
	 * try again.
	 */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* get everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

	ct.cc_flags = 0;

	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);

	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}
351 351 void *
352 352 rfs_setattr_getfh(struct nfssaargs *args)
353 353 {
354 354 return (&args->saa_fh);
355 355 }
356 356
/*
 * Change and release @exip and @vpp only in success.
 *
 * Cross from a mountpoint vnode into the filesystem mounted on it.
 * If the covering filesystem is exported with "nohide", *vpp/*exip
 * are swapped for the submount's root vnode and export (old ones
 * released); otherwise both are left untouched and 0 is still
 * returned — not crossing is not an error.
 */
int
rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
{
	struct exportinfo *exi;
	vnode_t *vp = *vpp;
	fid_t fid;
	int error;

	/* Extra hold so traverse() can consume/replace the reference. */
	VN_HOLD(vp);

	if ((error = traverse(&vp)) != 0) {
		VN_RELE(vp);
		return (error);
	}

	bzero(&fid, sizeof (fid));
	fid.fid_len = MAXFIDSZ;
	error = VOP_FID(vp, &fid, NULL);
	if (error) {
		VN_RELE(vp);
		return (error);
	}

	exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);
	if (exi == NULL ||
	    (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
		/*
		 * It is not error, just subdir is not exported
		 * or "nohide" is not set
		 */
		if (exi != NULL)
			exi_rele(exi);
		VN_RELE(vp);
	} else {
		/* go to submount */
		exi_rele(*exip);
		*exip = exi;

		VN_RELE(*vpp);
		*vpp = vp;
	}

	return (0);
}
402 402
/*
 * Given mounted "dvp" and "exi", go upper mountpoint
 * with dvp/exi correction
 * Return 0 in success
 *
 * On success *dvpp/*exip are replaced by the covered (underlying)
 * vnode and its export, with the old references released.  On
 * failure (-1) both are left untouched.
 */
int
rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
{
	struct exportinfo *exi;
	vnode_t *dvp = *dvpp;

	/* Caller must hand us a filesystem root (or the zone's root). */
	ASSERT((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp));

	VN_HOLD(dvp);
	/* presumably returns a held ref on the covered vnode — confirm */
	dvp = untraverse(dvp);
	exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
	if (exi == NULL) {
		VN_RELE(dvp);
		return (-1);
	}

	exi_rele(*exip);
	*exip = exi;
	VN_RELE(*dvpp);
	*dvpp = dvp;

	return (0);
}
/*
 * Directory lookup.
 * Returns an fhandle and file attributes for file name in a directory.
 *
 * Handles three lookup flavors: normal single-component VOP_LOOKUP,
 * multi-component public-filehandle lookup, and the special ".."
 * escape above a "nohide" exported root.  Status is written to @dr.
 */
/* ARGSUSED */
void
rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *dvp;
	vnode_t *vp;
	struct vattr va;
	fhandle_t *fhp = da->da_fhandle;
	struct sec_ol sec = {0, 0};
	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
	char *name;
	struct sockaddr *ca;

	/*
	 * Trusted Extension doesn't support NFSv2. MOUNT
	 * will reject v2 clients. Need to prevent v2 client
	 * access via WebNFS here.
	 */
	if (is_system_labeled() && req->rq_vers == 2) {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	/*
	 * Allow lookups from the root - the default
	 * location of the public filehandle.
	 */
	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
		dvp = ZONE_ROOTVP();
		VN_HOLD(dvp);
	} else {
		dvp = nfs_fhtovp(fhp, exi);
		if (dvp == NULL) {
			dr->dr_status = NFSERR_STALE;
			return;
		}
	}

	/* Extra hold on exi; matched by exi_rele() at out:. */
	exi_hold(exi);

	/*
	 * Not allow lookup beyond root.
	 * If the filehandle matches a filehandle of the exi,
	 * then the ".." refers beyond the root of an exported filesystem.
	 */
	if (strcmp(da->da_name, "..") == 0 &&
	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
		if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
		    ((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp))) {
			/*
			 * special case for ".." and 'nohide'exported root
			 */
			if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
				error = NFSERR_ACCES;
				goto out;
			}
		} else {
			error = NFSERR_NOENT;
			goto out;
		}
	}

	/* Convert the client's name into the server's character set. */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
	    MAXPATHLEN);

	if (name == NULL) {
		error = NFSERR_ACCES;
		goto out;
	}

	/*
	 * If the public filehandle is used then allow
	 * a multi-component lookup, i.e. evaluate
	 * a pathname and follow symbolic links if
	 * necessary.
	 *
	 * This may result in a vnode in another filesystem
	 * which is OK as long as the filesystem is exported.
	 */
	if (PUBLIC_FH2(fhp)) {
		publicfh_flag = TRUE;

		exi_rele(exi);

		/*
		 * NOTE(review): if rfs_publicfh_mclookup() fails, it looks
		 * like exi keeps its just-released value and is released
		 * again at out: — verify the callee resets exi on failure.
		 */
		error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
		    &sec);
	} else {
		/*
		 * Do a normal single component lookup.
		 */
		error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
		    NULL, NULL, NULL);
	}

	/* nfscmd_convname() allocates only when conversion was needed. */
	if (name != da->da_name)
		kmem_free(name, MAXPATHLEN);

	if (error == 0 && vn_ismntpt(vp)) {
		error = rfs_cross_mnt(&vp, &exi);
		if (error)
			VN_RELE(vp);
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* we want everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				if (sec.sec_flags & SEC_QUERY)
					error = makefh_ol(&dr->dr_fhandle, exi,
					    sec.sec_index);
				else {
					error = makefh(&dr->dr_fhandle, vp,
					    exi);
					if (!error && publicfh_flag &&
					    !chk_clnt_sec(exi, req))
						auth_weak = TRUE;
				}
			}
		}
		VN_RELE(vp);
	}

out:
	VN_RELE(dvp);

	if (exi != NULL)
		exi_rele(exi);

	/*
	 * If it's public fh, no 0x81, and client's flavor is
	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
	 */
	if (auth_weak)
		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
	else
		dr->dr_status = puterrno(error);
}
590 590 void *
591 591 rfs_lookup_getfh(struct nfsdiropargs *da)
592 592 {
593 593 return (da->da_fhandle);
594 594 }
595 595
/*
 * Read symbolic link.
 * Returns the string in the symbolic link at the given fhandle.
 *
 * On success rl->rl_data holds an NFS_MAXPATHLEN buffer (freed later
 * by rfs_rlfree) with rl->rl_count valid bytes; on any error rl_data
 * is NULL.  NFS referrals are presented as artificial symlinks.
 */
/* ARGSUSED */
void
rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
    struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	struct vattr va;
	struct sockaddr *ca;
	char *name = NULL;
	int is_referral = 0;

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_STALE;
		return;
	}

	va.va_mask = AT_MODE;

	error = VOP_GETATTR(vp, &va, 0, cr, NULL);

	if (error) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = puterrno(error);
		return;
	}

	/* Refuse mandatory-locked objects to avoid blocking forever. */
	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_ACCES;
		return;
	}

	/* We lied about the object type for a referral */
	if (vn_is_nfs_reparse(vp, cr))
		is_referral = 1;

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. BUGID 1138002.
	 */
	if (vp->v_type != VLNK && !is_referral) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_NXIO;
		return;
	}

	/*
	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
	 */
	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	if (is_referral) {
		char *s;
		size_t strsz;

		/* Get an artificial symlink based on a referral */
		s = build_symlink(vp, cr, &strsz);
		global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
		DTRACE_PROBE2(nfs2serv__func__referral__reflink,
		    vnode_t *, vp, char *, s);
		if (s == NULL)
			error = EINVAL;
		else {
			error = 0;
			(void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
			rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
			kmem_free(s, strsz);
		}

	} else {

		/*
		 * Set up io vector to read sym link data
		 */
		iov.iov_base = rl->rl_data;
		iov.iov_len = NFS_MAXPATHLEN;
		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_CACHED;
		uio.uio_loffset = (offset_t)0;
		uio.uio_resid = NFS_MAXPATHLEN;

		/*
		 * Do the readlink.
		 */
		error = VOP_READLINK(vp, &uio, cr, NULL);

		rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);

		/*
		 * NOTE(review): if the link consumed the full buffer
		 * (uio_resid == 0) this NUL lands at index NFS_MAXPATHLEN,
		 * one past the allocation — confirm links are bounded below
		 * NFS_MAXPATHLEN.
		 */
		if (!error)
			rl->rl_data[rl->rl_count] = '\0';

	}


	VN_RELE(vp);

	/* Convert the link text into the client's character set. */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	name = nfscmd_convname(ca, exi, rl->rl_data,
	    NFSCMD_CONV_OUTBOUND, MAXPATHLEN);

	if (name != NULL && name != rl->rl_data) {
		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
		rl->rl_data = name;
	}

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. UFS returns EINVAL if this is the case,
	 * so we do the mapping here. BUGID 1138002.
	 */
	if (error == EINVAL)
		rl->rl_status = NFSERR_NXIO;
	else
		rl->rl_status = puterrno(error);

}
726 726 void *
727 727 rfs_readlink_getfh(fhandle_t *fhp)
728 728 {
729 729 return (fhp);
730 730 }
731 731 /*
732 732 * Free data allocated by rfs_readlink
733 733 */
734 734 void
735 735 rfs_rlfree(struct nfsrdlnres *rl)
736 736 {
737 737 if (rl->rl_data != NULL)
738 738 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
739 739 }
740 740
741 741 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
742 742
/*
 * Read data.
 * Returns some data read from the file at the given fhandle.
 *
 * Two reply paths exist: an RDMA write chunk supplied by the client
 * (ra->ra_wlist) or a locally allocated mblk (rr->rr_mp) that the
 * transport frees after the reply is sent (see rfs_rdfree).  A
 * delegation conflict causes the reply to be dropped (T_WOULDBLOCK).
 */
/* ARGSUSED */
void
rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	vnode_t *vp;
	int error;
	struct vattr va;
	struct iovec iov;
	struct uio uio;
	mblk_t *mp;
	int alloc_err = 0;
	int in_crit = 0;	/* nonzero while inside the nbmand critical region */
	caller_context_t ct;

	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
	if (vp == NULL) {
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_STALE;
		return;
	}

	if (vp->v_type != VREG) {
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ISDIR;
		return;
	}

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * Enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with write requests.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
		    0, NULL)) {
			nbl_end_crit(vp);
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = NFSERR_ACCES;
			return;
		}
		in_crit = 1;
	}

	error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		if (in_crit)
			nbl_end_crit(vp);
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;

		rr->rr_data = NULL;
		return;
	}

	va.va_mask = AT_ALL;

	error = VOP_GETATTR(vp, &va, 0, cr, &ct);

	if (error) {
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);

		return;
	}

	/*
	 * This is a kludge to allow reading of files created
	 * with no read permission.  The owner of the file
	 * is always allowed to read it.
	 */
	if (crgetuid(cr) != va.va_uid) {
		error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);

		if (error) {
			/*
			 * Exec is the same as read over the net because
			 * of demand loading.
			 */
			error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
		}
		if (error) {
			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = puterrno(error);

			return;
		}
	}

	if (MANDLOCK(vp, va.va_mode)) {
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ACCES;

		return;
	}

	rr->rr_ok.rrok_wlist_len = 0;
	rr->rr_ok.rrok_wlist = NULL;

	/* Read starting at or past EOF: success with zero bytes. */
	if ((u_offset_t)ra->ra_offset >= va.va_size) {
		rr->rr_count = 0;
		rr->rr_data = NULL;
		/*
		 * In this case, status is NFS_OK, but there is no data
		 * to encode. So set rr_mp to NULL.
		 */
		rr->rr_mp = NULL;
		rr->rr_ok.rrok_wlist = ra->ra_wlist;
		if (rr->rr_ok.rrok_wlist)
			clist_zero_len(rr->rr_ok.rrok_wlist);
		goto done;
	}

	if (ra->ra_wlist) {
		/* RDMA path: read directly into the client's write chunk. */
		mp = NULL;
		rr->rr_mp = NULL;
		(void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
		if (ra->ra_count > iov.iov_len) {
			rr->rr_data = NULL;
			rr->rr_status = NFSERR_INVAL;
			goto done;
		}
	} else {
		/*
		 * mp will contain the data to be sent out in the read reply.
		 * This will be freed after the reply has been sent out (by the
		 * driver).
		 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
		 * that the call to xdrmblk_putmblk() never fails.
		 */
		mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
		    &alloc_err);
		ASSERT(mp != NULL);
		ASSERT(alloc_err == 0);

		rr->rr_mp = mp;

		/*
		 * Set up io vector
		 */
		iov.iov_base = (caddr_t)mp->b_datap->db_base;
		iov.iov_len = ra->ra_count;
	}

	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)ra->ra_offset;
	uio.uio_resid = ra->ra_count;

	error = VOP_READ(vp, &uio, 0, cr, &ct);

	if (error) {
		if (mp)
			freeb(mp);

		/*
		 * check if a monitor detected a delegation conflict and
		 * mark as wouldblock so response is dropped
		 */
		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
			curthread->t_flag |= T_WOULDBLOCK;
		else
			rr->rr_status = puterrno(error);

		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;

		return;
	}

	/*
	 * Get attributes again so we can send the latest access
	 * time to the client side for its cache.
	 */
	va.va_mask = AT_ALL;

	error = VOP_GETATTR(vp, &va, 0, cr, &ct);

	if (error) {
		if (mp)
			freeb(mp);

		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);

		return;
	}

	/* Bytes actually read = requested minus what the VOP left over. */
	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);

	if (mp) {
		rr->rr_data = (char *)mp->b_datap->db_base;
	} else {
		if (ra->ra_wlist) {
			rr->rr_data = (caddr_t)iov.iov_base;
			if (!rdma_setup_read_data2(ra, rr)) {
				rr->rr_data = NULL;
				rr->rr_status = puterrno(NFSERR_INVAL);
			}
		}
	}
done:
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
	if (in_crit)
		nbl_end_crit(vp);

	acl_perm(vp, exi, &va, cr);

	/* check for overflows */
	error = vattr_to_nattr(&va, &rr->rr_attr);

	VN_RELE(vp);

	rr->rr_status = puterrno(error);
}
997 997
998 998 /*
999 999 * Free data allocated by rfs_read
1000 1000 */
1001 1001 void
1002 1002 rfs_rdfree(struct nfsrdresult *rr)
1003 1003 {
1004 1004 mblk_t *mp;
1005 1005
1006 1006 if (rr->rr_status == NFS_OK) {
1007 1007 mp = rr->rr_mp;
1008 1008 if (mp != NULL)
1009 1009 freeb(mp);
1010 1010 }
1011 1011 }
1012 1012
1013 1013 void *
1014 1014 rfs_read_getfh(struct nfsreadargs *ra)
1015 1015 {
1016 1016 return (&ra->ra_fhandle);
1017 1017 }
1018 1018
1019 1019 #define MAX_IOVECS 12
1020 1020
1021 1021 #ifdef DEBUG
1022 1022 static int rfs_write_sync_hits = 0;
1023 1023 static int rfs_write_sync_misses = 0;
1024 1024 #endif
1025 1025
1026 1026 /*
1027 1027 * Write data to file.
1028 1028 * Returns attributes of a file after writing some data to it.
1029 1029 *
1030 1030 * Any changes made here, especially in error handling might have
1031 1031 * to also be done in rfs_write (which clusters write requests).
1032 1032 */
1033 1033 /* ARGSUSED */
1034 1034 void
1035 1035 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
1036 1036 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1037 1037 {
1038 1038 int error;
1039 1039 vnode_t *vp;
1040 1040 rlim64_t rlimit;
1041 1041 struct vattr va;
1042 1042 struct uio uio;
1043 1043 struct iovec iov[MAX_IOVECS];
1044 1044 mblk_t *m;
1045 1045 struct iovec *iovp;
1046 1046 int iovcnt;
1047 1047 cred_t *savecred;
1048 1048 int in_crit = 0;
1049 1049 caller_context_t ct;
1050 1050
1051 1051 vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1052 1052 if (vp == NULL) {
1053 1053 ns->ns_status = NFSERR_STALE;
1054 1054 return;
1055 1055 }
1056 1056
1057 1057 if (rdonly(ro, vp)) {
1058 1058 VN_RELE(vp);
1059 1059 ns->ns_status = NFSERR_ROFS;
1060 1060 return;
1061 1061 }
1062 1062
1063 1063 if (vp->v_type != VREG) {
1064 1064 VN_RELE(vp);
1065 1065 ns->ns_status = NFSERR_ISDIR;
1066 1066 return;
1067 1067 }
1068 1068
1069 1069 ct.cc_sysid = 0;
1070 1070 ct.cc_pid = 0;
1071 1071 ct.cc_caller_id = nfs2_srv_caller_id;
1072 1072 ct.cc_flags = CC_DONTBLOCK;
1073 1073
1074 1074 va.va_mask = AT_UID|AT_MODE;
1075 1075
1076 1076 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1077 1077
1078 1078 if (error) {
1079 1079 VN_RELE(vp);
1080 1080 ns->ns_status = puterrno(error);
1081 1081
1082 1082 return;
1083 1083 }
1084 1084
1085 1085 if (crgetuid(cr) != va.va_uid) {
1086 1086 /*
1087 1087 * This is a kludge to allow writes of files created
1088 1088 * with read only permission. The owner of the file
1089 1089 * is always allowed to write it.
1090 1090 */
1091 1091 error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
1092 1092
1093 1093 if (error) {
1094 1094 VN_RELE(vp);
1095 1095 ns->ns_status = puterrno(error);
1096 1096 return;
1097 1097 }
1098 1098 }
1099 1099
1100 1100 /*
1101 1101 * Can't access a mandatory lock file. This might cause
1102 1102 * the NFS service thread to block forever waiting for a
1103 1103 * lock to be released that will never be released.
1104 1104 */
1105 1105 if (MANDLOCK(vp, va.va_mode)) {
1106 1106 VN_RELE(vp);
1107 1107 ns->ns_status = NFSERR_ACCES;
1108 1108 return;
1109 1109 }
1110 1110
1111 1111 /*
1112 1112 * We have to enter the critical region before calling VOP_RWLOCK
1113 1113 * to avoid a deadlock with ufs.
1114 1114 */
1115 1115 if (nbl_need_check(vp)) {
1116 1116 nbl_start_crit(vp, RW_READER);
1117 1117 in_crit = 1;
1118 1118 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1119 1119 wa->wa_count, 0, NULL)) {
1120 1120 error = EACCES;
1121 1121 goto out;
1122 1122 }
1123 1123 }
1124 1124
1125 1125 error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1126 1126
1127 1127 /* check if a monitor detected a delegation conflict */
1128 1128 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1129 1129 goto out;
1130 1130 }
1131 1131
1132 1132 if (wa->wa_data || wa->wa_rlist) {
1133 1133 /* Do the RDMA thing if necessary */
1134 1134 if (wa->wa_rlist) {
1135 1135 iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1136 1136 iov[0].iov_len = wa->wa_count;
1137 1137 } else {
1138 1138 iov[0].iov_base = wa->wa_data;
1139 1139 iov[0].iov_len = wa->wa_count;
1140 1140 }
1141 1141 uio.uio_iov = iov;
1142 1142 uio.uio_iovcnt = 1;
1143 1143 uio.uio_segflg = UIO_SYSSPACE;
1144 1144 uio.uio_extflg = UIO_COPY_DEFAULT;
1145 1145 uio.uio_loffset = (offset_t)wa->wa_offset;
1146 1146 uio.uio_resid = wa->wa_count;
1147 1147 /*
1148 1148 * The limit is checked on the client. We
1149 1149 * should allow any size writes here.
1150 1150 */
1151 1151 uio.uio_llimit = curproc->p_fsz_ctl;
1152 1152 rlimit = uio.uio_llimit - wa->wa_offset;
1153 1153 if (rlimit < (rlim64_t)uio.uio_resid)
1154 1154 uio.uio_resid = (uint_t)rlimit;
1155 1155
1156 1156 /*
1157 1157 * for now we assume no append mode
1158 1158 */
1159 1159 /*
1160 1160 * We're changing creds because VM may fault and we need
1161 1161 * the cred of the current thread to be used if quota
1162 1162 * checking is enabled.
1163 1163 */
1164 1164 savecred = curthread->t_cred;
1165 1165 curthread->t_cred = cr;
1166 1166 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1167 1167 curthread->t_cred = savecred;
1168 1168 } else {
1169 1169
1170 1170 iovcnt = 0;
1171 1171 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1172 1172 iovcnt++;
1173 1173 if (iovcnt <= MAX_IOVECS) {
1174 1174 #ifdef DEBUG
1175 1175 rfs_write_sync_hits++;
1176 1176 #endif
1177 1177 iovp = iov;
1178 1178 } else {
1179 1179 #ifdef DEBUG
1180 1180 rfs_write_sync_misses++;
1181 1181 #endif
1182 1182 iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1183 1183 }
1184 1184 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1185 1185 uio.uio_iov = iovp;
1186 1186 uio.uio_iovcnt = iovcnt;
1187 1187 uio.uio_segflg = UIO_SYSSPACE;
1188 1188 uio.uio_extflg = UIO_COPY_DEFAULT;
1189 1189 uio.uio_loffset = (offset_t)wa->wa_offset;
1190 1190 uio.uio_resid = wa->wa_count;
1191 1191 /*
1192 1192 * The limit is checked on the client. We
1193 1193 * should allow any size writes here.
1194 1194 */
1195 1195 uio.uio_llimit = curproc->p_fsz_ctl;
1196 1196 rlimit = uio.uio_llimit - wa->wa_offset;
1197 1197 if (rlimit < (rlim64_t)uio.uio_resid)
1198 1198 uio.uio_resid = (uint_t)rlimit;
1199 1199
1200 1200 /*
1201 1201 * For now we assume no append mode.
1202 1202 */
1203 1203 /*
1204 1204 * We're changing creds because VM may fault and we need
1205 1205 * the cred of the current thread to be used if quota
1206 1206 * checking is enabled.
1207 1207 */
1208 1208 savecred = curthread->t_cred;
1209 1209 curthread->t_cred = cr;
1210 1210 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1211 1211 curthread->t_cred = savecred;
1212 1212
1213 1213 if (iovp != iov)
1214 1214 kmem_free(iovp, sizeof (*iovp) * iovcnt);
1215 1215 }
1216 1216
1217 1217 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1218 1218
1219 1219 if (!error) {
1220 1220 /*
1221 1221 * Get attributes again so we send the latest mod
1222 1222 * time to the client side for its cache.
1223 1223 */
1224 1224 va.va_mask = AT_ALL; /* now we want everything */
1225 1225
1226 1226 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1227 1227
1228 1228 /* check for overflows */
1229 1229 if (!error) {
1230 1230 acl_perm(vp, exi, &va, cr);
1231 1231 error = vattr_to_nattr(&va, &ns->ns_attr);
1232 1232 }
1233 1233 }
1234 1234
1235 1235 out:
1236 1236 if (in_crit)
1237 1237 nbl_end_crit(vp);
1238 1238 VN_RELE(vp);
1239 1239
1240 1240 /* check if a monitor detected a delegation conflict */
1241 1241 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1242 1242 /* mark as wouldblock so response is dropped */
1243 1243 curthread->t_flag |= T_WOULDBLOCK;
1244 1244 else
1245 1245 ns->ns_status = puterrno(error);
1246 1246
1247 1247 }
1248 1248
/*
 * One queued NFSv2 WRITE request, linked into a per-file "cluster"
 * by rfs_write().  Instances live on the stack of the service thread
 * that submitted the request (see nrpsp in rfs_write()), so the thread
 * must not return until its request has been completed.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* decoded WRITE arguments */
	struct nfsattrstat *ns;		/* response to fill in for the client */
	struct svc_req *req;		/* RPC request handle */
	cred_t *cr;			/* credentials of the requester */
	bool_t ro;			/* read-only flag passed to rdonly() */
	kthread_t *thread;		/* service thread waiting on this entry */
	struct rfs_async_write *list;	/* next request, ordered by wa_offset */
};
1258 1258
/*
 * One write cluster: the set of pending WRITE requests for a single
 * file handle.  Clusters are kept on the zone's async_write_head list
 * (under async_write_lock) while open for new arrivals; rfs_write()
 * broadcasts cv once the whole cluster has been processed.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle shared by the cluster */
	kcondvar_t cv;			/* signalled when requests complete */
	struct rfs_async_write *list;	/* requests, sorted by start offset */
	struct rfs_async_write_list *next;	/* next cluster on the list */
};
1265 1265
/*
 * NOTE(review): these three file-scope variables predate the per-zone
 * server state.  rfs_write() now uses the nfs_srv_t fields
 * async_write_head, async_write_lock and write_async obtained via
 * zone_getspecific(), so the globals below appear unused in this code
 * path -- verify there are no remaining users elsewhere in the file
 * before removing them.
 */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

/* Max iovecs handled by the on-stack array in rfs_write(); larger */
/* clusters fall back to kmem_alloc (counted by rfs_write_misses). */
#define	MAXCLIOVECS	42
/* "status not filled in yet" marker; 0 would read as NFS_OK. */
#define	RFSWRITE_INITVAL	(enum nfsstat) -1

#ifdef DEBUG
/* Clusters whose iovecs fit the stack array vs. needed kmem_alloc. */
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif
1277 1277
/*
 * Write data to file.
 * Returns attributes of a file after writing some data to it.
 *
 * This is the clustered (asynchronous) NFSv2 WRITE path.  Concurrent
 * requests against the same file handle are gathered into a "cluster"
 * so that contiguous requests can be issued through a single VOP_WRITE.
 * If clustering is disabled for this zone (nsrv->write_async == 0) the
 * request is handed to rfs_write_sync() instead.
 *
 * The thread that starts a cluster becomes its processor: it translates
 * the file handle, takes the write rwlock (the lock wait is what gives
 * other requests time to join the cluster), performs the writes, and
 * fills in every member's response before broadcasting the cluster's cv.
 * Threads that join an existing cluster simply wait on that cv until
 * their ns_status has been set.
 */
void
rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;
	rlim64_t rlimit;
	struct vattr va;
	struct uio uio;
	struct rfs_async_write_list *lp;
	struct rfs_async_write_list *nlp;
	struct rfs_async_write *rp;
	struct rfs_async_write *nrp;
	struct rfs_async_write *trp;
	struct rfs_async_write *lrp;
	int data_written;
	int iovcnt;
	mblk_t *m;
	struct iovec *iovp;
	struct iovec *niovp;
	struct iovec iov[MAXCLIOVECS];
	int count;
	int rcount;
	uint_t off;
	uint_t len;
	struct rfs_async_write nrpsp;		/* our request, on our stack */
	struct rfs_async_write_list nlpsp;	/* our cluster, on our stack */
	ushort_t t_flag;
	cred_t *savecred;
	int in_crit = 0;
	caller_context_t ct;
	nfs_srv_t *nsrv;

	/* Per-zone NFS server state; clustering is a per-zone switch. */
	nsrv = zone_getspecific(rfs_zone_key, curzone);
	if (!nsrv->write_async) {
		rfs_write_sync(wa, ns, exi, req, cr, ro);
		return;
	}

	/*
	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
	 * is considered an OK.
	 */
	ns->ns_status = RFSWRITE_INITVAL;

	nrp = &nrpsp;
	nrp->wa = wa;
	nrp->ns = ns;
	nrp->req = req;
	nrp->cr = cr;
	nrp->ro = ro;
	nrp->thread = curthread;

	/*
	 * Other threads will link to our stack-allocated request and set
	 * flags on our thread, so we must stay resident while queued.
	 */
	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);

	/*
	 * Look to see if there is already a cluster started
	 * for this file.
	 */
	mutex_enter(&nsrv->async_write_lock);
	for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
		if (bcmp(&wa->wa_fhandle, lp->fhp,
		    sizeof (fhandle_t)) == 0)
			break;
	}

	/*
	 * If lp is non-NULL, then there is already a cluster
	 * started.  We need to place ourselves in the cluster
	 * list in the right place as determined by starting
	 * offset.  Conflicts with non-blocking mandatory locked
	 * regions will be checked when the cluster is processed.
	 */
	if (lp != NULL) {
		rp = lp->list;
		trp = NULL;
		/* Insert in ascending wa_offset order. */
		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
			trp = rp;
			rp = rp->list;
		}
		nrp->list = rp;
		if (trp == NULL)
			lp->list = nrp;
		else
			trp->list = nrp;
		/* Wait until the cluster's processor fills in our status. */
		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
			cv_wait(&lp->cv, &nsrv->async_write_lock);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * No cluster started yet, start one and add ourselves
	 * to the list of clusters.
	 */
	nrp->list = NULL;

	nlp = &nlpsp;
	nlp->fhp = &wa->wa_fhandle;
	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
	nlp->list = nrp;
	nlp->next = NULL;

	if (nsrv->async_write_head == NULL) {
		nsrv->async_write_head = nlp;
	} else {
		lp = nsrv->async_write_head;
		while (lp->next != NULL)
			lp = lp->next;
		lp->next = nlp;
	}
	mutex_exit(&nsrv->async_write_lock);

	/*
	 * Convert the file handle common to all of the requests
	 * in this cluster to a vnode.
	 */
	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
	if (vp == NULL) {
		/* Unlink the cluster and fail every queued request. */
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_STALE;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Can only write regular files.  Attempts to write any
	 * other file types fail with EISDIR.
	 */
	if (vp->v_type != VREG) {
		VN_RELE(vp);
		/* Unlink the cluster and fail every queued request. */
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_ISDIR;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
	 * deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
	}

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * Lock the file for writing.  This operation provides
	 * the delay which allows clusters to grow.
	 */
	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		if (in_crit)
			nbl_end_crit(vp);
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;
		/* Unlink the cluster and fail any still-pending requests. */
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			if (rp->ns->ns_status == RFSWRITE_INITVAL) {
				rp->ns->ns_status = puterrno(error);
				rp->thread->t_flag |= T_WOULDBLOCK;
			}
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Disconnect this cluster from the list of clusters.
	 * The cluster that is being dealt with must be fixed
	 * in size after this point, so there is no reason
	 * to leave it on the list so that new requests can
	 * find it.
	 *
	 * The algorithm is that the first write request will
	 * create a cluster, convert the file handle to a
	 * vnode pointer, and then lock the file for writing.
	 * This request is not likely to be clustered with
	 * any others.  However, the next request will create
	 * a new cluster and be blocked in VOP_RWLOCK while
	 * the first request is being processed.  This delay
	 * will allow more requests to be clustered in this
	 * second cluster.
	 */
	mutex_enter(&nsrv->async_write_lock);
	if (nsrv->async_write_head == nlp)
		nsrv->async_write_head = nlp->next;
	else {
		lp = nsrv->async_write_head;
		while (lp->next != nlp)
			lp = lp->next;
		lp->next = nlp->next;
	}
	mutex_exit(&nsrv->async_write_lock);

	/*
	 * Step through the list of requests in this cluster.
	 * We need to check permissions to make sure that all
	 * of the requests have sufficient permission to write
	 * the file.  A cluster can be composed of requests
	 * from different clients and different users on each
	 * client.
	 *
	 * As a side effect, we also calculate the size of the
	 * byte range that this cluster encompasses.
	 */
	rp = nlp->list;
	off = rp->wa->wa_offset;
	len = (uint_t)0;
	do {
		if (rdonly(rp->ro, vp)) {
			rp->ns->ns_status = NFSERR_ROFS;
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}

		va.va_mask = AT_UID|AT_MODE;

		error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);

		if (!error) {
			if (crgetuid(rp->cr) != va.va_uid) {
				/*
				 * This is a kludge to allow writes of files
				 * created with read only permission.  The
				 * owner of the file is always allowed to
				 * write it.
				 */
				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
			}
			/* Never write a mandatory-locked file; see above. */
			if (!error && MANDLOCK(vp, va.va_mode))
				error = EACCES;
		}

		/*
		 * Check for a conflict with a nbmand-locked region.
		 */
		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
		    rp->wa->wa_count, 0, NULL)) {
			error = EACCES;
		}

		if (error) {
			rp->ns->ns_status = puterrno(error);
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}
		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
			len = rp->wa->wa_offset + rp->wa->wa_count - off;
	} while ((rp = rp->list) != NULL);

	/*
	 * Step through the cluster attempting to gather as many
	 * requests which are contiguous as possible.  These
	 * contiguous requests are handled via one call to VOP_WRITE
	 * instead of different calls to VOP_WRITE.  We also keep
	 * track of the fact that any data was written.
	 */
	rp = nlp->list;
	data_written = 0;
	do {
		/*
		 * Skip any requests which are already marked as having an
		 * error.
		 */
		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
			rp = rp->list;
			continue;
		}

		/*
		 * Count the number of iovec's which are required
		 * to handle this set of requests.  One iovec is
		 * needed for each data buffer, whether addressed
		 * by wa_data or by the b_rptr pointers in the
		 * mblk chains.
		 */
		iovcnt = 0;
		lrp = rp;
		for (;;) {
			if (lrp->wa->wa_data || lrp->wa->wa_rlist)
				iovcnt++;
			else {
				m = lrp->wa->wa_mblk;
				while (m != NULL) {
					iovcnt++;
					m = m->b_cont;
				}
			}
			/*
			 * Stop extending the run when the next request is
			 * missing, already failed, or not byte-contiguous
			 * with this one.
			 */
			if (lrp->list == NULL ||
			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
			    lrp->wa->wa_offset + lrp->wa->wa_count !=
			    lrp->list->wa->wa_offset) {
				lrp = lrp->list;
				break;
			}
			lrp = lrp->list;
		}

		if (iovcnt <= MAXCLIOVECS) {
#ifdef DEBUG
			rfs_write_hits++;
#endif
			niovp = iov;
		} else {
#ifdef DEBUG
			rfs_write_misses++;
#endif
			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
		}
		/*
		 * Put together the scatter/gather iovecs.
		 */
		iovp = niovp;
		trp = rp;
		count = 0;
		do {
			if (trp->wa->wa_data || trp->wa->wa_rlist) {
				if (trp->wa->wa_rlist) {
					iovp->iov_base =
					    (char *)((trp->wa->wa_rlist)->
					    u.c_daddr3);
					iovp->iov_len = trp->wa->wa_count;
				} else {
					iovp->iov_base = trp->wa->wa_data;
					iovp->iov_len = trp->wa->wa_count;
				}
				iovp++;
			} else {
				m = trp->wa->wa_mblk;
				rcount = trp->wa->wa_count;
				/*
				 * Walk the mblk chain, clamping the final
				 * fragment so no more than wa_count bytes
				 * are contributed.
				 */
				while (m != NULL) {
					iovp->iov_base = (caddr_t)m->b_rptr;
					iovp->iov_len = (m->b_wptr - m->b_rptr);
					rcount -= iovp->iov_len;
					if (rcount < 0)
						iovp->iov_len += rcount;
					iovp++;
					if (rcount <= 0)
						break;
					m = m->b_cont;
				}
			}
			count += trp->wa->wa_count;
			trp = trp->list;
		} while (trp != lrp);

		uio.uio_iov = niovp;
		uio.uio_iovcnt = iovcnt;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
		uio.uio_resid = count;
		/*
		 * The limit is checked on the client.  We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - rp->wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * For now we assume no append mode.
		 */

		/*
		 * We're changing creds because VM may fault
		 * and we need the cred of the current
		 * thread to be used if quota checking is
		 * enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		/*
		 * No FSYNC here (unlike rfs_write_sync); stability is
		 * provided by the single VOP_PUTPAGE/VOP_FSYNC pass over
		 * the whole cluster range below.
		 */
		error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
		curthread->t_cred = savecred;

		/* check if a monitor detected a delegation conflict */
		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
			/* mark as wouldblock so response is dropped */
			curthread->t_flag |= T_WOULDBLOCK;

		if (niovp != iov)
			kmem_free(niovp, sizeof (*niovp) * iovcnt);

		if (!error) {
			data_written = 1;
			/*
			 * Get attributes again so we send the latest mod
			 * time to the client side for its cache.
			 */
			va.va_mask = AT_ALL;	/* now we want everything */

			error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);

			if (!error)
				acl_perm(vp, exi, &va, rp->cr);
		}

		/*
		 * Fill in the status responses for each request
		 * which was just handled.  Also, copy the latest
		 * attributes in to the attribute responses if
		 * appropriate.
		 */
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		do {
			rp->thread->t_flag |= t_flag;
			/* check for overflows */
			if (!error) {
				error = vattr_to_nattr(&va, &rp->ns->ns_attr);
			}
			rp->ns->ns_status = puterrno(error);
			rp = rp->list;
		} while (rp != lrp);
	} while (rp != NULL);

	/*
	 * If any data was written at all, then we need to flush
	 * the data and metadata to stable storage.
	 */
	if (data_written) {
		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);

		if (!error) {
			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
		}
	}

	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);

	if (in_crit)
		nbl_end_crit(vp);
	VN_RELE(vp);

	/*
	 * Wake everyone still waiting on this cluster, propagating any
	 * final flush error to requests that have no status yet.
	 */
	t_flag = curthread->t_flag & T_WOULDBLOCK;
	mutex_enter(&nsrv->async_write_lock);
	for (rp = nlp->list; rp != NULL; rp = rp->list) {
		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
			rp->ns->ns_status = puterrno(error);
			rp->thread->t_flag |= t_flag;
		}
	}
	cv_broadcast(&nlp->cv);
	mutex_exit(&nsrv->async_write_lock);

}
1778 1778
1779 1779 void *
1780 1780 rfs_write_getfh(struct nfswriteargs *wa)
1781 1781 {
1782 1782 return (&wa->wa_fhandle);
1783 1783 }
1784 1784
1785 1785 /*
1786 1786 * Create a file.
1787 1787 * Creates a file with given attributes and returns those attributes
1788 1788 * and an fhandle for the new file.
1789 1789 */
1790 1790 void
1791 1791 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1792 1792 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1793 1793 {
1794 1794 int error;
1795 1795 int lookuperr;
1796 1796 int in_crit = 0;
1797 1797 struct vattr va;
1798 1798 vnode_t *vp;
1799 1799 vnode_t *realvp;
1800 1800 vnode_t *dvp;
1801 1801 char *name = args->ca_da.da_name;
1802 1802 vnode_t *tvp = NULL;
1803 1803 int mode;
1804 1804 int lookup_ok;
1805 1805 bool_t trunc;
1806 1806 struct sockaddr *ca;
1807 1807
1808 1808 /*
1809 1809 * Disallow NULL paths
1810 1810 */
1811 1811 if (name == NULL || *name == '\0') {
1812 1812 dr->dr_status = NFSERR_ACCES;
1813 1813 return;
1814 1814 }
1815 1815
1816 1816 dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1817 1817 if (dvp == NULL) {
1818 1818 dr->dr_status = NFSERR_STALE;
1819 1819 return;
1820 1820 }
1821 1821
1822 1822 error = sattr_to_vattr(args->ca_sa, &va);
1823 1823 if (error) {
1824 1824 dr->dr_status = puterrno(error);
1825 1825 return;
1826 1826 }
1827 1827
1828 1828 /*
1829 1829 * Must specify the mode.
1830 1830 */
1831 1831 if (!(va.va_mask & AT_MODE)) {
1832 1832 VN_RELE(dvp);
1833 1833 dr->dr_status = NFSERR_INVAL;
1834 1834 return;
1835 1835 }
1836 1836
1837 1837 /*
1838 1838 * This is a completely gross hack to make mknod
1839 1839 * work over the wire until we can wack the protocol
1840 1840 */
1841 1841 if ((va.va_mode & IFMT) == IFCHR) {
1842 1842 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1843 1843 va.va_type = VFIFO; /* xtra kludge for named pipe */
1844 1844 else {
1845 1845 va.va_type = VCHR;
1846 1846 /*
1847 1847 * uncompress the received dev_t
1848 1848 * if the top half is zero indicating a request
1849 1849 * from an `older style' OS.
1850 1850 */
1851 1851 if ((va.va_size & 0xffff0000) == 0)
1852 1852 va.va_rdev = nfsv2_expdev(va.va_size);
1853 1853 else
1854 1854 va.va_rdev = (dev_t)va.va_size;
1855 1855 }
1856 1856 va.va_mask &= ~AT_SIZE;
1857 1857 } else if ((va.va_mode & IFMT) == IFBLK) {
1858 1858 va.va_type = VBLK;
1859 1859 /*
1860 1860 * uncompress the received dev_t
1861 1861 * if the top half is zero indicating a request
1862 1862 * from an `older style' OS.
1863 1863 */
1864 1864 if ((va.va_size & 0xffff0000) == 0)
1865 1865 va.va_rdev = nfsv2_expdev(va.va_size);
1866 1866 else
1867 1867 va.va_rdev = (dev_t)va.va_size;
1868 1868 va.va_mask &= ~AT_SIZE;
1869 1869 } else if ((va.va_mode & IFMT) == IFSOCK) {
1870 1870 va.va_type = VSOCK;
1871 1871 } else {
1872 1872 va.va_type = VREG;
1873 1873 }
1874 1874 va.va_mode &= ~IFMT;
1875 1875 va.va_mask |= AT_TYPE;
1876 1876
1877 1877 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1878 1878 name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1879 1879 MAXPATHLEN);
1880 1880 if (name == NULL) {
1881 1881 dr->dr_status = puterrno(EINVAL);
1882 1882 return;
1883 1883 }
1884 1884
1885 1885 /*
1886 1886 * Why was the choice made to use VWRITE as the mode to the
1887 1887 * call to VOP_CREATE ? This results in a bug. When a client
1888 1888 * opens a file that already exists and is RDONLY, the second
1889 1889 * open fails with an EACESS because of the mode.
1890 1890 * bug ID 1054648.
1891 1891 */
1892 1892 lookup_ok = 0;
1893 1893 mode = VWRITE;
1894 1894 if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1895 1895 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1896 1896 NULL, NULL, NULL);
1897 1897 if (!error) {
1898 1898 struct vattr at;
1899 1899
1900 1900 lookup_ok = 1;
1901 1901 at.va_mask = AT_MODE;
1902 1902 error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1903 1903 if (!error)
1904 1904 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1905 1905 VN_RELE(tvp);
1906 1906 tvp = NULL;
1907 1907 }
1908 1908 }
1909 1909
1910 1910 if (!lookup_ok) {
1911 1911 if (rdonly(ro, dvp)) {
1912 1912 error = EROFS;
1913 1913 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1914 1914 va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1915 1915 error = EPERM;
1916 1916 } else {
1917 1917 error = 0;
1918 1918 }
1919 1919 }
1920 1920
1921 1921 /*
1922 1922 * If file size is being modified on an already existing file
1923 1923 * make sure that there are no conflicting non-blocking mandatory
1924 1924 * locks in the region being manipulated. Return EACCES if there
1925 1925 * are conflicting locks.
1926 1926 */
1927 1927 if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1928 1928 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1929 1929 NULL, NULL, NULL);
1930 1930
1931 1931 if (!lookuperr &&
1932 1932 rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1933 1933 VN_RELE(tvp);
1934 1934 curthread->t_flag |= T_WOULDBLOCK;
1935 1935 goto out;
1936 1936 }
1937 1937
1938 1938 if (!lookuperr && nbl_need_check(tvp)) {
1939 1939 /*
1940 1940 * The file exists. Now check if it has any
1941 1941 * conflicting non-blocking mandatory locks
1942 1942 * in the region being changed.
1943 1943 */
1944 1944 struct vattr bva;
1945 1945 u_offset_t offset;
1946 1946 ssize_t length;
1947 1947
1948 1948 nbl_start_crit(tvp, RW_READER);
1949 1949 in_crit = 1;
1950 1950
1951 1951 bva.va_mask = AT_SIZE;
1952 1952 error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1953 1953 if (!error) {
1954 1954 if (va.va_size < bva.va_size) {
1955 1955 offset = va.va_size;
1956 1956 length = bva.va_size - va.va_size;
1957 1957 } else {
1958 1958 offset = bva.va_size;
1959 1959 length = va.va_size - bva.va_size;
1960 1960 }
1961 1961 if (length) {
1962 1962 if (nbl_conflict(tvp, NBL_WRITE,
1963 1963 offset, length, 0, NULL)) {
1964 1964 error = EACCES;
1965 1965 }
1966 1966 }
1967 1967 }
1968 1968 if (error) {
1969 1969 nbl_end_crit(tvp);
1970 1970 VN_RELE(tvp);
1971 1971 in_crit = 0;
1972 1972 }
1973 1973 } else if (tvp != NULL) {
1974 1974 VN_RELE(tvp);
1975 1975 }
1976 1976 }
1977 1977
1978 1978 if (!error) {
1979 1979 /*
1980 1980 * If filesystem is shared with nosuid the remove any
1981 1981 * setuid/setgid bits on create.
1982 1982 */
1983 1983 if (va.va_type == VREG &&
1984 1984 exi->exi_export.ex_flags & EX_NOSUID)
1985 1985 va.va_mode &= ~(VSUID | VSGID);
1986 1986
1987 1987 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1988 1988 NULL, NULL);
1989 1989
1990 1990 if (!error) {
1991 1991
1992 1992 if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1993 1993 trunc = TRUE;
1994 1994 else
1995 1995 trunc = FALSE;
1996 1996
1997 1997 if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1998 1998 VN_RELE(vp);
1999 1999 curthread->t_flag |= T_WOULDBLOCK;
2000 2000 goto out;
2001 2001 }
2002 2002 va.va_mask = AT_ALL;
2003 2003
2004 2004 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
2005 2005
2006 2006 /* check for overflows */
2007 2007 if (!error) {
2008 2008 acl_perm(vp, exi, &va, cr);
2009 2009 error = vattr_to_nattr(&va, &dr->dr_attr);
2010 2010 if (!error) {
2011 2011 error = makefh(&dr->dr_fhandle, vp,
2012 2012 exi);
2013 2013 }
2014 2014 }
2015 2015 /*
2016 2016 * Force modified metadata out to stable storage.
2017 2017 *
2018 2018 * if a underlying vp exists, pass it to VOP_FSYNC
2019 2019 */
2020 2020 if (VOP_REALVP(vp, &realvp, NULL) == 0)
2021 2021 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
2022 2022 else
2023 2023 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2024 2024 VN_RELE(vp);
2025 2025 }
2026 2026
2027 2027 if (in_crit) {
2028 2028 nbl_end_crit(tvp);
2029 2029 VN_RELE(tvp);
2030 2030 }
2031 2031 }
2032 2032
2033 2033 /*
2034 2034 * Force modified data and metadata out to stable storage.
2035 2035 */
2036 2036 (void) VOP_FSYNC(dvp, 0, cr, NULL);
2037 2037
2038 2038 out:
2039 2039
2040 2040 VN_RELE(dvp);
2041 2041
2042 2042 dr->dr_status = puterrno(error);
2043 2043
2044 2044 if (name != args->ca_da.da_name)
2045 2045 kmem_free(name, MAXPATHLEN);
2046 2046 }
2047 2047 void *
2048 2048 rfs_create_getfh(struct nfscreatargs *args)
2049 2049 {
2050 2050 return (args->ca_da.da_fhandle);
2051 2051 }
2052 2052
/*
 * Remove a file.
 * Remove named file from parent directory.
 *
 * Before performing the remove, the target is looked up so we can
 * (1) recall any NFSv4 write delegation on it (dropping this request
 * with T_WOULDBLOCK so the client retries) and (2) check for conflicts
 * with non-blocking mandatory share reservations.
 */
/* ARGSUSED */
void
rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error = 0;
	vnode_t *vp;		/* directory containing the entry */
	vnode_t *targvp;	/* the entry being removed */
	int in_crit = 0;	/* inside nbl critical region on targvp */

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 */
	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(vp);
		*status = puterrno(error);
		return;
	}

	/*
	 * If the file is delegated to an v4 client, then initiate
	 * recall and drop this request (by setting T_WOULDBLOCK).
	 * The client will eventually re-transmit the request and
	 * (hopefully), by then, the v4 client will have returned
	 * the delegation.
	 */

	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
		VN_RELE(vp);
		VN_RELE(targvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (nbl_need_check(targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(targvp);
	VN_RELE(targvp);
	VN_RELE(vp);

	*status = puterrno(error);

}
2138 2138
2139 2139 void *
2140 2140 rfs_remove_getfh(struct nfsdiropargs *da)
2141 2141 {
2142 2142 return (da->da_fhandle);
2143 2143 }
2144 2144
/*
 * rename a file
 * Give a file (from) a new name (to).
 *
 * NFSv2 RENAME.  Both directory handles must resolve within the export
 * this request arrived on; cross-export renames fail with NFSERR_XDEV.
 */
/* ARGSUSED */
void
rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error = 0;
	vnode_t *fromvp;	/* source parent directory */
	vnode_t *tovp;		/* target parent directory */
	struct exportinfo *to_exi;
	fhandle_t *fh;
	vnode_t *srcvp;		/* the object being renamed */
	vnode_t *targvp;	/* existing object being renamed over, if any */
	int in_crit = 0;	/* nonzero iff srcvp is inside an NBL critical region */

	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	/*
	 * The target handle must belong to some export, and to the same
	 * export this request was received for.  Only the pointer value
	 * of to_exi is compared after its hold is dropped.
	 */
	fh = args->rna_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		return;
	}

	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		return;
	}

	/* Both handles name the parent directories, so both must be VDIR. */
	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}

	if (rdonly(ro, tovp)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 */
	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = puterrno(error);
		return;
	}

	/* Check for delegations on the source file */

	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		VN_RELE(srcvp);
		/* Drop the request; client retries once the delegation returns. */
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Check for delegation on the file being renamed over, if it exists */

	if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
	    NULL, NULL, NULL) == 0) {

		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
			VN_RELE(tovp);
			VN_RELE(fromvp);
			VN_RELE(srcvp);
			VN_RELE(targvp);
			curthread->t_flag |= T_WOULDBLOCK;
			return;
		}
		VN_RELE(targvp);
	}


	if (nbl_need_check(srcvp)) {
		nbl_start_crit(srcvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_RENAME(fromvp, args->rna_from.da_name,
	    tovp, args->rna_to.da_name, cr, NULL, 0);

	/* On success, record the object's new name (vn_renamepath). */
	if (error == 0)
		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
		    strlen(args->rna_to.da_name));

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(srcvp);
	VN_RELE(srcvp);
	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

}
2288 2288 void *
2289 2289 rfs_rename_getfh(struct nfsrnmargs *args)
2290 2290 {
2291 2291 return (args->rna_from.da_fhandle);
2292 2292 }
2293 2293
/*
 * Link to a file.
 * Create a file (to) which is a hard link to the given file (from).
 */
/* ARGSUSED */
void
rfs_link(struct nfslinkargs *args, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *fromvp;	/* the existing file gaining a new name */
	vnode_t *tovp;		/* directory in which the link is created */
	struct exportinfo *to_exi;
	fhandle_t *fh;

	fromvp = nfs_fhtovp(args->la_from, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	/*
	 * The target directory handle must belong to the same export as
	 * this request; only the pointer value of to_exi is compared
	 * after its hold from checkexport() is released.
	 */
	fh = args->la_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		return;
	}

	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		return;
	}

	if (tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		return;
	}
	/*
	 * Disallow NULL paths
	 */
	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}

	if (rdonly(ro, tovp)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		return;
	}

	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);

	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

}
2374 2374 void *
2375 2375 rfs_link_getfh(struct nfslinkargs *args)
2376 2376 {
2377 2377 return (args->la_from);
2378 2378 }
2379 2379
2380 2380 /*
2381 2381 * Symbolicly link to a file.
2382 2382 * Create a file (to) with the given attributes which is a symbolic link
2383 2383 * to the given path name (to).
2384 2384 */
2385 2385 void
2386 2386 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2387 2387 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2388 2388 {
2389 2389 int error;
2390 2390 struct vattr va;
2391 2391 vnode_t *vp;
2392 2392 vnode_t *svp;
2393 2393 int lerror;
2394 2394 struct sockaddr *ca;
2395 2395 char *name = NULL;
2396 2396
2397 2397 /*
2398 2398 * Disallow NULL paths
2399 2399 */
2400 2400 if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2401 2401 *status = NFSERR_ACCES;
2402 2402 return;
2403 2403 }
2404 2404
2405 2405 vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2406 2406 if (vp == NULL) {
2407 2407 *status = NFSERR_STALE;
2408 2408 return;
2409 2409 }
2410 2410
2411 2411 if (rdonly(ro, vp)) {
2412 2412 VN_RELE(vp);
2413 2413 *status = NFSERR_ROFS;
2414 2414 return;
2415 2415 }
2416 2416
2417 2417 error = sattr_to_vattr(args->sla_sa, &va);
2418 2418 if (error) {
2419 2419 VN_RELE(vp);
2420 2420 *status = puterrno(error);
2421 2421 return;
2422 2422 }
2423 2423
2424 2424 if (!(va.va_mask & AT_MODE)) {
2425 2425 VN_RELE(vp);
2426 2426 *status = NFSERR_INVAL;
2427 2427 return;
2428 2428 }
2429 2429
2430 2430 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2431 2431 name = nfscmd_convname(ca, exi, args->sla_tnm,
2432 2432 NFSCMD_CONV_INBOUND, MAXPATHLEN);
2433 2433
2434 2434 if (name == NULL) {
2435 2435 *status = NFSERR_ACCES;
2436 2436 return;
2437 2437 }
2438 2438
2439 2439 va.va_type = VLNK;
2440 2440 va.va_mask |= AT_TYPE;
2441 2441
2442 2442 error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2443 2443
2444 2444 /*
2445 2445 * Force new data and metadata out to stable storage.
2446 2446 */
2447 2447 lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2448 2448 NULL, cr, NULL, NULL, NULL);
2449 2449
2450 2450 if (!lerror) {
2451 2451 (void) VOP_FSYNC(svp, 0, cr, NULL);
2452 2452 VN_RELE(svp);
2453 2453 }
2454 2454
2455 2455 /*
2456 2456 * Force modified data and metadata out to stable storage.
2457 2457 */
2458 2458 (void) VOP_FSYNC(vp, 0, cr, NULL);
2459 2459
2460 2460 VN_RELE(vp);
2461 2461
2462 2462 *status = puterrno(error);
2463 2463 if (name != args->sla_tnm)
2464 2464 kmem_free(name, MAXPATHLEN);
2465 2465
2466 2466 }
2467 2467 void *
2468 2468 rfs_symlink_getfh(struct nfsslargs *args)
2469 2469 {
2470 2470 return (args->sla_from.da_fhandle);
2471 2471 }
2472 2472
/*
 * Make a directory.
 * Create a directory with the given name, parent directory, and attributes.
 * Returns a file handle and attributes for the new directory.
 */
/* ARGSUSED */
void
rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	struct vattr va;
	vnode_t *dvp = NULL;	/* the newly created directory */
	vnode_t *vp;		/* the parent directory */
	char *name = args->ca_da.da_name;

	/*
	 * Disallow NULL paths
	 */
	if (name == NULL || *name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
	if (vp == NULL) {
		dr->dr_status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_ROFS;
		return;
	}

	error = sattr_to_vattr(args->ca_sa, &va);
	if (error) {
		VN_RELE(vp);
		dr->dr_status = puterrno(error);
		return;
	}

	/* The client must supply an explicit mode for the new directory. */
	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_INVAL;
		return;
	}

	va.va_type = VDIR;
	va.va_mask |= AT_TYPE;

	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);

	if (!error) {
		/*
		 * Attributes of the newly created directory should
		 * be returned to the client.
		 */
		va.va_mask = AT_ALL; /* We want everything */
		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				error = makefh(&dr->dr_fhandle, dvp, exi);
			}
		}
		/*
		 * Force new data and metadata out to stable storage.
		 */
		(void) VOP_FSYNC(dvp, 0, cr, NULL);
		VN_RELE(dvp);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	dr->dr_status = puterrno(error);

}
2560 2560 void *
2561 2561 rfs_mkdir_getfh(struct nfscreatargs *args)
2562 2562 {
2563 2563 return (args->ca_da.da_fhandle);
2564 2564 }
2565 2565
2566 2566 /*
2567 2567 * Remove a directory.
2568 2568 * Remove the given directory name from the given parent directory.
2569 2569 */
2570 2570 /* ARGSUSED */
2571 2571 void
2572 2572 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2573 2573 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2574 2574 {
2575 2575 int error;
2576 2576 vnode_t *vp;
2577 2577
2578 2578 /*
2579 2579 * Disallow NULL paths
2580 2580 */
2581 2581 if (da->da_name == NULL || *da->da_name == '\0') {
2582 2582 *status = NFSERR_ACCES;
2583 2583 return;
2584 2584 }
2585 2585
2586 2586 vp = nfs_fhtovp(da->da_fhandle, exi);
2587 2587 if (vp == NULL) {
2588 2588 *status = NFSERR_STALE;
2589 2589 return;
2590 2590 }
2591 2591
2592 2592 if (rdonly(ro, vp)) {
2593 2593 VN_RELE(vp);
2594 2594 *status = NFSERR_ROFS;
2595 2595 return;
2596 2596 }
2597 2597
2598 2598 /*
2599 2599 * VOP_RMDIR takes a third argument (the current
2600 2600 * directory of the process). That's because someone
2601 2601 * wants to return EINVAL if one tries to remove ".".
2602 2602 * Of course, NFS servers have no idea what their
2603 2603 * clients' current directories are. We fake it by
2604 2604 * supplying a vnode known to exist and illegal to
2605 2605 * remove.
2606 2606 */
2607 2607 error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);
2608 2608
2609 2609 /*
2610 2610 * Force modified data and metadata out to stable storage.
2611 2611 */
2612 2612 (void) VOP_FSYNC(vp, 0, cr, NULL);
2613 2613
2614 2614 VN_RELE(vp);
2615 2615
2616 2616 /*
2617 2617 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2618 2618 * if the directory is not empty. A System V NFS server
2619 2619 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2620 2620 * over the wire.
2621 2621 */
2622 2622 if (error == EEXIST)
2623 2623 *status = NFSERR_NOTEMPTY;
2624 2624 else
2625 2625 *status = puterrno(error);
2626 2626
2627 2627 }
2628 2628 void *
2629 2629 rfs_rmdir_getfh(struct nfsdiropargs *da)
2630 2630 {
2631 2631 return (da->da_fhandle);
2632 2632 }
2633 2633
/*
 * Read entries from a directory.  The entry buffer allocated here is
 * attached to the result and later released by rfs_rddirfree().
 */
/* ARGSUSED */
void
rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int iseof;		/* set by VOP_READDIR at end of directory */
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	char *ndata = NULL;	/* charset-converted entries, if any */
	struct sockaddr *ca;
	size_t nents;
	int ret;

	vp = nfs_fhtovp(&rda->rda_fh, exi);
	if (vp == NULL) {
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_STALE;
		return;
	}

	if (vp->v_type != VDIR) {
		VN_RELE(vp);
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_NOTDIR;
		return;
	}

	/* Reader lock held across both the access check and the read. */
	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);

	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);

	if (error) {
		rd->rd_entries = NULL;
		goto bad;
	}

	/* A zero-byte request returns an empty, non-EOF result. */
	if (rda->rda_count == 0) {
		rd->rd_entries = NULL;
		rd->rd_size = 0;
		rd->rd_eof = FALSE;
		goto bad;
	}

	/* Clamp the client's request to the protocol's transfer limit. */
	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);

	/*
	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
	 */
	rd->rd_bufsize = (uint_t)rda->rda_count;
	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);

	/*
	 * Set up io vector to read directory data
	 */
	iov.iov_base = (caddr_t)rd->rd_entries;
	iov.iov_len = rda->rda_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)rda->rda_offset;
	uio.uio_resid = rda->rda_count;

	/*
	 * read directory
	 */
	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);

	/*
	 * Clean up
	 */
	if (!error) {
		/*
		 * set size and eof: a read that consumed nothing means
		 * there were no more entries, so report EOF.
		 */
		if (uio.uio_resid == rda->rda_count) {
			rd->rd_size = 0;
			rd->rd_eof = TRUE;
		} else {
			rd->rd_size = (uint32_t)(rda->rda_count -
			    uio.uio_resid);
			rd->rd_eof = iseof ? TRUE : FALSE;
		}
	}

	/*
	 * Character-set conversion of the entry names for this client.
	 * NOTE(review): this conversion runs even when VOP_READDIR()
	 * failed, in which case rd_size was not set by this call --
	 * presumably the result structure arrives zeroed; verify at the
	 * dispatcher before relying on that.
	 */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
	ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
	    rda->rda_count, &ndata);

	if (ret != 0) {
		size_t dropbytes;
		/*
		 * We had to drop one or more entries in order to fit
		 * during the character conversion.  We need to patch
		 * up the size and eof info.
		 */
		if (rd->rd_eof)
			rd->rd_eof = FALSE;
		dropbytes = nfscmd_dropped_entrysize(
		    (struct dirent64 *)rd->rd_entries, nents, ret);
		rd->rd_size -= dropbytes;
	}
	/* Swap the converted buffer in for the original, if one was made. */
	if (ndata == NULL) {
		ndata = (char *)rd->rd_entries;
	} else if (ndata != (char *)rd->rd_entries) {
		kmem_free(rd->rd_entries, rd->rd_bufsize);
		rd->rd_entries = (void *)ndata;
		rd->rd_bufsize = rda->rda_count;
	}

bad:
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
#endif

	VN_RELE(vp);

	rd->rd_status = puterrno(error);

}
2767 2767 void *
2768 2768 rfs_readdir_getfh(struct nfsrddirargs *rda)
2769 2769 {
2770 2770 return (&rda->rda_fh);
2771 2771 }
2772 2772 void
2773 2773 rfs_rddirfree(struct nfsrddirres *rd)
2774 2774 {
2775 2775 if (rd->rd_entries != NULL)
2776 2776 kmem_free(rd->rd_entries, rd->rd_bufsize);
2777 2777 }
2778 2778
2779 2779 /* ARGSUSED */
2780 2780 void
2781 2781 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2782 2782 struct svc_req *req, cred_t *cr, bool_t ro)
2783 2783 {
2784 2784 int error;
2785 2785 struct statvfs64 sb;
2786 2786 vnode_t *vp;
2787 2787
2788 2788 vp = nfs_fhtovp(fh, exi);
2789 2789 if (vp == NULL) {
2790 2790 fs->fs_status = NFSERR_STALE;
2791 2791 return;
2792 2792 }
2793 2793
2794 2794 error = VFS_STATVFS(vp->v_vfsp, &sb);
2795 2795
2796 2796 if (!error) {
2797 2797 fs->fs_tsize = nfstsize();
2798 2798 fs->fs_bsize = sb.f_frsize;
2799 2799 fs->fs_blocks = sb.f_blocks;
2800 2800 fs->fs_bfree = sb.f_bfree;
2801 2801 fs->fs_bavail = sb.f_bavail;
2802 2802 }
2803 2803
2804 2804 VN_RELE(vp);
2805 2805
2806 2806 fs->fs_status = puterrno(error);
2807 2807
2808 2808 }
/*
 * STATFS's argument is itself the file handle; hand it back so the
 * dispatcher can resolve the export.
 */
void *
rfs_statfs_getfh(fhandle_t *fh)
{
	return (fh);
}
2814 2814
2815 2815 static int
2816 2816 sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
2817 2817 {
2818 2818 vap->va_mask = 0;
2819 2819
2820 2820 /*
2821 2821 * There was a sign extension bug in some VFS based systems
2822 2822 * which stored the mode as a short. When it would get
2823 2823 * assigned to a u_long, no sign extension would occur.
2824 2824 * It needed to, but this wasn't noticed because sa_mode
2825 2825 * would then get assigned back to the short, thus ignoring
2826 2826 * the upper 16 bits of sa_mode.
2827 2827 *
2828 2828 * To make this implementation work for both broken
2829 2829 * clients and good clients, we check for both versions
2830 2830 * of the mode.
2831 2831 */
2832 2832 if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
2833 2833 sa->sa_mode != (uint32_t)-1) {
2834 2834 vap->va_mask |= AT_MODE;
2835 2835 vap->va_mode = sa->sa_mode;
2836 2836 }
2837 2837 if (sa->sa_uid != (uint32_t)-1) {
2838 2838 vap->va_mask |= AT_UID;
2839 2839 vap->va_uid = sa->sa_uid;
2840 2840 }
2841 2841 if (sa->sa_gid != (uint32_t)-1) {
2842 2842 vap->va_mask |= AT_GID;
2843 2843 vap->va_gid = sa->sa_gid;
2844 2844 }
2845 2845 if (sa->sa_size != (uint32_t)-1) {
2846 2846 vap->va_mask |= AT_SIZE;
2847 2847 vap->va_size = sa->sa_size;
2848 2848 }
2849 2849 if (sa->sa_atime.tv_sec != (int32_t)-1 &&
2850 2850 sa->sa_atime.tv_usec != (int32_t)-1) {
2851 2851 #ifndef _LP64
2852 2852 /* return error if time overflow */
2853 2853 if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
2854 2854 return (EOVERFLOW);
2855 2855 #endif
2856 2856 vap->va_mask |= AT_ATIME;
2857 2857 /*
2858 2858 * nfs protocol defines times as unsigned so don't extend sign,
2859 2859 * unless sysadmin set nfs_allow_preepoch_time.
2860 2860 */
2861 2861 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
2862 2862 vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
2863 2863 }
2864 2864 if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
2865 2865 sa->sa_mtime.tv_usec != (int32_t)-1) {
2866 2866 #ifndef _LP64
2867 2867 /* return error if time overflow */
2868 2868 if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
2869 2869 return (EOVERFLOW);
2870 2870 #endif
2871 2871 vap->va_mask |= AT_MTIME;
2872 2872 /*
2873 2873 * nfs protocol defines times as unsigned so don't extend sign,
2874 2874 * unless sysadmin set nfs_allow_preepoch_time.
2875 2875 */
2876 2876 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2877 2877 vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2878 2878 }
2879 2879 return (0);
2880 2880 }
2881 2881
/*
 * Map vnode types to NFSv2 on-the-wire file types.  Zero entries are
 * types with no NFSv2 representation; VFIFO is special-cased at the
 * end of vattr_to_nattr() via NA_SETFIFO.
 */
static const enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};

/*
 * check the following fields for overflow: nodeid, size, and time.
 * There could be a problem when converting 64-bit LP64 fields
 * into 32-bit ones.  Return an error if there is an overflow.
 */
int
vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
{
	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
	na->na_type = vt_to_nf[vap->va_type];

	/* (unsigned short)-1 is the "unknown" sentinel for mode/uid/gid. */
	if (vap->va_mode == (unsigned short) -1)
		na->na_mode = (uint32_t)-1;
	else
		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;

	if (vap->va_uid == (unsigned short)(-1))
		na->na_uid = (uint32_t)(-1);
	else if (vap->va_uid == UID_NOBODY)
		na->na_uid = (uint32_t)NFS_UID_NOBODY;
	else
		na->na_uid = vap->va_uid;

	if (vap->va_gid == (unsigned short)(-1))
		na->na_gid = (uint32_t)-1;
	else if (vap->va_gid == GID_NOBODY)
		na->na_gid = (uint32_t)NFS_GID_NOBODY;
	else
		na->na_gid = vap->va_gid;

	/*
	 * Do we need to check fsid for overflow?  It is 64-bit in the
	 * vattr, but are bigger than 32 bit values supported?
	 */
	na->na_fsid = vap->va_fsid;

	na->na_nodeid = vap->va_nodeid;

	/*
	 * Check to make sure that the nodeid is representable over the
	 * wire without losing bits.
	 */
	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
		return (EFBIG);
	na->na_nlink = vap->va_nlink;

	/*
	 * Check for big files here, instead of at the caller.  See
	 * comments in cstat for large special file explanation.
	 */
	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
			return (EFBIG);
		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
			/* UNKNOWN_SIZE | OVERFLOW */
			na->na_size = MAXOFF32_T;
		} else
			na->na_size = vap->va_size;
	} else
		na->na_size = vap->va_size;

	/*
	 * If the vnode times overflow the 32-bit times that NFS2
	 * uses on the wire then return an error.
	 */
	if (!NFS_VAP_TIME_OK(vap)) {
		return (EOVERFLOW);
	}
	na->na_atime.tv_sec = vap->va_atime.tv_sec;
	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;

	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;

	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;

	/*
	 * If the dev_t will fit into 16 bits then compress
	 * it, otherwise leave it alone.  See comments in
	 * nfs_client.c.
	 */
	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
	else
		(void) cmpldev(&na->na_rdev, vap->va_rdev);

	na->na_blocks = vap->va_nblocks;
	na->na_blocksize = vap->va_blksize;

	/*
	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
	 * VFIFO type to the special over-the-wire type.  (see note in nfs.h)
	 *
	 * BUYER BEWARE:
	 *  If you are porting the NFS to a non-Sun server, you probably
	 *  don't want to include the following block of code.  The
	 *  over-the-wire special file types will be changing with the
	 *  NFS Protocol Revision.
	 */
	if (vap->va_type == VFIFO)
		NA_SETFIFO(na);
	return (0);
}
2992 2992
/*
 * acl v2 support: returns approximate permission.
 *	default: returns minimal permission (more restrictive)
 *	aclok: returns maximal permission (less restrictive)
 * This routine changes the permissions that are already in *va.
 * If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
 * CLASS_OBJ is always the same as GROUP_OBJ entry.
 */
static void
acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
{
	vsecattr_t vsa;
	int aclcnt;
	aclent_t *aclentp;
	mode_t mask_perm;	/* CLASS_OBJ (mask) bits */
	mode_t grp_perm;	/* synthesized group bits */
	mode_t other_perm;	/* synthesized other bits */
	mode_t other_orig;	/* OTHER_OBJ bits as found in the ACL */
	int error;

	/* dont care default acl */
	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
	error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);

	if (!error) {
		aclcnt = vsa.vsa_aclcnt;
		if (aclcnt > MIN_ACL_ENTRIES) {
			/*
			 * non-trivial ACL
			 *
			 * NOTE(review): mask_perm and other_orig are only
			 * assigned when CLASS_OBJ/OTHER_OBJ entries appear
			 * in the list -- presumably every valid aclent ACL
			 * contains both; confirm, since they are read
			 * below regardless.
			 */
			aclentp = vsa.vsa_aclentp;
			if (exi->exi_export.ex_flags & EX_ACLOK) {
				/* maximal permissions */
				grp_perm = 0;
				other_perm = 0;
				for (; aclcnt > 0; aclcnt--, aclentp++) {
					switch (aclentp->a_type) {
					case USER_OBJ:
						break;
					case USER:
						grp_perm |=
						    aclentp->a_perm << 3;
						other_perm |= aclentp->a_perm;
						break;
					case GROUP_OBJ:
						grp_perm |=
						    aclentp->a_perm << 3;
						break;
					case GROUP:
						other_perm |= aclentp->a_perm;
						break;
					case OTHER_OBJ:
						other_orig = aclentp->a_perm;
						break;
					case CLASS_OBJ:
						mask_perm = aclentp->a_perm;
						break;
					default:
						break;
					}
				}
				/* Apply the mask, then fold OTHER_OBJ back in. */
				grp_perm &= mask_perm << 3;
				other_perm &= mask_perm;
				other_perm |= other_orig;

			} else {
				/* minimal permissions */
				grp_perm = 070;
				other_perm = 07;
				for (; aclcnt > 0; aclcnt--, aclentp++) {
					switch (aclentp->a_type) {
					case USER_OBJ:
						break;
					case USER:
					case CLASS_OBJ:
						grp_perm &=
						    aclentp->a_perm << 3;
						other_perm &=
						    aclentp->a_perm;
						break;
					case GROUP_OBJ:
						grp_perm &=
						    aclentp->a_perm << 3;
						break;
					case GROUP:
						other_perm &=
						    aclentp->a_perm;
						break;
					case OTHER_OBJ:
						other_perm &=
						    aclentp->a_perm;
						break;
					default:
						break;
					}
				}
			}
			/* copy to va */
			va->va_mode &= ~077;
			va->va_mode |= grp_perm | other_perm;
		}
		if (vsa.vsa_aclcnt)
			kmem_free(vsa.vsa_aclentp,
			    vsa.vsa_aclcnt * sizeof (aclent_t));
	}
}
3097 3097
/*
 * One-time NFSv2 server initialization: allocate the caller id used by
 * this module and register the per-zone state callbacks.
 */
void
rfs_srvrinit(void)
{
	nfs2_srv_caller_id = fs_new_caller_id();
	/* Per-zone nfs_srv_t state is managed by the ZSD callbacks below. */
	zone_key_create(&rfs_zone_key, rfs_zone_init, NULL, rfs_zone_fini);
}
3104 3104
/*
 * Module teardown counterpart of rfs_srvrinit().  There is no global
 * state to release here; per-zone state is freed by rfs_zone_fini().
 */
void
rfs_srvrfini(void)
{
}
3109 3109
3110 3110 /* ARGSUSED */
3111 3111 static void *
3112 3112 rfs_zone_init(zoneid_t zoneid)
3113 3113 {
3114 3114 nfs_srv_t *ns;
3115 3115
3116 3116 ns = kmem_zalloc(sizeof (*ns), KM_SLEEP);
3117 3117
3118 3118 mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3119 3119 ns->write_async = 1;
3120 3120
3121 3121 return (ns);
3122 3122 }
3123 3123
3124 3124 /* ARGSUSED */
3125 3125 static void
3126 3126 rfs_zone_fini(zoneid_t zoneid, void *data)
3127 3127 {
3128 3128 nfs_srv_t *ns;
3129 3129
3130 3130 ns = (nfs_srv_t *)data;
3131 3131 mutex_destroy(&ns->async_write_lock);
3132 3132 kmem_free(ns, sizeof (*ns));
3133 3133 }
3134 3134
3135 3135 static int
3136 3136 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
3137 3137 {
3138 3138 struct clist *wcl;
3139 3139 int wlist_len;
3140 3140 uint32_t count = rr->rr_count;
3141 3141
3142 3142 wcl = ra->ra_wlist;
3143 3143
3144 3144 if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3145 3145 return (FALSE);
3146 3146 }
3147 3147
3148 3148 wcl = ra->ra_wlist;
3149 3149 rr->rr_ok.rrok_wlist_len = wlist_len;
3150 3150 rr->rr_ok.rrok_wlist = wcl;
3151 3151
3152 3152 return (TRUE);
3153 3153 }
|
↓ open down ↓ |
2650 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX