Print this page
Caution with use after exi_rele()
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/fs/nfs/nfs_srv.c
+++ new/usr/src/uts/common/fs/nfs/nfs_srv.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
25 25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 26 */
27 27
28 28 /*
29 29 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
30 30 * All rights reserved.
31 31 */
32 32
33 33 /*
34 34 * Copyright 2018 Nexenta Systems, Inc.
35 35 * Copyright (c) 2016 by Delphix. All rights reserved.
36 36 */
37 37
38 38 #include <sys/param.h>
39 39 #include <sys/types.h>
40 40 #include <sys/systm.h>
41 41 #include <sys/cred.h>
42 42 #include <sys/buf.h>
43 43 #include <sys/vfs.h>
44 44 #include <sys/vnode.h>
45 45 #include <sys/uio.h>
46 46 #include <sys/stat.h>
47 47 #include <sys/errno.h>
48 48 #include <sys/sysmacros.h>
49 49 #include <sys/statvfs.h>
50 50 #include <sys/kmem.h>
51 51 #include <sys/kstat.h>
52 52 #include <sys/dirent.h>
53 53 #include <sys/cmn_err.h>
54 54 #include <sys/debug.h>
55 55 #include <sys/vtrace.h>
56 56 #include <sys/mode.h>
57 57 #include <sys/acl.h>
58 58 #include <sys/nbmlock.h>
59 59 #include <sys/policy.h>
60 60 #include <sys/sdt.h>
61 61
62 62 #include <rpc/types.h>
63 63 #include <rpc/auth.h>
64 64 #include <rpc/svc.h>
65 65
66 66 #include <nfs/nfs.h>
67 67 #include <nfs/export.h>
68 68 #include <nfs/nfs_cmd.h>
69 69
70 70 #include <vm/hat.h>
71 71 #include <vm/as.h>
72 72 #include <vm/seg.h>
73 73 #include <vm/seg_map.h>
74 74 #include <vm/seg_kmem.h>
75 75
76 76 #include <sys/strsubr.h>
77 77
78 78 struct rfs_async_write_list;
79 79
/*
 * Zone globals of NFSv2 server.
 * One instance exists per zone running the NFS server; it is retrieved
 * through nfs_get_srv().
 */
typedef struct nfs_srv {
	/* presumably protects async_write_head — confirm at use sites */
	kmutex_t async_write_lock;
	/* pending clustered (async) write requests */
	struct rfs_async_write_list *async_write_head;

	/*
	 * enables write clustering if == 1
	 */
	int write_async;
} nfs_srv_t;
92 92
93 93 /*
94 94 * These are the interface routines for the server side of the
95 95 * Network File System. See the NFS version 2 protocol specification
96 96 * for a description of this interface.
97 97 */
98 98
99 99 static int sattr_to_vattr(struct nfssattr *, struct vattr *);
100 100 static void acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
101 101 cred_t *);
102 102
103 103
/*
 * Some "over the wire" UNIX file types.  These are encoded
 * into the mode.  This needs to be fixed in the next rev.
 */
#define	IFMT	0170000		/* type of file */
#define	IFCHR	0020000		/* character special */
#define	IFBLK	0060000		/* block special */
#define	IFSOCK	0140000		/* socket */

/*
 * Unique caller id for this module; stored into caller_context_t
 * (ct.cc_caller_id) for the VOP calls issued by the NFSv2 server.
 */
u_longlong_t nfs2_srv_caller_id;
114 114
115 115 static nfs_srv_t *
116 116 nfs_get_srv(void)
117 117 {
118 118 nfs_globals_t *ng = zone_getspecific(nfssrv_zone_key, curzone);
119 119 nfs_srv_t *srv = ng->nfs_srv;
120 120 ASSERT(srv != NULL);
121 121 return (srv);
122 122 }
123 123
124 124 /*
125 125 * Get file attributes.
126 126 * Returns the current attributes of the file with the given fhandle.
127 127 */
128 128 /* ARGSUSED */
129 129 void
130 130 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
131 131 struct svc_req *req, cred_t *cr, bool_t ro)
132 132 {
133 133 int error;
134 134 vnode_t *vp;
135 135 struct vattr va;
136 136
137 137 vp = nfs_fhtovp(fhp, exi);
138 138 if (vp == NULL) {
139 139 ns->ns_status = NFSERR_STALE;
140 140 return;
141 141 }
142 142
143 143 /*
144 144 * Do the getattr.
145 145 */
146 146 va.va_mask = AT_ALL; /* we want all the attributes */
147 147
148 148 error = rfs4_delegated_getattr(vp, &va, 0, cr);
149 149
150 150 /* check for overflows */
151 151 if (!error) {
152 152 /* Lie about the object type for a referral */
153 153 if (vn_is_nfs_reparse(vp, cr))
154 154 va.va_type = VLNK;
155 155
156 156 acl_perm(vp, exi, &va, cr);
157 157 error = vattr_to_nattr(&va, &ns->ns_attr);
158 158 }
159 159
160 160 VN_RELE(vp);
161 161
162 162 ns->ns_status = puterrno(error);
163 163 }
/*
 * Return the file handle from the GETATTR arguments (used by the common
 * dispatch code to locate the export for this request).
 */
void *
rfs_getattr_getfh(fhandle_t *fhp)
{
	return (fhp);
}
169 169
/*
 * Set file attributes.
 * Sets the attributes of the file with the given fhandle.  Returns
 * the new attributes.
 */
/* ARGSUSED */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int flag;
	int in_crit = 0;		/* inside an nbmand critical region */
	vnode_t *vp;
	struct vattr va;		/* attributes requested by the client */
	struct vattr bva;		/* current ("before") attributes */
	struct flock64 bf;
	caller_context_t ct;


	vp = nfs_fhtovp(&args->saa_fh, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	/* No attribute changes on a read-only export. */
	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		return;
	}

	error = sattr_to_vattr(&args->saa_sa, &va);
	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		return;
	}

	/*
	 * If the client is requesting a change to the mtime,
	 * but the nanosecond field is set to 1 billion, then
	 * this is a flag to the server that it should set the
	 * atime and mtime fields to the server's current time.
	 * The 1 billion number actually came from the client
	 * as 1 million, but the units in the over the wire
	 * request are microseconds instead of nanoseconds.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if (va.va_mask & AT_MTIME) {
		if (va.va_mtime.tv_nsec == 1000000000) {
			gethrestime(&va.va_mtime);
			va.va_atime = va.va_mtime;
			va.va_mask |= AT_ATIME;
			flag = 0;
		} else
			flag = ATTR_UTIME;
	} else
		flag = 0;

	/*
	 * If the filesystem is exported with nosuid, then mask off
	 * the setuid and setgid bits.
	 */
	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
	    (exi->exi_export.ex_flags & EX_NOSUID))
		va.va_mode &= ~(VSUID | VSGID);

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * We need to specially handle size changes because it is
	 * possible for the client to create a file with modes
	 * which indicate read-only, but with the file opened for
	 * writing.  If the client then tries to set the size of
	 * the file, then the normal access checking done in
	 * VOP_SETATTR would prevent the client from doing so,
	 * although it should be legal for it to do so.  To get
	 * around this, we do the access checking for ourselves
	 * and then use VOP_SPACE which doesn't do the access
	 * checking which VOP_SETATTR does.  VOP_SPACE can only
	 * operate on VREG files, let VOP_SETATTR handle the other
	 * extremely rare cases.
	 * Also the client should not be allowed to change the
	 * size of the file if there is a conflicting non-blocking
	 * mandatory lock in the region of change.
	 */
	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}

		bva.va_mask = AT_UID | AT_SIZE;

		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);

		if (error) {
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			return;
		}

		if (in_crit) {
			u_offset_t offset;
			ssize_t length;

			/*
			 * Check the region between the old and new size for
			 * conflicting non-blocking mandatory locks.
			 */
			if (va.va_size < bva.va_size) {
				offset = va.va_size;
				length = bva.va_size - va.va_size;
			} else {
				offset = bva.va_size;
				length = va.va_size - bva.va_size;
			}
			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
			    NULL)) {
				error = EACCES;
			}
		}

		/*
		 * Only the owner takes the VOP_SPACE shortcut (and then
		 * AT_SIZE is cleared so VOP_SETATTR below skips the size);
		 * everyone else falls through to VOP_SETATTR with AT_SIZE
		 * still set, which performs the normal access checks.
		 */
		if (crgetuid(cr) == bva.va_uid && !error &&
		    va.va_size != bva.va_size) {
			va.va_mask &= ~AT_SIZE;
			bf.l_type = F_WRLCK;
			bf.l_whence = 0;
			bf.l_start = (off64_t)va.va_size;
			bf.l_len = 0;
			bf.l_sysid = 0;
			bf.l_pid = 0;

			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
			    (offset_t)va.va_size, cr, &ct);
		}
		if (in_crit)
			nbl_end_crit(vp);
	} else
		error = 0;

	/*
	 * Do the setattr.  (va_mask may be empty if the only requested
	 * change was the size and VOP_SPACE already handled it.)
	 */
	if (!error && va.va_mask) {
		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
	}

	/*
	 * check if the monitor on either vop_space or vop_setattr detected
	 * a delegation conflict and if so, mark the thread flag as
	 * wouldblock so that the response is dropped and the client will
	 * try again.
	 */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (!error) {
		/* Re-fetch the attributes so the reply reflects the result. */
		va.va_mask = AT_ALL;	/* get everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

	ct.cc_flags = 0;

	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);

	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}
/*
 * Return the file handle from the SETATTR arguments (used by the common
 * dispatch code to locate the export for this request).
 */
void *
rfs_setattr_getfh(struct nfssaargs *args)
{
	return (&args->saa_fh);
}
362 362
/*
 * Cross a mount point encountered during a lookup.
 *
 * *vpp and *exip are replaced (and the old references released) only
 * when the covering filesystem is exported with "nohide".  If it is not
 * exported, or lacks "nohide", this still returns 0 but leaves the
 * caller's *vpp/*exip untouched.  A non-zero return (traverse/VOP_FID
 * failure) also leaves them untouched.
 */
int
rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
{
	struct exportinfo *exi;
	vnode_t *vp = *vpp;
	fid_t fid;
	int error;

	/* Extra hold: traverse() consumes/replaces its vnode argument. */
	VN_HOLD(vp);

	if ((error = traverse(&vp)) != 0) {
		VN_RELE(vp);
		return (error);
	}

	bzero(&fid, sizeof (fid));
	fid.fid_len = MAXFIDSZ;
	error = VOP_FID(vp, &fid, NULL);
	if (error) {
		VN_RELE(vp);
		return (error);
	}

	/* checkexport returns a held exportinfo (or NULL). */
	exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);
	if (exi == NULL ||
	    (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
		/*
		 * It is not an error: the subdir is simply not exported,
		 * or "nohide" is not set.  Drop our temporary references
		 * and keep the caller's vnode/export as they were.
		 */
		if (exi != NULL)
			exi_rele(exi);
		VN_RELE(vp);
	} else {
		/* go to submount: swap in the new export and vnode */
		exi_rele(*exip);
		*exip = exi;

		VN_RELE(*vpp);
		*vpp = vp;
	}

	return (0);
}
408 408
/*
 * Given mounted "dvp" and "exi", go to the upper (covering) mount point
 * with dvp/exi correction.
 * Returns 0 on success; on failure (-1) the caller's dvp/exi are left
 * unchanged.
 */
int
rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
{
	struct exportinfo *exi;
	vnode_t *dvp = *dvpp;

	ASSERT3P((*exip)->exi_zone, ==, curzone);
	ASSERT((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp));

	VN_HOLD(dvp);
	/* step up to the vnode covered by this mount */
	dvp = untraverse(dvp);
	exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
	if (exi == NULL) {
		VN_RELE(dvp);
		return (-1);
	}

	ASSERT3P(exi->exi_zone, ==, curzone);
	/* swap in the new export/vnode, releasing the old references */
	exi_rele(*exip);
	*exip = exi;
	VN_RELE(*dvpp);
	*dvpp = dvp;

	return (0);
}
/*
 * Directory lookup.
 * Returns an fhandle and file attributes for file name in a directory.
 *
 * Reference counting: we take our own hold on 'exi' (exi_hold) because
 * crossing mount points may swap it for a different export; the 'out:'
 * path releases whichever export we end up holding, if any.
 */
/* ARGSUSED */
void
rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *dvp;
	vnode_t *vp;
	struct vattr va;
	fhandle_t *fhp = da->da_fhandle;
	struct sec_ol sec = {0, 0};
	bool_t publicfh_flag = FALSE, auth_weak = FALSE;
	char *name;
	struct sockaddr *ca;

	/*
	 * Trusted Extension doesn't support NFSv2. MOUNT
	 * will reject v2 clients. Need to prevent v2 client
	 * access via WebNFS here.
	 */
	if (is_system_labeled() && req->rq_vers == 2) {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	/*
	 * Allow lookups from the root - the default
	 * location of the public filehandle.
	 */
	if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
		dvp = ZONE_ROOTVP();
		VN_HOLD(dvp);
	} else {
		dvp = nfs_fhtovp(fhp, exi);
		if (dvp == NULL) {
			dr->dr_status = NFSERR_STALE;
			return;
		}
	}

	/* Our own hold; 'out:' releases whatever export we hold by then. */
	exi_hold(exi);
	ASSERT3P(exi->exi_zone, ==, curzone);

	/*
	 * Do not allow lookup beyond root.
	 * If the filehandle matches a filehandle of the exi,
	 * then the ".." refers beyond the root of an exported filesystem.
	 */
	if (strcmp(da->da_name, "..") == 0 &&
	    EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
		if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
		    ((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp))) {
			/*
			 * special case for ".." and 'nohide' exported root
			 */
			if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
				/*
				 * NOTE(review): 'error' carries NFSERR_*
				 * values here which are fed to puterrno()
				 * at 'out:' — relies on those values
				 * matching the corresponding errnos;
				 * confirm.
				 */
				error = NFSERR_ACCES;
				goto out;
			}
		} else {
			error = NFSERR_NOENT;
			goto out;
		}
	}

	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
	    MAXPATHLEN);

	if (name == NULL) {
		error = NFSERR_ACCES;
		goto out;
	}

	/*
	 * If the public filehandle is used then allow
	 * a multi-component lookup, i.e. evaluate
	 * a pathname and follow symbolic links if
	 * necessary.
	 *
	 * This may result in a vnode in another filesystem
	 * which is OK as long as the filesystem is exported.
	 */
	if (PUBLIC_FH2(fhp)) {
		publicfh_flag = TRUE;

		/*
		 * Drop our hold and clear the now-stale pointer:
		 * rfs_publicfh_mclookup() supplies a new 'exi' only on
		 * success, and the 'out:' path must not call exi_rele()
		 * on the export we just released.
		 */
		exi_rele(exi);
		exi = NULL;

		error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
		    &sec);
	} else {
		/*
		 * Do a normal single component lookup.
		 */
		error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
		    NULL, NULL, NULL);
	}

	/* nfscmd_convname may have returned a fresh buffer */
	if (name != da->da_name)
		kmem_free(name, MAXPATHLEN);

	if (error == 0 && vn_ismntpt(vp)) {
		error = rfs_cross_mnt(&vp, &exi);
		if (error)
			VN_RELE(vp);
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* we want everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				if (sec.sec_flags & SEC_QUERY)
					error = makefh_ol(&dr->dr_fhandle, exi,
					    sec.sec_index);
				else {
					error = makefh(&dr->dr_fhandle, vp,
					    exi);
					if (!error && publicfh_flag &&
					    !chk_clnt_sec(exi, req))
						auth_weak = TRUE;
				}
			}
		}
		VN_RELE(vp);
	}

out:
	VN_RELE(dvp);

	/* 'exi' may be NULL after a failed public-filehandle lookup */
	if (exi != NULL)
		exi_rele(exi);

	/*
	 * If it's public fh, no 0x81, and client's flavor is
	 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
	 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
	 */
	if (auth_weak)
		dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
	else
		dr->dr_status = puterrno(error);
}
/*
 * Return the directory file handle from the LOOKUP arguments (used by
 * the common dispatch code to locate the export for this request).
 */
void *
rfs_lookup_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}
604 605
/*
 * Read symbolic link.
 * Returns the string in the symbolic link at the given fhandle.
 */
/* ARGSUSED */
void
rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
    struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	struct vattr va;
	struct sockaddr *ca;
	char *name = NULL;
	int is_referral = 0;

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_STALE;
		return;
	}

	va.va_mask = AT_MODE;

	error = VOP_GETATTR(vp, &va, 0, cr, NULL);

	if (error) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = puterrno(error);
		return;
	}

	/* Refuse mandatory-lock files to avoid blocking service threads. */
	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_ACCES;
		return;
	}

	/* We lied about the object type for a referral */
	if (vn_is_nfs_reparse(vp, cr))
		is_referral = 1;

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. BUGID 1138002.
	 */
	if (vp->v_type != VLNK && !is_referral) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_NXIO;
		return;
	}

	/*
	 * Allocate data for pathname. This will be freed by rfs_rlfree.
	 */
	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	if (is_referral) {
		char *s;
		size_t strsz;

		/* Get an artificial symlink based on a referral */
		s = build_symlink(vp, cr, &strsz);
		global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
		DTRACE_PROBE2(nfs2serv__func__referral__reflink,
		    vnode_t *, vp, char *, s);
		if (s == NULL)
			error = EINVAL;
		else {
			error = 0;
			(void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
			rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
			kmem_free(s, strsz);
		}

	} else {

		/*
		 * Set up io vector to read sym link data
		 */
		iov.iov_base = rl->rl_data;
		iov.iov_len = NFS_MAXPATHLEN;
		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_CACHED;
		uio.uio_loffset = (offset_t)0;
		uio.uio_resid = NFS_MAXPATHLEN;

		/*
		 * Do the readlink.
		 */
		error = VOP_READLINK(vp, &uio, cr, NULL);

		rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);

		/*
		 * NOTE(review): if the link completely fills the buffer
		 * (rl_count == NFS_MAXPATHLEN) this writes one byte past
		 * the NFS_MAXPATHLEN allocation — confirm whether
		 * VOP_READLINK can return a full buffer here.
		 */
		if (!error)
			rl->rl_data[rl->rl_count] = '\0';

	}


	VN_RELE(vp);

	/* Translate the link text to the client's character set if needed. */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	name = nfscmd_convname(ca, exi, rl->rl_data,
	    NFSCMD_CONV_OUTBOUND, MAXPATHLEN);

	if (name != NULL && name != rl->rl_data) {
		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
		rl->rl_data = name;
	}

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. UFS returns EINVAL if this is the case,
	 * so we do the mapping here. BUGID 1138002.
	 */
	if (error == EINVAL)
		rl->rl_status = NFSERR_NXIO;
	else
		rl->rl_status = puterrno(error);

}
/*
 * Return the file handle from the READLINK arguments (used by the common
 * dispatch code to locate the export for this request).
 */
void *
rfs_readlink_getfh(fhandle_t *fhp)
{
	return (fhp);
}
740 741 /*
741 742 * Free data allocated by rfs_readlink
742 743 */
743 744 void
744 745 rfs_rlfree(struct nfsrdlnres *rl)
745 746 {
746 747 if (rl->rl_data != NULL)
747 748 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
748 749 }
749 750
750 751 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
751 752
/*
 * Read data.
 * Returns some data read from the file at the given fhandle.
 *
 * Lock ordering: the nbmand critical region is entered before VOP_RWLOCK
 * (see comment below); every exit path must undo both in reverse order.
 */
/* ARGSUSED */
void
rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	vnode_t *vp;
	int error;
	struct vattr va;
	struct iovec iov;
	struct uio uio;
	mblk_t *mp;
	int alloc_err = 0;
	int in_crit = 0;
	caller_context_t ct;

	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
	if (vp == NULL) {
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_STALE;
		return;
	}

	if (vp->v_type != VREG) {
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ISDIR;
		return;
	}

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * Enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with write requests.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
		    0, NULL)) {
			nbl_end_crit(vp);
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = NFSERR_ACCES;
			return;
		}
		in_crit = 1;
	}

	error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		if (in_crit)
			nbl_end_crit(vp);
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;

		rr->rr_data = NULL;
		return;
	}

	va.va_mask = AT_ALL;

	error = VOP_GETATTR(vp, &va, 0, cr, &ct);

	if (error) {
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);

		return;
	}

	/*
	 * This is a kludge to allow reading of files created
	 * with no read permission. The owner of the file
	 * is always allowed to read it.
	 */
	if (crgetuid(cr) != va.va_uid) {
		error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);

		if (error) {
			/*
			 * Exec is the same as read over the net because
			 * of demand loading.
			 */
			error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
		}
		if (error) {
			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = puterrno(error);

			return;
		}
	}

	/* Refuse mandatory-lock files to avoid blocking service threads. */
	if (MANDLOCK(vp, va.va_mode)) {
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ACCES;

		return;
	}

	rr->rr_ok.rrok_wlist_len = 0;
	rr->rr_ok.rrok_wlist = NULL;

	/* Read starting at or past EOF: success with zero bytes. */
	if ((u_offset_t)ra->ra_offset >= va.va_size) {
		rr->rr_count = 0;
		rr->rr_data = NULL;
		/*
		 * In this case, status is NFS_OK, but there is no data
		 * to encode. So set rr_mp to NULL.
		 */
		rr->rr_mp = NULL;
		rr->rr_ok.rrok_wlist = ra->ra_wlist;
		if (rr->rr_ok.rrok_wlist)
			clist_zero_len(rr->rr_ok.rrok_wlist);
		goto done;
	}

	if (ra->ra_wlist) {
		/* RDMA: read directly into the client-provided write chunk. */
		mp = NULL;
		rr->rr_mp = NULL;
		(void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
		if (ra->ra_count > iov.iov_len) {
			/*
			 * NOTE(review): the 'done' path below recomputes
			 * rr_status from vattr_to_nattr(), which appears
			 * to clobber this NFSERR_INVAL — confirm intent.
			 */
			rr->rr_data = NULL;
			rr->rr_status = NFSERR_INVAL;
			goto done;
		}
	} else {
		/*
		 * mp will contain the data to be sent out in the read reply.
		 * This will be freed after the reply has been sent out (by the
		 * driver).
		 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
		 * that the call to xdrmblk_putmblk() never fails.
		 */
		mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
		    &alloc_err);
		ASSERT(mp != NULL);
		ASSERT(alloc_err == 0);

		rr->rr_mp = mp;

		/*
		 * Set up io vector
		 */
		iov.iov_base = (caddr_t)mp->b_datap->db_base;
		iov.iov_len = ra->ra_count;
	}

	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)ra->ra_offset;
	uio.uio_resid = ra->ra_count;

	error = VOP_READ(vp, &uio, 0, cr, &ct);

	if (error) {
		if (mp)
			freeb(mp);

		/*
		 * check if a monitor detected a delegation conflict and
		 * mark as wouldblock so response is dropped
		 */
		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
			curthread->t_flag |= T_WOULDBLOCK;
		else
			rr->rr_status = puterrno(error);

		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;

		return;
	}

	/*
	 * Get attributes again so we can send the latest access
	 * time to the client side for its cache.
	 */
	va.va_mask = AT_ALL;

	error = VOP_GETATTR(vp, &va, 0, cr, &ct);

	if (error) {
		if (mp)
			freeb(mp);

		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);

		return;
	}

	/* Number of bytes actually read. */
	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);

	if (mp) {
		rr->rr_data = (char *)mp->b_datap->db_base;
	} else {
		if (ra->ra_wlist) {
			rr->rr_data = (caddr_t)iov.iov_base;
			if (!rdma_setup_read_data2(ra, rr)) {
				rr->rr_data = NULL;
				rr->rr_status = puterrno(NFSERR_INVAL);
			}
		}
	}
done:
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
	if (in_crit)
		nbl_end_crit(vp);

	acl_perm(vp, exi, &va, cr);

	/* check for overflows */
	error = vattr_to_nattr(&va, &rr->rr_attr);

	VN_RELE(vp);

	rr->rr_status = puterrno(error);
}
1006 1007
1007 1008 /*
1008 1009 * Free data allocated by rfs_read
1009 1010 */
1010 1011 void
1011 1012 rfs_rdfree(struct nfsrdresult *rr)
1012 1013 {
1013 1014 mblk_t *mp;
1014 1015
1015 1016 if (rr->rr_status == NFS_OK) {
1016 1017 mp = rr->rr_mp;
1017 1018 if (mp != NULL)
1018 1019 freeb(mp);
1019 1020 }
1020 1021 }
1021 1022
/*
 * Return the file handle from the READ arguments (used by the common
 * dispatch code to locate the export for this request).
 */
void *
rfs_read_getfh(struct nfsreadargs *ra)
{
	return (&ra->ra_fhandle);
}
1027 1028
/* Size of the on-stack iovec array used by rfs_write_sync. */
#define	MAX_IOVECS	12

#ifdef DEBUG
/* Counters: how often the on-stack iovec array sufficed vs. kmem_alloc. */
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif
1034 1035
1035 1036 /*
1036 1037 * Write data to file.
1037 1038 * Returns attributes of a file after writing some data to it.
1038 1039 *
1039 1040 * Any changes made here, especially in error handling might have
1040 1041 * to also be done in rfs_write (which clusters write requests).
1041 1042 */
1042 1043 /* ARGSUSED */
1043 1044 void
1044 1045 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
1045 1046 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1046 1047 {
1047 1048 int error;
1048 1049 vnode_t *vp;
1049 1050 rlim64_t rlimit;
1050 1051 struct vattr va;
1051 1052 struct uio uio;
1052 1053 struct iovec iov[MAX_IOVECS];
1053 1054 mblk_t *m;
1054 1055 struct iovec *iovp;
1055 1056 int iovcnt;
1056 1057 cred_t *savecred;
1057 1058 int in_crit = 0;
1058 1059 caller_context_t ct;
1059 1060
1060 1061 vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1061 1062 if (vp == NULL) {
1062 1063 ns->ns_status = NFSERR_STALE;
1063 1064 return;
1064 1065 }
1065 1066
1066 1067 if (rdonly(ro, vp)) {
1067 1068 VN_RELE(vp);
1068 1069 ns->ns_status = NFSERR_ROFS;
1069 1070 return;
1070 1071 }
1071 1072
1072 1073 if (vp->v_type != VREG) {
1073 1074 VN_RELE(vp);
1074 1075 ns->ns_status = NFSERR_ISDIR;
1075 1076 return;
1076 1077 }
1077 1078
1078 1079 ct.cc_sysid = 0;
1079 1080 ct.cc_pid = 0;
1080 1081 ct.cc_caller_id = nfs2_srv_caller_id;
1081 1082 ct.cc_flags = CC_DONTBLOCK;
1082 1083
1083 1084 va.va_mask = AT_UID|AT_MODE;
1084 1085
1085 1086 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1086 1087
1087 1088 if (error) {
1088 1089 VN_RELE(vp);
1089 1090 ns->ns_status = puterrno(error);
1090 1091
1091 1092 return;
1092 1093 }
1093 1094
1094 1095 if (crgetuid(cr) != va.va_uid) {
1095 1096 /*
1096 1097 * This is a kludge to allow writes of files created
1097 1098 * with read only permission. The owner of the file
1098 1099 * is always allowed to write it.
1099 1100 */
1100 1101 error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
1101 1102
1102 1103 if (error) {
1103 1104 VN_RELE(vp);
1104 1105 ns->ns_status = puterrno(error);
1105 1106 return;
1106 1107 }
1107 1108 }
1108 1109
1109 1110 /*
1110 1111 * Can't access a mandatory lock file. This might cause
1111 1112 * the NFS service thread to block forever waiting for a
1112 1113 * lock to be released that will never be released.
1113 1114 */
1114 1115 if (MANDLOCK(vp, va.va_mode)) {
1115 1116 VN_RELE(vp);
1116 1117 ns->ns_status = NFSERR_ACCES;
1117 1118 return;
1118 1119 }
1119 1120
1120 1121 /*
1121 1122 * We have to enter the critical region before calling VOP_RWLOCK
1122 1123 * to avoid a deadlock with ufs.
1123 1124 */
1124 1125 if (nbl_need_check(vp)) {
1125 1126 nbl_start_crit(vp, RW_READER);
1126 1127 in_crit = 1;
1127 1128 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1128 1129 wa->wa_count, 0, NULL)) {
1129 1130 error = EACCES;
1130 1131 goto out;
1131 1132 }
1132 1133 }
1133 1134
1134 1135 error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1135 1136
1136 1137 /* check if a monitor detected a delegation conflict */
1137 1138 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1138 1139 goto out;
1139 1140 }
1140 1141
1141 1142 if (wa->wa_data || wa->wa_rlist) {
1142 1143 /* Do the RDMA thing if necessary */
1143 1144 if (wa->wa_rlist) {
1144 1145 iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1145 1146 iov[0].iov_len = wa->wa_count;
1146 1147 } else {
1147 1148 iov[0].iov_base = wa->wa_data;
1148 1149 iov[0].iov_len = wa->wa_count;
1149 1150 }
1150 1151 uio.uio_iov = iov;
1151 1152 uio.uio_iovcnt = 1;
1152 1153 uio.uio_segflg = UIO_SYSSPACE;
1153 1154 uio.uio_extflg = UIO_COPY_DEFAULT;
1154 1155 uio.uio_loffset = (offset_t)wa->wa_offset;
1155 1156 uio.uio_resid = wa->wa_count;
1156 1157 /*
1157 1158 * The limit is checked on the client. We
1158 1159 * should allow any size writes here.
1159 1160 */
1160 1161 uio.uio_llimit = curproc->p_fsz_ctl;
1161 1162 rlimit = uio.uio_llimit - wa->wa_offset;
1162 1163 if (rlimit < (rlim64_t)uio.uio_resid)
1163 1164 uio.uio_resid = (uint_t)rlimit;
1164 1165
1165 1166 /*
1166 1167 * for now we assume no append mode
1167 1168 */
1168 1169 /*
1169 1170 * We're changing creds because VM may fault and we need
1170 1171 * the cred of the current thread to be used if quota
1171 1172 * checking is enabled.
1172 1173 */
1173 1174 savecred = curthread->t_cred;
1174 1175 curthread->t_cred = cr;
1175 1176 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1176 1177 curthread->t_cred = savecred;
1177 1178 } else {
1178 1179
1179 1180 iovcnt = 0;
1180 1181 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1181 1182 iovcnt++;
1182 1183 if (iovcnt <= MAX_IOVECS) {
1183 1184 #ifdef DEBUG
1184 1185 rfs_write_sync_hits++;
1185 1186 #endif
1186 1187 iovp = iov;
1187 1188 } else {
1188 1189 #ifdef DEBUG
1189 1190 rfs_write_sync_misses++;
1190 1191 #endif
1191 1192 iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1192 1193 }
1193 1194 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1194 1195 uio.uio_iov = iovp;
1195 1196 uio.uio_iovcnt = iovcnt;
1196 1197 uio.uio_segflg = UIO_SYSSPACE;
1197 1198 uio.uio_extflg = UIO_COPY_DEFAULT;
1198 1199 uio.uio_loffset = (offset_t)wa->wa_offset;
1199 1200 uio.uio_resid = wa->wa_count;
1200 1201 /*
1201 1202 * The limit is checked on the client. We
1202 1203 * should allow any size writes here.
1203 1204 */
1204 1205 uio.uio_llimit = curproc->p_fsz_ctl;
1205 1206 rlimit = uio.uio_llimit - wa->wa_offset;
1206 1207 if (rlimit < (rlim64_t)uio.uio_resid)
1207 1208 uio.uio_resid = (uint_t)rlimit;
1208 1209
1209 1210 /*
1210 1211 * For now we assume no append mode.
1211 1212 */
1212 1213 /*
1213 1214 * We're changing creds because VM may fault and we need
1214 1215 * the cred of the current thread to be used if quota
1215 1216 * checking is enabled.
1216 1217 */
1217 1218 savecred = curthread->t_cred;
1218 1219 curthread->t_cred = cr;
1219 1220 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1220 1221 curthread->t_cred = savecred;
1221 1222
1222 1223 if (iovp != iov)
1223 1224 kmem_free(iovp, sizeof (*iovp) * iovcnt);
1224 1225 }
1225 1226
1226 1227 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1227 1228
1228 1229 if (!error) {
1229 1230 /*
1230 1231 * Get attributes again so we send the latest mod
1231 1232 * time to the client side for its cache.
1232 1233 */
1233 1234 va.va_mask = AT_ALL; /* now we want everything */
1234 1235
1235 1236 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1236 1237
1237 1238 /* check for overflows */
1238 1239 if (!error) {
1239 1240 acl_perm(vp, exi, &va, cr);
1240 1241 error = vattr_to_nattr(&va, &ns->ns_attr);
1241 1242 }
1242 1243 }
1243 1244
1244 1245 out:
1245 1246 if (in_crit)
1246 1247 nbl_end_crit(vp);
1247 1248 VN_RELE(vp);
1248 1249
1249 1250 /* check if a monitor detected a delegation conflict */
1250 1251 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1251 1252 /* mark as wouldblock so response is dropped */
1252 1253 curthread->t_flag |= T_WOULDBLOCK;
1253 1254 else
1254 1255 ns->ns_status = puterrno(error);
1255 1256
1256 1257 }
1257 1258
/*
 * One queued WRITE request within a write cluster.  Each NFS service
 * thread that joins a cluster contributes one of these (stack-allocated
 * in rfs_write()) and then sleeps until the cluster owner fills in its
 * ns_status.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* client's write arguments */
	struct nfsattrstat *ns;		/* response; ns_status signals done */
	struct svc_req *req;		/* RPC request for this write */
	cred_t *cr;			/* credentials of the requester */
	bool_t ro;			/* read-only flag for rdonly() check */
	kthread_t *thread;		/* service thread waiting on this entry */
	struct rfs_async_write *list;	/* next request, kept sorted by offset */
};
1267 1268
/*
 * A write cluster: the set of pending WRITE requests for a single file,
 * identified by file handle.  Clusters are linked off the server state
 * returned by nfs_get_srv() and protected by its async_write_lock.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle common to the cluster */
	kcondvar_t cv;			/* broadcast when statuses are filled in */
	struct rfs_async_write *list;	/* queued requests, sorted by offset */
	struct rfs_async_write_list *next;	/* next cluster on the list */
};
1274 1275
/*
 * NOTE(review): these three file-scope variables look superseded by the
 * per-server state used in rfs_write() below (nsrv->async_write_head,
 * nsrv->async_write_lock, nsrv->write_async); confirm there are no
 * remaining references elsewhere before removing them.
 */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

/* Max iovecs gathered into a single coalesced VOP_WRITE of a cluster. */
#define	MAXCLIOVECS	42
/* Sentinel meaning "not yet processed"; 0 would read as NFS_OK. */
#define	RFSWRITE_INITVAL	(enum nfsstat) -1

#ifdef DEBUG
static int rfs_write_hits = 0;		/* clusters fitting the stack iovec array */
static int rfs_write_misses = 0;	/* clusters needing a kmem_alloc'd array */
#endif
1286 1287
/*
 * Write data to file.
 * Returns attributes of a file after writing some data to it.
 *
 * Writes to the same file are clustered: the first thread to arrive
 * creates a cluster and blocks in VOP_RWLOCK(); threads arriving for
 * the same file handle while it waits queue their requests (sorted by
 * offset) on that cluster and sleep until the owner fills in their
 * ns_status and broadcasts the cluster's cv.
 */
void
rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;
	rlim64_t rlimit;
	struct vattr va;
	struct uio uio;
	struct rfs_async_write_list *lp;
	struct rfs_async_write_list *nlp;
	struct rfs_async_write *rp;
	struct rfs_async_write *nrp;
	struct rfs_async_write *trp;
	struct rfs_async_write *lrp;
	int data_written;
	int iovcnt;
	mblk_t *m;
	struct iovec *iovp;
	struct iovec *niovp;
	struct iovec iov[MAXCLIOVECS];
	int count;
	int rcount;
	uint_t off;
	uint_t len;
	struct rfs_async_write nrpsp;
	struct rfs_async_write_list nlpsp;
	ushort_t t_flag;
	cred_t *savecred;
	int in_crit = 0;
	caller_context_t ct;
	nfs_srv_t *nsrv;

	ASSERT(exi == NULL || exi->exi_zoneid == curzone->zone_id);
	nsrv = nfs_get_srv();
	/* Clustering disabled: fall through to the simple synchronous path. */
	if (!nsrv->write_async) {
		rfs_write_sync(wa, ns, exi, req, cr, ro);
		return;
	}

	/*
	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
	 * is considered an OK.
	 */
	ns->ns_status = RFSWRITE_INITVAL;

	nrp = &nrpsp;
	nrp->wa = wa;
	nrp->ns = ns;
	nrp->req = req;
	nrp->cr = cr;
	nrp->ro = ro;
	nrp->thread = curthread;

	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);

	/*
	 * Look to see if there is already a cluster started
	 * for this file.
	 */
	mutex_enter(&nsrv->async_write_lock);
	for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
		if (bcmp(&wa->wa_fhandle, lp->fhp,
		    sizeof (fhandle_t)) == 0)
			break;
	}

	/*
	 * If lp is non-NULL, then there is already a cluster
	 * started.  We need to place ourselves in the cluster
	 * list in the right place as determined by starting
	 * offset.  Conflicts with non-blocking mandatory locked
	 * regions will be checked when the cluster is processed.
	 */
	if (lp != NULL) {
		rp = lp->list;
		trp = NULL;
		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
			trp = rp;
			rp = rp->list;
		}
		nrp->list = rp;
		if (trp == NULL)
			lp->list = nrp;
		else
			trp->list = nrp;
		/* Sleep until the cluster owner has filled in our status. */
		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
			cv_wait(&lp->cv, &nsrv->async_write_lock);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * No cluster started yet, start one and add ourselves
	 * to the list of clusters.
	 */
	nrp->list = NULL;

	nlp = &nlpsp;
	nlp->fhp = &wa->wa_fhandle;
	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
	nlp->list = nrp;
	nlp->next = NULL;

	if (nsrv->async_write_head == NULL) {
		nsrv->async_write_head = nlp;
	} else {
		lp = nsrv->async_write_head;
		while (lp->next != NULL)
			lp = lp->next;
		lp->next = nlp;
	}
	mutex_exit(&nsrv->async_write_lock);

	/*
	 * Convert the file handle common to all of the requests
	 * in this cluster to a vnode.
	 */
	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
	if (vp == NULL) {
		/* Unlink our cluster and fail every queued request. */
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_STALE;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Can only write regular files.  Attempts to write any
	 * other file types fail with EISDIR.
	 */
	if (vp->v_type != VREG) {
		VN_RELE(vp);
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_ISDIR;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
	 * deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
	}

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * Lock the file for writing.  This operation provides
	 * the delay which allows clusters to grow.
	 */
	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		if (in_crit)
			nbl_end_crit(vp);
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			if (rp->ns->ns_status == RFSWRITE_INITVAL) {
				rp->ns->ns_status = puterrno(error);
				rp->thread->t_flag |= T_WOULDBLOCK;
			}
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Disconnect this cluster from the list of clusters.
	 * The cluster that is being dealt with must be fixed
	 * in size after this point, so there is no reason
	 * to leave it on the list so that new requests can
	 * find it.
	 *
	 * The algorithm is that the first write request will
	 * create a cluster, convert the file handle to a
	 * vnode pointer, and then lock the file for writing.
	 * This request is not likely to be clustered with
	 * any others.  However, the next request will create
	 * a new cluster and be blocked in VOP_RWLOCK while
	 * the first request is being processed.  This delay
	 * will allow more requests to be clustered in this
	 * second cluster.
	 */
	mutex_enter(&nsrv->async_write_lock);
	if (nsrv->async_write_head == nlp)
		nsrv->async_write_head = nlp->next;
	else {
		lp = nsrv->async_write_head;
		while (lp->next != nlp)
			lp = lp->next;
		lp->next = nlp->next;
	}
	mutex_exit(&nsrv->async_write_lock);

	/*
	 * Step through the list of requests in this cluster.
	 * We need to check permissions to make sure that all
	 * of the requests have sufficient permission to write
	 * the file.  A cluster can be composed of requests
	 * from different clients and different users on each
	 * client.
	 *
	 * As a side effect, we also calculate the size of the
	 * byte range that this cluster encompasses.
	 */
	rp = nlp->list;
	off = rp->wa->wa_offset;
	len = (uint_t)0;
	do {
		if (rdonly(rp->ro, vp)) {
			rp->ns->ns_status = NFSERR_ROFS;
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}

		va.va_mask = AT_UID|AT_MODE;

		error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);

		if (!error) {
			if (crgetuid(rp->cr) != va.va_uid) {
				/*
				 * This is a kludge to allow writes of files
				 * created with read only permission.  The
				 * owner of the file is always allowed to
				 * write it.
				 */
				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
			}
			if (!error && MANDLOCK(vp, va.va_mode))
				error = EACCES;
		}

		/*
		 * Check for a conflict with a nbmand-locked region.
		 */
		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
		    rp->wa->wa_count, 0, NULL)) {
			error = EACCES;
		}

		if (error) {
			rp->ns->ns_status = puterrno(error);
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}
		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
			len = rp->wa->wa_offset + rp->wa->wa_count - off;
	} while ((rp = rp->list) != NULL);

	/*
	 * Step through the cluster attempting to gather as many
	 * requests which are contiguous as possible.  These
	 * contiguous requests are handled via one call to VOP_WRITE
	 * instead of different calls to VOP_WRITE.  We also keep
	 * track of the fact that any data was written.
	 */
	rp = nlp->list;
	data_written = 0;
	do {
		/*
		 * Skip any requests which are already marked as having an
		 * error.
		 */
		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
			rp = rp->list;
			continue;
		}

		/*
		 * Count the number of iovec's which are required
		 * to handle this set of requests.  One iovec is
		 * needed for each data buffer, whether addressed
		 * by wa_data or by the b_rptr pointers in the
		 * mblk chains.
		 */
		iovcnt = 0;
		lrp = rp;
		for (;;) {
			if (lrp->wa->wa_data || lrp->wa->wa_rlist)
				iovcnt++;
			else {
				m = lrp->wa->wa_mblk;
				while (m != NULL) {
					iovcnt++;
					m = m->b_cont;
				}
			}
			/*
			 * Stop gathering at the first request that is not
			 * pending, or that is not byte-contiguous with the
			 * one before it.
			 */
			if (lrp->list == NULL ||
			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
			    lrp->wa->wa_offset + lrp->wa->wa_count !=
			    lrp->list->wa->wa_offset) {
				lrp = lrp->list;
				break;
			}
			lrp = lrp->list;
		}

		if (iovcnt <= MAXCLIOVECS) {
#ifdef DEBUG
			rfs_write_hits++;
#endif
			niovp = iov;
		} else {
#ifdef DEBUG
			rfs_write_misses++;
#endif
			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
		}
		/*
		 * Put together the scatter/gather iovecs.
		 */
		iovp = niovp;
		trp = rp;
		count = 0;
		do {
			if (trp->wa->wa_data || trp->wa->wa_rlist) {
				if (trp->wa->wa_rlist) {
					iovp->iov_base =
					    (char *)((trp->wa->wa_rlist)->
					    u.c_daddr3);
					iovp->iov_len = trp->wa->wa_count;
				} else {
					iovp->iov_base = trp->wa->wa_data;
					iovp->iov_len = trp->wa->wa_count;
				}
				iovp++;
			} else {
				m = trp->wa->wa_mblk;
				rcount = trp->wa->wa_count;
				while (m != NULL) {
					iovp->iov_base = (caddr_t)m->b_rptr;
					iovp->iov_len = (m->b_wptr - m->b_rptr);
					rcount -= iovp->iov_len;
					if (rcount < 0)
						iovp->iov_len += rcount;
					iovp++;
					if (rcount <= 0)
						break;
					m = m->b_cont;
				}
			}
			count += trp->wa->wa_count;
			trp = trp->list;
		} while (trp != lrp);

		uio.uio_iov = niovp;
		uio.uio_iovcnt = iovcnt;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
		uio.uio_resid = count;
		/*
		 * The limit is checked on the client.  We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - rp->wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * For now we assume no append mode.
		 */

		/*
		 * We're changing creds because VM may fault
		 * and we need the cred of the current
		 * thread to be used if quota checking is
		 * enabled.
		 *
		 * NOTE(review): t_cred is set to this thread's cr while
		 * the write itself uses the queued request's rp->cr —
		 * confirm the mismatch is intended.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
		curthread->t_cred = savecred;

		/* check if a monitor detected a delegation conflict */
		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
			/* mark as wouldblock so response is dropped */
			curthread->t_flag |= T_WOULDBLOCK;

		if (niovp != iov)
			kmem_free(niovp, sizeof (*niovp) * iovcnt);

		if (!error) {
			data_written = 1;
			/*
			 * Get attributes again so we send the latest mod
			 * time to the client side for its cache.
			 */
			va.va_mask = AT_ALL;	/* now we want everything */

			error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);

			if (!error)
				acl_perm(vp, exi, &va, rp->cr);
		}

		/*
		 * Fill in the status responses for each request
		 * which was just handled.  Also, copy the latest
		 * attributes in to the attribute responses if
		 * appropriate.
		 */
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		do {
			rp->thread->t_flag |= t_flag;
			/* check for overflows */
			if (!error) {
				error = vattr_to_nattr(&va, &rp->ns->ns_attr);
			}
			rp->ns->ns_status = puterrno(error);
			rp = rp->list;
		} while (rp != lrp);
	} while (rp != NULL);

	/*
	 * If any data was written at all, then we need to flush
	 * the data and metadata to stable storage.
	 */
	if (data_written) {
		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);

		if (!error) {
			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
		}
	}

	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);

	if (in_crit)
		nbl_end_crit(vp);
	VN_RELE(vp);

	/*
	 * Propagate the flush error (if any) to requests whose status
	 * was not already set above, then wake every waiting thread.
	 */
	t_flag = curthread->t_flag & T_WOULDBLOCK;
	mutex_enter(&nsrv->async_write_lock);
	for (rp = nlp->list; rp != NULL; rp = rp->list) {
		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
			rp->ns->ns_status = puterrno(error);
			rp->thread->t_flag |= t_flag;
		}
	}
	cv_broadcast(&nlp->cv);
	mutex_exit(&nsrv->async_write_lock);

}
1788 1789
1789 1790 void *
1790 1791 rfs_write_getfh(struct nfswriteargs *wa)
1791 1792 {
1792 1793 return (&wa->wa_fhandle);
1793 1794 }
1794 1795
1795 1796 /*
1796 1797 * Create a file.
1797 1798 * Creates a file with given attributes and returns those attributes
1798 1799 * and an fhandle for the new file.
1799 1800 */
1800 1801 void
1801 1802 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1802 1803 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1803 1804 {
1804 1805 int error;
1805 1806 int lookuperr;
1806 1807 int in_crit = 0;
1807 1808 struct vattr va;
1808 1809 vnode_t *vp;
1809 1810 vnode_t *realvp;
1810 1811 vnode_t *dvp;
1811 1812 char *name = args->ca_da.da_name;
1812 1813 vnode_t *tvp = NULL;
1813 1814 int mode;
1814 1815 int lookup_ok;
1815 1816 bool_t trunc;
1816 1817 struct sockaddr *ca;
1817 1818
1818 1819 /*
1819 1820 * Disallow NULL paths
1820 1821 */
1821 1822 if (name == NULL || *name == '\0') {
1822 1823 dr->dr_status = NFSERR_ACCES;
1823 1824 return;
1824 1825 }
1825 1826
1826 1827 dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1827 1828 if (dvp == NULL) {
1828 1829 dr->dr_status = NFSERR_STALE;
1829 1830 return;
1830 1831 }
1831 1832
1832 1833 error = sattr_to_vattr(args->ca_sa, &va);
1833 1834 if (error) {
1834 1835 dr->dr_status = puterrno(error);
1835 1836 return;
1836 1837 }
1837 1838
1838 1839 /*
1839 1840 * Must specify the mode.
1840 1841 */
1841 1842 if (!(va.va_mask & AT_MODE)) {
1842 1843 VN_RELE(dvp);
1843 1844 dr->dr_status = NFSERR_INVAL;
1844 1845 return;
1845 1846 }
1846 1847
1847 1848 /*
1848 1849 * This is a completely gross hack to make mknod
1849 1850 * work over the wire until we can wack the protocol
1850 1851 */
1851 1852 if ((va.va_mode & IFMT) == IFCHR) {
1852 1853 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1853 1854 va.va_type = VFIFO; /* xtra kludge for named pipe */
1854 1855 else {
1855 1856 va.va_type = VCHR;
1856 1857 /*
1857 1858 * uncompress the received dev_t
1858 1859 * if the top half is zero indicating a request
1859 1860 * from an `older style' OS.
1860 1861 */
1861 1862 if ((va.va_size & 0xffff0000) == 0)
1862 1863 va.va_rdev = nfsv2_expdev(va.va_size);
1863 1864 else
1864 1865 va.va_rdev = (dev_t)va.va_size;
1865 1866 }
1866 1867 va.va_mask &= ~AT_SIZE;
1867 1868 } else if ((va.va_mode & IFMT) == IFBLK) {
1868 1869 va.va_type = VBLK;
1869 1870 /*
1870 1871 * uncompress the received dev_t
1871 1872 * if the top half is zero indicating a request
1872 1873 * from an `older style' OS.
1873 1874 */
1874 1875 if ((va.va_size & 0xffff0000) == 0)
1875 1876 va.va_rdev = nfsv2_expdev(va.va_size);
1876 1877 else
1877 1878 va.va_rdev = (dev_t)va.va_size;
1878 1879 va.va_mask &= ~AT_SIZE;
1879 1880 } else if ((va.va_mode & IFMT) == IFSOCK) {
1880 1881 va.va_type = VSOCK;
1881 1882 } else {
1882 1883 va.va_type = VREG;
1883 1884 }
1884 1885 va.va_mode &= ~IFMT;
1885 1886 va.va_mask |= AT_TYPE;
1886 1887
1887 1888 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1888 1889 name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1889 1890 MAXPATHLEN);
1890 1891 if (name == NULL) {
1891 1892 dr->dr_status = puterrno(EINVAL);
1892 1893 return;
1893 1894 }
1894 1895
1895 1896 /*
1896 1897 * Why was the choice made to use VWRITE as the mode to the
1897 1898 * call to VOP_CREATE ? This results in a bug. When a client
1898 1899 * opens a file that already exists and is RDONLY, the second
1899 1900 * open fails with an EACESS because of the mode.
1900 1901 * bug ID 1054648.
1901 1902 */
1902 1903 lookup_ok = 0;
1903 1904 mode = VWRITE;
1904 1905 if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1905 1906 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1906 1907 NULL, NULL, NULL);
1907 1908 if (!error) {
1908 1909 struct vattr at;
1909 1910
1910 1911 lookup_ok = 1;
1911 1912 at.va_mask = AT_MODE;
1912 1913 error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1913 1914 if (!error)
1914 1915 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1915 1916 VN_RELE(tvp);
1916 1917 tvp = NULL;
1917 1918 }
1918 1919 }
1919 1920
1920 1921 if (!lookup_ok) {
1921 1922 if (rdonly(ro, dvp)) {
1922 1923 error = EROFS;
1923 1924 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1924 1925 va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1925 1926 error = EPERM;
1926 1927 } else {
1927 1928 error = 0;
1928 1929 }
1929 1930 }
1930 1931
1931 1932 /*
1932 1933 * If file size is being modified on an already existing file
1933 1934 * make sure that there are no conflicting non-blocking mandatory
1934 1935 * locks in the region being manipulated. Return EACCES if there
1935 1936 * are conflicting locks.
1936 1937 */
1937 1938 if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1938 1939 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1939 1940 NULL, NULL, NULL);
1940 1941
1941 1942 if (!lookuperr &&
1942 1943 rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1943 1944 VN_RELE(tvp);
1944 1945 curthread->t_flag |= T_WOULDBLOCK;
1945 1946 goto out;
1946 1947 }
1947 1948
1948 1949 if (!lookuperr && nbl_need_check(tvp)) {
1949 1950 /*
1950 1951 * The file exists. Now check if it has any
1951 1952 * conflicting non-blocking mandatory locks
1952 1953 * in the region being changed.
1953 1954 */
1954 1955 struct vattr bva;
1955 1956 u_offset_t offset;
1956 1957 ssize_t length;
1957 1958
1958 1959 nbl_start_crit(tvp, RW_READER);
1959 1960 in_crit = 1;
1960 1961
1961 1962 bva.va_mask = AT_SIZE;
1962 1963 error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1963 1964 if (!error) {
1964 1965 if (va.va_size < bva.va_size) {
1965 1966 offset = va.va_size;
1966 1967 length = bva.va_size - va.va_size;
1967 1968 } else {
1968 1969 offset = bva.va_size;
1969 1970 length = va.va_size - bva.va_size;
1970 1971 }
1971 1972 if (length) {
1972 1973 if (nbl_conflict(tvp, NBL_WRITE,
1973 1974 offset, length, 0, NULL)) {
1974 1975 error = EACCES;
1975 1976 }
1976 1977 }
1977 1978 }
1978 1979 if (error) {
1979 1980 nbl_end_crit(tvp);
1980 1981 VN_RELE(tvp);
1981 1982 in_crit = 0;
1982 1983 }
1983 1984 } else if (tvp != NULL) {
1984 1985 VN_RELE(tvp);
1985 1986 }
1986 1987 }
1987 1988
1988 1989 if (!error) {
1989 1990 /*
1990 1991 * If filesystem is shared with nosuid the remove any
1991 1992 * setuid/setgid bits on create.
1992 1993 */
1993 1994 if (va.va_type == VREG &&
1994 1995 exi->exi_export.ex_flags & EX_NOSUID)
1995 1996 va.va_mode &= ~(VSUID | VSGID);
1996 1997
1997 1998 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1998 1999 NULL, NULL);
1999 2000
2000 2001 if (!error) {
2001 2002
2002 2003 if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
2003 2004 trunc = TRUE;
2004 2005 else
2005 2006 trunc = FALSE;
2006 2007
2007 2008 if (rfs4_check_delegated(FWRITE, vp, trunc)) {
2008 2009 VN_RELE(vp);
2009 2010 curthread->t_flag |= T_WOULDBLOCK;
2010 2011 goto out;
2011 2012 }
2012 2013 va.va_mask = AT_ALL;
2013 2014
2014 2015 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
2015 2016
2016 2017 /* check for overflows */
2017 2018 if (!error) {
2018 2019 acl_perm(vp, exi, &va, cr);
2019 2020 error = vattr_to_nattr(&va, &dr->dr_attr);
2020 2021 if (!error) {
2021 2022 error = makefh(&dr->dr_fhandle, vp,
2022 2023 exi);
2023 2024 }
2024 2025 }
2025 2026 /*
2026 2027 * Force modified metadata out to stable storage.
2027 2028 *
2028 2029 * if a underlying vp exists, pass it to VOP_FSYNC
2029 2030 */
2030 2031 if (VOP_REALVP(vp, &realvp, NULL) == 0)
2031 2032 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
2032 2033 else
2033 2034 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2034 2035 VN_RELE(vp);
2035 2036 }
2036 2037
2037 2038 if (in_crit) {
2038 2039 nbl_end_crit(tvp);
2039 2040 VN_RELE(tvp);
2040 2041 }
2041 2042 }
2042 2043
2043 2044 /*
2044 2045 * Force modified data and metadata out to stable storage.
2045 2046 */
2046 2047 (void) VOP_FSYNC(dvp, 0, cr, NULL);
2047 2048
2048 2049 out:
2049 2050
2050 2051 VN_RELE(dvp);
2051 2052
2052 2053 dr->dr_status = puterrno(error);
2053 2054
2054 2055 if (name != args->ca_da.da_name)
2055 2056 kmem_free(name, MAXPATHLEN);
2056 2057 }
2057 2058 void *
2058 2059 rfs_create_getfh(struct nfscreatargs *args)
2059 2060 {
2060 2061 return (args->ca_da.da_fhandle);
2061 2062 }
2062 2063
2063 2064 /*
2064 2065 * Remove a file.
2065 2066 * Remove named file from parent directory.
2066 2067 */
2067 2068 /* ARGSUSED */
2068 2069 void
2069 2070 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
2070 2071 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2071 2072 {
2072 2073 int error = 0;
2073 2074 vnode_t *vp;
2074 2075 vnode_t *targvp;
2075 2076 int in_crit = 0;
2076 2077
2077 2078 /*
2078 2079 * Disallow NULL paths
2079 2080 */
2080 2081 if (da->da_name == NULL || *da->da_name == '\0') {
2081 2082 *status = NFSERR_ACCES;
2082 2083 return;
2083 2084 }
2084 2085
2085 2086 vp = nfs_fhtovp(da->da_fhandle, exi);
2086 2087 if (vp == NULL) {
2087 2088 *status = NFSERR_STALE;
2088 2089 return;
2089 2090 }
2090 2091
2091 2092 if (rdonly(ro, vp)) {
2092 2093 VN_RELE(vp);
2093 2094 *status = NFSERR_ROFS;
2094 2095 return;
2095 2096 }
2096 2097
2097 2098 /*
2098 2099 * Check for a conflict with a non-blocking mandatory share reservation.
2099 2100 */
2100 2101 error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
2101 2102 NULL, cr, NULL, NULL, NULL);
2102 2103 if (error != 0) {
2103 2104 VN_RELE(vp);
2104 2105 *status = puterrno(error);
2105 2106 return;
2106 2107 }
2107 2108
2108 2109 /*
2109 2110 * If the file is delegated to an v4 client, then initiate
2110 2111 * recall and drop this request (by setting T_WOULDBLOCK).
2111 2112 * The client will eventually re-transmit the request and
2112 2113 * (hopefully), by then, the v4 client will have returned
2113 2114 * the delegation.
2114 2115 */
2115 2116
2116 2117 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2117 2118 VN_RELE(vp);
2118 2119 VN_RELE(targvp);
2119 2120 curthread->t_flag |= T_WOULDBLOCK;
2120 2121 return;
2121 2122 }
2122 2123
2123 2124 if (nbl_need_check(targvp)) {
2124 2125 nbl_start_crit(targvp, RW_READER);
2125 2126 in_crit = 1;
2126 2127 if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
2127 2128 error = EACCES;
2128 2129 goto out;
2129 2130 }
2130 2131 }
2131 2132
2132 2133 error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
2133 2134
2134 2135 /*
2135 2136 * Force modified data and metadata out to stable storage.
2136 2137 */
2137 2138 (void) VOP_FSYNC(vp, 0, cr, NULL);
2138 2139
2139 2140 out:
2140 2141 if (in_crit)
2141 2142 nbl_end_crit(targvp);
2142 2143 VN_RELE(targvp);
2143 2144 VN_RELE(vp);
2144 2145
2145 2146 *status = puterrno(error);
2146 2147
2147 2148 }
2148 2149
2149 2150 void *
2150 2151 rfs_remove_getfh(struct nfsdiropargs *da)
2151 2152 {
2152 2153 return (da->da_fhandle);
2153 2154 }
2154 2155
2155 2156 /*
2156 2157 * rename a file
2157 2158 * Give a file (from) a new name (to).
2158 2159 */
2159 2160 /* ARGSUSED */
2160 2161 void
2161 2162 rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2162 2163 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2163 2164 {
2164 2165 int error = 0;
2165 2166 vnode_t *fromvp;
2166 2167 vnode_t *tovp;
2167 2168 struct exportinfo *to_exi;
2168 2169 fhandle_t *fh;
2169 2170 vnode_t *srcvp;
2170 2171 vnode_t *targvp;
2171 2172 int in_crit = 0;
2172 2173
2173 2174 fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2174 2175 if (fromvp == NULL) {
2175 2176 *status = NFSERR_STALE;
2176 2177 return;
2177 2178 }
2178 2179
2179 2180 fh = args->rna_to.da_fhandle;
2180 2181 to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2181 2182 if (to_exi == NULL) {
2182 2183 VN_RELE(fromvp);
2183 2184 *status = NFSERR_ACCES;
2184 2185 return;
2185 2186 }
2186 2187 exi_rele(to_exi);
2187 2188
2188 2189 if (to_exi != exi) {
2189 2190 VN_RELE(fromvp);
2190 2191 *status = NFSERR_XDEV;
2191 2192 return;
2192 2193 }
2193 2194
2194 2195 tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2195 2196 if (tovp == NULL) {
2196 2197 VN_RELE(fromvp);
2197 2198 *status = NFSERR_STALE;
2198 2199 return;
2199 2200 }
2200 2201
2201 2202 if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2202 2203 VN_RELE(tovp);
2203 2204 VN_RELE(fromvp);
2204 2205 *status = NFSERR_NOTDIR;
2205 2206 return;
2206 2207 }
2207 2208
2208 2209 /*
2209 2210 * Disallow NULL paths
2210 2211 */
2211 2212 if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2212 2213 args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2213 2214 VN_RELE(tovp);
2214 2215 VN_RELE(fromvp);
2215 2216 *status = NFSERR_ACCES;
2216 2217 return;
2217 2218 }
2218 2219
2219 2220 if (rdonly(ro, tovp)) {
2220 2221 VN_RELE(tovp);
2221 2222 VN_RELE(fromvp);
2222 2223 *status = NFSERR_ROFS;
2223 2224 return;
2224 2225 }
2225 2226
2226 2227 /*
2227 2228 * Check for a conflict with a non-blocking mandatory share reservation.
2228 2229 */
2229 2230 error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2230 2231 NULL, cr, NULL, NULL, NULL);
2231 2232 if (error != 0) {
2232 2233 VN_RELE(tovp);
2233 2234 VN_RELE(fromvp);
2234 2235 *status = puterrno(error);
2235 2236 return;
2236 2237 }
2237 2238
2238 2239 /* Check for delegations on the source file */
2239 2240
2240 2241 if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2241 2242 VN_RELE(tovp);
2242 2243 VN_RELE(fromvp);
2243 2244 VN_RELE(srcvp);
2244 2245 curthread->t_flag |= T_WOULDBLOCK;
2245 2246 return;
2246 2247 }
2247 2248
2248 2249 /* Check for delegation on the file being renamed over, if it exists */
2249 2250
2250 2251 if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
2251 2252 VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2252 2253 NULL, NULL, NULL) == 0) {
2253 2254
2254 2255 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2255 2256 VN_RELE(tovp);
2256 2257 VN_RELE(fromvp);
2257 2258 VN_RELE(srcvp);
2258 2259 VN_RELE(targvp);
2259 2260 curthread->t_flag |= T_WOULDBLOCK;
2260 2261 return;
2261 2262 }
2262 2263 VN_RELE(targvp);
2263 2264 }
2264 2265
2265 2266
2266 2267 if (nbl_need_check(srcvp)) {
2267 2268 nbl_start_crit(srcvp, RW_READER);
2268 2269 in_crit = 1;
2269 2270 if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2270 2271 error = EACCES;
2271 2272 goto out;
2272 2273 }
2273 2274 }
2274 2275
2275 2276 error = VOP_RENAME(fromvp, args->rna_from.da_name,
2276 2277 tovp, args->rna_to.da_name, cr, NULL, 0);
2277 2278
2278 2279 if (error == 0)
2279 2280 vn_renamepath(tovp, srcvp, args->rna_to.da_name,
2280 2281 strlen(args->rna_to.da_name));
2281 2282
2282 2283 /*
2283 2284 * Force modified data and metadata out to stable storage.
2284 2285 */
2285 2286 (void) VOP_FSYNC(tovp, 0, cr, NULL);
2286 2287 (void) VOP_FSYNC(fromvp, 0, cr, NULL);
2287 2288
2288 2289 out:
2289 2290 if (in_crit)
2290 2291 nbl_end_crit(srcvp);
2291 2292 VN_RELE(srcvp);
2292 2293 VN_RELE(tovp);
2293 2294 VN_RELE(fromvp);
2294 2295
2295 2296 *status = puterrno(error);
2296 2297
2297 2298 }
2298 2299 void *
2299 2300 rfs_rename_getfh(struct nfsrnmargs *args)
2300 2301 {
2301 2302 return (args->rna_from.da_fhandle);
2302 2303 }
2303 2304
2304 2305 /*
2305 2306 * Link to a file.
2306 2307 * Create a file (to) which is a hard link to the given file (from).
2307 2308 */
2308 2309 /* ARGSUSED */
2309 2310 void
2310 2311 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2311 2312 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2312 2313 {
2313 2314 int error;
2314 2315 vnode_t *fromvp;
2315 2316 vnode_t *tovp;
2316 2317 struct exportinfo *to_exi;
2317 2318 fhandle_t *fh;
2318 2319
2319 2320 fromvp = nfs_fhtovp(args->la_from, exi);
2320 2321 if (fromvp == NULL) {
2321 2322 *status = NFSERR_STALE;
2322 2323 return;
2323 2324 }
2324 2325
2325 2326 fh = args->la_to.da_fhandle;
2326 2327 to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2327 2328 if (to_exi == NULL) {
2328 2329 VN_RELE(fromvp);
2329 2330 *status = NFSERR_ACCES;
2330 2331 return;
2331 2332 }
2332 2333 exi_rele(to_exi);
2333 2334
2334 2335 if (to_exi != exi) {
2335 2336 VN_RELE(fromvp);
2336 2337 *status = NFSERR_XDEV;
2337 2338 return;
2338 2339 }
2339 2340
2340 2341 tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2341 2342 if (tovp == NULL) {
2342 2343 VN_RELE(fromvp);
2343 2344 *status = NFSERR_STALE;
2344 2345 return;
2345 2346 }
2346 2347
2347 2348 if (tovp->v_type != VDIR) {
2348 2349 VN_RELE(tovp);
2349 2350 VN_RELE(fromvp);
2350 2351 *status = NFSERR_NOTDIR;
2351 2352 return;
2352 2353 }
2353 2354 /*
2354 2355 * Disallow NULL paths
2355 2356 */
2356 2357 if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2357 2358 VN_RELE(tovp);
2358 2359 VN_RELE(fromvp);
2359 2360 *status = NFSERR_ACCES;
2360 2361 return;
2361 2362 }
2362 2363
2363 2364 if (rdonly(ro, tovp)) {
2364 2365 VN_RELE(tovp);
2365 2366 VN_RELE(fromvp);
2366 2367 *status = NFSERR_ROFS;
2367 2368 return;
2368 2369 }
2369 2370
2370 2371 error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2371 2372
2372 2373 /*
2373 2374 * Force modified data and metadata out to stable storage.
2374 2375 */
2375 2376 (void) VOP_FSYNC(tovp, 0, cr, NULL);
2376 2377 (void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2377 2378
2378 2379 VN_RELE(tovp);
2379 2380 VN_RELE(fromvp);
2380 2381
2381 2382 *status = puterrno(error);
2382 2383
2383 2384 }
2384 2385 void *
2385 2386 rfs_link_getfh(struct nfslinkargs *args)
2386 2387 {
2387 2388 return (args->la_from);
2388 2389 }
2389 2390
2390 2391 /*
2391 2392 * Symbolicly link to a file.
2392 2393 * Create a file (to) with the given attributes which is a symbolic link
2393 2394 * to the given path name (to).
2394 2395 */
2395 2396 void
2396 2397 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2397 2398 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2398 2399 {
2399 2400 int error;
2400 2401 struct vattr va;
2401 2402 vnode_t *vp;
2402 2403 vnode_t *svp;
2403 2404 int lerror;
2404 2405 struct sockaddr *ca;
2405 2406 char *name = NULL;
2406 2407
2407 2408 /*
2408 2409 * Disallow NULL paths
2409 2410 */
2410 2411 if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2411 2412 *status = NFSERR_ACCES;
2412 2413 return;
2413 2414 }
2414 2415
2415 2416 vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2416 2417 if (vp == NULL) {
2417 2418 *status = NFSERR_STALE;
2418 2419 return;
2419 2420 }
2420 2421
2421 2422 if (rdonly(ro, vp)) {
2422 2423 VN_RELE(vp);
2423 2424 *status = NFSERR_ROFS;
2424 2425 return;
2425 2426 }
2426 2427
2427 2428 error = sattr_to_vattr(args->sla_sa, &va);
2428 2429 if (error) {
2429 2430 VN_RELE(vp);
2430 2431 *status = puterrno(error);
2431 2432 return;
2432 2433 }
2433 2434
2434 2435 if (!(va.va_mask & AT_MODE)) {
2435 2436 VN_RELE(vp);
2436 2437 *status = NFSERR_INVAL;
2437 2438 return;
2438 2439 }
2439 2440
2440 2441 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2441 2442 name = nfscmd_convname(ca, exi, args->sla_tnm,
2442 2443 NFSCMD_CONV_INBOUND, MAXPATHLEN);
2443 2444
2444 2445 if (name == NULL) {
2445 2446 *status = NFSERR_ACCES;
2446 2447 return;
2447 2448 }
2448 2449
2449 2450 va.va_type = VLNK;
2450 2451 va.va_mask |= AT_TYPE;
2451 2452
2452 2453 error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2453 2454
2454 2455 /*
2455 2456 * Force new data and metadata out to stable storage.
2456 2457 */
2457 2458 lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2458 2459 NULL, cr, NULL, NULL, NULL);
2459 2460
2460 2461 if (!lerror) {
2461 2462 (void) VOP_FSYNC(svp, 0, cr, NULL);
2462 2463 VN_RELE(svp);
2463 2464 }
2464 2465
2465 2466 /*
2466 2467 * Force modified data and metadata out to stable storage.
2467 2468 */
2468 2469 (void) VOP_FSYNC(vp, 0, cr, NULL);
2469 2470
2470 2471 VN_RELE(vp);
2471 2472
2472 2473 *status = puterrno(error);
2473 2474 if (name != args->sla_tnm)
2474 2475 kmem_free(name, MAXPATHLEN);
2475 2476
2476 2477 }
2477 2478 void *
2478 2479 rfs_symlink_getfh(struct nfsslargs *args)
2479 2480 {
2480 2481 return (args->sla_from.da_fhandle);
2481 2482 }
2482 2483
/*
 * Make a directory.
 * Create a directory with the given name, parent directory, and attributes.
 * Returns a file handle and attributes for the new directory.
 */
/* ARGSUSED */
void
rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	struct vattr va;	/* attrs in from client, then new dir's attrs */
	vnode_t *dvp = NULL;	/* the newly created directory */
	vnode_t *vp;		/* the parent directory from the fhandle */
	char *name = args->ca_da.da_name;

	/*
	 * Disallow NULL paths
	 */
	if (name == NULL || *name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
	if (vp == NULL) {
		dr->dr_status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_ROFS;
		return;
	}

	error = sattr_to_vattr(args->ca_sa, &va);
	if (error) {
		VN_RELE(vp);
		dr->dr_status = puterrno(error);
		return;
	}

	/* The client must supply an initial mode for the directory. */
	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_INVAL;
		return;
	}

	va.va_type = VDIR;
	va.va_mask |= AT_TYPE;

	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);

	if (!error) {
		/*
		 * Attributes of the newly created directory should
		 * be returned to the client.
		 */
		va.va_mask = AT_ALL; /* We want everything */
		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);

		/* check for overflows */
		if (!error) {
			/*
			 * NOTE(review): acl_perm() is passed the parent
			 * vnode (vp) while the attributes were fetched
			 * from the new directory (dvp) -- confirm this
			 * is intentional.
			 */
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				error = makefh(&dr->dr_fhandle, dvp, exi);
			}
		}
		/*
		 * Force new data and metadata out to stable storage.
		 */
		(void) VOP_FSYNC(dvp, 0, cr, NULL);
		VN_RELE(dvp);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	dr->dr_status = puterrno(error);
}
2570 2571 void *
2571 2572 rfs_mkdir_getfh(struct nfscreatargs *args)
2572 2573 {
2573 2574 return (args->ca_da.da_fhandle);
2574 2575 }
2575 2576
2576 2577 /*
2577 2578 * Remove a directory.
2578 2579 * Remove the given directory name from the given parent directory.
2579 2580 */
2580 2581 /* ARGSUSED */
2581 2582 void
2582 2583 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2583 2584 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2584 2585 {
2585 2586 int error;
2586 2587 vnode_t *vp;
2587 2588
2588 2589 /*
2589 2590 * Disallow NULL paths
2590 2591 */
2591 2592 if (da->da_name == NULL || *da->da_name == '\0') {
2592 2593 *status = NFSERR_ACCES;
2593 2594 return;
2594 2595 }
2595 2596
2596 2597 vp = nfs_fhtovp(da->da_fhandle, exi);
2597 2598 if (vp == NULL) {
2598 2599 *status = NFSERR_STALE;
2599 2600 return;
2600 2601 }
2601 2602
2602 2603 if (rdonly(ro, vp)) {
2603 2604 VN_RELE(vp);
2604 2605 *status = NFSERR_ROFS;
2605 2606 return;
2606 2607 }
2607 2608
2608 2609 /*
2609 2610 * VOP_RMDIR takes a third argument (the current
2610 2611 * directory of the process). That's because someone
2611 2612 * wants to return EINVAL if one tries to remove ".".
2612 2613 * Of course, NFS servers have no idea what their
2613 2614 * clients' current directories are. We fake it by
2614 2615 * supplying a vnode known to exist and illegal to
2615 2616 * remove.
2616 2617 */
2617 2618 error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);
2618 2619
2619 2620 /*
2620 2621 * Force modified data and metadata out to stable storage.
2621 2622 */
2622 2623 (void) VOP_FSYNC(vp, 0, cr, NULL);
2623 2624
2624 2625 VN_RELE(vp);
2625 2626
2626 2627 /*
2627 2628 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2628 2629 * if the directory is not empty. A System V NFS server
2629 2630 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2630 2631 * over the wire.
2631 2632 */
2632 2633 if (error == EEXIST)
2633 2634 *status = NFSERR_NOTEMPTY;
2634 2635 else
2635 2636 *status = puterrno(error);
2636 2637
2637 2638 }
2638 2639 void *
2639 2640 rfs_rmdir_getfh(struct nfsdiropargs *da)
2640 2641 {
2641 2642 return (da->da_fhandle);
2642 2643 }
2643 2644
/*
 * Read directory entries.
 * Returns up to rda_count bytes of dirent64 records starting at
 * rda_offset, with names converted for the client's character set
 * when the export requires it.
 */
/* ARGSUSED */
void
rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int iseof;		/* set by VOP_READDIR at end of directory */
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	char *ndata = NULL;	/* charset-converted entry buffer, if any */
	struct sockaddr *ca;
	size_t nents;		/* number of entries read */
	int ret;		/* entries dropped by charset conversion */

	vp = nfs_fhtovp(&rda->rda_fh, exi);
	if (vp == NULL) {
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_STALE;
		return;
	}

	if (vp->v_type != VDIR) {
		VN_RELE(vp);
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_NOTDIR;
		return;
	}

	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);

	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);

	if (error) {
		rd->rd_entries = NULL;
		goto bad;
	}

	/* A zero-byte request returns no entries and does not claim EOF. */
	if (rda->rda_count == 0) {
		rd->rd_entries = NULL;
		rd->rd_size = 0;
		rd->rd_eof = FALSE;
		goto bad;
	}

	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);

	/*
	 * Allocate data for entries. This will be freed by rfs_rddirfree.
	 */
	rd->rd_bufsize = (uint_t)rda->rda_count;
	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);

	/*
	 * Set up io vector to read directory data
	 */
	iov.iov_base = (caddr_t)rd->rd_entries;
	iov.iov_len = rda->rda_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)rda->rda_offset;
	uio.uio_resid = rda->rda_count;

	/*
	 * read directory
	 */
	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);

	/*
	 * Clean up
	 */
	if (!error) {
		/*
		 * set size and eof: nothing read at all means EOF,
		 * otherwise the byte count is what VOP_READDIR consumed.
		 */
		if (uio.uio_resid == rda->rda_count) {
			rd->rd_size = 0;
			rd->rd_eof = TRUE;
		} else {
			rd->rd_size = (uint32_t)(rda->rda_count -
			    uio.uio_resid);
			rd->rd_eof = iseof ? TRUE : FALSE;
		}
	}

	/*
	 * Convert entry names for the client's character set.
	 * nfscmd_convdirplus() may allocate a replacement buffer and may
	 * drop entries that no longer fit after conversion.
	 */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
	ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
	    rda->rda_count, &ndata);

	if (ret != 0) {
		size_t dropbytes;
		/*
		 * We had to drop one or more entries in order to fit
		 * during the character conversion.  We need to patch
		 * up the size and eof info.
		 */
		if (rd->rd_eof)
			rd->rd_eof = FALSE;
		dropbytes = nfscmd_dropped_entrysize(
		    (struct dirent64 *)rd->rd_entries, nents, ret);
		rd->rd_size -= dropbytes;
	}
	if (ndata == NULL) {
		/* No conversion took place; serve the original buffer. */
		ndata = (char *)rd->rd_entries;
	} else if (ndata != (char *)rd->rd_entries) {
		/* A converted copy was allocated; switch to it. */
		kmem_free(rd->rd_entries, rd->rd_bufsize);
		rd->rd_entries = (void *)ndata;
		rd->rd_bufsize = rda->rda_count;
	}

bad:
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
#endif

	VN_RELE(vp);

	rd->rd_status = puterrno(error);
}
2777 2778 void *
2778 2779 rfs_readdir_getfh(struct nfsrddirargs *rda)
2779 2780 {
2780 2781 return (&rda->rda_fh);
2781 2782 }
2782 2783 void
2783 2784 rfs_rddirfree(struct nfsrddirres *rd)
2784 2785 {
2785 2786 if (rd->rd_entries != NULL)
2786 2787 kmem_free(rd->rd_entries, rd->rd_bufsize);
2787 2788 }
2788 2789
2789 2790 /* ARGSUSED */
2790 2791 void
2791 2792 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2792 2793 struct svc_req *req, cred_t *cr, bool_t ro)
2793 2794 {
2794 2795 int error;
2795 2796 struct statvfs64 sb;
2796 2797 vnode_t *vp;
2797 2798
2798 2799 vp = nfs_fhtovp(fh, exi);
2799 2800 if (vp == NULL) {
2800 2801 fs->fs_status = NFSERR_STALE;
2801 2802 return;
2802 2803 }
2803 2804
2804 2805 error = VFS_STATVFS(vp->v_vfsp, &sb);
2805 2806
2806 2807 if (!error) {
2807 2808 fs->fs_tsize = nfstsize();
2808 2809 fs->fs_bsize = sb.f_frsize;
2809 2810 fs->fs_blocks = sb.f_blocks;
2810 2811 fs->fs_bfree = sb.f_bfree;
2811 2812 fs->fs_bavail = sb.f_bavail;
2812 2813 }
2813 2814
2814 2815 VN_RELE(vp);
2815 2816
2816 2817 fs->fs_status = puterrno(error);
2817 2818
2818 2819 }
2819 2820 void *
2820 2821 rfs_statfs_getfh(fhandle_t *fh)
2821 2822 {
2822 2823 return (fh);
2823 2824 }
2824 2825
/*
 * Convert an NFSv2 wire sattr into a vattr.  A va_mask bit is set only
 * for fields the client actually supplied; an all-ones field on the
 * wire means "don't change".  Returns 0, or EOVERFLOW on a 32-bit
 * kernel when a supplied time cannot be represented.
 */
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short.  When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		/* wire times are microseconds; vattr wants nanoseconds */
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		/* wire times are microseconds; vattr wants nanoseconds */
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}
2891 2892
/*
 * Mapping from vnode type (vtype_t, used as index) to the NFSv2
 * over-the-wire file type.  Zero entries have no NFSv2 representation;
 * in particular VFIFO is remapped separately in vattr_to_nattr() via
 * NA_SETFIFO().
 */
static const enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
2895 2896
/*
 * check the following fields for overflow: nodeid, size, and time.
 * There could be a problem when converting 64-bit LP64 fields
 * into 32-bit ones.  Return an error if there is an overflow.
 */
int
vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
{
	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
	na->na_type = vt_to_nf[vap->va_type];

	/* -1 means "unknown"; otherwise fold the file type into the mode */
	if (vap->va_mode == (unsigned short) -1)
		na->na_mode = (uint32_t)-1;
	else
		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;

	if (vap->va_uid == (unsigned short)(-1))
		na->na_uid = (uint32_t)(-1);
	else if (vap->va_uid == UID_NOBODY)
		na->na_uid = (uint32_t)NFS_UID_NOBODY;
	else
		na->na_uid = vap->va_uid;

	if (vap->va_gid == (unsigned short)(-1))
		na->na_gid = (uint32_t)-1;
	else if (vap->va_gid == GID_NOBODY)
		na->na_gid = (uint32_t)NFS_GID_NOBODY;
	else
		na->na_gid = vap->va_gid;

	/*
	 * Do we need to check fsid for overflow?  It is 64-bit in the
	 * vattr, but are bigger than 32 bit values supported?
	 */
	na->na_fsid = vap->va_fsid;

	na->na_nodeid = vap->va_nodeid;

	/*
	 * Check to make sure that the nodeid is representable over the
	 * wire without losing bits.
	 */
	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
		return (EFBIG);
	na->na_nlink = vap->va_nlink;

	/*
	 * Check for big files here, instead of at the caller.  See
	 * comments in cstat for large special file explanation.
	 */
	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
			return (EFBIG);
		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
			/* UNKNOWN_SIZE | OVERFLOW */
			na->na_size = MAXOFF32_T;
		} else
			na->na_size = vap->va_size;
	} else
		na->na_size = vap->va_size;

	/*
	 * If the vnode times overflow the 32-bit times that NFS2
	 * uses on the wire then return an error.
	 */
	if (!NFS_VAP_TIME_OK(vap)) {
		return (EOVERFLOW);
	}
	/* vattr times are nanoseconds; the wire carries microseconds */
	na->na_atime.tv_sec = vap->va_atime.tv_sec;
	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;

	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;

	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;

	/*
	 * If the dev_t will fit into 16 bits then compress
	 * it, otherwise leave it alone.  See comments in
	 * nfs_client.c.
	 */
	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
	else
		(void) cmpldev(&na->na_rdev, vap->va_rdev);

	na->na_blocks = vap->va_nblocks;
	na->na_blocksize = vap->va_blksize;

	/*
	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
	 *
	 * BUYER BEWARE:
	 *  If you are porting the NFS to a non-Sun server, you probably
	 *  don't want to include the following block of code.  The
	 *  over-the-wire special file types will be changing with the
	 *  NFS Protocol Revision.
	 */
	if (vap->va_type == VFIFO)
		NA_SETFIFO(na);
	return (0);
}
3002 3003
3003 3004 /*
3004 3005 * acl v2 support: returns approximate permission.
3005 3006 * default: returns minimal permission (more restrictive)
3006 3007 * aclok: returns maximal permission (less restrictive)
3007 3008 * This routine changes the permissions that are alaredy in *va.
3008 3009 * If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
3009 3010 * CLASS_OBJ is always the same as GROUP_OBJ entry.
3010 3011 */
3011 3012 static void
3012 3013 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
3013 3014 {
3014 3015 vsecattr_t vsa;
3015 3016 int aclcnt;
3016 3017 aclent_t *aclentp;
3017 3018 mode_t mask_perm;
3018 3019 mode_t grp_perm;
3019 3020 mode_t other_perm;
3020 3021 mode_t other_orig;
3021 3022 int error;
3022 3023
3023 3024 /* dont care default acl */
3024 3025 vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
3025 3026 error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
3026 3027
3027 3028 if (!error) {
3028 3029 aclcnt = vsa.vsa_aclcnt;
3029 3030 if (aclcnt > MIN_ACL_ENTRIES) {
3030 3031 /* non-trivial ACL */
3031 3032 aclentp = vsa.vsa_aclentp;
3032 3033 if (exi->exi_export.ex_flags & EX_ACLOK) {
3033 3034 /* maximal permissions */
3034 3035 grp_perm = 0;
3035 3036 other_perm = 0;
3036 3037 for (; aclcnt > 0; aclcnt--, aclentp++) {
3037 3038 switch (aclentp->a_type) {
3038 3039 case USER_OBJ:
3039 3040 break;
3040 3041 case USER:
3041 3042 grp_perm |=
3042 3043 aclentp->a_perm << 3;
3043 3044 other_perm |= aclentp->a_perm;
3044 3045 break;
3045 3046 case GROUP_OBJ:
3046 3047 grp_perm |=
3047 3048 aclentp->a_perm << 3;
3048 3049 break;
3049 3050 case GROUP:
3050 3051 other_perm |= aclentp->a_perm;
3051 3052 break;
3052 3053 case OTHER_OBJ:
3053 3054 other_orig = aclentp->a_perm;
3054 3055 break;
3055 3056 case CLASS_OBJ:
3056 3057 mask_perm = aclentp->a_perm;
3057 3058 break;
3058 3059 default:
3059 3060 break;
3060 3061 }
3061 3062 }
3062 3063 grp_perm &= mask_perm << 3;
3063 3064 other_perm &= mask_perm;
3064 3065 other_perm |= other_orig;
3065 3066
3066 3067 } else {
3067 3068 /* minimal permissions */
3068 3069 grp_perm = 070;
3069 3070 other_perm = 07;
3070 3071 for (; aclcnt > 0; aclcnt--, aclentp++) {
3071 3072 switch (aclentp->a_type) {
3072 3073 case USER_OBJ:
3073 3074 break;
3074 3075 case USER:
3075 3076 case CLASS_OBJ:
3076 3077 grp_perm &=
3077 3078 aclentp->a_perm << 3;
3078 3079 other_perm &=
3079 3080 aclentp->a_perm;
3080 3081 break;
3081 3082 case GROUP_OBJ:
3082 3083 grp_perm &=
3083 3084 aclentp->a_perm << 3;
3084 3085 break;
3085 3086 case GROUP:
3086 3087 other_perm &=
3087 3088 aclentp->a_perm;
3088 3089 break;
3089 3090 case OTHER_OBJ:
3090 3091 other_perm &=
3091 3092 aclentp->a_perm;
3092 3093 break;
3093 3094 default:
3094 3095 break;
3095 3096 }
3096 3097 }
3097 3098 }
3098 3099 /* copy to va */
3099 3100 va->va_mode &= ~077;
3100 3101 va->va_mode |= grp_perm | other_perm;
3101 3102 }
3102 3103 if (vsa.vsa_aclcnt)
3103 3104 kmem_free(vsa.vsa_aclentp,
3104 3105 vsa.vsa_aclcnt * sizeof (aclent_t));
3105 3106 }
3106 3107 }
3107 3108
/*
 * One-time NFSv2 server initialization: obtain the unique caller id
 * this module uses when issuing VOP calls (see fs_new_caller_id()).
 */
void
rfs_srvrinit(void)
{
	nfs2_srv_caller_id = fs_new_caller_id();
}
3113 3114
/*
 * Teardown counterpart of rfs_srvrinit().  Currently nothing to undo.
 */
void
rfs_srvrfini(void)
{
}
3118 3119
3119 3120 /* ARGSUSED */
3120 3121 void
3121 3122 rfs_srv_zone_init(nfs_globals_t *ng)
3122 3123 {
3123 3124 nfs_srv_t *ns;
3124 3125
3125 3126 ns = kmem_zalloc(sizeof (*ns), KM_SLEEP);
3126 3127
3127 3128 mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3128 3129 ns->write_async = 1;
3129 3130
3130 3131 ng->nfs_srv = ns;
3131 3132 }
3132 3133
3133 3134 /* ARGSUSED */
3134 3135 void
3135 3136 rfs_srv_zone_fini(nfs_globals_t *ng)
3136 3137 {
3137 3138 nfs_srv_t *ns = ng->nfs_srv;
3138 3139
3139 3140 ng->nfs_srv = NULL;
3140 3141
3141 3142 mutex_destroy(&ns->async_write_lock);
3142 3143 kmem_free(ns, sizeof (*ns));
3143 3144 }
3144 3145
3145 3146 static int
3146 3147 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
3147 3148 {
3148 3149 struct clist *wcl;
3149 3150 int wlist_len;
3150 3151 uint32_t count = rr->rr_count;
3151 3152
3152 3153 wcl = ra->ra_wlist;
3153 3154
3154 3155 if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3155 3156 return (FALSE);
3156 3157 }
3157 3158
3158 3159 wcl = ra->ra_wlist;
3159 3160 rr->rr_ok.rrok_wlist_len = wlist_len;
3160 3161 rr->rr_ok.rrok_wlist = wcl;
3161 3162
3162 3163 return (TRUE);
3163 3164 }
|
↓ open down ↓ |
2616 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX