Print this page
Fix NFS design problems re. multiple zone keys
Make NFS server zone-specific data all have the same lifetime
Fix rfs4_clean_state_exi
Fix exi_cache_reclaim
Fix mistakes in zone keys work
More fixes re. exi_zoneid and exi_tree
(danmcd -> Keep some ASSERT()s around for readability.)
[webrev page controls: Split | Close | Expand all | Collapse all]
--- old/usr/src/uts/common/fs/nfs/nfs_srv.c
+++ new/usr/src/uts/common/fs/nfs/nfs_srv.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
25 25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 26 */
27 27
28 28 /*
29 29 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
30 30 * All rights reserved.
31 31 */
32 32
33 33 /*
34 34 * Copyright 2018 Nexenta Systems, Inc.
35 35 * Copyright (c) 2016 by Delphix. All rights reserved.
36 36 */
37 37
38 38 #include <sys/param.h>
39 39 #include <sys/types.h>
40 40 #include <sys/systm.h>
41 41 #include <sys/cred.h>
42 42 #include <sys/buf.h>
43 43 #include <sys/vfs.h>
44 44 #include <sys/vnode.h>
45 45 #include <sys/uio.h>
46 46 #include <sys/stat.h>
47 47 #include <sys/errno.h>
48 48 #include <sys/sysmacros.h>
49 49 #include <sys/statvfs.h>
50 50 #include <sys/kmem.h>
51 51 #include <sys/kstat.h>
52 52 #include <sys/dirent.h>
53 53 #include <sys/cmn_err.h>
54 54 #include <sys/debug.h>
55 55 #include <sys/vtrace.h>
56 56 #include <sys/mode.h>
57 57 #include <sys/acl.h>
58 58 #include <sys/nbmlock.h>
59 59 #include <sys/policy.h>
60 60 #include <sys/sdt.h>
61 61
62 62 #include <rpc/types.h>
63 63 #include <rpc/auth.h>
64 64 #include <rpc/svc.h>
65 65
66 66 #include <nfs/nfs.h>
67 67 #include <nfs/export.h>
68 68 #include <nfs/nfs_cmd.h>
69 69
70 70 #include <vm/hat.h>
71 71 #include <vm/as.h>
72 72 #include <vm/seg.h>
73 73 #include <vm/seg_map.h>
74 74 #include <vm/seg_kmem.h>
75 75
76 76 #include <sys/strsubr.h>
77 77
78 78 struct rfs_async_write_list;
79 79
80 80 /*
81 81 * Zone globals of NFSv2 server
82 82 */
83 83 typedef struct nfs_srv {
84 84 kmutex_t async_write_lock;
85 85 struct rfs_async_write_list *async_write_head;
86 86
87 87 /*
88 88 * enables write clustering if == 1
89 89 */
90 90 int write_async;
91 91 } nfs_srv_t;
[... 91 lines elided (collapsed in the webrev view) — expand in the original review to see them ...]
92 92
93 93 /*
94 94 * These are the interface routines for the server side of the
95 95 * Network File System. See the NFS version 2 protocol specification
96 96 * for a description of this interface.
97 97 */
98 98
99 99 static int sattr_to_vattr(struct nfssattr *, struct vattr *);
100 100 static void acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
101 101 cred_t *);
102 -static void *rfs_zone_init(zoneid_t zoneid);
103 -static void rfs_zone_fini(zoneid_t zoneid, void *data);
104 102
105 103
106 104 /*
107 105 * Some "over the wire" UNIX file types. These are encoded
108 106 * into the mode. This needs to be fixed in the next rev.
109 107 */
110 108 #define IFMT 0170000 /* type of file */
111 109 #define IFCHR 0020000 /* character special */
112 110 #define IFBLK 0060000 /* block special */
113 111 #define IFSOCK 0140000 /* socket */
114 112
115 113 u_longlong_t nfs2_srv_caller_id;
116 -static zone_key_t rfs_zone_key;
117 114
115 +static nfs_srv_t *
116 +nfs_get_srv(void)
117 +{
118 + nfs_globals_t *ng = zone_getspecific(nfssrv_zone_key, curzone);
119 + nfs_srv_t *srv = ng->nfs_srv;
120 + ASSERT(srv != NULL);
121 + return (srv);
122 +}
123 +
118 124 /*
119 125 * Get file attributes.
120 126 * Returns the current attributes of the file with the given fhandle.
121 127 */
122 128 /* ARGSUSED */
123 129 void
124 130 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
125 131 struct svc_req *req, cred_t *cr, bool_t ro)
126 132 {
127 133 int error;
128 134 vnode_t *vp;
129 135 struct vattr va;
130 136
131 137 vp = nfs_fhtovp(fhp, exi);
132 138 if (vp == NULL) {
133 139 ns->ns_status = NFSERR_STALE;
134 140 return;
135 141 }
136 142
137 143 /*
138 144 * Do the getattr.
139 145 */
140 146 va.va_mask = AT_ALL; /* we want all the attributes */
141 147
142 148 error = rfs4_delegated_getattr(vp, &va, 0, cr);
143 149
144 150 /* check for overflows */
145 151 if (!error) {
146 152 /* Lie about the object type for a referral */
147 153 if (vn_is_nfs_reparse(vp, cr))
148 154 va.va_type = VLNK;
149 155
150 156 acl_perm(vp, exi, &va, cr);
151 157 error = vattr_to_nattr(&va, &ns->ns_attr);
152 158 }
153 159
154 160 VN_RELE(vp);
155 161
156 162 ns->ns_status = puterrno(error);
157 163 }
158 164 void *
159 165 rfs_getattr_getfh(fhandle_t *fhp)
160 166 {
161 167 return (fhp);
162 168 }
163 169
164 170 /*
165 171 * Set file attributes.
166 172 * Sets the attributes of the file with the given fhandle. Returns
167 173 * the new attributes.
168 174 */
169 175 /* ARGSUSED */
170 176 void
171 177 rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
172 178 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
173 179 {
174 180 int error;
175 181 int flag;
176 182 int in_crit = 0;
177 183 vnode_t *vp;
178 184 struct vattr va;
179 185 struct vattr bva;
180 186 struct flock64 bf;
181 187 caller_context_t ct;
182 188
183 189
184 190 vp = nfs_fhtovp(&args->saa_fh, exi);
185 191 if (vp == NULL) {
186 192 ns->ns_status = NFSERR_STALE;
187 193 return;
188 194 }
189 195
190 196 if (rdonly(ro, vp)) {
191 197 VN_RELE(vp);
192 198 ns->ns_status = NFSERR_ROFS;
193 199 return;
194 200 }
195 201
196 202 error = sattr_to_vattr(&args->saa_sa, &va);
197 203 if (error) {
198 204 VN_RELE(vp);
199 205 ns->ns_status = puterrno(error);
200 206 return;
201 207 }
202 208
203 209 /*
204 210 * If the client is requesting a change to the mtime,
205 211 * but the nanosecond field is set to 1 billion, then
206 212 * this is a flag to the server that it should set the
207 213 * atime and mtime fields to the server's current time.
208 214 * The 1 billion number actually came from the client
209 215 * as 1 million, but the units in the over the wire
210 216 * request are microseconds instead of nanoseconds.
211 217 *
212 218 * This is an overload of the protocol and should be
213 219 * documented in the NFS Version 2 protocol specification.
214 220 */
215 221 if (va.va_mask & AT_MTIME) {
216 222 if (va.va_mtime.tv_nsec == 1000000000) {
217 223 gethrestime(&va.va_mtime);
218 224 va.va_atime = va.va_mtime;
219 225 va.va_mask |= AT_ATIME;
220 226 flag = 0;
221 227 } else
222 228 flag = ATTR_UTIME;
223 229 } else
224 230 flag = 0;
225 231
226 232 /*
227 233 * If the filesystem is exported with nosuid, then mask off
228 234 * the setuid and setgid bits.
229 235 */
230 236 if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
231 237 (exi->exi_export.ex_flags & EX_NOSUID))
232 238 va.va_mode &= ~(VSUID | VSGID);
233 239
234 240 ct.cc_sysid = 0;
235 241 ct.cc_pid = 0;
236 242 ct.cc_caller_id = nfs2_srv_caller_id;
237 243 ct.cc_flags = CC_DONTBLOCK;
238 244
239 245 /*
240 246 * We need to specially handle size changes because it is
241 247 * possible for the client to create a file with modes
242 248 * which indicate read-only, but with the file opened for
243 249 * writing. If the client then tries to set the size of
244 250 * the file, then the normal access checking done in
245 251 * VOP_SETATTR would prevent the client from doing so,
246 252 * although it should be legal for it to do so. To get
247 253 * around this, we do the access checking for ourselves
248 254 * and then use VOP_SPACE which doesn't do the access
249 255 * checking which VOP_SETATTR does. VOP_SPACE can only
250 256 * operate on VREG files, let VOP_SETATTR handle the other
251 257 * extremely rare cases.
252 258 * Also the client should not be allowed to change the
253 259 * size of the file if there is a conflicting non-blocking
254 260 * mandatory lock in the region of change.
255 261 */
256 262 if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
257 263 if (nbl_need_check(vp)) {
258 264 nbl_start_crit(vp, RW_READER);
259 265 in_crit = 1;
260 266 }
261 267
262 268 bva.va_mask = AT_UID | AT_SIZE;
263 269
264 270 error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
265 271
266 272 if (error) {
267 273 if (in_crit)
268 274 nbl_end_crit(vp);
269 275 VN_RELE(vp);
270 276 ns->ns_status = puterrno(error);
271 277 return;
272 278 }
273 279
274 280 if (in_crit) {
275 281 u_offset_t offset;
276 282 ssize_t length;
277 283
278 284 if (va.va_size < bva.va_size) {
279 285 offset = va.va_size;
280 286 length = bva.va_size - va.va_size;
281 287 } else {
282 288 offset = bva.va_size;
283 289 length = va.va_size - bva.va_size;
284 290 }
285 291 if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
286 292 NULL)) {
287 293 error = EACCES;
288 294 }
289 295 }
290 296
291 297 if (crgetuid(cr) == bva.va_uid && !error &&
292 298 va.va_size != bva.va_size) {
293 299 va.va_mask &= ~AT_SIZE;
294 300 bf.l_type = F_WRLCK;
295 301 bf.l_whence = 0;
296 302 bf.l_start = (off64_t)va.va_size;
297 303 bf.l_len = 0;
298 304 bf.l_sysid = 0;
299 305 bf.l_pid = 0;
300 306
301 307 error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
302 308 (offset_t)va.va_size, cr, &ct);
303 309 }
304 310 if (in_crit)
305 311 nbl_end_crit(vp);
306 312 } else
307 313 error = 0;
308 314
309 315 /*
310 316 * Do the setattr.
311 317 */
312 318 if (!error && va.va_mask) {
313 319 error = VOP_SETATTR(vp, &va, flag, cr, &ct);
314 320 }
315 321
316 322 /*
317 323 * check if the monitor on either vop_space or vop_setattr detected
318 324 * a delegation conflict and if so, mark the thread flag as
319 325 * wouldblock so that the response is dropped and the client will
320 326 * try again.
321 327 */
322 328 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
323 329 VN_RELE(vp);
324 330 curthread->t_flag |= T_WOULDBLOCK;
325 331 return;
326 332 }
327 333
328 334 if (!error) {
329 335 va.va_mask = AT_ALL; /* get everything */
330 336
331 337 error = rfs4_delegated_getattr(vp, &va, 0, cr);
332 338
333 339 /* check for overflows */
334 340 if (!error) {
335 341 acl_perm(vp, exi, &va, cr);
336 342 error = vattr_to_nattr(&va, &ns->ns_attr);
337 343 }
338 344 }
339 345
340 346 ct.cc_flags = 0;
341 347
342 348 /*
343 349 * Force modified metadata out to stable storage.
344 350 */
345 351 (void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);
346 352
347 353 VN_RELE(vp);
348 354
349 355 ns->ns_status = puterrno(error);
350 356 }
351 357 void *
352 358 rfs_setattr_getfh(struct nfssaargs *args)
353 359 {
354 360 return (&args->saa_fh);
355 361 }
356 362
357 363 /* Change and release @exip and @vpp only in success */
358 364 int
359 365 rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
360 366 {
361 367 struct exportinfo *exi;
362 368 vnode_t *vp = *vpp;
363 369 fid_t fid;
364 370 int error;
365 371
366 372 VN_HOLD(vp);
367 373
368 374 if ((error = traverse(&vp)) != 0) {
369 375 VN_RELE(vp);
370 376 return (error);
371 377 }
372 378
373 379 bzero(&fid, sizeof (fid));
374 380 fid.fid_len = MAXFIDSZ;
375 381 error = VOP_FID(vp, &fid, NULL);
376 382 if (error) {
377 383 VN_RELE(vp);
378 384 return (error);
379 385 }
380 386
381 387 exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);
382 388 if (exi == NULL ||
383 389 (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
384 390 /*
385 391 * It is not error, just subdir is not exported
386 392 * or "nohide" is not set
387 393 */
388 394 if (exi != NULL)
389 395 exi_rele(exi);
390 396 VN_RELE(vp);
391 397 } else {
392 398 /* go to submount */
393 399 exi_rele(*exip);
394 400 *exip = exi;
395 401
396 402 VN_RELE(*vpp);
397 403 *vpp = vp;
398 404 }
399 405
400 406 return (0);
401 407 }
402 408
403 409 /*
404 410 * Given mounted "dvp" and "exi", go upper mountpoint
405 411 * with dvp/exi correction
406 412 * Return 0 in success
407 413 */
408 414 int
409 415 rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
410 416 {
411 417 struct exportinfo *exi;
412 418 vnode_t *dvp = *dvpp;
413 419
414 420 ASSERT3P((*exip)->exi_zone, ==, curzone);
415 421 ASSERT((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp));
416 422
417 423 VN_HOLD(dvp);
418 424 dvp = untraverse(dvp);
419 425 exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
420 426 if (exi == NULL) {
421 427 VN_RELE(dvp);
422 428 return (-1);
423 429 }
424 430
425 431 ASSERT3P(exi->exi_zone, ==, curzone);
426 432 exi_rele(*exip);
427 433 *exip = exi;
428 434 VN_RELE(*dvpp);
429 435 *dvpp = dvp;
430 436
431 437 return (0);
432 438 }
433 439 /*
434 440 * Directory lookup.
435 441 * Returns an fhandle and file attributes for file name in a directory.
436 442 */
437 443 /* ARGSUSED */
438 444 void
439 445 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
440 446 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
441 447 {
442 448 int error;
443 449 vnode_t *dvp;
444 450 vnode_t *vp;
445 451 struct vattr va;
446 452 fhandle_t *fhp = da->da_fhandle;
447 453 struct sec_ol sec = {0, 0};
448 454 bool_t publicfh_flag = FALSE, auth_weak = FALSE;
449 455 char *name;
450 456 struct sockaddr *ca;
451 457
452 458 /*
453 459 * Trusted Extension doesn't support NFSv2. MOUNT
454 460 * will reject v2 clients. Need to prevent v2 client
455 461 * access via WebNFS here.
456 462 */
457 463 if (is_system_labeled() && req->rq_vers == 2) {
458 464 dr->dr_status = NFSERR_ACCES;
459 465 return;
460 466 }
461 467
462 468 /*
463 469 * Disallow NULL paths
464 470 */
465 471 if (da->da_name == NULL || *da->da_name == '\0') {
466 472 dr->dr_status = NFSERR_ACCES;
467 473 return;
468 474 }
469 475
470 476 /*
471 477 * Allow lookups from the root - the default
472 478 * location of the public filehandle.
473 479 */
474 480 if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
475 481 dvp = ZONE_ROOTVP();
476 482 VN_HOLD(dvp);
477 483 } else {
478 484 dvp = nfs_fhtovp(fhp, exi);
479 485 if (dvp == NULL) {
480 486 dr->dr_status = NFSERR_STALE;
481 487 return;
482 488 }
483 489 }
484 490
485 491 exi_hold(exi);
486 492 ASSERT3P(exi->exi_zone, ==, curzone);
487 493
488 494 /*
489 495 * Not allow lookup beyond root.
490 496 * If the filehandle matches a filehandle of the exi,
491 497 * then the ".." refers beyond the root of an exported filesystem.
492 498 */
493 499 if (strcmp(da->da_name, "..") == 0 &&
494 500 EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
495 501 if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
496 502 ((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp))) {
497 503 /*
498 504 * special case for ".." and 'nohide'exported root
499 505 */
500 506 if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
501 507 error = NFSERR_ACCES;
502 508 goto out;
503 509 }
504 510 } else {
505 511 error = NFSERR_NOENT;
506 512 goto out;
507 513 }
508 514 }
509 515
510 516 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
511 517 name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
512 518 MAXPATHLEN);
513 519
514 520 if (name == NULL) {
515 521 error = NFSERR_ACCES;
516 522 goto out;
517 523 }
518 524
519 525 /*
520 526 * If the public filehandle is used then allow
521 527 * a multi-component lookup, i.e. evaluate
522 528 * a pathname and follow symbolic links if
523 529 * necessary.
524 530 *
525 531 * This may result in a vnode in another filesystem
526 532 * which is OK as long as the filesystem is exported.
527 533 */
528 534 if (PUBLIC_FH2(fhp)) {
529 535 publicfh_flag = TRUE;
530 536
531 537 exi_rele(exi);
532 538
533 539 error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
534 540 &sec);
535 541 } else {
536 542 /*
537 543 * Do a normal single component lookup.
538 544 */
539 545 error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
540 546 NULL, NULL, NULL);
541 547 }
542 548
543 549 if (name != da->da_name)
544 550 kmem_free(name, MAXPATHLEN);
545 551
546 552 if (error == 0 && vn_ismntpt(vp)) {
547 553 error = rfs_cross_mnt(&vp, &exi);
548 554 if (error)
549 555 VN_RELE(vp);
550 556 }
551 557
552 558 if (!error) {
553 559 va.va_mask = AT_ALL; /* we want everything */
554 560
555 561 error = rfs4_delegated_getattr(vp, &va, 0, cr);
556 562
557 563 /* check for overflows */
558 564 if (!error) {
559 565 acl_perm(vp, exi, &va, cr);
560 566 error = vattr_to_nattr(&va, &dr->dr_attr);
561 567 if (!error) {
562 568 if (sec.sec_flags & SEC_QUERY)
563 569 error = makefh_ol(&dr->dr_fhandle, exi,
564 570 sec.sec_index);
565 571 else {
566 572 error = makefh(&dr->dr_fhandle, vp,
567 573 exi);
568 574 if (!error && publicfh_flag &&
569 575 !chk_clnt_sec(exi, req))
570 576 auth_weak = TRUE;
571 577 }
572 578 }
573 579 }
574 580 VN_RELE(vp);
575 581 }
576 582
577 583 out:
578 584 VN_RELE(dvp);
579 585
580 586 if (exi != NULL)
581 587 exi_rele(exi);
582 588
583 589 /*
584 590 * If it's public fh, no 0x81, and client's flavor is
585 591 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
586 592 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
587 593 */
588 594 if (auth_weak)
589 595 dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
590 596 else
591 597 dr->dr_status = puterrno(error);
592 598 }
593 599 void *
594 600 rfs_lookup_getfh(struct nfsdiropargs *da)
595 601 {
596 602 return (da->da_fhandle);
597 603 }
598 604
599 605 /*
600 606 * Read symbolic link.
601 607 * Returns the string in the symbolic link at the given fhandle.
602 608 */
603 609 /* ARGSUSED */
604 610 void
605 611 rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
606 612 struct svc_req *req, cred_t *cr, bool_t ro)
607 613 {
608 614 int error;
609 615 struct iovec iov;
610 616 struct uio uio;
611 617 vnode_t *vp;
612 618 struct vattr va;
613 619 struct sockaddr *ca;
614 620 char *name = NULL;
615 621 int is_referral = 0;
616 622
617 623 vp = nfs_fhtovp(fhp, exi);
618 624 if (vp == NULL) {
619 625 rl->rl_data = NULL;
620 626 rl->rl_status = NFSERR_STALE;
621 627 return;
622 628 }
623 629
624 630 va.va_mask = AT_MODE;
625 631
626 632 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
627 633
628 634 if (error) {
629 635 VN_RELE(vp);
630 636 rl->rl_data = NULL;
631 637 rl->rl_status = puterrno(error);
632 638 return;
633 639 }
634 640
635 641 if (MANDLOCK(vp, va.va_mode)) {
636 642 VN_RELE(vp);
637 643 rl->rl_data = NULL;
638 644 rl->rl_status = NFSERR_ACCES;
639 645 return;
640 646 }
641 647
642 648 /* We lied about the object type for a referral */
643 649 if (vn_is_nfs_reparse(vp, cr))
644 650 is_referral = 1;
645 651
646 652 /*
647 653 * XNFS and RFC1094 require us to return ENXIO if argument
648 654 * is not a link. BUGID 1138002.
649 655 */
650 656 if (vp->v_type != VLNK && !is_referral) {
651 657 VN_RELE(vp);
652 658 rl->rl_data = NULL;
653 659 rl->rl_status = NFSERR_NXIO;
654 660 return;
655 661 }
656 662
657 663 /*
658 664 * Allocate data for pathname. This will be freed by rfs_rlfree.
659 665 */
660 666 rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
661 667
662 668 if (is_referral) {
663 669 char *s;
664 670 size_t strsz;
665 671
666 672 /* Get an artificial symlink based on a referral */
667 673 s = build_symlink(vp, cr, &strsz);
668 674 global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
669 675 DTRACE_PROBE2(nfs2serv__func__referral__reflink,
670 676 vnode_t *, vp, char *, s);
671 677 if (s == NULL)
672 678 error = EINVAL;
673 679 else {
674 680 error = 0;
675 681 (void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
676 682 rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
677 683 kmem_free(s, strsz);
678 684 }
679 685
680 686 } else {
681 687
682 688 /*
683 689 * Set up io vector to read sym link data
684 690 */
685 691 iov.iov_base = rl->rl_data;
686 692 iov.iov_len = NFS_MAXPATHLEN;
687 693 uio.uio_iov = &iov;
688 694 uio.uio_iovcnt = 1;
689 695 uio.uio_segflg = UIO_SYSSPACE;
690 696 uio.uio_extflg = UIO_COPY_CACHED;
691 697 uio.uio_loffset = (offset_t)0;
692 698 uio.uio_resid = NFS_MAXPATHLEN;
693 699
694 700 /*
695 701 * Do the readlink.
696 702 */
697 703 error = VOP_READLINK(vp, &uio, cr, NULL);
698 704
699 705 rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
700 706
701 707 if (!error)
702 708 rl->rl_data[rl->rl_count] = '\0';
703 709
704 710 }
705 711
706 712
707 713 VN_RELE(vp);
708 714
709 715 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
710 716 name = nfscmd_convname(ca, exi, rl->rl_data,
711 717 NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
712 718
713 719 if (name != NULL && name != rl->rl_data) {
714 720 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
715 721 rl->rl_data = name;
716 722 }
717 723
718 724 /*
719 725 * XNFS and RFC1094 require us to return ENXIO if argument
720 726 * is not a link. UFS returns EINVAL if this is the case,
721 727 * so we do the mapping here. BUGID 1138002.
722 728 */
723 729 if (error == EINVAL)
724 730 rl->rl_status = NFSERR_NXIO;
725 731 else
726 732 rl->rl_status = puterrno(error);
727 733
728 734 }
729 735 void *
730 736 rfs_readlink_getfh(fhandle_t *fhp)
731 737 {
732 738 return (fhp);
733 739 }
734 740 /*
735 741 * Free data allocated by rfs_readlink
736 742 */
737 743 void
738 744 rfs_rlfree(struct nfsrdlnres *rl)
739 745 {
740 746 if (rl->rl_data != NULL)
741 747 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
742 748 }
743 749
744 750 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
745 751
746 752 /*
747 753 * Read data.
748 754 * Returns some data read from the file at the given fhandle.
749 755 */
750 756 /* ARGSUSED */
751 757 void
752 758 rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
753 759 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
754 760 {
755 761 vnode_t *vp;
756 762 int error;
757 763 struct vattr va;
758 764 struct iovec iov;
759 765 struct uio uio;
760 766 mblk_t *mp;
761 767 int alloc_err = 0;
762 768 int in_crit = 0;
763 769 caller_context_t ct;
764 770
765 771 vp = nfs_fhtovp(&ra->ra_fhandle, exi);
766 772 if (vp == NULL) {
767 773 rr->rr_data = NULL;
768 774 rr->rr_status = NFSERR_STALE;
769 775 return;
770 776 }
771 777
772 778 if (vp->v_type != VREG) {
773 779 VN_RELE(vp);
774 780 rr->rr_data = NULL;
775 781 rr->rr_status = NFSERR_ISDIR;
776 782 return;
777 783 }
778 784
779 785 ct.cc_sysid = 0;
780 786 ct.cc_pid = 0;
781 787 ct.cc_caller_id = nfs2_srv_caller_id;
782 788 ct.cc_flags = CC_DONTBLOCK;
783 789
784 790 /*
785 791 * Enter the critical region before calling VOP_RWLOCK
786 792 * to avoid a deadlock with write requests.
787 793 */
788 794 if (nbl_need_check(vp)) {
789 795 nbl_start_crit(vp, RW_READER);
790 796 if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
791 797 0, NULL)) {
792 798 nbl_end_crit(vp);
793 799 VN_RELE(vp);
794 800 rr->rr_data = NULL;
795 801 rr->rr_status = NFSERR_ACCES;
796 802 return;
797 803 }
798 804 in_crit = 1;
799 805 }
800 806
801 807 error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
802 808
803 809 /* check if a monitor detected a delegation conflict */
804 810 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
805 811 if (in_crit)
806 812 nbl_end_crit(vp);
807 813 VN_RELE(vp);
808 814 /* mark as wouldblock so response is dropped */
809 815 curthread->t_flag |= T_WOULDBLOCK;
810 816
811 817 rr->rr_data = NULL;
812 818 return;
813 819 }
814 820
815 821 va.va_mask = AT_ALL;
816 822
817 823 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
818 824
819 825 if (error) {
820 826 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
821 827 if (in_crit)
822 828 nbl_end_crit(vp);
823 829
824 830 VN_RELE(vp);
825 831 rr->rr_data = NULL;
826 832 rr->rr_status = puterrno(error);
827 833
828 834 return;
829 835 }
830 836
831 837 /*
832 838 * This is a kludge to allow reading of files created
833 839 * with no read permission. The owner of the file
834 840 * is always allowed to read it.
835 841 */
836 842 if (crgetuid(cr) != va.va_uid) {
837 843 error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
838 844
839 845 if (error) {
840 846 /*
841 847 * Exec is the same as read over the net because
842 848 * of demand loading.
843 849 */
844 850 error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
845 851 }
846 852 if (error) {
847 853 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
848 854 if (in_crit)
849 855 nbl_end_crit(vp);
850 856 VN_RELE(vp);
851 857 rr->rr_data = NULL;
852 858 rr->rr_status = puterrno(error);
853 859
854 860 return;
855 861 }
856 862 }
857 863
858 864 if (MANDLOCK(vp, va.va_mode)) {
859 865 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
860 866 if (in_crit)
861 867 nbl_end_crit(vp);
862 868
863 869 VN_RELE(vp);
864 870 rr->rr_data = NULL;
865 871 rr->rr_status = NFSERR_ACCES;
866 872
867 873 return;
868 874 }
869 875
870 876 rr->rr_ok.rrok_wlist_len = 0;
871 877 rr->rr_ok.rrok_wlist = NULL;
872 878
873 879 if ((u_offset_t)ra->ra_offset >= va.va_size) {
874 880 rr->rr_count = 0;
875 881 rr->rr_data = NULL;
876 882 /*
877 883 * In this case, status is NFS_OK, but there is no data
878 884 * to encode. So set rr_mp to NULL.
879 885 */
880 886 rr->rr_mp = NULL;
881 887 rr->rr_ok.rrok_wlist = ra->ra_wlist;
882 888 if (rr->rr_ok.rrok_wlist)
883 889 clist_zero_len(rr->rr_ok.rrok_wlist);
884 890 goto done;
885 891 }
886 892
887 893 if (ra->ra_wlist) {
888 894 mp = NULL;
889 895 rr->rr_mp = NULL;
890 896 (void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
891 897 if (ra->ra_count > iov.iov_len) {
892 898 rr->rr_data = NULL;
893 899 rr->rr_status = NFSERR_INVAL;
894 900 goto done;
895 901 }
896 902 } else {
897 903 /*
898 904 * mp will contain the data to be sent out in the read reply.
899 905 * This will be freed after the reply has been sent out (by the
900 906 * driver).
901 907 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
902 908 * that the call to xdrmblk_putmblk() never fails.
903 909 */
904 910 mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
905 911 &alloc_err);
906 912 ASSERT(mp != NULL);
907 913 ASSERT(alloc_err == 0);
908 914
909 915 rr->rr_mp = mp;
910 916
911 917 /*
912 918 * Set up io vector
913 919 */
914 920 iov.iov_base = (caddr_t)mp->b_datap->db_base;
915 921 iov.iov_len = ra->ra_count;
916 922 }
917 923
918 924 uio.uio_iov = &iov;
919 925 uio.uio_iovcnt = 1;
920 926 uio.uio_segflg = UIO_SYSSPACE;
921 927 uio.uio_extflg = UIO_COPY_CACHED;
922 928 uio.uio_loffset = (offset_t)ra->ra_offset;
923 929 uio.uio_resid = ra->ra_count;
924 930
925 931 error = VOP_READ(vp, &uio, 0, cr, &ct);
926 932
927 933 if (error) {
928 934 if (mp)
929 935 freeb(mp);
930 936
931 937 /*
932 938 * check if a monitor detected a delegation conflict and
933 939 * mark as wouldblock so response is dropped
934 940 */
935 941 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
936 942 curthread->t_flag |= T_WOULDBLOCK;
937 943 else
938 944 rr->rr_status = puterrno(error);
939 945
940 946 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
941 947 if (in_crit)
942 948 nbl_end_crit(vp);
943 949
944 950 VN_RELE(vp);
945 951 rr->rr_data = NULL;
946 952
947 953 return;
948 954 }
949 955
950 956 /*
951 957 * Get attributes again so we can send the latest access
952 958 * time to the client side for its cache.
953 959 */
954 960 va.va_mask = AT_ALL;
955 961
956 962 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
957 963
958 964 if (error) {
959 965 if (mp)
960 966 freeb(mp);
961 967
962 968 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
963 969 if (in_crit)
964 970 nbl_end_crit(vp);
965 971
966 972 VN_RELE(vp);
967 973 rr->rr_data = NULL;
968 974 rr->rr_status = puterrno(error);
969 975
970 976 return;
971 977 }
972 978
973 979 rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
974 980
975 981 if (mp) {
976 982 rr->rr_data = (char *)mp->b_datap->db_base;
977 983 } else {
978 984 if (ra->ra_wlist) {
979 985 rr->rr_data = (caddr_t)iov.iov_base;
980 986 if (!rdma_setup_read_data2(ra, rr)) {
981 987 rr->rr_data = NULL;
982 988 rr->rr_status = puterrno(NFSERR_INVAL);
983 989 }
984 990 }
985 991 }
986 992 done:
987 993 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
988 994 if (in_crit)
989 995 nbl_end_crit(vp);
990 996
991 997 acl_perm(vp, exi, &va, cr);
992 998
993 999 /* check for overflows */
994 1000 error = vattr_to_nattr(&va, &rr->rr_attr);
995 1001
996 1002 VN_RELE(vp);
997 1003
998 1004 rr->rr_status = puterrno(error);
999 1005 }
1000 1006
1001 1007 /*
1002 1008 * Free data allocated by rfs_read
1003 1009 */
1004 1010 void
1005 1011 rfs_rdfree(struct nfsrdresult *rr)
1006 1012 {
1007 1013 mblk_t *mp;
1008 1014
1009 1015 if (rr->rr_status == NFS_OK) {
1010 1016 mp = rr->rr_mp;
1011 1017 if (mp != NULL)
1012 1018 freeb(mp);
1013 1019 }
1014 1020 }
1015 1021
1016 1022 void *
1017 1023 rfs_read_getfh(struct nfsreadargs *ra)
1018 1024 {
1019 1025 return (&ra->ra_fhandle);
1020 1026 }
1021 1027
1022 1028 #define MAX_IOVECS 12
1023 1029
1024 1030 #ifdef DEBUG
1025 1031 static int rfs_write_sync_hits = 0;
1026 1032 static int rfs_write_sync_misses = 0;
1027 1033 #endif
1028 1034
1029 1035 /*
1030 1036 * Write data to file.
1031 1037 * Returns attributes of a file after writing some data to it.
1032 1038 *
1033 1039 * Any changes made here, especially in error handling might have
1034 1040 * to also be done in rfs_write (which clusters write requests).
1035 1041 */
1036 1042 /* ARGSUSED */
1037 1043 void
1038 1044 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
1039 1045 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1040 1046 {
1041 1047 int error;
1042 1048 vnode_t *vp;
1043 1049 rlim64_t rlimit;
1044 1050 struct vattr va;
1045 1051 struct uio uio;
1046 1052 struct iovec iov[MAX_IOVECS];
1047 1053 mblk_t *m;
1048 1054 struct iovec *iovp;
1049 1055 int iovcnt;
1050 1056 cred_t *savecred;
1051 1057 int in_crit = 0;
1052 1058 caller_context_t ct;
1053 1059
1054 1060 vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1055 1061 if (vp == NULL) {
1056 1062 ns->ns_status = NFSERR_STALE;
1057 1063 return;
1058 1064 }
1059 1065
1060 1066 if (rdonly(ro, vp)) {
1061 1067 VN_RELE(vp);
1062 1068 ns->ns_status = NFSERR_ROFS;
1063 1069 return;
1064 1070 }
1065 1071
1066 1072 if (vp->v_type != VREG) {
1067 1073 VN_RELE(vp);
1068 1074 ns->ns_status = NFSERR_ISDIR;
1069 1075 return;
1070 1076 }
1071 1077
1072 1078 ct.cc_sysid = 0;
1073 1079 ct.cc_pid = 0;
1074 1080 ct.cc_caller_id = nfs2_srv_caller_id;
1075 1081 ct.cc_flags = CC_DONTBLOCK;
1076 1082
1077 1083 va.va_mask = AT_UID|AT_MODE;
1078 1084
1079 1085 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1080 1086
1081 1087 if (error) {
1082 1088 VN_RELE(vp);
1083 1089 ns->ns_status = puterrno(error);
1084 1090
1085 1091 return;
1086 1092 }
1087 1093
1088 1094 if (crgetuid(cr) != va.va_uid) {
1089 1095 /*
1090 1096 * This is a kludge to allow writes of files created
1091 1097 * with read only permission. The owner of the file
1092 1098 * is always allowed to write it.
1093 1099 */
1094 1100 error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
1095 1101
1096 1102 if (error) {
1097 1103 VN_RELE(vp);
1098 1104 ns->ns_status = puterrno(error);
1099 1105 return;
1100 1106 }
1101 1107 }
1102 1108
1103 1109 /*
1104 1110 * Can't access a mandatory lock file. This might cause
1105 1111 * the NFS service thread to block forever waiting for a
1106 1112 * lock to be released that will never be released.
1107 1113 */
1108 1114 if (MANDLOCK(vp, va.va_mode)) {
1109 1115 VN_RELE(vp);
1110 1116 ns->ns_status = NFSERR_ACCES;
1111 1117 return;
1112 1118 }
1113 1119
1114 1120 /*
1115 1121 * We have to enter the critical region before calling VOP_RWLOCK
1116 1122 * to avoid a deadlock with ufs.
1117 1123 */
1118 1124 if (nbl_need_check(vp)) {
1119 1125 nbl_start_crit(vp, RW_READER);
1120 1126 in_crit = 1;
1121 1127 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1122 1128 wa->wa_count, 0, NULL)) {
1123 1129 error = EACCES;
1124 1130 goto out;
1125 1131 }
1126 1132 }
1127 1133
1128 1134 error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1129 1135
1130 1136 /* check if a monitor detected a delegation conflict */
1131 1137 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1132 1138 goto out;
1133 1139 }
1134 1140
1135 1141 if (wa->wa_data || wa->wa_rlist) {
1136 1142 /* Do the RDMA thing if necessary */
1137 1143 if (wa->wa_rlist) {
1138 1144 iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1139 1145 iov[0].iov_len = wa->wa_count;
1140 1146 } else {
1141 1147 iov[0].iov_base = wa->wa_data;
1142 1148 iov[0].iov_len = wa->wa_count;
1143 1149 }
1144 1150 uio.uio_iov = iov;
1145 1151 uio.uio_iovcnt = 1;
1146 1152 uio.uio_segflg = UIO_SYSSPACE;
1147 1153 uio.uio_extflg = UIO_COPY_DEFAULT;
1148 1154 uio.uio_loffset = (offset_t)wa->wa_offset;
1149 1155 uio.uio_resid = wa->wa_count;
1150 1156 /*
1151 1157 * The limit is checked on the client. We
1152 1158 * should allow any size writes here.
1153 1159 */
1154 1160 uio.uio_llimit = curproc->p_fsz_ctl;
1155 1161 rlimit = uio.uio_llimit - wa->wa_offset;
1156 1162 if (rlimit < (rlim64_t)uio.uio_resid)
1157 1163 uio.uio_resid = (uint_t)rlimit;
1158 1164
1159 1165 /*
1160 1166 * for now we assume no append mode
1161 1167 */
1162 1168 /*
1163 1169 * We're changing creds because VM may fault and we need
1164 1170 * the cred of the current thread to be used if quota
1165 1171 * checking is enabled.
1166 1172 */
1167 1173 savecred = curthread->t_cred;
1168 1174 curthread->t_cred = cr;
1169 1175 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1170 1176 curthread->t_cred = savecred;
1171 1177 } else {
1172 1178
1173 1179 iovcnt = 0;
1174 1180 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1175 1181 iovcnt++;
1176 1182 if (iovcnt <= MAX_IOVECS) {
1177 1183 #ifdef DEBUG
1178 1184 rfs_write_sync_hits++;
1179 1185 #endif
1180 1186 iovp = iov;
1181 1187 } else {
1182 1188 #ifdef DEBUG
1183 1189 rfs_write_sync_misses++;
1184 1190 #endif
1185 1191 iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1186 1192 }
1187 1193 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1188 1194 uio.uio_iov = iovp;
1189 1195 uio.uio_iovcnt = iovcnt;
1190 1196 uio.uio_segflg = UIO_SYSSPACE;
1191 1197 uio.uio_extflg = UIO_COPY_DEFAULT;
1192 1198 uio.uio_loffset = (offset_t)wa->wa_offset;
1193 1199 uio.uio_resid = wa->wa_count;
1194 1200 /*
1195 1201 * The limit is checked on the client. We
1196 1202 * should allow any size writes here.
1197 1203 */
1198 1204 uio.uio_llimit = curproc->p_fsz_ctl;
1199 1205 rlimit = uio.uio_llimit - wa->wa_offset;
1200 1206 if (rlimit < (rlim64_t)uio.uio_resid)
1201 1207 uio.uio_resid = (uint_t)rlimit;
1202 1208
1203 1209 /*
1204 1210 * For now we assume no append mode.
1205 1211 */
1206 1212 /*
1207 1213 * We're changing creds because VM may fault and we need
1208 1214 * the cred of the current thread to be used if quota
1209 1215 * checking is enabled.
1210 1216 */
1211 1217 savecred = curthread->t_cred;
1212 1218 curthread->t_cred = cr;
1213 1219 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1214 1220 curthread->t_cred = savecred;
1215 1221
1216 1222 if (iovp != iov)
1217 1223 kmem_free(iovp, sizeof (*iovp) * iovcnt);
1218 1224 }
1219 1225
1220 1226 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1221 1227
1222 1228 if (!error) {
1223 1229 /*
1224 1230 * Get attributes again so we send the latest mod
1225 1231 * time to the client side for its cache.
1226 1232 */
1227 1233 va.va_mask = AT_ALL; /* now we want everything */
1228 1234
1229 1235 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1230 1236
1231 1237 /* check for overflows */
1232 1238 if (!error) {
1233 1239 acl_perm(vp, exi, &va, cr);
1234 1240 error = vattr_to_nattr(&va, &ns->ns_attr);
1235 1241 }
1236 1242 }
1237 1243
1238 1244 out:
1239 1245 if (in_crit)
1240 1246 nbl_end_crit(vp);
1241 1247 VN_RELE(vp);
1242 1248
1243 1249 /* check if a monitor detected a delegation conflict */
1244 1250 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1245 1251 /* mark as wouldblock so response is dropped */
1246 1252 curthread->t_flag |= T_WOULDBLOCK;
1247 1253 else
1248 1254 ns->ns_status = puterrno(error);
1249 1255
1250 1256 }
1251 1257
/*
 * One pending NFSv2 WRITE request, queued on a per-file cluster
 * (struct rfs_async_write_list) so that contiguous writes can be
 * coalesced into a single VOP_WRITE by the thread owning the cluster.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* decoded WRITE arguments */
	struct nfsattrstat *ns;		/* response; ns_status also acts as */
					/* "not yet done" state (RFSWRITE_INITVAL) */
	struct svc_req *req;		/* RPC request handle */
	cred_t *cr;			/* credentials of the requester */
	bool_t ro;			/* export is read-only for this client */
	kthread_t *thread;		/* service thread parked on this entry */
	struct rfs_async_write *list;	/* next request, sorted by wa_offset */
};
1261 1267
/*
 * A cluster of pending async WRITE requests to a single file, identified
 * by file handle.  Clusters are linked off the per-zone nfs_srv_t
 * async_write_head list and protected by its async_write_lock (see
 * rfs_write() below).
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle common to all requests */
	kcondvar_t cv;			/* broadcast when the cluster completes */
	struct rfs_async_write *list;	/* requests, ordered by start offset */
	struct rfs_async_write_list *next;	/* next cluster on the list */
};
1268 1274
/*
 * NOTE(review): rfs_write() below operates on the per-zone state in
 * nfs_srv_t (write_async, async_write_head, async_write_lock) rather than
 * on these file-scope variables; these three statics appear to be
 * leftovers from before the multi-zone rework — confirm there are no
 * remaining references elsewhere in this file before removing them.
 */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

/* Max iovecs gathered into one clustered VOP_WRITE before heap fallback. */
#define	MAXCLIOVECS	42
/* Sentinel meaning "response not filled in yet"; 0 would read as NFS_OK. */
#define	RFSWRITE_INITVAL	(enum nfsstat) -1

#ifdef DEBUG
/* counters: clusters that fit in the stack iovec array vs. needed kmem */
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif
1280 1286
1281 1287 /*
1282 1288 * Write data to file.
1283 1289 * Returns attributes of a file after writing some data to it.
1284 1290 */
/*
 * Write data to file.
 * Returns attributes of a file after writing some data to it.
 *
 * When write clustering is enabled (nsrv->write_async), concurrent WRITEs
 * to the same file handle are gathered into a cluster: the first thread
 * to arrive creates the cluster and does the actual VOP_WRITE work for
 * everyone; later arrivals insert themselves in offset order and sleep on
 * the cluster's cv until their ns_status is filled in.  A request's
 * ns_status == RFSWRITE_INITVAL means "not yet completed".
 */
void
rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;
	rlim64_t rlimit;
	struct vattr va;
	struct uio uio;
	struct rfs_async_write_list *lp;
	struct rfs_async_write_list *nlp;
	struct rfs_async_write *rp;
	struct rfs_async_write *nrp;
	struct rfs_async_write *trp;
	struct rfs_async_write *lrp;
	int data_written;
	int iovcnt;
	mblk_t *m;
	struct iovec *iovp;
	struct iovec *niovp;
	struct iovec iov[MAXCLIOVECS];
	int count;
	int rcount;
	uint_t off;
	uint_t len;
	struct rfs_async_write nrpsp;	/* this request's cluster entry (on stack) */
	struct rfs_async_write_list nlpsp;	/* cluster head if we start one (on stack) */
	ushort_t t_flag;
	cred_t *savecred;
	int in_crit = 0;
	caller_context_t ct;
	nfs_srv_t *nsrv;

	/* The export, if any, must belong to the zone servicing the request. */
	ASSERT(exi == NULL || exi->exi_zoneid == curzone->zone_id);
	nsrv = nfs_get_srv();
	if (!nsrv->write_async) {
		/* clustering disabled: fall back to the synchronous path */
		rfs_write_sync(wa, ns, exi, req, cr, ro);
		return;
	}

	/*
	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
	 * is considered an OK.
	 */
	ns->ns_status = RFSWRITE_INITVAL;

	nrp = &nrpsp;
	nrp->wa = wa;
	nrp->ns = ns;
	nrp->req = req;
	nrp->cr = cr;
	nrp->ro = ro;
	nrp->thread = curthread;

	/*
	 * Cluster entries live on service-thread stacks; those stacks must
	 * not be swapped out while other threads hold pointers into them.
	 */
	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);

	/*
	 * Look to see if there is already a cluster started
	 * for this file.
	 */
	mutex_enter(&nsrv->async_write_lock);
	for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
		if (bcmp(&wa->wa_fhandle, lp->fhp,
		    sizeof (fhandle_t)) == 0)
			break;
	}

	/*
	 * If lp is non-NULL, then there is already a cluster
	 * started.  We need to place ourselves in the cluster
	 * list in the right place as determined by starting
	 * offset.  Conflicts with non-blocking mandatory locked
	 * regions will be checked when the cluster is processed.
	 */
	if (lp != NULL) {
		rp = lp->list;
		trp = NULL;
		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
			trp = rp;
			rp = rp->list;
		}
		nrp->list = rp;
		if (trp == NULL)
			lp->list = nrp;
		else
			trp->list = nrp;
		/* Sleep until the cluster owner fills in our status. */
		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
			cv_wait(&lp->cv, &nsrv->async_write_lock);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * No cluster started yet, start one and add ourselves
	 * to the list of clusters.
	 */
	nrp->list = NULL;

	nlp = &nlpsp;
	nlp->fhp = &wa->wa_fhandle;
	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
	nlp->list = nrp;
	nlp->next = NULL;

	if (nsrv->async_write_head == NULL) {
		nsrv->async_write_head = nlp;
	} else {
		lp = nsrv->async_write_head;
		while (lp->next != NULL)
			lp = lp->next;
		lp->next = nlp;
	}
	mutex_exit(&nsrv->async_write_lock);

	/*
	 * Convert the file handle common to all of the requests
	 * in this cluster to a vnode.
	 */
	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
	if (vp == NULL) {
		/* Unlink the cluster and fail every queued request as STALE. */
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_STALE;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Can only write regular files.  Attempts to write any
	 * other file types fail with EISDIR.
	 */
	if (vp->v_type != VREG) {
		VN_RELE(vp);
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_ISDIR;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
	 * deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
	}

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * Lock the file for writing.  This operation provides
	 * the delay which allows clusters to grow.
	 */
	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		if (in_crit)
			nbl_end_crit(vp);
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			if (rp->ns->ns_status == RFSWRITE_INITVAL) {
				rp->ns->ns_status = puterrno(error);
				rp->thread->t_flag |= T_WOULDBLOCK;
			}
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Disconnect this cluster from the list of clusters.
	 * The cluster that is being dealt with must be fixed
	 * in size after this point, so there is no reason
	 * to leave it on the list so that new requests can
	 * find it.
	 *
	 * The algorithm is that the first write request will
	 * create a cluster, convert the file handle to a
	 * vnode pointer, and then lock the file for writing.
	 * This request is not likely to be clustered with
	 * any others.  However, the next request will create
	 * a new cluster and be blocked in VOP_RWLOCK while
	 * the first request is being processed.  This delay
	 * will allow more requests to be clustered in this
	 * second cluster.
	 */
	mutex_enter(&nsrv->async_write_lock);
	if (nsrv->async_write_head == nlp)
		nsrv->async_write_head = nlp->next;
	else {
		lp = nsrv->async_write_head;
		while (lp->next != nlp)
			lp = lp->next;
		lp->next = nlp->next;
	}
	mutex_exit(&nsrv->async_write_lock);

	/*
	 * Step through the list of requests in this cluster.
	 * We need to check permissions to make sure that all
	 * of the requests have sufficient permission to write
	 * the file.  A cluster can be composed of requests
	 * from different clients and different users on each
	 * client.
	 *
	 * As a side effect, we also calculate the size of the
	 * byte range that this cluster encompasses.
	 */
	rp = nlp->list;
	off = rp->wa->wa_offset;
	len = (uint_t)0;
	do {
		if (rdonly(rp->ro, vp)) {
			rp->ns->ns_status = NFSERR_ROFS;
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}

		va.va_mask = AT_UID|AT_MODE;

		error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);

		if (!error) {
			if (crgetuid(rp->cr) != va.va_uid) {
				/*
				 * This is a kludge to allow writes of files
				 * created with read only permission.  The
				 * owner of the file is always allowed to
				 * write it.
				 */
				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
			}
			if (!error && MANDLOCK(vp, va.va_mode))
				error = EACCES;
		}

		/*
		 * Check for a conflict with a nbmand-locked region.
		 */
		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
		    rp->wa->wa_count, 0, NULL)) {
			error = EACCES;
		}

		if (error) {
			rp->ns->ns_status = puterrno(error);
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}
		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
			len = rp->wa->wa_offset + rp->wa->wa_count - off;
	} while ((rp = rp->list) != NULL);

	/*
	 * Step through the cluster attempting to gather as many
	 * requests which are contiguous as possible.  These
	 * contiguous requests are handled via one call to VOP_WRITE
	 * instead of different calls to VOP_WRITE.  We also keep
	 * track of the fact that any data was written.
	 */
	rp = nlp->list;
	data_written = 0;
	do {
		/*
		 * Skip any requests which are already marked as having an
		 * error.
		 */
		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
			rp = rp->list;
			continue;
		}

		/*
		 * Count the number of iovec's which are required
		 * to handle this set of requests.  One iovec is
		 * needed for each data buffer, whether addressed
		 * by wa_data or by the b_rptr pointers in the
		 * mblk chains.
		 */
		iovcnt = 0;
		lrp = rp;
		for (;;) {
			if (lrp->wa->wa_data || lrp->wa->wa_rlist)
				iovcnt++;
			else {
				m = lrp->wa->wa_mblk;
				while (m != NULL) {
					iovcnt++;
					m = m->b_cont;
				}
			}
			/*
			 * The run ends at the last request, at the first
			 * already-failed request, or where the next
			 * request is not byte-contiguous with this one.
			 */
			if (lrp->list == NULL ||
			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
			    lrp->wa->wa_offset + lrp->wa->wa_count !=
			    lrp->list->wa->wa_offset) {
				lrp = lrp->list;
				break;
			}
			lrp = lrp->list;
		}

		if (iovcnt <= MAXCLIOVECS) {
#ifdef DEBUG
			rfs_write_hits++;
#endif
			niovp = iov;
		} else {
#ifdef DEBUG
			rfs_write_misses++;
#endif
			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
		}
		/*
		 * Put together the scatter/gather iovecs.
		 */
		iovp = niovp;
		trp = rp;
		count = 0;
		do {
			if (trp->wa->wa_data || trp->wa->wa_rlist) {
				if (trp->wa->wa_rlist) {
					iovp->iov_base =
					    (char *)((trp->wa->wa_rlist)->
					    u.c_daddr3);
					iovp->iov_len = trp->wa->wa_count;
				} else {
					iovp->iov_base = trp->wa->wa_data;
					iovp->iov_len = trp->wa->wa_count;
				}
				iovp++;
			} else {
				m = trp->wa->wa_mblk;
				rcount = trp->wa->wa_count;
				while (m != NULL) {
					iovp->iov_base = (caddr_t)m->b_rptr;
					iovp->iov_len = (m->b_wptr - m->b_rptr);
					/* clamp the final mblk to wa_count */
					rcount -= iovp->iov_len;
					if (rcount < 0)
						iovp->iov_len += rcount;
					iovp++;
					if (rcount <= 0)
						break;
					m = m->b_cont;
				}
			}
			count += trp->wa->wa_count;
			trp = trp->list;
		} while (trp != lrp);

		uio.uio_iov = niovp;
		uio.uio_iovcnt = iovcnt;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
		uio.uio_resid = count;
		/*
		 * The limit is checked on the client. We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - rp->wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * For now we assume no append mode.
		 */

		/*
		 * We're changing creds because VM may fault
		 * and we need the cred of the current
		 * thread to be used if quota * checking is
		 * enabled.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
		curthread->t_cred = savecred;

		/* check if a monitor detected a delegation conflict */
		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
			/* mark as wouldblock so response is dropped */
			curthread->t_flag |= T_WOULDBLOCK;

		if (niovp != iov)
			kmem_free(niovp, sizeof (*niovp) * iovcnt);

		if (!error) {
			data_written = 1;
			/*
			 * Get attributes again so we send the latest mod
			 * time to the client side for its cache.
			 */
			va.va_mask = AT_ALL;	/* now we want everything */

			error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);

			if (!error)
				acl_perm(vp, exi, &va, rp->cr);
		}

		/*
		 * Fill in the status responses for each request
		 * which was just handled.  Also, copy the latest
		 * attributes in to the attribute responses if
		 * appropriate.
		 */
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		do {
			rp->thread->t_flag |= t_flag;
			/* check for overflows */
			if (!error) {
				error = vattr_to_nattr(&va, &rp->ns->ns_attr);
			}
			rp->ns->ns_status = puterrno(error);
			rp = rp->list;
		} while (rp != lrp);
	} while (rp != NULL);

	/*
	 * If any data was written at all, then we need to flush
	 * the data and metadata to stable storage.
	 */
	if (data_written) {
		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);

		if (!error) {
			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
		}
	}

	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);

	if (in_crit)
		nbl_end_crit(vp);
	VN_RELE(vp);

	/* Complete any stragglers that never got a status, then wake all. */
	t_flag = curthread->t_flag & T_WOULDBLOCK;
	mutex_enter(&nsrv->async_write_lock);
	for (rp = nlp->list; rp != NULL; rp = rp->list) {
		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
			rp->ns->ns_status = puterrno(error);
			rp->thread->t_flag |= t_flag;
		}
	}
	cv_broadcast(&nlp->cv);
	mutex_exit(&nsrv->async_write_lock);

}
1781 1787 }
1782 1788
1783 1789 void *
1784 1790 rfs_write_getfh(struct nfswriteargs *wa)
1785 1791 {
1786 1792 return (&wa->wa_fhandle);
1787 1793 }
1788 1794
1789 1795 /*
1790 1796 * Create a file.
1791 1797 * Creates a file with given attributes and returns those attributes
1792 1798 * and an fhandle for the new file.
1793 1799 */
1794 1800 void
1795 1801 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1796 1802 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1797 1803 {
1798 1804 int error;
1799 1805 int lookuperr;
1800 1806 int in_crit = 0;
1801 1807 struct vattr va;
1802 1808 vnode_t *vp;
1803 1809 vnode_t *realvp;
1804 1810 vnode_t *dvp;
1805 1811 char *name = args->ca_da.da_name;
1806 1812 vnode_t *tvp = NULL;
1807 1813 int mode;
1808 1814 int lookup_ok;
1809 1815 bool_t trunc;
1810 1816 struct sockaddr *ca;
1811 1817
1812 1818 /*
1813 1819 * Disallow NULL paths
1814 1820 */
1815 1821 if (name == NULL || *name == '\0') {
1816 1822 dr->dr_status = NFSERR_ACCES;
1817 1823 return;
1818 1824 }
1819 1825
1820 1826 dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1821 1827 if (dvp == NULL) {
1822 1828 dr->dr_status = NFSERR_STALE;
1823 1829 return;
1824 1830 }
1825 1831
1826 1832 error = sattr_to_vattr(args->ca_sa, &va);
1827 1833 if (error) {
1828 1834 dr->dr_status = puterrno(error);
1829 1835 return;
1830 1836 }
1831 1837
1832 1838 /*
1833 1839 * Must specify the mode.
1834 1840 */
1835 1841 if (!(va.va_mask & AT_MODE)) {
1836 1842 VN_RELE(dvp);
1837 1843 dr->dr_status = NFSERR_INVAL;
1838 1844 return;
1839 1845 }
1840 1846
1841 1847 /*
1842 1848 * This is a completely gross hack to make mknod
1843 1849 * work over the wire until we can wack the protocol
1844 1850 */
1845 1851 if ((va.va_mode & IFMT) == IFCHR) {
1846 1852 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1847 1853 va.va_type = VFIFO; /* xtra kludge for named pipe */
1848 1854 else {
1849 1855 va.va_type = VCHR;
1850 1856 /*
1851 1857 * uncompress the received dev_t
1852 1858 * if the top half is zero indicating a request
1853 1859 * from an `older style' OS.
1854 1860 */
1855 1861 if ((va.va_size & 0xffff0000) == 0)
1856 1862 va.va_rdev = nfsv2_expdev(va.va_size);
1857 1863 else
1858 1864 va.va_rdev = (dev_t)va.va_size;
1859 1865 }
1860 1866 va.va_mask &= ~AT_SIZE;
1861 1867 } else if ((va.va_mode & IFMT) == IFBLK) {
1862 1868 va.va_type = VBLK;
1863 1869 /*
1864 1870 * uncompress the received dev_t
1865 1871 * if the top half is zero indicating a request
1866 1872 * from an `older style' OS.
1867 1873 */
1868 1874 if ((va.va_size & 0xffff0000) == 0)
1869 1875 va.va_rdev = nfsv2_expdev(va.va_size);
1870 1876 else
1871 1877 va.va_rdev = (dev_t)va.va_size;
1872 1878 va.va_mask &= ~AT_SIZE;
1873 1879 } else if ((va.va_mode & IFMT) == IFSOCK) {
1874 1880 va.va_type = VSOCK;
1875 1881 } else {
1876 1882 va.va_type = VREG;
1877 1883 }
1878 1884 va.va_mode &= ~IFMT;
1879 1885 va.va_mask |= AT_TYPE;
1880 1886
1881 1887 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1882 1888 name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1883 1889 MAXPATHLEN);
1884 1890 if (name == NULL) {
1885 1891 dr->dr_status = puterrno(EINVAL);
1886 1892 return;
1887 1893 }
1888 1894
1889 1895 /*
1890 1896 * Why was the choice made to use VWRITE as the mode to the
1891 1897 * call to VOP_CREATE ? This results in a bug. When a client
1892 1898 * opens a file that already exists and is RDONLY, the second
1893 1899 * open fails with an EACESS because of the mode.
1894 1900 * bug ID 1054648.
1895 1901 */
1896 1902 lookup_ok = 0;
1897 1903 mode = VWRITE;
1898 1904 if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1899 1905 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1900 1906 NULL, NULL, NULL);
1901 1907 if (!error) {
1902 1908 struct vattr at;
1903 1909
1904 1910 lookup_ok = 1;
1905 1911 at.va_mask = AT_MODE;
1906 1912 error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1907 1913 if (!error)
1908 1914 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1909 1915 VN_RELE(tvp);
1910 1916 tvp = NULL;
1911 1917 }
1912 1918 }
1913 1919
1914 1920 if (!lookup_ok) {
1915 1921 if (rdonly(ro, dvp)) {
1916 1922 error = EROFS;
1917 1923 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1918 1924 va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1919 1925 error = EPERM;
1920 1926 } else {
1921 1927 error = 0;
1922 1928 }
1923 1929 }
1924 1930
1925 1931 /*
1926 1932 * If file size is being modified on an already existing file
1927 1933 * make sure that there are no conflicting non-blocking mandatory
1928 1934 * locks in the region being manipulated. Return EACCES if there
1929 1935 * are conflicting locks.
1930 1936 */
1931 1937 if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1932 1938 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1933 1939 NULL, NULL, NULL);
1934 1940
1935 1941 if (!lookuperr &&
1936 1942 rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1937 1943 VN_RELE(tvp);
1938 1944 curthread->t_flag |= T_WOULDBLOCK;
1939 1945 goto out;
1940 1946 }
1941 1947
1942 1948 if (!lookuperr && nbl_need_check(tvp)) {
1943 1949 /*
1944 1950 * The file exists. Now check if it has any
1945 1951 * conflicting non-blocking mandatory locks
1946 1952 * in the region being changed.
1947 1953 */
1948 1954 struct vattr bva;
1949 1955 u_offset_t offset;
1950 1956 ssize_t length;
1951 1957
1952 1958 nbl_start_crit(tvp, RW_READER);
1953 1959 in_crit = 1;
1954 1960
1955 1961 bva.va_mask = AT_SIZE;
1956 1962 error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1957 1963 if (!error) {
1958 1964 if (va.va_size < bva.va_size) {
1959 1965 offset = va.va_size;
1960 1966 length = bva.va_size - va.va_size;
1961 1967 } else {
1962 1968 offset = bva.va_size;
1963 1969 length = va.va_size - bva.va_size;
1964 1970 }
1965 1971 if (length) {
1966 1972 if (nbl_conflict(tvp, NBL_WRITE,
1967 1973 offset, length, 0, NULL)) {
1968 1974 error = EACCES;
1969 1975 }
1970 1976 }
1971 1977 }
1972 1978 if (error) {
1973 1979 nbl_end_crit(tvp);
1974 1980 VN_RELE(tvp);
1975 1981 in_crit = 0;
1976 1982 }
1977 1983 } else if (tvp != NULL) {
1978 1984 VN_RELE(tvp);
1979 1985 }
1980 1986 }
1981 1987
1982 1988 if (!error) {
1983 1989 /*
1984 1990 * If filesystem is shared with nosuid the remove any
1985 1991 * setuid/setgid bits on create.
1986 1992 */
1987 1993 if (va.va_type == VREG &&
1988 1994 exi->exi_export.ex_flags & EX_NOSUID)
1989 1995 va.va_mode &= ~(VSUID | VSGID);
1990 1996
1991 1997 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1992 1998 NULL, NULL);
1993 1999
1994 2000 if (!error) {
1995 2001
1996 2002 if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1997 2003 trunc = TRUE;
1998 2004 else
1999 2005 trunc = FALSE;
2000 2006
2001 2007 if (rfs4_check_delegated(FWRITE, vp, trunc)) {
2002 2008 VN_RELE(vp);
2003 2009 curthread->t_flag |= T_WOULDBLOCK;
2004 2010 goto out;
2005 2011 }
2006 2012 va.va_mask = AT_ALL;
2007 2013
2008 2014 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
2009 2015
2010 2016 /* check for overflows */
2011 2017 if (!error) {
2012 2018 acl_perm(vp, exi, &va, cr);
2013 2019 error = vattr_to_nattr(&va, &dr->dr_attr);
2014 2020 if (!error) {
2015 2021 error = makefh(&dr->dr_fhandle, vp,
2016 2022 exi);
2017 2023 }
2018 2024 }
2019 2025 /*
2020 2026 * Force modified metadata out to stable storage.
2021 2027 *
2022 2028 * if a underlying vp exists, pass it to VOP_FSYNC
2023 2029 */
2024 2030 if (VOP_REALVP(vp, &realvp, NULL) == 0)
2025 2031 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
2026 2032 else
2027 2033 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2028 2034 VN_RELE(vp);
2029 2035 }
2030 2036
2031 2037 if (in_crit) {
2032 2038 nbl_end_crit(tvp);
2033 2039 VN_RELE(tvp);
2034 2040 }
2035 2041 }
2036 2042
2037 2043 /*
2038 2044 * Force modified data and metadata out to stable storage.
2039 2045 */
2040 2046 (void) VOP_FSYNC(dvp, 0, cr, NULL);
2041 2047
2042 2048 out:
2043 2049
2044 2050 VN_RELE(dvp);
2045 2051
2046 2052 dr->dr_status = puterrno(error);
2047 2053
2048 2054 if (name != args->ca_da.da_name)
2049 2055 kmem_free(name, MAXPATHLEN);
2050 2056 }
2051 2057 void *
2052 2058 rfs_create_getfh(struct nfscreatargs *args)
2053 2059 {
2054 2060 return (args->ca_da.da_fhandle);
2055 2061 }
2056 2062
2057 2063 /*
2058 2064 * Remove a file.
2059 2065 * Remove named file from parent directory.
2060 2066 */
/* ARGSUSED */
void
rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
	struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error = 0;
	vnode_t *vp;		/* parent directory */
	vnode_t *targvp;	/* file being removed */
	int in_crit = 0;	/* inside an nbmand critical region on targvp */

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share
	 * reservation: first look up the target so we can examine it.
	 */
	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(vp);
		*status = puterrno(error);
		return;
	}

	/*
	 * If the file is delegated to an v4 client, then initiate
	 * recall and drop this request (by setting T_WOULDBLOCK).
	 * The client will eventually re-transmit the request and
	 * (hopefully), by then, the v4 client will have returned
	 * the delegation.
	 */

	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
		VN_RELE(vp);
		VN_RELE(targvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Reject the remove if it conflicts with an nbmand lock/share. */
	if (nbl_need_check(targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(targvp);
	VN_RELE(targvp);
	VN_RELE(vp);

	*status = puterrno(error);

}
2142 2148
/*
 * Return the file handle embedded in the REMOVE arguments.
 */
void *
rfs_remove_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}
2148 2154
2149 2155 /*
2150 2156 * rename a file
2151 2157 * Give a file (from) a new name (to).
2152 2158 */
/* ARGSUSED */
void
rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error = 0;
	vnode_t *fromvp;	/* source directory */
	vnode_t *tovp;		/* target directory */
	struct exportinfo *to_exi;
	fhandle_t *fh;
	vnode_t *srcvp;		/* source file being renamed */
	vnode_t *targvp;	/* existing target file, if any */
	int in_crit = 0;	/* non-zero while inside an NBMAND critical region */

	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	fh = args->rna_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}
	exi_rele(to_exi);

	/*
	 * to_exi is only compared for identity below, never dereferenced,
	 * so dropping the hold before the comparison is safe.  Renames
	 * may not cross exports (NFSERR_XDEV).
	 */
	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		return;
	}

	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		return;
	}

	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}

	if (rdonly(ro, tovp)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Look up the source so we can check it for delegations and
	 * non-blocking mandatory share reservations before the rename.
	 */
	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = puterrno(error);
		return;
	}

	/* Check for delegations on the source file */

	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		VN_RELE(srcvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Check for delegation on the file being renamed over, if it exists */

	if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
	    NULL, NULL, NULL) == 0) {

		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
			VN_RELE(tovp);
			VN_RELE(fromvp);
			VN_RELE(srcvp);
			VN_RELE(targvp);
			curthread->t_flag |= T_WOULDBLOCK;
			return;
		}
		VN_RELE(targvp);
	}


	if (nbl_need_check(srcvp)) {
		nbl_start_crit(srcvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_RENAME(fromvp, args->rna_from.da_name,
	    tovp, args->rna_to.da_name, cr, NULL, 0);

	/* Keep the vnode-to-path cache current for the renamed file. */
	if (error == 0)
		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
		    strlen(args->rna_to.da_name));

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(srcvp);
	VN_RELE(srcvp);
	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

}
/*
 * Return the source file handle embedded in the RENAME arguments.
 */
void *
rfs_rename_getfh(struct nfsrnmargs *args)
{
	return (args->rna_from.da_fhandle);
}
2297 2303
2298 2304 /*
2299 2305 * Link to a file.
2300 2306 * Create a file (to) which is a hard link to the given file (from).
2301 2307 */
/* ARGSUSED */
void
rfs_link(struct nfslinkargs *args, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *fromvp;	/* existing file being linked to */
	vnode_t *tovp;		/* directory that receives the new link */
	struct exportinfo *to_exi;
	fhandle_t *fh;

	fromvp = nfs_fhtovp(args->la_from, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	fh = args->la_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}
	exi_rele(to_exi);

	/*
	 * to_exi is only compared for identity, never dereferenced, so
	 * the hold may be dropped first.  Links may not cross exports.
	 */
	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		return;
	}

	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		return;
	}

	if (tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		return;
	}
	/*
	 * Disallow NULL paths
	 */
	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}

	if (rdonly(ro, tovp)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		return;
	}

	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 * FNODSYNC on the source: only its attributes changed, not data.
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);

	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

}
/*
 * Return the source file handle embedded in the LINK arguments.
 */
void *
rfs_link_getfh(struct nfslinkargs *args)
{
	return (args->la_from);
}
2383 2389
2384 2390 /*
2385 2391 * Symbolicly link to a file.
2386 2392 * Create a file (to) with the given attributes which is a symbolic link
2387 2393 * to the given path name (to).
2388 2394 */
2389 2395 void
2390 2396 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2391 2397 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2392 2398 {
2393 2399 int error;
2394 2400 struct vattr va;
2395 2401 vnode_t *vp;
2396 2402 vnode_t *svp;
2397 2403 int lerror;
2398 2404 struct sockaddr *ca;
2399 2405 char *name = NULL;
2400 2406
2401 2407 /*
2402 2408 * Disallow NULL paths
2403 2409 */
2404 2410 if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2405 2411 *status = NFSERR_ACCES;
2406 2412 return;
2407 2413 }
2408 2414
2409 2415 vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2410 2416 if (vp == NULL) {
2411 2417 *status = NFSERR_STALE;
2412 2418 return;
2413 2419 }
2414 2420
2415 2421 if (rdonly(ro, vp)) {
2416 2422 VN_RELE(vp);
2417 2423 *status = NFSERR_ROFS;
2418 2424 return;
2419 2425 }
2420 2426
2421 2427 error = sattr_to_vattr(args->sla_sa, &va);
2422 2428 if (error) {
2423 2429 VN_RELE(vp);
2424 2430 *status = puterrno(error);
2425 2431 return;
2426 2432 }
2427 2433
2428 2434 if (!(va.va_mask & AT_MODE)) {
2429 2435 VN_RELE(vp);
2430 2436 *status = NFSERR_INVAL;
2431 2437 return;
2432 2438 }
2433 2439
2434 2440 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2435 2441 name = nfscmd_convname(ca, exi, args->sla_tnm,
2436 2442 NFSCMD_CONV_INBOUND, MAXPATHLEN);
2437 2443
2438 2444 if (name == NULL) {
2439 2445 *status = NFSERR_ACCES;
2440 2446 return;
2441 2447 }
2442 2448
2443 2449 va.va_type = VLNK;
2444 2450 va.va_mask |= AT_TYPE;
2445 2451
2446 2452 error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2447 2453
2448 2454 /*
2449 2455 * Force new data and metadata out to stable storage.
2450 2456 */
2451 2457 lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2452 2458 NULL, cr, NULL, NULL, NULL);
2453 2459
2454 2460 if (!lerror) {
2455 2461 (void) VOP_FSYNC(svp, 0, cr, NULL);
2456 2462 VN_RELE(svp);
2457 2463 }
2458 2464
2459 2465 /*
2460 2466 * Force modified data and metadata out to stable storage.
2461 2467 */
2462 2468 (void) VOP_FSYNC(vp, 0, cr, NULL);
2463 2469
2464 2470 VN_RELE(vp);
2465 2471
2466 2472 *status = puterrno(error);
2467 2473 if (name != args->sla_tnm)
2468 2474 kmem_free(name, MAXPATHLEN);
2469 2475
2470 2476 }
/*
 * Return the directory file handle embedded in the SYMLINK arguments.
 */
void *
rfs_symlink_getfh(struct nfsslargs *args)
{
	return (args->sla_from.da_fhandle);
}
2476 2482
2477 2483 /*
2478 2484 * Make a directory.
2479 2485 * Create a directory with the given name, parent directory, and attributes.
2480 2486 * Returns a file handle and attributes for the new directory.
2481 2487 */
/* ARGSUSED */
void
rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	struct vattr va;
	vnode_t *dvp = NULL;	/* the newly created directory */
	vnode_t *vp;		/* parent directory */
	char *name = args->ca_da.da_name;

	/*
	 * Disallow NULL paths
	 */
	if (name == NULL || *name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
	if (vp == NULL) {
		dr->dr_status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_ROFS;
		return;
	}

	error = sattr_to_vattr(args->ca_sa, &va);
	if (error) {
		VN_RELE(vp);
		dr->dr_status = puterrno(error);
		return;
	}

	/* Mode is mandatory for MKDIR. */
	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_INVAL;
		return;
	}

	va.va_type = VDIR;
	va.va_mask |= AT_TYPE;

	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);

	if (!error) {
		/*
		 * Attributes of the newly created directory should
		 * be returned to the client.
		 */
		va.va_mask = AT_ALL; /* We want everything */
		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				error = makefh(&dr->dr_fhandle, dvp, exi);
			}
		}
		/*
		 * Force new data and metadata out to stable storage.
		 */
		(void) VOP_FSYNC(dvp, 0, cr, NULL);
		VN_RELE(dvp);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	dr->dr_status = puterrno(error);

}
/*
 * Return the parent directory file handle embedded in the MKDIR arguments.
 */
void *
rfs_mkdir_getfh(struct nfscreatargs *args)
{
	return (args->ca_da.da_fhandle);
}
2569 2575
2570 2576 /*
2571 2577 * Remove a directory.
2572 2578 * Remove the given directory name from the given parent directory.
2573 2579 */
/* ARGSUSED */
void
rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;	/* parent directory */

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * VOP_RMDIR takes a third argument (the current
	 * directory of the process).  That's because someone
	 * wants to return EINVAL if one tries to remove ".".
	 * Of course, NFS servers have no idea what their
	 * clients' current directories are.  We fake it by
	 * supplying a vnode known to exist and illegal to
	 * remove (the zone's root vnode).
	 */
	error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	/*
	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
	 * if the directory is not empty.  A System V NFS server
	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
	 * over the wire.
	 */
	if (error == EEXIST)
		*status = NFSERR_NOTEMPTY;
	else
		*status = puterrno(error);

}
/*
 * Return the parent directory file handle embedded in the RMDIR arguments.
 */
void *
rfs_rmdir_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}
2637 2643
/* ARGSUSED */
void
rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int iseof;		/* set by VOP_READDIR when end of dir reached */
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	char *ndata = NULL;	/* charset-converted entry buffer, if any */
	struct sockaddr *ca;
	size_t nents;
	int ret;

	vp = nfs_fhtovp(&rda->rda_fh, exi);
	if (vp == NULL) {
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_STALE;
		return;
	}

	if (vp->v_type != VDIR) {
		VN_RELE(vp);
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_NOTDIR;
		return;
	}

	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);

	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);

	if (error) {
		rd->rd_entries = NULL;
		goto bad;
	}

	/* A zero-byte request is answered with an empty, non-EOF reply. */
	if (rda->rda_count == 0) {
		rd->rd_entries = NULL;
		rd->rd_size = 0;
		rd->rd_eof = FALSE;
		goto bad;
	}

	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);

	/*
	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
	 */
	rd->rd_bufsize = (uint_t)rda->rda_count;
	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);

	/*
	 * Set up io vector to read directory data
	 */
	iov.iov_base = (caddr_t)rd->rd_entries;
	iov.iov_len = rda->rda_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)rda->rda_offset;
	uio.uio_resid = rda->rda_count;

	/*
	 * read directory
	 */
	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);

	/*
	 * Clean up
	 */
	if (!error) {
		/*
		 * set size and eof
		 */
		if (uio.uio_resid == rda->rda_count) {
			rd->rd_size = 0;
			rd->rd_eof = TRUE;
		} else {
			rd->rd_size = (uint32_t)(rda->rda_count -
			    uio.uio_resid);
			rd->rd_eof = iseof ? TRUE : FALSE;
		}
	}

	/*
	 * NOTE(review): this conversion block runs even when VOP_READDIR
	 * failed, in which case rd->rd_size was not set by this function.
	 * Presumably the response struct arrives zeroed from the dispatcher
	 * so rd_size is 0 here -- confirm against the caller.
	 */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
	ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
	    rda->rda_count, &ndata);

	if (ret != 0) {
		size_t dropbytes;
		/*
		 * We had to drop one or more entries in order to fit
		 * during the character conversion.  We need to patch
		 * up the size and eof info.
		 */
		if (rd->rd_eof)
			rd->rd_eof = FALSE;
		dropbytes = nfscmd_dropped_entrysize(
		    (struct dirent64 *)rd->rd_entries, nents, ret);
		rd->rd_size -= dropbytes;
	}
	if (ndata == NULL) {
		/* no conversion took place; reply with the raw entries */
		ndata = (char *)rd->rd_entries;
	} else if (ndata != (char *)rd->rd_entries) {
		/* conversion allocated a new buffer; swap it in */
		kmem_free(rd->rd_entries, rd->rd_bufsize);
		rd->rd_entries = (void *)ndata;
		rd->rd_bufsize = rda->rda_count;
	}

bad:
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
#endif

	VN_RELE(vp);

	rd->rd_status = puterrno(error);

}
/*
 * Return the directory file handle embedded in the READDIR arguments.
 */
void *
rfs_readdir_getfh(struct nfsrddirargs *rda)
{
	return (&rda->rda_fh);
}
2776 2782 void
2777 2783 rfs_rddirfree(struct nfsrddirres *rd)
2778 2784 {
2779 2785 if (rd->rd_entries != NULL)
2780 2786 kmem_free(rd->rd_entries, rd->rd_bufsize);
2781 2787 }
2782 2788
2783 2789 /* ARGSUSED */
2784 2790 void
2785 2791 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2786 2792 struct svc_req *req, cred_t *cr, bool_t ro)
2787 2793 {
2788 2794 int error;
2789 2795 struct statvfs64 sb;
2790 2796 vnode_t *vp;
2791 2797
2792 2798 vp = nfs_fhtovp(fh, exi);
2793 2799 if (vp == NULL) {
2794 2800 fs->fs_status = NFSERR_STALE;
2795 2801 return;
2796 2802 }
2797 2803
2798 2804 error = VFS_STATVFS(vp->v_vfsp, &sb);
2799 2805
2800 2806 if (!error) {
2801 2807 fs->fs_tsize = nfstsize();
2802 2808 fs->fs_bsize = sb.f_frsize;
2803 2809 fs->fs_blocks = sb.f_blocks;
2804 2810 fs->fs_bfree = sb.f_bfree;
2805 2811 fs->fs_bavail = sb.f_bavail;
2806 2812 }
2807 2813
2808 2814 VN_RELE(vp);
2809 2815
2810 2816 fs->fs_status = puterrno(error);
2811 2817
2812 2818 }
/*
 * The STATFS argument is itself the file handle.
 */
void *
rfs_statfs_getfh(fhandle_t *fh)
{
	return (fh);
}
2818 2824
/*
 * Convert NFSv2 over-the-wire settable attributes (nfssattr) into a
 * vattr, setting va_mask bits only for fields the client actually
 * supplied.  The wire protocol uses all-ones as the "not set" sentinel.
 * Returns 0, or EOVERFLOW on 32-bit kernels if a time won't fit.
 */
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short.  When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}
2885 2891
/*
 * Map vnode type (vtype_t, used as the index) to the NFSv2
 * over-the-wire file type.  Types with no NFSv2 representation map
 * to 0; VFIFO also maps to 0 here and is instead remapped by
 * vattr_to_nattr via NA_SETFIFO (see the note there).
 */
static const enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
2889 2895
2890 2896 /*
2891 2897 * check the following fields for overflow: nodeid, size, and time.
2892 2898 * There could be a problem when converting 64-bit LP64 fields
2893 2899 * into 32-bit ones. Return an error if there is an overflow.
2894 2900 */
/*
 * Convert a vattr into NFSv2 over-the-wire attributes (nfsfattr).
 * check the following fields for overflow: nodeid, size, and time.
 * There could be a problem when converting 64-bit LP64 fields
 * into 32-bit ones.  Return an error (EFBIG/EOVERFLOW) if there is
 * an overflow; 0 on success.
 */
int
vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
{
	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
	na->na_type = vt_to_nf[vap->va_type];

	if (vap->va_mode == (unsigned short) -1)
		na->na_mode = (uint32_t)-1;
	else
		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;

	/* Map the local "nobody" ids to their over-the-wire values. */
	if (vap->va_uid == (unsigned short)(-1))
		na->na_uid = (uint32_t)(-1);
	else if (vap->va_uid == UID_NOBODY)
		na->na_uid = (uint32_t)NFS_UID_NOBODY;
	else
		na->na_uid = vap->va_uid;

	if (vap->va_gid == (unsigned short)(-1))
		na->na_gid = (uint32_t)-1;
	else if (vap->va_gid == GID_NOBODY)
		na->na_gid = (uint32_t)NFS_GID_NOBODY;
	else
		na->na_gid = vap->va_gid;

	/*
	 * Do we need to check fsid for overflow?  It is 64-bit in the
	 * vattr, but are bigger than 32 bit values supported?
	 */
	na->na_fsid = vap->va_fsid;

	na->na_nodeid = vap->va_nodeid;

	/*
	 * Check to make sure that the nodeid is representable over the
	 * wire without losing bits.
	 */
	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
		return (EFBIG);
	na->na_nlink = vap->va_nlink;

	/*
	 * Check for big files here, instead of at the caller.  See
	 * comments in cstat for large special file explanation.
	 */
	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
			return (EFBIG);
		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
			/* UNKNOWN_SIZE | OVERFLOW */
			na->na_size = MAXOFF32_T;
		} else
			na->na_size = vap->va_size;
	} else
		na->na_size = vap->va_size;

	/*
	 * If the vnode times overflow the 32-bit times that NFS2
	 * uses on the wire then return an error.
	 */
	if (!NFS_VAP_TIME_OK(vap)) {
		return (EOVERFLOW);
	}
	na->na_atime.tv_sec = vap->va_atime.tv_sec;
	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;

	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;

	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;

	/*
	 * If the dev_t will fit into 16 bits then compress
	 * it, otherwise leave it alone.  See comments in
	 * nfs_client.c.
	 */
	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
	else
		(void) cmpldev(&na->na_rdev, vap->va_rdev);

	na->na_blocks = vap->va_nblocks;
	na->na_blocksize = vap->va_blksize;

	/*
	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
	 *
	 * BUYER BEWARE:
	 *  If you are porting the NFS to a non-Sun server, you probably
	 *  don't want to include the following block of code.  The
	 *  over-the-wire special file types will be changing with the
	 *  NFS Protocol Revision.
	 */
	if (vap->va_type == VFIFO)
		NA_SETFIFO(na);
	return (0);
}
2996 3002
2997 3003 /*
2998 3004 * acl v2 support: returns approximate permission.
2999 3005 * default: returns minimal permission (more restrictive)
3000 3006 * aclok: returns maximal permission (less restrictive)
3001 3007 * This routine changes the permissions that are alaredy in *va.
3002 3008 * If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
3003 3009 * CLASS_OBJ is always the same as GROUP_OBJ entry.
3004 3010 */
3005 3011 static void
3006 3012 acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
3007 3013 {
3008 3014 vsecattr_t vsa;
3009 3015 int aclcnt;
3010 3016 aclent_t *aclentp;
3011 3017 mode_t mask_perm;
3012 3018 mode_t grp_perm;
3013 3019 mode_t other_perm;
3014 3020 mode_t other_orig;
3015 3021 int error;
3016 3022
3017 3023 /* dont care default acl */
3018 3024 vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
3019 3025 error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
3020 3026
3021 3027 if (!error) {
3022 3028 aclcnt = vsa.vsa_aclcnt;
3023 3029 if (aclcnt > MIN_ACL_ENTRIES) {
3024 3030 /* non-trivial ACL */
3025 3031 aclentp = vsa.vsa_aclentp;
3026 3032 if (exi->exi_export.ex_flags & EX_ACLOK) {
3027 3033 /* maximal permissions */
3028 3034 grp_perm = 0;
3029 3035 other_perm = 0;
3030 3036 for (; aclcnt > 0; aclcnt--, aclentp++) {
3031 3037 switch (aclentp->a_type) {
3032 3038 case USER_OBJ:
3033 3039 break;
3034 3040 case USER:
3035 3041 grp_perm |=
3036 3042 aclentp->a_perm << 3;
3037 3043 other_perm |= aclentp->a_perm;
3038 3044 break;
3039 3045 case GROUP_OBJ:
3040 3046 grp_perm |=
3041 3047 aclentp->a_perm << 3;
3042 3048 break;
3043 3049 case GROUP:
3044 3050 other_perm |= aclentp->a_perm;
3045 3051 break;
3046 3052 case OTHER_OBJ:
3047 3053 other_orig = aclentp->a_perm;
3048 3054 break;
3049 3055 case CLASS_OBJ:
3050 3056 mask_perm = aclentp->a_perm;
3051 3057 break;
3052 3058 default:
3053 3059 break;
3054 3060 }
3055 3061 }
3056 3062 grp_perm &= mask_perm << 3;
3057 3063 other_perm &= mask_perm;
3058 3064 other_perm |= other_orig;
3059 3065
3060 3066 } else {
3061 3067 /* minimal permissions */
3062 3068 grp_perm = 070;
3063 3069 other_perm = 07;
3064 3070 for (; aclcnt > 0; aclcnt--, aclentp++) {
3065 3071 switch (aclentp->a_type) {
3066 3072 case USER_OBJ:
3067 3073 break;
3068 3074 case USER:
3069 3075 case CLASS_OBJ:
3070 3076 grp_perm &=
3071 3077 aclentp->a_perm << 3;
3072 3078 other_perm &=
3073 3079 aclentp->a_perm;
3074 3080 break;
3075 3081 case GROUP_OBJ:
3076 3082 grp_perm &=
3077 3083 aclentp->a_perm << 3;
3078 3084 break;
3079 3085 case GROUP:
3080 3086 other_perm &=
3081 3087 aclentp->a_perm;
3082 3088 break;
3083 3089 case OTHER_OBJ:
3084 3090 other_perm &=
3085 3091 aclentp->a_perm;
3086 3092 break;
3087 3093 default:
3088 3094 break;
3089 3095 }
3090 3096 }
3091 3097 }
3092 3098 /* copy to va */
3093 3099 va->va_mode &= ~077;
3094 3100 va->va_mode |= grp_perm | other_perm;
3095 3101 }
|
↓ open down ↓ |
1766 lines elided |
↑ open up ↑ |
3096 3102 if (vsa.vsa_aclcnt)
3097 3103 kmem_free(vsa.vsa_aclentp,
3098 3104 vsa.vsa_aclcnt * sizeof (aclent_t));
3099 3105 }
3100 3106 }
3101 3107
/*
 * One-time (module) NFSv2 server initialization.  Per-zone state is
 * now set up separately via rfs_srv_zone_init (the zone_key_create
 * call was removed from here per this change).
 */
void
rfs_srvrinit(void)
{
	nfs2_srv_caller_id = fs_new_caller_id();
}
3108 3113
/*
 * Module teardown counterpart of rfs_srvrinit; nothing to release.
 */
void
rfs_srvrfini(void)
{
}
3113 3118
/*
 * Allocate and initialize the per-zone NFSv2 server state, hanging it
 * off the zone's nfs_globals so it shares the globals' lifetime.
 */
/* ARGSUSED */
void
rfs_srv_zone_init(nfs_globals_t *ng)
{
	nfs_srv_t *ns;

	ns = kmem_zalloc(sizeof (*ns), KM_SLEEP);

	/* Async (unstable) writes are enabled by default. */
	mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
	ns->write_async = 1;

	ng->nfs_srv = ns;
}
3127 3132
/*
 * Tear down the per-zone NFSv2 server state created by
 * rfs_srv_zone_init.  The pointer is cleared before teardown so the
 * globals never reference freed memory.
 */
/* ARGSUSED */
void
rfs_srv_zone_fini(nfs_globals_t *ng)
{
	nfs_srv_t *ns = ng->nfs_srv;

	ng->nfs_srv = NULL;

	mutex_destroy(&ns->async_write_lock);
	kmem_free(ns, sizeof (*ns));
}
3138 3144
3139 3145 static int
3140 3146 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
3141 3147 {
3142 3148 struct clist *wcl;
3143 3149 int wlist_len;
3144 3150 uint32_t count = rr->rr_count;
3145 3151
3146 3152 wcl = ra->ra_wlist;
3147 3153
3148 3154 if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3149 3155 return (FALSE);
3150 3156 }
3151 3157
3152 3158 wcl = ra->ra_wlist;
3153 3159 rr->rr_ok.rrok_wlist_len = wlist_len;
3154 3160 rr->rr_ok.rrok_wlist = wcl;
3155 3161
3156 3162 return (TRUE);
3157 3163 }
|
↓ open down ↓ |
13 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX