vgusev-nfsstat Wdiff usr/src/uts/common/fs/nfs/nfs_srv.c

Print this page

nfssrv: nfsstat reports zeroed data in zone

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/nfs/nfs_srv.c
          +++ new/usr/src/uts/common/fs/nfs/nfs_srv.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  25   25   * Copyright (c) 2016 by Delphix. All rights reserved.
  26   26   */
  27   27  
  28   28  /*
  29   29   *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  30   30   *      All rights reserved.
  31   31   */
  32   32  
  33   33  /*
  34   34   * Copyright 2018 Nexenta Systems, Inc.
  35   35   * Copyright (c) 2016 by Delphix. All rights reserved.
  36   36   */
  37   37  
  38   38  #include <sys/param.h>
  39   39  #include <sys/types.h>
  40   40  #include <sys/systm.h>
  41   41  #include <sys/cred.h>
  42   42  #include <sys/buf.h>
  43   43  #include <sys/vfs.h>
  44   44  #include <sys/vnode.h>
  45   45  #include <sys/uio.h>
  46   46  #include <sys/stat.h>
  47   47  #include <sys/errno.h>
  48   48  #include <sys/sysmacros.h>
  49   49  #include <sys/statvfs.h>
  50   50  #include <sys/kmem.h>
  51   51  #include <sys/kstat.h>
  52   52  #include <sys/dirent.h>
  53   53  #include <sys/cmn_err.h>
  54   54  #include <sys/debug.h>
  55   55  #include <sys/vtrace.h>
  56   56  #include <sys/mode.h>
  57   57  #include <sys/acl.h>
  58   58  #include <sys/nbmlock.h>
  59   59  #include <sys/policy.h>
  60   60  #include <sys/sdt.h>
  61   61  
  62   62  #include <rpc/types.h>
  63   63  #include <rpc/auth.h>
  64   64  #include <rpc/svc.h>
  65   65  
  66   66  #include <nfs/nfs.h>
  67   67  #include <nfs/export.h>
  68   68  #include <nfs/nfs_cmd.h>
  69   69  
  70   70  #include <vm/hat.h>
  71   71  #include <vm/as.h>
  72   72  #include <vm/seg.h>
  73   73  #include <vm/seg_map.h>
  74   74  #include <vm/seg_kmem.h>
  75   75  
  76   76  #include <sys/strsubr.h>
  77   77  
  78   78  struct rfs_async_write_list;
  79   79  
  80   80  /*
  81   81   * Zone globals of NFSv2 server
            *
            * The instance belonging to the current zone is reached through that
            * zone's nfs_globals_t (member nfs_srv); see nfs_get_srv().
  82   82   */
  83   83  typedef struct nfs_srv {
                   /*
                    * NOTE(review): presumably async_write_lock protects the
                    * async_write_head list; neither member is used in the part
                    * of the file reviewed here -- confirm against the write path.
                    */
  84   84          kmutex_t                        async_write_lock;
  85   85          struct rfs_async_write_list     *async_write_head;
  86   86  
  87   87          /*
  88   88           * enables write clustering if == 1
  89   89           */
  90   90          int             write_async;
  91   91  } nfs_srv_t;
  92   92  
  93   93  /*
  94   94   * These are the interface routines for the server side of the
  95   95   * Network File System.  See the NFS version 2 protocol specification
  96   96   * for a description of this interface.
  97   97   */
  98   98  
           /*
            * Forward declarations of file-local helpers used by the routines below.
            * sattr_to_vattr() returns non-zero on failure (see rfs_setattr());
            * acl_perm() is called on a vattr before it is converted for the reply.
            */
  99   99  static int      sattr_to_vattr(struct nfssattr *, struct vattr *);
 100  100  static void     acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
 101  101                          cred_t *);
 102  102  
 103  103  
 104  104  /*
 105  105   * Some "over the wire" UNIX file types.  These are encoded
 106  106   * into the mode.  This needs to be fixed in the next rev.
            *
            * The values are octal.  IFMT is the mask that covers the file type
            * bits; the remaining constants are individual type values within it.
 107  107   */
 108  108  #define IFMT            0170000         /* type of file */
 109  109  #define IFCHR           0020000         /* character special */
 110  110  #define IFBLK           0060000         /* block special */
 111  111  #define IFSOCK          0140000         /* socket */
 112  112  
           /*
            * Caller id stored in caller_context_t.cc_caller_id for the VOP calls
            * issued by the NFSv2 server (see rfs_setattr() and rfs_read()).
            * NOTE(review): it is not assigned in the part of the file reviewed
            * here -- confirm where it is initialized.
            */
 113  113  u_longlong_t nfs2_srv_caller_id;
 114  114  
 115  115  static nfs_srv_t *
 116  116  nfs_get_srv(void)
 117  117  {
                   /*
                    * Return the NFSv2 server state (nfs_srv_t) of the current zone.
                    *
                    * The zone-specific nfs_globals_t is fetched with the
                    * nfssrv_zone_key for curzone and its nfs_srv member is handed
                    * back.  ng is dereferenced without a NULL check; only the
                    * resulting srv pointer is ASSERTed.
                    */
 118  118          nfs_globals_t *ng = zone_getspecific(nfssrv_zone_key, curzone);
 119  119          nfs_srv_t *srv = ng->nfs_srv;
 120  120          ASSERT(srv != NULL);
 121  121          return (srv);
 122  122  }
 123  123  
 124  124  /*
 125  125   * Get file attributes.
 126  126   * Returns the current attributes of the file with the given fhandle.
            *
            *   fhp - file handle of the object to query
            *   ns  - reply; ns_status is always set, ns_attr is filled in on success
            *   exi - export the handle is resolved against (also passed to acl_perm)
            *   req - unused
            *   cr  - credentials of the caller
            *   ro  - unused
            *
            * A handle that nfs_fhtovp() cannot resolve yields NFSERR_STALE; every
            * other outcome is reported through puterrno().
 127  127   */
 128  128  /* ARGSUSED */
 129  129  void
 130  130  rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
 131  131      struct svc_req *req, cred_t *cr, bool_t ro)
 132  132  {
 133  133          int error;
 134  134          vnode_t *vp;
 135  135          struct vattr va;
 136  136  
 137  137          vp = nfs_fhtovp(fhp, exi);
 138  138          if (vp == NULL) {
 139  139                  ns->ns_status = NFSERR_STALE;
 140  140                  return;
 141  141          }
 142  142  
 143  143          /*
 144  144           * Do the getattr.
 145  145           */
 146  146          va.va_mask = AT_ALL;    /* we want all the attributes */
 147  147  
 148  148          error = rfs4_delegated_getattr(vp, &va, 0, cr);
 149  149  
 150  150          /* check for overflows */
 151  151          if (!error) {
 152  152                  /* Lie about the object type for a referral */
 153  153                  if (vn_is_nfs_reparse(vp, cr))
 154  154                          va.va_type = VLNK;
 155  155  
                           /*
                            * acl_perm() may adjust va before it is converted into
                            * the over-the-wire attributes; the conversion itself can
                            * fail, and that error becomes the reply status.
                            */
 156  156                  acl_perm(vp, exi, &va, cr);
 157  157                  error = vattr_to_nattr(&va, &ns->ns_attr);
 158  158          }
 159  159  
                   /* Done with the vnode obtained from nfs_fhtovp(). */
 160  160          VN_RELE(vp);
 161  161  
 162  162          ns->ns_status = puterrno(error);
 163  163  }
 164  164  void *
 165  165  rfs_getattr_getfh(fhandle_t *fhp)
 166  166  {
                   /*
                    * The GETATTR argument is the file handle itself, so it is
                    * returned unchanged (as a void pointer).
                    */
 167  167          return (fhp);
 168  168  }
 169  169  
 170  170  /*
 171  171   * Set file attributes.
 172  172   * Sets the attributes of the file with the given fhandle.  Returns
 173  173   * the new attributes.
            *
            *   args - request: saa_fh is the file handle, saa_sa the new attributes
            *   ns   - reply; ns_attr is filled in on success, ns_status carries the
            *          result (it is left untouched on the would-block path below)
            *   exi  - export of the file; its ex_flags are checked for EX_NOSUID
            *   req  - unused
            *   cr   - credentials of the caller
            *   ro   - passed to rdonly() to reject changes with NFSERR_ROFS
            *
            * Size changes on regular files made by the file owner are carried out
            * with VOP_SPACE; everything else goes through VOP_SETATTR.  Afterwards
            * the attributes are re-read for the reply and the metadata is synced.
 174  174   */
 175  175  /* ARGSUSED */
 176  176  void
 177  177  rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
 178  178      struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 179  179  {
 180  180          int error;
 181  181          int flag;
 182  182          int in_crit = 0;
 183  183          vnode_t *vp;
 184  184          struct vattr va;
 185  185          struct vattr bva;
 186  186          struct flock64 bf;
 187  187          caller_context_t ct;
 188  188  
 189  189  
 190  190          vp = nfs_fhtovp(&args->saa_fh, exi);
 191  191          if (vp == NULL) {
 192  192                  ns->ns_status = NFSERR_STALE;
 193  193                  return;
 194  194          }
 195  195  
 196  196          if (rdonly(ro, vp)) {
 197  197                  VN_RELE(vp);
 198  198                  ns->ns_status = NFSERR_ROFS;
 199  199                  return;
 200  200          }
 201  201  
                   /* Convert the over-the-wire attributes; va.va_mask says what to set. */
 202  202          error = sattr_to_vattr(&args->saa_sa, &va);
 203  203          if (error) {
 204  204                  VN_RELE(vp);
 205  205                  ns->ns_status = puterrno(error);
 206  206                  return;
 207  207          }
 208  208  
 209  209          /*
 210  210           * If the client is requesting a change to the mtime,
 211  211           * but the nanosecond field is set to 1 billion, then
 212  212           * this is a flag to the server that it should set the
 213  213           * atime and mtime fields to the server's current time.
 214  214           * The 1 billion number actually came from the client
 215  215           * as 1 million, but the units in the over the wire
 216  216           * request are microseconds instead of nanoseconds.
 217  217           *
 218  218           * This is an overload of the protocol and should be
 219  219           * documented in the NFS Version 2 protocol specification.
                    *
                    * flag is passed to VOP_SETATTR below: ATTR_UTIME only when the
                    * client supplied an explicit mtime, 0 otherwise.
 220  220           */
 221  221          if (va.va_mask & AT_MTIME) {
 222  222                  if (va.va_mtime.tv_nsec == 1000000000) {
 223  223                          gethrestime(&va.va_mtime);
 224  224                          va.va_atime = va.va_mtime;
 225  225                          va.va_mask |= AT_ATIME;
 226  226                          flag = 0;
 227  227                  } else
 228  228                          flag = ATTR_UTIME;
 229  229          } else
 230  230                  flag = 0;
 231  231  
 232  232          /*
 233  233           * If the filesystem is exported with nosuid, then mask off
 234  234           * the setuid and setgid bits.
 235  235           */
 236  236          if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
 237  237              (exi->exi_export.ex_flags & EX_NOSUID))
 238  238                  va.va_mode &= ~(VSUID | VSGID);
 239  239  
                   /*
                    * Caller context for the VOP calls below.  With CC_DONTBLOCK set,
                    * a conflict is expected to come back as EAGAIN together with
                    * CC_WOULDBLOCK in cc_flags; that case is handled after the
                    * setattr.
                    */
 240  240          ct.cc_sysid = 0;
 241  241          ct.cc_pid = 0;
 242  242          ct.cc_caller_id = nfs2_srv_caller_id;
 243  243          ct.cc_flags = CC_DONTBLOCK;
 244  244  
 245  245          /*
 246  246           * We need to specially handle size changes because it is
 247  247           * possible for the client to create a file with modes
 248  248           * which indicate read-only, but with the file opened for
 249  249           * writing.  If the client then tries to set the size of
 250  250           * the file, then the normal access checking done in
 251  251           * VOP_SETATTR would prevent the client from doing so,
 252  252           * although it should be legal for it to do so.  To get
 253  253           * around this, we do the access checking for ourselves
 254  254           * and then use VOP_SPACE which doesn't do the access
 255  255           * checking which VOP_SETATTR does. VOP_SPACE can only
 256  256           * operate on VREG files, let VOP_SETATTR handle the other
 257  257           * extremely rare cases.
 258  258           * Also the client should not be allowed to change the
 259  259           * size of the file if there is a conflicting non-blocking
 260  260           * mandatory lock in the region of change.
 261  261           */
 262  262          if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
 263  263                  if (nbl_need_check(vp)) {
 264  264                          nbl_start_crit(vp, RW_READER);
 265  265                          in_crit = 1;
 266  266                  }
 267  267  
                           /* Only the owner uid and the current size are needed. */
 268  268                  bva.va_mask = AT_UID | AT_SIZE;
 269  269  
 270  270                  error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
 271  271  
 272  272                  if (error) {
 273  273                          if (in_crit)
 274  274                                  nbl_end_crit(vp);
 275  275                          VN_RELE(vp);
 276  276                          ns->ns_status = puterrno(error);
 277  277                          return;
 278  278                  }
 279  279  
                           /*
                            * The region checked for a lock conflict is the span between
                            * the smaller and the larger of the old and new sizes.
                            */
 280  280                  if (in_crit) {
 281  281                          u_offset_t offset;
 282  282                          ssize_t length;
 283  283  
 284  284                          if (va.va_size < bva.va_size) {
 285  285                                  offset = va.va_size;
 286  286                                  length = bva.va_size - va.va_size;
 287  287                          } else {
 288  288                                  offset = bva.va_size;
 289  289                                  length = va.va_size - bva.va_size;
 290  290                          }
 291  291                          if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
 292  292                              NULL)) {
 293  293                                  error = EACCES;
 294  294                          }
 295  295                  }
 296  296  
                           /*
                            * Only the file owner takes the VOP_SPACE path, and only when
                            * the size really changes.  In that case AT_SIZE is removed
                            * from va_mask; otherwise it stays set and the size change is
                            * left to VOP_SETATTR below.
                            */
 297  297                  if (crgetuid(cr) == bva.va_uid && !error &&
 298  298                      va.va_size != bva.va_size) {
 299  299                          va.va_mask &= ~AT_SIZE;
 300  300                          bf.l_type = F_WRLCK;
 301  301                          bf.l_whence = 0;
 302  302                          bf.l_start = (off64_t)va.va_size;
 303  303                          bf.l_len = 0;
 304  304                          bf.l_sysid = 0;
 305  305                          bf.l_pid = 0;
 306  306  
 307  307                          error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
 308  308                              (offset_t)va.va_size, cr, &ct);
 309  309                  }
 310  310                  if (in_crit)
 311  311                          nbl_end_crit(vp);
 312  312          } else
 313  313                  error = 0;
 314  314  
 315  315          /*
 316  316           * Do the setattr.
                    * Skipped when an earlier step failed or when nothing is left in
                    * va_mask to change.
 317  317           */
 318  318          if (!error && va.va_mask) {
 319  319                  error = VOP_SETATTR(vp, &va, flag, cr, &ct);
 320  320          }
 321  321  
 322  322          /*
 323  323           * check if the monitor on either vop_space or vop_setattr detected
 324  324           * a delegation conflict and if so, mark the thread flag as
 325  325           * wouldblock so that the response is dropped and the client will
 326  326           * try again.
                    *
                    * ns->ns_status is not set on this path.
 327  327           */
 328  328          if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
 329  329                  VN_RELE(vp);
 330  330                  curthread->t_flag |= T_WOULDBLOCK;
 331  331                  return;
 332  332          }
 333  333  
                   /* Re-read the attributes so the reply carries the new values. */
 334  334          if (!error) {
 335  335                  va.va_mask = AT_ALL;    /* get everything */
 336  336  
 337  337                  error = rfs4_delegated_getattr(vp, &va, 0, cr);
 338  338  
 339  339                  /* check for overflows */
 340  340                  if (!error) {
 341  341                          acl_perm(vp, exi, &va, cr);
 342  342                          error = vattr_to_nattr(&va, &ns->ns_attr);
 343  343                  }
 344  344          }
 345  345  
                   /* Clear all context flags (including CC_DONTBLOCK) for the fsync. */
 346  346          ct.cc_flags = 0;
 347  347  
 348  348          /*
 349  349           * Force modified metadata out to stable storage.
                    * The result of the fsync is deliberately ignored, and it is
                    * issued even when an error was recorded above.
 350  350           */
 351  351          (void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);
 352  352  
 353  353          VN_RELE(vp);
 354  354  
 355  355          ns->ns_status = puterrno(error);
 356  356  }
 357  357  void *
 358  358  rfs_setattr_getfh(struct nfssaargs *args)
 359  359  {
                   /* Return the address of the file handle embedded in the arguments. */
 360  360          return (&args->saa_fh);
 361  361  }
 362  362  
 363  363  /*
            * Try to cross from *vpp into the filesystem mounted on it.
            *
            * Change and release @exip and @vpp only on success, i.e. only when the
            * vnode reached by traverse() belongs to an export that has EX_NOHIDE
            * set.  In that case the old *exip is released with exi_rele() and the
            * old *vpp with VN_RELE(), and both are replaced by the new export and
            * vnode.  When the target is not exported, or "nohide" is not set,
            * nothing is changed and 0 is still returned.
            *
            * Returns 0, or the error from traverse() or VOP_FID(); on error the
            * arguments are left untouched.
            */
 364  364  int
 365  365  rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
 366  366  {
 367  367          struct exportinfo *exi;
 368  368          vnode_t *vp = *vpp;
 369  369          fid_t fid;
 370  370          int error;
 371  371  
                   /*
                    * Work on a private hold so that the hold the caller has on *vpp
                    * is only given up once the crossing has succeeded.
                    */
 372  372          VN_HOLD(vp);
 373  373  
                   /*
                    * traverse() may replace vp.  The code below treats the result
                    * as held: it is released on every path that does not hand it
                    * to the caller.
                    */
 374  374          if ((error = traverse(&vp)) != 0) {
 375  375                  VN_RELE(vp);
 376  376                  return (error);
 377  377          }
 378  378  
                   /* Build the fid needed to look up the export of the new vnode. */
 379  379          bzero(&fid, sizeof (fid));
 380  380          fid.fid_len = MAXFIDSZ;
 381  381          error = VOP_FID(vp, &fid, NULL);
 382  382          if (error) {
 383  383                  VN_RELE(vp);
 384  384                  return (error);
 385  385          }
 386  386  
                   /* An export returned here is released again if it is not used. */
 387  387          exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);
 388  388          if (exi == NULL ||
 389  389              (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
 390  390                  /*
 391  391                   * It is not an error: the subdir is just not exported
 392  392                   * or "nohide" is not set
 393  393                   */
 394  394                  if (exi != NULL)
 395  395                          exi_rele(exi);
 396  396                  VN_RELE(vp);
 397  397          } else {
 398  398                  /* go to submount */
 399  399                  exi_rele(*exip);
 400  400                  *exip = exi;
 401  401  
 402  402                  VN_RELE(*vpp);
 403  403                  *vpp = vp;
 404  404          }
 405  405  
 406  406          return (0);
 407  407  }
 408  408  
 409  409  /*
 410  410   * Given a mounted "dvp" and its "exi", go up to the covering mountpoint
 411  411   * and correct dvp/exi accordingly.
 412  412   * Returns 0 on success
            *
            * On success the old *exip is released with exi_rele() and the old
            * *dvpp with VN_RELE(), and both are replaced by the export and vnode
            * found above the mount.  If no export is found for that vnode, -1 is
            * returned and *dvpp / *exip are left unchanged.
            *
            * cr is passed on to nfs_vptoexi().
 413  413   */
 414  414  int
 415  415  rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
 416  416  {
 417  417          struct exportinfo *exi;
 418  418          vnode_t *dvp = *dvpp;
 419  419          vnode_t *zone_rootvp;
 420  420  
                   /* Root vnode recorded for the zone this export belongs to. */
 421  421          zone_rootvp = (*exip)->exi_ne->exi_root->exi_vp;
 422  422          ASSERT((dvp->v_flag & VROOT) || VN_CMP(zone_rootvp, dvp));
 423  423  
                   /*
                    * NOTE(review): untraverse() presumably consumes the hold taken
                    * here and returns a held vnode -- confirm; the result is
                    * released below when no export is found.
                    */
 424  424          VN_HOLD(dvp);
 425  425          dvp = untraverse(dvp, zone_rootvp);
 426  426          exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
 427  427          if (exi == NULL) {
 428  428                  VN_RELE(dvp);
 429  429                  return (-1);
 430  430          }
 431  431  
                   /* The export found above the mount must be in the same zone. */
 432  432          ASSERT3U(exi->exi_zoneid, ==, (*exip)->exi_zoneid);
 433  433          exi_rele(*exip);
 434  434          *exip = exi;
 435  435          VN_RELE(*dvpp);
 436  436          *dvpp = dvp;
 437  437  
 438  438          return (0);
 439  439  }
 440  440  /*
 441  441   * Directory lookup.
 442  442   * Returns an fhandle and file attributes for file name in a directory.
            *
            *   da  - request: da_fhandle is the directory handle, da_name the name
            *   dr  - reply: dr_status is always set; dr_attr and dr_fhandle are
            *         filled in on success
            *   exi - export of the directory
            *   req - RPC request; rq_vers and rq_xprt (caller address) are used
            *   cr  - credentials of the caller
            *   ro  - unused
            *
            * A request using the public file handle is evaluated as a
            * multi-component path (rfs_publicfh_mclookup); any other request is a
            * single-component VOP_LOOKUP.  A result that is a mount point is
            * passed to rfs_cross_mnt().
 443  443   */
 444  444  /* ARGSUSED */
 445  445  void
 446  446  rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
 447  447      struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 448  448  {
 449  449          int error;
 450  450          vnode_t *dvp;
 451  451          vnode_t *vp;
 452  452          struct vattr va;
 453  453          fhandle_t *fhp = da->da_fhandle;
 454  454          struct sec_ol sec = {0, 0};
 455  455          bool_t publicfh_flag = FALSE, auth_weak = FALSE;
 456  456          char *name;
 457  457          struct sockaddr *ca;
 458  458  
 459  459          /*
 460  460           * Trusted Extension doesn't support NFSv2. MOUNT
 461  461           * will reject v2 clients. Need to prevent v2 client
 462  462           * access via WebNFS here.
 463  463           */
 464  464          if (is_system_labeled() && req->rq_vers == 2) {
 465  465                  dr->dr_status = NFSERR_ACCES;
 466  466                  return;
 467  467          }
 468  468  
 469  469          /*
 470  470           * Disallow NULL paths
                    * (a NULL pointer as well as an empty string)
 471  471           */
 472  472          if (da->da_name == NULL || *da->da_name == '\0') {
 473  473                  dr->dr_status = NFSERR_ACCES;
 474  474                  return;
 475  475          }
 476  476  
 477  477          /*
 478  478           * Allow lookups from the root - the default
 479  479           * location of the public filehandle.
 480  480           */
 481  481          if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
 482  482                  dvp = ZONE_ROOTVP();
 483  483                  VN_HOLD(dvp);
 484  484          } else {
 485  485                  dvp = nfs_fhtovp(fhp, exi);
 486  486                  if (dvp == NULL) {
 487  487                          dr->dr_status = NFSERR_STALE;
 488  488                          return;
 489  489                  }
 490  490          }
 491  491  
                   /*
                    * Take a private reference on the export; it may be swapped for
                    * another export below and is released at "out".
                    *
                    * NOTE(review): exi is tested against NULL a few lines above but
                    * is used unconditionally here -- confirm callers never pass a
                    * NULL exi.
                    */
 492  492          exi_hold(exi);
 493  493          ASSERT3U(exi->exi_zoneid, ==, curzone->zone_id);
 494  494  
 495  495          /*
 496  496           * Do not allow lookup beyond root.
 497  497           * If the filehandle matches a filehandle of the exi,
 498  498           * then the ".." refers beyond the root of an exported filesystem.
                    *
                    * NOTE(review): NFSERR_* status values are stored in "error" here
                    * and further down, and "error" is later passed through
                    * puterrno(); this relies on those status values being equal to
                    * the corresponding errno values -- confirm.
 499  499           */
 500  500          if (strcmp(da->da_name, "..") == 0 &&
 501  501              EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
 502  502                  if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
 503  503                      ((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp))) {
 504  504                          /*
 505  505                           * special case for ".." on a "nohide" exported root
 506  506                           */
 507  507                          if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
 508  508                                  error = NFSERR_ACCES;
 509  509                                  goto out;
 510  510                          }
 511  511                  } else  {
 512  512                          error = NFSERR_NOENT;
 513  513                          goto out;
 514  514                  }
 515  515          }
 516  516  
                   /*
                    * Convert the incoming name for this client.  The result is
                    * either da->da_name itself or a separate buffer, which is freed
                    * with size MAXPATHLEN once the lookup is done.
                    */
 517  517          ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
 518  518          name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
 519  519              MAXPATHLEN);
 520  520  
 521  521          if (name == NULL) {
 522  522                  error = NFSERR_ACCES;
 523  523                  goto out;
 524  524          }
 525  525  
 526  526          /*
 527  527           * If the public filehandle is used then allow
 528  528           * a multi-component lookup, i.e. evaluate
 529  529           * a pathname and follow symbolic links if
 530  530           * necessary.
 531  531           *
 532  532           * This may result in a vnode in another filesystem
 533  533           * which is OK as long as the filesystem is exported.
 534  534           */
 535  535          if (PUBLIC_FH2(fhp)) {
 536  536                  publicfh_flag = TRUE;
 537  537  
                           /*
                            * Give up our export reference; the multi-component lookup
                            * stores the export of the result in exi (it may stay NULL,
                            * which "out" copes with).
                            */
 538  538                  exi_rele(exi);
 539  539                  exi = NULL;
 540  540  
 541  541                  error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
 542  542                      &sec);
 543  543          } else {
 544  544                  /*
 545  545                   * Do a normal single component lookup.
 546  546                   */
 547  547                  error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
 548  548                      NULL, NULL, NULL);
 549  549          }
 550  550  
 551  551          if (name != da->da_name)
 552  552                  kmem_free(name, MAXPATHLEN);
 553  553  
                   /*
                    * A mount point is crossed only if rfs_cross_mnt() decides so; if
                    * that call fails, vp is dropped here because the common release
                    * below only runs when there is no error.
                    */
 554  554          if (error == 0 && vn_ismntpt(vp)) {
 555  555                  error = rfs_cross_mnt(&vp, &exi);
 556  556                  if (error)
 557  557                          VN_RELE(vp);
 558  558          }
 559  559  
 560  560          if (!error) {
 561  561                  va.va_mask = AT_ALL;    /* we want everything */
 562  562  
 563  563                  error = rfs4_delegated_getattr(vp, &va, 0, cr);
 564  564  
 565  565                  /* check for overflows */
 566  566                  if (!error) {
 567  567                          acl_perm(vp, exi, &va, cr);
 568  568                          error = vattr_to_nattr(&va, &dr->dr_attr);
 569  569                          if (!error) {
                                           /*
                                            * When the lookup flagged SEC_QUERY the handle
                                            * is built with makefh_ol(); otherwise a normal
                                            * handle is built and, for public-fh lookups,
                                            * the client's security flavor is checked.
                                            */
 570  570                                  if (sec.sec_flags & SEC_QUERY)
 571  571                                          error = makefh_ol(&dr->dr_fhandle, exi,
 572  572                                              sec.sec_index);
 573  573                                  else {
 574  574                                          error = makefh(&dr->dr_fhandle, vp,
 575  575                                              exi);
 576  576                                          if (!error && publicfh_flag &&
 577  577                                              !chk_clnt_sec(exi, req))
 578  578                                                  auth_weak = TRUE;
 579  579                                  }
 580  580                          }
 581  581                  }
 582  582                  VN_RELE(vp);
 583  583          }
 584  584  
            /* Common exit: drop the directory vnode and our export reference. */
 585  585  out:
 586  586          VN_RELE(dvp);
 587  587  
 588  588          if (exi != NULL)
 589  589                  exi_rele(exi);
 590  590  
 591  591          /*
 592  592           * If it's public fh, no 0x81, and client's flavor is
 593  593           * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
 594  594           * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
 595  595           */
 596  596          if (auth_weak)
 597  597                  dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
 598  598          else
 599  599                  dr->dr_status = puterrno(error);
 600  600  }
 601  601  void *
 602  602  rfs_lookup_getfh(struct nfsdiropargs *da)
 603  603  {
                   /* Return the directory file handle carried in the arguments. */
 604  604          return (da->da_fhandle);
 605  605  }
 606  606  
 607  607  /*
 608  608   * Read symbolic link.
 609  609   * Returns the string in the symbolic link at the given fhandle.
            *
            *   fhp - file handle of the symbolic link
            *   rl  - reply: rl_status is always set; rl_data / rl_count carry the
            *         link text.  rl_data is either NULL (early failures) or a
            *         buffer that rfs_rlfree() releases later.
            *   exi - export of the file
            *   req - RPC request; rq_xprt supplies the caller address
            *   cr  - credentials of the caller
            *   ro  - unused
            *
            * For an NFS referral (vn_is_nfs_reparse) the text comes from
            * build_symlink() instead of VOP_READLINK.
 610  610   */
 611  611  /* ARGSUSED */
 612  612  void
 613  613  rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
 614  614      struct svc_req *req, cred_t *cr, bool_t ro)
 615  615  {
 616  616          int error;
 617  617          struct iovec iov;
 618  618          struct uio uio;
 619  619          vnode_t *vp;
 620  620          struct vattr va;
 621  621          struct sockaddr *ca;
 622  622          char *name = NULL;
 623  623          int is_referral = 0;
 624  624  
                   /*
                    * Every early return below sets rl_data to NULL so that
                    * rfs_rlfree() has nothing to release.
                    */
 625  625          vp = nfs_fhtovp(fhp, exi);
 626  626          if (vp == NULL) {
 627  627                  rl->rl_data = NULL;
 628  628                  rl->rl_status = NFSERR_STALE;
 629  629                  return;
 630  630          }
 631  631  
                   /* Only the mode is needed, for the mandatory-lock test below. */
 632  632          va.va_mask = AT_MODE;
 633  633  
 634  634          error = VOP_GETATTR(vp, &va, 0, cr, NULL);
 635  635  
 636  636          if (error) {
 637  637                  VN_RELE(vp);
 638  638                  rl->rl_data = NULL;
 639  639                  rl->rl_status = puterrno(error);
 640  640                  return;
 641  641          }
 642  642  
 643  643          if (MANDLOCK(vp, va.va_mode)) {
 644  644                  VN_RELE(vp);
 645  645                  rl->rl_data = NULL;
 646  646                  rl->rl_status = NFSERR_ACCES;
 647  647                  return;
 648  648          }
 649  649  
 650  650          /* We lied about the object type for a referral */
 651  651          if (vn_is_nfs_reparse(vp, cr))
 652  652                  is_referral = 1;
 653  653  
 654  654          /*
 655  655           * XNFS and RFC1094 require us to return ENXIO if argument
 656  656           * is not a link. BUGID 1138002.
 657  657           */
 658  658          if (vp->v_type != VLNK && !is_referral) {
 659  659                  VN_RELE(vp);
 660  660                  rl->rl_data = NULL;
 661  661                  rl->rl_status = NFSERR_NXIO;
 662  662                  return;

↓ open down ↓

662 lines elided

↑ open up ↑

 663  663          }
 664  664  
 665  665          /*
 666  666           * Allocate data for pathname.  This will be freed by rfs_rlfree.
 667  667           */
 668  668          rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
 669  669  
 670  670          if (is_referral) {
 671  671                  char *s;
 672  672                  size_t strsz;
                           /*
                            * NFSv2 server statistics reached through the export
                            * (exi->exi_ne->ne_globals), indexed by NFS_VERSION.
                            */
      673 +                kstat_named_t *stat =
      674 +                    exi->exi_ne->ne_globals->svstat[NFS_VERSION];
 673  675  
 674  676                  /* Get an artificial symlink based on a referral */
 675  677                  s = build_symlink(vp, cr, &strsz);
                           /*
                            * The referral counter and the probe fire before s is
                            * checked, i.e. also when build_symlink() returned NULL.
                            */
 676      -                global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
      678 +                stat[NFS_REFERLINKS].value.ui64++;
 677  679                  DTRACE_PROBE2(nfs2serv__func__referral__reflink,
 678  680                      vnode_t *, vp, char *, s);
 679  681                  if (s == NULL)
 680  682                          error = EINVAL;
 681  683                  else {
                                   /*
                                    * strsz is the size build_symlink() reported for
                                    * s; it is used both to bound rl_count and as the
                                    * size when freeing s.
                                    */
 682  684                          error = 0;
 683  685                          (void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
 684  686                          rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
 685  687                          kmem_free(s, strsz);
 686  688                  }

 687  689  
 688  690          } else {
 689  691  
 690  692                  /*
 691  693                   * Set up io vector to read sym link data
 692  694                   */
 693  695                  iov.iov_base = rl->rl_data;
 694  696                  iov.iov_len = NFS_MAXPATHLEN;
 695  697                  uio.uio_iov = &iov;
 696  698                  uio.uio_iovcnt = 1;
 697  699                  uio.uio_segflg = UIO_SYSSPACE;
 698  700                  uio.uio_extflg = UIO_COPY_CACHED;
 699  701                  uio.uio_loffset = (offset_t)0;
 700  702                  uio.uio_resid = NFS_MAXPATHLEN;
 701  703  
 702  704                  /*
 703  705                   * Do the readlink.
 704  706                   */
 705  707                  error = VOP_READLINK(vp, &uio, cr, NULL);
 706  708  
                           /* Number of bytes the readlink stored in rl_data. */
 707  709                  rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
 708  710  
                           /*
                            * NOTE(review): the buffer is NFS_MAXPATHLEN bytes and the
                            * read may use all of them (uio_resid starts at
                            * NFS_MAXPATHLEN).  If uio_resid reaches 0, rl_count equals
                            * NFS_MAXPATHLEN and the terminator below is written one
                            * byte past the allocation -- confirm whether a link of
                            * that length is possible.
                            */
 709  711                  if (!error)
 710  712                          rl->rl_data[rl->rl_count] = '\0';
 711  713  
 712  714          }
 713  715  
 714  716  
 715  717          VN_RELE(vp);
 716  718  
                   /*
                    * Convert the link text for this client.  If a different buffer
                    * comes back, it replaces rl_data and the original one is freed.
                    *
                    * NOTE(review): this runs even when error is set, in which case
                    * this function has not filled in or terminated rl_data --
                    * confirm nfscmd_convname() tolerates that.  Also note the mix of
                    * MAXPATHLEN here and NFS_MAXPATHLEN for the allocation and the
                    * later free in rfs_rlfree(); presumably both are equal -- confirm.
                    */
 717  719          ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
 718  720          name = nfscmd_convname(ca, exi, rl->rl_data,
 719  721              NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
 720  722  
 721  723          if (name != NULL && name != rl->rl_data) {
 722  724                  kmem_free(rl->rl_data, NFS_MAXPATHLEN);
 723  725                  rl->rl_data = name;
 724  726          }
 725  727  
 726  728          /*
 727  729           * XNFS and RFC1094 require us to return ENXIO if argument
 728  730           * is not a link. UFS returns EINVAL if this is the case,
 729  731           * so we do the mapping here. BUGID 1138002.
 730  732           */
 731  733          if (error == EINVAL)
 732  734                  rl->rl_status = NFSERR_NXIO;
 733  735          else
 734  736                  rl->rl_status = puterrno(error);
 735  737  
 736  738  }
 737  739  void *
 738  740  rfs_readlink_getfh(fhandle_t *fhp)
 739  741  {
                   /*
                    * The READLINK argument is the file handle itself, so it is
                    * returned unchanged (as a void pointer).
                    */
 740  742          return (fhp);
 741  743  }
 742  744  /*
 743  745   * Free data allocated by rfs_readlink
            *
            * rl_data is NULL when rfs_readlink() failed before allocating, in
            * which case nothing is freed.  The buffer is always freed with size
            * NFS_MAXPATHLEN.
            * NOTE(review): rfs_readlink() may have replaced rl_data with a buffer
            * returned by nfscmd_convname(..., MAXPATHLEN); this free is only
            * correct if that buffer is NFS_MAXPATHLEN bytes as well -- confirm.
 744  746   */
 745  747  void
 746  748  rfs_rlfree(struct nfsrdlnres *rl)
 747  749  {
 748  750          if (rl->rl_data != NULL)
 749  751                  kmem_free(rl->rl_data, NFS_MAXPATHLEN);
 750  752  }
 751  753  
 752  754  static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
 753  755  
 754  756  /*
 755  757   * Read data.
 756  758   * Returns some data read from the file at the given fhandle.
 757  759   */
 758  760  /* ARGSUSED */
 759  761  void
 760  762  rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
 761  763      struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 762  764  {
 763  765          vnode_t *vp;
 764  766          int error;
 765  767          struct vattr va;
 766  768          struct iovec iov;
 767  769          struct uio uio;
 768  770          mblk_t *mp;
 769  771          int alloc_err = 0;
 770  772          int in_crit = 0;
 771  773          caller_context_t ct;
 772  774  
 773  775          vp = nfs_fhtovp(&ra->ra_fhandle, exi);
 774  776          if (vp == NULL) {
 775  777                  rr->rr_data = NULL;
 776  778                  rr->rr_status = NFSERR_STALE;
 777  779                  return;
 778  780          }
 779  781  
 780  782          if (vp->v_type != VREG) {
 781  783                  VN_RELE(vp);
 782  784                  rr->rr_data = NULL;
 783  785                  rr->rr_status = NFSERR_ISDIR;
 784  786                  return;
 785  787          }
 786  788  
 787  789          ct.cc_sysid = 0;
 788  790          ct.cc_pid = 0;
 789  791          ct.cc_caller_id = nfs2_srv_caller_id;
 790  792          ct.cc_flags = CC_DONTBLOCK;
 791  793  
 792  794          /*
 793  795           * Enter the critical region before calling VOP_RWLOCK
 794  796           * to avoid a deadlock with write requests.
 795  797           */
 796  798          if (nbl_need_check(vp)) {
 797  799                  nbl_start_crit(vp, RW_READER);
 798  800                  if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
 799  801                      0, NULL)) {
 800  802                          nbl_end_crit(vp);
 801  803                          VN_RELE(vp);
 802  804                          rr->rr_data = NULL;
 803  805                          rr->rr_status = NFSERR_ACCES;
 804  806                          return;
 805  807                  }
 806  808                  in_crit = 1;
 807  809          }
 808  810  
 809  811          error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
 810  812  
 811  813          /* check if a monitor detected a delegation conflict */
 812  814          if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
 813  815                  if (in_crit)
 814  816                          nbl_end_crit(vp);
 815  817                  VN_RELE(vp);
 816  818                  /* mark as wouldblock so response is dropped */
 817  819                  curthread->t_flag |= T_WOULDBLOCK;
 818  820  
 819  821                  rr->rr_data = NULL;
 820  822                  return;
 821  823          }
 822  824  
 823  825          va.va_mask = AT_ALL;
 824  826  
 825  827          error = VOP_GETATTR(vp, &va, 0, cr, &ct);
 826  828  
 827  829          if (error) {
 828  830                  VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 829  831                  if (in_crit)
 830  832                          nbl_end_crit(vp);
 831  833  
 832  834                  VN_RELE(vp);
 833  835                  rr->rr_data = NULL;
 834  836                  rr->rr_status = puterrno(error);
 835  837  
 836  838                  return;
 837  839          }
 838  840  
 839  841          /*
 840  842           * This is a kludge to allow reading of files created
 841  843           * with no read permission.  The owner of the file
 842  844           * is always allowed to read it.
 843  845           */
 844  846          if (crgetuid(cr) != va.va_uid) {
 845  847                  error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);
 846  848  
 847  849                  if (error) {
 848  850                          /*
 849  851                           * Exec is the same as read over the net because
 850  852                           * of demand loading.
 851  853                           */
 852  854                          error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
 853  855                  }
 854  856                  if (error) {
 855  857                          VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 856  858                          if (in_crit)
 857  859                                  nbl_end_crit(vp);
 858  860                          VN_RELE(vp);
 859  861                          rr->rr_data = NULL;
 860  862                          rr->rr_status = puterrno(error);
 861  863  
 862  864                          return;
 863  865                  }
 864  866          }
 865  867  
 866  868          if (MANDLOCK(vp, va.va_mode)) {
 867  869                  VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 868  870                  if (in_crit)
 869  871                          nbl_end_crit(vp);
 870  872  
 871  873                  VN_RELE(vp);
 872  874                  rr->rr_data = NULL;
 873  875                  rr->rr_status = NFSERR_ACCES;
 874  876  
 875  877                  return;
 876  878          }
 877  879  
 878  880          rr->rr_ok.rrok_wlist_len = 0;
 879  881          rr->rr_ok.rrok_wlist = NULL;
 880  882  
 881  883          if ((u_offset_t)ra->ra_offset >= va.va_size) {
 882  884                  rr->rr_count = 0;
 883  885                  rr->rr_data = NULL;
 884  886                  /*
 885  887                   * In this case, status is NFS_OK, but there is no data
 886  888                   * to encode. So set rr_mp to NULL.
 887  889                   */
 888  890                  rr->rr_mp = NULL;
 889  891                  rr->rr_ok.rrok_wlist = ra->ra_wlist;
 890  892                  if (rr->rr_ok.rrok_wlist)
 891  893                          clist_zero_len(rr->rr_ok.rrok_wlist);
 892  894                  goto done;
 893  895          }
 894  896  
 895  897          if (ra->ra_wlist) {
 896  898                  mp = NULL;
 897  899                  rr->rr_mp = NULL;
 898  900                  (void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
 899  901                  if (ra->ra_count > iov.iov_len) {
 900  902                          rr->rr_data = NULL;
 901  903                          rr->rr_status = NFSERR_INVAL;
 902  904                          goto done;
 903  905                  }
 904  906          } else {
 905  907                  /*
 906  908                   * mp will contain the data to be sent out in the read reply.
 907  909                   * This will be freed after the reply has been sent out (by the
 908  910                   * driver).
 909  911                   * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
 910  912                   * that the call to xdrmblk_putmblk() never fails.
 911  913                   */
 912  914                  mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
 913  915                      &alloc_err);
 914  916                  ASSERT(mp != NULL);
 915  917                  ASSERT(alloc_err == 0);
 916  918  
 917  919                  rr->rr_mp = mp;
 918  920  
 919  921                  /*
 920  922                   * Set up io vector
 921  923                   */
 922  924                  iov.iov_base = (caddr_t)mp->b_datap->db_base;
 923  925                  iov.iov_len = ra->ra_count;
 924  926          }
 925  927  
 926  928          uio.uio_iov = &iov;
 927  929          uio.uio_iovcnt = 1;
 928  930          uio.uio_segflg = UIO_SYSSPACE;
 929  931          uio.uio_extflg = UIO_COPY_CACHED;
 930  932          uio.uio_loffset = (offset_t)ra->ra_offset;
 931  933          uio.uio_resid = ra->ra_count;
 932  934  
 933  935          error = VOP_READ(vp, &uio, 0, cr, &ct);
 934  936  
 935  937          if (error) {
 936  938                  if (mp)
 937  939                          freeb(mp);
 938  940  
 939  941                  /*
 940  942                   * check if a monitor detected a delegation conflict and
 941  943                   * mark as wouldblock so response is dropped
 942  944                   */
 943  945                  if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
 944  946                          curthread->t_flag |= T_WOULDBLOCK;
 945  947                  else
 946  948                          rr->rr_status = puterrno(error);
 947  949  
 948  950                  VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 949  951                  if (in_crit)
 950  952                          nbl_end_crit(vp);
 951  953  
 952  954                  VN_RELE(vp);
 953  955                  rr->rr_data = NULL;
 954  956  
 955  957                  return;
 956  958          }
 957  959  
 958  960          /*
 959  961           * Get attributes again so we can send the latest access
 960  962           * time to the client side for its cache.
 961  963           */
 962  964          va.va_mask = AT_ALL;
 963  965  
 964  966          error = VOP_GETATTR(vp, &va, 0, cr, &ct);
 965  967  
 966  968          if (error) {
 967  969                  if (mp)
 968  970                          freeb(mp);
 969  971  
 970  972                  VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 971  973                  if (in_crit)
 972  974                          nbl_end_crit(vp);
 973  975  
 974  976                  VN_RELE(vp);
 975  977                  rr->rr_data = NULL;
 976  978                  rr->rr_status = puterrno(error);
 977  979  
 978  980                  return;
 979  981          }
 980  982  
 981  983          rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);
 982  984  
 983  985          if (mp) {
 984  986                  rr->rr_data = (char *)mp->b_datap->db_base;
 985  987          } else {
 986  988                  if (ra->ra_wlist) {
 987  989                          rr->rr_data = (caddr_t)iov.iov_base;
 988  990                          if (!rdma_setup_read_data2(ra, rr)) {
 989  991                                  rr->rr_data = NULL;
 990  992                                  rr->rr_status = puterrno(NFSERR_INVAL);
 991  993                          }
 992  994                  }
 993  995          }
 994  996  done:
 995  997          VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
 996  998          if (in_crit)
 997  999                  nbl_end_crit(vp);
 998 1000  
 999 1001          acl_perm(vp, exi, &va, cr);
1000 1002  
1001 1003          /* check for overflows */
1002 1004          error = vattr_to_nattr(&va, &rr->rr_attr);
1003 1005  
1004 1006          VN_RELE(vp);
1005 1007  
1006 1008          rr->rr_status = puterrno(error);
1007 1009  }
1008 1010  
1009 1011  /*
1010 1012   * Free data allocated by rfs_read
1011 1013   */
1012 1014  void
1013 1015  rfs_rdfree(struct nfsrdresult *rr)
1014 1016  {
1015 1017          mblk_t *mp;
1016 1018  
1017 1019          if (rr->rr_status == NFS_OK) {
1018 1020                  mp = rr->rr_mp;
1019 1021                  if (mp != NULL)
1020 1022                          freeb(mp);
1021 1023          }
1022 1024  }
1023 1025  
1024 1026  void *
1025 1027  rfs_read_getfh(struct nfsreadargs *ra)
1026 1028  {
1027 1029          return (&ra->ra_fhandle);
1028 1030  }
1029 1031  
1030 1032  #define MAX_IOVECS      12
1031 1033  
1032 1034  #ifdef DEBUG
1033 1035  static int rfs_write_sync_hits = 0;
1034 1036  static int rfs_write_sync_misses = 0;
1035 1037  #endif
1036 1038  
1037 1039  /*
1038 1040   * Write data to file.
1039 1041   * Returns attributes of a file after writing some data to it.
1040 1042   *
1041 1043   * Any changes made here, especially in error handling might have
1042 1044   * to also be done in rfs_write (which clusters write requests).
1043 1045   */
1044 1046  /* ARGSUSED */
1045 1047  void
1046 1048  rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
1047 1049      struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1048 1050  {
1049 1051          int error;
1050 1052          vnode_t *vp;
1051 1053          rlim64_t rlimit;
1052 1054          struct vattr va;
1053 1055          struct uio uio;
1054 1056          struct iovec iov[MAX_IOVECS];
1055 1057          mblk_t *m;
1056 1058          struct iovec *iovp;
1057 1059          int iovcnt;
1058 1060          cred_t *savecred;
1059 1061          int in_crit = 0;
1060 1062          caller_context_t ct;
1061 1063  
1062 1064          vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1063 1065          if (vp == NULL) {
1064 1066                  ns->ns_status = NFSERR_STALE;
1065 1067                  return;
1066 1068          }
1067 1069  
1068 1070          if (rdonly(ro, vp)) {
1069 1071                  VN_RELE(vp);
1070 1072                  ns->ns_status = NFSERR_ROFS;
1071 1073                  return;
1072 1074          }
1073 1075  
1074 1076          if (vp->v_type != VREG) {
1075 1077                  VN_RELE(vp);
1076 1078                  ns->ns_status = NFSERR_ISDIR;
1077 1079                  return;
1078 1080          }
1079 1081  
1080 1082          ct.cc_sysid = 0;
1081 1083          ct.cc_pid = 0;
1082 1084          ct.cc_caller_id = nfs2_srv_caller_id;
1083 1085          ct.cc_flags = CC_DONTBLOCK;
1084 1086  
1085 1087          va.va_mask = AT_UID|AT_MODE;
1086 1088  
1087 1089          error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1088 1090  
1089 1091          if (error) {
1090 1092                  VN_RELE(vp);
1091 1093                  ns->ns_status = puterrno(error);
1092 1094  
1093 1095                  return;
1094 1096          }
1095 1097  
1096 1098          if (crgetuid(cr) != va.va_uid) {
1097 1099                  /*
1098 1100                   * This is a kludge to allow writes of files created
1099 1101                   * with read only permission.  The owner of the file
1100 1102                   * is always allowed to write it.
1101 1103                   */
1102 1104                  error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
1103 1105  
1104 1106                  if (error) {
1105 1107                          VN_RELE(vp);
1106 1108                          ns->ns_status = puterrno(error);
1107 1109                          return;
1108 1110                  }
1109 1111          }
1110 1112  
1111 1113          /*
1112 1114           * Can't access a mandatory lock file.  This might cause
1113 1115           * the NFS service thread to block forever waiting for a
1114 1116           * lock to be released that will never be released.
1115 1117           */
1116 1118          if (MANDLOCK(vp, va.va_mode)) {
1117 1119                  VN_RELE(vp);
1118 1120                  ns->ns_status = NFSERR_ACCES;
1119 1121                  return;
1120 1122          }
1121 1123  
1122 1124          /*
1123 1125           * We have to enter the critical region before calling VOP_RWLOCK
1124 1126           * to avoid a deadlock with ufs.
1125 1127           */
1126 1128          if (nbl_need_check(vp)) {
1127 1129                  nbl_start_crit(vp, RW_READER);
1128 1130                  in_crit = 1;
1129 1131                  if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1130 1132                      wa->wa_count, 0, NULL)) {
1131 1133                          error = EACCES;
1132 1134                          goto out;
1133 1135                  }
1134 1136          }
1135 1137  
1136 1138          error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1137 1139  
1138 1140          /* check if a monitor detected a delegation conflict */
1139 1141          if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1140 1142                  goto out;
1141 1143          }
1142 1144  
1143 1145          if (wa->wa_data || wa->wa_rlist) {
1144 1146                  /* Do the RDMA thing if necessary */
1145 1147                  if (wa->wa_rlist) {
1146 1148                          iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1147 1149                          iov[0].iov_len = wa->wa_count;
1148 1150                  } else  {
1149 1151                          iov[0].iov_base = wa->wa_data;
1150 1152                          iov[0].iov_len = wa->wa_count;
1151 1153                  }
1152 1154                  uio.uio_iov = iov;
1153 1155                  uio.uio_iovcnt = 1;
1154 1156                  uio.uio_segflg = UIO_SYSSPACE;
1155 1157                  uio.uio_extflg = UIO_COPY_DEFAULT;
1156 1158                  uio.uio_loffset = (offset_t)wa->wa_offset;
1157 1159                  uio.uio_resid = wa->wa_count;
1158 1160                  /*
1159 1161                   * The limit is checked on the client. We
1160 1162                   * should allow any size writes here.
1161 1163                   */
1162 1164                  uio.uio_llimit = curproc->p_fsz_ctl;
1163 1165                  rlimit = uio.uio_llimit - wa->wa_offset;
1164 1166                  if (rlimit < (rlim64_t)uio.uio_resid)
1165 1167                          uio.uio_resid = (uint_t)rlimit;
1166 1168  
1167 1169                  /*
1168 1170                   * for now we assume no append mode
1169 1171                   */
1170 1172                  /*
1171 1173                   * We're changing creds because VM may fault and we need
1172 1174                   * the cred of the current thread to be used if quota
1173 1175                   * checking is enabled.
1174 1176                   */
1175 1177                  savecred = curthread->t_cred;
1176 1178                  curthread->t_cred = cr;
1177 1179                  error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1178 1180                  curthread->t_cred = savecred;
1179 1181          } else {
1180 1182  
1181 1183                  iovcnt = 0;
1182 1184                  for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1183 1185                          iovcnt++;
1184 1186                  if (iovcnt <= MAX_IOVECS) {
1185 1187  #ifdef DEBUG
1186 1188                          rfs_write_sync_hits++;
1187 1189  #endif
1188 1190                          iovp = iov;
1189 1191                  } else {
1190 1192  #ifdef DEBUG
1191 1193                          rfs_write_sync_misses++;
1192 1194  #endif
1193 1195                          iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1194 1196                  }
1195 1197                  mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1196 1198                  uio.uio_iov = iovp;
1197 1199                  uio.uio_iovcnt = iovcnt;
1198 1200                  uio.uio_segflg = UIO_SYSSPACE;
1199 1201                  uio.uio_extflg = UIO_COPY_DEFAULT;
1200 1202                  uio.uio_loffset = (offset_t)wa->wa_offset;
1201 1203                  uio.uio_resid = wa->wa_count;
1202 1204                  /*
1203 1205                   * The limit is checked on the client. We
1204 1206                   * should allow any size writes here.
1205 1207                   */
1206 1208                  uio.uio_llimit = curproc->p_fsz_ctl;
1207 1209                  rlimit = uio.uio_llimit - wa->wa_offset;
1208 1210                  if (rlimit < (rlim64_t)uio.uio_resid)
1209 1211                          uio.uio_resid = (uint_t)rlimit;
1210 1212  
1211 1213                  /*
1212 1214                   * For now we assume no append mode.
1213 1215                   */
1214 1216                  /*
1215 1217                   * We're changing creds because VM may fault and we need
1216 1218                   * the cred of the current thread to be used if quota
1217 1219                   * checking is enabled.
1218 1220                   */
1219 1221                  savecred = curthread->t_cred;
1220 1222                  curthread->t_cred = cr;
1221 1223                  error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1222 1224                  curthread->t_cred = savecred;
1223 1225  
1224 1226                  if (iovp != iov)
1225 1227                          kmem_free(iovp, sizeof (*iovp) * iovcnt);
1226 1228          }
1227 1229  
1228 1230          VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1229 1231  
1230 1232          if (!error) {
1231 1233                  /*
1232 1234                   * Get attributes again so we send the latest mod
1233 1235                   * time to the client side for its cache.
1234 1236                   */
1235 1237                  va.va_mask = AT_ALL;    /* now we want everything */
1236 1238  
1237 1239                  error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1238 1240  
1239 1241                  /* check for overflows */
1240 1242                  if (!error) {
1241 1243                          acl_perm(vp, exi, &va, cr);
1242 1244                          error = vattr_to_nattr(&va, &ns->ns_attr);
1243 1245                  }
1244 1246          }
1245 1247  
1246 1248  out:
1247 1249          if (in_crit)
1248 1250                  nbl_end_crit(vp);
1249 1251          VN_RELE(vp);
1250 1252  
1251 1253          /* check if a monitor detected a delegation conflict */
1252 1254          if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1253 1255                  /* mark as wouldblock so response is dropped */
1254 1256                  curthread->t_flag |= T_WOULDBLOCK;
1255 1257          else
1256 1258                  ns->ns_status = puterrno(error);
1257 1259  
1258 1260  }
1259 1261  
1260 1262  struct rfs_async_write {
1261 1263          struct nfswriteargs *wa;
1262 1264          struct nfsattrstat *ns;
1263 1265          struct svc_req *req;
1264 1266          cred_t *cr;
1265 1267          bool_t ro;
1266 1268          kthread_t *thread;
1267 1269          struct rfs_async_write *list;
1268 1270  };
1269 1271  
1270 1272  struct rfs_async_write_list {
1271 1273          fhandle_t *fhp;
1272 1274          kcondvar_t cv;
1273 1275          struct rfs_async_write *list;
1274 1276          struct rfs_async_write_list *next;
1275 1277  };
1276 1278  
1277 1279  static struct rfs_async_write_list *rfs_async_write_head = NULL;
1278 1280  static kmutex_t rfs_async_write_lock;
1279 1281  static int rfs_write_async = 1; /* enables write clustering if == 1 */
1280 1282  
1281 1283  #define MAXCLIOVECS     42
1282 1284  #define RFSWRITE_INITVAL (enum nfsstat) -1
1283 1285  
1284 1286  #ifdef DEBUG
1285 1287  static int rfs_write_hits = 0;
1286 1288  static int rfs_write_misses = 0;
1287 1289  #endif
1288 1290  
1289 1291  /*
1290 1292   * Write data to file.
1291 1293   * Returns attributes of a file after writing some data to it.
1292 1294   */
1293 1295  void
1294 1296  rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1295 1297      struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1296 1298  {
1297 1299          int error;
1298 1300          vnode_t *vp;
1299 1301          rlim64_t rlimit;
1300 1302          struct vattr va;
1301 1303          struct uio uio;
1302 1304          struct rfs_async_write_list *lp;
1303 1305          struct rfs_async_write_list *nlp;
1304 1306          struct rfs_async_write *rp;
1305 1307          struct rfs_async_write *nrp;
1306 1308          struct rfs_async_write *trp;
1307 1309          struct rfs_async_write *lrp;
1308 1310          int data_written;
1309 1311          int iovcnt;
1310 1312          mblk_t *m;
1311 1313          struct iovec *iovp;
1312 1314          struct iovec *niovp;
1313 1315          struct iovec iov[MAXCLIOVECS];
1314 1316          int count;
1315 1317          int rcount;
1316 1318          uint_t off;
1317 1319          uint_t len;
1318 1320          struct rfs_async_write nrpsp;
1319 1321          struct rfs_async_write_list nlpsp;
1320 1322          ushort_t t_flag;
1321 1323          cred_t *savecred;
1322 1324          int in_crit = 0;
1323 1325          caller_context_t ct;
1324 1326          nfs_srv_t *nsrv;
1325 1327  
1326 1328          ASSERT(exi == NULL || exi->exi_zoneid == curzone->zone_id);
1327 1329          nsrv = nfs_get_srv();
1328 1330          if (!nsrv->write_async) {
1329 1331                  rfs_write_sync(wa, ns, exi, req, cr, ro);
1330 1332                  return;
1331 1333          }
1332 1334  
1333 1335          /*
1334 1336           * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1335 1337           * is considered an OK.
1336 1338           */
1337 1339          ns->ns_status = RFSWRITE_INITVAL;
1338 1340  
1339 1341          nrp = &nrpsp;
1340 1342          nrp->wa = wa;
1341 1343          nrp->ns = ns;
1342 1344          nrp->req = req;
1343 1345          nrp->cr = cr;
1344 1346          nrp->ro = ro;
1345 1347          nrp->thread = curthread;
1346 1348  
1347 1349          ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1348 1350  
1349 1351          /*
1350 1352           * Look to see if there is already a cluster started
1351 1353           * for this file.
1352 1354           */
1353 1355          mutex_enter(&nsrv->async_write_lock);
1354 1356          for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
1355 1357                  if (bcmp(&wa->wa_fhandle, lp->fhp,
1356 1358                      sizeof (fhandle_t)) == 0)
1357 1359                          break;
1358 1360          }
1359 1361  
1360 1362          /*
1361 1363           * If lp is non-NULL, then there is already a cluster
1362 1364           * started.  We need to place ourselves in the cluster
1363 1365           * list in the right place as determined by starting
1364 1366           * offset.  Conflicts with non-blocking mandatory locked
1365 1367           * regions will be checked when the cluster is processed.
1366 1368           */
1367 1369          if (lp != NULL) {
1368 1370                  rp = lp->list;
1369 1371                  trp = NULL;
1370 1372                  while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1371 1373                          trp = rp;
1372 1374                          rp = rp->list;
1373 1375                  }
1374 1376                  nrp->list = rp;
1375 1377                  if (trp == NULL)
1376 1378                          lp->list = nrp;
1377 1379                  else
1378 1380                          trp->list = nrp;
1379 1381                  while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1380 1382                          cv_wait(&lp->cv, &nsrv->async_write_lock);
1381 1383                  mutex_exit(&nsrv->async_write_lock);
1382 1384  
1383 1385                  return;
1384 1386          }
1385 1387  
1386 1388          /*
1387 1389           * No cluster started yet, start one and add ourselves
1388 1390           * to the list of clusters.
1389 1391           */
1390 1392          nrp->list = NULL;
1391 1393  
1392 1394          nlp = &nlpsp;
1393 1395          nlp->fhp = &wa->wa_fhandle;
1394 1396          cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1395 1397          nlp->list = nrp;
1396 1398          nlp->next = NULL;
1397 1399  
1398 1400          if (nsrv->async_write_head == NULL) {
1399 1401                  nsrv->async_write_head = nlp;
1400 1402          } else {
1401 1403                  lp = nsrv->async_write_head;
1402 1404                  while (lp->next != NULL)
1403 1405                          lp = lp->next;
1404 1406                  lp->next = nlp;
1405 1407          }
1406 1408          mutex_exit(&nsrv->async_write_lock);
1407 1409  
1408 1410          /*
1409 1411           * Convert the file handle common to all of the requests
1410 1412           * in this cluster to a vnode.
1411 1413           */
1412 1414          vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1413 1415          if (vp == NULL) {
1414 1416                  mutex_enter(&nsrv->async_write_lock);
1415 1417                  if (nsrv->async_write_head == nlp)
1416 1418                          nsrv->async_write_head = nlp->next;
1417 1419                  else {
1418 1420                          lp = nsrv->async_write_head;
1419 1421                          while (lp->next != nlp)
1420 1422                                  lp = lp->next;
1421 1423                          lp->next = nlp->next;
1422 1424                  }
1423 1425                  t_flag = curthread->t_flag & T_WOULDBLOCK;
1424 1426                  for (rp = nlp->list; rp != NULL; rp = rp->list) {
1425 1427                          rp->ns->ns_status = NFSERR_STALE;
1426 1428                          rp->thread->t_flag |= t_flag;
1427 1429                  }
1428 1430                  cv_broadcast(&nlp->cv);
1429 1431                  mutex_exit(&nsrv->async_write_lock);
1430 1432  
1431 1433                  return;
1432 1434          }
1433 1435  
1434 1436          /*
1435 1437           * Can only write regular files.  Attempts to write any
1436 1438           * other file types fail with EISDIR.
1437 1439           */
1438 1440          if (vp->v_type != VREG) {
1439 1441                  VN_RELE(vp);
1440 1442                  mutex_enter(&nsrv->async_write_lock);
1441 1443                  if (nsrv->async_write_head == nlp)
1442 1444                          nsrv->async_write_head = nlp->next;
1443 1445                  else {
1444 1446                          lp = nsrv->async_write_head;
1445 1447                          while (lp->next != nlp)
1446 1448                                  lp = lp->next;
1447 1449                          lp->next = nlp->next;
1448 1450                  }
1449 1451                  t_flag = curthread->t_flag & T_WOULDBLOCK;
1450 1452                  for (rp = nlp->list; rp != NULL; rp = rp->list) {
1451 1453                          rp->ns->ns_status = NFSERR_ISDIR;
1452 1454                          rp->thread->t_flag |= t_flag;
1453 1455                  }
1454 1456                  cv_broadcast(&nlp->cv);
1455 1457                  mutex_exit(&nsrv->async_write_lock);
1456 1458  
1457 1459                  return;
1458 1460          }
1459 1461  
1460 1462          /*
1461 1463           * Enter the critical region before calling VOP_RWLOCK, to avoid a
1462 1464           * deadlock with ufs.
1463 1465           */
1464 1466          if (nbl_need_check(vp)) {
1465 1467                  nbl_start_crit(vp, RW_READER);
1466 1468                  in_crit = 1;
1467 1469          }
1468 1470  
1469 1471          ct.cc_sysid = 0;
1470 1472          ct.cc_pid = 0;
1471 1473          ct.cc_caller_id = nfs2_srv_caller_id;
1472 1474          ct.cc_flags = CC_DONTBLOCK;
1473 1475  
1474 1476          /*
1475 1477           * Lock the file for writing.  This operation provides
1476 1478           * the delay which allows clusters to grow.
1477 1479           */
1478 1480          error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1479 1481  
1480 1482          /* check if a monitor detected a delegation conflict */
1481 1483          if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1482 1484                  if (in_crit)
1483 1485                          nbl_end_crit(vp);
1484 1486                  VN_RELE(vp);
1485 1487                  /* mark as wouldblock so response is dropped */
1486 1488                  curthread->t_flag |= T_WOULDBLOCK;
1487 1489                  mutex_enter(&nsrv->async_write_lock);
1488 1490                  if (nsrv->async_write_head == nlp)
1489 1491                          nsrv->async_write_head = nlp->next;
1490 1492                  else {
1491 1493                          lp = nsrv->async_write_head;
1492 1494                          while (lp->next != nlp)
1493 1495                                  lp = lp->next;
1494 1496                          lp->next = nlp->next;
1495 1497                  }
1496 1498                  for (rp = nlp->list; rp != NULL; rp = rp->list) {
1497 1499                          if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1498 1500                                  rp->ns->ns_status = puterrno(error);
1499 1501                                  rp->thread->t_flag |= T_WOULDBLOCK;
1500 1502                          }
1501 1503                  }
1502 1504                  cv_broadcast(&nlp->cv);
1503 1505                  mutex_exit(&nsrv->async_write_lock);
1504 1506  
1505 1507                  return;
1506 1508          }
1507 1509  
1508 1510          /*
1509 1511           * Disconnect this cluster from the list of clusters.
1510 1512           * The cluster that is being dealt with must be fixed
1511 1513           * in size after this point, so there is no reason
1512 1514           * to leave it on the list so that new requests can
1513 1515           * find it.
1514 1516           *
1515 1517           * The algorithm is that the first write request will
1516 1518           * create a cluster, convert the file handle to a
1517 1519           * vnode pointer, and then lock the file for writing.
1518 1520           * This request is not likely to be clustered with
1519 1521           * any others.  However, the next request will create
1520 1522           * a new cluster and be blocked in VOP_RWLOCK while
1521 1523           * the first request is being processed.  This delay
1522 1524           * will allow more requests to be clustered in this
1523 1525           * second cluster.
1524 1526           */
1525 1527          mutex_enter(&nsrv->async_write_lock);
1526 1528          if (nsrv->async_write_head == nlp)
1527 1529                  nsrv->async_write_head = nlp->next;
1528 1530          else {
1529 1531                  lp = nsrv->async_write_head;
1530 1532                  while (lp->next != nlp)
1531 1533                          lp = lp->next;
1532 1534                  lp->next = nlp->next;
1533 1535          }
1534 1536          mutex_exit(&nsrv->async_write_lock);
1535 1537  
1536 1538          /*
1537 1539           * Step through the list of requests in this cluster.
1538 1540           * We need to check permissions to make sure that all
1539 1541           * of the requests have sufficient permission to write
1540 1542           * the file.  A cluster can be composed of requests
1541 1543           * from different clients and different users on each
1542 1544           * client.
1543 1545           *
1544 1546           * As a side effect, we also calculate the size of the
1545 1547           * byte range that this cluster encompasses.
1546 1548           */
1547 1549          rp = nlp->list;
1548 1550          off = rp->wa->wa_offset;
1549 1551          len = (uint_t)0;
1550 1552          do {
1551 1553                  if (rdonly(rp->ro, vp)) {
1552 1554                          rp->ns->ns_status = NFSERR_ROFS;
1553 1555                          t_flag = curthread->t_flag & T_WOULDBLOCK;
1554 1556                          rp->thread->t_flag |= t_flag;
1555 1557                          continue;
1556 1558                  }
1557 1559  
1558 1560                  va.va_mask = AT_UID|AT_MODE;
1559 1561  
1560 1562                  error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1561 1563  
1562 1564                  if (!error) {
1563 1565                          if (crgetuid(rp->cr) != va.va_uid) {
1564 1566                                  /*
1565 1567                                   * This is a kludge to allow writes of files
1566 1568                                   * created with read only permission.  The
1567 1569                                   * owner of the file is always allowed to
1568 1570                                   * write it.
1569 1571                                   */
1570 1572                                  error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1571 1573                          }
1572 1574                          if (!error && MANDLOCK(vp, va.va_mode))
1573 1575                                  error = EACCES;
1574 1576                  }
1575 1577  
1576 1578                  /*
1577 1579                   * Check for a conflict with a nbmand-locked region.
1578 1580                   */
1579 1581                  if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1580 1582                      rp->wa->wa_count, 0, NULL)) {
1581 1583                          error = EACCES;
1582 1584                  }
1583 1585  
1584 1586                  if (error) {
1585 1587                          rp->ns->ns_status = puterrno(error);
1586 1588                          t_flag = curthread->t_flag & T_WOULDBLOCK;
1587 1589                          rp->thread->t_flag |= t_flag;
1588 1590                          continue;
1589 1591                  }
1590 1592                  if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1591 1593                          len = rp->wa->wa_offset + rp->wa->wa_count - off;
1592 1594          } while ((rp = rp->list) != NULL);
1593 1595  
1594 1596          /*
1595 1597           * Step through the cluster attempting to gather as many
1596 1598           * requests which are contiguous as possible.  These
1597 1599           * contiguous requests are handled via one call to VOP_WRITE
1598 1600           * instead of different calls to VOP_WRITE.  We also keep
1599 1601           * track of the fact that any data was written.
1600 1602           */
1601 1603          rp = nlp->list;
1602 1604          data_written = 0;
1603 1605          do {
1604 1606                  /*
1605 1607                   * Skip any requests which are already marked as having an
1606 1608                   * error.
1607 1609                   */
1608 1610                  if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1609 1611                          rp = rp->list;
1610 1612                          continue;
1611 1613                  }
1612 1614  
1613 1615                  /*
1614 1616                   * Count the number of iovec's which are required
1615 1617                   * to handle this set of requests.  One iovec is
1616 1618                   * needed for each data buffer, whether addressed
1617 1619                   * by wa_data or by the b_rptr pointers in the
1618 1620                   * mblk chains.
1619 1621                   */
1620 1622                  iovcnt = 0;
1621 1623                  lrp = rp;
1622 1624                  for (;;) {
1623 1625                          if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1624 1626                                  iovcnt++;
1625 1627                          else {
1626 1628                                  m = lrp->wa->wa_mblk;
1627 1629                                  while (m != NULL) {
1628 1630                                          iovcnt++;
1629 1631                                          m = m->b_cont;
1630 1632                                  }
1631 1633                          }
1632 1634                          if (lrp->list == NULL ||
1633 1635                              lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1634 1636                              lrp->wa->wa_offset + lrp->wa->wa_count !=
1635 1637                              lrp->list->wa->wa_offset) {
1636 1638                                  lrp = lrp->list;
1637 1639                                  break;
1638 1640                          }
1639 1641                          lrp = lrp->list;
1640 1642                  }
1641 1643  
1642 1644                  if (iovcnt <= MAXCLIOVECS) {
1643 1645  #ifdef DEBUG
1644 1646                          rfs_write_hits++;
1645 1647  #endif
1646 1648                          niovp = iov;
1647 1649                  } else {
1648 1650  #ifdef DEBUG
1649 1651                          rfs_write_misses++;
1650 1652  #endif
1651 1653                          niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1652 1654                  }
1653 1655                  /*
1654 1656                   * Put together the scatter/gather iovecs.
1655 1657                   */
1656 1658                  iovp = niovp;
1657 1659                  trp = rp;
1658 1660                  count = 0;
1659 1661                  do {
1660 1662                          if (trp->wa->wa_data || trp->wa->wa_rlist) {
1661 1663                                  if (trp->wa->wa_rlist) {
1662 1664                                          iovp->iov_base =
1663 1665                                              (char *)((trp->wa->wa_rlist)->
1664 1666                                              u.c_daddr3);
1665 1667                                          iovp->iov_len = trp->wa->wa_count;
1666 1668                                  } else  {
1667 1669                                          iovp->iov_base = trp->wa->wa_data;
1668 1670                                          iovp->iov_len = trp->wa->wa_count;
1669 1671                                  }
1670 1672                                  iovp++;
1671 1673                          } else {
1672 1674                                  m = trp->wa->wa_mblk;
1673 1675                                  rcount = trp->wa->wa_count;
1674 1676                                  while (m != NULL) {
1675 1677                                          iovp->iov_base = (caddr_t)m->b_rptr;
1676 1678                                          iovp->iov_len = (m->b_wptr - m->b_rptr);
1677 1679                                          rcount -= iovp->iov_len;
1678 1680                                          if (rcount < 0)
1679 1681                                                  iovp->iov_len += rcount;
1680 1682                                          iovp++;
1681 1683                                          if (rcount <= 0)
1682 1684                                                  break;
1683 1685                                          m = m->b_cont;
1684 1686                                  }
1685 1687                          }
1686 1688                          count += trp->wa->wa_count;
1687 1689                          trp = trp->list;
1688 1690                  } while (trp != lrp);
1689 1691  
1690 1692                  uio.uio_iov = niovp;
1691 1693                  uio.uio_iovcnt = iovcnt;
1692 1694                  uio.uio_segflg = UIO_SYSSPACE;
1693 1695                  uio.uio_extflg = UIO_COPY_DEFAULT;
1694 1696                  uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1695 1697                  uio.uio_resid = count;
1696 1698                  /*
1697 1699                   * The limit is checked on the client. We
1698 1700                   * should allow any size writes here.
1699 1701                   */
1700 1702                  uio.uio_llimit = curproc->p_fsz_ctl;
1701 1703                  rlimit = uio.uio_llimit - rp->wa->wa_offset;
1702 1704                  if (rlimit < (rlim64_t)uio.uio_resid)
1703 1705                          uio.uio_resid = (uint_t)rlimit;
1704 1706  
1705 1707                  /*
1706 1708                   * For now we assume no append mode.
1707 1709                   */
1708 1710  
1709 1711                  /*
1710 1712                   * We're changing creds because VM may fault
1711 1713                   * and we need the cred of the current
1712 1714                   * thread to be used if quota checking is
1713 1715                   * enabled.
1714 1716                   */
1715 1717                  savecred = curthread->t_cred;
1716 1718                  curthread->t_cred = cr;
1717 1719                  error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1718 1720                  curthread->t_cred = savecred;
1719 1721  
1720 1722                  /* check if a monitor detected a delegation conflict */
1721 1723                  if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1722 1724                          /* mark as wouldblock so response is dropped */
1723 1725                          curthread->t_flag |= T_WOULDBLOCK;
1724 1726  
1725 1727                  if (niovp != iov)
1726 1728                          kmem_free(niovp, sizeof (*niovp) * iovcnt);
1727 1729  
1728 1730                  if (!error) {
1729 1731                          data_written = 1;
1730 1732                          /*
1731 1733                           * Get attributes again so we send the latest mod
1732 1734                           * time to the client side for its cache.
1733 1735                           */
1734 1736                          va.va_mask = AT_ALL;    /* now we want everything */
1735 1737  
1736 1738                          error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1737 1739  
1738 1740                          if (!error)
1739 1741                                  acl_perm(vp, exi, &va, rp->cr);
1740 1742                  }
1741 1743  
1742 1744                  /*
1743 1745                   * Fill in the status responses for each request
1744 1746                   * which was just handled.  Also, copy the latest
1745 1747                   * attributes in to the attribute responses if
1746 1748                   * appropriate.
1747 1749                   */
1748 1750                  t_flag = curthread->t_flag & T_WOULDBLOCK;
1749 1751                  do {
1750 1752                          rp->thread->t_flag |= t_flag;
1751 1753                          /* check for overflows */
1752 1754                          if (!error) {
1753 1755                                  error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1754 1756                          }
1755 1757                          rp->ns->ns_status = puterrno(error);
1756 1758                          rp = rp->list;
1757 1759                  } while (rp != lrp);
1758 1760          } while (rp != NULL);
1759 1761  
1760 1762          /*
1761 1763           * If any data was written at all, then we need to flush
1762 1764           * the data and metadata to stable storage.
1763 1765           */
1764 1766          if (data_written) {
1765 1767                  error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1766 1768  
1767 1769                  if (!error) {
1768 1770                          error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1769 1771                  }
1770 1772          }
1771 1773  
1772 1774          VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1773 1775  
1774 1776          if (in_crit)
1775 1777                  nbl_end_crit(vp);
1776 1778          VN_RELE(vp);
1777 1779  
1778 1780          t_flag = curthread->t_flag & T_WOULDBLOCK;
1779 1781          mutex_enter(&nsrv->async_write_lock);
1780 1782          for (rp = nlp->list; rp != NULL; rp = rp->list) {
1781 1783                  if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1782 1784                          rp->ns->ns_status = puterrno(error);
1783 1785                          rp->thread->t_flag |= t_flag;
1784 1786                  }
1785 1787          }
1786 1788          cv_broadcast(&nlp->cv);
1787 1789          mutex_exit(&nsrv->async_write_lock);
1788 1790  
1789 1791  }
1790 1792  
1791 1793  void *
1792 1794  rfs_write_getfh(struct nfswriteargs *wa)
1793 1795  {
1794 1796          return (&wa->wa_fhandle);
1795 1797  }
1796 1798  
1797 1799  /*
1798 1800   * Create a file.
1799 1801   * Creates a file with given attributes and returns those attributes
1800 1802   * and an fhandle for the new file.
1801 1803   */
1802 1804  void
1803 1805  rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1804 1806      struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1805 1807  {
1806 1808          int error;
1807 1809          int lookuperr;
1808 1810          int in_crit = 0;
1809 1811          struct vattr va;
1810 1812          vnode_t *vp;
1811 1813          vnode_t *realvp;
1812 1814          vnode_t *dvp;
1813 1815          char *name = args->ca_da.da_name;
1814 1816          vnode_t *tvp = NULL;
1815 1817          int mode;
1816 1818          int lookup_ok;
1817 1819          bool_t trunc;
1818 1820          struct sockaddr *ca;
1819 1821  
1820 1822          /*
1821 1823           * Disallow NULL paths
1822 1824           */
1823 1825          if (name == NULL || *name == '\0') {
1824 1826                  dr->dr_status = NFSERR_ACCES;
1825 1827                  return;
1826 1828          }
1827 1829  
1828 1830          dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1829 1831          if (dvp == NULL) {
1830 1832                  dr->dr_status = NFSERR_STALE;
1831 1833                  return;
1832 1834          }
1833 1835  
1834 1836          error = sattr_to_vattr(args->ca_sa, &va);
1835 1837          if (error) {
1836 1838                  dr->dr_status = puterrno(error);
1837 1839                  return;
1838 1840          }
1839 1841  
1840 1842          /*
1841 1843           * Must specify the mode.
1842 1844           */
1843 1845          if (!(va.va_mask & AT_MODE)) {
1844 1846                  VN_RELE(dvp);
1845 1847                  dr->dr_status = NFSERR_INVAL;
1846 1848                  return;
1847 1849          }
1848 1850  
1849 1851          /*
1850 1852           * This is a completely gross hack to make mknod
1851 1853           * work over the wire until we can wack the protocol
1852 1854           */
1853 1855          if ((va.va_mode & IFMT) == IFCHR) {
1854 1856                  if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1855 1857                          va.va_type = VFIFO;     /* xtra kludge for named pipe */
1856 1858                  else {
1857 1859                          va.va_type = VCHR;
1858 1860                          /*
1859 1861                           * uncompress the received dev_t
1860 1862                           * if the top half is zero indicating a request
1861 1863                           * from an `older style' OS.
1862 1864                           */
1863 1865                          if ((va.va_size & 0xffff0000) == 0)
1864 1866                                  va.va_rdev = nfsv2_expdev(va.va_size);
1865 1867                          else
1866 1868                                  va.va_rdev = (dev_t)va.va_size;
1867 1869                  }
1868 1870                  va.va_mask &= ~AT_SIZE;
1869 1871          } else if ((va.va_mode & IFMT) == IFBLK) {
1870 1872                  va.va_type = VBLK;
1871 1873                  /*
1872 1874                   * uncompress the received dev_t
1873 1875                   * if the top half is zero indicating a request
1874 1876                   * from an `older style' OS.
1875 1877                   */
1876 1878                  if ((va.va_size & 0xffff0000) == 0)
1877 1879                          va.va_rdev = nfsv2_expdev(va.va_size);
1878 1880                  else
1879 1881                          va.va_rdev = (dev_t)va.va_size;
1880 1882                  va.va_mask &= ~AT_SIZE;
1881 1883          } else if ((va.va_mode & IFMT) == IFSOCK) {
1882 1884                  va.va_type = VSOCK;
1883 1885          } else {
1884 1886                  va.va_type = VREG;
1885 1887          }
1886 1888          va.va_mode &= ~IFMT;
1887 1889          va.va_mask |= AT_TYPE;
1888 1890  
1889 1891          ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1890 1892          name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1891 1893              MAXPATHLEN);
1892 1894          if (name == NULL) {
1893 1895                  dr->dr_status = puterrno(EINVAL);
1894 1896                  return;
1895 1897          }
1896 1898  
1897 1899          /*
1898 1900           * Why was the choice made to use VWRITE as the mode to the
1899 1901           * call to VOP_CREATE ? This results in a bug.  When a client
1900 1902           * opens a file that already exists and is RDONLY, the second
1901 1903           * open fails with an EACESS because of the mode.
1902 1904           * bug ID 1054648.
1903 1905           */
1904 1906          lookup_ok = 0;
1905 1907          mode = VWRITE;
1906 1908          if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1907 1909                  error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1908 1910                      NULL, NULL, NULL);
1909 1911                  if (!error) {
1910 1912                          struct vattr at;
1911 1913  
1912 1914                          lookup_ok = 1;
1913 1915                          at.va_mask = AT_MODE;
1914 1916                          error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1915 1917                          if (!error)
1916 1918                                  mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1917 1919                          VN_RELE(tvp);
1918 1920                          tvp = NULL;
1919 1921                  }
1920 1922          }
1921 1923  
1922 1924          if (!lookup_ok) {
1923 1925                  if (rdonly(ro, dvp)) {
1924 1926                          error = EROFS;
1925 1927                  } else if (va.va_type != VREG && va.va_type != VFIFO &&
1926 1928                      va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1927 1929                          error = EPERM;
1928 1930                  } else {
1929 1931                          error = 0;
1930 1932                  }
1931 1933          }
1932 1934  
1933 1935          /*
1934 1936           * If file size is being modified on an already existing file
1935 1937           * make sure that there are no conflicting non-blocking mandatory
1936 1938           * locks in the region being manipulated. Return EACCES if there
1937 1939           * are conflicting locks.
1938 1940           */
1939 1941          if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1940 1942                  lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1941 1943                      NULL, NULL, NULL);
1942 1944  
1943 1945                  if (!lookuperr &&
1944 1946                      rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1945 1947                          VN_RELE(tvp);
1946 1948                          curthread->t_flag |= T_WOULDBLOCK;
1947 1949                          goto out;
1948 1950                  }
1949 1951  
1950 1952                  if (!lookuperr && nbl_need_check(tvp)) {
1951 1953                          /*
1952 1954                           * The file exists. Now check if it has any
1953 1955                           * conflicting non-blocking mandatory locks
1954 1956                           * in the region being changed.
1955 1957                           */
1956 1958                          struct vattr bva;
1957 1959                          u_offset_t offset;
1958 1960                          ssize_t length;
1959 1961  
1960 1962                          nbl_start_crit(tvp, RW_READER);
1961 1963                          in_crit = 1;
1962 1964  
1963 1965                          bva.va_mask = AT_SIZE;
1964 1966                          error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1965 1967                          if (!error) {
1966 1968                                  if (va.va_size < bva.va_size) {
1967 1969                                          offset = va.va_size;
1968 1970                                          length = bva.va_size - va.va_size;
1969 1971                                  } else {
1970 1972                                          offset = bva.va_size;
1971 1973                                          length = va.va_size - bva.va_size;
1972 1974                                  }
1973 1975                                  if (length) {
1974 1976                                          if (nbl_conflict(tvp, NBL_WRITE,
1975 1977                                              offset, length, 0, NULL)) {
1976 1978                                                  error = EACCES;
1977 1979                                          }
1978 1980                                  }
1979 1981                          }
1980 1982                          if (error) {
1981 1983                                  nbl_end_crit(tvp);
1982 1984                                  VN_RELE(tvp);
1983 1985                                  in_crit = 0;
1984 1986                          }
1985 1987                  } else if (tvp != NULL) {
1986 1988                          VN_RELE(tvp);
1987 1989                  }
1988 1990          }
1989 1991  
1990 1992          if (!error) {
1991 1993                  /*
1992 1994                   * If filesystem is shared with nosuid the remove any
1993 1995                   * setuid/setgid bits on create.
1994 1996                   */
1995 1997                  if (va.va_type == VREG &&
1996 1998                      exi->exi_export.ex_flags & EX_NOSUID)
1997 1999                          va.va_mode &= ~(VSUID | VSGID);
1998 2000  
1999 2001                  error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
2000 2002                      NULL, NULL);
2001 2003  
2002 2004                  if (!error) {
2003 2005  
2004 2006                          if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
2005 2007                                  trunc = TRUE;
2006 2008                          else
2007 2009                                  trunc = FALSE;
2008 2010  
2009 2011                          if (rfs4_check_delegated(FWRITE, vp, trunc)) {
2010 2012                                  VN_RELE(vp);
2011 2013                                  curthread->t_flag |= T_WOULDBLOCK;
2012 2014                                  goto out;
2013 2015                          }
2014 2016                          va.va_mask = AT_ALL;
2015 2017  
2016 2018                          error = VOP_GETATTR(vp, &va, 0, cr, NULL);
2017 2019  
2018 2020                          /* check for overflows */
2019 2021                          if (!error) {
2020 2022                                  acl_perm(vp, exi, &va, cr);
2021 2023                                  error = vattr_to_nattr(&va, &dr->dr_attr);
2022 2024                                  if (!error) {
2023 2025                                          error = makefh(&dr->dr_fhandle, vp,
2024 2026                                              exi);
2025 2027                                  }
2026 2028                          }
2027 2029                          /*
2028 2030                           * Force modified metadata out to stable storage.
2029 2031                           *
2030 2032                           * if a underlying vp exists, pass it to VOP_FSYNC
2031 2033                           */
2032 2034                          if (VOP_REALVP(vp, &realvp, NULL) == 0)
2033 2035                                  (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
2034 2036                          else
2035 2037                                  (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2036 2038                          VN_RELE(vp);
2037 2039                  }
2038 2040  
2039 2041                  if (in_crit) {
2040 2042                          nbl_end_crit(tvp);
2041 2043                          VN_RELE(tvp);
2042 2044                  }
2043 2045          }
2044 2046  
2045 2047          /*
2046 2048           * Force modified data and metadata out to stable storage.
2047 2049           */
2048 2050          (void) VOP_FSYNC(dvp, 0, cr, NULL);
2049 2051  
2050 2052  out:
2051 2053  
2052 2054          VN_RELE(dvp);
2053 2055  
2054 2056          dr->dr_status = puterrno(error);
2055 2057  
2056 2058          if (name != args->ca_da.da_name)
2057 2059                  kmem_free(name, MAXPATHLEN);
2058 2060  }
2059 2061  void *
2060 2062  rfs_create_getfh(struct nfscreatargs *args)
2061 2063  {
2062 2064          return (args->ca_da.da_fhandle);
2063 2065  }
2064 2066  
2065 2067  /*
2066 2068   * Remove a file.
2067 2069   * Remove named file from parent directory.
2068 2070   */
2069 2071  /* ARGSUSED */
2070 2072  void
2071 2073  rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
2072 2074      struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2073 2075  {
2074 2076          int error = 0;
2075 2077          vnode_t *vp;
2076 2078          vnode_t *targvp;
2077 2079          int in_crit = 0;
2078 2080  
2079 2081          /*
2080 2082           * Disallow NULL paths
2081 2083           */
2082 2084          if (da->da_name == NULL || *da->da_name == '\0') {
2083 2085                  *status = NFSERR_ACCES;
2084 2086                  return;
2085 2087          }
2086 2088  
2087 2089          vp = nfs_fhtovp(da->da_fhandle, exi);
2088 2090          if (vp == NULL) {
2089 2091                  *status = NFSERR_STALE;
2090 2092                  return;
2091 2093          }
2092 2094  
2093 2095          if (rdonly(ro, vp)) {
2094 2096                  VN_RELE(vp);
2095 2097                  *status = NFSERR_ROFS;
2096 2098                  return;
2097 2099          }
2098 2100  
2099 2101          /*
2100 2102           * Check for a conflict with a non-blocking mandatory share reservation.
2101 2103           */
2102 2104          error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
2103 2105              NULL, cr, NULL, NULL, NULL);
2104 2106          if (error != 0) {
2105 2107                  VN_RELE(vp);
2106 2108                  *status = puterrno(error);
2107 2109                  return;
2108 2110          }
2109 2111  
2110 2112          /*
2111 2113           * If the file is delegated to an v4 client, then initiate
2112 2114           * recall and drop this request (by setting T_WOULDBLOCK).
2113 2115           * The client will eventually re-transmit the request and
2114 2116           * (hopefully), by then, the v4 client will have returned
2115 2117           * the delegation.
2116 2118           */
2117 2119  
2118 2120          if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2119 2121                  VN_RELE(vp);
2120 2122                  VN_RELE(targvp);
2121 2123                  curthread->t_flag |= T_WOULDBLOCK;
2122 2124                  return;
2123 2125          }
2124 2126  
2125 2127          if (nbl_need_check(targvp)) {
2126 2128                  nbl_start_crit(targvp, RW_READER);
2127 2129                  in_crit = 1;
2128 2130                  if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
2129 2131                          error = EACCES;
2130 2132                          goto out;
2131 2133                  }
2132 2134          }
2133 2135  
2134 2136          error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
2135 2137  
2136 2138          /*
2137 2139           * Force modified data and metadata out to stable storage.
2138 2140           */
2139 2141          (void) VOP_FSYNC(vp, 0, cr, NULL);
2140 2142  
2141 2143  out:
2142 2144          if (in_crit)
2143 2145                  nbl_end_crit(targvp);
2144 2146          VN_RELE(targvp);
2145 2147          VN_RELE(vp);
2146 2148  
2147 2149          *status = puterrno(error);
2148 2150  
2149 2151  }
2150 2152  
2151 2153  void *
2152 2154  rfs_remove_getfh(struct nfsdiropargs *da)
2153 2155  {
2154 2156          return (da->da_fhandle);
2155 2157  }
2156 2158  
2157 2159  /*
2158 2160   * rename a file
2159 2161   * Give a file (from) a new name (to).
2160 2162   */
2161 2163  /* ARGSUSED */
2162 2164  void
2163 2165  rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
2164 2166      struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2165 2167  {
2166 2168          int error = 0;
2167 2169          vnode_t *fromvp;
2168 2170          vnode_t *tovp;
2169 2171          struct exportinfo *to_exi;
2170 2172          fhandle_t *fh;
2171 2173          vnode_t *srcvp;
2172 2174          vnode_t *targvp;
2173 2175          int in_crit = 0;
2174 2176  
2175 2177          fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
2176 2178          if (fromvp == NULL) {
2177 2179                  *status = NFSERR_STALE;
2178 2180                  return;
2179 2181          }
2180 2182  
2181 2183          fh = args->rna_to.da_fhandle;
2182 2184          to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2183 2185          if (to_exi == NULL) {
2184 2186                  VN_RELE(fromvp);
2185 2187                  *status = NFSERR_ACCES;
2186 2188                  return;
2187 2189          }
2188 2190          exi_rele(to_exi);
2189 2191  
2190 2192          if (to_exi != exi) {
2191 2193                  VN_RELE(fromvp);
2192 2194                  *status = NFSERR_XDEV;
2193 2195                  return;
2194 2196          }
2195 2197  
2196 2198          tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2197 2199          if (tovp == NULL) {
2198 2200                  VN_RELE(fromvp);
2199 2201                  *status = NFSERR_STALE;
2200 2202                  return;
2201 2203          }
2202 2204  
2203 2205          if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
2204 2206                  VN_RELE(tovp);
2205 2207                  VN_RELE(fromvp);
2206 2208                  *status = NFSERR_NOTDIR;
2207 2209                  return;
2208 2210          }
2209 2211  
2210 2212          /*
2211 2213           * Disallow NULL paths
2212 2214           */
2213 2215          if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
2214 2216              args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
2215 2217                  VN_RELE(tovp);
2216 2218                  VN_RELE(fromvp);
2217 2219                  *status = NFSERR_ACCES;
2218 2220                  return;
2219 2221          }
2220 2222  
2221 2223          if (rdonly(ro, tovp)) {
2222 2224                  VN_RELE(tovp);
2223 2225                  VN_RELE(fromvp);
2224 2226                  *status = NFSERR_ROFS;
2225 2227                  return;
2226 2228          }
2227 2229  
2228 2230          /*
2229 2231           * Check for a conflict with a non-blocking mandatory share reservation.
2230 2232           */
2231 2233          error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2232 2234              NULL, cr, NULL, NULL, NULL);
2233 2235          if (error != 0) {
2234 2236                  VN_RELE(tovp);
2235 2237                  VN_RELE(fromvp);
2236 2238                  *status = puterrno(error);
2237 2239                  return;
2238 2240          }
2239 2241  
2240 2242          /* Check for delegations on the source file */
2241 2243  
2242 2244          if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2243 2245                  VN_RELE(tovp);
2244 2246                  VN_RELE(fromvp);
2245 2247                  VN_RELE(srcvp);
2246 2248                  curthread->t_flag |= T_WOULDBLOCK;
2247 2249                  return;
2248 2250          }
2249 2251  
2250 2252          /* Check for delegation on the file being renamed over, if it exists */
2251 2253  
2252 2254          if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
2253 2255              VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2254 2256              NULL, NULL, NULL) == 0) {
2255 2257  
2256 2258                  if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2257 2259                          VN_RELE(tovp);
2258 2260                          VN_RELE(fromvp);
2259 2261                          VN_RELE(srcvp);
2260 2262                          VN_RELE(targvp);
2261 2263                          curthread->t_flag |= T_WOULDBLOCK;
2262 2264                          return;
2263 2265                  }
2264 2266                  VN_RELE(targvp);
2265 2267          }
2266 2268  
2267 2269  
2268 2270          if (nbl_need_check(srcvp)) {
2269 2271                  nbl_start_crit(srcvp, RW_READER);
2270 2272                  in_crit = 1;
2271 2273                  if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
2272 2274                          error = EACCES;
2273 2275                          goto out;
2274 2276                  }
2275 2277          }
2276 2278  
2277 2279          error = VOP_RENAME(fromvp, args->rna_from.da_name,
2278 2280              tovp, args->rna_to.da_name, cr, NULL, 0);
2279 2281  
2280 2282          if (error == 0)
2281 2283                  vn_renamepath(tovp, srcvp, args->rna_to.da_name,
2282 2284                      strlen(args->rna_to.da_name));
2283 2285  
2284 2286          /*
2285 2287           * Force modified data and metadata out to stable storage.
2286 2288           */
2287 2289          (void) VOP_FSYNC(tovp, 0, cr, NULL);
2288 2290          (void) VOP_FSYNC(fromvp, 0, cr, NULL);
2289 2291  
2290 2292  out:
2291 2293          if (in_crit)
2292 2294                  nbl_end_crit(srcvp);
2293 2295          VN_RELE(srcvp);
2294 2296          VN_RELE(tovp);
2295 2297          VN_RELE(fromvp);
2296 2298  
2297 2299          *status = puterrno(error);
2298 2300  
2299 2301  }
2300 2302  void *
2301 2303  rfs_rename_getfh(struct nfsrnmargs *args)
2302 2304  {
2303 2305          return (args->rna_from.da_fhandle);
2304 2306  }
2305 2307  
2306 2308  /*
2307 2309   * Link to a file.
2308 2310   * Create a file (to) which is a hard link to the given file (from).
2309 2311   */
2310 2312  /* ARGSUSED */
2311 2313  void
2312 2314  rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2313 2315      struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2314 2316  {
2315 2317          int error;
2316 2318          vnode_t *fromvp;
2317 2319          vnode_t *tovp;
2318 2320          struct exportinfo *to_exi;
2319 2321          fhandle_t *fh;
2320 2322  
2321 2323          fromvp = nfs_fhtovp(args->la_from, exi);
2322 2324          if (fromvp == NULL) {
2323 2325                  *status = NFSERR_STALE;
2324 2326                  return;
2325 2327          }
2326 2328  
2327 2329          fh = args->la_to.da_fhandle;
2328 2330          to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2329 2331          if (to_exi == NULL) {
2330 2332                  VN_RELE(fromvp);
2331 2333                  *status = NFSERR_ACCES;
2332 2334                  return;
2333 2335          }
2334 2336          exi_rele(to_exi);
2335 2337  
2336 2338          if (to_exi != exi) {
2337 2339                  VN_RELE(fromvp);
2338 2340                  *status = NFSERR_XDEV;
2339 2341                  return;
2340 2342          }
2341 2343  
2342 2344          tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2343 2345          if (tovp == NULL) {
2344 2346                  VN_RELE(fromvp);
2345 2347                  *status = NFSERR_STALE;
2346 2348                  return;
2347 2349          }
2348 2350  
2349 2351          if (tovp->v_type != VDIR) {
2350 2352                  VN_RELE(tovp);
2351 2353                  VN_RELE(fromvp);
2352 2354                  *status = NFSERR_NOTDIR;
2353 2355                  return;
2354 2356          }
2355 2357          /*
2356 2358           * Disallow NULL paths
2357 2359           */
2358 2360          if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2359 2361                  VN_RELE(tovp);
2360 2362                  VN_RELE(fromvp);
2361 2363                  *status = NFSERR_ACCES;
2362 2364                  return;
2363 2365          }
2364 2366  
2365 2367          if (rdonly(ro, tovp)) {
2366 2368                  VN_RELE(tovp);
2367 2369                  VN_RELE(fromvp);
2368 2370                  *status = NFSERR_ROFS;
2369 2371                  return;
2370 2372          }
2371 2373  
2372 2374          error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2373 2375  
2374 2376          /*
2375 2377           * Force modified data and metadata out to stable storage.
2376 2378           */
2377 2379          (void) VOP_FSYNC(tovp, 0, cr, NULL);
2378 2380          (void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2379 2381  
2380 2382          VN_RELE(tovp);
2381 2383          VN_RELE(fromvp);
2382 2384  
2383 2385          *status = puterrno(error);
2384 2386  
2385 2387  }
2386 2388  void *
2387 2389  rfs_link_getfh(struct nfslinkargs *args)
2388 2390  {
2389 2391          return (args->la_from);
2390 2392  }
2391 2393  
2392 2394  /*
2393 2395   * Symbolicly link to a file.
2394 2396   * Create a file (to) with the given attributes which is a symbolic link
2395 2397   * to the given path name (to).
2396 2398   */
2397 2399  void
2398 2400  rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2399 2401      struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2400 2402  {
2401 2403          int error;
2402 2404          struct vattr va;
2403 2405          vnode_t *vp;
2404 2406          vnode_t *svp;
2405 2407          int lerror;
2406 2408          struct sockaddr *ca;
2407 2409          char *name = NULL;
2408 2410  
2409 2411          /*
2410 2412           * Disallow NULL paths
2411 2413           */
2412 2414          if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2413 2415                  *status = NFSERR_ACCES;
2414 2416                  return;
2415 2417          }
2416 2418  
2417 2419          vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2418 2420          if (vp == NULL) {
2419 2421                  *status = NFSERR_STALE;
2420 2422                  return;
2421 2423          }
2422 2424  
2423 2425          if (rdonly(ro, vp)) {
2424 2426                  VN_RELE(vp);
2425 2427                  *status = NFSERR_ROFS;
2426 2428                  return;
2427 2429          }
2428 2430  
2429 2431          error = sattr_to_vattr(args->sla_sa, &va);
2430 2432          if (error) {
2431 2433                  VN_RELE(vp);
2432 2434                  *status = puterrno(error);
2433 2435                  return;
2434 2436          }
2435 2437  
2436 2438          if (!(va.va_mask & AT_MODE)) {
2437 2439                  VN_RELE(vp);
2438 2440                  *status = NFSERR_INVAL;
2439 2441                  return;
2440 2442          }
2441 2443  
2442 2444          ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2443 2445          name = nfscmd_convname(ca, exi, args->sla_tnm,
2444 2446              NFSCMD_CONV_INBOUND, MAXPATHLEN);
2445 2447  
2446 2448          if (name == NULL) {
2447 2449                  *status = NFSERR_ACCES;
2448 2450                  return;
2449 2451          }
2450 2452  
2451 2453          va.va_type = VLNK;
2452 2454          va.va_mask |= AT_TYPE;
2453 2455  
2454 2456          error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2455 2457  
2456 2458          /*
2457 2459           * Force new data and metadata out to stable storage.
2458 2460           */
2459 2461          lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2460 2462              NULL, cr, NULL, NULL, NULL);
2461 2463  
2462 2464          if (!lerror) {
2463 2465                  (void) VOP_FSYNC(svp, 0, cr, NULL);
2464 2466                  VN_RELE(svp);
2465 2467          }
2466 2468  
2467 2469          /*
2468 2470           * Force modified data and metadata out to stable storage.
2469 2471           */
2470 2472          (void) VOP_FSYNC(vp, 0, cr, NULL);
2471 2473  
2472 2474          VN_RELE(vp);
2473 2475  
2474 2476          *status = puterrno(error);
2475 2477          if (name != args->sla_tnm)
2476 2478                  kmem_free(name, MAXPATHLEN);
2477 2479  
2478 2480  }
2479 2481  void *
2480 2482  rfs_symlink_getfh(struct nfsslargs *args)
2481 2483  {
2482 2484          return (args->sla_from.da_fhandle);
2483 2485  }
2484 2486  
2485 2487  /*
2486 2488   * Make a directory.
2487 2489   * Create a directory with the given name, parent directory, and attributes.
2488 2490   * Returns a file handle and attributes for the new directory.
2489 2491   */
2490 2492  /* ARGSUSED */
2491 2493  void
2492 2494  rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2493 2495      struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2494 2496  {
2495 2497          int error;
2496 2498          struct vattr va;
2497 2499          vnode_t *dvp = NULL;
2498 2500          vnode_t *vp;
2499 2501          char *name = args->ca_da.da_name;
2500 2502  
2501 2503          /*
2502 2504           * Disallow NULL paths
2503 2505           */
2504 2506          if (name == NULL || *name == '\0') {
2505 2507                  dr->dr_status = NFSERR_ACCES;
2506 2508                  return;
2507 2509          }
2508 2510  
2509 2511          vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2510 2512          if (vp == NULL) {
2511 2513                  dr->dr_status = NFSERR_STALE;
2512 2514                  return;
2513 2515          }
2514 2516  
2515 2517          if (rdonly(ro, vp)) {
2516 2518                  VN_RELE(vp);
2517 2519                  dr->dr_status = NFSERR_ROFS;
2518 2520                  return;
2519 2521          }
2520 2522  
2521 2523          error = sattr_to_vattr(args->ca_sa, &va);
2522 2524          if (error) {
2523 2525                  VN_RELE(vp);
2524 2526                  dr->dr_status = puterrno(error);
2525 2527                  return;
2526 2528          }
2527 2529  
2528 2530          if (!(va.va_mask & AT_MODE)) {
2529 2531                  VN_RELE(vp);
2530 2532                  dr->dr_status = NFSERR_INVAL;
2531 2533                  return;
2532 2534          }
2533 2535  
2534 2536          va.va_type = VDIR;
2535 2537          va.va_mask |= AT_TYPE;
2536 2538  
2537 2539          error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2538 2540  
2539 2541          if (!error) {
2540 2542                  /*
2541 2543                   * Attribtutes of the newly created directory should
2542 2544                   * be returned to the client.
2543 2545                   */
2544 2546                  va.va_mask = AT_ALL; /* We want everything */
2545 2547                  error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2546 2548  
2547 2549                  /* check for overflows */
2548 2550                  if (!error) {
2549 2551                          acl_perm(vp, exi, &va, cr);
2550 2552                          error = vattr_to_nattr(&va, &dr->dr_attr);
2551 2553                          if (!error) {
2552 2554                                  error = makefh(&dr->dr_fhandle, dvp, exi);
2553 2555                          }
2554 2556                  }
2555 2557                  /*
2556 2558                   * Force new data and metadata out to stable storage.
2557 2559                   */
2558 2560                  (void) VOP_FSYNC(dvp, 0, cr, NULL);
2559 2561                  VN_RELE(dvp);
2560 2562          }
2561 2563  
2562 2564          /*
2563 2565           * Force modified data and metadata out to stable storage.
2564 2566           */
2565 2567          (void) VOP_FSYNC(vp, 0, cr, NULL);
2566 2568  
2567 2569          VN_RELE(vp);
2568 2570  
2569 2571          dr->dr_status = puterrno(error);
2570 2572  
2571 2573  }
2572 2574  void *
2573 2575  rfs_mkdir_getfh(struct nfscreatargs *args)
2574 2576  {
2575 2577          return (args->ca_da.da_fhandle);
2576 2578  }
2577 2579  
2578 2580  /*
2579 2581   * Remove a directory.
2580 2582   * Remove the given directory name from the given parent directory.
2581 2583   */
2582 2584  /* ARGSUSED */
2583 2585  void
2584 2586  rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2585 2587      struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2586 2588  {
2587 2589          int error;
2588 2590          vnode_t *vp;
2589 2591  
2590 2592          /*
2591 2593           * Disallow NULL paths
2592 2594           */
2593 2595          if (da->da_name == NULL || *da->da_name == '\0') {
2594 2596                  *status = NFSERR_ACCES;
2595 2597                  return;
2596 2598          }
2597 2599  
2598 2600          vp = nfs_fhtovp(da->da_fhandle, exi);
2599 2601          if (vp == NULL) {
2600 2602                  *status = NFSERR_STALE;
2601 2603                  return;
2602 2604          }
2603 2605  
2604 2606          if (rdonly(ro, vp)) {
2605 2607                  VN_RELE(vp);
2606 2608                  *status = NFSERR_ROFS;
2607 2609                  return;
2608 2610          }
2609 2611  
2610 2612          /*
2611 2613           * VOP_RMDIR takes a third argument (the current
2612 2614           * directory of the process).  That's because someone
2613 2615           * wants to return EINVAL if one tries to remove ".".
2614 2616           * Of course, NFS servers have no idea what their
2615 2617           * clients' current directories are.  We fake it by
2616 2618           * supplying a vnode known to exist and illegal to
2617 2619           * remove.
2618 2620           */
2619 2621          error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);
2620 2622  
2621 2623          /*
2622 2624           * Force modified data and metadata out to stable storage.
2623 2625           */
2624 2626          (void) VOP_FSYNC(vp, 0, cr, NULL);
2625 2627  
2626 2628          VN_RELE(vp);
2627 2629  
2628 2630          /*
2629 2631           * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2630 2632           * if the directory is not empty.  A System V NFS server
2631 2633           * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2632 2634           * over the wire.
2633 2635           */
2634 2636          if (error == EEXIST)
2635 2637                  *status = NFSERR_NOTEMPTY;
2636 2638          else
2637 2639                  *status = puterrno(error);
2638 2640  
2639 2641  }
2640 2642  void *
2641 2643  rfs_rmdir_getfh(struct nfsdiropargs *da)
2642 2644  {
2643 2645          return (da->da_fhandle);
2644 2646  }
2645 2647  
2646 2648  /* ARGSUSED */
2647 2649  void
2648 2650  rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2649 2651      struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2650 2652  {
2651 2653          int error;
2652 2654          int iseof;
2653 2655          struct iovec iov;
2654 2656          struct uio uio;
2655 2657          vnode_t *vp;
2656 2658          char *ndata = NULL;
2657 2659          struct sockaddr *ca;
2658 2660          size_t nents;
2659 2661          int ret;
2660 2662  
2661 2663          vp = nfs_fhtovp(&rda->rda_fh, exi);
2662 2664          if (vp == NULL) {
2663 2665                  rd->rd_entries = NULL;
2664 2666                  rd->rd_status = NFSERR_STALE;
2665 2667                  return;
2666 2668          }
2667 2669  
2668 2670          if (vp->v_type != VDIR) {
2669 2671                  VN_RELE(vp);
2670 2672                  rd->rd_entries = NULL;
2671 2673                  rd->rd_status = NFSERR_NOTDIR;
2672 2674                  return;
2673 2675          }
2674 2676  
2675 2677          (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2676 2678  
2677 2679          error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2678 2680  
2679 2681          if (error) {
2680 2682                  rd->rd_entries = NULL;
2681 2683                  goto bad;
2682 2684          }
2683 2685  
2684 2686          if (rda->rda_count == 0) {
2685 2687                  rd->rd_entries = NULL;
2686 2688                  rd->rd_size = 0;
2687 2689                  rd->rd_eof = FALSE;
2688 2690                  goto bad;
2689 2691          }
2690 2692  
2691 2693          rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2692 2694  
2693 2695          /*
2694 2696           * Allocate data for entries.  This will be freed by rfs_rddirfree.
2695 2697           */
2696 2698          rd->rd_bufsize = (uint_t)rda->rda_count;
2697 2699          rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2698 2700  
2699 2701          /*
2700 2702           * Set up io vector to read directory data
2701 2703           */
2702 2704          iov.iov_base = (caddr_t)rd->rd_entries;
2703 2705          iov.iov_len = rda->rda_count;
2704 2706          uio.uio_iov = &iov;
2705 2707          uio.uio_iovcnt = 1;
2706 2708          uio.uio_segflg = UIO_SYSSPACE;
2707 2709          uio.uio_extflg = UIO_COPY_CACHED;
2708 2710          uio.uio_loffset = (offset_t)rda->rda_offset;
2709 2711          uio.uio_resid = rda->rda_count;
2710 2712  
2711 2713          /*
2712 2714           * read directory
2713 2715           */
2714 2716          error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
2715 2717  
2716 2718          /*
2717 2719           * Clean up
2718 2720           */
2719 2721          if (!error) {
2720 2722                  /*
2721 2723                   * set size and eof
2722 2724                   */
2723 2725                  if (uio.uio_resid == rda->rda_count) {
2724 2726                          rd->rd_size = 0;
2725 2727                          rd->rd_eof = TRUE;
2726 2728                  } else {
2727 2729                          rd->rd_size = (uint32_t)(rda->rda_count -
2728 2730                              uio.uio_resid);
2729 2731                          rd->rd_eof = iseof ? TRUE : FALSE;
2730 2732                  }
2731 2733          }
2732 2734  
2733 2735          ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2734 2736          nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
2735 2737          ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
2736 2738              rda->rda_count, &ndata);
2737 2739  
2738 2740          if (ret != 0) {
2739 2741                  size_t dropbytes;
2740 2742                  /*
2741 2743                   * We had to drop one or more entries in order to fit
2742 2744                   * during the character conversion.  We need to patch
2743 2745                   * up the size and eof info.
2744 2746                   */
2745 2747                  if (rd->rd_eof)
2746 2748                          rd->rd_eof = FALSE;
2747 2749                  dropbytes = nfscmd_dropped_entrysize(
2748 2750                      (struct dirent64 *)rd->rd_entries, nents, ret);
2749 2751                  rd->rd_size -= dropbytes;
2750 2752          }
2751 2753          if (ndata == NULL) {
2752 2754                  ndata = (char *)rd->rd_entries;
2753 2755          } else if (ndata != (char *)rd->rd_entries) {
2754 2756                  kmem_free(rd->rd_entries, rd->rd_bufsize);
2755 2757                  rd->rd_entries = (void *)ndata;
2756 2758                  rd->rd_bufsize = rda->rda_count;
2757 2759          }
2758 2760  
2759 2761  bad:
2760 2762          VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2761 2763  
2762 2764  #if 0 /* notyet */
2763 2765          /*
2764 2766           * Don't do this.  It causes local disk writes when just
2765 2767           * reading the file and the overhead is deemed larger
2766 2768           * than the benefit.
2767 2769           */
2768 2770          /*
2769 2771           * Force modified metadata out to stable storage.
2770 2772           */
2771 2773          (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2772 2774  #endif
2773 2775  
2774 2776          VN_RELE(vp);
2775 2777  
2776 2778          rd->rd_status = puterrno(error);
2777 2779  
2778 2780  }
2779 2781  void *
2780 2782  rfs_readdir_getfh(struct nfsrddirargs *rda)
2781 2783  {
2782 2784          return (&rda->rda_fh);
2783 2785  }
2784 2786  void
2785 2787  rfs_rddirfree(struct nfsrddirres *rd)
2786 2788  {
2787 2789          if (rd->rd_entries != NULL)
2788 2790                  kmem_free(rd->rd_entries, rd->rd_bufsize);
2789 2791  }
2790 2792  
2791 2793  /* ARGSUSED */
2792 2794  void
2793 2795  rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2794 2796      struct svc_req *req, cred_t *cr, bool_t ro)
2795 2797  {
2796 2798          int error;
2797 2799          struct statvfs64 sb;
2798 2800          vnode_t *vp;
2799 2801  
2800 2802          vp = nfs_fhtovp(fh, exi);
2801 2803          if (vp == NULL) {
2802 2804                  fs->fs_status = NFSERR_STALE;
2803 2805                  return;
2804 2806          }
2805 2807  
2806 2808          error = VFS_STATVFS(vp->v_vfsp, &sb);
2807 2809  
2808 2810          if (!error) {
2809 2811                  fs->fs_tsize = nfstsize();
2810 2812                  fs->fs_bsize = sb.f_frsize;
2811 2813                  fs->fs_blocks = sb.f_blocks;
2812 2814                  fs->fs_bfree = sb.f_bfree;
2813 2815                  fs->fs_bavail = sb.f_bavail;
2814 2816          }
2815 2817  
2816 2818          VN_RELE(vp);
2817 2819  
2818 2820          fs->fs_status = puterrno(error);
2819 2821  
2820 2822  }
2821 2823  void *
2822 2824  rfs_statfs_getfh(fhandle_t *fh)
2823 2825  {
2824 2826          return (fh);
2825 2827  }
2826 2828  
2827 2829  static int
2828 2830  sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
2829 2831  {
2830 2832          vap->va_mask = 0;
2831 2833  
2832 2834          /*
2833 2835           * There was a sign extension bug in some VFS based systems
2834 2836           * which stored the mode as a short.  When it would get
2835 2837           * assigned to a u_long, no sign extension would occur.
2836 2838           * It needed to, but this wasn't noticed because sa_mode
2837 2839           * would then get assigned back to the short, thus ignoring
2838 2840           * the upper 16 bits of sa_mode.
2839 2841           *
2840 2842           * To make this implementation work for both broken
2841 2843           * clients and good clients, we check for both versions
2842 2844           * of the mode.
2843 2845           */
2844 2846          if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
2845 2847              sa->sa_mode != (uint32_t)-1) {
2846 2848                  vap->va_mask |= AT_MODE;
2847 2849                  vap->va_mode = sa->sa_mode;
2848 2850          }
2849 2851          if (sa->sa_uid != (uint32_t)-1) {
2850 2852                  vap->va_mask |= AT_UID;
2851 2853                  vap->va_uid = sa->sa_uid;
2852 2854          }
2853 2855          if (sa->sa_gid != (uint32_t)-1) {
2854 2856                  vap->va_mask |= AT_GID;
2855 2857                  vap->va_gid = sa->sa_gid;
2856 2858          }
2857 2859          if (sa->sa_size != (uint32_t)-1) {
2858 2860                  vap->va_mask |= AT_SIZE;
2859 2861                  vap->va_size = sa->sa_size;
2860 2862          }
2861 2863          if (sa->sa_atime.tv_sec != (int32_t)-1 &&
2862 2864              sa->sa_atime.tv_usec != (int32_t)-1) {
2863 2865  #ifndef _LP64
2864 2866                  /* return error if time overflow */
2865 2867                  if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
2866 2868                          return (EOVERFLOW);
2867 2869  #endif
2868 2870                  vap->va_mask |= AT_ATIME;
2869 2871                  /*
2870 2872                   * nfs protocol defines times as unsigned so don't extend sign,
2871 2873                   * unless sysadmin set nfs_allow_preepoch_time.
2872 2874                   */
2873 2875                  NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
2874 2876                  vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
2875 2877          }
2876 2878          if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
2877 2879              sa->sa_mtime.tv_usec != (int32_t)-1) {
2878 2880  #ifndef _LP64
2879 2881                  /* return error if time overflow */
2880 2882                  if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
2881 2883                          return (EOVERFLOW);
2882 2884  #endif
2883 2885                  vap->va_mask |= AT_MTIME;
2884 2886                  /*
2885 2887                   * nfs protocol defines times as unsigned so don't extend sign,
2886 2888                   * unless sysadmin set nfs_allow_preepoch_time.
2887 2889                   */
2888 2890                  NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2889 2891                  vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2890 2892          }
2891 2893          return (0);
2892 2894  }
2893 2895  
2894 2896  static const enum nfsftype vt_to_nf[] = {
2895 2897          0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2896 2898  };
2897 2899  
2898 2900  /*
2899 2901   * check the following fields for overflow: nodeid, size, and time.
2900 2902   * There could be a problem when converting 64-bit LP64 fields
2901 2903   * into 32-bit ones.  Return an error if there is an overflow.
2902 2904   */
2903 2905  int
2904 2906  vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
2905 2907  {
2906 2908          ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
2907 2909          na->na_type = vt_to_nf[vap->va_type];
2908 2910  
2909 2911          if (vap->va_mode == (unsigned short) -1)
2910 2912                  na->na_mode = (uint32_t)-1;
2911 2913          else
2912 2914                  na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;
2913 2915  
2914 2916          if (vap->va_uid == (unsigned short)(-1))
2915 2917                  na->na_uid = (uint32_t)(-1);
2916 2918          else if (vap->va_uid == UID_NOBODY)
2917 2919                  na->na_uid = (uint32_t)NFS_UID_NOBODY;
2918 2920          else
2919 2921                  na->na_uid = vap->va_uid;
2920 2922  
2921 2923          if (vap->va_gid == (unsigned short)(-1))
2922 2924                  na->na_gid = (uint32_t)-1;
2923 2925          else if (vap->va_gid == GID_NOBODY)
2924 2926                  na->na_gid = (uint32_t)NFS_GID_NOBODY;
2925 2927          else
2926 2928                  na->na_gid = vap->va_gid;
2927 2929  
2928 2930          /*
2929 2931           * Do we need to check fsid for overflow?  It is 64-bit in the
2930 2932           * vattr, but are bigger than 32 bit values supported?
2931 2933           */
2932 2934          na->na_fsid = vap->va_fsid;
2933 2935  
2934 2936          na->na_nodeid = vap->va_nodeid;
2935 2937  
2936 2938          /*
2937 2939           * Check to make sure that the nodeid is representable over the
2938 2940           * wire without losing bits.
2939 2941           */
2940 2942          if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
2941 2943                  return (EFBIG);
2942 2944          na->na_nlink = vap->va_nlink;
2943 2945  
2944 2946          /*
2945 2947           * Check for big files here, instead of at the caller.  See
2946 2948           * comments in cstat for large special file explanation.
2947 2949           */
2948 2950          if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
2949 2951                  if ((vap->va_type == VREG) || (vap->va_type == VDIR))
2950 2952                          return (EFBIG);
2951 2953                  if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
2952 2954                          /* UNKNOWN_SIZE | OVERFLOW */
2953 2955                          na->na_size = MAXOFF32_T;
2954 2956                  } else
2955 2957                          na->na_size = vap->va_size;
2956 2958          } else
2957 2959                  na->na_size = vap->va_size;
2958 2960  
2959 2961          /*
2960 2962           * If the vnode times overflow the 32-bit times that NFS2
2961 2963           * uses on the wire then return an error.
2962 2964           */
2963 2965          if (!NFS_VAP_TIME_OK(vap)) {
2964 2966                  return (EOVERFLOW);
2965 2967          }
2966 2968          na->na_atime.tv_sec = vap->va_atime.tv_sec;
2967 2969          na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
2968 2970  
2969 2971          na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
2970 2972          na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
2971 2973  
2972 2974          na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
2973 2975          na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;
2974 2976  
2975 2977          /*
2976 2978           * If the dev_t will fit into 16 bits then compress
2977 2979           * it, otherwise leave it alone. See comments in
2978 2980           * nfs_client.c.
2979 2981           */
2980 2982          if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
2981 2983              getmajor(vap->va_rdev) <= SO4_MAXMAJ)
2982 2984                  na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
2983 2985          else
2984 2986                  (void) cmpldev(&na->na_rdev, vap->va_rdev);
2985 2987  
2986 2988          na->na_blocks = vap->va_nblocks;
2987 2989          na->na_blocksize = vap->va_blksize;
2988 2990  
2989 2991          /*
2990 2992           * This bit of ugliness is a *TEMPORARY* hack to preserve the
2991 2993           * over-the-wire protocols for named-pipe vnodes.  It remaps the
2992 2994           * VFIFO type to the special over-the-wire type. (see note in nfs.h)
2993 2995           *
2994 2996           * BUYER BEWARE:
2995 2997           *  If you are porting the NFS to a non-Sun server, you probably
2996 2998           *  don't want to include the following block of code.  The
2997 2999           *  over-the-wire special file types will be changing with the
2998 3000           *  NFS Protocol Revision.
2999 3001           */
3000 3002          if (vap->va_type == VFIFO)
3001 3003                  NA_SETFIFO(na);
3002 3004          return (0);
3003 3005  }
3004 3006  
3005 3007  /*
3006 3008   * acl v2 support: returns approximate permission.
3007 3009   *      default: returns minimal permission (more restrictive)
3008 3010   *      aclok: returns maximal permission (less restrictive)
3009 3011   *      This routine changes the permissions that are alaredy in *va.
3010 3012   *      If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
3011 3013   *      CLASS_OBJ is always the same as GROUP_OBJ entry.
3012 3014   */
3013 3015  static void
3014 3016  acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
3015 3017  {
3016 3018          vsecattr_t      vsa;
3017 3019          int             aclcnt;
3018 3020          aclent_t        *aclentp;
3019 3021          mode_t          mask_perm;
3020 3022          mode_t          grp_perm;
3021 3023          mode_t          other_perm;
3022 3024          mode_t          other_orig;
3023 3025          int             error;
3024 3026  
3025 3027          /* dont care default acl */
3026 3028          vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
3027 3029          error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
3028 3030  
3029 3031          if (!error) {
3030 3032                  aclcnt = vsa.vsa_aclcnt;
3031 3033                  if (aclcnt > MIN_ACL_ENTRIES) {
3032 3034                          /* non-trivial ACL */
3033 3035                          aclentp = vsa.vsa_aclentp;
3034 3036                          if (exi->exi_export.ex_flags & EX_ACLOK) {
3035 3037                                  /* maximal permissions */
3036 3038                                  grp_perm = 0;
3037 3039                                  other_perm = 0;
3038 3040                                  for (; aclcnt > 0; aclcnt--, aclentp++) {
3039 3041                                          switch (aclentp->a_type) {
3040 3042                                          case USER_OBJ:
3041 3043                                                  break;
3042 3044                                          case USER:
3043 3045                                                  grp_perm |=
3044 3046                                                      aclentp->a_perm << 3;
3045 3047                                                  other_perm |= aclentp->a_perm;
3046 3048                                                  break;
3047 3049                                          case GROUP_OBJ:
3048 3050                                                  grp_perm |=
3049 3051                                                      aclentp->a_perm << 3;
3050 3052                                                  break;
3051 3053                                          case GROUP:
3052 3054                                                  other_perm |= aclentp->a_perm;
3053 3055                                                  break;
3054 3056                                          case OTHER_OBJ:
3055 3057                                                  other_orig = aclentp->a_perm;
3056 3058                                                  break;
3057 3059                                          case CLASS_OBJ:
3058 3060                                                  mask_perm = aclentp->a_perm;
3059 3061                                                  break;
3060 3062                                          default:
3061 3063                                                  break;
3062 3064                                          }
3063 3065                                  }
3064 3066                                  grp_perm &= mask_perm << 3;
3065 3067                                  other_perm &= mask_perm;
3066 3068                                  other_perm |= other_orig;
3067 3069  
3068 3070                          } else {
3069 3071                                  /* minimal permissions */
3070 3072                                  grp_perm = 070;
3071 3073                                  other_perm = 07;
3072 3074                                  for (; aclcnt > 0; aclcnt--, aclentp++) {
3073 3075                                          switch (aclentp->a_type) {
3074 3076                                          case USER_OBJ:
3075 3077                                                  break;
3076 3078                                          case USER:
3077 3079                                          case CLASS_OBJ:
3078 3080                                                  grp_perm &=
3079 3081                                                      aclentp->a_perm << 3;
3080 3082                                                  other_perm &=
3081 3083                                                      aclentp->a_perm;
3082 3084                                                  break;
3083 3085                                          case GROUP_OBJ:
3084 3086                                                  grp_perm &=
3085 3087                                                      aclentp->a_perm << 3;
3086 3088                                                  break;
3087 3089                                          case GROUP:
3088 3090                                                  other_perm &=
3089 3091                                                      aclentp->a_perm;
3090 3092                                                  break;
3091 3093                                          case OTHER_OBJ:
3092 3094                                                  other_perm &=
3093 3095                                                      aclentp->a_perm;
3094 3096                                                  break;
3095 3097                                          default:
3096 3098                                                  break;
3097 3099                                          }
3098 3100                                  }
3099 3101                          }
3100 3102                          /* copy to va */
3101 3103                          va->va_mode &= ~077;
3102 3104                          va->va_mode |= grp_perm | other_perm;
3103 3105                  }
3104 3106                  if (vsa.vsa_aclcnt)
3105 3107                          kmem_free(vsa.vsa_aclentp,
3106 3108                              vsa.vsa_aclcnt * sizeof (aclent_t));
3107 3109          }
3108 3110  }
3109 3111  
3110 3112  void
3111 3113  rfs_srvrinit(void)
3112 3114  {
3113 3115          nfs2_srv_caller_id = fs_new_caller_id();
3114 3116  }
3115 3117  
/*
 * Counterpart of rfs_srvrinit(); there is currently nothing to undo.
 */
void
rfs_srvrfini(void)
{
        /* intentionally empty */
}
3120 3122  
3121 3123  /* ARGSUSED */
3122 3124  void
3123 3125  rfs_srv_zone_init(nfs_globals_t *ng)
3124 3126  {
3125 3127          nfs_srv_t *ns;
3126 3128  
3127 3129          ns = kmem_zalloc(sizeof (*ns), KM_SLEEP);
3128 3130  
3129 3131          mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3130 3132          ns->write_async = 1;
3131 3133  
3132 3134          ng->nfs_srv = ns;
3133 3135  }
3134 3136  
3135 3137  /* ARGSUSED */
3136 3138  void
3137 3139  rfs_srv_zone_fini(nfs_globals_t *ng)
3138 3140  {
3139 3141          nfs_srv_t *ns = ng->nfs_srv;
3140 3142  
3141 3143          ng->nfs_srv = NULL;
3142 3144  
3143 3145          mutex_destroy(&ns->async_write_lock);
3144 3146          kmem_free(ns, sizeof (*ns));
3145 3147  }
3146 3148  
3147 3149  static int
3148 3150  rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
3149 3151  {
3150 3152          struct clist    *wcl;
3151 3153          int             wlist_len;
3152 3154          uint32_t        count = rr->rr_count;
3153 3155  
3154 3156          wcl = ra->ra_wlist;
3155 3157  
3156 3158          if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3157 3159                  return (FALSE);
3158 3160          }
3159 3161  
3160 3162          wcl = ra->ra_wlist;
3161 3163          rr->rr_ok.rrok_wlist_len = wlist_len;
3162 3164          rr->rr_ok.rrok_wlist = wcl;
3163 3165  
3164 3166          return (TRUE);
3165 3167  }

↓ open down ↓

2479 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX