Print this page
    
Revert exi_zone to exi_zoneid, and install exi_ne backpointer
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/fs/nfs/nfs_srv.c
          +++ new/usr/src/uts/common/fs/nfs/nfs_srv.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  25   25   * Copyright (c) 2016 by Delphix. All rights reserved.
  26   26   */
  27   27  
  28   28  /*
  29   29   *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  30   30   *      All rights reserved.
  31   31   */
  32   32  
  33   33  /*
  34   34   * Copyright 2018 Nexenta Systems, Inc.
  35   35   * Copyright (c) 2016 by Delphix. All rights reserved.
  36   36   */
  37   37  
  38   38  #include <sys/param.h>
  39   39  #include <sys/types.h>
  40   40  #include <sys/systm.h>
  41   41  #include <sys/cred.h>
  42   42  #include <sys/buf.h>
  43   43  #include <sys/vfs.h>
  44   44  #include <sys/vnode.h>
  45   45  #include <sys/uio.h>
  46   46  #include <sys/stat.h>
  47   47  #include <sys/errno.h>
  48   48  #include <sys/sysmacros.h>
  49   49  #include <sys/statvfs.h>
  50   50  #include <sys/kmem.h>
  51   51  #include <sys/kstat.h>
  52   52  #include <sys/dirent.h>
  53   53  #include <sys/cmn_err.h>
  54   54  #include <sys/debug.h>
  55   55  #include <sys/vtrace.h>
  56   56  #include <sys/mode.h>
  57   57  #include <sys/acl.h>
  58   58  #include <sys/nbmlock.h>
  59   59  #include <sys/policy.h>
  60   60  #include <sys/sdt.h>
  61   61  
  62   62  #include <rpc/types.h>
  63   63  #include <rpc/auth.h>
  64   64  #include <rpc/svc.h>
  65   65  
  66   66  #include <nfs/nfs.h>
  67   67  #include <nfs/export.h>
  68   68  #include <nfs/nfs_cmd.h>
  69   69  
  70   70  #include <vm/hat.h>
  71   71  #include <vm/as.h>
  72   72  #include <vm/seg.h>
  73   73  #include <vm/seg_map.h>
  74   74  #include <vm/seg_kmem.h>
  75   75  
  76   76  #include <sys/strsubr.h>
  77   77  
  78   78  struct rfs_async_write_list;
  79   79  
  80   80  /*
  81   81   * Zone globals of NFSv2 server
  82   82   */
  83   83  typedef struct nfs_srv {
           /* NOTE(review): presumably serializes access to async_write_head — confirm at use sites. */
  84   84          kmutex_t                        async_write_lock;
           /* Head of the pending clustered-write list (struct defined elsewhere). */
  85   85          struct rfs_async_write_list     *async_write_head;
  86   86  
  87   87          /*
  88   88           * enables write clustering if == 1
  89   89           */
  90   90          int             write_async;
  91   91  } nfs_srv_t;
  92   92  
  93   93  /*
  94   94   * These are the interface routines for the server side of the
  95   95   * Network File System.  See the NFS version 2 protocol specification
  96   96   * for a description of this interface.
  97   97   */
  98   98  
  99   99  static int      sattr_to_vattr(struct nfssattr *, struct vattr *);
 100  100  static void     acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
 101  101                          cred_t *);
 102  102  
 103  103  
 104  104  /*
 105  105   * Some "over the wire" UNIX file types.  These are encoded
 106  106   * into the mode.  This needs to be fixed in the next rev.
 107  107   */
 108  108  #define IFMT            0170000         /* type of file */
 109  109  #define IFCHR           0020000         /* character special */
 110  110  #define IFBLK           0060000         /* block special */
 111  111  #define IFSOCK          0140000         /* socket */
 112  112  
           /* Caller id stamped into caller_context_t (cc_caller_id) by the v2 ops below. */
 113  113  u_longlong_t nfs2_srv_caller_id;
 114  114  
           /*
            * Return the current zone's NFSv2 server state (never NULL; see ASSERT).
            */
 115  115  static nfs_srv_t *
 116  116  nfs_get_srv(void)
 117  117  {
           /* Zone-specific NFS globals hold the per-zone nfs_srv_t. */
 118  118          nfs_globals_t *ng = zone_getspecific(nfssrv_zone_key, curzone);
 119  119          nfs_srv_t *srv = ng->nfs_srv;
           /* Must exist by the time any v2 request runs — TODO confirm zone-init ordering. */
 120  120          ASSERT(srv != NULL);
 121  121          return (srv);
 122  122  }
 123  123  
 124  124  /*
 125  125   * Get file attributes.
 126  126   * Returns the current attributes of the file with the given fhandle.
 127  127   */
 128  128  /* ARGSUSED */
 129  129  void
 130  130  rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
 131  131      struct svc_req *req, cred_t *cr, bool_t ro)
 132  132  {
 133  133          int error;
 134  134          vnode_t *vp;
 135  135          struct vattr va;
 136  136  
           /* Translate the wire filehandle to a vnode; NULL means the handle is stale. */
 137  137          vp = nfs_fhtovp(fhp, exi);
 138  138          if (vp == NULL) {
 139  139                  ns->ns_status = NFSERR_STALE;
 140  140                  return;
 141  141          }
 142  142  
 143  143          /*
 144  144           * Do the getattr.
 145  145           */
 146  146          va.va_mask = AT_ALL;    /* we want all the attributes */
 147  147  
           /* Delegation-aware getattr (also used after setattr/lookup below). */
 148  148          error = rfs4_delegated_getattr(vp, &va, 0, cr);
 149  149  
 150  150          /* check for overflows */
 151  151          if (!error) {
 152  152                  /* Lie about the object type for a referral */
 153  153                  if (vn_is_nfs_reparse(vp, cr))
 154  154                          va.va_type = VLNK;
 155  155  
           /* NOTE(review): acl_perm presumably adjusts the reported mode per ACLs — confirm. */
 156  156                  acl_perm(vp, exi, &va, cr);
 157  157                  error = vattr_to_nattr(&va, &ns->ns_attr);
 158  158          }
 159  159  
 160  160          VN_RELE(vp);
 161  161  
           /* Convert the errno (or 0) to the NFSv2 wire status. */
 162  162          ns->ns_status = puterrno(error);
 163  163  }
           /*
            * Return the filehandle embedded in the getattr arguments —
            * presumably used by the dispatcher for export lookup (confirm caller).
            */
 164  164  void *
 165  165  rfs_getattr_getfh(fhandle_t *fhp)
 166  166  {
           /* The RPC argument is itself the filehandle. */
 167  167          return (fhp);
 168  168  }
 169  169  
 170  170  /*
 171  171   * Set file attributes.
 172  172   * Sets the attributes of the file with the given fhandle.  Returns
 173  173   * the new attributes.
 174  174   */
 175  175  /* ARGSUSED */
 176  176  void
 177  177  rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
 178  178      struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 179  179  {
 180  180          int error;
 181  181          int flag;
 182  182          int in_crit = 0;
 183  183          vnode_t *vp;
 184  184          struct vattr va;
 185  185          struct vattr bva;
 186  186          struct flock64 bf;
 187  187          caller_context_t ct;
 188  188  
 189  189  
 190  190          vp = nfs_fhtovp(&args->saa_fh, exi);
 191  191          if (vp == NULL) {
 192  192                  ns->ns_status = NFSERR_STALE;
 193  193                  return;
 194  194          }
 195  195  
           /* Mutating op on a read-only export: reject up front. */
 196  196          if (rdonly(ro, vp)) {
 197  197                  VN_RELE(vp);
 198  198                  ns->ns_status = NFSERR_ROFS;
 199  199                  return;
 200  200          }
 201  201  
 202  202          error = sattr_to_vattr(&args->saa_sa, &va);
 203  203          if (error) {
 204  204                  VN_RELE(vp);
 205  205                  ns->ns_status = puterrno(error);
 206  206                  return;
 207  207          }
 208  208  
 209  209          /*
 210  210           * If the client is requesting a change to the mtime,
 211  211           * but the nanosecond field is set to 1 billion, then
 212  212           * this is a flag to the server that it should set the
 213  213           * atime and mtime fields to the server's current time.
 214  214           * The 1 billion number actually came from the client
 215  215           * as 1 million, but the units in the over the wire
 216  216           * request are microseconds instead of nanoseconds.
 217  217           *
 218  218           * This is an overload of the protocol and should be
 219  219           * documented in the NFS Version 2 protocol specification.
 220  220           */
 221  221          if (va.va_mask & AT_MTIME) {
 222  222                  if (va.va_mtime.tv_nsec == 1000000000) {
 223  223                          gethrestime(&va.va_mtime);
 224  224                          va.va_atime = va.va_mtime;
 225  225                          va.va_mask |= AT_ATIME;
 226  226                          flag = 0;
 227  227                  } else
 228  228                          flag = ATTR_UTIME;
 229  229          } else
 230  230                  flag = 0;
 231  231  
 232  232          /*
 233  233           * If the filesystem is exported with nosuid, then mask off
 234  234           * the setuid and setgid bits.
 235  235           */
 236  236          if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
 237  237              (exi->exi_export.ex_flags & EX_NOSUID))
 238  238                  va.va_mode &= ~(VSUID | VSGID);
 239  239  
           /* CC_DONTBLOCK: on a delegation conflict, fail with EAGAIN/CC_WOULDBLOCK */
           /* instead of blocking (checked after VOP_SPACE/VOP_SETATTR below). */
 240  240          ct.cc_sysid = 0;
 241  241          ct.cc_pid = 0;
 242  242          ct.cc_caller_id = nfs2_srv_caller_id;
 243  243          ct.cc_flags = CC_DONTBLOCK;
 244  244  
 245  245          /*
 246  246           * We need to specially handle size changes because it is
 247  247           * possible for the client to create a file with modes
 248  248           * which indicate read-only, but with the file opened for
 249  249           * writing.  If the client then tries to set the size of
 250  250           * the file, then the normal access checking done in
 251  251           * VOP_SETATTR would prevent the client from doing so,
 252  252           * although it should be legal for it to do so.  To get
 253  253           * around this, we do the access checking for ourselves
 254  254           * and then use VOP_SPACE which doesn't do the access
 255  255           * checking which VOP_SETATTR does. VOP_SPACE can only
 256  256           * operate on VREG files, let VOP_SETATTR handle the other
 257  257           * extremely rare cases.
 258  258           * Also the client should not be allowed to change the
 259  259           * size of the file if there is a conflicting non-blocking
 260  260           * mandatory lock in the region of change.
 261  261           */
 262  262          if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
 263  263                  if (nbl_need_check(vp)) {
 264  264                          nbl_start_crit(vp, RW_READER);
 265  265                          in_crit = 1;
 266  266                  }
 267  267  
 268  268                  bva.va_mask = AT_UID | AT_SIZE;
 269  269  
 270  270                  error = VOP_GETATTR(vp, &bva, 0, cr, &ct);
 271  271  
 272  272                  if (error) {
 273  273                          if (in_crit)
 274  274                                  nbl_end_crit(vp);
 275  275                          VN_RELE(vp);
 276  276                          ns->ns_status = puterrno(error);
 277  277                          return;
 278  278                  }
 279  279  
           /* Check only the region between the old and new sizes for NBL conflicts. */
 280  280                  if (in_crit) {
 281  281                          u_offset_t offset;
 282  282                          ssize_t length;
 283  283  
 284  284                          if (va.va_size < bva.va_size) {
 285  285                                  offset = va.va_size;
 286  286                                  length = bva.va_size - va.va_size;
 287  287                          } else {
 288  288                                  offset = bva.va_size;
 289  289                                  length = va.va_size - bva.va_size;
 290  290                          }
 291  291                          if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
 292  292                              NULL)) {
 293  293                                  error = EACCES;
 294  294                          }
 295  295                  }
 296  296  
           /* Owner requested a real size change: do it via VOP_SPACE (which skips */
           /* access checks) and drop AT_SIZE from the later VOP_SETATTR. */
 297  297                  if (crgetuid(cr) == bva.va_uid && !error &&
 298  298                      va.va_size != bva.va_size) {
 299  299                          va.va_mask &= ~AT_SIZE;
 300  300                          bf.l_type = F_WRLCK;
 301  301                          bf.l_whence = 0;
 302  302                          bf.l_start = (off64_t)va.va_size;
 303  303                          bf.l_len = 0;
 304  304                          bf.l_sysid = 0;
 305  305                          bf.l_pid = 0;
 306  306  
 307  307                          error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
 308  308                              (offset_t)va.va_size, cr, &ct);
 309  309                  }
 310  310                  if (in_crit)
 311  311                          nbl_end_crit(vp);
 312  312          } else
 313  313                  error = 0;
 314  314  
 315  315          /*
 316  316           * Do the setattr.
 317  317           */
 318  318          if (!error && va.va_mask) {
 319  319                  error = VOP_SETATTR(vp, &va, flag, cr, &ct);
 320  320          }
 321  321  
 322  322          /*
 323  323           * check if the monitor on either vop_space or vop_setattr detected
 324  324           * a delegation conflict and if so, mark the thread flag as
 325  325           * wouldblock so that the response is dropped and the client will
 326  326           * try again.
 327  327           */
 328  328          if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
 329  329                  VN_RELE(vp);
 330  330                  curthread->t_flag |= T_WOULDBLOCK;
 331  331                  return;
 332  332          }
 333  333  
           /* On success re-fetch all attributes so the reply reflects the new state. */
 334  334          if (!error) {
 335  335                  va.va_mask = AT_ALL;    /* get everything */
 336  336  
 337  337                  error = rfs4_delegated_getattr(vp, &va, 0, cr);
 338  338  
 339  339                  /* check for overflows */
 340  340                  if (!error) {
 341  341                          acl_perm(vp, exi, &va, cr);
 342  342                          error = vattr_to_nattr(&va, &ns->ns_attr);
 343  343                  }
 344  344          }
 345  345  
 346  346          ct.cc_flags = 0;
 347  347  
 348  348          /*
 349  349           * Force modified metadata out to stable storage.
 350  350           */
 351  351          (void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);
 352  352  
 353  353          VN_RELE(vp);
 354  354  
 355  355          ns->ns_status = puterrno(error);
 356  356  }
           /*
            * Return the filehandle from the setattr arguments —
            * presumably used by the dispatcher for export lookup (confirm caller).
            */
 357  357  void *
 358  358  rfs_setattr_getfh(struct nfssaargs *args)
 359  359  {
 360  360          return (&args->saa_fh);
 361  361  }
 362  362  
 363  363  /* Change and release @exip and @vpp only in success */
 364  364  int
 365  365  rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
 366  366  {
 367  367          struct exportinfo *exi;
 368  368          vnode_t *vp = *vpp;
 369  369          fid_t fid;
 370  370          int error;
 371  371  
 372  372          VN_HOLD(vp);
 373  373  
           /* Cross into the filesystem mounted on vp; traverse replaces vp on success. */
 374  374          if ((error = traverse(&vp)) != 0) {
 375  375                  VN_RELE(vp);
 376  376                  return (error);
 377  377          }
 378  378  
           /* Get the fid of the covered root so we can look up its export entry. */
 379  379          bzero(&fid, sizeof (fid));
 380  380          fid.fid_len = MAXFIDSZ;
 381  381          error = VOP_FID(vp, &fid, NULL);
 382  382          if (error) {
 383  383                  VN_RELE(vp);
 384  384                  return (error);
 385  385          }
 386  386  
           /* Only descend if the submount is exported with "nohide". */
 387  387          exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);
 388  388          if (exi == NULL ||
 389  389              (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
 390  390                  /*
 391  391                   * It is not error, just subdir is not exported
 392  392                   * or "nohide" is not set
 393  393                   */
 394  394                  if (exi != NULL)
 395  395                          exi_rele(exi);
 396  396                  VN_RELE(vp);
 397  397          } else {
 398  398                  /* go to submount */
 399  399                  exi_rele(*exip);
 400  400                  *exip = exi;
 401  401  
 402  402                  VN_RELE(*vpp);
 403  403                  *vpp = vp;
 404  404          }
 405  405  
 406  406          return (0);
 407  407  }
 408  408  
 409  409  /*
  
    | 
      ↓ open down ↓ | 
    409 lines elided | 
    
      ↑ open up ↑ | 
  
 410  410   * Given mounted "dvp" and "exi", go upper mountpoint
 411  411   * with dvp/exi correction
 412  412   * Return 0 in success
 413  413   */
 414  414  int
 415  415  rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
 416  416  {
 417  417          struct exportinfo *exi;
 418  418          vnode_t *dvp = *dvpp;
 419  419  
           /* Diff here reverts the zone check to compare zone ids, not zone pointers. */
 420      -        ASSERT3P((*exip)->exi_zone, ==, curzone);
      420 +        ASSERT3U((*exip)->exi_zoneid, ==, curzone->zone_id);
 421  421          ASSERT((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp));
 422  422  
           /* Step up to the covered vnode above this mount point. */
 423  423          VN_HOLD(dvp);
 424  424          dvp = untraverse(dvp);
 425  425          exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
 426  426          if (exi == NULL) {
 427  427                  VN_RELE(dvp);
 428  428                  return (-1);
 429  429          }
 430  430  
 431      -        ASSERT3P(exi->exi_zone, ==, curzone);
      431 +        ASSERT3U(exi->exi_zoneid, ==, curzone->zone_id);
           /* Success: swap in the upper export/vnode, releasing the old references. */
 432  432          exi_rele(*exip);
 433  433          *exip = exi;
 434  434          VN_RELE(*dvpp);
 435  435          *dvpp = dvp;
 436  436  
 437  437          return (0);
 438  438  }
 439  439  /*
 440  440   * Directory lookup.
 441  441   * Returns an fhandle and file attributes for file name in a directory.
 442  442   */
 444  444  /* ARGSUSED */
 445  445  void
 446  446  rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
 447  447      struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 448  448  {
 449  449          int error;
 450  450          vnode_t *dvp;
 451  451          vnode_t *vp;
 452  452          struct vattr va;
 453  453          fhandle_t *fhp = da->da_fhandle;
 454  454          struct sec_ol sec = {0, 0};
 455  455          bool_t publicfh_flag = FALSE, auth_weak = FALSE;
 456  456          char *name;
 457  457          struct sockaddr *ca;
 458  458  
 459  459          /*
 460  460           * Trusted Extension doesn't support NFSv2. MOUNT
 461  461           * will reject v2 clients. Need to prevent v2 client
 462  462           * access via WebNFS here.
 463  463           */
 464  464          if (is_system_labeled() && req->rq_vers == 2) {
 465  465                  dr->dr_status = NFSERR_ACCES;
 466  466                  return;
 467  467          }
 468  468  
 469  469          /*
 470  470           * Disallow NULL paths
 471  471           */
 472  472          if (da->da_name == NULL || *da->da_name == '\0') {
 473  473                  dr->dr_status = NFSERR_ACCES;
 474  474                  return;
 475  475          }
 476  476  
 477  477          /*
 478  478           * Allow lookups from the root - the default
 479  479           * location of the public filehandle.
 480  480           */
 481  481          if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
 482  482                  dvp = ZONE_ROOTVP();
 
    | 
      ↓ open down ↓ | 
    40 lines elided | 
    
      ↑ open up ↑ | 
 
 482  482                  VN_HOLD(dvp);
 483  483          } else {
 484  484                  dvp = nfs_fhtovp(fhp, exi);
 485  485                  if (dvp == NULL) {
 486  486                          dr->dr_status = NFSERR_STALE;
 487  487                          return;
 488  488                  }
 489  489          }
 490  490  
           /* Extra hold on exi: "out" releases it unconditionally below. */
 491  491          exi_hold(exi);
 492      -        ASSERT3P(exi->exi_zone, ==, curzone);
      492 +        ASSERT3U(exi->exi_zoneid, ==, curzone->zone_id);
 493  493  
 494  494          /*
 495  495           * Not allow lookup beyond root.
 496  496           * If the filehandle matches a filehandle of the exi,
 497  497           * then the ".." refers beyond the root of an exported filesystem.
 498  498           */
 499  499          if (strcmp(da->da_name, "..") == 0 &&
 500  500              EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
 501  501                  if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
 502  502                      ((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp))) {
 503  503                          /*
 504  504                           * special case for ".." and 'nohide'exported root
 505  505                           */
 506  506                          if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
 507  507                                  error = NFSERR_ACCES;
 508  508                                  goto out;
 509  509                          }
 510  510                  } else  {
 511  511                          error = NFSERR_NOENT;
 512  512                          goto out;
 513  513                  }
 514  514          }
 515  515  
           /* Translate the client-supplied name to the server's charset, if configured. */
 516  516          ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
 517  517          name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
 518  518              MAXPATHLEN);
 519  519  
 520  520          if (name == NULL) {
 521  521                  error = NFSERR_ACCES;
 522  522                  goto out;
 523  523          }
 524  524  
 525  525          /*
 526  526           * If the public filehandle is used then allow
 527  527           * a multi-component lookup, i.e. evaluate
 528  528           * a pathname and follow symbolic links if
 529  529           * necessary.
 530  530           *
 531  531           * This may result in a vnode in another filesystem
 532  532           * which is OK as long as the filesystem is exported.
 533  533           */
 534  534          if (PUBLIC_FH2(fhp)) {
 535  535                  publicfh_flag = TRUE;
 536  536  
           /* rfs_publicfh_mclookup returns its own held exi; drop ours first. */
 537  537                  exi_rele(exi);
 538  538                  exi = NULL;
 539  539  
 540  540                  error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
 541  541                      &sec);
 542  542          } else {
 543  543                  /*
 544  544                   * Do a normal single component lookup.
 545  545                   */
 546  546                  error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
 547  547                      NULL, NULL, NULL);
 548  548          }
 549  549  
 550  550          if (name != da->da_name)
 551  551                  kmem_free(name, MAXPATHLEN);
 552  552  
           /* Landed on a mount point: try to descend into a nohide submount. */
 553  553          if (error == 0 && vn_ismntpt(vp)) {
 554  554                  error = rfs_cross_mnt(&vp, &exi);
 555  555                  if (error)
 556  556                          VN_RELE(vp);
 557  557          }
 558  558  
 559  559          if (!error) {
 560  560                  va.va_mask = AT_ALL;    /* we want everything */
 561  561  
 562  562                  error = rfs4_delegated_getattr(vp, &va, 0, cr);
 563  563  
 564  564                  /* check for overflows */
 565  565                  if (!error) {
 566  566                          acl_perm(vp, exi, &va, cr);
 567  567                          error = vattr_to_nattr(&va, &dr->dr_attr);
 568  568                          if (!error) {
           /* SEC_QUERY: build an overloaded WebNFS security-negotiation filehandle. */
 569  569                                  if (sec.sec_flags & SEC_QUERY)
 570  570                                          error = makefh_ol(&dr->dr_fhandle, exi,
 571  571                                              sec.sec_index);
 572  572                                  else {
 573  573                                          error = makefh(&dr->dr_fhandle, vp,
 574  574                                              exi);
 575  575                                          if (!error && publicfh_flag &&
 576  576                                              !chk_clnt_sec(exi, req))
 577  577                                                  auth_weak = TRUE;
 578  578                                  }
 579  579                          }
 580  580                  }
 581  581                  VN_RELE(vp);
 582  582          }
 583  583  
 584  584  out:
 585  585          VN_RELE(dvp);
 586  586  
 587  587          if (exi != NULL)
 588  588                  exi_rele(exi);
 589  589  
 590  590          /*
 591  591           * If it's public fh, no 0x81, and client's flavor is
 592  592           * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
 593  593           * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
 594  594           */
 595  595          if (auth_weak)
 596  596                  dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
 597  597          else
 598  598                  dr->dr_status = puterrno(error);
 599  599  }
           /*
            * Return the directory filehandle from the lookup arguments —
            * presumably used by the dispatcher for export lookup (confirm caller).
            */
 600  600  void *
 601  601  rfs_lookup_getfh(struct nfsdiropargs *da)
 602  602  {
 603  603          return (da->da_fhandle);
 604  604  }
 605  605  
 606  606  /*
 607  607   * Read symbolic link.
 608  608   * Returns the string in the symbolic link at the given fhandle.
 609  609   */
 610  610  /* ARGSUSED */
 611  611  void
 612  612  rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
 613  613      struct svc_req *req, cred_t *cr, bool_t ro)
 614  614  {
 615  615          int error;
 616  616          struct iovec iov;
 617  617          struct uio uio;
 618  618          vnode_t *vp;
 619  619          struct vattr va;
 620  620          struct sockaddr *ca;
 621  621          char *name = NULL;
 622  622          int is_referral = 0;
 623  623  
 624  624          vp = nfs_fhtovp(fhp, exi);
 625  625          if (vp == NULL) {
 626  626                  rl->rl_data = NULL;
 627  627                  rl->rl_status = NFSERR_STALE;
 628  628                  return;
 629  629          }
 630  630  
 631  631          va.va_mask = AT_MODE;
 632  632  
 633  633          error = VOP_GETATTR(vp, &va, 0, cr, NULL);
 634  634  
 635  635          if (error) {
 636  636                  VN_RELE(vp);
 637  637                  rl->rl_data = NULL;
 638  638                  rl->rl_status = puterrno(error);
 639  639                  return;
 640  640          }
 641  641  
           /* Mandatory-locked objects are not readable over NFSv2. */
 642  642          if (MANDLOCK(vp, va.va_mode)) {
 643  643                  VN_RELE(vp);
 644  644                  rl->rl_data = NULL;
 645  645                  rl->rl_status = NFSERR_ACCES;
 646  646                  return;
 647  647          }
 648  648  
 649  649          /* We lied about the object type for a referral */
 650  650          if (vn_is_nfs_reparse(vp, cr))
 651  651                  is_referral = 1;
 652  652  
 653  653          /*
 654  654           * XNFS and RFC1094 require us to return ENXIO if argument
 655  655           * is not a link. BUGID 1138002.
 656  656           */
 657  657          if (vp->v_type != VLNK && !is_referral) {
 658  658                  VN_RELE(vp);
 659  659                  rl->rl_data = NULL;
 660  660                  rl->rl_status = NFSERR_NXIO;
 661  661                  return;
 662  662          }
 663  663  
 664  664          /*
 665  665           * Allocate data for pathname.  This will be freed by rfs_rlfree.
 666  666           */
 667  667          rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);
 668  668  
 669  669          if (is_referral) {
 670  670                  char *s;
 671  671                  size_t strsz;
 672  672  
 673  673                  /* Get an artificial symlink based on a referral */
 674  674                  s = build_symlink(vp, cr, &strsz);
 675  675                  global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
 676  676                  DTRACE_PROBE2(nfs2serv__func__referral__reflink,
 677  677                      vnode_t *, vp, char *, s);
 678  678                  if (s == NULL)
 679  679                          error = EINVAL;
 680  680                  else {
 681  681                          error = 0;
 682  682                          (void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
 683  683                          rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
 684  684                          kmem_free(s, strsz);
 685  685                  }
 686  686  
 687  687          } else {
 688  688  
 689  689                  /*
 690  690                   * Set up io vector to read sym link data
 691  691                   */
 692  692                  iov.iov_base = rl->rl_data;
 693  693                  iov.iov_len = NFS_MAXPATHLEN;
 694  694                  uio.uio_iov = &iov;
 695  695                  uio.uio_iovcnt = 1;
 696  696                  uio.uio_segflg = UIO_SYSSPACE;
 697  697                  uio.uio_extflg = UIO_COPY_CACHED;
 698  698                  uio.uio_loffset = (offset_t)0;
 699  699                  uio.uio_resid = NFS_MAXPATHLEN;
 700  700  
 701  701                  /*
 702  702                   * Do the readlink.
 703  703                   */
 704  704                  error = VOP_READLINK(vp, &uio, cr, NULL);
 705  705  
           /* Bytes actually read = requested minus residual. */
 706  706                  rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);
 707  707  
 708  708                  if (!error)
 709  709                          rl->rl_data[rl->rl_count] = '\0';
 710  710  
 711  711          }
 712  712  
 713  713  
 714  714          VN_RELE(vp);
 715  715  
           /* Translate the link text back to the client's charset, if configured. */
 716  716          ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
 717  717          name = nfscmd_convname(ca, exi, rl->rl_data,
 718  718              NFSCMD_CONV_OUTBOUND, MAXPATHLEN);
 719  719  
 720  720          if (name != NULL && name != rl->rl_data) {
 721  721                  kmem_free(rl->rl_data, NFS_MAXPATHLEN);
 722  722                  rl->rl_data = name;
 723  723          }
 724  724  
 725  725          /*
 726  726           * XNFS and RFC1094 require us to return ENXIO if argument
 727  727           * is not a link. UFS returns EINVAL if this is the case,
 728  728           * so we do the mapping here. BUGID 1138002.
 729  729           */
 730  730          if (error == EINVAL)
 731  731                  rl->rl_status = NFSERR_NXIO;
 732  732          else
 733  733                  rl->rl_status = puterrno(error);
 734  734  
 735  735  }
           /*
            * Return the filehandle from the readlink arguments —
            * presumably used by the dispatcher for export lookup (confirm caller).
            */
 736  736  void *
 737  737  rfs_readlink_getfh(fhandle_t *fhp)
 738  738  {
 739  739          return (fhp);
 740  740  }
 741  741  /*
 742  742   * Free data allocated by rfs_readlink
 743  743   */
 744  744  void
 745  745  rfs_rlfree(struct nfsrdlnres *rl)
 746  746  {
           /* NOTE(review): assumes rl_data is always an NFS_MAXPATHLEN allocation, */
           /* including when rfs_readlink swapped in the nfscmd_convname result — confirm. */
 747  747          if (rl->rl_data != NULL)
 748  748                  kmem_free(rl->rl_data, NFS_MAXPATHLEN);
 749  749  }
 750  750  
 751  751  static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
 752  752  
/*
 * Read data.
 * Returns some data read from the file at the given fhandle.
 *
 * On any failure rr_data is set to NULL so the XDR encoder has nothing
 * to send; on a delegation conflict no status is set at all and the
 * response is dropped (T_WOULDBLOCK) so the client will retransmit.
 */
/* ARGSUSED */
void
rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
        vnode_t *vp;
        int error;
        struct vattr va;
        struct iovec iov;
        struct uio uio;
        mblk_t *mp;
        int alloc_err = 0;
        int in_crit = 0;        /* nonzero while inside the nbmand crit region */
        caller_context_t ct;

        /* Translate the file handle into a held vnode. */
        vp = nfs_fhtovp(&ra->ra_fhandle, exi);
        if (vp == NULL) {
                rr->rr_data = NULL;
                rr->rr_status = NFSERR_STALE;
                return;
        }

        /* NFSv2 READ is only valid on regular files. */
        if (vp->v_type != VREG) {
                VN_RELE(vp);
                rr->rr_data = NULL;
                rr->rr_status = NFSERR_ISDIR;
                return;
        }

        /*
         * Identify ourselves to the VOPs and ask them not to block on
         * a delegation conflict (CC_DONTBLOCK); conflicts are reported
         * back via CC_WOULDBLOCK instead.
         */
        ct.cc_sysid = 0;
        ct.cc_pid = 0;
        ct.cc_caller_id = nfs2_srv_caller_id;
        ct.cc_flags = CC_DONTBLOCK;

        /*
         * Enter the critical region before calling VOP_RWLOCK
         * to avoid a deadlock with write requests.
         */
        if (nbl_need_check(vp)) {
                nbl_start_crit(vp, RW_READER);
                if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
                    0, NULL)) {
                        nbl_end_crit(vp);
                        VN_RELE(vp);
                        rr->rr_data = NULL;
                        rr->rr_status = NFSERR_ACCES;
                        return;
                }
                in_crit = 1;
        }

        error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);

        /* check if a monitor detected a delegation conflict */
        if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
                if (in_crit)
                        nbl_end_crit(vp);
                VN_RELE(vp);
                /* mark as wouldblock so response is dropped */
                curthread->t_flag |= T_WOULDBLOCK;

                rr->rr_data = NULL;
                return;
        }

        /* Full attributes: uid for the kludge below, size for EOF check. */
        va.va_mask = AT_ALL;

        error = VOP_GETATTR(vp, &va, 0, cr, &ct);

        if (error) {
                VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
                if (in_crit)
                        nbl_end_crit(vp);

                VN_RELE(vp);
                rr->rr_data = NULL;
                rr->rr_status = puterrno(error);

                return;
        }

        /*
         * This is a kludge to allow reading of files created
         * with no read permission.  The owner of the file
         * is always allowed to read it.
         */
        if (crgetuid(cr) != va.va_uid) {
                error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);

                if (error) {
                        /*
                         * Exec is the same as read over the net because
                         * of demand loading.
                         */
                        error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
                }
                if (error) {
                        VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
                        if (in_crit)
                                nbl_end_crit(vp);
                        VN_RELE(vp);
                        rr->rr_data = NULL;
                        rr->rr_status = puterrno(error);

                        return;
                }
        }

        /*
         * Refuse mandatory-locked files: an NFS service thread must not
         * risk blocking indefinitely on a user-held lock.
         */
        if (MANDLOCK(vp, va.va_mode)) {
                VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
                if (in_crit)
                        nbl_end_crit(vp);

                VN_RELE(vp);
                rr->rr_data = NULL;
                rr->rr_status = NFSERR_ACCES;

                return;
        }

        rr->rr_ok.rrok_wlist_len = 0;
        rr->rr_ok.rrok_wlist = NULL;

        /* Read starting at or past EOF: success with zero bytes. */
        if ((u_offset_t)ra->ra_offset >= va.va_size) {
                rr->rr_count = 0;
                rr->rr_data = NULL;
                /*
                 * In this case, status is NFS_OK, but there is no data
                 * to encode. So set rr_mp to NULL.
                 */
                rr->rr_mp = NULL;
                rr->rr_ok.rrok_wlist = ra->ra_wlist;
                if (rr->rr_ok.rrok_wlist)
                        clist_zero_len(rr->rr_ok.rrok_wlist);
                goto done;
        }

        if (ra->ra_wlist) {
                /* RDMA: read straight into the client-provided chunk. */
                mp = NULL;
                rr->rr_mp = NULL;
                (void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
                if (ra->ra_count > iov.iov_len) {
                        rr->rr_data = NULL;
                        rr->rr_status = NFSERR_INVAL;
                        goto done;
                }
        } else {
                /*
                 * mp will contain the data to be sent out in the read reply.
                 * This will be freed after the reply has been sent out (by the
                 * driver).
                 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
                 * that the call to xdrmblk_putmblk() never fails.
                 */
                mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
                    &alloc_err);
                /* allocb_wait() with STR_NOSIG should sleep until success. */
                ASSERT(mp != NULL);
                ASSERT(alloc_err == 0);

                rr->rr_mp = mp;

                /*
                 * Set up io vector
                 */
                iov.iov_base = (caddr_t)mp->b_datap->db_base;
                iov.iov_len = ra->ra_count;
        }

        uio.uio_iov = &iov;
        uio.uio_iovcnt = 1;
        uio.uio_segflg = UIO_SYSSPACE;
        uio.uio_extflg = UIO_COPY_CACHED;
        uio.uio_loffset = (offset_t)ra->ra_offset;
        uio.uio_resid = ra->ra_count;

        error = VOP_READ(vp, &uio, 0, cr, &ct);

        if (error) {
                if (mp)
                        freeb(mp);

                /*
                 * check if a monitor detected a delegation conflict and
                 * mark as wouldblock so response is dropped
                 */
                if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
                        curthread->t_flag |= T_WOULDBLOCK;
                else
                        rr->rr_status = puterrno(error);

                VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
                if (in_crit)
                        nbl_end_crit(vp);

                VN_RELE(vp);
                rr->rr_data = NULL;

                return;
        }

        /*
         * Get attributes again so we can send the latest access
         * time to the client side for its cache.
         */
        va.va_mask = AT_ALL;

        error = VOP_GETATTR(vp, &va, 0, cr, &ct);

        if (error) {
                if (mp)
                        freeb(mp);

                VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
                if (in_crit)
                        nbl_end_crit(vp);

                VN_RELE(vp);
                rr->rr_data = NULL;
                rr->rr_status = puterrno(error);

                return;
        }

        /* Bytes actually read = requested minus what VOP_READ left over. */
        rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);

        if (mp) {
                rr->rr_data = (char *)mp->b_datap->db_base;
        } else {
                if (ra->ra_wlist) {
                        rr->rr_data = (caddr_t)iov.iov_base;
                        if (!rdma_setup_read_data2(ra, rr)) {
                                rr->rr_data = NULL;
                                /*
                                 * NOTE(review): puterrno() expects a unix
                                 * errno, not an nfsstat; NFSERR_INVAL (22)
                                 * happens to equal EINVAL, so this works,
                                 * but puterrno(EINVAL) reads as intended.
                                 */
                                rr->rr_status = puterrno(NFSERR_INVAL);
                        }
                }
        }
done:
        VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
        if (in_crit)
                nbl_end_crit(vp);

        acl_perm(vp, exi, &va, cr);

        /* check for overflows */
        error = vattr_to_nattr(&va, &rr->rr_attr);

        VN_RELE(vp);

        rr->rr_status = puterrno(error);
}
1007 1007  
1008 1008  /*
1009 1009   * Free data allocated by rfs_read
1010 1010   */
1011 1011  void
1012 1012  rfs_rdfree(struct nfsrdresult *rr)
1013 1013  {
1014 1014          mblk_t *mp;
1015 1015  
1016 1016          if (rr->rr_status == NFS_OK) {
1017 1017                  mp = rr->rr_mp;
1018 1018                  if (mp != NULL)
1019 1019                          freeb(mp);
1020 1020          }
1021 1021  }
1022 1022  
1023 1023  void *
1024 1024  rfs_read_getfh(struct nfsreadargs *ra)
1025 1025  {
1026 1026          return (&ra->ra_fhandle);
1027 1027  }
1028 1028  
/* Number of iovec entries kept on the stack in rfs_write_sync(). */
#define MAX_IOVECS      12

#ifdef DEBUG
/*
 * Counters for rfs_write_sync(): requests whose mblk chain fit in the
 * on-stack iovec array vs. those that needed a kmem_alloc'd array.
 */
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif
1035 1035  
/*
 * Write data to file.
 * Returns attributes of a file after writing some data to it.
 *
 * Any changes made here, especially in error handling might have
 * to also be done in rfs_write (which clusters write requests).
 *
 * On a delegation conflict no status is set; the response is dropped
 * (T_WOULDBLOCK) so the client retransmits.
 */
/* ARGSUSED */
void
rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
        int error;
        vnode_t *vp;
        rlim64_t rlimit;
        struct vattr va;
        struct uio uio;
        struct iovec iov[MAX_IOVECS];
        mblk_t *m;
        struct iovec *iovp;
        int iovcnt;
        cred_t *savecred;
        int in_crit = 0;        /* nonzero while inside the nbmand crit region */
        caller_context_t ct;

        /* Translate the file handle into a held vnode. */
        vp = nfs_fhtovp(&wa->wa_fhandle, exi);
        if (vp == NULL) {
                ns->ns_status = NFSERR_STALE;
                return;
        }

        /* Reject writes through a read-only export. */
        if (rdonly(ro, vp)) {
                VN_RELE(vp);
                ns->ns_status = NFSERR_ROFS;
                return;
        }

        /* NFSv2 WRITE is only valid on regular files. */
        if (vp->v_type != VREG) {
                VN_RELE(vp);
                ns->ns_status = NFSERR_ISDIR;
                return;
        }

        /*
         * Identify ourselves to the VOPs and ask them not to block on
         * a delegation conflict (CC_DONTBLOCK).
         */
        ct.cc_sysid = 0;
        ct.cc_pid = 0;
        ct.cc_caller_id = nfs2_srv_caller_id;
        ct.cc_flags = CC_DONTBLOCK;

        /* Only uid and mode are needed for the checks below. */
        va.va_mask = AT_UID|AT_MODE;

        error = VOP_GETATTR(vp, &va, 0, cr, &ct);

        if (error) {
                VN_RELE(vp);
                ns->ns_status = puterrno(error);

                return;
        }

        if (crgetuid(cr) != va.va_uid) {
                /*
                 * This is a kludge to allow writes of files created
                 * with read only permission.  The owner of the file
                 * is always allowed to write it.
                 */
                error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);

                if (error) {
                        VN_RELE(vp);
                        ns->ns_status = puterrno(error);
                        return;
                }
        }

        /*
         * Can't access a mandatory lock file.  This might cause
         * the NFS service thread to block forever waiting for a
         * lock to be released that will never be released.
         */
        if (MANDLOCK(vp, va.va_mode)) {
                VN_RELE(vp);
                ns->ns_status = NFSERR_ACCES;
                return;
        }

        /*
         * We have to enter the critical region before calling VOP_RWLOCK
         * to avoid a deadlock with ufs.
         */
        if (nbl_need_check(vp)) {
                nbl_start_crit(vp, RW_READER);
                in_crit = 1;
                if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
                    wa->wa_count, 0, NULL)) {
                        error = EACCES;
                        goto out;
                }
        }

        error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);

        /* check if a monitor detected a delegation conflict */
        if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
                goto out;
        }

        /*
         * Two data sources: a single contiguous buffer (RDMA chunk or
         * wa_data), or an mblk chain from the transport that must be
         * mapped into an iovec array first.
         */
        if (wa->wa_data || wa->wa_rlist) {
                /* Do the RDMA thing if necessary */
                if (wa->wa_rlist) {
                        iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
                        iov[0].iov_len = wa->wa_count;
                } else  {
                        iov[0].iov_base = wa->wa_data;
                        iov[0].iov_len = wa->wa_count;
                }
                uio.uio_iov = iov;
                uio.uio_iovcnt = 1;
                uio.uio_segflg = UIO_SYSSPACE;
                uio.uio_extflg = UIO_COPY_DEFAULT;
                uio.uio_loffset = (offset_t)wa->wa_offset;
                uio.uio_resid = wa->wa_count;
                /*
                 * The limit is checked on the client. We
                 * should allow any size writes here.
                 */
                uio.uio_llimit = curproc->p_fsz_ctl;
                rlimit = uio.uio_llimit - wa->wa_offset;
                if (rlimit < (rlim64_t)uio.uio_resid)
                        uio.uio_resid = (uint_t)rlimit;

                /*
                 * for now we assume no append mode
                 */
                /*
                 * We're changing creds because VM may fault and we need
                 * the cred of the current thread to be used if quota
                 * checking is enabled.
                 */
                savecred = curthread->t_cred;
                curthread->t_cred = cr;
                error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
                curthread->t_cred = savecred;
        } else {

                /* Count the mblks so we know how many iovecs we need. */
                iovcnt = 0;
                for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
                        iovcnt++;
                if (iovcnt <= MAX_IOVECS) {
#ifdef DEBUG
                        rfs_write_sync_hits++;
#endif
                        iovp = iov;
                } else {
#ifdef DEBUG
                        rfs_write_sync_misses++;
#endif
                        iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
                }
                mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
                uio.uio_iov = iovp;
                uio.uio_iovcnt = iovcnt;
                uio.uio_segflg = UIO_SYSSPACE;
                uio.uio_extflg = UIO_COPY_DEFAULT;
                uio.uio_loffset = (offset_t)wa->wa_offset;
                uio.uio_resid = wa->wa_count;
                /*
                 * The limit is checked on the client. We
                 * should allow any size writes here.
                 */
                uio.uio_llimit = curproc->p_fsz_ctl;
                rlimit = uio.uio_llimit - wa->wa_offset;
                if (rlimit < (rlim64_t)uio.uio_resid)
                        uio.uio_resid = (uint_t)rlimit;

                /*
                 * For now we assume no append mode.
                 */
                /*
                 * We're changing creds because VM may fault and we need
                 * the cred of the current thread to be used if quota
                 * checking is enabled.
                 */
                savecred = curthread->t_cred;
                curthread->t_cred = cr;
                error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
                curthread->t_cred = savecred;

                /* Release the iovec array if it did not fit on the stack. */
                if (iovp != iov)
                        kmem_free(iovp, sizeof (*iovp) * iovcnt);
        }

        VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);

        if (!error) {
                /*
                 * Get attributes again so we send the latest mod
                 * time to the client side for its cache.
                 */
                va.va_mask = AT_ALL;    /* now we want everything */

                error = VOP_GETATTR(vp, &va, 0, cr, &ct);

                /* check for overflows */
                if (!error) {
                        acl_perm(vp, exi, &va, cr);
                        error = vattr_to_nattr(&va, &ns->ns_attr);
                }
        }

out:
        if (in_crit)
                nbl_end_crit(vp);
        VN_RELE(vp);

        /* check if a monitor detected a delegation conflict */
        if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
                /* mark as wouldblock so response is dropped */
                curthread->t_flag |= T_WOULDBLOCK;
        else
                ns->ns_status = puterrno(error);

}
1258 1258  
/*
 * One pending v2 WRITE request queued on a write cluster.  Filled in by
 * rfs_write() and linked, sorted by file offset, into the owning
 * rfs_async_write_list.  The queueing thread sleeps until ns->ns_status
 * is changed from RFSWRITE_INITVAL by the thread processing the cluster.
 */
struct rfs_async_write {
        struct nfswriteargs *wa;        /* WRITE arguments from the client */
        struct nfsattrstat *ns;         /* reply; also the completion flag */
        struct svc_req *req;            /* RPC request handle */
        cred_t *cr;                     /* credentials of the caller */
        bool_t ro;                      /* export is read-only for this call */
        kthread_t *thread;              /* thread waiting on this request */
        struct rfs_async_write *list;   /* next request in the cluster */
};
1268 1268  
/*
 * A cluster of WRITE requests against a single file handle.  Clusters
 * are chained together; waiters sleep on cv until the thread that is
 * processing the cluster sets their individual status and broadcasts.
 */
struct rfs_async_write_list {
        fhandle_t *fhp;                 /* file handle shared by the cluster */
        kcondvar_t cv;                  /* signalled when requests complete */
        struct rfs_async_write *list;   /* requests, sorted by offset */
        struct rfs_async_write_list *next;      /* next cluster */
};
1275 1275  
/*
 * NOTE(review): rfs_write() below operates on the per-zone state in
 * nfs_srv_t (nsrv->async_write_head, nsrv->async_write_lock,
 * nsrv->write_async); these three file-scope variables appear to be
 * superseded leftovers of the pre-zone code -- confirm there are no
 * remaining users elsewhere in this file before removing them.
 */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1; /* enables write clustering if == 1 */

/* Number of iovec entries kept on the stack in rfs_write(). */
#define MAXCLIOVECS     42
/* "Not yet processed" sentinel for ns_status; 0 would read as NFS_OK. */
#define RFSWRITE_INITVAL (enum nfsstat) -1

#ifdef DEBUG
/* presumably stack-vs-alloc counters like rfs_write_sync's -- confirm */
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif
1287 1287  
1288 1288  /*
1289 1289   * Write data to file.
1290 1290   * Returns attributes of a file after writing some data to it.
1291 1291   */
1292 1292  void
1293 1293  rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1294 1294      struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1295 1295  {
1296 1296          int error;
1297 1297          vnode_t *vp;
1298 1298          rlim64_t rlimit;
1299 1299          struct vattr va;
1300 1300          struct uio uio;
1301 1301          struct rfs_async_write_list *lp;
1302 1302          struct rfs_async_write_list *nlp;
1303 1303          struct rfs_async_write *rp;
1304 1304          struct rfs_async_write *nrp;
1305 1305          struct rfs_async_write *trp;
1306 1306          struct rfs_async_write *lrp;
1307 1307          int data_written;
1308 1308          int iovcnt;
1309 1309          mblk_t *m;
1310 1310          struct iovec *iovp;
1311 1311          struct iovec *niovp;
1312 1312          struct iovec iov[MAXCLIOVECS];
1313 1313          int count;
1314 1314          int rcount;
1315 1315          uint_t off;
1316 1316          uint_t len;
1317 1317          struct rfs_async_write nrpsp;
1318 1318          struct rfs_async_write_list nlpsp;
1319 1319          ushort_t t_flag;
1320 1320          cred_t *savecred;
1321 1321          int in_crit = 0;
1322 1322          caller_context_t ct;
1323 1323          nfs_srv_t *nsrv;
1324 1324  
1325 1325          ASSERT(exi == NULL || exi->exi_zoneid == curzone->zone_id);
1326 1326          nsrv = nfs_get_srv();
1327 1327          if (!nsrv->write_async) {
1328 1328                  rfs_write_sync(wa, ns, exi, req, cr, ro);
1329 1329                  return;
1330 1330          }
1331 1331  
1332 1332          /*
1333 1333           * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1334 1334           * is considered an OK.
1335 1335           */
1336 1336          ns->ns_status = RFSWRITE_INITVAL;
1337 1337  
1338 1338          nrp = &nrpsp;
1339 1339          nrp->wa = wa;
1340 1340          nrp->ns = ns;
1341 1341          nrp->req = req;
1342 1342          nrp->cr = cr;
1343 1343          nrp->ro = ro;
1344 1344          nrp->thread = curthread;
1345 1345  
1346 1346          ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1347 1347  
1348 1348          /*
1349 1349           * Look to see if there is already a cluster started
1350 1350           * for this file.
1351 1351           */
1352 1352          mutex_enter(&nsrv->async_write_lock);
1353 1353          for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
1354 1354                  if (bcmp(&wa->wa_fhandle, lp->fhp,
1355 1355                      sizeof (fhandle_t)) == 0)
1356 1356                          break;
1357 1357          }
1358 1358  
1359 1359          /*
1360 1360           * If lp is non-NULL, then there is already a cluster
1361 1361           * started.  We need to place ourselves in the cluster
1362 1362           * list in the right place as determined by starting
1363 1363           * offset.  Conflicts with non-blocking mandatory locked
1364 1364           * regions will be checked when the cluster is processed.
1365 1365           */
1366 1366          if (lp != NULL) {
1367 1367                  rp = lp->list;
1368 1368                  trp = NULL;
1369 1369                  while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1370 1370                          trp = rp;
1371 1371                          rp = rp->list;
1372 1372                  }
1373 1373                  nrp->list = rp;
1374 1374                  if (trp == NULL)
1375 1375                          lp->list = nrp;
1376 1376                  else
1377 1377                          trp->list = nrp;
1378 1378                  while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1379 1379                          cv_wait(&lp->cv, &nsrv->async_write_lock);
1380 1380                  mutex_exit(&nsrv->async_write_lock);
1381 1381  
1382 1382                  return;
1383 1383          }
1384 1384  
1385 1385          /*
1386 1386           * No cluster started yet, start one and add ourselves
1387 1387           * to the list of clusters.
1388 1388           */
1389 1389          nrp->list = NULL;
1390 1390  
1391 1391          nlp = &nlpsp;
1392 1392          nlp->fhp = &wa->wa_fhandle;
1393 1393          cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1394 1394          nlp->list = nrp;
1395 1395          nlp->next = NULL;
1396 1396  
1397 1397          if (nsrv->async_write_head == NULL) {
1398 1398                  nsrv->async_write_head = nlp;
1399 1399          } else {
1400 1400                  lp = nsrv->async_write_head;
1401 1401                  while (lp->next != NULL)
1402 1402                          lp = lp->next;
1403 1403                  lp->next = nlp;
1404 1404          }
1405 1405          mutex_exit(&nsrv->async_write_lock);
1406 1406  
1407 1407          /*
1408 1408           * Convert the file handle common to all of the requests
1409 1409           * in this cluster to a vnode.
1410 1410           */
1411 1411          vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1412 1412          if (vp == NULL) {
1413 1413                  mutex_enter(&nsrv->async_write_lock);
1414 1414                  if (nsrv->async_write_head == nlp)
1415 1415                          nsrv->async_write_head = nlp->next;
1416 1416                  else {
1417 1417                          lp = nsrv->async_write_head;
1418 1418                          while (lp->next != nlp)
1419 1419                                  lp = lp->next;
1420 1420                          lp->next = nlp->next;
1421 1421                  }
1422 1422                  t_flag = curthread->t_flag & T_WOULDBLOCK;
1423 1423                  for (rp = nlp->list; rp != NULL; rp = rp->list) {
1424 1424                          rp->ns->ns_status = NFSERR_STALE;
1425 1425                          rp->thread->t_flag |= t_flag;
1426 1426                  }
1427 1427                  cv_broadcast(&nlp->cv);
1428 1428                  mutex_exit(&nsrv->async_write_lock);
1429 1429  
1430 1430                  return;
1431 1431          }
1432 1432  
1433 1433          /*
1434 1434           * Can only write regular files.  Attempts to write any
1435 1435           * other file types fail with EISDIR.
1436 1436           */
1437 1437          if (vp->v_type != VREG) {
1438 1438                  VN_RELE(vp);
1439 1439                  mutex_enter(&nsrv->async_write_lock);
1440 1440                  if (nsrv->async_write_head == nlp)
1441 1441                          nsrv->async_write_head = nlp->next;
1442 1442                  else {
1443 1443                          lp = nsrv->async_write_head;
1444 1444                          while (lp->next != nlp)
1445 1445                                  lp = lp->next;
1446 1446                          lp->next = nlp->next;
1447 1447                  }
1448 1448                  t_flag = curthread->t_flag & T_WOULDBLOCK;
1449 1449                  for (rp = nlp->list; rp != NULL; rp = rp->list) {
1450 1450                          rp->ns->ns_status = NFSERR_ISDIR;
1451 1451                          rp->thread->t_flag |= t_flag;
1452 1452                  }
1453 1453                  cv_broadcast(&nlp->cv);
1454 1454                  mutex_exit(&nsrv->async_write_lock);
1455 1455  
1456 1456                  return;
1457 1457          }
1458 1458  
1459 1459          /*
1460 1460           * Enter the critical region before calling VOP_RWLOCK, to avoid a
1461 1461           * deadlock with ufs.
1462 1462           */
1463 1463          if (nbl_need_check(vp)) {
1464 1464                  nbl_start_crit(vp, RW_READER);
1465 1465                  in_crit = 1;
1466 1466          }
1467 1467  
1468 1468          ct.cc_sysid = 0;
1469 1469          ct.cc_pid = 0;
1470 1470          ct.cc_caller_id = nfs2_srv_caller_id;
1471 1471          ct.cc_flags = CC_DONTBLOCK;
1472 1472  
1473 1473          /*
1474 1474           * Lock the file for writing.  This operation provides
1475 1475           * the delay which allows clusters to grow.
1476 1476           */
1477 1477          error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1478 1478  
1479 1479          /* check if a monitor detected a delegation conflict */
1480 1480          if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1481 1481                  if (in_crit)
1482 1482                          nbl_end_crit(vp);
1483 1483                  VN_RELE(vp);
1484 1484                  /* mark as wouldblock so response is dropped */
1485 1485                  curthread->t_flag |= T_WOULDBLOCK;
1486 1486                  mutex_enter(&nsrv->async_write_lock);
1487 1487                  if (nsrv->async_write_head == nlp)
1488 1488                          nsrv->async_write_head = nlp->next;
1489 1489                  else {
1490 1490                          lp = nsrv->async_write_head;
1491 1491                          while (lp->next != nlp)
1492 1492                                  lp = lp->next;
1493 1493                          lp->next = nlp->next;
1494 1494                  }
1495 1495                  for (rp = nlp->list; rp != NULL; rp = rp->list) {
1496 1496                          if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1497 1497                                  rp->ns->ns_status = puterrno(error);
1498 1498                                  rp->thread->t_flag |= T_WOULDBLOCK;
1499 1499                          }
1500 1500                  }
1501 1501                  cv_broadcast(&nlp->cv);
1502 1502                  mutex_exit(&nsrv->async_write_lock);
1503 1503  
1504 1504                  return;
1505 1505          }
1506 1506  
1507 1507          /*
1508 1508           * Disconnect this cluster from the list of clusters.
1509 1509           * The cluster that is being dealt with must be fixed
1510 1510           * in size after this point, so there is no reason
1511 1511           * to leave it on the list so that new requests can
1512 1512           * find it.
1513 1513           *
1514 1514           * The algorithm is that the first write request will
1515 1515           * create a cluster, convert the file handle to a
1516 1516           * vnode pointer, and then lock the file for writing.
1517 1517           * This request is not likely to be clustered with
1518 1518           * any others.  However, the next request will create
1519 1519           * a new cluster and be blocked in VOP_RWLOCK while
1520 1520           * the first request is being processed.  This delay
1521 1521           * will allow more requests to be clustered in this
1522 1522           * second cluster.
1523 1523           */
1524 1524          mutex_enter(&nsrv->async_write_lock);
1525 1525          if (nsrv->async_write_head == nlp)
1526 1526                  nsrv->async_write_head = nlp->next;
1527 1527          else {
1528 1528                  lp = nsrv->async_write_head;
1529 1529                  while (lp->next != nlp)
1530 1530                          lp = lp->next;
1531 1531                  lp->next = nlp->next;
1532 1532          }
1533 1533          mutex_exit(&nsrv->async_write_lock);
1534 1534  
1535 1535          /*
1536 1536           * Step through the list of requests in this cluster.
1537 1537           * We need to check permissions to make sure that all
1538 1538           * of the requests have sufficient permission to write
1539 1539           * the file.  A cluster can be composed of requests
1540 1540           * from different clients and different users on each
1541 1541           * client.
1542 1542           *
1543 1543           * As a side effect, we also calculate the size of the
1544 1544           * byte range that this cluster encompasses.
1545 1545           */
1546 1546          rp = nlp->list;
1547 1547          off = rp->wa->wa_offset;
1548 1548          len = (uint_t)0;
1549 1549          do {
1550 1550                  if (rdonly(rp->ro, vp)) {
1551 1551                          rp->ns->ns_status = NFSERR_ROFS;
1552 1552                          t_flag = curthread->t_flag & T_WOULDBLOCK;
1553 1553                          rp->thread->t_flag |= t_flag;
1554 1554                          continue;
1555 1555                  }
1556 1556  
1557 1557                  va.va_mask = AT_UID|AT_MODE;
1558 1558  
1559 1559                  error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1560 1560  
1561 1561                  if (!error) {
1562 1562                          if (crgetuid(rp->cr) != va.va_uid) {
1563 1563                                  /*
1564 1564                                   * This is a kludge to allow writes of files
1565 1565                                   * created with read only permission.  The
1566 1566                                   * owner of the file is always allowed to
1567 1567                                   * write it.
1568 1568                                   */
1569 1569                                  error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1570 1570                          }
1571 1571                          if (!error && MANDLOCK(vp, va.va_mode))
1572 1572                                  error = EACCES;
1573 1573                  }
1574 1574  
1575 1575                  /*
1576 1576                   * Check for a conflict with a nbmand-locked region.
1577 1577                   */
1578 1578                  if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1579 1579                      rp->wa->wa_count, 0, NULL)) {
1580 1580                          error = EACCES;
1581 1581                  }
1582 1582  
1583 1583                  if (error) {
1584 1584                          rp->ns->ns_status = puterrno(error);
1585 1585                          t_flag = curthread->t_flag & T_WOULDBLOCK;
1586 1586                          rp->thread->t_flag |= t_flag;
1587 1587                          continue;
1588 1588                  }
1589 1589                  if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1590 1590                          len = rp->wa->wa_offset + rp->wa->wa_count - off;
1591 1591          } while ((rp = rp->list) != NULL);
1592 1592  
1593 1593          /*
1594 1594           * Step through the cluster attempting to gather as many
1595 1595           * requests which are contiguous as possible.  These
1596 1596           * contiguous requests are handled via one call to VOP_WRITE
1597 1597           * instead of different calls to VOP_WRITE.  We also keep
1598 1598           * track of the fact that any data was written.
1599 1599           */
1600 1600          rp = nlp->list;
1601 1601          data_written = 0;
1602 1602          do {
1603 1603                  /*
1604 1604                   * Skip any requests which are already marked as having an
1605 1605                   * error.
1606 1606                   */
1607 1607                  if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1608 1608                          rp = rp->list;
1609 1609                          continue;
1610 1610                  }
1611 1611  
1612 1612                  /*
1613 1613                   * Count the number of iovec's which are required
1614 1614                   * to handle this set of requests.  One iovec is
1615 1615                   * needed for each data buffer, whether addressed
1616 1616                   * by wa_data or by the b_rptr pointers in the
1617 1617                   * mblk chains.
1618 1618                   */
1619 1619                  iovcnt = 0;
1620 1620                  lrp = rp;
1621 1621                  for (;;) {
1622 1622                          if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1623 1623                                  iovcnt++;
1624 1624                          else {
1625 1625                                  m = lrp->wa->wa_mblk;
1626 1626                                  while (m != NULL) {
1627 1627                                          iovcnt++;
1628 1628                                          m = m->b_cont;
1629 1629                                  }
1630 1630                          }
1631 1631                          if (lrp->list == NULL ||
1632 1632                              lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1633 1633                              lrp->wa->wa_offset + lrp->wa->wa_count !=
1634 1634                              lrp->list->wa->wa_offset) {
1635 1635                                  lrp = lrp->list;
1636 1636                                  break;
1637 1637                          }
1638 1638                          lrp = lrp->list;
1639 1639                  }
1640 1640  
1641 1641                  if (iovcnt <= MAXCLIOVECS) {
1642 1642  #ifdef DEBUG
1643 1643                          rfs_write_hits++;
1644 1644  #endif
1645 1645                          niovp = iov;
1646 1646                  } else {
1647 1647  #ifdef DEBUG
1648 1648                          rfs_write_misses++;
1649 1649  #endif
1650 1650                          niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1651 1651                  }
1652 1652                  /*
1653 1653                   * Put together the scatter/gather iovecs.
1654 1654                   */
1655 1655                  iovp = niovp;
1656 1656                  trp = rp;
1657 1657                  count = 0;
1658 1658                  do {
1659 1659                          if (trp->wa->wa_data || trp->wa->wa_rlist) {
1660 1660                                  if (trp->wa->wa_rlist) {
1661 1661                                          iovp->iov_base =
1662 1662                                              (char *)((trp->wa->wa_rlist)->
1663 1663                                              u.c_daddr3);
1664 1664                                          iovp->iov_len = trp->wa->wa_count;
1665 1665                                  } else  {
1666 1666                                          iovp->iov_base = trp->wa->wa_data;
1667 1667                                          iovp->iov_len = trp->wa->wa_count;
1668 1668                                  }
1669 1669                                  iovp++;
1670 1670                          } else {
1671 1671                                  m = trp->wa->wa_mblk;
1672 1672                                  rcount = trp->wa->wa_count;
1673 1673                                  while (m != NULL) {
1674 1674                                          iovp->iov_base = (caddr_t)m->b_rptr;
1675 1675                                          iovp->iov_len = (m->b_wptr - m->b_rptr);
1676 1676                                          rcount -= iovp->iov_len;
1677 1677                                          if (rcount < 0)
1678 1678                                                  iovp->iov_len += rcount;
1679 1679                                          iovp++;
1680 1680                                          if (rcount <= 0)
1681 1681                                                  break;
1682 1682                                          m = m->b_cont;
1683 1683                                  }
1684 1684                          }
1685 1685                          count += trp->wa->wa_count;
1686 1686                          trp = trp->list;
1687 1687                  } while (trp != lrp);
1688 1688  
1689 1689                  uio.uio_iov = niovp;
1690 1690                  uio.uio_iovcnt = iovcnt;
1691 1691                  uio.uio_segflg = UIO_SYSSPACE;
1692 1692                  uio.uio_extflg = UIO_COPY_DEFAULT;
1693 1693                  uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1694 1694                  uio.uio_resid = count;
1695 1695                  /*
1696 1696                   * The limit is checked on the client. We
1697 1697                   * should allow any size writes here.
1698 1698                   */
1699 1699                  uio.uio_llimit = curproc->p_fsz_ctl;
1700 1700                  rlimit = uio.uio_llimit - rp->wa->wa_offset;
1701 1701                  if (rlimit < (rlim64_t)uio.uio_resid)
1702 1702                          uio.uio_resid = (uint_t)rlimit;
1703 1703  
1704 1704                  /*
1705 1705                   * For now we assume no append mode.
1706 1706                   */
1707 1707  
1708 1708                  /*
1709 1709                   * We're changing creds because VM may fault
1710 1710                   * and we need the cred of the current
1711 1711                   * thread to be used if quota * checking is
1712 1712                   * enabled.
1713 1713                   */
1714 1714                  savecred = curthread->t_cred;
1715 1715                  curthread->t_cred = cr;
1716 1716                  error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1717 1717                  curthread->t_cred = savecred;
1718 1718  
1719 1719                  /* check if a monitor detected a delegation conflict */
1720 1720                  if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1721 1721                          /* mark as wouldblock so response is dropped */
1722 1722                          curthread->t_flag |= T_WOULDBLOCK;
1723 1723  
1724 1724                  if (niovp != iov)
1725 1725                          kmem_free(niovp, sizeof (*niovp) * iovcnt);
1726 1726  
1727 1727                  if (!error) {
1728 1728                          data_written = 1;
1729 1729                          /*
1730 1730                           * Get attributes again so we send the latest mod
1731 1731                           * time to the client side for its cache.
1732 1732                           */
1733 1733                          va.va_mask = AT_ALL;    /* now we want everything */
1734 1734  
1735 1735                          error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1736 1736  
1737 1737                          if (!error)
1738 1738                                  acl_perm(vp, exi, &va, rp->cr);
1739 1739                  }
1740 1740  
1741 1741                  /*
1742 1742                   * Fill in the status responses for each request
1743 1743                   * which was just handled.  Also, copy the latest
1744 1744                   * attributes in to the attribute responses if
1745 1745                   * appropriate.
1746 1746                   */
1747 1747                  t_flag = curthread->t_flag & T_WOULDBLOCK;
1748 1748                  do {
1749 1749                          rp->thread->t_flag |= t_flag;
1750 1750                          /* check for overflows */
1751 1751                          if (!error) {
1752 1752                                  error  = vattr_to_nattr(&va, &rp->ns->ns_attr);
1753 1753                          }
1754 1754                          rp->ns->ns_status = puterrno(error);
1755 1755                          rp = rp->list;
1756 1756                  } while (rp != lrp);
1757 1757          } while (rp != NULL);
1758 1758  
1759 1759          /*
1760 1760           * If any data was written at all, then we need to flush
1761 1761           * the data and metadata to stable storage.
1762 1762           */
1763 1763          if (data_written) {
1764 1764                  error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1765 1765  
1766 1766                  if (!error) {
1767 1767                          error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1768 1768                  }
1769 1769          }
1770 1770  
1771 1771          VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1772 1772  
1773 1773          if (in_crit)
1774 1774                  nbl_end_crit(vp);
1775 1775          VN_RELE(vp);
1776 1776  
1777 1777          t_flag = curthread->t_flag & T_WOULDBLOCK;
1778 1778          mutex_enter(&nsrv->async_write_lock);
1779 1779          for (rp = nlp->list; rp != NULL; rp = rp->list) {
1780 1780                  if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1781 1781                          rp->ns->ns_status = puterrno(error);
1782 1782                          rp->thread->t_flag |= t_flag;
1783 1783                  }
1784 1784          }
1785 1785          cv_broadcast(&nlp->cv);
1786 1786          mutex_exit(&nsrv->async_write_lock);
1787 1787  
1788 1788  }
1789 1789  
1790 1790  void *
1791 1791  rfs_write_getfh(struct nfswriteargs *wa)
1792 1792  {
1793 1793          return (&wa->wa_fhandle);
1794 1794  }
1795 1795  
1796 1796  /*
1797 1797   * Create a file.
1798 1798   * Creates a file with given attributes and returns those attributes
1799 1799   * and an fhandle for the new file.
1800 1800   */
1801 1801  void
1802 1802  rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1803 1803      struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1804 1804  {
1805 1805          int error;
1806 1806          int lookuperr;
1807 1807          int in_crit = 0;
1808 1808          struct vattr va;
1809 1809          vnode_t *vp;
1810 1810          vnode_t *realvp;
1811 1811          vnode_t *dvp;
1812 1812          char *name = args->ca_da.da_name;
1813 1813          vnode_t *tvp = NULL;
1814 1814          int mode;
1815 1815          int lookup_ok;
1816 1816          bool_t trunc;
1817 1817          struct sockaddr *ca;
1818 1818  
1819 1819          /*
1820 1820           * Disallow NULL paths
1821 1821           */
1822 1822          if (name == NULL || *name == '\0') {
1823 1823                  dr->dr_status = NFSERR_ACCES;
1824 1824                  return;
1825 1825          }
1826 1826  
1827 1827          dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1828 1828          if (dvp == NULL) {
1829 1829                  dr->dr_status = NFSERR_STALE;
1830 1830                  return;
1831 1831          }
1832 1832  
1833 1833          error = sattr_to_vattr(args->ca_sa, &va);
1834 1834          if (error) {
1835 1835                  dr->dr_status = puterrno(error);
1836 1836                  return;
1837 1837          }
1838 1838  
1839 1839          /*
1840 1840           * Must specify the mode.
1841 1841           */
1842 1842          if (!(va.va_mask & AT_MODE)) {
1843 1843                  VN_RELE(dvp);
1844 1844                  dr->dr_status = NFSERR_INVAL;
1845 1845                  return;
1846 1846          }
1847 1847  
1848 1848          /*
1849 1849           * This is a completely gross hack to make mknod
1850 1850           * work over the wire until we can wack the protocol
1851 1851           */
1852 1852          if ((va.va_mode & IFMT) == IFCHR) {
1853 1853                  if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1854 1854                          va.va_type = VFIFO;     /* xtra kludge for named pipe */
1855 1855                  else {
1856 1856                          va.va_type = VCHR;
1857 1857                          /*
1858 1858                           * uncompress the received dev_t
1859 1859                           * if the top half is zero indicating a request
1860 1860                           * from an `older style' OS.
1861 1861                           */
1862 1862                          if ((va.va_size & 0xffff0000) == 0)
1863 1863                                  va.va_rdev = nfsv2_expdev(va.va_size);
1864 1864                          else
1865 1865                                  va.va_rdev = (dev_t)va.va_size;
1866 1866                  }
1867 1867                  va.va_mask &= ~AT_SIZE;
1868 1868          } else if ((va.va_mode & IFMT) == IFBLK) {
1869 1869                  va.va_type = VBLK;
1870 1870                  /*
1871 1871                   * uncompress the received dev_t
1872 1872                   * if the top half is zero indicating a request
1873 1873                   * from an `older style' OS.
1874 1874                   */
1875 1875                  if ((va.va_size & 0xffff0000) == 0)
1876 1876                          va.va_rdev = nfsv2_expdev(va.va_size);
1877 1877                  else
1878 1878                          va.va_rdev = (dev_t)va.va_size;
1879 1879                  va.va_mask &= ~AT_SIZE;
1880 1880          } else if ((va.va_mode & IFMT) == IFSOCK) {
1881 1881                  va.va_type = VSOCK;
1882 1882          } else {
1883 1883                  va.va_type = VREG;
1884 1884          }
1885 1885          va.va_mode &= ~IFMT;
1886 1886          va.va_mask |= AT_TYPE;
1887 1887  
1888 1888          ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1889 1889          name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1890 1890              MAXPATHLEN);
1891 1891          if (name == NULL) {
1892 1892                  dr->dr_status = puterrno(EINVAL);
1893 1893                  return;
1894 1894          }
1895 1895  
1896 1896          /*
1897 1897           * Why was the choice made to use VWRITE as the mode to the
1898 1898           * call to VOP_CREATE ? This results in a bug.  When a client
1899 1899           * opens a file that already exists and is RDONLY, the second
1900 1900           * open fails with an EACESS because of the mode.
1901 1901           * bug ID 1054648.
1902 1902           */
1903 1903          lookup_ok = 0;
1904 1904          mode = VWRITE;
1905 1905          if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1906 1906                  error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1907 1907                      NULL, NULL, NULL);
1908 1908                  if (!error) {
1909 1909                          struct vattr at;
1910 1910  
1911 1911                          lookup_ok = 1;
1912 1912                          at.va_mask = AT_MODE;
1913 1913                          error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1914 1914                          if (!error)
1915 1915                                  mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1916 1916                          VN_RELE(tvp);
1917 1917                          tvp = NULL;
1918 1918                  }
1919 1919          }
1920 1920  
1921 1921          if (!lookup_ok) {
1922 1922                  if (rdonly(ro, dvp)) {
1923 1923                          error = EROFS;
1924 1924                  } else if (va.va_type != VREG && va.va_type != VFIFO &&
1925 1925                      va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1926 1926                          error = EPERM;
1927 1927                  } else {
1928 1928                          error = 0;
1929 1929                  }
1930 1930          }
1931 1931  
1932 1932          /*
1933 1933           * If file size is being modified on an already existing file
1934 1934           * make sure that there are no conflicting non-blocking mandatory
1935 1935           * locks in the region being manipulated. Return EACCES if there
1936 1936           * are conflicting locks.
1937 1937           */
1938 1938          if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1939 1939                  lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1940 1940                      NULL, NULL, NULL);
1941 1941  
1942 1942                  if (!lookuperr &&
1943 1943                      rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1944 1944                          VN_RELE(tvp);
1945 1945                          curthread->t_flag |= T_WOULDBLOCK;
1946 1946                          goto out;
1947 1947                  }
1948 1948  
1949 1949                  if (!lookuperr && nbl_need_check(tvp)) {
1950 1950                          /*
1951 1951                           * The file exists. Now check if it has any
1952 1952                           * conflicting non-blocking mandatory locks
1953 1953                           * in the region being changed.
1954 1954                           */
1955 1955                          struct vattr bva;
1956 1956                          u_offset_t offset;
1957 1957                          ssize_t length;
1958 1958  
1959 1959                          nbl_start_crit(tvp, RW_READER);
1960 1960                          in_crit = 1;
1961 1961  
1962 1962                          bva.va_mask = AT_SIZE;
1963 1963                          error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1964 1964                          if (!error) {
1965 1965                                  if (va.va_size < bva.va_size) {
1966 1966                                          offset = va.va_size;
1967 1967                                          length = bva.va_size - va.va_size;
1968 1968                                  } else {
1969 1969                                          offset = bva.va_size;
1970 1970                                          length = va.va_size - bva.va_size;
1971 1971                                  }
1972 1972                                  if (length) {
1973 1973                                          if (nbl_conflict(tvp, NBL_WRITE,
1974 1974                                              offset, length, 0, NULL)) {
1975 1975                                                  error = EACCES;
1976 1976                                          }
1977 1977                                  }
1978 1978                          }
1979 1979                          if (error) {
1980 1980                                  nbl_end_crit(tvp);
1981 1981                                  VN_RELE(tvp);
1982 1982                                  in_crit = 0;
1983 1983                          }
1984 1984                  } else if (tvp != NULL) {
1985 1985                          VN_RELE(tvp);
1986 1986                  }
1987 1987          }
1988 1988  
1989 1989          if (!error) {
1990 1990                  /*
1991 1991                   * If filesystem is shared with nosuid the remove any
1992 1992                   * setuid/setgid bits on create.
1993 1993                   */
1994 1994                  if (va.va_type == VREG &&
1995 1995                      exi->exi_export.ex_flags & EX_NOSUID)
1996 1996                          va.va_mode &= ~(VSUID | VSGID);
1997 1997  
1998 1998                  error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1999 1999                      NULL, NULL);
2000 2000  
2001 2001                  if (!error) {
2002 2002  
2003 2003                          if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
2004 2004                                  trunc = TRUE;
2005 2005                          else
2006 2006                                  trunc = FALSE;
2007 2007  
2008 2008                          if (rfs4_check_delegated(FWRITE, vp, trunc)) {
2009 2009                                  VN_RELE(vp);
2010 2010                                  curthread->t_flag |= T_WOULDBLOCK;
2011 2011                                  goto out;
2012 2012                          }
2013 2013                          va.va_mask = AT_ALL;
2014 2014  
2015 2015                          error = VOP_GETATTR(vp, &va, 0, cr, NULL);
2016 2016  
2017 2017                          /* check for overflows */
2018 2018                          if (!error) {
2019 2019                                  acl_perm(vp, exi, &va, cr);
2020 2020                                  error = vattr_to_nattr(&va, &dr->dr_attr);
2021 2021                                  if (!error) {
2022 2022                                          error = makefh(&dr->dr_fhandle, vp,
2023 2023                                              exi);
2024 2024                                  }
2025 2025                          }
2026 2026                          /*
2027 2027                           * Force modified metadata out to stable storage.
2028 2028                           *
2029 2029                           * if a underlying vp exists, pass it to VOP_FSYNC
2030 2030                           */
2031 2031                          if (VOP_REALVP(vp, &realvp, NULL) == 0)
2032 2032                                  (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
2033 2033                          else
2034 2034                                  (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2035 2035                          VN_RELE(vp);
2036 2036                  }
2037 2037  
2038 2038                  if (in_crit) {
2039 2039                          nbl_end_crit(tvp);
2040 2040                          VN_RELE(tvp);
2041 2041                  }
2042 2042          }
2043 2043  
2044 2044          /*
2045 2045           * Force modified data and metadata out to stable storage.
2046 2046           */
2047 2047          (void) VOP_FSYNC(dvp, 0, cr, NULL);
2048 2048  
2049 2049  out:
2050 2050  
2051 2051          VN_RELE(dvp);
2052 2052  
2053 2053          dr->dr_status = puterrno(error);
2054 2054  
2055 2055          if (name != args->ca_da.da_name)
2056 2056                  kmem_free(name, MAXPATHLEN);
2057 2057  }
2058 2058  void *
2059 2059  rfs_create_getfh(struct nfscreatargs *args)
2060 2060  {
2061 2061          return (args->ca_da.da_fhandle);
2062 2062  }
2063 2063  
2064 2064  /*
2065 2065   * Remove a file.
2066 2066   * Remove named file from parent directory.
2067 2067   */
/* ARGSUSED */
void
rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error = 0;
	vnode_t *vp;		/* directory holding the entry to remove */
	vnode_t *targvp;	/* the entry itself, for conflict checks */
	int in_crit = 0;	/* nonzero once nbl_start_crit() is held */

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 * The target is looked up first so the delegation and mandatory-lock
	 * checks below can be applied to the file being removed.
	 */
	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(vp);
		*status = puterrno(error);
		return;
	}

	/*
	 * If the file is delegated to an v4 client, then initiate
	 * recall and drop this request (by setting T_WOULDBLOCK).
	 * The client will eventually re-transmit the request and
	 * (hopefully), by then, the v4 client will have returned
	 * the delegation.
	 */

	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
		VN_RELE(vp);
		VN_RELE(targvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (nbl_need_check(targvp)) {
		/* Enter the critical region before probing for conflicts. */
		nbl_start_crit(targvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

out:
	/* Common exit: drop the crit region (if held) and both holds. */
	if (in_crit)
		nbl_end_crit(targvp);
	VN_RELE(targvp);
	VN_RELE(vp);

	*status = puterrno(error);

}
2149 2149  
2150 2150  void *
2151 2151  rfs_remove_getfh(struct nfsdiropargs *da)
2152 2152  {
2153 2153          return (da->da_fhandle);
2154 2154  }
2155 2155  
2156 2156  /*
2157 2157   * rename a file
2158 2158   * Give a file (from) a new name (to).
2159 2159   */
2160 2160  /* ARGSUSED */
void
rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error = 0;
	vnode_t *fromvp;	/* directory holding the source name */
	vnode_t *tovp;		/* directory receiving the new name */
	struct exportinfo *to_exi;
	fhandle_t *fh;
	vnode_t *srcvp;		/* the object being renamed */
	vnode_t *targvp;	/* pre-existing object at the target name */
	int in_crit = 0;	/* nonzero while in an NBL critical region */

	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	/* The target directory's file handle must be exported as well. */
	fh = args->rna_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}
	/*
	 * Only pointer identity is needed for the cross-export check
	 * below, so the hold from checkexport() can be dropped now.
	 */
	exi_rele(to_exi);

	if (to_exi != exi) {
		/* Renames may not cross exported file systems. */
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		return;
	}

	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		return;
	}

	/* Both handles must name directories. */
	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}

	if (rdonly(ro, tovp)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 */
	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = puterrno(error);
		return;
	}

	/* Check for delegations on the source file */

	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
		/*
		 * An NFSv4 delegation is outstanding; set T_WOULDBLOCK
		 * so the caller makes the client retry after the recall.
		 */
		VN_RELE(tovp);
		VN_RELE(fromvp);
		VN_RELE(srcvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Check for delegation on the file being renamed over, if it exists */

	if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
	    NULL, NULL, NULL) == 0) {

		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
			VN_RELE(tovp);
			VN_RELE(fromvp);
			VN_RELE(srcvp);
			VN_RELE(targvp);
			curthread->t_flag |= T_WOULDBLOCK;
			return;
		}
		VN_RELE(targvp);
	}


	/* Guard against non-blocking mandatory locks on the source. */
	if (nbl_need_check(srcvp)) {
		nbl_start_crit(srcvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_RENAME(fromvp, args->rna_from.da_name,
	    tovp, args->rna_to.da_name, cr, NULL, 0);

	/* On success, keep the renamed vnode's cached path name current. */
	if (error == 0)
		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
		    strlen(args->rna_to.da_name));

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(srcvp);
	VN_RELE(srcvp);
	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

}
2299 2299  void *
2300 2300  rfs_rename_getfh(struct nfsrnmargs *args)
2301 2301  {
2302 2302          return (args->rna_from.da_fhandle);
2303 2303  }
2304 2304  
/*
 * Link to a file.
 * Create a file (to) which is a hard link to the given file (from).
 */
/* ARGSUSED */
void
rfs_link(struct nfslinkargs *args, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *fromvp;	/* existing file to be linked to */
	vnode_t *tovp;		/* directory that receives the new name */
	struct exportinfo *to_exi;
	fhandle_t *fh;

	fromvp = nfs_fhtovp(args->la_from, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	/* The target directory's file handle must be exported as well. */
	fh = args->la_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}
	/* Only pointer identity is needed below; drop the hold now. */
	exi_rele(to_exi);

	if (to_exi != exi) {
		/* Hard links may not cross exported file systems. */
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		return;
	}

	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		return;
	}

	if (tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		return;
	}
	/*
	 * Disallow NULL paths
	 */
	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}

	if (rdonly(ro, tovp)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		return;
	}

	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);

	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

}
2385 2385  void *
2386 2386  rfs_link_getfh(struct nfslinkargs *args)
2387 2387  {
2388 2388          return (args->la_from);
2389 2389  }
2390 2390  
2391 2391  /*
2392 2392   * Symbolicly link to a file.
2393 2393   * Create a file (to) with the given attributes which is a symbolic link
2394 2394   * to the given path name (to).
2395 2395   */
2396 2396  void
2397 2397  rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2398 2398      struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2399 2399  {
2400 2400          int error;
2401 2401          struct vattr va;
2402 2402          vnode_t *vp;
2403 2403          vnode_t *svp;
2404 2404          int lerror;
2405 2405          struct sockaddr *ca;
2406 2406          char *name = NULL;
2407 2407  
2408 2408          /*
2409 2409           * Disallow NULL paths
2410 2410           */
2411 2411          if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2412 2412                  *status = NFSERR_ACCES;
2413 2413                  return;
2414 2414          }
2415 2415  
2416 2416          vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2417 2417          if (vp == NULL) {
2418 2418                  *status = NFSERR_STALE;
2419 2419                  return;
2420 2420          }
2421 2421  
2422 2422          if (rdonly(ro, vp)) {
2423 2423                  VN_RELE(vp);
2424 2424                  *status = NFSERR_ROFS;
2425 2425                  return;
2426 2426          }
2427 2427  
2428 2428          error = sattr_to_vattr(args->sla_sa, &va);
2429 2429          if (error) {
2430 2430                  VN_RELE(vp);
2431 2431                  *status = puterrno(error);
2432 2432                  return;
2433 2433          }
2434 2434  
2435 2435          if (!(va.va_mask & AT_MODE)) {
2436 2436                  VN_RELE(vp);
2437 2437                  *status = NFSERR_INVAL;
2438 2438                  return;
2439 2439          }
2440 2440  
2441 2441          ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2442 2442          name = nfscmd_convname(ca, exi, args->sla_tnm,
2443 2443              NFSCMD_CONV_INBOUND, MAXPATHLEN);
2444 2444  
2445 2445          if (name == NULL) {
2446 2446                  *status = NFSERR_ACCES;
2447 2447                  return;
2448 2448          }
2449 2449  
2450 2450          va.va_type = VLNK;
2451 2451          va.va_mask |= AT_TYPE;
2452 2452  
2453 2453          error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2454 2454  
2455 2455          /*
2456 2456           * Force new data and metadata out to stable storage.
2457 2457           */
2458 2458          lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2459 2459              NULL, cr, NULL, NULL, NULL);
2460 2460  
2461 2461          if (!lerror) {
2462 2462                  (void) VOP_FSYNC(svp, 0, cr, NULL);
2463 2463                  VN_RELE(svp);
2464 2464          }
2465 2465  
2466 2466          /*
2467 2467           * Force modified data and metadata out to stable storage.
2468 2468           */
2469 2469          (void) VOP_FSYNC(vp, 0, cr, NULL);
2470 2470  
2471 2471          VN_RELE(vp);
2472 2472  
2473 2473          *status = puterrno(error);
2474 2474          if (name != args->sla_tnm)
2475 2475                  kmem_free(name, MAXPATHLEN);
2476 2476  
2477 2477  }
2478 2478  void *
2479 2479  rfs_symlink_getfh(struct nfsslargs *args)
2480 2480  {
2481 2481          return (args->sla_from.da_fhandle);
2482 2482  }
2483 2483  
/*
 * Make a directory.
 * Create a directory with the given name, parent directory, and attributes.
 * Returns a file handle and attributes for the new directory.
 */
/* ARGSUSED */
void
rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	struct vattr va;
	vnode_t *dvp = NULL;	/* the newly created directory */
	vnode_t *vp;		/* the parent directory */
	char *name = args->ca_da.da_name;

	/*
	 * Disallow NULL paths
	 */
	if (name == NULL || *name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
	if (vp == NULL) {
		dr->dr_status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_ROFS;
		return;
	}

	error = sattr_to_vattr(args->ca_sa, &va);
	if (error) {
		VN_RELE(vp);
		dr->dr_status = puterrno(error);
		return;
	}

	/* A mode is mandatory for directory creation. */
	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_INVAL;
		return;
	}

	va.va_type = VDIR;
	va.va_mask |= AT_TYPE;

	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);

	if (!error) {
		/*
		 * Attributes of the newly created directory should
		 * be returned to the client.
		 */
		va.va_mask = AT_ALL; /* We want everything */
		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);

		/* check for overflows */
		if (!error) {
			/*
			 * NOTE(review): acl_perm() is handed the parent
			 * vnode vp rather than the new directory dvp --
			 * confirm this is intentional.
			 */
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				error = makefh(&dr->dr_fhandle, dvp, exi);
			}
		}
		/*
		 * Force new data and metadata out to stable storage.
		 */
		(void) VOP_FSYNC(dvp, 0, cr, NULL);
		VN_RELE(dvp);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	dr->dr_status = puterrno(error);

}
2571 2571  void *
2572 2572  rfs_mkdir_getfh(struct nfscreatargs *args)
2573 2573  {
2574 2574          return (args->ca_da.da_fhandle);
2575 2575  }
2576 2576  
2577 2577  /*
2578 2578   * Remove a directory.
2579 2579   * Remove the given directory name from the given parent directory.
2580 2580   */
2581 2581  /* ARGSUSED */
2582 2582  void
2583 2583  rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2584 2584      struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2585 2585  {
2586 2586          int error;
2587 2587          vnode_t *vp;
2588 2588  
2589 2589          /*
2590 2590           * Disallow NULL paths
2591 2591           */
2592 2592          if (da->da_name == NULL || *da->da_name == '\0') {
2593 2593                  *status = NFSERR_ACCES;
2594 2594                  return;
2595 2595          }
2596 2596  
2597 2597          vp = nfs_fhtovp(da->da_fhandle, exi);
2598 2598          if (vp == NULL) {
2599 2599                  *status = NFSERR_STALE;
2600 2600                  return;
2601 2601          }
2602 2602  
2603 2603          if (rdonly(ro, vp)) {
2604 2604                  VN_RELE(vp);
2605 2605                  *status = NFSERR_ROFS;
2606 2606                  return;
2607 2607          }
2608 2608  
2609 2609          /*
2610 2610           * VOP_RMDIR takes a third argument (the current
2611 2611           * directory of the process).  That's because someone
2612 2612           * wants to return EINVAL if one tries to remove ".".
2613 2613           * Of course, NFS servers have no idea what their
2614 2614           * clients' current directories are.  We fake it by
2615 2615           * supplying a vnode known to exist and illegal to
2616 2616           * remove.
2617 2617           */
2618 2618          error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);
2619 2619  
2620 2620          /*
2621 2621           * Force modified data and metadata out to stable storage.
2622 2622           */
2623 2623          (void) VOP_FSYNC(vp, 0, cr, NULL);
2624 2624  
2625 2625          VN_RELE(vp);
2626 2626  
2627 2627          /*
2628 2628           * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2629 2629           * if the directory is not empty.  A System V NFS server
2630 2630           * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2631 2631           * over the wire.
2632 2632           */
2633 2633          if (error == EEXIST)
2634 2634                  *status = NFSERR_NOTEMPTY;
2635 2635          else
2636 2636                  *status = puterrno(error);
2637 2637  
2638 2638  }
2639 2639  void *
2640 2640  rfs_rmdir_getfh(struct nfsdiropargs *da)
2641 2641  {
2642 2642          return (da->da_fhandle);
2643 2643  }
2644 2644  
/* ARGSUSED */
void
rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int iseof;		/* set by VOP_READDIR at end of directory */
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	char *ndata = NULL;	/* character-set converted entry buffer */
	struct sockaddr *ca;
	size_t nents;
	int ret;

	vp = nfs_fhtovp(&rda->rda_fh, exi);
	if (vp == NULL) {
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_STALE;
		return;
	}

	if (vp->v_type != VDIR) {
		VN_RELE(vp);
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_NOTDIR;
		return;
	}

	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);

	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);

	if (error) {
		rd->rd_entries = NULL;
		goto bad;
	}

	/* A zero-length request returns no entries and no EOF. */
	if (rda->rda_count == 0) {
		rd->rd_entries = NULL;
		rd->rd_size = 0;
		rd->rd_eof = FALSE;
		goto bad;
	}

	/* Clamp the transfer size to the server maximum. */
	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);

	/*
	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
	 */
	rd->rd_bufsize = (uint_t)rda->rda_count;
	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);

	/*
	 * Set up io vector to read directory data
	 */
	iov.iov_base = (caddr_t)rd->rd_entries;
	iov.iov_len = rda->rda_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)rda->rda_offset;
	uio.uio_resid = rda->rda_count;

	/*
	 * read directory
	 */
	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);

	/*
	 * Clean up
	 */
	if (!error) {
		/*
		 * set size and eof
		 */
		if (uio.uio_resid == rda->rda_count) {
			/* Nothing was read: the directory is exhausted. */
			rd->rd_size = 0;
			rd->rd_eof = TRUE;
		} else {
			rd->rd_size = (uint32_t)(rda->rda_count -
			    uio.uio_resid);
			rd->rd_eof = iseof ? TRUE : FALSE;
		}
	}
	/*
	 * NOTE(review): if VOP_READDIR failed, rd_size is not initialized
	 * before the conversion below reads it -- confirm that callers
	 * zero *rd or that this path is otherwise harmless.
	 */

	/* Convert entry names into the client's character set, if needed. */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
	ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
	    rda->rda_count, &ndata);

	if (ret != 0) {
		size_t dropbytes;
		/*
		 * We had to drop one or more entries in order to fit
		 * during the character conversion.  We need to patch
		 * up the size and eof info.
		 */
		if (rd->rd_eof)
			rd->rd_eof = FALSE;
		dropbytes = nfscmd_dropped_entrysize(
		    (struct dirent64 *)rd->rd_entries, nents, ret);
		rd->rd_size -= dropbytes;
	}
	/*
	 * Swap in the converted buffer (if a new one was produced) and
	 * free the original; otherwise keep the buffer read into above.
	 */
	if (ndata == NULL) {
		ndata = (char *)rd->rd_entries;
	} else if (ndata != (char *)rd->rd_entries) {
		kmem_free(rd->rd_entries, rd->rd_bufsize);
		rd->rd_entries = (void *)ndata;
		rd->rd_bufsize = rda->rda_count;
	}

bad:
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
#endif

	VN_RELE(vp);

	rd->rd_status = puterrno(error);

}
2778 2778  void *
2779 2779  rfs_readdir_getfh(struct nfsrddirargs *rda)
2780 2780  {
2781 2781          return (&rda->rda_fh);
2782 2782  }
2783 2783  void
2784 2784  rfs_rddirfree(struct nfsrddirres *rd)
2785 2785  {
2786 2786          if (rd->rd_entries != NULL)
2787 2787                  kmem_free(rd->rd_entries, rd->rd_bufsize);
2788 2788  }
2789 2789  
2790 2790  /* ARGSUSED */
2791 2791  void
2792 2792  rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2793 2793      struct svc_req *req, cred_t *cr, bool_t ro)
2794 2794  {
2795 2795          int error;
2796 2796          struct statvfs64 sb;
2797 2797          vnode_t *vp;
2798 2798  
2799 2799          vp = nfs_fhtovp(fh, exi);
2800 2800          if (vp == NULL) {
2801 2801                  fs->fs_status = NFSERR_STALE;
2802 2802                  return;
2803 2803          }
2804 2804  
2805 2805          error = VFS_STATVFS(vp->v_vfsp, &sb);
2806 2806  
2807 2807          if (!error) {
2808 2808                  fs->fs_tsize = nfstsize();
2809 2809                  fs->fs_bsize = sb.f_frsize;
2810 2810                  fs->fs_blocks = sb.f_blocks;
2811 2811                  fs->fs_bfree = sb.f_bfree;
2812 2812                  fs->fs_bavail = sb.f_bavail;
2813 2813          }
2814 2814  
2815 2815          VN_RELE(vp);
2816 2816  
2817 2817          fs->fs_status = puterrno(error);
2818 2818  
2819 2819  }
2820 2820  void *
2821 2821  rfs_statfs_getfh(fhandle_t *fh)
2822 2822  {
2823 2823          return (fh);
2824 2824  }
2825 2825  
/*
 * Convert an NFSv2 settable-attribute structure into a vattr, setting
 * va_mask bits only for the fields the client actually supplied.  An
 * unsupplied field is encoded on the wire as all-ones (-1).  Returns 0,
 * or EOVERFLOW on ILP32 kernels when a supplied time does not fit.
 */
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short.  When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}
2892 2892  
/*
 * Map vnode types (vtype_t, used as the index) to NFSv2 over-the-wire
 * file types.  Entries with no NFSv2 representation are 0.
 */
static const enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
2896 2896  
/*
 * check the following fields for overflow: nodeid, size, and time.
 * There could be a problem when converting 64-bit LP64 fields
 * into 32-bit ones.  Return an error if there is an overflow.
 */
int
vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
{
	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
	na->na_type = vt_to_nf[vap->va_type];

	/* Preserve the "not set" sentinel; otherwise fold in the type bits. */
	if (vap->va_mode == (unsigned short) -1)
		na->na_mode = (uint32_t)-1;
	else
		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;

	if (vap->va_uid == (unsigned short)(-1))
		na->na_uid = (uint32_t)(-1);
	else if (vap->va_uid == UID_NOBODY)
		na->na_uid = (uint32_t)NFS_UID_NOBODY;
	else
		na->na_uid = vap->va_uid;

	if (vap->va_gid == (unsigned short)(-1))
		na->na_gid = (uint32_t)-1;
	else if (vap->va_gid == GID_NOBODY)
		na->na_gid = (uint32_t)NFS_GID_NOBODY;
	else
		na->na_gid = vap->va_gid;

	/*
	 * Do we need to check fsid for overflow?  It is 64-bit in the
	 * vattr, but are bigger than 32 bit values supported?
	 */
	na->na_fsid = vap->va_fsid;

	na->na_nodeid = vap->va_nodeid;

	/*
	 * Check to make sure that the nodeid is representable over the
	 * wire without losing bits.
	 */
	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
		return (EFBIG);
	na->na_nlink = vap->va_nlink;

	/*
	 * Check for big files here, instead of at the caller.  See
	 * comments in cstat for large special file explanation.
	 */
	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
			return (EFBIG);
		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
			/* UNKNOWN_SIZE | OVERFLOW */
			na->na_size = MAXOFF32_T;
		} else
			na->na_size = vap->va_size;
	} else
		na->na_size = vap->va_size;

	/*
	 * If the vnode times overflow the 32-bit times that NFS2
	 * uses on the wire then return an error.
	 */
	if (!NFS_VAP_TIME_OK(vap)) {
		return (EOVERFLOW);
	}
	/* NFSv2 carries microseconds on the wire; vattr has nanoseconds. */
	na->na_atime.tv_sec = vap->va_atime.tv_sec;
	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;

	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;

	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;

	/*
	 * If the dev_t will fit into 16 bits then compress
	 * it, otherwise leave it alone. See comments in
	 * nfs_client.c.
	 */
	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
	else
		(void) cmpldev(&na->na_rdev, vap->va_rdev);

	na->na_blocks = vap->va_nblocks;
	na->na_blocksize = vap->va_blksize;

	/*
	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
	 *
	 * BUYER BEWARE:
	 *  If you are porting the NFS to a non-Sun server, you probably
	 *  don't want to include the following block of code.  The
	 *  over-the-wire special file types will be changing with the
	 *  NFS Protocol Revision.
	 */
	if (vap->va_type == VFIFO)
		NA_SETFIFO(na);
	return (0);
}
3003 3003  
3004 3004  /*
3005 3005   * acl v2 support: returns approximate permission.
3006 3006   *      default: returns minimal permission (more restrictive)
3007 3007   *      aclok: returns maximal permission (less restrictive)
3008 3008   *      This routine changes the permissions that are alaredy in *va.
3009 3009   *      If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
3010 3010   *      CLASS_OBJ is always the same as GROUP_OBJ entry.
3011 3011   */
3012 3012  static void
3013 3013  acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
3014 3014  {
3015 3015          vsecattr_t      vsa;
3016 3016          int             aclcnt;
3017 3017          aclent_t        *aclentp;
3018 3018          mode_t          mask_perm;
3019 3019          mode_t          grp_perm;
3020 3020          mode_t          other_perm;
3021 3021          mode_t          other_orig;
3022 3022          int             error;
3023 3023  
3024 3024          /* dont care default acl */
3025 3025          vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
3026 3026          error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);
3027 3027  
3028 3028          if (!error) {
3029 3029                  aclcnt = vsa.vsa_aclcnt;
3030 3030                  if (aclcnt > MIN_ACL_ENTRIES) {
3031 3031                          /* non-trivial ACL */
3032 3032                          aclentp = vsa.vsa_aclentp;
3033 3033                          if (exi->exi_export.ex_flags & EX_ACLOK) {
3034 3034                                  /* maximal permissions */
3035 3035                                  grp_perm = 0;
3036 3036                                  other_perm = 0;
3037 3037                                  for (; aclcnt > 0; aclcnt--, aclentp++) {
3038 3038                                          switch (aclentp->a_type) {
3039 3039                                          case USER_OBJ:
3040 3040                                                  break;
3041 3041                                          case USER:
3042 3042                                                  grp_perm |=
3043 3043                                                      aclentp->a_perm << 3;
3044 3044                                                  other_perm |= aclentp->a_perm;
3045 3045                                                  break;
3046 3046                                          case GROUP_OBJ:
3047 3047                                                  grp_perm |=
3048 3048                                                      aclentp->a_perm << 3;
3049 3049                                                  break;
3050 3050                                          case GROUP:
3051 3051                                                  other_perm |= aclentp->a_perm;
3052 3052                                                  break;
3053 3053                                          case OTHER_OBJ:
3054 3054                                                  other_orig = aclentp->a_perm;
3055 3055                                                  break;
3056 3056                                          case CLASS_OBJ:
3057 3057                                                  mask_perm = aclentp->a_perm;
3058 3058                                                  break;
3059 3059                                          default:
3060 3060                                                  break;
3061 3061                                          }
3062 3062                                  }
3063 3063                                  grp_perm &= mask_perm << 3;
3064 3064                                  other_perm &= mask_perm;
3065 3065                                  other_perm |= other_orig;
3066 3066  
3067 3067                          } else {
3068 3068                                  /* minimal permissions */
3069 3069                                  grp_perm = 070;
3070 3070                                  other_perm = 07;
3071 3071                                  for (; aclcnt > 0; aclcnt--, aclentp++) {
3072 3072                                          switch (aclentp->a_type) {
3073 3073                                          case USER_OBJ:
3074 3074                                                  break;
3075 3075                                          case USER:
3076 3076                                          case CLASS_OBJ:
3077 3077                                                  grp_perm &=
3078 3078                                                      aclentp->a_perm << 3;
3079 3079                                                  other_perm &=
3080 3080                                                      aclentp->a_perm;
3081 3081                                                  break;
3082 3082                                          case GROUP_OBJ:
3083 3083                                                  grp_perm &=
3084 3084                                                      aclentp->a_perm << 3;
3085 3085                                                  break;
3086 3086                                          case GROUP:
3087 3087                                                  other_perm &=
3088 3088                                                      aclentp->a_perm;
3089 3089                                                  break;
3090 3090                                          case OTHER_OBJ:
3091 3091                                                  other_perm &=
3092 3092                                                      aclentp->a_perm;
3093 3093                                                  break;
3094 3094                                          default:
3095 3095                                                  break;
3096 3096                                          }
3097 3097                                  }
3098 3098                          }
3099 3099                          /* copy to va */
3100 3100                          va->va_mode &= ~077;
3101 3101                          va->va_mode |= grp_perm | other_perm;
3102 3102                  }
3103 3103                  if (vsa.vsa_aclcnt)
3104 3104                          kmem_free(vsa.vsa_aclentp,
3105 3105                              vsa.vsa_aclcnt * sizeof (aclent_t));
3106 3106          }
3107 3107  }
3108 3108  
/*
 * One-time (module-global) NFSv2 server initialization: obtain a
 * file-system caller id for use by the v2 server code.
 */
void
rfs_srvrinit(void)
{
        nfs2_srv_caller_id = fs_new_caller_id();
}
3114 3114  
/*
 * Module-global NFSv2 server teardown.  Nothing to undo: the caller id
 * allocated in rfs_srvrinit() needs no explicit release.
 */
void
rfs_srvrfini(void)
{
}
3119 3119  
3120 3120  /* ARGSUSED */
3121 3121  void
3122 3122  rfs_srv_zone_init(nfs_globals_t *ng)
3123 3123  {
3124 3124          nfs_srv_t *ns;
3125 3125  
3126 3126          ns = kmem_zalloc(sizeof (*ns), KM_SLEEP);
3127 3127  
3128 3128          mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3129 3129          ns->write_async = 1;
3130 3130  
3131 3131          ng->nfs_srv = ns;
3132 3132  }
3133 3133  
3134 3134  /* ARGSUSED */
3135 3135  void
3136 3136  rfs_srv_zone_fini(nfs_globals_t *ng)
3137 3137  {
3138 3138          nfs_srv_t *ns = ng->nfs_srv;
3139 3139  
3140 3140          ng->nfs_srv = NULL;
3141 3141  
3142 3142          mutex_destroy(&ns->async_write_lock);
3143 3143          kmem_free(ns, sizeof (*ns));
3144 3144  }
3145 3145  
3146 3146  static int
3147 3147  rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
3148 3148  {
3149 3149          struct clist    *wcl;
3150 3150          int             wlist_len;
3151 3151          uint32_t        count = rr->rr_count;
3152 3152  
3153 3153          wcl = ra->ra_wlist;
3154 3154  
3155 3155          if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3156 3156                  return (FALSE);
3157 3157          }
3158 3158  
3159 3159          wcl = ra->ra_wlist;
3160 3160          rr->rr_ok.rrok_wlist_len = wlist_len;
3161 3161          rr->rr_ok.rrok_wlist = wcl;
3162 3162  
3163 3163          return (TRUE);
3164 3164  }
  
    | 
      ↓ open down ↓ | 
    2662 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX