Code review: "curzone reality check and teardown changes to use the RIGHT zone"
(webrev diff of usr/src/uts/common/fs/nfs/nfs_srv.c — viewer controls: Print this page | Split | Close | Expand all | Collapse all)
--- old/usr/src/uts/common/fs/nfs/nfs_srv.c
+++ new/usr/src/uts/common/fs/nfs/nfs_srv.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
25 25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 26 */
27 27
28 28 /*
29 29 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
30 30 * All rights reserved.
31 31 */
32 32
33 33 /*
34 34 * Copyright 2018 Nexenta Systems, Inc.
35 35 * Copyright (c) 2016 by Delphix. All rights reserved.
36 36 */
37 37
38 38 #include <sys/param.h>
39 39 #include <sys/types.h>
40 40 #include <sys/systm.h>
41 41 #include <sys/cred.h>
42 42 #include <sys/buf.h>
43 43 #include <sys/vfs.h>
44 44 #include <sys/vnode.h>
45 45 #include <sys/uio.h>
46 46 #include <sys/stat.h>
47 47 #include <sys/errno.h>
48 48 #include <sys/sysmacros.h>
49 49 #include <sys/statvfs.h>
50 50 #include <sys/kmem.h>
51 51 #include <sys/kstat.h>
52 52 #include <sys/dirent.h>
53 53 #include <sys/cmn_err.h>
54 54 #include <sys/debug.h>
55 55 #include <sys/vtrace.h>
56 56 #include <sys/mode.h>
57 57 #include <sys/acl.h>
58 58 #include <sys/nbmlock.h>
59 59 #include <sys/policy.h>
60 60 #include <sys/sdt.h>
61 61
62 62 #include <rpc/types.h>
63 63 #include <rpc/auth.h>
64 64 #include <rpc/svc.h>
65 65
66 66 #include <nfs/nfs.h>
67 67 #include <nfs/export.h>
68 68 #include <nfs/nfs_cmd.h>
69 69
70 70 #include <vm/hat.h>
71 71 #include <vm/as.h>
72 72 #include <vm/seg.h>
73 73 #include <vm/seg_map.h>
74 74 #include <vm/seg_kmem.h>
75 75
76 76 #include <sys/strsubr.h>
77 77
struct rfs_async_write_list;

/*
 * Zone globals of NFSv2 server.
 *
 * One instance exists per zone (see rfs_zone_key and the
 * rfs_zone_init/rfs_zone_fini prototypes below), so per-zone NFSv2
 * server state is kept separate across zones.
 */
typedef struct nfs_srv {
	/* presumably protects async_write_head below — the two travel together */
	kmutex_t async_write_lock;
	/* list of pending clustered (async) write requests for this zone */
	struct rfs_async_write_list *async_write_head;

	/*
	 * enables write clustering if == 1
	 */
	int write_async;
} nfs_srv_t;
92 92
93 93 /*
94 94 * These are the interface routines for the server side of the
95 95 * Network File System. See the NFS version 2 protocol specification
96 96 * for a description of this interface.
97 97 */
98 98
/* Convert an over-the-wire NFSv2 sattr into a vattr; returns an errno. */
static int sattr_to_vattr(struct nfssattr *, struct vattr *);
/* Adjust returned attributes for ACL-mediated permissions (defined below). */
static void acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
    cred_t *);
/* Zone-key callbacks — presumably create/destroy the per-zone nfs_srv_t. */
static void *rfs_zone_init(zoneid_t zoneid);
static void rfs_zone_fini(zoneid_t zoneid, void *data);


/*
 * Some "over the wire" UNIX file types. These are encoded
 * into the mode. This needs to be fixed in the next rev.
 */
#define	IFMT	0170000		/* type of file */
#define	IFCHR	0020000		/* character special */
#define	IFBLK	0060000		/* block special */
#define	IFSOCK	0140000		/* socket */

/* Caller id used to tag caller_context_t in all NFSv2 VOP calls below. */
u_longlong_t nfs2_srv_caller_id;
/* Zone key used to look up this zone's nfs_srv_t. */
static zone_key_t rfs_zone_key;
117 117
/*
 * Get file attributes.
 * Returns the current attributes of the file with the given fhandle.
 *
 * ns->ns_status is NFSERR_STALE when the handle cannot be mapped to a
 * vnode; otherwise it is the puterrno() mapping of the attribute fetch
 * result, with ns->ns_attr filled in on success.
 */
/* ARGSUSED */
void
rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
    struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;
	struct vattr va;

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	/*
	 * Do the getattr.
	 */
	va.va_mask = AT_ALL;	/* we want all the attributes */

	/* delegation-aware variant — presumably respects NFSv4 delegations */
	error = rfs4_delegated_getattr(vp, &va, 0, cr);

	/* check for overflows */
	if (!error) {
		/* Lie about the object type for a referral */
		if (vn_is_nfs_reparse(vp, cr))
			va.va_type = VLNK;

		acl_perm(vp, exi, &va, cr);
		error = vattr_to_nattr(&va, &ns->ns_attr);
	}

	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}
/* GETATTR's argument is itself the file handle; hand it back verbatim. */
void *
rfs_getattr_getfh(fhandle_t *fhp)
{
	return (fhp);
}
163 163
/*
 * Set file attributes.
 * Sets the attributes of the file with the given fhandle.  Returns
 * the new attributes.
 *
 * Size changes on VREG files are done via VOP_SPACE (after a manual
 * access check) rather than VOP_SETATTR — see the long comment below.
 * On a delegation conflict (EAGAIN + CC_WOULDBLOCK) the reply is
 * dropped by setting T_WOULDBLOCK so the client retransmits.
 * Modified metadata is flushed with VOP_FSYNC before replying.
 */
/* ARGSUSED */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int flag;		/* ATTR_UTIME iff client supplied explicit times */
	int in_crit = 0;	/* nonzero once inside the nbmand critical region */
	vnode_t *vp;
	struct vattr va;	/* attributes requested by the client */
	struct vattr bva;	/* attributes before any size change */
	struct flock64 bf;
	caller_context_t ct;


	vp = nfs_fhtovp(&args->saa_fh, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		return;
	}

	error = sattr_to_vattr(&args->saa_sa, &va);
	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		return;
	}

	/*
	 * If the client is requesting a change to the mtime,
	 * but the nanosecond field is set to 1 billion, then
	 * this is a flag to the server that it should set the
	 * atime and mtime fields to the server's current time.
	 * The 1 billion number actually came from the client
	 * as 1 million, but the units in the over the wire
	 * request are microseconds instead of nanoseconds.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if (va.va_mask & AT_MTIME) {
		if (va.va_mtime.tv_nsec == 1000000000) {
			gethrestime(&va.va_mtime);
			va.va_atime = va.va_mtime;
			va.va_mask |= AT_ATIME;
			flag = 0;
		} else
			flag = ATTR_UTIME;
	} else
		flag = 0;

	/*
	 * If the filesystem is exported with nosuid, then mask off
	 * the setuid and setgid bits.
	 */
	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
	    (exi->exi_export.ex_flags & EX_NOSUID))
		va.va_mode &= ~(VSUID | VSGID);

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * We need to specially handle size changes because it is
	 * possible for the client to create a file with modes
	 * which indicate read-only, but with the file opened for
	 * writing. If the client then tries to set the size of
	 * the file, then the normal access checking done in
	 * VOP_SETATTR would prevent the client from doing so,
	 * although it should be legal for it to do so. To get
	 * around this, we do the access checking for ourselves
	 * and then use VOP_SPACE which doesn't do the access
	 * checking which VOP_SETATTR does. VOP_SPACE can only
	 * operate on VREG files, let VOP_SETATTR handle the other
	 * extremely rare cases.
	 * Also the client should not be allowed to change the
	 * size of the file if there is a conflicting non-blocking
	 * mandatory lock in the region of change.
	 */
	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}

		bva.va_mask = AT_UID | AT_SIZE;

		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);

		if (error) {
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			return;
		}

		if (in_crit) {
			u_offset_t offset;
			ssize_t length;

			/* affected region is between the old and new sizes */
			if (va.va_size < bva.va_size) {
				offset = va.va_size;
				length = bva.va_size - va.va_size;
			} else {
				offset = bva.va_size;
				length = va.va_size - bva.va_size;
			}
			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
			    NULL)) {
				error = EACCES;
			}
		}

		/* Owner bypasses mode checks: resize via VOP_SPACE instead. */
		if (crgetuid(cr) == bva.va_uid && !error &&
		    va.va_size != bva.va_size) {
			va.va_mask &= ~AT_SIZE;
			bf.l_type = F_WRLCK;
			bf.l_whence = 0;
			bf.l_start = (off64_t)va.va_size;
			bf.l_len = 0;
			bf.l_sysid = 0;
			bf.l_pid = 0;

			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
			    (offset_t)va.va_size, cr, &ct);
		}
		if (in_crit)
			nbl_end_crit(vp);
	} else
		error = 0;

	/*
	 * Do the setattr.
	 */
	if (!error && va.va_mask) {
		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
	}

	/*
	 * check if the monitor on either vop_space or vop_setattr detected
	 * a delegation conflict and if so, mark the thread flag as
	 * wouldblock so that the response is dropped and the client will
	 * try again.
	 */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* get everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

	ct.cc_flags = 0;

	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);

	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}
/* Extract the file handle from the SETATTR arguments. */
void *
rfs_setattr_getfh(struct nfssaargs *args)
{
	return (&args->saa_fh);
}
356 356
/* Change and release @exip and @vpp only in success */
/*
 * Cross into a filesystem mounted on top of *vpp.
 *
 * If the covering filesystem is exported with "nohide", *vpp/*exip are
 * swapped (with correct hold/release pairing) for the covering root
 * vnode and its exportinfo, and 0 is returned.  If it is not exported,
 * or lacks "nohide", 0 is still returned with *vpp/*exip untouched.
 * A nonzero return is the errno from traverse() or VOP_FID().
 */
int
rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
{
	struct exportinfo *exi;
	vnode_t *vp = *vpp;
	fid_t fid;
	int error;

	/* work on our own hold so *vpp stays valid until we commit */
	VN_HOLD(vp);

	if ((error = traverse(&vp)) != 0) {
		VN_RELE(vp);
		return (error);
	}

	bzero(&fid, sizeof (fid));
	fid.fid_len = MAXFIDSZ;
	error = VOP_FID(vp, &fid, NULL);
	if (error) {
		VN_RELE(vp);
		return (error);
	}

	exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);
	if (exi == NULL ||
	    (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
		/*
		 * It is not error, just subdir is not exported
		 * or "nohide" is not set
		 */
		if (exi != NULL)
			exi_rele(exi);
		VN_RELE(vp);
	} else {
		/* go to submount */
		exi_rele(*exip);
		*exip = exi;

		VN_RELE(*vpp);
		*vpp = vp;
	}

	return (0);
}
402 402
/*
 * Given mounted "dvp" and "exi", go upper mountpoint
 * with dvp/exi correction
 * Return 0 in success
 *
 * On success *dvpp/*exip are replaced (held) by the underlying
 * (covered) directory vnode and its exportinfo, and the old references
 * are released.  Returns -1 if no export is found for the upper
 * directory, leaving *dvpp/*exip untouched.
 */
int
rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
{
	struct exportinfo *exi;
	vnode_t *dvp = *dvpp;

	/* the caller's export must belong to the zone we are running in */
	ASSERT3P((*exip)->exi_zone, ==, curzone);
	ASSERT((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp));

	VN_HOLD(dvp);
	dvp = untraverse(dvp);
	exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
	if (exi == NULL) {
		VN_RELE(dvp);
		return (-1);
	}

	/* nfs_vptoexi() must likewise hand back an export in our zone */
	ASSERT3P(exi->exi_zone, ==, curzone);
	exi_rele(*exip);
	*exip = exi;
	VN_RELE(*dvpp);
	*dvpp = dvp;

	return (0);
}
431 433 /*
432 434 * Directory lookup.
433 435 * Returns an fhandle and file attributes for file name in a directory.
434 436 */
435 437 /* ARGSUSED */
436 438 void
437 439 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
438 440 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
439 441 {
440 442 int error;
441 443 vnode_t *dvp;
442 444 vnode_t *vp;
443 445 struct vattr va;
444 446 fhandle_t *fhp = da->da_fhandle;
445 447 struct sec_ol sec = {0, 0};
446 448 bool_t publicfh_flag = FALSE, auth_weak = FALSE;
447 449 char *name;
448 450 struct sockaddr *ca;
449 451
450 452 /*
451 453 * Trusted Extension doesn't support NFSv2. MOUNT
452 454 * will reject v2 clients. Need to prevent v2 client
453 455 * access via WebNFS here.
454 456 */
455 457 if (is_system_labeled() && req->rq_vers == 2) {
456 458 dr->dr_status = NFSERR_ACCES;
457 459 return;
458 460 }
459 461
460 462 /*
461 463 * Disallow NULL paths
462 464 */
463 465 if (da->da_name == NULL || *da->da_name == '\0') {
464 466 dr->dr_status = NFSERR_ACCES;
465 467 return;
466 468 }
467 469
468 470 /*
469 471 * Allow lookups from the root - the default
470 472 * location of the public filehandle.
471 473 */
472 474 if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
473 475 dvp = ZONE_ROOTVP();
|
↓ open down ↓ |
40 lines elided |
↑ open up ↑ |
474 476 VN_HOLD(dvp);
475 477 } else {
476 478 dvp = nfs_fhtovp(fhp, exi);
477 479 if (dvp == NULL) {
478 480 dr->dr_status = NFSERR_STALE;
479 481 return;
480 482 }
481 483 }
482 484
483 485 exi_hold(exi);
486 + ASSERT3P(exi->exi_zone, ==, curzone);
484 487
485 488 /*
486 489 * Not allow lookup beyond root.
487 490 * If the filehandle matches a filehandle of the exi,
488 491 * then the ".." refers beyond the root of an exported filesystem.
489 492 */
490 493 if (strcmp(da->da_name, "..") == 0 &&
491 494 EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
492 495 if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
493 496 ((dvp->v_flag & VROOT) || VN_IS_CURZONEROOT(dvp))) {
494 497 /*
495 498 * special case for ".." and 'nohide'exported root
496 499 */
497 500 if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
498 501 error = NFSERR_ACCES;
499 502 goto out;
500 503 }
501 504 } else {
502 505 error = NFSERR_NOENT;
503 506 goto out;
504 507 }
505 508 }
506 509
507 510 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
508 511 name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
509 512 MAXPATHLEN);
510 513
511 514 if (name == NULL) {
512 515 error = NFSERR_ACCES;
513 516 goto out;
514 517 }
515 518
516 519 /*
517 520 * If the public filehandle is used then allow
518 521 * a multi-component lookup, i.e. evaluate
519 522 * a pathname and follow symbolic links if
520 523 * necessary.
521 524 *
522 525 * This may result in a vnode in another filesystem
523 526 * which is OK as long as the filesystem is exported.
524 527 */
525 528 if (PUBLIC_FH2(fhp)) {
526 529 publicfh_flag = TRUE;
527 530
528 531 exi_rele(exi);
529 532
530 533 error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
531 534 &sec);
532 535 } else {
533 536 /*
534 537 * Do a normal single component lookup.
535 538 */
536 539 error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
537 540 NULL, NULL, NULL);
538 541 }
539 542
540 543 if (name != da->da_name)
541 544 kmem_free(name, MAXPATHLEN);
542 545
543 546 if (error == 0 && vn_ismntpt(vp)) {
544 547 error = rfs_cross_mnt(&vp, &exi);
545 548 if (error)
546 549 VN_RELE(vp);
547 550 }
548 551
549 552 if (!error) {
550 553 va.va_mask = AT_ALL; /* we want everything */
551 554
552 555 error = rfs4_delegated_getattr(vp, &va, 0, cr);
553 556
554 557 /* check for overflows */
555 558 if (!error) {
556 559 acl_perm(vp, exi, &va, cr);
557 560 error = vattr_to_nattr(&va, &dr->dr_attr);
558 561 if (!error) {
559 562 if (sec.sec_flags & SEC_QUERY)
560 563 error = makefh_ol(&dr->dr_fhandle, exi,
561 564 sec.sec_index);
562 565 else {
563 566 error = makefh(&dr->dr_fhandle, vp,
564 567 exi);
565 568 if (!error && publicfh_flag &&
566 569 !chk_clnt_sec(exi, req))
567 570 auth_weak = TRUE;
568 571 }
569 572 }
570 573 }
571 574 VN_RELE(vp);
572 575 }
573 576
574 577 out:
575 578 VN_RELE(dvp);
576 579
577 580 if (exi != NULL)
578 581 exi_rele(exi);
579 582
580 583 /*
581 584 * If it's public fh, no 0x81, and client's flavor is
582 585 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
583 586 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
584 587 */
585 588 if (auth_weak)
586 589 dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
587 590 else
588 591 dr->dr_status = puterrno(error);
589 592 }
/* Extract the directory file handle from the LOOKUP arguments. */
void *
rfs_lookup_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}
595 598
/*
 * Read symbolic link.
 * Returns the string in the symbolic link at the given fhandle.
 *
 * On success rl->rl_data holds an NFS_MAXPATHLEN buffer allocated here
 * and freed later by rfs_rlfree(); on every failure path rl_data is
 * left NULL.  For NFS reparse points (referrals) an artificial symlink
 * target is synthesized instead of calling VOP_READLINK().
 */
/* ARGSUSED */
void
rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
    struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	struct vattr va;
	struct sockaddr *ca;
	char *name = NULL;
	int is_referral = 0;

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_STALE;
		return;
	}

	va.va_mask = AT_MODE;

	error = VOP_GETATTR(vp, &va, 0, cr, NULL);

	if (error) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = puterrno(error);
		return;
	}

	/* refuse objects under mandatory locking */
	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_ACCES;
		return;
	}

	/* We lied about the object type for a referral */
	if (vn_is_nfs_reparse(vp, cr))
		is_referral = 1;

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. BUGID 1138002.
	 */
	if (vp->v_type != VLNK && !is_referral) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_NXIO;
		return;
	}

	/*
	 * Allocate data for pathname. This will be freed by rfs_rlfree.
	 */
	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	if (is_referral) {
		char *s;
		size_t strsz;

		/* Get an artificial symlink based on a referral */
		s = build_symlink(vp, cr, &strsz);
		global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
		DTRACE_PROBE2(nfs2serv__func__referral__reflink,
		    vnode_t *, vp, char *, s);
		if (s == NULL)
			error = EINVAL;
		else {
			error = 0;
			(void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
			rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
			kmem_free(s, strsz);
		}

	} else {

		/*
		 * Set up io vector to read sym link data
		 */
		iov.iov_base = rl->rl_data;
		iov.iov_len = NFS_MAXPATHLEN;
		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_CACHED;
		uio.uio_loffset = (offset_t)0;
		uio.uio_resid = NFS_MAXPATHLEN;

		/*
		 * Do the readlink.
		 */
		error = VOP_READLINK(vp, &uio, cr, NULL);

		rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);

		if (!error)
			rl->rl_data[rl->rl_count] = '\0';

	}


	VN_RELE(vp);

	/* convert link text for the client (presumably charset mapping) */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	name = nfscmd_convname(ca, exi, rl->rl_data,
	    NFSCMD_CONV_OUTBOUND, MAXPATHLEN);

	if (name != NULL && name != rl->rl_data) {
		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
		rl->rl_data = name;
	}

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. UFS returns EINVAL if this is the case,
	 * so we do the mapping here. BUGID 1138002.
	 */
	if (error == EINVAL)
		rl->rl_status = NFSERR_NXIO;
	else
		rl->rl_status = puterrno(error);

}
/* READLINK's argument is itself the file handle; hand it back verbatim. */
void *
rfs_readlink_getfh(fhandle_t *fhp)
{
	return (fhp);
}
/*
 * Free data allocated by rfs_readlink
 */
void
rfs_rlfree(struct nfsrdlnres *rl)
{
	/* rl_data is an NFS_MAXPATHLEN buffer from rfs_readlink (or NULL) */
	if (rl->rl_data != NULL)
		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
}
740 743
/* Attach the RDMA write-chunk list to the read result (defined below). */
static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);

/*
 * Read data.
 * Returns some data read from the file at the given fhandle.
 *
 * Two reply paths: if the client supplied an RDMA write list
 * (ra->ra_wlist) the data is read straight into the client's chunk;
 * otherwise it is read into an mblk (rr->rr_mp) that is freed after
 * the reply is sent (see rfs_rdfree).  Delegation conflicts drop the
 * reply via T_WOULDBLOCK so the client retransmits.
 */
/* ARGSUSED */
void
rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	vnode_t *vp;
	int error;
	struct vattr va;
	struct iovec iov;
	struct uio uio;
	mblk_t *mp;
	int alloc_err = 0;
	int in_crit = 0;	/* nonzero once inside the nbmand critical region */
	caller_context_t ct;

	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
	if (vp == NULL) {
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_STALE;
		return;
	}

	if (vp->v_type != VREG) {
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ISDIR;
		return;
	}

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * Enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with write requests.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
		    0, NULL)) {
			nbl_end_crit(vp);
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = NFSERR_ACCES;
			return;
		}
		in_crit = 1;
	}

	error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		if (in_crit)
			nbl_end_crit(vp);
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;

		rr->rr_data = NULL;
		return;
	}

	va.va_mask = AT_ALL;

	error = VOP_GETATTR(vp, &va, 0, cr, &ct);

	if (error) {
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);

		return;
	}

	/*
	 * This is a kludge to allow reading of files created
	 * with no read permission. The owner of the file
	 * is always allowed to read it.
	 */
	if (crgetuid(cr) != va.va_uid) {
		error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);

		if (error) {
			/*
			 * Exec is the same as read over the net because
			 * of demand loading.
			 */
			error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
		}
		if (error) {
			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = puterrno(error);

			return;
		}
	}

	if (MANDLOCK(vp, va.va_mode)) {
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ACCES;

		return;
	}

	rr->rr_ok.rrok_wlist_len = 0;
	rr->rr_ok.rrok_wlist = NULL;

	/* reads at or beyond EOF succeed with zero bytes */
	if ((u_offset_t)ra->ra_offset >= va.va_size) {
		rr->rr_count = 0;
		rr->rr_data = NULL;
		/*
		 * In this case, status is NFS_OK, but there is no data
		 * to encode. So set rr_mp to NULL.
		 */
		rr->rr_mp = NULL;
		rr->rr_ok.rrok_wlist = ra->ra_wlist;
		if (rr->rr_ok.rrok_wlist)
			clist_zero_len(rr->rr_ok.rrok_wlist);
		goto done;
	}

	if (ra->ra_wlist) {
		/* RDMA path: read directly into the client's write chunk */
		mp = NULL;
		rr->rr_mp = NULL;
		(void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
		if (ra->ra_count > iov.iov_len) {
			rr->rr_data = NULL;
			rr->rr_status = NFSERR_INVAL;
			goto done;
		}
	} else {
		/*
		 * mp will contain the data to be sent out in the read reply.
		 * This will be freed after the reply has been sent out (by the
		 * driver).
		 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
		 * that the call to xdrmblk_putmblk() never fails.
		 */
		mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
		    &alloc_err);
		ASSERT(mp != NULL);
		ASSERT(alloc_err == 0);

		rr->rr_mp = mp;

		/*
		 * Set up io vector
		 */
		iov.iov_base = (caddr_t)mp->b_datap->db_base;
		iov.iov_len = ra->ra_count;
	}

	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)ra->ra_offset;
	uio.uio_resid = ra->ra_count;

	error = VOP_READ(vp, &uio, 0, cr, &ct);

	if (error) {
		if (mp)
			freeb(mp);

		/*
		 * check if a monitor detected a delegation conflict and
		 * mark as wouldblock so response is dropped
		 */
		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
			curthread->t_flag |= T_WOULDBLOCK;
		else
			rr->rr_status = puterrno(error);

		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;

		return;
	}

	/*
	 * Get attributes again so we can send the latest access
	 * time to the client side for its cache.
	 */
	va.va_mask = AT_ALL;

	error = VOP_GETATTR(vp, &va, 0, cr, &ct);

	if (error) {
		if (mp)
			freeb(mp);

		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);

		return;
	}

	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);

	if (mp) {
		rr->rr_data = (char *)mp->b_datap->db_base;
	} else {
		if (ra->ra_wlist) {
			rr->rr_data = (caddr_t)iov.iov_base;
			if (!rdma_setup_read_data2(ra, rr)) {
				rr->rr_data = NULL;
				rr->rr_status = puterrno(NFSERR_INVAL);
			}
		}
	}
done:
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
	if (in_crit)
		nbl_end_crit(vp);

	acl_perm(vp, exi, &va, cr);

	/* check for overflows */
	error = vattr_to_nattr(&va, &rr->rr_attr);

	VN_RELE(vp);

	rr->rr_status = puterrno(error);
}
997 1000
/*
 * Free data allocated by rfs_read
 */
void
rfs_rdfree(struct nfsrdresult *rr)
{
	mblk_t *mp;

	/* only a successful non-RDMA read leaves an mblk attached */
	if (rr->rr_status == NFS_OK) {
		mp = rr->rr_mp;
		if (mp != NULL)
			freeb(mp);
	}
}
1012 1015
/* Extract the file handle from the READ arguments. */
void *
rfs_read_getfh(struct nfsreadargs *ra)
{
	return (&ra->ra_fhandle);
}
1018 1021
/*
 * Number of iovec entries kept on the stack by rfs_write_sync();
 * larger mblk chains fall back to a kmem_alloc'd iovec array.
 */
#define	MAX_IOVECS	12

#ifdef DEBUG
/* counters: how often the on-stack iovec array was / was not sufficient */
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif
1025 1028
1026 1029 /*
1027 1030 * Write data to file.
1028 1031 * Returns attributes of a file after writing some data to it.
1029 1032 *
1030 1033 * Any changes made here, especially in error handling might have
1031 1034 * to also be done in rfs_write (which clusters write requests).
1032 1035 */
1033 1036 /* ARGSUSED */
1034 1037 void
1035 1038 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
1036 1039 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1037 1040 {
1038 1041 int error;
1039 1042 vnode_t *vp;
1040 1043 rlim64_t rlimit;
1041 1044 struct vattr va;
1042 1045 struct uio uio;
1043 1046 struct iovec iov[MAX_IOVECS];
1044 1047 mblk_t *m;
1045 1048 struct iovec *iovp;
1046 1049 int iovcnt;
1047 1050 cred_t *savecred;
1048 1051 int in_crit = 0;
1049 1052 caller_context_t ct;
1050 1053
1051 1054 vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1052 1055 if (vp == NULL) {
1053 1056 ns->ns_status = NFSERR_STALE;
1054 1057 return;
1055 1058 }
1056 1059
1057 1060 if (rdonly(ro, vp)) {
1058 1061 VN_RELE(vp);
1059 1062 ns->ns_status = NFSERR_ROFS;
1060 1063 return;
1061 1064 }
1062 1065
1063 1066 if (vp->v_type != VREG) {
1064 1067 VN_RELE(vp);
1065 1068 ns->ns_status = NFSERR_ISDIR;
1066 1069 return;
1067 1070 }
1068 1071
1069 1072 ct.cc_sysid = 0;
1070 1073 ct.cc_pid = 0;
1071 1074 ct.cc_caller_id = nfs2_srv_caller_id;
1072 1075 ct.cc_flags = CC_DONTBLOCK;
1073 1076
1074 1077 va.va_mask = AT_UID|AT_MODE;
1075 1078
1076 1079 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1077 1080
1078 1081 if (error) {
1079 1082 VN_RELE(vp);
1080 1083 ns->ns_status = puterrno(error);
1081 1084
1082 1085 return;
1083 1086 }
1084 1087
1085 1088 if (crgetuid(cr) != va.va_uid) {
1086 1089 /*
1087 1090 * This is a kludge to allow writes of files created
1088 1091 * with read only permission. The owner of the file
1089 1092 * is always allowed to write it.
1090 1093 */
1091 1094 error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
1092 1095
1093 1096 if (error) {
1094 1097 VN_RELE(vp);
1095 1098 ns->ns_status = puterrno(error);
1096 1099 return;
1097 1100 }
1098 1101 }
1099 1102
1100 1103 /*
1101 1104 * Can't access a mandatory lock file. This might cause
1102 1105 * the NFS service thread to block forever waiting for a
1103 1106 * lock to be released that will never be released.
1104 1107 */
1105 1108 if (MANDLOCK(vp, va.va_mode)) {
1106 1109 VN_RELE(vp);
1107 1110 ns->ns_status = NFSERR_ACCES;
1108 1111 return;
1109 1112 }
1110 1113
1111 1114 /*
1112 1115 * We have to enter the critical region before calling VOP_RWLOCK
1113 1116 * to avoid a deadlock with ufs.
1114 1117 */
1115 1118 if (nbl_need_check(vp)) {
1116 1119 nbl_start_crit(vp, RW_READER);
1117 1120 in_crit = 1;
1118 1121 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1119 1122 wa->wa_count, 0, NULL)) {
1120 1123 error = EACCES;
1121 1124 goto out;
1122 1125 }
1123 1126 }
1124 1127
1125 1128 error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1126 1129
1127 1130 /* check if a monitor detected a delegation conflict */
1128 1131 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1129 1132 goto out;
1130 1133 }
1131 1134
1132 1135 if (wa->wa_data || wa->wa_rlist) {
1133 1136 /* Do the RDMA thing if necessary */
1134 1137 if (wa->wa_rlist) {
1135 1138 iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1136 1139 iov[0].iov_len = wa->wa_count;
1137 1140 } else {
1138 1141 iov[0].iov_base = wa->wa_data;
1139 1142 iov[0].iov_len = wa->wa_count;
1140 1143 }
1141 1144 uio.uio_iov = iov;
1142 1145 uio.uio_iovcnt = 1;
1143 1146 uio.uio_segflg = UIO_SYSSPACE;
1144 1147 uio.uio_extflg = UIO_COPY_DEFAULT;
1145 1148 uio.uio_loffset = (offset_t)wa->wa_offset;
1146 1149 uio.uio_resid = wa->wa_count;
1147 1150 /*
1148 1151 * The limit is checked on the client. We
1149 1152 * should allow any size writes here.
1150 1153 */
1151 1154 uio.uio_llimit = curproc->p_fsz_ctl;
1152 1155 rlimit = uio.uio_llimit - wa->wa_offset;
1153 1156 if (rlimit < (rlim64_t)uio.uio_resid)
1154 1157 uio.uio_resid = (uint_t)rlimit;
1155 1158
1156 1159 /*
1157 1160 * for now we assume no append mode
1158 1161 */
1159 1162 /*
1160 1163 * We're changing creds because VM may fault and we need
1161 1164 * the cred of the current thread to be used if quota
1162 1165 * checking is enabled.
1163 1166 */
1164 1167 savecred = curthread->t_cred;
1165 1168 curthread->t_cred = cr;
1166 1169 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1167 1170 curthread->t_cred = savecred;
1168 1171 } else {
1169 1172
1170 1173 iovcnt = 0;
1171 1174 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1172 1175 iovcnt++;
1173 1176 if (iovcnt <= MAX_IOVECS) {
1174 1177 #ifdef DEBUG
1175 1178 rfs_write_sync_hits++;
1176 1179 #endif
1177 1180 iovp = iov;
1178 1181 } else {
1179 1182 #ifdef DEBUG
1180 1183 rfs_write_sync_misses++;
1181 1184 #endif
1182 1185 iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1183 1186 }
1184 1187 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1185 1188 uio.uio_iov = iovp;
1186 1189 uio.uio_iovcnt = iovcnt;
1187 1190 uio.uio_segflg = UIO_SYSSPACE;
1188 1191 uio.uio_extflg = UIO_COPY_DEFAULT;
1189 1192 uio.uio_loffset = (offset_t)wa->wa_offset;
1190 1193 uio.uio_resid = wa->wa_count;
1191 1194 /*
1192 1195 * The limit is checked on the client. We
1193 1196 * should allow any size writes here.
1194 1197 */
1195 1198 uio.uio_llimit = curproc->p_fsz_ctl;
1196 1199 rlimit = uio.uio_llimit - wa->wa_offset;
1197 1200 if (rlimit < (rlim64_t)uio.uio_resid)
1198 1201 uio.uio_resid = (uint_t)rlimit;
1199 1202
1200 1203 /*
1201 1204 * For now we assume no append mode.
1202 1205 */
1203 1206 /*
1204 1207 * We're changing creds because VM may fault and we need
1205 1208 * the cred of the current thread to be used if quota
1206 1209 * checking is enabled.
1207 1210 */
1208 1211 savecred = curthread->t_cred;
1209 1212 curthread->t_cred = cr;
1210 1213 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1211 1214 curthread->t_cred = savecred;
1212 1215
1213 1216 if (iovp != iov)
1214 1217 kmem_free(iovp, sizeof (*iovp) * iovcnt);
1215 1218 }
1216 1219
1217 1220 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1218 1221
1219 1222 if (!error) {
1220 1223 /*
1221 1224 * Get attributes again so we send the latest mod
1222 1225 * time to the client side for its cache.
1223 1226 */
1224 1227 va.va_mask = AT_ALL; /* now we want everything */
1225 1228
1226 1229 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1227 1230
1228 1231 /* check for overflows */
1229 1232 if (!error) {
1230 1233 acl_perm(vp, exi, &va, cr);
1231 1234 error = vattr_to_nattr(&va, &ns->ns_attr);
1232 1235 }
1233 1236 }
1234 1237
1235 1238 out:
1236 1239 if (in_crit)
1237 1240 nbl_end_crit(vp);
1238 1241 VN_RELE(vp);
1239 1242
1240 1243 /* check if a monitor detected a delegation conflict */
1241 1244 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1242 1245 /* mark as wouldblock so response is dropped */
1243 1246 curthread->t_flag |= T_WOULDBLOCK;
1244 1247 else
1245 1248 ns->ns_status = puterrno(error);
1246 1249
1247 1250 }
1248 1251
/*
 * One queued NFSv2 WRITE request.  Requests for the same file handle are
 * linked into a "cluster" (see rfs_async_write_list) so that contiguous
 * writes can be coalesced into a single VOP_WRITE by the thread that
 * owns the cluster.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* decoded WRITE arguments */
	struct nfsattrstat *ns;		/* response to fill in for this req */
	struct svc_req *req;		/* RPC request handle */
	cred_t *cr;			/* credentials of this request */
	bool_t ro;			/* TRUE if export is read-only */
	kthread_t *thread;		/* service thread parked on this req */
	struct rfs_async_write *list;	/* next request in the cluster */
};
1258 1261
/*
 * A cluster of pending WRITE requests against a single file handle.
 * Clusters are chained off the per-zone async_write_head and protected
 * by the per-zone async_write_lock; waiters sleep on cv until the
 * cluster owner fills in their ns_status and broadcasts.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;			/* file handle shared by the cluster */
	kcondvar_t cv;			/* waiters blocked until results posted */
	struct rfs_async_write *list;	/* requests, sorted by wa_offset */
	struct rfs_async_write_list *next;	/* next cluster */
};
1265 1268
/*
 * NOTE(review): rfs_write() below uses the per-zone cluster state
 * (nsrv->async_write_head / nsrv->async_write_lock), not these globals.
 * They appear to be vestiges of the pre-zone implementation — confirm
 * there are no remaining users before removing.
 */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

#define	MAXCLIOVECS	42	/* max on-stack iovecs for a cluster */
#define	RFSWRITE_INITVAL	(enum nfsstat) -1	/* "no status yet" */

#ifdef DEBUG
/* counters: clusters that fit in the on-stack iovec array vs. kmem_alloc */
static int rfs_write_hits = 0;
static int rfs_write_misses = 0;
#endif
1277 1280
/*
 * Write data to file.
 * Returns attributes of a file after writing some data to it.
 *
 * When per-zone write clustering is enabled, concurrent WRITEs to the
 * same file handle are gathered into a cluster: the first thread takes
 * the file's write lock (a deliberate delay that lets the cluster
 * grow), then performs the I/O for every queued request, coalescing
 * contiguous ranges into single VOP_WRITE calls, and finally posts
 * each request's status and wakes the parked service threads.
 */
void
rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;
	rlim64_t rlimit;
	struct vattr va;
	struct uio uio;
	struct rfs_async_write_list *lp;
	struct rfs_async_write_list *nlp;
	struct rfs_async_write *rp;
	struct rfs_async_write *nrp;
	struct rfs_async_write *trp;
	struct rfs_async_write *lrp;
	int data_written;
	int iovcnt;
	mblk_t *m;
	struct iovec *iovp;
	struct iovec *niovp;
	struct iovec iov[MAXCLIOVECS];
	int count;
	int rcount;
	uint_t off;
	uint_t len;
	struct rfs_async_write nrpsp;	/* our on-stack request node */
	struct rfs_async_write_list nlpsp;	/* our on-stack cluster head */
	ushort_t t_flag;
	cred_t *savecred;
	int in_crit = 0;
	caller_context_t ct;
	nfs_srv_t *nsrv;

	/* Sanity: the export (if any) must belong to the current zone. */
	ASSERT3P(curzone, ==, ((exi == NULL) ? curzone : exi->exi_zone));
	nsrv = zone_getspecific(rfs_zone_key, curzone);
	if (!nsrv->write_async) {
		rfs_write_sync(wa, ns, exi, req, cr, ro);
		return;
	}

	/*
	 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
	 * is considered an OK.
	 */
	ns->ns_status = RFSWRITE_INITVAL;

	nrp = &nrpsp;
	nrp->wa = wa;
	nrp->ns = ns;
	nrp->req = req;
	nrp->cr = cr;
	nrp->ro = ro;
	nrp->thread = curthread;

	/* The cluster owner will reference our on-stack node; don't swap. */
	ASSERT(curthread->t_schedflag & TS_DONT_SWAP);

	/*
	 * Look to see if there is already a cluster started
	 * for this file.
	 */
	mutex_enter(&nsrv->async_write_lock);
	for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
		if (bcmp(&wa->wa_fhandle, lp->fhp,
		    sizeof (fhandle_t)) == 0)
			break;
	}

	/*
	 * If lp is non-NULL, then there is already a cluster
	 * started.  We need to place ourselves in the cluster
	 * list in the right place as determined by starting
	 * offset.  Conflicts with non-blocking mandatory locked
	 * regions will be checked when the cluster is processed.
	 */
	if (lp != NULL) {
		rp = lp->list;
		trp = NULL;
		while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
			trp = rp;
			rp = rp->list;
		}
		nrp->list = rp;
		if (trp == NULL)
			lp->list = nrp;
		else
			trp->list = nrp;
		/* Park until the cluster owner posts our status. */
		while (nrp->ns->ns_status == RFSWRITE_INITVAL)
			cv_wait(&lp->cv, &nsrv->async_write_lock);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * No cluster started yet, start one and add ourselves
	 * to the list of clusters.
	 */
	nrp->list = NULL;

	nlp = &nlpsp;
	nlp->fhp = &wa->wa_fhandle;
	cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
	nlp->list = nrp;
	nlp->next = NULL;

	if (nsrv->async_write_head == NULL) {
		nsrv->async_write_head = nlp;
	} else {
		lp = nsrv->async_write_head;
		while (lp->next != NULL)
			lp = lp->next;
		lp->next = nlp;
	}
	mutex_exit(&nsrv->async_write_lock);

	/*
	 * Convert the file handle common to all of the requests
	 * in this cluster to a vnode.
	 */
	vp = nfs_fhtovp(&wa->wa_fhandle, exi);
	if (vp == NULL) {
		/* Unlink our cluster and fail every queued request. */
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		/* Propagate our drop-response flag to every waiter. */
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_STALE;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Can only write regular files.  Attempts to write any
	 * other file types fail with EISDIR.
	 */
	if (vp->v_type != VREG) {
		VN_RELE(vp);
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			rp->ns->ns_status = NFSERR_ISDIR;
			rp->thread->t_flag |= t_flag;
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Enter the critical region before calling VOP_RWLOCK, to avoid a
	 * deadlock with ufs.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		in_crit = 1;
	}

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * Lock the file for writing.  This operation provides
	 * the delay which allows clusters to grow.
	 */
	error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		if (in_crit)
			nbl_end_crit(vp);
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;
		mutex_enter(&nsrv->async_write_lock);
		if (nsrv->async_write_head == nlp)
			nsrv->async_write_head = nlp->next;
		else {
			lp = nsrv->async_write_head;
			while (lp->next != nlp)
				lp = lp->next;
			lp->next = nlp->next;
		}
		for (rp = nlp->list; rp != NULL; rp = rp->list) {
			if (rp->ns->ns_status == RFSWRITE_INITVAL) {
				rp->ns->ns_status = puterrno(error);
				rp->thread->t_flag |= T_WOULDBLOCK;
			}
		}
		cv_broadcast(&nlp->cv);
		mutex_exit(&nsrv->async_write_lock);

		return;
	}

	/*
	 * Disconnect this cluster from the list of clusters.
	 * The cluster that is being dealt with must be fixed
	 * in size after this point, so there is no reason
	 * to leave it on the list so that new requests can
	 * find it.
	 *
	 * The algorithm is that the first write request will
	 * create a cluster, convert the file handle to a
	 * vnode pointer, and then lock the file for writing.
	 * This request is not likely to be clustered with
	 * any others.  However, the next request will create
	 * a new cluster and be blocked in VOP_RWLOCK while
	 * the first request is being processed.  This delay
	 * will allow more requests to be clustered in this
	 * second cluster.
	 */
	mutex_enter(&nsrv->async_write_lock);
	if (nsrv->async_write_head == nlp)
		nsrv->async_write_head = nlp->next;
	else {
		lp = nsrv->async_write_head;
		while (lp->next != nlp)
			lp = lp->next;
		lp->next = nlp->next;
	}
	mutex_exit(&nsrv->async_write_lock);

	/*
	 * Step through the list of requests in this cluster.
	 * We need to check permissions to make sure that all
	 * of the requests have sufficient permission to write
	 * the file.  A cluster can be composed of requests
	 * from different clients and different users on each
	 * client.
	 *
	 * As a side effect, we also calculate the size of the
	 * byte range that this cluster encompasses.
	 */
	rp = nlp->list;
	off = rp->wa->wa_offset;
	len = (uint_t)0;
	do {
		if (rdonly(rp->ro, vp)) {
			rp->ns->ns_status = NFSERR_ROFS;
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}

		va.va_mask = AT_UID|AT_MODE;

		error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);

		if (!error) {
			if (crgetuid(rp->cr) != va.va_uid) {
				/*
				 * This is a kludge to allow writes of files
				 * created with read only permission.  The
				 * owner of the file is always allowed to
				 * write it.
				 */
				error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
			}
			/*
			 * Mandatory-locked files are refused: a blocking
			 * lock could stall the service thread forever.
			 */
			if (!error && MANDLOCK(vp, va.va_mode))
				error = EACCES;
		}

		/*
		 * Check for a conflict with a nbmand-locked region.
		 */
		if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
		    rp->wa->wa_count, 0, NULL)) {
			error = EACCES;
		}

		if (error) {
			rp->ns->ns_status = puterrno(error);
			t_flag = curthread->t_flag & T_WOULDBLOCK;
			rp->thread->t_flag |= t_flag;
			continue;
		}
		if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
			len = rp->wa->wa_offset + rp->wa->wa_count - off;
	} while ((rp = rp->list) != NULL);

	/*
	 * Step through the cluster attempting to gather as many
	 * requests which are contiguous as possible.  These
	 * contiguous requests are handled via one call to VOP_WRITE
	 * instead of different calls to VOP_WRITE.  We also keep
	 * track of the fact that any data was written.
	 */
	rp = nlp->list;
	data_written = 0;
	do {
		/*
		 * Skip any requests which are already marked as having an
		 * error.
		 */
		if (rp->ns->ns_status != RFSWRITE_INITVAL) {
			rp = rp->list;
			continue;
		}

		/*
		 * Count the number of iovec's which are required
		 * to handle this set of requests.  One iovec is
		 * needed for each data buffer, whether addressed
		 * by wa_data or by the b_rptr pointers in the
		 * mblk chains.
		 */
		iovcnt = 0;
		lrp = rp;
		for (;;) {
			if (lrp->wa->wa_data || lrp->wa->wa_rlist)
				iovcnt++;
			else {
				m = lrp->wa->wa_mblk;
				while (m != NULL) {
					iovcnt++;
					m = m->b_cont;
				}
			}
			/*
			 * Stop extending the run at the first request
			 * that is missing, already errored, or not
			 * byte-contiguous with its predecessor.
			 */
			if (lrp->list == NULL ||
			    lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
			    lrp->wa->wa_offset + lrp->wa->wa_count !=
			    lrp->list->wa->wa_offset) {
				lrp = lrp->list;
				break;
			}
			lrp = lrp->list;
		}

		if (iovcnt <= MAXCLIOVECS) {
#ifdef DEBUG
			rfs_write_hits++;
#endif
			niovp = iov;
		} else {
#ifdef DEBUG
			rfs_write_misses++;
#endif
			niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
		}
		/*
		 * Put together the scatter/gather iovecs.
		 */
		iovp = niovp;
		trp = rp;
		count = 0;
		do {
			if (trp->wa->wa_data || trp->wa->wa_rlist) {
				if (trp->wa->wa_rlist) {
					iovp->iov_base =
					    (char *)((trp->wa->wa_rlist)->
					    u.c_daddr3);
					iovp->iov_len = trp->wa->wa_count;
				} else {
					iovp->iov_base = trp->wa->wa_data;
					iovp->iov_len = trp->wa->wa_count;
				}
				iovp++;
			} else {
				m = trp->wa->wa_mblk;
				rcount = trp->wa->wa_count;
				while (m != NULL) {
					iovp->iov_base = (caddr_t)m->b_rptr;
					iovp->iov_len = (m->b_wptr - m->b_rptr);
					/* clamp last mblk to wa_count */
					rcount -= iovp->iov_len;
					if (rcount < 0)
						iovp->iov_len += rcount;
					iovp++;
					if (rcount <= 0)
						break;
					m = m->b_cont;
				}
			}
			count += trp->wa->wa_count;
			trp = trp->list;
		} while (trp != lrp);

		uio.uio_iov = niovp;
		uio.uio_iovcnt = iovcnt;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_DEFAULT;
		uio.uio_loffset = (offset_t)rp->wa->wa_offset;
		uio.uio_resid = count;
		/*
		 * The limit is checked on the client.  We
		 * should allow any size writes here.
		 */
		uio.uio_llimit = curproc->p_fsz_ctl;
		rlimit = uio.uio_llimit - rp->wa->wa_offset;
		if (rlimit < (rlim64_t)uio.uio_resid)
			uio.uio_resid = (uint_t)rlimit;

		/*
		 * For now we assume no append mode.
		 */

		/*
		 * We're changing creds because VM may fault
		 * and we need the cred of the current
		 * thread to be used if quota * checking is
		 * enabled.
		 *
		 * NOTE(review): t_cred is swapped to the cluster owner's
		 * cr while the write itself uses rp->cr (the individual
		 * request's cred) — confirm this asymmetry is intended.
		 */
		savecred = curthread->t_cred;
		curthread->t_cred = cr;
		error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
		curthread->t_cred = savecred;

		/* check if a monitor detected a delegation conflict */
		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
			/* mark as wouldblock so response is dropped */
			curthread->t_flag |= T_WOULDBLOCK;

		if (niovp != iov)
			kmem_free(niovp, sizeof (*niovp) * iovcnt);

		if (!error) {
			data_written = 1;
			/*
			 * Get attributes again so we send the latest mod
			 * time to the client side for its cache.
			 */
			va.va_mask = AT_ALL;	/* now we want everything */

			error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);

			if (!error)
				acl_perm(vp, exi, &va, rp->cr);
		}

		/*
		 * Fill in the status responses for each request
		 * which was just handled.  Also, copy the latest
		 * attributes in to the attribute responses if
		 * appropriate.
		 */
		t_flag = curthread->t_flag & T_WOULDBLOCK;
		do {
			rp->thread->t_flag |= t_flag;
			/* check for overflows */
			if (!error) {
				error = vattr_to_nattr(&va, &rp->ns->ns_attr);
			}
			rp->ns->ns_status = puterrno(error);
			rp = rp->list;
		} while (rp != lrp);
	} while (rp != NULL);

	/*
	 * If any data was written at all, then we need to flush
	 * the data and metadata to stable storage.
	 */
	if (data_written) {
		error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);

		if (!error) {
			error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
		}
	}

	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);

	if (in_crit)
		nbl_end_crit(vp);
	VN_RELE(vp);

	/*
	 * Post a status for any request not yet answered (e.g. the
	 * putpage/fsync result) and wake all parked service threads.
	 */
	t_flag = curthread->t_flag & T_WOULDBLOCK;
	mutex_enter(&nsrv->async_write_lock);
	for (rp = nlp->list; rp != NULL; rp = rp->list) {
		if (rp->ns->ns_status == RFSWRITE_INITVAL) {
			rp->ns->ns_status = puterrno(error);
			rp->thread->t_flag |= t_flag;
		}
	}
	cv_broadcast(&nlp->cv);
	mutex_exit(&nsrv->async_write_lock);

}
1778 1782
1779 1783 void *
1780 1784 rfs_write_getfh(struct nfswriteargs *wa)
1781 1785 {
1782 1786 return (&wa->wa_fhandle);
1783 1787 }
1784 1788
1785 1789 /*
1786 1790 * Create a file.
1787 1791 * Creates a file with given attributes and returns those attributes
1788 1792 * and an fhandle for the new file.
1789 1793 */
1790 1794 void
1791 1795 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1792 1796 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1793 1797 {
1794 1798 int error;
1795 1799 int lookuperr;
1796 1800 int in_crit = 0;
1797 1801 struct vattr va;
1798 1802 vnode_t *vp;
1799 1803 vnode_t *realvp;
1800 1804 vnode_t *dvp;
1801 1805 char *name = args->ca_da.da_name;
1802 1806 vnode_t *tvp = NULL;
1803 1807 int mode;
1804 1808 int lookup_ok;
1805 1809 bool_t trunc;
1806 1810 struct sockaddr *ca;
1807 1811
1808 1812 /*
1809 1813 * Disallow NULL paths
1810 1814 */
1811 1815 if (name == NULL || *name == '\0') {
1812 1816 dr->dr_status = NFSERR_ACCES;
1813 1817 return;
1814 1818 }
1815 1819
1816 1820 dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1817 1821 if (dvp == NULL) {
1818 1822 dr->dr_status = NFSERR_STALE;
1819 1823 return;
1820 1824 }
1821 1825
1822 1826 error = sattr_to_vattr(args->ca_sa, &va);
1823 1827 if (error) {
1824 1828 dr->dr_status = puterrno(error);
1825 1829 return;
1826 1830 }
1827 1831
1828 1832 /*
1829 1833 * Must specify the mode.
1830 1834 */
1831 1835 if (!(va.va_mask & AT_MODE)) {
1832 1836 VN_RELE(dvp);
1833 1837 dr->dr_status = NFSERR_INVAL;
1834 1838 return;
1835 1839 }
1836 1840
1837 1841 /*
1838 1842 * This is a completely gross hack to make mknod
1839 1843 * work over the wire until we can wack the protocol
1840 1844 */
1841 1845 if ((va.va_mode & IFMT) == IFCHR) {
1842 1846 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1843 1847 va.va_type = VFIFO; /* xtra kludge for named pipe */
1844 1848 else {
1845 1849 va.va_type = VCHR;
1846 1850 /*
1847 1851 * uncompress the received dev_t
1848 1852 * if the top half is zero indicating a request
1849 1853 * from an `older style' OS.
1850 1854 */
1851 1855 if ((va.va_size & 0xffff0000) == 0)
1852 1856 va.va_rdev = nfsv2_expdev(va.va_size);
1853 1857 else
1854 1858 va.va_rdev = (dev_t)va.va_size;
1855 1859 }
1856 1860 va.va_mask &= ~AT_SIZE;
1857 1861 } else if ((va.va_mode & IFMT) == IFBLK) {
1858 1862 va.va_type = VBLK;
1859 1863 /*
1860 1864 * uncompress the received dev_t
1861 1865 * if the top half is zero indicating a request
1862 1866 * from an `older style' OS.
1863 1867 */
1864 1868 if ((va.va_size & 0xffff0000) == 0)
1865 1869 va.va_rdev = nfsv2_expdev(va.va_size);
1866 1870 else
1867 1871 va.va_rdev = (dev_t)va.va_size;
1868 1872 va.va_mask &= ~AT_SIZE;
1869 1873 } else if ((va.va_mode & IFMT) == IFSOCK) {
1870 1874 va.va_type = VSOCK;
1871 1875 } else {
1872 1876 va.va_type = VREG;
1873 1877 }
1874 1878 va.va_mode &= ~IFMT;
1875 1879 va.va_mask |= AT_TYPE;
1876 1880
1877 1881 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1878 1882 name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1879 1883 MAXPATHLEN);
1880 1884 if (name == NULL) {
1881 1885 dr->dr_status = puterrno(EINVAL);
1882 1886 return;
1883 1887 }
1884 1888
1885 1889 /*
1886 1890 * Why was the choice made to use VWRITE as the mode to the
1887 1891 * call to VOP_CREATE ? This results in a bug. When a client
1888 1892 * opens a file that already exists and is RDONLY, the second
1889 1893 * open fails with an EACESS because of the mode.
1890 1894 * bug ID 1054648.
1891 1895 */
1892 1896 lookup_ok = 0;
1893 1897 mode = VWRITE;
1894 1898 if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1895 1899 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1896 1900 NULL, NULL, NULL);
1897 1901 if (!error) {
1898 1902 struct vattr at;
1899 1903
1900 1904 lookup_ok = 1;
1901 1905 at.va_mask = AT_MODE;
1902 1906 error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1903 1907 if (!error)
1904 1908 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1905 1909 VN_RELE(tvp);
1906 1910 tvp = NULL;
1907 1911 }
1908 1912 }
1909 1913
1910 1914 if (!lookup_ok) {
1911 1915 if (rdonly(ro, dvp)) {
1912 1916 error = EROFS;
1913 1917 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1914 1918 va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1915 1919 error = EPERM;
1916 1920 } else {
1917 1921 error = 0;
1918 1922 }
1919 1923 }
1920 1924
1921 1925 /*
1922 1926 * If file size is being modified on an already existing file
1923 1927 * make sure that there are no conflicting non-blocking mandatory
1924 1928 * locks in the region being manipulated. Return EACCES if there
1925 1929 * are conflicting locks.
1926 1930 */
1927 1931 if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1928 1932 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1929 1933 NULL, NULL, NULL);
1930 1934
1931 1935 if (!lookuperr &&
1932 1936 rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1933 1937 VN_RELE(tvp);
1934 1938 curthread->t_flag |= T_WOULDBLOCK;
1935 1939 goto out;
1936 1940 }
1937 1941
1938 1942 if (!lookuperr && nbl_need_check(tvp)) {
1939 1943 /*
1940 1944 * The file exists. Now check if it has any
1941 1945 * conflicting non-blocking mandatory locks
1942 1946 * in the region being changed.
1943 1947 */
1944 1948 struct vattr bva;
1945 1949 u_offset_t offset;
1946 1950 ssize_t length;
1947 1951
1948 1952 nbl_start_crit(tvp, RW_READER);
1949 1953 in_crit = 1;
1950 1954
1951 1955 bva.va_mask = AT_SIZE;
1952 1956 error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1953 1957 if (!error) {
1954 1958 if (va.va_size < bva.va_size) {
1955 1959 offset = va.va_size;
1956 1960 length = bva.va_size - va.va_size;
1957 1961 } else {
1958 1962 offset = bva.va_size;
1959 1963 length = va.va_size - bva.va_size;
1960 1964 }
1961 1965 if (length) {
1962 1966 if (nbl_conflict(tvp, NBL_WRITE,
1963 1967 offset, length, 0, NULL)) {
1964 1968 error = EACCES;
1965 1969 }
1966 1970 }
1967 1971 }
1968 1972 if (error) {
1969 1973 nbl_end_crit(tvp);
1970 1974 VN_RELE(tvp);
1971 1975 in_crit = 0;
1972 1976 }
1973 1977 } else if (tvp != NULL) {
1974 1978 VN_RELE(tvp);
1975 1979 }
1976 1980 }
1977 1981
1978 1982 if (!error) {
1979 1983 /*
1980 1984 * If filesystem is shared with nosuid the remove any
1981 1985 * setuid/setgid bits on create.
1982 1986 */
1983 1987 if (va.va_type == VREG &&
1984 1988 exi->exi_export.ex_flags & EX_NOSUID)
1985 1989 va.va_mode &= ~(VSUID | VSGID);
1986 1990
1987 1991 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1988 1992 NULL, NULL);
1989 1993
1990 1994 if (!error) {
1991 1995
1992 1996 if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1993 1997 trunc = TRUE;
1994 1998 else
1995 1999 trunc = FALSE;
1996 2000
1997 2001 if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1998 2002 VN_RELE(vp);
1999 2003 curthread->t_flag |= T_WOULDBLOCK;
2000 2004 goto out;
2001 2005 }
2002 2006 va.va_mask = AT_ALL;
2003 2007
2004 2008 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
2005 2009
2006 2010 /* check for overflows */
2007 2011 if (!error) {
2008 2012 acl_perm(vp, exi, &va, cr);
2009 2013 error = vattr_to_nattr(&va, &dr->dr_attr);
2010 2014 if (!error) {
2011 2015 error = makefh(&dr->dr_fhandle, vp,
2012 2016 exi);
2013 2017 }
2014 2018 }
2015 2019 /*
2016 2020 * Force modified metadata out to stable storage.
2017 2021 *
2018 2022 * if a underlying vp exists, pass it to VOP_FSYNC
2019 2023 */
2020 2024 if (VOP_REALVP(vp, &realvp, NULL) == 0)
2021 2025 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
2022 2026 else
2023 2027 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2024 2028 VN_RELE(vp);
2025 2029 }
2026 2030
2027 2031 if (in_crit) {
2028 2032 nbl_end_crit(tvp);
2029 2033 VN_RELE(tvp);
2030 2034 }
2031 2035 }
2032 2036
2033 2037 /*
2034 2038 * Force modified data and metadata out to stable storage.
2035 2039 */
2036 2040 (void) VOP_FSYNC(dvp, 0, cr, NULL);
2037 2041
2038 2042 out:
2039 2043
2040 2044 VN_RELE(dvp);
2041 2045
2042 2046 dr->dr_status = puterrno(error);
2043 2047
2044 2048 if (name != args->ca_da.da_name)
2045 2049 kmem_free(name, MAXPATHLEN);
2046 2050 }
2047 2051 void *
2048 2052 rfs_create_getfh(struct nfscreatargs *args)
2049 2053 {
2050 2054 return (args->ca_da.da_fhandle);
2051 2055 }
2052 2056
/*
 * Remove a file.
 * Remove named file from parent directory.
 *
 * The target is looked up first so that NFSv4 delegations and
 * non-blocking mandatory (share) reservations can be checked before
 * the actual VOP_REMOVE.
 */
/* ARGSUSED */
void
rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error = 0;
	vnode_t *vp;		/* directory containing the target */
	vnode_t *targvp;	/* the file being removed */
	int in_crit = 0;	/* nonzero while in NBL critical region */

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share
	 * reservation.
	 */
	error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(vp);
		*status = puterrno(error);
		return;
	}

	/*
	 * If the file is delegated to an v4 client, then initiate
	 * recall and drop this request (by setting T_WOULDBLOCK).
	 * The client will eventually re-transmit the request and
	 * (hopefully), by then, the v4 client will have returned
	 * the delegation.
	 */

	if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
		VN_RELE(vp);
		VN_RELE(targvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (nbl_need_check(targvp)) {
		nbl_start_crit(targvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(targvp);
	VN_RELE(targvp);
	VN_RELE(vp);

	*status = puterrno(error);

}
2138 2142
2139 2143 void *
2140 2144 rfs_remove_getfh(struct nfsdiropargs *da)
2141 2145 {
2142 2146 return (da->da_fhandle);
2143 2147 }
2144 2148
/*
 * rename a file
 * Give a file (from) a new name (to).
 */
/* ARGSUSED */
void
rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error = 0;
	vnode_t *fromvp;		/* source parent directory */
	vnode_t *tovp;			/* target parent directory */
	struct exportinfo *to_exi;
	fhandle_t *fh;
	vnode_t *srcvp;			/* the file being renamed */
	vnode_t *targvp;		/* existing file at the target name */
	int in_crit = 0;		/* holding srcvp's NBMAND crit region? */

	/* Translate the source directory file handle. */
	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	/*
	 * The target must be in the same export as the source; an NFSv2
	 * rename may not cross filesystem (export) boundaries.
	 */
	fh = args->rna_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}
	exi_rele(to_exi);

	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		return;
	}

	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		return;
	}

	/* Both file handles must name directories (the two parents). */
	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}

	if (rdonly(ro, tovp)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 */
	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = puterrno(error);
		return;
	}

	/* Check for delegations on the source file */

	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
		/*
		 * Delegated to an NFSv4 client: drop the request with
		 * T_WOULDBLOCK so the client retransmits after the
		 * delegation has (hopefully) been recalled.
		 */
		VN_RELE(tovp);
		VN_RELE(fromvp);
		VN_RELE(srcvp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Check for delegation on the file being renamed over, if it exists */

	if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
	    NULL, NULL, NULL) == 0) {

		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
			VN_RELE(tovp);
			VN_RELE(fromvp);
			VN_RELE(srcvp);
			VN_RELE(targvp);
			curthread->t_flag |= T_WOULDBLOCK;
			return;
		}
		VN_RELE(targvp);
	}


	if (nbl_need_check(srcvp)) {
		nbl_start_crit(srcvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_RENAME(fromvp, args->rna_from.da_name,
	    tovp, args->rna_to.da_name, cr, NULL, 0);

	/* Update the vnode's cached path to reflect the new name. */
	if (error == 0)
		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
		    strlen(args->rna_to.da_name));

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(srcvp);
	VN_RELE(srcvp);
	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

}
2288 2292 void *
2289 2293 rfs_rename_getfh(struct nfsrnmargs *args)
2290 2294 {
2291 2295 return (args->rna_from.da_fhandle);
2292 2296 }
2293 2297
2294 2298 /*
2295 2299 * Link to a file.
2296 2300 * Create a file (to) which is a hard link to the given file (from).
2297 2301 */
2298 2302 /* ARGSUSED */
2299 2303 void
2300 2304 rfs_link(struct nfslinkargs *args, enum nfsstat *status,
2301 2305 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2302 2306 {
2303 2307 int error;
2304 2308 vnode_t *fromvp;
2305 2309 vnode_t *tovp;
2306 2310 struct exportinfo *to_exi;
2307 2311 fhandle_t *fh;
2308 2312
2309 2313 fromvp = nfs_fhtovp(args->la_from, exi);
2310 2314 if (fromvp == NULL) {
2311 2315 *status = NFSERR_STALE;
2312 2316 return;
2313 2317 }
2314 2318
2315 2319 fh = args->la_to.da_fhandle;
2316 2320 to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2317 2321 if (to_exi == NULL) {
2318 2322 VN_RELE(fromvp);
2319 2323 *status = NFSERR_ACCES;
2320 2324 return;
2321 2325 }
2322 2326 exi_rele(to_exi);
2323 2327
2324 2328 if (to_exi != exi) {
2325 2329 VN_RELE(fromvp);
2326 2330 *status = NFSERR_XDEV;
2327 2331 return;
2328 2332 }
2329 2333
2330 2334 tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2331 2335 if (tovp == NULL) {
2332 2336 VN_RELE(fromvp);
2333 2337 *status = NFSERR_STALE;
2334 2338 return;
2335 2339 }
2336 2340
2337 2341 if (tovp->v_type != VDIR) {
2338 2342 VN_RELE(tovp);
2339 2343 VN_RELE(fromvp);
2340 2344 *status = NFSERR_NOTDIR;
2341 2345 return;
2342 2346 }
2343 2347 /*
2344 2348 * Disallow NULL paths
2345 2349 */
2346 2350 if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
2347 2351 VN_RELE(tovp);
2348 2352 VN_RELE(fromvp);
2349 2353 *status = NFSERR_ACCES;
2350 2354 return;
2351 2355 }
2352 2356
2353 2357 if (rdonly(ro, tovp)) {
2354 2358 VN_RELE(tovp);
2355 2359 VN_RELE(fromvp);
2356 2360 *status = NFSERR_ROFS;
2357 2361 return;
2358 2362 }
2359 2363
2360 2364 error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2361 2365
2362 2366 /*
2363 2367 * Force modified data and metadata out to stable storage.
2364 2368 */
2365 2369 (void) VOP_FSYNC(tovp, 0, cr, NULL);
2366 2370 (void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2367 2371
2368 2372 VN_RELE(tovp);
2369 2373 VN_RELE(fromvp);
2370 2374
2371 2375 *status = puterrno(error);
2372 2376
2373 2377 }
2374 2378 void *
2375 2379 rfs_link_getfh(struct nfslinkargs *args)
2376 2380 {
2377 2381 return (args->la_from);
2378 2382 }
2379 2383
2380 2384 /*
2381 2385 * Symbolicly link to a file.
2382 2386 * Create a file (to) with the given attributes which is a symbolic link
2383 2387 * to the given path name (to).
2384 2388 */
2385 2389 void
2386 2390 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2387 2391 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2388 2392 {
2389 2393 int error;
2390 2394 struct vattr va;
2391 2395 vnode_t *vp;
2392 2396 vnode_t *svp;
2393 2397 int lerror;
2394 2398 struct sockaddr *ca;
2395 2399 char *name = NULL;
2396 2400
2397 2401 /*
2398 2402 * Disallow NULL paths
2399 2403 */
2400 2404 if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2401 2405 *status = NFSERR_ACCES;
2402 2406 return;
2403 2407 }
2404 2408
2405 2409 vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2406 2410 if (vp == NULL) {
2407 2411 *status = NFSERR_STALE;
2408 2412 return;
2409 2413 }
2410 2414
2411 2415 if (rdonly(ro, vp)) {
2412 2416 VN_RELE(vp);
2413 2417 *status = NFSERR_ROFS;
2414 2418 return;
2415 2419 }
2416 2420
2417 2421 error = sattr_to_vattr(args->sla_sa, &va);
2418 2422 if (error) {
2419 2423 VN_RELE(vp);
2420 2424 *status = puterrno(error);
2421 2425 return;
2422 2426 }
2423 2427
2424 2428 if (!(va.va_mask & AT_MODE)) {
2425 2429 VN_RELE(vp);
2426 2430 *status = NFSERR_INVAL;
2427 2431 return;
2428 2432 }
2429 2433
2430 2434 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2431 2435 name = nfscmd_convname(ca, exi, args->sla_tnm,
2432 2436 NFSCMD_CONV_INBOUND, MAXPATHLEN);
2433 2437
2434 2438 if (name == NULL) {
2435 2439 *status = NFSERR_ACCES;
2436 2440 return;
2437 2441 }
2438 2442
2439 2443 va.va_type = VLNK;
2440 2444 va.va_mask |= AT_TYPE;
2441 2445
2442 2446 error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2443 2447
2444 2448 /*
2445 2449 * Force new data and metadata out to stable storage.
2446 2450 */
2447 2451 lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2448 2452 NULL, cr, NULL, NULL, NULL);
2449 2453
2450 2454 if (!lerror) {
2451 2455 (void) VOP_FSYNC(svp, 0, cr, NULL);
2452 2456 VN_RELE(svp);
2453 2457 }
2454 2458
2455 2459 /*
2456 2460 * Force modified data and metadata out to stable storage.
2457 2461 */
2458 2462 (void) VOP_FSYNC(vp, 0, cr, NULL);
2459 2463
2460 2464 VN_RELE(vp);
2461 2465
2462 2466 *status = puterrno(error);
2463 2467 if (name != args->sla_tnm)
2464 2468 kmem_free(name, MAXPATHLEN);
2465 2469
2466 2470 }
2467 2471 void *
2468 2472 rfs_symlink_getfh(struct nfsslargs *args)
2469 2473 {
2470 2474 return (args->sla_from.da_fhandle);
2471 2475 }
2472 2476
2473 2477 /*
2474 2478 * Make a directory.
2475 2479 * Create a directory with the given name, parent directory, and attributes.
2476 2480 * Returns a file handle and attributes for the new directory.
2477 2481 */
2478 2482 /* ARGSUSED */
2479 2483 void
2480 2484 rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
2481 2485 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2482 2486 {
2483 2487 int error;
2484 2488 struct vattr va;
2485 2489 vnode_t *dvp = NULL;
2486 2490 vnode_t *vp;
2487 2491 char *name = args->ca_da.da_name;
2488 2492
2489 2493 /*
2490 2494 * Disallow NULL paths
2491 2495 */
2492 2496 if (name == NULL || *name == '\0') {
2493 2497 dr->dr_status = NFSERR_ACCES;
2494 2498 return;
2495 2499 }
2496 2500
2497 2501 vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
2498 2502 if (vp == NULL) {
2499 2503 dr->dr_status = NFSERR_STALE;
2500 2504 return;
2501 2505 }
2502 2506
2503 2507 if (rdonly(ro, vp)) {
2504 2508 VN_RELE(vp);
2505 2509 dr->dr_status = NFSERR_ROFS;
2506 2510 return;
2507 2511 }
2508 2512
2509 2513 error = sattr_to_vattr(args->ca_sa, &va);
2510 2514 if (error) {
2511 2515 VN_RELE(vp);
2512 2516 dr->dr_status = puterrno(error);
2513 2517 return;
2514 2518 }
2515 2519
2516 2520 if (!(va.va_mask & AT_MODE)) {
2517 2521 VN_RELE(vp);
2518 2522 dr->dr_status = NFSERR_INVAL;
2519 2523 return;
2520 2524 }
2521 2525
2522 2526 va.va_type = VDIR;
2523 2527 va.va_mask |= AT_TYPE;
2524 2528
2525 2529 error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2526 2530
2527 2531 if (!error) {
2528 2532 /*
2529 2533 * Attribtutes of the newly created directory should
2530 2534 * be returned to the client.
2531 2535 */
2532 2536 va.va_mask = AT_ALL; /* We want everything */
2533 2537 error = VOP_GETATTR(dvp, &va, 0, cr, NULL);
2534 2538
2535 2539 /* check for overflows */
2536 2540 if (!error) {
2537 2541 acl_perm(vp, exi, &va, cr);
2538 2542 error = vattr_to_nattr(&va, &dr->dr_attr);
2539 2543 if (!error) {
2540 2544 error = makefh(&dr->dr_fhandle, dvp, exi);
2541 2545 }
2542 2546 }
2543 2547 /*
2544 2548 * Force new data and metadata out to stable storage.
2545 2549 */
2546 2550 (void) VOP_FSYNC(dvp, 0, cr, NULL);
2547 2551 VN_RELE(dvp);
2548 2552 }
2549 2553
2550 2554 /*
2551 2555 * Force modified data and metadata out to stable storage.
2552 2556 */
2553 2557 (void) VOP_FSYNC(vp, 0, cr, NULL);
2554 2558
2555 2559 VN_RELE(vp);
2556 2560
2557 2561 dr->dr_status = puterrno(error);
2558 2562
2559 2563 }
2560 2564 void *
2561 2565 rfs_mkdir_getfh(struct nfscreatargs *args)
2562 2566 {
2563 2567 return (args->ca_da.da_fhandle);
2564 2568 }
2565 2569
2566 2570 /*
2567 2571 * Remove a directory.
2568 2572 * Remove the given directory name from the given parent directory.
2569 2573 */
2570 2574 /* ARGSUSED */
2571 2575 void
2572 2576 rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
2573 2577 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2574 2578 {
2575 2579 int error;
2576 2580 vnode_t *vp;
2577 2581
2578 2582 /*
2579 2583 * Disallow NULL paths
2580 2584 */
2581 2585 if (da->da_name == NULL || *da->da_name == '\0') {
2582 2586 *status = NFSERR_ACCES;
2583 2587 return;
2584 2588 }
2585 2589
2586 2590 vp = nfs_fhtovp(da->da_fhandle, exi);
2587 2591 if (vp == NULL) {
2588 2592 *status = NFSERR_STALE;
2589 2593 return;
2590 2594 }
2591 2595
2592 2596 if (rdonly(ro, vp)) {
2593 2597 VN_RELE(vp);
2594 2598 *status = NFSERR_ROFS;
2595 2599 return;
2596 2600 }
2597 2601
2598 2602 /*
2599 2603 * VOP_RMDIR takes a third argument (the current
2600 2604 * directory of the process). That's because someone
2601 2605 * wants to return EINVAL if one tries to remove ".".
2602 2606 * Of course, NFS servers have no idea what their
2603 2607 * clients' current directories are. We fake it by
2604 2608 * supplying a vnode known to exist and illegal to
2605 2609 * remove.
2606 2610 */
2607 2611 error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);
2608 2612
2609 2613 /*
2610 2614 * Force modified data and metadata out to stable storage.
2611 2615 */
2612 2616 (void) VOP_FSYNC(vp, 0, cr, NULL);
2613 2617
2614 2618 VN_RELE(vp);
2615 2619
2616 2620 /*
2617 2621 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
2618 2622 * if the directory is not empty. A System V NFS server
2619 2623 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
2620 2624 * over the wire.
2621 2625 */
2622 2626 if (error == EEXIST)
2623 2627 *status = NFSERR_NOTEMPTY;
2624 2628 else
2625 2629 *status = puterrno(error);
2626 2630
2627 2631 }
2628 2632 void *
2629 2633 rfs_rmdir_getfh(struct nfsdiropargs *da)
2630 2634 {
2631 2635 return (da->da_fhandle);
2632 2636 }
2633 2637
2634 2638 /* ARGSUSED */
2635 2639 void
2636 2640 rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2637 2641 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2638 2642 {
2639 2643 int error;
2640 2644 int iseof;
2641 2645 struct iovec iov;
2642 2646 struct uio uio;
2643 2647 vnode_t *vp;
2644 2648 char *ndata = NULL;
2645 2649 struct sockaddr *ca;
2646 2650 size_t nents;
2647 2651 int ret;
2648 2652
2649 2653 vp = nfs_fhtovp(&rda->rda_fh, exi);
2650 2654 if (vp == NULL) {
2651 2655 rd->rd_entries = NULL;
2652 2656 rd->rd_status = NFSERR_STALE;
2653 2657 return;
2654 2658 }
2655 2659
2656 2660 if (vp->v_type != VDIR) {
2657 2661 VN_RELE(vp);
2658 2662 rd->rd_entries = NULL;
2659 2663 rd->rd_status = NFSERR_NOTDIR;
2660 2664 return;
2661 2665 }
2662 2666
2663 2667 (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2664 2668
2665 2669 error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2666 2670
2667 2671 if (error) {
2668 2672 rd->rd_entries = NULL;
2669 2673 goto bad;
2670 2674 }
2671 2675
2672 2676 if (rda->rda_count == 0) {
2673 2677 rd->rd_entries = NULL;
2674 2678 rd->rd_size = 0;
2675 2679 rd->rd_eof = FALSE;
2676 2680 goto bad;
2677 2681 }
2678 2682
2679 2683 rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
2680 2684
2681 2685 /*
2682 2686 * Allocate data for entries. This will be freed by rfs_rddirfree.
2683 2687 */
2684 2688 rd->rd_bufsize = (uint_t)rda->rda_count;
2685 2689 rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
2686 2690
2687 2691 /*
2688 2692 * Set up io vector to read directory data
2689 2693 */
2690 2694 iov.iov_base = (caddr_t)rd->rd_entries;
2691 2695 iov.iov_len = rda->rda_count;
2692 2696 uio.uio_iov = &iov;
2693 2697 uio.uio_iovcnt = 1;
2694 2698 uio.uio_segflg = UIO_SYSSPACE;
2695 2699 uio.uio_extflg = UIO_COPY_CACHED;
2696 2700 uio.uio_loffset = (offset_t)rda->rda_offset;
2697 2701 uio.uio_resid = rda->rda_count;
2698 2702
2699 2703 /*
2700 2704 * read directory
2701 2705 */
2702 2706 error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
2703 2707
2704 2708 /*
2705 2709 * Clean up
2706 2710 */
2707 2711 if (!error) {
2708 2712 /*
2709 2713 * set size and eof
2710 2714 */
2711 2715 if (uio.uio_resid == rda->rda_count) {
2712 2716 rd->rd_size = 0;
2713 2717 rd->rd_eof = TRUE;
2714 2718 } else {
2715 2719 rd->rd_size = (uint32_t)(rda->rda_count -
2716 2720 uio.uio_resid);
2717 2721 rd->rd_eof = iseof ? TRUE : FALSE;
2718 2722 }
2719 2723 }
2720 2724
2721 2725 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2722 2726 nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
2723 2727 ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
2724 2728 rda->rda_count, &ndata);
2725 2729
2726 2730 if (ret != 0) {
2727 2731 size_t dropbytes;
2728 2732 /*
2729 2733 * We had to drop one or more entries in order to fit
2730 2734 * during the character conversion. We need to patch
2731 2735 * up the size and eof info.
2732 2736 */
2733 2737 if (rd->rd_eof)
2734 2738 rd->rd_eof = FALSE;
2735 2739 dropbytes = nfscmd_dropped_entrysize(
2736 2740 (struct dirent64 *)rd->rd_entries, nents, ret);
2737 2741 rd->rd_size -= dropbytes;
2738 2742 }
2739 2743 if (ndata == NULL) {
2740 2744 ndata = (char *)rd->rd_entries;
2741 2745 } else if (ndata != (char *)rd->rd_entries) {
2742 2746 kmem_free(rd->rd_entries, rd->rd_bufsize);
2743 2747 rd->rd_entries = (void *)ndata;
2744 2748 rd->rd_bufsize = rda->rda_count;
2745 2749 }
2746 2750
2747 2751 bad:
2748 2752 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2749 2753
2750 2754 #if 0 /* notyet */
2751 2755 /*
2752 2756 * Don't do this. It causes local disk writes when just
2753 2757 * reading the file and the overhead is deemed larger
2754 2758 * than the benefit.
2755 2759 */
2756 2760 /*
2757 2761 * Force modified metadata out to stable storage.
2758 2762 */
2759 2763 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2760 2764 #endif
2761 2765
2762 2766 VN_RELE(vp);
2763 2767
2764 2768 rd->rd_status = puterrno(error);
2765 2769
2766 2770 }
2767 2771 void *
2768 2772 rfs_readdir_getfh(struct nfsrddirargs *rda)
2769 2773 {
2770 2774 return (&rda->rda_fh);
2771 2775 }
2772 2776 void
2773 2777 rfs_rddirfree(struct nfsrddirres *rd)
2774 2778 {
2775 2779 if (rd->rd_entries != NULL)
2776 2780 kmem_free(rd->rd_entries, rd->rd_bufsize);
2777 2781 }
2778 2782
2779 2783 /* ARGSUSED */
2780 2784 void
2781 2785 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2782 2786 struct svc_req *req, cred_t *cr, bool_t ro)
2783 2787 {
2784 2788 int error;
2785 2789 struct statvfs64 sb;
2786 2790 vnode_t *vp;
2787 2791
2788 2792 vp = nfs_fhtovp(fh, exi);
2789 2793 if (vp == NULL) {
2790 2794 fs->fs_status = NFSERR_STALE;
2791 2795 return;
2792 2796 }
2793 2797
2794 2798 error = VFS_STATVFS(vp->v_vfsp, &sb);
2795 2799
2796 2800 if (!error) {
2797 2801 fs->fs_tsize = nfstsize();
2798 2802 fs->fs_bsize = sb.f_frsize;
2799 2803 fs->fs_blocks = sb.f_blocks;
2800 2804 fs->fs_bfree = sb.f_bfree;
2801 2805 fs->fs_bavail = sb.f_bavail;
2802 2806 }
2803 2807
2804 2808 VN_RELE(vp);
2805 2809
2806 2810 fs->fs_status = puterrno(error);
2807 2811
2808 2812 }
2809 2813 void *
2810 2814 rfs_statfs_getfh(fhandle_t *fh)
2811 2815 {
2812 2816 return (fh);
2813 2817 }
2814 2818
/*
 * Convert the over-the-wire NFSv2 settable attributes (nfssattr) into a
 * vattr, setting a va_mask bit only for each field the client actually
 * supplied.  On the wire, an all-ones value means "field not set".
 * Returns 0, or EOVERFLOW on 32-bit kernels when a time value cannot be
 * represented in a time_t.
 */
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short.  When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	/* Both tv_sec and tv_usec must be set for the time to count. */
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}
2881 2885
/*
 * Map vtype_t values (used as the index) to NFSv2 over-the-wire file
 * types; vnode types with no NFSv2 representation map to 0.  Used by
 * vattr_to_nattr() below.
 */
static const enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
2885 2889
/*
 * check the following fields for overflow: nodeid, size, and time.
 * There could be a problem when converting 64-bit LP64 fields
 * into 32-bit ones.  Return an error if there is an overflow.
 *
 * Returns 0 on success, EFBIG if the nodeid or a regular-file/directory
 * size does not fit in 32 bits, or EOVERFLOW if a timestamp does not
 * fit in the 32-bit wire format.
 */
int
vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
{
	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
	na->na_type = vt_to_nf[vap->va_type];

	/* Pass the "unknown" sentinel through, widened to 32 bits. */
	if (vap->va_mode == (unsigned short) -1)
		na->na_mode = (uint32_t)-1;
	else
		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;

	if (vap->va_uid == (unsigned short)(-1))
		na->na_uid = (uint32_t)(-1);
	else if (vap->va_uid == UID_NOBODY)
		na->na_uid = (uint32_t)NFS_UID_NOBODY;
	else
		na->na_uid = vap->va_uid;

	if (vap->va_gid == (unsigned short)(-1))
		na->na_gid = (uint32_t)-1;
	else if (vap->va_gid == GID_NOBODY)
		na->na_gid = (uint32_t)NFS_GID_NOBODY;
	else
		na->na_gid = vap->va_gid;

	/*
	 * Do we need to check fsid for overflow?  It is 64-bit in the
	 * vattr, but are bigger than 32 bit values supported?
	 */
	na->na_fsid = vap->va_fsid;

	na->na_nodeid = vap->va_nodeid;

	/*
	 * Check to make sure that the nodeid is representable over the
	 * wire without losing bits.
	 */
	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
		return (EFBIG);
	na->na_nlink = vap->va_nlink;

	/*
	 * Check for big files here, instead of at the caller.  See
	 * comments in cstat for large special file explanation.
	 */
	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
			return (EFBIG);
		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
			/* UNKNOWN_SIZE | OVERFLOW */
			na->na_size = MAXOFF32_T;
		} else
			na->na_size = vap->va_size;
	} else
		na->na_size = vap->va_size;

	/*
	 * If the vnode times overflow the 32-bit times that NFS2
	 * uses on the wire then return an error.
	 */
	if (!NFS_VAP_TIME_OK(vap)) {
		return (EOVERFLOW);
	}
	na->na_atime.tv_sec = vap->va_atime.tv_sec;
	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;

	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;

	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;

	/*
	 * If the dev_t will fit into 16 bits then compress
	 * it, otherwise leave it alone.  See comments in
	 * nfs_client.c.
	 */
	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
	else
		(void) cmpldev(&na->na_rdev, vap->va_rdev);

	na->na_blocks = vap->va_nblocks;
	na->na_blocksize = vap->va_blksize;

	/*
	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
	 * VFIFO type to the special over-the-wire type.  (see note in nfs.h)
	 *
	 * BUYER BEWARE:
	 *  If you are porting the NFS to a non-Sun server, you probably
	 *  don't want to include the following block of code.  The
	 *  over-the-wire special file types will be changing with the
	 *  NFS Protocol Revision.
	 */
	if (vap->va_type == VFIFO)
		NA_SETFIFO(na);
	return (0);
}
2992 2996
/*
 * acl v2 support: returns approximate permission.
 *	default: returns minimal permission (more restrictive)
 *	aclok: returns maximal permission (less restrictive)
 * This routine changes the permissions that are already in *va.
 * If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
 * CLASS_OBJ is always the same as GROUP_OBJ entry.
 *
 * NOTE(review): mask_perm and other_orig are only assigned when the ACL
 * contains CLASS_OBJ/OTHER_OBJ entries; this code assumes a non-trivial
 * POSIX-draft ACL always includes both -- confirm against VOP_GETSECATTR
 * providers.
 */
static void
acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
{
	vsecattr_t vsa;
	int aclcnt;
	aclent_t *aclentp;
	mode_t mask_perm;	/* CLASS_OBJ (mask) permission bits */
	mode_t grp_perm;	/* synthesized group bits */
	mode_t other_perm;	/* synthesized other bits */
	mode_t other_orig;	/* original OTHER_OBJ bits */
	int error;

	/* dont care default acl */
	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
	error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);

	if (!error) {
		aclcnt = vsa.vsa_aclcnt;
		if (aclcnt > MIN_ACL_ENTRIES) {
			/* non-trivial ACL */
			aclentp = vsa.vsa_aclentp;
			if (exi->exi_export.ex_flags & EX_ACLOK) {
				/* maximal permissions */
				grp_perm = 0;
				other_perm = 0;
				for (; aclcnt > 0; aclcnt--, aclentp++) {
					switch (aclentp->a_type) {
					case USER_OBJ:
						break;
					case USER:
						grp_perm |=
						    aclentp->a_perm << 3;
						other_perm |= aclentp->a_perm;
						break;
					case GROUP_OBJ:
						grp_perm |=
						    aclentp->a_perm << 3;
						break;
					case GROUP:
						other_perm |= aclentp->a_perm;
						break;
					case OTHER_OBJ:
						other_orig = aclentp->a_perm;
						break;
					case CLASS_OBJ:
						mask_perm = aclentp->a_perm;
						break;
					default:
						break;
					}
				}
				/* Apply the mask, then restore OTHER_OBJ. */
				grp_perm &= mask_perm << 3;
				other_perm &= mask_perm;
				other_perm |= other_orig;

			} else {
				/* minimal permissions */
				grp_perm = 070;
				other_perm = 07;
				for (; aclcnt > 0; aclcnt--, aclentp++) {
					switch (aclentp->a_type) {
					case USER_OBJ:
						break;
					case USER:
					case CLASS_OBJ:
						grp_perm &=
						    aclentp->a_perm << 3;
						other_perm &=
						    aclentp->a_perm;
						break;
					case GROUP_OBJ:
						grp_perm &=
						    aclentp->a_perm << 3;
						break;
					case GROUP:
						other_perm &=
						    aclentp->a_perm;
						break;
					case OTHER_OBJ:
						other_perm &=
						    aclentp->a_perm;
						break;
					default:
						break;
					}
				}
			}
			/* copy to va */
			va->va_mode &= ~077;
			va->va_mode |= grp_perm | other_perm;
		}
		if (vsa.vsa_aclcnt)
			kmem_free(vsa.vsa_aclentp,
			    vsa.vsa_aclcnt * sizeof (aclent_t));
	}
}
3097 3101
3098 3102 void
3099 3103 rfs_srvrinit(void)
3100 3104 {
3101 3105 nfs2_srv_caller_id = fs_new_caller_id();
3102 3106 zone_key_create(&rfs_zone_key, rfs_zone_init, NULL, rfs_zone_fini);
3103 3107 }
3104 3108
/*
 * Module teardown counterpart of rfs_srvrinit(); per-zone state is
 * torn down through the zone key callbacks, so nothing to do here.
 */
void
rfs_srvrfini(void)
{
}
3109 3113
3110 3114 /* ARGSUSED */
3111 3115 static void *
3112 3116 rfs_zone_init(zoneid_t zoneid)
3113 3117 {
3114 3118 nfs_srv_t *ns;
3115 3119
3116 3120 ns = kmem_zalloc(sizeof (*ns), KM_SLEEP);
3117 3121
3118 3122 mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
3119 3123 ns->write_async = 1;
3120 3124
3121 3125 return (ns);
3122 3126 }
3123 3127
3124 3128 /* ARGSUSED */
3125 3129 static void
3126 3130 rfs_zone_fini(zoneid_t zoneid, void *data)
3127 3131 {
3128 3132 nfs_srv_t *ns;
3129 3133
3130 3134 ns = (nfs_srv_t *)data;
3131 3135 mutex_destroy(&ns->async_write_lock);
3132 3136 kmem_free(ns, sizeof (*ns));
3133 3137 }
3134 3138
3135 3139 static int
3136 3140 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
3137 3141 {
3138 3142 struct clist *wcl;
3139 3143 int wlist_len;
3140 3144 uint32_t count = rr->rr_count;
3141 3145
3142 3146 wcl = ra->ra_wlist;
3143 3147
3144 3148 if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3145 3149 return (FALSE);
3146 3150 }
3147 3151
3148 3152 wcl = ra->ra_wlist;
3149 3153 rr->rr_ok.rrok_wlist_len = wlist_len;
3150 3154 rr->rr_ok.rrok_wlist = wcl;
3151 3155
3152 3156 return (TRUE);
3153 3157 }
|
↓ open down ↓ |
1829 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX