Print this page
2988 nfssrv: need ability to go to submounts for v3 and v2 protocols
Portions contributed by: Marcel Telka <marcel.telka@nexenta.com>
Portions contributed by: Jean McCormack <jean.mccormack@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Reviewed by: Dan Fields <dan.fields@nexenta.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>
Change-Id: I6fdf110cc17e789353c4442b83a46cb80643456e
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/fs/nfs/nfs_srv.c
+++ new/usr/src/uts/common/fs/nfs/nfs_srv.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
|
↓ open down ↓ |
10 lines elided |
↑ open up ↑ |
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 +
21 22 /*
22 23 * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
23 - * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
24 + * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
24 25 * Copyright (c) 2016 by Delphix. All rights reserved.
25 26 */
26 27
27 28 /*
28 29 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
29 30 * All rights reserved.
30 31 */
31 32
32 33 #include <sys/param.h>
33 34 #include <sys/types.h>
34 35 #include <sys/systm.h>
35 36 #include <sys/cred.h>
36 37 #include <sys/buf.h>
37 38 #include <sys/vfs.h>
38 39 #include <sys/vnode.h>
39 40 #include <sys/uio.h>
40 41 #include <sys/stat.h>
41 42 #include <sys/errno.h>
42 43 #include <sys/sysmacros.h>
43 44 #include <sys/statvfs.h>
44 45 #include <sys/kmem.h>
45 46 #include <sys/kstat.h>
46 47 #include <sys/dirent.h>
47 48 #include <sys/cmn_err.h>
48 49 #include <sys/debug.h>
49 50 #include <sys/vtrace.h>
50 51 #include <sys/mode.h>
51 52 #include <sys/acl.h>
52 53 #include <sys/nbmlock.h>
53 54 #include <sys/policy.h>
54 55 #include <sys/sdt.h>
55 56
56 57 #include <rpc/types.h>
57 58 #include <rpc/auth.h>
58 59 #include <rpc/svc.h>
59 60
60 61 #include <nfs/nfs.h>
61 62 #include <nfs/export.h>
62 63 #include <nfs/nfs_cmd.h>
63 64
64 65 #include <vm/hat.h>
65 66 #include <vm/as.h>
66 67 #include <vm/seg.h>
67 68 #include <vm/seg_map.h>
68 69 #include <vm/seg_kmem.h>
69 70
70 71 #include <sys/strsubr.h>
71 72
72 73 /*
73 74 * These are the interface routines for the server side of the
74 75 * Network File System. See the NFS version 2 protocol specification
75 76 * for a description of this interface.
76 77 */
77 78
78 79 static int sattr_to_vattr(struct nfssattr *, struct vattr *);
79 80 static void acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
80 81 cred_t *);
81 82
82 83 /*
83 84 * Some "over the wire" UNIX file types. These are encoded
84 85 * into the mode. This needs to be fixed in the next rev.
85 86 */
86 87 #define IFMT 0170000 /* type of file */
87 88 #define IFCHR 0020000 /* character special */
88 89 #define IFBLK 0060000 /* block special */
89 90 #define IFSOCK 0140000 /* socket */
90 91
91 92 u_longlong_t nfs2_srv_caller_id;
92 93
93 94 /*
94 95 * Get file attributes.
95 96 * Returns the current attributes of the file with the given fhandle.
96 97 */
97 98 /* ARGSUSED */
98 99 void
99 100 rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
100 101 struct svc_req *req, cred_t *cr, bool_t ro)
101 102 {
102 103 int error;
103 104 vnode_t *vp;
104 105 struct vattr va;
105 106
106 107 vp = nfs_fhtovp(fhp, exi);
107 108 if (vp == NULL) {
108 109 ns->ns_status = NFSERR_STALE;
109 110 return;
110 111 }
111 112
112 113 /*
113 114 * Do the getattr.
114 115 */
115 116 va.va_mask = AT_ALL; /* we want all the attributes */
116 117
117 118 error = rfs4_delegated_getattr(vp, &va, 0, cr);
118 119
119 120 /* check for overflows */
120 121 if (!error) {
121 122 /* Lie about the object type for a referral */
122 123 if (vn_is_nfs_reparse(vp, cr))
123 124 va.va_type = VLNK;
124 125
125 126 acl_perm(vp, exi, &va, cr);
126 127 error = vattr_to_nattr(&va, &ns->ns_attr);
127 128 }
128 129
129 130 VN_RELE(vp);
130 131
131 132 ns->ns_status = puterrno(error);
132 133 }
/*
 * Return the filehandle argument of a GETATTR request; used by the
 * dispatcher to locate the export before calling rfs_getattr().
 */
void *
rfs_getattr_getfh(fhandle_t *fhp)
{
	return (fhp);
}
138 139
/*
 * Set file attributes.
 * Sets the attributes of the file with the given fhandle.  Returns
 * the new attributes.
 *
 * Size changes are handled specially (via VOP_SPACE) so that an owner
 * can truncate a file opened for write even when the mode forbids it,
 * and are checked against conflicting non-blocking mandatory locks.
 */
/* ARGSUSED */
void
rfs_setattr(struct nfssaargs *args, struct nfsattrstat *ns,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int flag;		/* flags passed to VOP_SETATTR (0 or ATTR_UTIME) */
	int in_crit = 0;	/* nonzero while inside the nbmand critical region */
	vnode_t *vp;
	struct vattr va;	/* attributes requested by the client */
	struct vattr bva;	/* attributes of the file before the change */
	struct flock64 bf;	/* describes the region freed by VOP_SPACE */
	caller_context_t ct;


	vp = nfs_fhtovp(&args->saa_fh, exi);
	if (vp == NULL) {
		ns->ns_status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		ns->ns_status = NFSERR_ROFS;
		return;
	}

	error = sattr_to_vattr(&args->saa_sa, &va);
	if (error) {
		VN_RELE(vp);
		ns->ns_status = puterrno(error);
		return;
	}

	/*
	 * If the client is requesting a change to the mtime,
	 * but the nanosecond field is set to 1 billion, then
	 * this is a flag to the server that it should set the
	 * atime and mtime fields to the server's current time.
	 * The 1 billion number actually came from the client
	 * as 1 million, but the units in the over the wire
	 * request are microseconds instead of nanoseconds.
	 *
	 * This is an overload of the protocol and should be
	 * documented in the NFS Version 2 protocol specification.
	 */
	if (va.va_mask & AT_MTIME) {
		if (va.va_mtime.tv_nsec == 1000000000) {
			gethrestime(&va.va_mtime);
			va.va_atime = va.va_mtime;
			va.va_mask |= AT_ATIME;
			flag = 0;
		} else
			flag = ATTR_UTIME;
	} else
		flag = 0;

	/*
	 * If the filesystem is exported with nosuid, then mask off
	 * the setuid and setgid bits.
	 */
	if ((va.va_mask & AT_MODE) && vp->v_type == VREG &&
	    (exi->exi_export.ex_flags & EX_NOSUID))
		va.va_mode &= ~(VSUID | VSGID);

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * We need to specially handle size changes because it is
	 * possible for the client to create a file with modes
	 * which indicate read-only, but with the file opened for
	 * writing. If the client then tries to set the size of
	 * the file, then the normal access checking done in
	 * VOP_SETATTR would prevent the client from doing so,
	 * although it should be legal for it to do so. To get
	 * around this, we do the access checking for ourselves
	 * and then use VOP_SPACE which doesn't do the access
	 * checking which VOP_SETATTR does. VOP_SPACE can only
	 * operate on VREG files, let VOP_SETATTR handle the other
	 * extremely rare cases.
	 * Also the client should not be allowed to change the
	 * size of the file if there is a conflicting non-blocking
	 * mandatory lock in the region of change.
	 */
	if (vp->v_type == VREG && va.va_mask & AT_SIZE) {
		if (nbl_need_check(vp)) {
			nbl_start_crit(vp, RW_READER);
			in_crit = 1;
		}

		bva.va_mask = AT_UID | AT_SIZE;

		error = VOP_GETATTR(vp, &bva, 0, cr, &ct);

		if (error) {
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			ns->ns_status = puterrno(error);
			return;
		}

		if (in_crit) {
			u_offset_t offset;
			ssize_t length;

			/*
			 * The affected region is the bytes between the
			 * old and the new size, whichever way it moves.
			 */
			if (va.va_size < bva.va_size) {
				offset = va.va_size;
				length = bva.va_size - va.va_size;
			} else {
				offset = bva.va_size;
				length = va.va_size - bva.va_size;
			}
			if (nbl_conflict(vp, NBL_WRITE, offset, length, 0,
			    NULL)) {
				error = EACCES;
			}
		}

		/*
		 * Only the owner may bypass VOP_SETATTR's access check;
		 * on success AT_SIZE is cleared so the later VOP_SETATTR
		 * does not redo (and possibly reject) the size change.
		 */
		if (crgetuid(cr) == bva.va_uid && !error &&
		    va.va_size != bva.va_size) {
			va.va_mask &= ~AT_SIZE;
			bf.l_type = F_WRLCK;
			bf.l_whence = 0;
			bf.l_start = (off64_t)va.va_size;
			bf.l_len = 0;
			bf.l_sysid = 0;
			bf.l_pid = 0;

			error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE,
			    (offset_t)va.va_size, cr, &ct);
		}
		if (in_crit)
			nbl_end_crit(vp);
	} else
		error = 0;

	/*
	 * Do the setattr.
	 */
	if (!error && va.va_mask) {
		error = VOP_SETATTR(vp, &va, flag, cr, &ct);
	}

	/*
	 * check if the monitor on either vop_space or vop_setattr detected
	 * a delegation conflict and if so, mark the thread flag as
	 * wouldblock so that the response is dropped and the client will
	 * try again.
	 */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	if (!error) {
		va.va_mask = AT_ALL;	/* get everything */

		error = rfs4_delegated_getattr(vp, &va, 0, cr);

		/* check for overflows */
		if (!error) {
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &ns->ns_attr);
		}
	}

	ct.cc_flags = 0;

	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, &ct);

	VN_RELE(vp);

	ns->ns_status = puterrno(error);
}
/*
 * Return the filehandle embedded in the SETATTR arguments; used by the
 * dispatcher to locate the export before calling rfs_setattr().
 */
void *
rfs_setattr_getfh(struct nfssaargs *args)
{
	return (&args->saa_fh);
}
331 332
/*
 * Cross a mount point encountered during a lookup.
 *
 * Returns 0 unless traverse() or VOP_FID() fails.  On a 0 return,
 * *vpp and *exip are changed (and the old references released) only
 * when the covered filesystem is itself exported with "nohide";
 * otherwise both are left untouched, which is not an error.
 * On a non-zero return nothing is changed or released.
 */
int
rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
{
	struct exportinfo *exi;
	vnode_t *vp = *vpp;
	fid_t fid;
	int error;

	/* Take our own hold; traverse() consumes and replaces it. */
	VN_HOLD(vp);

	if ((error = traverse(&vp)) != 0) {
		VN_RELE(vp);
		return (error);
	}

	bzero(&fid, sizeof (fid));
	fid.fid_len = MAXFIDSZ;
	error = VOP_FID(vp, &fid, NULL);
	if (error) {
		VN_RELE(vp);
		return (error);
	}

	/* checkexport() returns a held exportinfo or NULL */
	exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);
	if (exi == NULL ||
	    (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
		/*
		 * Not an error: the subdirectory is simply not exported,
		 * or it is exported without "nohide" set.
		 */
		if (exi != NULL)
			exi_rele(exi);
		VN_RELE(vp);
	} else {
		/* go to submount */
		exi_rele(*exip);
		*exip = exi;

		VN_RELE(*vpp);
		*vpp = vp;
	}

	return (0);
}
378 +
/*
 * Given a directory vnode "dvp" that is the root of a mounted
 * filesystem, climb to the covering mount point, correcting *dvpp
 * and *exip in place (the old references are released on success).
 * Returns 0 on success, -1 if no export covers the upper filesystem.
 */
int
rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
{
	struct exportinfo *exi;
	vnode_t *dvp = *dvpp;

	/* caller guarantees dvp is a filesystem root */
	ASSERT(dvp->v_flag & VROOT);

	VN_HOLD(dvp);
	dvp = untraverse(dvp);
	/* nfs_vptoexi() returns a held exportinfo or NULL */
	exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
	if (exi == NULL) {
		VN_RELE(dvp);
		return (-1);
	}

	exi_rele(*exip);
	*exip = exi;
	VN_RELE(*dvpp);
	*dvpp = dvp;

	return (0);
}
407 +/*
333 408 * Directory lookup.
334 409 * Returns an fhandle and file attributes for file name in a directory.
335 410 */
336 411 /* ARGSUSED */
337 412 void
338 413 rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
339 414 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
340 415 {
341 416 int error;
342 417 vnode_t *dvp;
343 418 vnode_t *vp;
344 419 struct vattr va;
345 420 fhandle_t *fhp = da->da_fhandle;
346 421 struct sec_ol sec = {0, 0};
347 422 bool_t publicfh_flag = FALSE, auth_weak = FALSE;
348 423 char *name;
349 424 struct sockaddr *ca;
350 425
351 426 /*
352 427 * Trusted Extension doesn't support NFSv2. MOUNT
353 428 * will reject v2 clients. Need to prevent v2 client
354 429 * access via WebNFS here.
355 430 */
356 431 if (is_system_labeled() && req->rq_vers == 2) {
357 432 dr->dr_status = NFSERR_ACCES;
358 433 return;
359 434 }
360 435
361 436 /*
362 437 * Disallow NULL paths
363 438 */
364 439 if (da->da_name == NULL || *da->da_name == '\0') {
365 440 dr->dr_status = NFSERR_ACCES;
366 441 return;
367 442 }
368 443
369 444 /*
370 445 * Allow lookups from the root - the default
371 446 * location of the public filehandle.
372 447 */
373 448 if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
|
↓ open down ↓ |
31 lines elided |
↑ open up ↑ |
374 449 dvp = rootdir;
375 450 VN_HOLD(dvp);
376 451 } else {
377 452 dvp = nfs_fhtovp(fhp, exi);
378 453 if (dvp == NULL) {
379 454 dr->dr_status = NFSERR_STALE;
380 455 return;
381 456 }
382 457 }
383 458
459 + exi_hold(exi);
460 +
384 461 /*
385 462 * Not allow lookup beyond root.
386 463 * If the filehandle matches a filehandle of the exi,
387 464 * then the ".." refers beyond the root of an exported filesystem.
388 465 */
389 466 if (strcmp(da->da_name, "..") == 0 &&
390 467 EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
391 - VN_RELE(dvp);
392 - dr->dr_status = NFSERR_NOENT;
393 - return;
468 + if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
469 + (dvp->v_flag & VROOT)) {
470 + /*
471 + * special case for ".." and 'nohide'exported root
472 + */
473 + if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
474 + error = NFSERR_ACCES;
475 + goto out;
476 + }
477 + } else {
478 + error = NFSERR_NOENT;
479 + goto out;
480 + }
394 481 }
395 482
396 483 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
397 484 name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
398 485 MAXPATHLEN);
399 486
400 487 if (name == NULL) {
401 - dr->dr_status = NFSERR_ACCES;
402 - return;
488 + error = NFSERR_ACCES;
489 + goto out;
403 490 }
404 491
405 492 /*
406 493 * If the public filehandle is used then allow
407 494 * a multi-component lookup, i.e. evaluate
408 495 * a pathname and follow symbolic links if
409 496 * necessary.
410 497 *
411 498 * This may result in a vnode in another filesystem
412 499 * which is OK as long as the filesystem is exported.
413 500 */
414 501 if (PUBLIC_FH2(fhp)) {
415 502 publicfh_flag = TRUE;
503 +
504 + exi_rele(exi);
505 +
416 506 error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
417 507 &sec);
418 508 } else {
419 509 /*
420 510 * Do a normal single component lookup.
421 511 */
422 512 error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
423 513 NULL, NULL, NULL);
424 514 }
425 515
426 516 if (name != da->da_name)
427 517 kmem_free(name, MAXPATHLEN);
428 518
519 + if (error == 0 && vn_ismntpt(vp)) {
520 + error = rfs_cross_mnt(&vp, &exi);
521 + if (error)
522 + VN_RELE(vp);
523 + }
429 524
430 525 if (!error) {
431 526 va.va_mask = AT_ALL; /* we want everything */
432 527
433 528 error = rfs4_delegated_getattr(vp, &va, 0, cr);
434 529
435 530 /* check for overflows */
436 531 if (!error) {
437 532 acl_perm(vp, exi, &va, cr);
438 533 error = vattr_to_nattr(&va, &dr->dr_attr);
439 534 if (!error) {
440 535 if (sec.sec_flags & SEC_QUERY)
441 536 error = makefh_ol(&dr->dr_fhandle, exi,
442 537 sec.sec_index);
443 538 else {
444 539 error = makefh(&dr->dr_fhandle, vp,
|
↓ open down ↓ |
6 lines elided |
↑ open up ↑ |
445 540 exi);
446 541 if (!error && publicfh_flag &&
447 542 !chk_clnt_sec(exi, req))
448 543 auth_weak = TRUE;
449 544 }
450 545 }
451 546 }
452 547 VN_RELE(vp);
453 548 }
454 549
550 +out:
455 551 VN_RELE(dvp);
456 552
457 - /*
458 - * If publicfh_flag is true then we have called rfs_publicfh_mclookup
459 - * and have obtained a new exportinfo in exi which needs to be
460 - * released. Note the the original exportinfo pointed to by exi
461 - * will be released by the caller, comon_dispatch.
462 - */
463 - if (publicfh_flag && exi != NULL)
553 + if (exi != NULL)
464 554 exi_rele(exi);
465 555
466 556 /*
467 557 * If it's public fh, no 0x81, and client's flavor is
468 558 * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
469 559 * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
470 560 */
471 561 if (auth_weak)
472 562 dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
473 563 else
474 564 dr->dr_status = puterrno(error);
475 565 }
/*
 * Return the directory filehandle from the LOOKUP arguments; used by
 * the dispatcher to locate the export before calling rfs_lookup().
 */
void *
rfs_lookup_getfh(struct nfsdiropargs *da)
{
	return (da->da_fhandle);
}
481 571
/*
 * Read symbolic link.
 * Returns the string in the symbolic link at the given fhandle.
 *
 * For NFS referral (reparse) objects an artificial symlink target is
 * synthesized instead of calling VOP_READLINK().  The rl_data buffer
 * (NFS_MAXPATHLEN bytes) is freed later by rfs_rlfree().
 */
/* ARGSUSED */
void
rfs_readlink(fhandle_t *fhp, struct nfsrdlnres *rl, struct exportinfo *exi,
    struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	struct vattr va;
	struct sockaddr *ca;
	char *name = NULL;
	int is_referral = 0;

	vp = nfs_fhtovp(fhp, exi);
	if (vp == NULL) {
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_STALE;
		return;
	}

	/* only the mode is needed, for the MANDLOCK check below */
	va.va_mask = AT_MODE;

	error = VOP_GETATTR(vp, &va, 0, cr, NULL);

	if (error) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = puterrno(error);
		return;
	}

	if (MANDLOCK(vp, va.va_mode)) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_ACCES;
		return;
	}

	/* We lied about the object type for a referral */
	if (vn_is_nfs_reparse(vp, cr))
		is_referral = 1;

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. BUGID 1138002.
	 */
	if (vp->v_type != VLNK && !is_referral) {
		VN_RELE(vp);
		rl->rl_data = NULL;
		rl->rl_status = NFSERR_NXIO;
		return;
	}

	/*
	 * Allocate data for pathname.  This will be freed by rfs_rlfree.
	 */
	rl->rl_data = kmem_alloc(NFS_MAXPATHLEN, KM_SLEEP);

	if (is_referral) {
		char *s;
		size_t strsz;

		/* Get an artificial symlink based on a referral */
		s = build_symlink(vp, cr, &strsz);
		global_svstat_ptr[2][NFS_REFERLINKS].value.ui64++;
		DTRACE_PROBE2(nfs2serv__func__referral__reflink,
		    vnode_t *, vp, char *, s);
		if (s == NULL)
			error = EINVAL;
		else {
			error = 0;
			(void) strlcpy(rl->rl_data, s, NFS_MAXPATHLEN);
			rl->rl_count = (uint32_t)MIN(strsz, NFS_MAXPATHLEN);
			kmem_free(s, strsz);
		}

	} else {

		/*
		 * Set up io vector to read sym link data
		 */
		iov.iov_base = rl->rl_data;
		iov.iov_len = NFS_MAXPATHLEN;
		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		uio.uio_segflg = UIO_SYSSPACE;
		uio.uio_extflg = UIO_COPY_CACHED;
		uio.uio_loffset = (offset_t)0;
		uio.uio_resid = NFS_MAXPATHLEN;

		/*
		 * Do the readlink.
		 */
		error = VOP_READLINK(vp, &uio, cr, NULL);

		rl->rl_count = (uint32_t)(NFS_MAXPATHLEN - uio.uio_resid);

		if (!error)
			rl->rl_data[rl->rl_count] = '\0';

	}


	VN_RELE(vp);

	/*
	 * Convert the link target to the client's character set;
	 * on conversion a new buffer replaces rl_data.
	 */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	name = nfscmd_convname(ca, exi, rl->rl_data,
	    NFSCMD_CONV_OUTBOUND, MAXPATHLEN);

	if (name != NULL && name != rl->rl_data) {
		kmem_free(rl->rl_data, NFS_MAXPATHLEN);
		rl->rl_data = name;
	}

	/*
	 * XNFS and RFC1094 require us to return ENXIO if argument
	 * is not a link. UFS returns EINVAL if this is the case,
	 * so we do the mapping here. BUGID 1138002.
	 */
	if (error == EINVAL)
		rl->rl_status = NFSERR_NXIO;
	else
		rl->rl_status = puterrno(error);

}
/*
 * Return the filehandle argument of a READLINK request; used by the
 * dispatcher to locate the export before calling rfs_readlink().
 */
void *
rfs_readlink_getfh(fhandle_t *fhp)
{
	return (fhp);
}
617 707 /*
618 708 * Free data allocated by rfs_readlink
619 709 */
620 710 void
621 711 rfs_rlfree(struct nfsrdlnres *rl)
622 712 {
623 713 if (rl->rl_data != NULL)
624 714 kmem_free(rl->rl_data, NFS_MAXPATHLEN);
625 715 }
626 716
627 717 static int rdma_setup_read_data2(struct nfsreadargs *, struct nfsrdresult *);
628 718
/*
 * Read data.
 * Returns some data read from the file at the given fhandle.
 *
 * Data is returned either in a freshly allocated mblk (rr_mp, freed by
 * rfs_rdfree() after the reply is sent) or, for RDMA clients, directly
 * into the client-provided write chunk list (ra_wlist).
 */
/* ARGSUSED */
void
rfs_read(struct nfsreadargs *ra, struct nfsrdresult *rr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	vnode_t *vp;
	int error;
	struct vattr va;
	struct iovec iov;
	struct uio uio;
	mblk_t *mp;
	int alloc_err = 0;
	int in_crit = 0;	/* nonzero while inside the nbmand critical region */
	caller_context_t ct;

	vp = nfs_fhtovp(&ra->ra_fhandle, exi);
	if (vp == NULL) {
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_STALE;
		return;
	}

	if (vp->v_type != VREG) {
		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ISDIR;
		return;
	}

	ct.cc_sysid = 0;
	ct.cc_pid = 0;
	ct.cc_caller_id = nfs2_srv_caller_id;
	ct.cc_flags = CC_DONTBLOCK;

	/*
	 * Enter the critical region before calling VOP_RWLOCK
	 * to avoid a deadlock with write requests.
	 */
	if (nbl_need_check(vp)) {
		nbl_start_crit(vp, RW_READER);
		if (nbl_conflict(vp, NBL_READ, ra->ra_offset, ra->ra_count,
		    0, NULL)) {
			nbl_end_crit(vp);
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = NFSERR_ACCES;
			return;
		}
		in_crit = 1;
	}

	error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);

	/* check if a monitor detected a delegation conflict */
	if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
		VN_RELE(vp);
		/* mark as wouldblock so response is dropped */
		curthread->t_flag |= T_WOULDBLOCK;

		rr->rr_data = NULL;
		return;
	}

	va.va_mask = AT_ALL;

	error = VOP_GETATTR(vp, &va, 0, cr, &ct);

	if (error) {
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);

		return;
	}

	/*
	 * This is a kludge to allow reading of files created
	 * with no read permission. The owner of the file
	 * is always allowed to read it.
	 */
	if (crgetuid(cr) != va.va_uid) {
		error = VOP_ACCESS(vp, VREAD, 0, cr, &ct);

		if (error) {
			/*
			 * Exec is the same as read over the net because
			 * of demand loading.
			 */
			error = VOP_ACCESS(vp, VEXEC, 0, cr, &ct);
		}
		if (error) {
			VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
			if (in_crit)
				nbl_end_crit(vp);
			VN_RELE(vp);
			rr->rr_data = NULL;
			rr->rr_status = puterrno(error);

			return;
		}
	}

	if (MANDLOCK(vp, va.va_mode)) {
		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = NFSERR_ACCES;

		return;
	}

	rr->rr_ok.rrok_wlist_len = 0;
	rr->rr_ok.rrok_wlist = NULL;

	/* reads at or past EOF succeed with zero bytes of data */
	if ((u_offset_t)ra->ra_offset >= va.va_size) {
		rr->rr_count = 0;
		rr->rr_data = NULL;
		/*
		 * In this case, status is NFS_OK, but there is no data
		 * to encode. So set rr_mp to NULL.
		 */
		rr->rr_mp = NULL;
		rr->rr_ok.rrok_wlist = ra->ra_wlist;
		if (rr->rr_ok.rrok_wlist)
			clist_zero_len(rr->rr_ok.rrok_wlist);
		goto done;
	}

	if (ra->ra_wlist) {
		/* RDMA: read straight into the client's write chunk */
		mp = NULL;
		rr->rr_mp = NULL;
		(void) rdma_get_wchunk(req, &iov, ra->ra_wlist);
		if (ra->ra_count > iov.iov_len) {
			rr->rr_data = NULL;
			rr->rr_status = NFSERR_INVAL;
			/*
			 * NOTE(review): the "done" path below overwrites
			 * rr_status with puterrno(error) where error is 0
			 * here — confirm the INVAL status is intended to
			 * survive.
			 */
			goto done;
		}
	} else {
		/*
		 * mp will contain the data to be sent out in the read reply.
		 * This will be freed after the reply has been sent out (by the
		 * driver).
		 * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, so
		 * that the call to xdrmblk_putmblk() never fails.
		 */
		mp = allocb_wait(RNDUP(ra->ra_count), BPRI_MED, STR_NOSIG,
		    &alloc_err);
		/* allocb_wait() with STR_NOSIG sleeps rather than fail */
		ASSERT(mp != NULL);
		ASSERT(alloc_err == 0);

		rr->rr_mp = mp;

		/*
		 * Set up io vector
		 */
		iov.iov_base = (caddr_t)mp->b_datap->db_base;
		iov.iov_len = ra->ra_count;
	}

	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)ra->ra_offset;
	uio.uio_resid = ra->ra_count;

	error = VOP_READ(vp, &uio, 0, cr, &ct);

	if (error) {
		if (mp)
			freeb(mp);

		/*
		 * check if a monitor detected a delegation conflict and
		 * mark as wouldblock so response is dropped
		 */
		if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
			curthread->t_flag |= T_WOULDBLOCK;
		else
			rr->rr_status = puterrno(error);

		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;

		return;
	}

	/*
	 * Get attributes again so we can send the latest access
	 * time to the client side for its cache.
	 */
	va.va_mask = AT_ALL;

	error = VOP_GETATTR(vp, &va, 0, cr, &ct);

	if (error) {
		if (mp)
			freeb(mp);

		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
		if (in_crit)
			nbl_end_crit(vp);

		VN_RELE(vp);
		rr->rr_data = NULL;
		rr->rr_status = puterrno(error);

		return;
	}

	/* number of bytes actually read */
	rr->rr_count = (uint32_t)(ra->ra_count - uio.uio_resid);

	if (mp) {
		rr->rr_data = (char *)mp->b_datap->db_base;
	} else {
		if (ra->ra_wlist) {
			rr->rr_data = (caddr_t)iov.iov_base;
			/*
			 * NOTE(review): puterrno() is normally given an
			 * errno, not an nfsstat; also rr_status is
			 * overwritten again after "done" — verify.
			 */
			if (!rdma_setup_read_data2(ra, rr)) {
				rr->rr_data = NULL;
				rr->rr_status = puterrno(NFSERR_INVAL);
			}
		}
	}
done:
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
	if (in_crit)
		nbl_end_crit(vp);

	acl_perm(vp, exi, &va, cr);

	/* check for overflows */
	error = vattr_to_nattr(&va, &rr->rr_attr);

	VN_RELE(vp);

	rr->rr_status = puterrno(error);
}
881 971
882 972 /*
883 973 * Free data allocated by rfs_read
884 974 */
885 975 void
886 976 rfs_rdfree(struct nfsrdresult *rr)
887 977 {
888 978 mblk_t *mp;
889 979
890 980 if (rr->rr_status == NFS_OK) {
891 981 mp = rr->rr_mp;
892 982 if (mp != NULL)
893 983 freeb(mp);
894 984 }
895 985 }
896 986
/*
 * Return the filehandle from the READ arguments; used by the
 * dispatcher to locate the export before calling rfs_read().
 */
void *
rfs_read_getfh(struct nfsreadargs *ra)
{
	return (&ra->ra_fhandle);
}
902 992
/*
 * Size of the on-stack iovec array used by rfs_write_sync(); mblk
 * chains longer than this fall back to a kmem_alloc'd array.
 */
#define MAX_IOVECS 12

#ifdef DEBUG
/* how often the on-stack iovec array sufficed vs. required allocation */
static int rfs_write_sync_hits = 0;
static int rfs_write_sync_misses = 0;
#endif
909 999
910 1000 /*
911 1001 * Write data to file.
912 1002 * Returns attributes of a file after writing some data to it.
913 1003 *
914 1004 * Any changes made here, especially in error handling might have
915 1005 * to also be done in rfs_write (which clusters write requests).
916 1006 */
917 1007 /* ARGSUSED */
918 1008 void
919 1009 rfs_write_sync(struct nfswriteargs *wa, struct nfsattrstat *ns,
920 1010 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
921 1011 {
922 1012 int error;
923 1013 vnode_t *vp;
924 1014 rlim64_t rlimit;
925 1015 struct vattr va;
926 1016 struct uio uio;
927 1017 struct iovec iov[MAX_IOVECS];
928 1018 mblk_t *m;
929 1019 struct iovec *iovp;
930 1020 int iovcnt;
931 1021 cred_t *savecred;
932 1022 int in_crit = 0;
933 1023 caller_context_t ct;
934 1024
935 1025 vp = nfs_fhtovp(&wa->wa_fhandle, exi);
936 1026 if (vp == NULL) {
937 1027 ns->ns_status = NFSERR_STALE;
938 1028 return;
939 1029 }
940 1030
941 1031 if (rdonly(ro, vp)) {
942 1032 VN_RELE(vp);
943 1033 ns->ns_status = NFSERR_ROFS;
944 1034 return;
945 1035 }
946 1036
947 1037 if (vp->v_type != VREG) {
948 1038 VN_RELE(vp);
949 1039 ns->ns_status = NFSERR_ISDIR;
950 1040 return;
951 1041 }
952 1042
953 1043 ct.cc_sysid = 0;
954 1044 ct.cc_pid = 0;
955 1045 ct.cc_caller_id = nfs2_srv_caller_id;
956 1046 ct.cc_flags = CC_DONTBLOCK;
957 1047
958 1048 va.va_mask = AT_UID|AT_MODE;
959 1049
960 1050 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
961 1051
962 1052 if (error) {
963 1053 VN_RELE(vp);
964 1054 ns->ns_status = puterrno(error);
965 1055
966 1056 return;
967 1057 }
968 1058
969 1059 if (crgetuid(cr) != va.va_uid) {
970 1060 /*
971 1061 * This is a kludge to allow writes of files created
972 1062 * with read only permission. The owner of the file
973 1063 * is always allowed to write it.
974 1064 */
975 1065 error = VOP_ACCESS(vp, VWRITE, 0, cr, &ct);
976 1066
977 1067 if (error) {
978 1068 VN_RELE(vp);
979 1069 ns->ns_status = puterrno(error);
980 1070 return;
981 1071 }
982 1072 }
983 1073
984 1074 /*
985 1075 * Can't access a mandatory lock file. This might cause
986 1076 * the NFS service thread to block forever waiting for a
987 1077 * lock to be released that will never be released.
988 1078 */
989 1079 if (MANDLOCK(vp, va.va_mode)) {
990 1080 VN_RELE(vp);
991 1081 ns->ns_status = NFSERR_ACCES;
992 1082 return;
993 1083 }
994 1084
995 1085 /*
996 1086 * We have to enter the critical region before calling VOP_RWLOCK
997 1087 * to avoid a deadlock with ufs.
998 1088 */
999 1089 if (nbl_need_check(vp)) {
1000 1090 nbl_start_crit(vp, RW_READER);
1001 1091 in_crit = 1;
1002 1092 if (nbl_conflict(vp, NBL_WRITE, wa->wa_offset,
1003 1093 wa->wa_count, 0, NULL)) {
1004 1094 error = EACCES;
1005 1095 goto out;
1006 1096 }
1007 1097 }
1008 1098
1009 1099 error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1010 1100
1011 1101 /* check if a monitor detected a delegation conflict */
1012 1102 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1013 1103 VN_RELE(vp);
1014 1104 /* mark as wouldblock so response is dropped */
1015 1105 curthread->t_flag |= T_WOULDBLOCK;
1016 1106 return;
1017 1107 }
1018 1108
1019 1109 if (wa->wa_data || wa->wa_rlist) {
1020 1110 /* Do the RDMA thing if necessary */
1021 1111 if (wa->wa_rlist) {
1022 1112 iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1023 1113 iov[0].iov_len = wa->wa_count;
1024 1114 } else {
1025 1115 iov[0].iov_base = wa->wa_data;
1026 1116 iov[0].iov_len = wa->wa_count;
1027 1117 }
1028 1118 uio.uio_iov = iov;
1029 1119 uio.uio_iovcnt = 1;
1030 1120 uio.uio_segflg = UIO_SYSSPACE;
1031 1121 uio.uio_extflg = UIO_COPY_DEFAULT;
1032 1122 uio.uio_loffset = (offset_t)wa->wa_offset;
1033 1123 uio.uio_resid = wa->wa_count;
1034 1124 /*
1035 1125 * The limit is checked on the client. We
1036 1126 * should allow any size writes here.
1037 1127 */
1038 1128 uio.uio_llimit = curproc->p_fsz_ctl;
1039 1129 rlimit = uio.uio_llimit - wa->wa_offset;
1040 1130 if (rlimit < (rlim64_t)uio.uio_resid)
1041 1131 uio.uio_resid = (uint_t)rlimit;
1042 1132
1043 1133 /*
1044 1134 * for now we assume no append mode
1045 1135 */
1046 1136 /*
1047 1137 * We're changing creds because VM may fault and we need
1048 1138 * the cred of the current thread to be used if quota
1049 1139 * checking is enabled.
1050 1140 */
1051 1141 savecred = curthread->t_cred;
1052 1142 curthread->t_cred = cr;
1053 1143 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1054 1144 curthread->t_cred = savecred;
1055 1145 } else {
1056 1146 iovcnt = 0;
1057 1147 for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1058 1148 iovcnt++;
1059 1149 if (iovcnt <= MAX_IOVECS) {
1060 1150 #ifdef DEBUG
1061 1151 rfs_write_sync_hits++;
1062 1152 #endif
1063 1153 iovp = iov;
1064 1154 } else {
1065 1155 #ifdef DEBUG
1066 1156 rfs_write_sync_misses++;
1067 1157 #endif
1068 1158 iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP);
1069 1159 }
1070 1160 mblk_to_iov(wa->wa_mblk, iovcnt, iovp);
1071 1161 uio.uio_iov = iovp;
1072 1162 uio.uio_iovcnt = iovcnt;
1073 1163 uio.uio_segflg = UIO_SYSSPACE;
1074 1164 uio.uio_extflg = UIO_COPY_DEFAULT;
1075 1165 uio.uio_loffset = (offset_t)wa->wa_offset;
1076 1166 uio.uio_resid = wa->wa_count;
1077 1167 /*
1078 1168 * The limit is checked on the client. We
1079 1169 * should allow any size writes here.
1080 1170 */
1081 1171 uio.uio_llimit = curproc->p_fsz_ctl;
1082 1172 rlimit = uio.uio_llimit - wa->wa_offset;
1083 1173 if (rlimit < (rlim64_t)uio.uio_resid)
1084 1174 uio.uio_resid = (uint_t)rlimit;
1085 1175
1086 1176 /*
1087 1177 * For now we assume no append mode.
1088 1178 */
1089 1179 /*
1090 1180 * We're changing creds because VM may fault and we need
1091 1181 * the cred of the current thread to be used if quota
1092 1182 * checking is enabled.
1093 1183 */
1094 1184 savecred = curthread->t_cred;
1095 1185 curthread->t_cred = cr;
1096 1186 error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1097 1187 curthread->t_cred = savecred;
1098 1188
1099 1189 if (iovp != iov)
1100 1190 kmem_free(iovp, sizeof (*iovp) * iovcnt);
1101 1191 }
1102 1192
1103 1193 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1104 1194
1105 1195 if (!error) {
1106 1196 /*
1107 1197 * Get attributes again so we send the latest mod
1108 1198 * time to the client side for its cache.
1109 1199 */
1110 1200 va.va_mask = AT_ALL; /* now we want everything */
1111 1201
1112 1202 error = VOP_GETATTR(vp, &va, 0, cr, &ct);
1113 1203
1114 1204 /* check for overflows */
1115 1205 if (!error) {
1116 1206 acl_perm(vp, exi, &va, cr);
1117 1207 error = vattr_to_nattr(&va, &ns->ns_attr);
1118 1208 }
1119 1209 }
1120 1210
1121 1211 out:
1122 1212 if (in_crit)
1123 1213 nbl_end_crit(vp);
1124 1214 VN_RELE(vp);
1125 1215
1126 1216 /* check if a monitor detected a delegation conflict */
1127 1217 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1128 1218 /* mark as wouldblock so response is dropped */
1129 1219 curthread->t_flag |= T_WOULDBLOCK;
1130 1220 else
1131 1221 ns->ns_status = puterrno(error);
1132 1222
1133 1223 }
1134 1224
/*
 * One queued NFSv2 WRITE request awaiting clustered processing by
 * rfs_write().  Each RPC service thread places one of these (stack
 * allocated) on the per-file cluster list and sleeps until the thread
 * that owns the cluster fills in ns->ns_status and broadcasts.
 */
struct rfs_async_write {
	struct nfswriteargs *wa;	/* decoded WRITE arguments */
	struct nfsattrstat *ns;		/* response; ns_status doubles as "done" flag */
	struct svc_req *req;		/* originating RPC request */
	cred_t *cr;			/* caller's credentials */
	bool_t ro;			/* export is read-only for this caller */
	kthread_t *thread;		/* service thread to mark T_WOULDBLOCK on */
	struct rfs_async_write *list;	/* next request in this cluster, by offset */
};
1144 1234
/*
 * A write cluster: all pending WRITE requests against a single file
 * (identified by file handle).  Lives on rfs_async_write_head, protected
 * by rfs_async_write_lock; waiters sleep on cv until their entry is done.
 */
struct rfs_async_write_list {
	fhandle_t *fhp;				/* file handle shared by the cluster */
	kcondvar_t cv;				/* broadcast when the cluster completes */
	struct rfs_async_write *list;		/* requests, sorted by starting offset */
	struct rfs_async_write_list *next;	/* next cluster (different file) */
};
1151 1241
/* Head of the list of active write clusters; protected by the lock below. */
static struct rfs_async_write_list *rfs_async_write_head = NULL;
static kmutex_t rfs_async_write_lock;
static int rfs_write_async = 1;	/* enables write clustering if == 1 */

/* Max iovecs gathered per VOP_WRITE before falling back to kmem_alloc. */
#define	MAXCLIOVECS	42
/* Sentinel "not yet processed" status; must not collide with NFS_OK (0). */
#define	RFSWRITE_INITVAL	(enum nfsstat) -1

#ifdef DEBUG
static int rfs_write_hits = 0;		/* clusters served from the stack iovec array */
static int rfs_write_misses = 0;	/* clusters that needed an allocated iovec array */
#endif
1163 1253
1164 1254 /*
1165 1255 * Write data to file.
1166 1256 * Returns attributes of a file after writing some data to it.
1167 1257 */
1168 1258 void
1169 1259 rfs_write(struct nfswriteargs *wa, struct nfsattrstat *ns,
1170 1260 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1171 1261 {
1172 1262 int error;
1173 1263 vnode_t *vp;
1174 1264 rlim64_t rlimit;
1175 1265 struct vattr va;
1176 1266 struct uio uio;
1177 1267 struct rfs_async_write_list *lp;
1178 1268 struct rfs_async_write_list *nlp;
1179 1269 struct rfs_async_write *rp;
1180 1270 struct rfs_async_write *nrp;
1181 1271 struct rfs_async_write *trp;
1182 1272 struct rfs_async_write *lrp;
1183 1273 int data_written;
1184 1274 int iovcnt;
1185 1275 mblk_t *m;
1186 1276 struct iovec *iovp;
1187 1277 struct iovec *niovp;
1188 1278 struct iovec iov[MAXCLIOVECS];
1189 1279 int count;
1190 1280 int rcount;
1191 1281 uint_t off;
1192 1282 uint_t len;
1193 1283 struct rfs_async_write nrpsp;
1194 1284 struct rfs_async_write_list nlpsp;
1195 1285 ushort_t t_flag;
1196 1286 cred_t *savecred;
1197 1287 int in_crit = 0;
1198 1288 caller_context_t ct;
1199 1289
1200 1290 if (!rfs_write_async) {
1201 1291 rfs_write_sync(wa, ns, exi, req, cr, ro);
1202 1292 return;
1203 1293 }
1204 1294
1205 1295 /*
1206 1296 * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1207 1297 * is considered an OK.
1208 1298 */
1209 1299 ns->ns_status = RFSWRITE_INITVAL;
1210 1300
1211 1301 nrp = &nrpsp;
1212 1302 nrp->wa = wa;
1213 1303 nrp->ns = ns;
1214 1304 nrp->req = req;
1215 1305 nrp->cr = cr;
1216 1306 nrp->ro = ro;
1217 1307 nrp->thread = curthread;
1218 1308
1219 1309 ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1220 1310
1221 1311 /*
1222 1312 * Look to see if there is already a cluster started
1223 1313 * for this file.
1224 1314 */
1225 1315 mutex_enter(&rfs_async_write_lock);
1226 1316 for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
1227 1317 if (bcmp(&wa->wa_fhandle, lp->fhp,
1228 1318 sizeof (fhandle_t)) == 0)
1229 1319 break;
1230 1320 }
1231 1321
1232 1322 /*
1233 1323 * If lp is non-NULL, then there is already a cluster
1234 1324 * started. We need to place ourselves in the cluster
1235 1325 * list in the right place as determined by starting
1236 1326 * offset. Conflicts with non-blocking mandatory locked
1237 1327 * regions will be checked when the cluster is processed.
1238 1328 */
1239 1329 if (lp != NULL) {
1240 1330 rp = lp->list;
1241 1331 trp = NULL;
1242 1332 while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1243 1333 trp = rp;
1244 1334 rp = rp->list;
1245 1335 }
1246 1336 nrp->list = rp;
1247 1337 if (trp == NULL)
1248 1338 lp->list = nrp;
1249 1339 else
1250 1340 trp->list = nrp;
1251 1341 while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1252 1342 cv_wait(&lp->cv, &rfs_async_write_lock);
1253 1343 mutex_exit(&rfs_async_write_lock);
1254 1344
1255 1345 return;
1256 1346 }
1257 1347
1258 1348 /*
1259 1349 * No cluster started yet, start one and add ourselves
1260 1350 * to the list of clusters.
1261 1351 */
1262 1352 nrp->list = NULL;
1263 1353
1264 1354 nlp = &nlpsp;
1265 1355 nlp->fhp = &wa->wa_fhandle;
1266 1356 cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1267 1357 nlp->list = nrp;
1268 1358 nlp->next = NULL;
1269 1359
1270 1360 if (rfs_async_write_head == NULL) {
1271 1361 rfs_async_write_head = nlp;
1272 1362 } else {
1273 1363 lp = rfs_async_write_head;
1274 1364 while (lp->next != NULL)
1275 1365 lp = lp->next;
1276 1366 lp->next = nlp;
1277 1367 }
1278 1368 mutex_exit(&rfs_async_write_lock);
1279 1369
1280 1370 /*
1281 1371 * Convert the file handle common to all of the requests
1282 1372 * in this cluster to a vnode.
1283 1373 */
1284 1374 vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1285 1375 if (vp == NULL) {
1286 1376 mutex_enter(&rfs_async_write_lock);
1287 1377 if (rfs_async_write_head == nlp)
1288 1378 rfs_async_write_head = nlp->next;
1289 1379 else {
1290 1380 lp = rfs_async_write_head;
1291 1381 while (lp->next != nlp)
1292 1382 lp = lp->next;
1293 1383 lp->next = nlp->next;
1294 1384 }
1295 1385 t_flag = curthread->t_flag & T_WOULDBLOCK;
1296 1386 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1297 1387 rp->ns->ns_status = NFSERR_STALE;
1298 1388 rp->thread->t_flag |= t_flag;
1299 1389 }
1300 1390 cv_broadcast(&nlp->cv);
1301 1391 mutex_exit(&rfs_async_write_lock);
1302 1392
1303 1393 return;
1304 1394 }
1305 1395
1306 1396 /*
1307 1397 * Can only write regular files. Attempts to write any
1308 1398 * other file types fail with EISDIR.
1309 1399 */
1310 1400 if (vp->v_type != VREG) {
1311 1401 VN_RELE(vp);
1312 1402 mutex_enter(&rfs_async_write_lock);
1313 1403 if (rfs_async_write_head == nlp)
1314 1404 rfs_async_write_head = nlp->next;
1315 1405 else {
1316 1406 lp = rfs_async_write_head;
1317 1407 while (lp->next != nlp)
1318 1408 lp = lp->next;
1319 1409 lp->next = nlp->next;
1320 1410 }
1321 1411 t_flag = curthread->t_flag & T_WOULDBLOCK;
1322 1412 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1323 1413 rp->ns->ns_status = NFSERR_ISDIR;
1324 1414 rp->thread->t_flag |= t_flag;
1325 1415 }
1326 1416 cv_broadcast(&nlp->cv);
1327 1417 mutex_exit(&rfs_async_write_lock);
1328 1418
1329 1419 return;
1330 1420 }
1331 1421
1332 1422 /*
1333 1423 * Enter the critical region before calling VOP_RWLOCK, to avoid a
1334 1424 * deadlock with ufs.
1335 1425 */
1336 1426 if (nbl_need_check(vp)) {
1337 1427 nbl_start_crit(vp, RW_READER);
1338 1428 in_crit = 1;
1339 1429 }
1340 1430
1341 1431 ct.cc_sysid = 0;
1342 1432 ct.cc_pid = 0;
1343 1433 ct.cc_caller_id = nfs2_srv_caller_id;
1344 1434 ct.cc_flags = CC_DONTBLOCK;
1345 1435
1346 1436 /*
1347 1437 * Lock the file for writing. This operation provides
1348 1438 * the delay which allows clusters to grow.
1349 1439 */
1350 1440 error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1351 1441
1352 1442 /* check if a monitor detected a delegation conflict */
1353 1443 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1354 1444 if (in_crit)
1355 1445 nbl_end_crit(vp);
1356 1446 VN_RELE(vp);
1357 1447 /* mark as wouldblock so response is dropped */
1358 1448 curthread->t_flag |= T_WOULDBLOCK;
1359 1449 mutex_enter(&rfs_async_write_lock);
1360 1450 if (rfs_async_write_head == nlp)
1361 1451 rfs_async_write_head = nlp->next;
1362 1452 else {
1363 1453 lp = rfs_async_write_head;
1364 1454 while (lp->next != nlp)
1365 1455 lp = lp->next;
1366 1456 lp->next = nlp->next;
1367 1457 }
1368 1458 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1369 1459 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1370 1460 rp->ns->ns_status = puterrno(error);
1371 1461 rp->thread->t_flag |= T_WOULDBLOCK;
1372 1462 }
1373 1463 }
1374 1464 cv_broadcast(&nlp->cv);
1375 1465 mutex_exit(&rfs_async_write_lock);
1376 1466
1377 1467 return;
1378 1468 }
1379 1469
1380 1470 /*
1381 1471 * Disconnect this cluster from the list of clusters.
1382 1472 * The cluster that is being dealt with must be fixed
1383 1473 * in size after this point, so there is no reason
1384 1474 * to leave it on the list so that new requests can
1385 1475 * find it.
1386 1476 *
1387 1477 * The algorithm is that the first write request will
1388 1478 * create a cluster, convert the file handle to a
1389 1479 * vnode pointer, and then lock the file for writing.
1390 1480 * This request is not likely to be clustered with
1391 1481 * any others. However, the next request will create
1392 1482 * a new cluster and be blocked in VOP_RWLOCK while
1393 1483 * the first request is being processed. This delay
1394 1484 * will allow more requests to be clustered in this
1395 1485 * second cluster.
1396 1486 */
1397 1487 mutex_enter(&rfs_async_write_lock);
1398 1488 if (rfs_async_write_head == nlp)
1399 1489 rfs_async_write_head = nlp->next;
1400 1490 else {
1401 1491 lp = rfs_async_write_head;
1402 1492 while (lp->next != nlp)
1403 1493 lp = lp->next;
1404 1494 lp->next = nlp->next;
1405 1495 }
1406 1496 mutex_exit(&rfs_async_write_lock);
1407 1497
1408 1498 /*
1409 1499 * Step through the list of requests in this cluster.
1410 1500 * We need to check permissions to make sure that all
1411 1501 * of the requests have sufficient permission to write
1412 1502 * the file. A cluster can be composed of requests
1413 1503 * from different clients and different users on each
1414 1504 * client.
1415 1505 *
1416 1506 * As a side effect, we also calculate the size of the
1417 1507 * byte range that this cluster encompasses.
1418 1508 */
1419 1509 rp = nlp->list;
1420 1510 off = rp->wa->wa_offset;
1421 1511 len = (uint_t)0;
1422 1512 do {
1423 1513 if (rdonly(rp->ro, vp)) {
1424 1514 rp->ns->ns_status = NFSERR_ROFS;
1425 1515 t_flag = curthread->t_flag & T_WOULDBLOCK;
1426 1516 rp->thread->t_flag |= t_flag;
1427 1517 continue;
1428 1518 }
1429 1519
1430 1520 va.va_mask = AT_UID|AT_MODE;
1431 1521
1432 1522 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1433 1523
1434 1524 if (!error) {
1435 1525 if (crgetuid(rp->cr) != va.va_uid) {
1436 1526 /*
1437 1527 * This is a kludge to allow writes of files
1438 1528 * created with read only permission. The
1439 1529 * owner of the file is always allowed to
1440 1530 * write it.
1441 1531 */
1442 1532 error = VOP_ACCESS(vp, VWRITE, 0, rp->cr, &ct);
1443 1533 }
1444 1534 if (!error && MANDLOCK(vp, va.va_mode))
1445 1535 error = EACCES;
1446 1536 }
1447 1537
1448 1538 /*
1449 1539 * Check for a conflict with a nbmand-locked region.
1450 1540 */
1451 1541 if (in_crit && nbl_conflict(vp, NBL_WRITE, rp->wa->wa_offset,
1452 1542 rp->wa->wa_count, 0, NULL)) {
1453 1543 error = EACCES;
1454 1544 }
1455 1545
1456 1546 if (error) {
1457 1547 rp->ns->ns_status = puterrno(error);
1458 1548 t_flag = curthread->t_flag & T_WOULDBLOCK;
1459 1549 rp->thread->t_flag |= t_flag;
1460 1550 continue;
1461 1551 }
1462 1552 if (len < rp->wa->wa_offset + rp->wa->wa_count - off)
1463 1553 len = rp->wa->wa_offset + rp->wa->wa_count - off;
1464 1554 } while ((rp = rp->list) != NULL);
1465 1555
1466 1556 /*
1467 1557 * Step through the cluster attempting to gather as many
1468 1558 * requests which are contiguous as possible. These
1469 1559 * contiguous requests are handled via one call to VOP_WRITE
1470 1560 * instead of different calls to VOP_WRITE. We also keep
1471 1561 * track of the fact that any data was written.
1472 1562 */
1473 1563 rp = nlp->list;
1474 1564 data_written = 0;
1475 1565 do {
1476 1566 /*
1477 1567 * Skip any requests which are already marked as having an
1478 1568 * error.
1479 1569 */
1480 1570 if (rp->ns->ns_status != RFSWRITE_INITVAL) {
1481 1571 rp = rp->list;
1482 1572 continue;
1483 1573 }
1484 1574
1485 1575 /*
1486 1576 * Count the number of iovec's which are required
1487 1577 * to handle this set of requests. One iovec is
1488 1578 * needed for each data buffer, whether addressed
1489 1579 * by wa_data or by the b_rptr pointers in the
1490 1580 * mblk chains.
1491 1581 */
1492 1582 iovcnt = 0;
1493 1583 lrp = rp;
1494 1584 for (;;) {
1495 1585 if (lrp->wa->wa_data || lrp->wa->wa_rlist)
1496 1586 iovcnt++;
1497 1587 else {
1498 1588 m = lrp->wa->wa_mblk;
1499 1589 while (m != NULL) {
1500 1590 iovcnt++;
1501 1591 m = m->b_cont;
1502 1592 }
1503 1593 }
1504 1594 if (lrp->list == NULL ||
1505 1595 lrp->list->ns->ns_status != RFSWRITE_INITVAL ||
1506 1596 lrp->wa->wa_offset + lrp->wa->wa_count !=
1507 1597 lrp->list->wa->wa_offset) {
1508 1598 lrp = lrp->list;
1509 1599 break;
1510 1600 }
1511 1601 lrp = lrp->list;
1512 1602 }
1513 1603
1514 1604 if (iovcnt <= MAXCLIOVECS) {
1515 1605 #ifdef DEBUG
1516 1606 rfs_write_hits++;
1517 1607 #endif
1518 1608 niovp = iov;
1519 1609 } else {
1520 1610 #ifdef DEBUG
1521 1611 rfs_write_misses++;
1522 1612 #endif
1523 1613 niovp = kmem_alloc(sizeof (*niovp) * iovcnt, KM_SLEEP);
1524 1614 }
1525 1615 /*
1526 1616 * Put together the scatter/gather iovecs.
1527 1617 */
1528 1618 iovp = niovp;
1529 1619 trp = rp;
1530 1620 count = 0;
1531 1621 do {
1532 1622 if (trp->wa->wa_data || trp->wa->wa_rlist) {
1533 1623 if (trp->wa->wa_rlist) {
1534 1624 iovp->iov_base =
1535 1625 (char *)((trp->wa->wa_rlist)->
1536 1626 u.c_daddr3);
1537 1627 iovp->iov_len = trp->wa->wa_count;
1538 1628 } else {
1539 1629 iovp->iov_base = trp->wa->wa_data;
1540 1630 iovp->iov_len = trp->wa->wa_count;
1541 1631 }
1542 1632 iovp++;
1543 1633 } else {
1544 1634 m = trp->wa->wa_mblk;
1545 1635 rcount = trp->wa->wa_count;
1546 1636 while (m != NULL) {
1547 1637 iovp->iov_base = (caddr_t)m->b_rptr;
1548 1638 iovp->iov_len = (m->b_wptr - m->b_rptr);
1549 1639 rcount -= iovp->iov_len;
1550 1640 if (rcount < 0)
1551 1641 iovp->iov_len += rcount;
1552 1642 iovp++;
1553 1643 if (rcount <= 0)
1554 1644 break;
1555 1645 m = m->b_cont;
1556 1646 }
1557 1647 }
1558 1648 count += trp->wa->wa_count;
1559 1649 trp = trp->list;
1560 1650 } while (trp != lrp);
1561 1651
1562 1652 uio.uio_iov = niovp;
1563 1653 uio.uio_iovcnt = iovcnt;
1564 1654 uio.uio_segflg = UIO_SYSSPACE;
1565 1655 uio.uio_extflg = UIO_COPY_DEFAULT;
1566 1656 uio.uio_loffset = (offset_t)rp->wa->wa_offset;
1567 1657 uio.uio_resid = count;
1568 1658 /*
1569 1659 * The limit is checked on the client. We
1570 1660 * should allow any size writes here.
1571 1661 */
1572 1662 uio.uio_llimit = curproc->p_fsz_ctl;
1573 1663 rlimit = uio.uio_llimit - rp->wa->wa_offset;
1574 1664 if (rlimit < (rlim64_t)uio.uio_resid)
1575 1665 uio.uio_resid = (uint_t)rlimit;
1576 1666
1577 1667 /*
1578 1668 * For now we assume no append mode.
1579 1669 */
1580 1670
1581 1671 /*
1582 1672 * We're changing creds because VM may fault
1583 1673 * and we need the cred of the current
1584 1674 * thread to be used if quota * checking is
1585 1675 * enabled.
1586 1676 */
1587 1677 savecred = curthread->t_cred;
1588 1678 curthread->t_cred = cr;
1589 1679 error = VOP_WRITE(vp, &uio, 0, rp->cr, &ct);
1590 1680 curthread->t_cred = savecred;
1591 1681
1592 1682 /* check if a monitor detected a delegation conflict */
1593 1683 if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
1594 1684 /* mark as wouldblock so response is dropped */
1595 1685 curthread->t_flag |= T_WOULDBLOCK;
1596 1686
1597 1687 if (niovp != iov)
1598 1688 kmem_free(niovp, sizeof (*niovp) * iovcnt);
1599 1689
1600 1690 if (!error) {
1601 1691 data_written = 1;
1602 1692 /*
1603 1693 * Get attributes again so we send the latest mod
1604 1694 * time to the client side for its cache.
1605 1695 */
1606 1696 va.va_mask = AT_ALL; /* now we want everything */
1607 1697
1608 1698 error = VOP_GETATTR(vp, &va, 0, rp->cr, &ct);
1609 1699
1610 1700 if (!error)
1611 1701 acl_perm(vp, exi, &va, rp->cr);
1612 1702 }
1613 1703
1614 1704 /*
1615 1705 * Fill in the status responses for each request
1616 1706 * which was just handled. Also, copy the latest
1617 1707 * attributes in to the attribute responses if
1618 1708 * appropriate.
1619 1709 */
1620 1710 t_flag = curthread->t_flag & T_WOULDBLOCK;
1621 1711 do {
1622 1712 rp->thread->t_flag |= t_flag;
1623 1713 /* check for overflows */
1624 1714 if (!error) {
1625 1715 error = vattr_to_nattr(&va, &rp->ns->ns_attr);
1626 1716 }
1627 1717 rp->ns->ns_status = puterrno(error);
1628 1718 rp = rp->list;
1629 1719 } while (rp != lrp);
1630 1720 } while (rp != NULL);
1631 1721
1632 1722 /*
1633 1723 * If any data was written at all, then we need to flush
1634 1724 * the data and metadata to stable storage.
1635 1725 */
1636 1726 if (data_written) {
1637 1727 error = VOP_PUTPAGE(vp, (u_offset_t)off, len, 0, cr, &ct);
1638 1728
1639 1729 if (!error) {
1640 1730 error = VOP_FSYNC(vp, FNODSYNC, cr, &ct);
1641 1731 }
1642 1732 }
1643 1733
1644 1734 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1645 1735
1646 1736 if (in_crit)
1647 1737 nbl_end_crit(vp);
1648 1738 VN_RELE(vp);
1649 1739
1650 1740 t_flag = curthread->t_flag & T_WOULDBLOCK;
1651 1741 mutex_enter(&rfs_async_write_lock);
1652 1742 for (rp = nlp->list; rp != NULL; rp = rp->list) {
1653 1743 if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1654 1744 rp->ns->ns_status = puterrno(error);
1655 1745 rp->thread->t_flag |= t_flag;
1656 1746 }
1657 1747 }
1658 1748 cv_broadcast(&nlp->cv);
1659 1749 mutex_exit(&rfs_async_write_lock);
1660 1750
1661 1751 }
1662 1752
1663 1753 void *
1664 1754 rfs_write_getfh(struct nfswriteargs *wa)
1665 1755 {
1666 1756 return (&wa->wa_fhandle);
1667 1757 }
1668 1758
1669 1759 /*
1670 1760 * Create a file.
1671 1761 * Creates a file with given attributes and returns those attributes
1672 1762 * and an fhandle for the new file.
1673 1763 */
1674 1764 void
1675 1765 rfs_create(struct nfscreatargs *args, struct nfsdiropres *dr,
1676 1766 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1677 1767 {
1678 1768 int error;
1679 1769 int lookuperr;
1680 1770 int in_crit = 0;
1681 1771 struct vattr va;
1682 1772 vnode_t *vp;
1683 1773 vnode_t *realvp;
1684 1774 vnode_t *dvp;
1685 1775 char *name = args->ca_da.da_name;
1686 1776 vnode_t *tvp = NULL;
1687 1777 int mode;
1688 1778 int lookup_ok;
1689 1779 bool_t trunc;
1690 1780 struct sockaddr *ca;
1691 1781
1692 1782 /*
1693 1783 * Disallow NULL paths
1694 1784 */
1695 1785 if (name == NULL || *name == '\0') {
1696 1786 dr->dr_status = NFSERR_ACCES;
1697 1787 return;
1698 1788 }
1699 1789
1700 1790 dvp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
1701 1791 if (dvp == NULL) {
1702 1792 dr->dr_status = NFSERR_STALE;
1703 1793 return;
1704 1794 }
1705 1795
1706 1796 error = sattr_to_vattr(args->ca_sa, &va);
1707 1797 if (error) {
1708 1798 dr->dr_status = puterrno(error);
1709 1799 return;
1710 1800 }
1711 1801
1712 1802 /*
1713 1803 * Must specify the mode.
1714 1804 */
1715 1805 if (!(va.va_mask & AT_MODE)) {
1716 1806 VN_RELE(dvp);
1717 1807 dr->dr_status = NFSERR_INVAL;
1718 1808 return;
1719 1809 }
1720 1810
1721 1811 /*
1722 1812 * This is a completely gross hack to make mknod
1723 1813 * work over the wire until we can wack the protocol
1724 1814 */
1725 1815 if ((va.va_mode & IFMT) == IFCHR) {
1726 1816 if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1727 1817 va.va_type = VFIFO; /* xtra kludge for named pipe */
1728 1818 else {
1729 1819 va.va_type = VCHR;
1730 1820 /*
1731 1821 * uncompress the received dev_t
1732 1822 * if the top half is zero indicating a request
1733 1823 * from an `older style' OS.
1734 1824 */
1735 1825 if ((va.va_size & 0xffff0000) == 0)
1736 1826 va.va_rdev = nfsv2_expdev(va.va_size);
1737 1827 else
1738 1828 va.va_rdev = (dev_t)va.va_size;
1739 1829 }
1740 1830 va.va_mask &= ~AT_SIZE;
1741 1831 } else if ((va.va_mode & IFMT) == IFBLK) {
1742 1832 va.va_type = VBLK;
1743 1833 /*
1744 1834 * uncompress the received dev_t
1745 1835 * if the top half is zero indicating a request
1746 1836 * from an `older style' OS.
1747 1837 */
1748 1838 if ((va.va_size & 0xffff0000) == 0)
1749 1839 va.va_rdev = nfsv2_expdev(va.va_size);
1750 1840 else
1751 1841 va.va_rdev = (dev_t)va.va_size;
1752 1842 va.va_mask &= ~AT_SIZE;
1753 1843 } else if ((va.va_mode & IFMT) == IFSOCK) {
1754 1844 va.va_type = VSOCK;
1755 1845 } else {
1756 1846 va.va_type = VREG;
1757 1847 }
1758 1848 va.va_mode &= ~IFMT;
1759 1849 va.va_mask |= AT_TYPE;
1760 1850
1761 1851 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
1762 1852 name = nfscmd_convname(ca, exi, name, NFSCMD_CONV_INBOUND,
1763 1853 MAXPATHLEN);
1764 1854 if (name == NULL) {
1765 1855 dr->dr_status = puterrno(EINVAL);
1766 1856 return;
1767 1857 }
1768 1858
1769 1859 /*
1770 1860 * Why was the choice made to use VWRITE as the mode to the
1771 1861 * call to VOP_CREATE ? This results in a bug. When a client
1772 1862 * opens a file that already exists and is RDONLY, the second
1773 1863 * open fails with an EACESS because of the mode.
1774 1864 * bug ID 1054648.
1775 1865 */
1776 1866 lookup_ok = 0;
1777 1867 mode = VWRITE;
1778 1868 if (!(va.va_mask & AT_SIZE) || va.va_type != VREG) {
1779 1869 error = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1780 1870 NULL, NULL, NULL);
1781 1871 if (!error) {
1782 1872 struct vattr at;
1783 1873
1784 1874 lookup_ok = 1;
1785 1875 at.va_mask = AT_MODE;
1786 1876 error = VOP_GETATTR(tvp, &at, 0, cr, NULL);
1787 1877 if (!error)
1788 1878 mode = (at.va_mode & S_IWUSR) ? VWRITE : VREAD;
1789 1879 VN_RELE(tvp);
1790 1880 tvp = NULL;
1791 1881 }
1792 1882 }
1793 1883
1794 1884 if (!lookup_ok) {
1795 1885 if (rdonly(ro, dvp)) {
1796 1886 error = EROFS;
1797 1887 } else if (va.va_type != VREG && va.va_type != VFIFO &&
1798 1888 va.va_type != VSOCK && secpolicy_sys_devices(cr) != 0) {
1799 1889 error = EPERM;
1800 1890 } else {
1801 1891 error = 0;
1802 1892 }
1803 1893 }
1804 1894
1805 1895 /*
1806 1896 * If file size is being modified on an already existing file
1807 1897 * make sure that there are no conflicting non-blocking mandatory
1808 1898 * locks in the region being manipulated. Return EACCES if there
1809 1899 * are conflicting locks.
1810 1900 */
1811 1901 if (!error && (va.va_type == VREG) && (va.va_mask & AT_SIZE)) {
1812 1902 lookuperr = VOP_LOOKUP(dvp, name, &tvp, NULL, 0, NULL, cr,
1813 1903 NULL, NULL, NULL);
1814 1904
1815 1905 if (!lookuperr &&
1816 1906 rfs4_check_delegated(FWRITE, tvp, va.va_size == 0)) {
1817 1907 VN_RELE(tvp);
1818 1908 curthread->t_flag |= T_WOULDBLOCK;
1819 1909 goto out;
1820 1910 }
1821 1911
1822 1912 if (!lookuperr && nbl_need_check(tvp)) {
1823 1913 /*
1824 1914 * The file exists. Now check if it has any
1825 1915 * conflicting non-blocking mandatory locks
1826 1916 * in the region being changed.
1827 1917 */
1828 1918 struct vattr bva;
1829 1919 u_offset_t offset;
1830 1920 ssize_t length;
1831 1921
1832 1922 nbl_start_crit(tvp, RW_READER);
1833 1923 in_crit = 1;
1834 1924
1835 1925 bva.va_mask = AT_SIZE;
1836 1926 error = VOP_GETATTR(tvp, &bva, 0, cr, NULL);
1837 1927 if (!error) {
1838 1928 if (va.va_size < bva.va_size) {
1839 1929 offset = va.va_size;
1840 1930 length = bva.va_size - va.va_size;
1841 1931 } else {
1842 1932 offset = bva.va_size;
1843 1933 length = va.va_size - bva.va_size;
1844 1934 }
1845 1935 if (length) {
1846 1936 if (nbl_conflict(tvp, NBL_WRITE,
1847 1937 offset, length, 0, NULL)) {
1848 1938 error = EACCES;
1849 1939 }
1850 1940 }
1851 1941 }
1852 1942 if (error) {
1853 1943 nbl_end_crit(tvp);
1854 1944 VN_RELE(tvp);
1855 1945 in_crit = 0;
1856 1946 }
1857 1947 } else if (tvp != NULL) {
1858 1948 VN_RELE(tvp);
1859 1949 }
1860 1950 }
1861 1951
1862 1952 if (!error) {
1863 1953 /*
1864 1954 * If filesystem is shared with nosuid the remove any
1865 1955 * setuid/setgid bits on create.
1866 1956 */
1867 1957 if (va.va_type == VREG &&
1868 1958 exi->exi_export.ex_flags & EX_NOSUID)
1869 1959 va.va_mode &= ~(VSUID | VSGID);
1870 1960
1871 1961 error = VOP_CREATE(dvp, name, &va, NONEXCL, mode, &vp, cr, 0,
1872 1962 NULL, NULL);
1873 1963
1874 1964 if (!error) {
1875 1965
1876 1966 if ((va.va_mask & AT_SIZE) && (va.va_size == 0))
1877 1967 trunc = TRUE;
1878 1968 else
1879 1969 trunc = FALSE;
1880 1970
1881 1971 if (rfs4_check_delegated(FWRITE, vp, trunc)) {
1882 1972 VN_RELE(vp);
1883 1973 curthread->t_flag |= T_WOULDBLOCK;
1884 1974 goto out;
1885 1975 }
1886 1976 va.va_mask = AT_ALL;
1887 1977
1888 1978 error = VOP_GETATTR(vp, &va, 0, cr, NULL);
1889 1979
1890 1980 /* check for overflows */
1891 1981 if (!error) {
1892 1982 acl_perm(vp, exi, &va, cr);
1893 1983 error = vattr_to_nattr(&va, &dr->dr_attr);
1894 1984 if (!error) {
1895 1985 error = makefh(&dr->dr_fhandle, vp,
1896 1986 exi);
1897 1987 }
1898 1988 }
1899 1989 /*
1900 1990 * Force modified metadata out to stable storage.
1901 1991 *
1902 1992 * if a underlying vp exists, pass it to VOP_FSYNC
1903 1993 */
1904 1994 if (VOP_REALVP(vp, &realvp, NULL) == 0)
1905 1995 (void) VOP_FSYNC(realvp, FNODSYNC, cr, NULL);
1906 1996 else
1907 1997 (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
1908 1998 VN_RELE(vp);
1909 1999 }
1910 2000
1911 2001 if (in_crit) {
1912 2002 nbl_end_crit(tvp);
1913 2003 VN_RELE(tvp);
1914 2004 }
1915 2005 }
1916 2006
1917 2007 /*
1918 2008 * Force modified data and metadata out to stable storage.
1919 2009 */
1920 2010 (void) VOP_FSYNC(dvp, 0, cr, NULL);
1921 2011
1922 2012 out:
1923 2013
1924 2014 VN_RELE(dvp);
1925 2015
1926 2016 dr->dr_status = puterrno(error);
1927 2017
1928 2018 if (name != args->ca_da.da_name)
1929 2019 kmem_free(name, MAXPATHLEN);
1930 2020 }
1931 2021 void *
1932 2022 rfs_create_getfh(struct nfscreatargs *args)
1933 2023 {
1934 2024 return (args->ca_da.da_fhandle);
1935 2025 }
1936 2026
1937 2027 /*
1938 2028 * Remove a file.
1939 2029 * Remove named file from parent directory.
1940 2030 */
1941 2031 /* ARGSUSED */
1942 2032 void
1943 2033 rfs_remove(struct nfsdiropargs *da, enum nfsstat *status,
1944 2034 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
1945 2035 {
1946 2036 int error = 0;
1947 2037 vnode_t *vp;
1948 2038 vnode_t *targvp;
1949 2039 int in_crit = 0;
1950 2040
1951 2041 /*
1952 2042 * Disallow NULL paths
1953 2043 */
1954 2044 if (da->da_name == NULL || *da->da_name == '\0') {
1955 2045 *status = NFSERR_ACCES;
1956 2046 return;
1957 2047 }
1958 2048
1959 2049 vp = nfs_fhtovp(da->da_fhandle, exi);
1960 2050 if (vp == NULL) {
1961 2051 *status = NFSERR_STALE;
1962 2052 return;
1963 2053 }
1964 2054
1965 2055 if (rdonly(ro, vp)) {
1966 2056 VN_RELE(vp);
1967 2057 *status = NFSERR_ROFS;
1968 2058 return;
1969 2059 }
1970 2060
1971 2061 /*
1972 2062 * Check for a conflict with a non-blocking mandatory share reservation.
1973 2063 */
1974 2064 error = VOP_LOOKUP(vp, da->da_name, &targvp, NULL, 0,
1975 2065 NULL, cr, NULL, NULL, NULL);
1976 2066 if (error != 0) {
1977 2067 VN_RELE(vp);
1978 2068 *status = puterrno(error);
1979 2069 return;
1980 2070 }
1981 2071
1982 2072 /*
1983 2073 * If the file is delegated to an v4 client, then initiate
1984 2074 * recall and drop this request (by setting T_WOULDBLOCK).
1985 2075 * The client will eventually re-transmit the request and
1986 2076 * (hopefully), by then, the v4 client will have returned
1987 2077 * the delegation.
1988 2078 */
1989 2079
1990 2080 if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
1991 2081 VN_RELE(vp);
1992 2082 VN_RELE(targvp);
1993 2083 curthread->t_flag |= T_WOULDBLOCK;
1994 2084 return;
1995 2085 }
1996 2086
1997 2087 if (nbl_need_check(targvp)) {
1998 2088 nbl_start_crit(targvp, RW_READER);
1999 2089 in_crit = 1;
2000 2090 if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, NULL)) {
2001 2091 error = EACCES;
2002 2092 goto out;
2003 2093 }
2004 2094 }
2005 2095
2006 2096 error = VOP_REMOVE(vp, da->da_name, cr, NULL, 0);
2007 2097
2008 2098 /*
2009 2099 * Force modified data and metadata out to stable storage.
2010 2100 */
2011 2101 (void) VOP_FSYNC(vp, 0, cr, NULL);
2012 2102
2013 2103 out:
2014 2104 if (in_crit)
2015 2105 nbl_end_crit(targvp);
2016 2106 VN_RELE(targvp);
2017 2107 VN_RELE(vp);
2018 2108
2019 2109 *status = puterrno(error);
2020 2110
2021 2111 }
2022 2112
2023 2113 void *
2024 2114 rfs_remove_getfh(struct nfsdiropargs *da)
2025 2115 {
2026 2116 return (da->da_fhandle);
2027 2117 }
2028 2118
/*
 * rename a file
 * Give a file (from) a new name (to).
 *
 * Both directory file handles must resolve within the same export;
 * cross-export renames are rejected with NFSERR_XDEV.  Interacts with
 * NFSv4 delegations: if either the source file or an existing target is
 * delegated, recall is initiated and the request is dropped by setting
 * T_WOULDBLOCK so the client retransmits later.
 */
/* ARGSUSED */
void
rfs_rename(struct nfsrnmargs *args, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error = 0;
	vnode_t *fromvp;	/* source directory */
	vnode_t *tovp;		/* target directory */
	struct exportinfo *to_exi;
	fhandle_t *fh;
	vnode_t *srcvp;		/* the file being renamed */
	vnode_t *targvp;	/* existing target entry, if any */
	int in_crit = 0;	/* set once srcvp enters an NBL critical region */

	/* Resolve the source directory from its file handle. */
	fromvp = nfs_fhtovp(args->rna_from.da_fhandle, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	/* The target handle must refer to an exported file system ... */
	fh = args->rna_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}
	exi_rele(to_exi);

	/*
	 * ... and to the same export as the source.  The hold returned by
	 * checkexport() was dropped above because only the pointer
	 * comparison is needed here.
	 */
	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		return;
	}

	tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		return;
	}

	/* Both handles must name directories. */
	if (fromvp->v_type != VDIR || tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		return;
	}

	/*
	 * Disallow NULL paths
	 */
	if (args->rna_from.da_name == NULL || *args->rna_from.da_name == '\0' ||
	    args->rna_to.da_name == NULL || *args->rna_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}

	if (rdonly(ro, tovp)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * Check for a conflict with a non-blocking mandatory share reservation.
	 */
	error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
	    NULL, cr, NULL, NULL, NULL);
	if (error != 0) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = puterrno(error);
		return;
	}

	/* Check for delegations on the source file */

	if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		VN_RELE(srcvp);
		/* Drop the request; the client will retransmit. */
		curthread->t_flag |= T_WOULDBLOCK;
		return;
	}

	/* Check for delegation on the file being renamed over, if it exists */

	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
	    VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
	    NULL, NULL, NULL) == 0) {

		if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
			VN_RELE(tovp);
			VN_RELE(fromvp);
			VN_RELE(srcvp);
			VN_RELE(targvp);
			curthread->t_flag |= T_WOULDBLOCK;
			return;
		}
		VN_RELE(targvp);
	}


	if (nbl_need_check(srcvp)) {
		nbl_start_crit(srcvp, RW_READER);
		in_crit = 1;
		if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, NULL)) {
			error = EACCES;
			goto out;
		}
	}

	error = VOP_RENAME(fromvp, args->rna_from.da_name,
	    tovp, args->rna_to.da_name, cr, NULL, 0);

	/* On success, update the vnode's cached pathname to the new name. */
	if (error == 0)
		vn_renamepath(tovp, srcvp, args->rna_to.da_name,
		    strlen(args->rna_to.da_name));

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, 0, cr, NULL);

out:
	if (in_crit)
		nbl_end_crit(srcvp);
	VN_RELE(srcvp);
	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

}
2172 2262 void *
2173 2263 rfs_rename_getfh(struct nfsrnmargs *args)
2174 2264 {
2175 2265 return (args->rna_from.da_fhandle);
2176 2266 }
2177 2267
/*
 * Link to a file.
 * Create a file (to) which is a hard link to the given file (from).
 *
 * Both file handles must resolve within the same export; hard links may
 * not cross export boundaries (NFSERR_XDEV).
 */
/* ARGSUSED */
void
rfs_link(struct nfslinkargs *args, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *fromvp;	/* existing (source) file */
	vnode_t *tovp;		/* directory receiving the new link */
	struct exportinfo *to_exi;
	fhandle_t *fh;

	fromvp = nfs_fhtovp(args->la_from, exi);
	if (fromvp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	/* The target directory handle must refer to an exported fs ... */
	fh = args->la_to.da_fhandle;
	to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
	if (to_exi == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}
	exi_rele(to_exi);

	/*
	 * ... and to the same export as the source.  The checkexport()
	 * hold was dropped above; only the pointer comparison is needed.
	 */
	if (to_exi != exi) {
		VN_RELE(fromvp);
		*status = NFSERR_XDEV;
		return;
	}

	tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
	if (tovp == NULL) {
		VN_RELE(fromvp);
		*status = NFSERR_STALE;
		return;
	}

	/* The parent of the new link must be a directory. */
	if (tovp->v_type != VDIR) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_NOTDIR;
		return;
	}
	/*
	 * Disallow NULL paths
	 */
	if (args->la_to.da_name == NULL || *args->la_to.da_name == '\0') {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ACCES;
		return;
	}

	if (rdonly(ro, tovp)) {
		VN_RELE(tovp);
		VN_RELE(fromvp);
		*status = NFSERR_ROFS;
		return;
	}

	error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 * NOTE(review): the source is synced with FNODSYNC, presumably
	 * because only its attributes (link count) changed -- confirm
	 * against VOP_FSYNC(9E).
	 */
	(void) VOP_FSYNC(tovp, 0, cr, NULL);
	(void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);

	VN_RELE(tovp);
	VN_RELE(fromvp);

	*status = puterrno(error);

}
2258 2348 void *
2259 2349 rfs_link_getfh(struct nfslinkargs *args)
2260 2350 {
2261 2351 return (args->la_from);
2262 2352 }
2263 2353
2264 2354 /*
2265 2355 * Symbolicly link to a file.
2266 2356 * Create a file (to) with the given attributes which is a symbolic link
2267 2357 * to the given path name (to).
2268 2358 */
2269 2359 void
2270 2360 rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2271 2361 struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2272 2362 {
2273 2363 int error;
2274 2364 struct vattr va;
2275 2365 vnode_t *vp;
2276 2366 vnode_t *svp;
2277 2367 int lerror;
2278 2368 struct sockaddr *ca;
2279 2369 char *name = NULL;
2280 2370
2281 2371 /*
2282 2372 * Disallow NULL paths
2283 2373 */
2284 2374 if (args->sla_from.da_name == NULL || *args->sla_from.da_name == '\0') {
2285 2375 *status = NFSERR_ACCES;
2286 2376 return;
2287 2377 }
2288 2378
2289 2379 vp = nfs_fhtovp(args->sla_from.da_fhandle, exi);
2290 2380 if (vp == NULL) {
2291 2381 *status = NFSERR_STALE;
2292 2382 return;
2293 2383 }
2294 2384
2295 2385 if (rdonly(ro, vp)) {
2296 2386 VN_RELE(vp);
2297 2387 *status = NFSERR_ROFS;
2298 2388 return;
2299 2389 }
2300 2390
2301 2391 error = sattr_to_vattr(args->sla_sa, &va);
2302 2392 if (error) {
2303 2393 VN_RELE(vp);
2304 2394 *status = puterrno(error);
2305 2395 return;
2306 2396 }
2307 2397
2308 2398 if (!(va.va_mask & AT_MODE)) {
2309 2399 VN_RELE(vp);
2310 2400 *status = NFSERR_INVAL;
2311 2401 return;
2312 2402 }
2313 2403
2314 2404 ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2315 2405 name = nfscmd_convname(ca, exi, args->sla_tnm,
2316 2406 NFSCMD_CONV_INBOUND, MAXPATHLEN);
2317 2407
2318 2408 if (name == NULL) {
2319 2409 *status = NFSERR_ACCES;
2320 2410 return;
2321 2411 }
2322 2412
2323 2413 va.va_type = VLNK;
2324 2414 va.va_mask |= AT_TYPE;
2325 2415
2326 2416 error = VOP_SYMLINK(vp, args->sla_from.da_name, &va, name, cr, NULL, 0);
2327 2417
2328 2418 /*
2329 2419 * Force new data and metadata out to stable storage.
2330 2420 */
2331 2421 lerror = VOP_LOOKUP(vp, args->sla_from.da_name, &svp, NULL, 0,
2332 2422 NULL, cr, NULL, NULL, NULL);
2333 2423
2334 2424 if (!lerror) {
2335 2425 (void) VOP_FSYNC(svp, 0, cr, NULL);
2336 2426 VN_RELE(svp);
2337 2427 }
2338 2428
2339 2429 /*
2340 2430 * Force modified data and metadata out to stable storage.
2341 2431 */
2342 2432 (void) VOP_FSYNC(vp, 0, cr, NULL);
2343 2433
2344 2434 VN_RELE(vp);
2345 2435
2346 2436 *status = puterrno(error);
2347 2437 if (name != args->sla_tnm)
2348 2438 kmem_free(name, MAXPATHLEN);
2349 2439
2350 2440 }
2351 2441 void *
2352 2442 rfs_symlink_getfh(struct nfsslargs *args)
2353 2443 {
2354 2444 return (args->sla_from.da_fhandle);
2355 2445 }
2356 2446
/*
 * Make a directory.
 * Create a directory with the given name, parent directory, and attributes.
 * Returns a file handle and attributes for the new directory.
 */
/* ARGSUSED */
void
rfs_mkdir(struct nfscreatargs *args, struct nfsdiropres *dr,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	struct vattr va;
	vnode_t *dvp = NULL;	/* the newly created directory */
	vnode_t *vp;		/* the parent directory */
	char *name = args->ca_da.da_name;

	/*
	 * Disallow NULL paths
	 */
	if (name == NULL || *name == '\0') {
		dr->dr_status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(args->ca_da.da_fhandle, exi);
	if (vp == NULL) {
		dr->dr_status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_ROFS;
		return;
	}

	error = sattr_to_vattr(args->ca_sa, &va);
	if (error) {
		VN_RELE(vp);
		dr->dr_status = puterrno(error);
		return;
	}

	/* The client must supply an initial mode. */
	if (!(va.va_mask & AT_MODE)) {
		VN_RELE(vp);
		dr->dr_status = NFSERR_INVAL;
		return;
	}

	va.va_type = VDIR;
	va.va_mask |= AT_TYPE;

	error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);

	if (!error) {
		/*
		 * Attributes of the newly created directory should
		 * be returned to the client.
		 */
		va.va_mask = AT_ALL;	/* We want everything */
		error = VOP_GETATTR(dvp, &va, 0, cr, NULL);

		/* check for overflows */
		if (!error) {
			/* Adjust reported permissions for non-trivial ACLs. */
			acl_perm(vp, exi, &va, cr);
			error = vattr_to_nattr(&va, &dr->dr_attr);
			if (!error) {
				error = makefh(&dr->dr_fhandle, dvp, exi);
			}
		}
		/*
		 * Force new data and metadata out to stable storage.
		 */
		(void) VOP_FSYNC(dvp, 0, cr, NULL);
		VN_RELE(dvp);
	}

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	dr->dr_status = puterrno(error);

}
2444 2534 void *
2445 2535 rfs_mkdir_getfh(struct nfscreatargs *args)
2446 2536 {
2447 2537 return (args->ca_da.da_fhandle);
2448 2538 }
2449 2539
/*
 * Remove a directory.
 * Remove the given directory name from the given parent directory.
 */
/* ARGSUSED */
void
rfs_rmdir(struct nfsdiropargs *da, enum nfsstat *status,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	vnode_t *vp;	/* the parent directory */

	/*
	 * Disallow NULL paths
	 */
	if (da->da_name == NULL || *da->da_name == '\0') {
		*status = NFSERR_ACCES;
		return;
	}

	vp = nfs_fhtovp(da->da_fhandle, exi);
	if (vp == NULL) {
		*status = NFSERR_STALE;
		return;
	}

	if (rdonly(ro, vp)) {
		VN_RELE(vp);
		*status = NFSERR_ROFS;
		return;
	}

	/*
	 * VOP_RMDIR takes a third argument (the current
	 * directory of the process).  That's because someone
	 * wants to return EINVAL if one tries to remove ".".
	 * Of course, NFS servers have no idea what their
	 * clients' current directories are.  We fake it by
	 * supplying a vnode known to exist and illegal to
	 * remove.
	 */
	error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);

	/*
	 * Force modified data and metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, 0, cr, NULL);

	VN_RELE(vp);

	/*
	 * System V defines rmdir to return EEXIST, not ENOTEMPTY,
	 * if the directory is not empty.  A System V NFS server
	 * needs to map NFSERR_EXIST to NFSERR_NOTEMPTY to transmit
	 * over the wire.
	 */
	if (error == EEXIST)
		*status = NFSERR_NOTEMPTY;
	else
		*status = puterrno(error);

}
2512 2602 void *
2513 2603 rfs_rmdir_getfh(struct nfsdiropargs *da)
2514 2604 {
2515 2605 return (da->da_fhandle);
2516 2606 }
2517 2607
/*
 * Read directory entries.
 * Reads up to rda_count bytes of dirent64 records starting at rda_offset
 * and converts entry names for the client's character set; the result
 * buffer is freed later by rfs_rddirfree().
 */
/* ARGSUSED */
void
rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
    struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
{
	int error;
	int iseof;		/* set by VOP_READDIR at end of directory */
	struct iovec iov;
	struct uio uio;
	vnode_t *vp;
	char *ndata = NULL;	/* character-converted entry buffer, if any */
	struct sockaddr *ca;
	size_t nents;
	int ret;

	vp = nfs_fhtovp(&rda->rda_fh, exi);
	if (vp == NULL) {
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_STALE;
		return;
	}

	if (vp->v_type != VDIR) {
		VN_RELE(vp);
		rd->rd_entries = NULL;
		rd->rd_status = NFSERR_NOTDIR;
		return;
	}

	/* Hold the directory read-locked across the access check and read. */
	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);

	error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);

	if (error) {
		rd->rd_entries = NULL;
		goto bad;
	}

	/* A zero-byte request returns no entries and is not EOF. */
	if (rda->rda_count == 0) {
		rd->rd_entries = NULL;
		rd->rd_size = 0;
		rd->rd_eof = FALSE;
		goto bad;
	}

	/* Clamp the transfer to the v2 protocol maximum. */
	rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);

	/*
	 * Allocate data for entries.  This will be freed by rfs_rddirfree.
	 */
	rd->rd_bufsize = (uint_t)rda->rda_count;
	rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);

	/*
	 * Set up io vector to read directory data
	 */
	iov.iov_base = (caddr_t)rd->rd_entries;
	iov.iov_len = rda->rda_count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_extflg = UIO_COPY_CACHED;
	uio.uio_loffset = (offset_t)rda->rda_offset;
	uio.uio_resid = rda->rda_count;

	/*
	 * read directory
	 */
	error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);

	/*
	 * Clean up
	 */
	if (!error) {
		/*
		 * set size and eof
		 */
		if (uio.uio_resid == rda->rda_count) {
			/* Nothing was read: the offset is at/past EOF. */
			rd->rd_size = 0;
			rd->rd_eof = TRUE;
		} else {
			rd->rd_size = (uint32_t)(rda->rda_count -
			    uio.uio_resid);
			rd->rd_eof = iseof ? TRUE : FALSE;
		}
	}

	/*
	 * NOTE(review): the conversion below runs even when VOP_READDIR
	 * failed, in which case rd->rd_size has not been assigned by this
	 * function -- confirm the dispatcher zeroes the result struct.
	 */
	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
	nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
	ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
	    rda->rda_count, &ndata);

	if (ret != 0) {
		size_t dropbytes;
		/*
		 * We had to drop one or more entries in order to fit
		 * during the character conversion.  We need to patch
		 * up the size and eof info.
		 */
		if (rd->rd_eof)
			rd->rd_eof = FALSE;
		dropbytes = nfscmd_dropped_entrysize(
		    (struct dirent64 *)rd->rd_entries, nents, ret);
		rd->rd_size -= dropbytes;
	}
	if (ndata == NULL) {
		/* No conversion was needed; keep the original buffer. */
		ndata = (char *)rd->rd_entries;
	} else if (ndata != (char *)rd->rd_entries) {
		/* Conversion allocated a new buffer; swap it in. */
		kmem_free(rd->rd_entries, rd->rd_bufsize);
		rd->rd_entries = (void *)ndata;
		rd->rd_bufsize = rda->rda_count;
	}

bad:
	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);

#if 0 /* notyet */
	/*
	 * Don't do this.  It causes local disk writes when just
	 * reading the file and the overhead is deemed larger
	 * than the benefit.
	 */
	/*
	 * Force modified metadata out to stable storage.
	 */
	(void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
#endif

	VN_RELE(vp);

	rd->rd_status = puterrno(error);

}
2651 2741 void *
2652 2742 rfs_readdir_getfh(struct nfsrddirargs *rda)
2653 2743 {
2654 2744 return (&rda->rda_fh);
2655 2745 }
2656 2746 void
2657 2747 rfs_rddirfree(struct nfsrddirres *rd)
2658 2748 {
2659 2749 if (rd->rd_entries != NULL)
2660 2750 kmem_free(rd->rd_entries, rd->rd_bufsize);
2661 2751 }
2662 2752
2663 2753 /* ARGSUSED */
2664 2754 void
2665 2755 rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2666 2756 struct svc_req *req, cred_t *cr, bool_t ro)
2667 2757 {
2668 2758 int error;
2669 2759 struct statvfs64 sb;
2670 2760 vnode_t *vp;
2671 2761
2672 2762 vp = nfs_fhtovp(fh, exi);
2673 2763 if (vp == NULL) {
2674 2764 fs->fs_status = NFSERR_STALE;
2675 2765 return;
2676 2766 }
2677 2767
2678 2768 error = VFS_STATVFS(vp->v_vfsp, &sb);
2679 2769
2680 2770 if (!error) {
2681 2771 fs->fs_tsize = nfstsize();
2682 2772 fs->fs_bsize = sb.f_frsize;
2683 2773 fs->fs_blocks = sb.f_blocks;
2684 2774 fs->fs_bfree = sb.f_bfree;
2685 2775 fs->fs_bavail = sb.f_bavail;
2686 2776 }
2687 2777
2688 2778 VN_RELE(vp);
2689 2779
2690 2780 fs->fs_status = puterrno(error);
2691 2781
2692 2782 }
/*
 * STATFS operates directly on the request's file handle; hand it to the
 * dispatcher to locate the export.
 */
void *
rfs_statfs_getfh(fhandle_t *fh)
{
	return (fh);
}
2698 2788
/*
 * Convert an NFSv2 over-the-wire sattr into a vattr.  A va_mask bit is
 * set only for fields the client actually supplied; in the v2 protocol
 * an all-ones value means "don't set this field".
 *
 * Returns 0 on success, or EOVERFLOW on 32-bit kernels when a supplied
 * time value cannot be represented in time_t.
 */
static int
sattr_to_vattr(struct nfssattr *sa, struct vattr *vap)
{
	vap->va_mask = 0;

	/*
	 * There was a sign extension bug in some VFS based systems
	 * which stored the mode as a short.  When it would get
	 * assigned to a u_long, no sign extension would occur.
	 * It needed to, but this wasn't noticed because sa_mode
	 * would then get assigned back to the short, thus ignoring
	 * the upper 16 bits of sa_mode.
	 *
	 * To make this implementation work for both broken
	 * clients and good clients, we check for both versions
	 * of the mode.
	 */
	if (sa->sa_mode != (uint32_t)((ushort_t)-1) &&
	    sa->sa_mode != (uint32_t)-1) {
		vap->va_mask |= AT_MODE;
		vap->va_mode = sa->sa_mode;
	}
	if (sa->sa_uid != (uint32_t)-1) {
		vap->va_mask |= AT_UID;
		vap->va_uid = sa->sa_uid;
	}
	if (sa->sa_gid != (uint32_t)-1) {
		vap->va_mask |= AT_GID;
		vap->va_gid = sa->sa_gid;
	}
	if (sa->sa_size != (uint32_t)-1) {
		vap->va_mask |= AT_SIZE;
		vap->va_size = sa->sa_size;
	}
	if (sa->sa_atime.tv_sec != (int32_t)-1 &&
	    sa->sa_atime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_atime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_ATIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, sa->sa_atime.tv_sec);
		vap->va_atime.tv_nsec = (uint32_t)(sa->sa_atime.tv_usec * 1000);
	}
	if (sa->sa_mtime.tv_sec != (int32_t)-1 &&
	    sa->sa_mtime.tv_usec != (int32_t)-1) {
#ifndef _LP64
		/* return error if time overflow */
		if (!NFS2_TIME_OK(sa->sa_mtime.tv_sec))
			return (EOVERFLOW);
#endif
		vap->va_mask |= AT_MTIME;
		/*
		 * nfs protocol defines times as unsigned so don't extend sign,
		 * unless sysadmin set nfs_allow_preepoch_time.
		 */
		NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
		vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
	}
	return (0);
}
2765 2855
/*
 * Map vtype_t (used as the index; see the ASSERT in vattr_to_nattr) to
 * the NFSv2 over-the-wire file type.  Types with no v2 representation
 * map to 0; VFIFO is remapped separately via NA_SETFIFO().
 */
static enum nfsftype vt_to_nf[] = {
	0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
};
2769 2859
/*
 * check the following fields for overflow: nodeid, size, and time.
 * There could be a problem when converting 64-bit LP64 fields
 * into 32-bit ones.  Return an error if there is an overflow.
 *
 * Converts a vattr into the NFSv2 over-the-wire fattr.  Returns 0 on
 * success, EFBIG if the nodeid or a regular-file/directory size does
 * not fit in 32 bits, or EOVERFLOW if a timestamp does not fit.
 */
int
vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
{
	ASSERT(vap->va_type >= VNON && vap->va_type <= VBAD);
	na->na_type = vt_to_nf[vap->va_type];

	if (vap->va_mode == (unsigned short) -1)
		na->na_mode = (uint32_t)-1;
	else
		na->na_mode = VTTOIF(vap->va_type) | vap->va_mode;

	/* Remap unmappable ids to the wire "nobody" values. */
	if (vap->va_uid == (unsigned short)(-1))
		na->na_uid = (uint32_t)(-1);
	else if (vap->va_uid == UID_NOBODY)
		na->na_uid = (uint32_t)NFS_UID_NOBODY;
	else
		na->na_uid = vap->va_uid;

	if (vap->va_gid == (unsigned short)(-1))
		na->na_gid = (uint32_t)-1;
	else if (vap->va_gid == GID_NOBODY)
		na->na_gid = (uint32_t)NFS_GID_NOBODY;
	else
		na->na_gid = vap->va_gid;

	/*
	 * Do we need to check fsid for overflow?  It is 64-bit in the
	 * vattr, but are bigger than 32 bit values supported?
	 */
	na->na_fsid = vap->va_fsid;

	na->na_nodeid = vap->va_nodeid;

	/*
	 * Check to make sure that the nodeid is representable over the
	 * wire without losing bits.
	 */
	if (vap->va_nodeid != (u_longlong_t)na->na_nodeid)
		return (EFBIG);
	na->na_nlink = vap->va_nlink;

	/*
	 * Check for big files here, instead of at the caller.  See
	 * comments in cstat for large special file explanation.
	 */
	if (vap->va_size > (u_longlong_t)MAXOFF32_T) {
		if ((vap->va_type == VREG) || (vap->va_type == VDIR))
			return (EFBIG);
		if ((vap->va_type == VBLK) || (vap->va_type == VCHR)) {
			/* UNKNOWN_SIZE | OVERFLOW */
			na->na_size = MAXOFF32_T;
		} else
			na->na_size = vap->va_size;
	} else
		na->na_size = vap->va_size;

	/*
	 * If the vnode times overflow the 32-bit times that NFS2
	 * uses on the wire then return an error.
	 */
	if (!NFS_VAP_TIME_OK(vap)) {
		return (EOVERFLOW);
	}
	/* v2 wire times are seconds + microseconds. */
	na->na_atime.tv_sec = vap->va_atime.tv_sec;
	na->na_atime.tv_usec = vap->va_atime.tv_nsec / 1000;

	na->na_mtime.tv_sec = vap->va_mtime.tv_sec;
	na->na_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;

	na->na_ctime.tv_sec = vap->va_ctime.tv_sec;
	na->na_ctime.tv_usec = vap->va_ctime.tv_nsec / 1000;

	/*
	 * If the dev_t will fit into 16 bits then compress
	 * it, otherwise leave it alone.  See comments in
	 * nfs_client.c.
	 */
	if (getminor(vap->va_rdev) <= SO4_MAXMIN &&
	    getmajor(vap->va_rdev) <= SO4_MAXMAJ)
		na->na_rdev = nfsv2_cmpdev(vap->va_rdev);
	else
		(void) cmpldev(&na->na_rdev, vap->va_rdev);

	na->na_blocks = vap->va_nblocks;
	na->na_blocksize = vap->va_blksize;

	/*
	 * This bit of ugliness is a *TEMPORARY* hack to preserve the
	 * over-the-wire protocols for named-pipe vnodes.  It remaps the
	 * VFIFO type to the special over-the-wire type. (see note in nfs.h)
	 *
	 * BUYER BEWARE:
	 *  If you are porting the NFS to a non-Sun server, you probably
	 *  don't want to include the following block of code.  The
	 *  over-the-wire special file types will be changing with the
	 *  NFS Protocol Revision.
	 */
	if (vap->va_type == VFIFO)
		NA_SETFIFO(na);
	return (0);
}
2876 2966
/*
 * acl v2 support: returns approximate permission.
 *	default: returns minimal permission (more restrictive)
 *	aclok: returns maximal permission (less restrictive)
 * This routine changes the permissions that are already in *va.
 * If a file has minimal ACL, i.e. aclcnt == MIN_ACL_ENTRIES,
 * CLASS_OBJ is always the same as GROUP_OBJ entry.
 *
 * Only the group and other bits (077) of va_mode are rewritten; the
 * owner bits are left untouched.  Failures from VOP_GETSECATTR() are
 * silently ignored and leave *va unchanged (best effort).
 */
static void
acl_perm(struct vnode *vp, struct exportinfo *exi, struct vattr *va, cred_t *cr)
{
	vsecattr_t vsa;
	int aclcnt;
	aclent_t *aclentp;
	mode_t mask_perm;
	mode_t grp_perm;
	mode_t other_perm;
	mode_t other_orig;
	int error;

	/* dont care default acl */
	vsa.vsa_mask = (VSA_ACL | VSA_ACLCNT);
	error = VOP_GETSECATTR(vp, &vsa, 0, cr, NULL);

	if (!error) {
		aclcnt = vsa.vsa_aclcnt;
		if (aclcnt > MIN_ACL_ENTRIES) {
			/* non-trivial ACL */
			aclentp = vsa.vsa_aclentp;
			if (exi->exi_export.ex_flags & EX_ACLOK) {
				/*
				 * maximal permissions
				 *
				 * NOTE(review): mask_perm and other_orig are
				 * assigned only when CLASS_OBJ and OTHER_OBJ
				 * entries occur in the list; a conforming
				 * non-trivial ACL always contains both --
				 * confirm.
				 */
				grp_perm = 0;
				other_perm = 0;
				for (; aclcnt > 0; aclcnt--, aclentp++) {
					switch (aclentp->a_type) {
					case USER_OBJ:
						break;
					case USER:
						grp_perm |=
						    aclentp->a_perm << 3;
						other_perm |= aclentp->a_perm;
						break;
					case GROUP_OBJ:
						grp_perm |=
						    aclentp->a_perm << 3;
						break;
					case GROUP:
						other_perm |= aclentp->a_perm;
						break;
					case OTHER_OBJ:
						other_orig = aclentp->a_perm;
						break;
					case CLASS_OBJ:
						mask_perm = aclentp->a_perm;
						break;
					default:
						break;
					}
				}
				grp_perm &= mask_perm << 3;
				other_perm &= mask_perm;
				other_perm |= other_orig;

			} else {
				/* minimal permissions */
				grp_perm = 070;
				other_perm = 07;
				for (; aclcnt > 0; aclcnt--, aclentp++) {
					switch (aclentp->a_type) {
					case USER_OBJ:
						break;
					case USER:
					case CLASS_OBJ:
						grp_perm &=
						    aclentp->a_perm << 3;
						other_perm &=
						    aclentp->a_perm;
						break;
					case GROUP_OBJ:
						grp_perm &=
						    aclentp->a_perm << 3;
						break;
					case GROUP:
						other_perm &=
						    aclentp->a_perm;
						break;
					case OTHER_OBJ:
						other_perm &=
						    aclentp->a_perm;
						break;
					default:
						break;
					}
				}
			}
			/* copy to va */
			va->va_mode &= ~077;
			va->va_mode |= grp_perm | other_perm;
		}
		if (vsa.vsa_aclcnt)
			kmem_free(vsa.vsa_aclentp,
			    vsa.vsa_aclcnt * sizeof (aclent_t));
	}
}
2981 3071
2982 3072 void
2983 3073 rfs_srvrinit(void)
2984 3074 {
2985 3075 mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
2986 3076 nfs2_srv_caller_id = fs_new_caller_id();
2987 3077 }
2988 3078
/*
 * Tear down the NFSv2 server state created by rfs_srvrinit().
 */
void
rfs_srvrfini(void)
{
	mutex_destroy(&rfs_async_write_lock);
}
2994 3084
2995 3085 static int
2996 3086 rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
2997 3087 {
2998 3088 struct clist *wcl;
2999 3089 int wlist_len;
3000 3090 uint32_t count = rr->rr_count;
3001 3091
3002 3092 wcl = ra->ra_wlist;
3003 3093
3004 3094 if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3005 3095 return (FALSE);
3006 3096 }
3007 3097
3008 3098 wcl = ra->ra_wlist;
3009 3099 rr->rr_ok.rrok_wlist_len = wlist_len;
3010 3100 rr->rr_ok.rrok_wlist = wcl;
3011 3101
3012 3102 return (TRUE);
3013 3103 }
|
↓ open down ↓ |
2540 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX