NEX-17125 NFS: nbmand lock entered but not exited on error path
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Rick McNeal <rick.mcneal@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Evan Layton <evan.layton@nexenta.com>
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
NEX-15279 support NFS server in zone
NEX-15520 online NFS shares cause zoneadm halt to hang in nfs_export_zone_fini
Portions contributed by: Dan Kruchinin <dan.kruchinin@nexenta.com>
Portions contributed by: Stepan Zastupov <stepan.zastupov@gmail.com>
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
NEX-9275 Got "bad mutex" panic when run IO to nfs share from clients
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Rob Gittins <rob.gittins@nexenta.com>
NEX-3524 CLONE - Port NEX-3505 "wrong authentication" messages with root=@0.0.0.0/0 set, result in loss of client access
Reviewed by: Marcel Telka <marcel.telka@nexenta.com>
NEX-3533 CLONE - Port NEX-3019 NFSv3 writes underneath mounted filesystem to directory
Reviewed by: Dan Fields <dan.fields@nexenta.com>
NEX-3095 Issues related to NFS nohide
Reviewed by: Dan Fields <dan.fields@nexenta.com>
NEX-1128 NFS server: Generic uid and gid remapping for AUTH_SYS
Reviewed by: Jan Kryl <jan.kryl@nexenta.com>
OS-20 share_nfs(1m) charset handling is unreliable
OS-22 Page fault at nfscmd_dropped_entrysize+0x1e()
OS-23 NFSv2/3/4: READDIR responses are inconsistent when charset conversion fails
OS-24 rfs3_readdir(): Issues related to nfscmd_convdirent()
Reviewed by: Jan Kryl <jan.kryl@nexenta.com>
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
re #13613 rb4516 Tunables needs volatile keyword
closes #12112 rb3823 - nfs-nohide: lookup("..") for submount should be correct
re #3541 rb11254 - nfs nohide - "nfssrv: need ability to go to submounts for v3 and v2 protocols"
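
A minimal sketch of the per-zone state pattern used by the NEX-15279/NEX-15520 zone work below. The diff moves the NFSv2 server's write-clustering globals into an nfs_srv_t hung off a zone key; the sketch is not part of the webrev and only illustrates that mechanism, assuming hypothetical my_zsd_*/my_srv_* names. zone_key_create(), zone_getspecific(), and the kmem/mutex routines are the same illumos kernel interfaces the diff itself calls.

#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/zone.h>

/* Hypothetical per-zone state, playing the role of nfs_srv_t in the diff. */
typedef struct my_zsd {
        kmutex_t        lock;           /* protects this zone's state */
        int             enabled;        /* per-zone tunable */
} my_zsd_t;

static zone_key_t my_zsd_key;

/* ARGSUSED */
static void *
my_zsd_init(zoneid_t zoneid)
{
        /* Called for every zone; allocate and initialize its private copy. */
        my_zsd_t *zd = kmem_zalloc(sizeof (*zd), KM_SLEEP);

        mutex_init(&zd->lock, NULL, MUTEX_DEFAULT, NULL);
        zd->enabled = 1;
        return (zd);
}

/* ARGSUSED */
static void
my_zsd_fini(zoneid_t zoneid, void *data)
{
        /* Called at zone teardown; must undo everything my_zsd_init() did. */
        my_zsd_t *zd = data;

        mutex_destroy(&zd->lock);
        kmem_free(zd, sizeof (*zd));
}

void
my_srv_init(void)
{
        /* The third argument is the optional shutdown callback (unused here). */
        zone_key_create(&my_zsd_key, my_zsd_init, NULL, my_zsd_fini);
}

void
my_srv_op(void)
{
        /* Fetch the calling zone's private copy of the state. */
        my_zsd_t *zd = zone_getspecific(my_zsd_key, curzone);

        mutex_enter(&zd->lock);
        /* ... operate on this zone's state ... */
        mutex_exit(&zd->lock);
}

In the listing below, rfs_srvrinit() registers the key, rfs_zone_init() and rfs_zone_fini() play the roles of my_zsd_init() and my_zsd_fini(), and rfs_write_async() fetches its zone's nfs_srv_t with zone_getspecific(rfs_zone_key, curzone).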

          --- old/usr/src/uts/common/fs/nfs/nfs_srv.c
          +++ new/usr/src/uts/common/fs/nfs/nfs_srv.c
... 10 lines elided ...
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
       21 +
  21   22  /*
  22   23   * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
  23      - * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
  24      - * Copyright (c) 2016 by Delphix. All rights reserved.
  25   24   */
  26   25  
  27   26  /*
  28   27   *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  29   28   *      All rights reserved.
  30   29   */
  31   30  
       31 +/*
       32 + * Copyright 2018 Nexenta Systems, Inc.
       33 + * Copyright (c) 2016 by Delphix. All rights reserved.
       34 + */
       35 +
  32   36  #include <sys/param.h>
  33   37  #include <sys/types.h>
  34   38  #include <sys/systm.h>
  35   39  #include <sys/cred.h>
  36   40  #include <sys/buf.h>
  37   41  #include <sys/vfs.h>
  38   42  #include <sys/vnode.h>
  39   43  #include <sys/uio.h>
  40   44  #include <sys/stat.h>
  41   45  #include <sys/errno.h>
... 20 lines elided ...
  62   66  #include <nfs/nfs_cmd.h>
  63   67  
  64   68  #include <vm/hat.h>
  65   69  #include <vm/as.h>
  66   70  #include <vm/seg.h>
  67   71  #include <vm/seg_map.h>
  68   72  #include <vm/seg_kmem.h>
  69   73  
  70   74  #include <sys/strsubr.h>
  71   75  
       76 +struct rfs_async_write_list;
       77 +
  72   78  /*
       79 + * Per-zone globals of the NFSv2 server
       80 + */
       81 +typedef struct nfs_srv {
       82 +        kmutex_t                        async_write_lock;
       83 +        struct rfs_async_write_list     *async_write_head;
       84 +
       85 +        /*
       86 +         * enables write clustering if == 1
       87 +         */
       88 +        int             write_async;
       89 +} nfs_srv_t;
       90 +
       91 +/*
  73   92   * These are the interface routines for the server side of the
  74   93   * Network File System.  See the NFS version 2 protocol specification
  75   94   * for a description of this interface.
  76   95   */
  77   96  
  78   97  static int      sattr_to_vattr(struct nfssattr *, struct vattr *);
  79   98  static void     acl_perm(struct vnode *, struct exportinfo *, struct vattr *,
  80   99                          cred_t *);
      100 +static void     *rfs_zone_init(zoneid_t zoneid);
      101 +static void     rfs_zone_fini(zoneid_t zoneid, void *data);
  81  102  
      103 +
  82  104  /*
  83  105   * Some "over the wire" UNIX file types.  These are encoded
  84  106   * into the mode.  This needs to be fixed in the next rev.
  85  107   */
  86  108  #define IFMT            0170000         /* type of file */
  87  109  #define IFCHR           0020000         /* character special */
  88  110  #define IFBLK           0060000         /* block special */
  89  111  #define IFSOCK          0140000         /* socket */
  90  112  
  91  113  u_longlong_t nfs2_srv_caller_id;
      114 +static zone_key_t rfs_zone_key;
  92  115  
  93  116  /*
  94  117   * Get file attributes.
  95  118   * Returns the current attributes of the file with the given fhandle.
  96  119   */
  97  120  /* ARGSUSED */
  98  121  void
  99  122  rfs_getattr(fhandle_t *fhp, struct nfsattrstat *ns, struct exportinfo *exi,
 100  123      struct svc_req *req, cred_t *cr, bool_t ro)
 101  124  {
... 220 lines elided ...
 322  345          VN_RELE(vp);
 323  346  
 324  347          ns->ns_status = puterrno(error);
 325  348  }
 326  349  void *
 327  350  rfs_setattr_getfh(struct nfssaargs *args)
 328  351  {
 329  352          return (&args->saa_fh);
 330  353  }
 331  354  
      355 +/* Change and release @exip and @vpp only on success */
      356 +int
      357 +rfs_cross_mnt(vnode_t **vpp, struct exportinfo **exip)
      358 +{
      359 +        struct exportinfo *exi;
      360 +        vnode_t *vp = *vpp;
      361 +        fid_t fid;
      362 +        int error;
      363 +
      364 +        VN_HOLD(vp);
      365 +
      366 +        if ((error = traverse(&vp)) != 0) {
      367 +                VN_RELE(vp);
      368 +                return (error);
      369 +        }
      370 +
      371 +        bzero(&fid, sizeof (fid));
      372 +        fid.fid_len = MAXFIDSZ;
      373 +        error = VOP_FID(vp, &fid, NULL);
      374 +        if (error) {
      375 +                VN_RELE(vp);
      376 +                return (error);
      377 +        }
      378 +
      379 +        exi = checkexport(&vp->v_vfsp->vfs_fsid, &fid);
      380 +        if (exi == NULL ||
      381 +            (exi->exi_export.ex_flags & EX_NOHIDE) == 0) {
      382 +                /*
      383 +                 * It is not an error; the subdir is just not exported
      384 +                 * or "nohide" is not set
      385 +                 */
      386 +                if (exi != NULL)
      387 +                        exi_rele(&exi);
      388 +                VN_RELE(vp);
      389 +        } else {
      390 +                /* go to submount */
      391 +                exi_rele(exip);
      392 +                *exip = exi;
      393 +
      394 +                VN_RELE(*vpp);
      395 +                *vpp = vp;
      396 +        }
      397 +
      398 +        return (0);
      399 +}
      400 +
 332  401  /*
      402 + * Given a mounted "dvp" and its "exi", climb to the covering mountpoint,
      403 + * correcting dvp/exi along the way.
      404 + * Returns 0 on success.
      405 + */
      406 +int
      407 +rfs_climb_crossmnt(vnode_t **dvpp, struct exportinfo **exip, cred_t *cr)
      408 +{
      409 +        struct exportinfo *exi;
      410 +        vnode_t *dvp = *dvpp;
      411 +
      412 +        ASSERT(dvp->v_flag & VROOT);
      413 +
      414 +        VN_HOLD(dvp);
      415 +        dvp = untraverse(dvp);
      416 +        exi = nfs_vptoexi(NULL, dvp, cr, NULL, NULL, FALSE);
      417 +        if (exi == NULL) {
      418 +                VN_RELE(dvp);
      419 +                return (-1);
      420 +        }
      421 +
      422 +        exi_rele(exip);
      423 +        *exip = exi;
      424 +        VN_RELE(*dvpp);
      425 +        *dvpp = dvp;
      426 +
      427 +        return (0);
      428 +}
      429 +/*
 333  430   * Directory lookup.
 334  431   * Returns an fhandle and file attributes for file name in a directory.
 335  432   */
 336  433  /* ARGSUSED */
 337  434  void
 338  435  rfs_lookup(struct nfsdiropargs *da, struct nfsdiropres *dr,
 339  436      struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
 340  437  {
 341  438          int error;
 342  439          vnode_t *dvp;
... 21 lines elided ...
 364  461          if (da->da_name == NULL || *da->da_name == '\0') {
 365  462                  dr->dr_status = NFSERR_ACCES;
 366  463                  return;
 367  464          }
 368  465  
 369  466          /*
 370  467           * Allow lookups from the root - the default
 371  468           * location of the public filehandle.
 372  469           */
 373  470          if (exi != NULL && (exi->exi_export.ex_flags & EX_PUBLIC)) {
 374      -                dvp = rootdir;
      471 +                dvp = ZONE_ROOTVP();
 375  472                  VN_HOLD(dvp);
 376  473          } else {
 377  474                  dvp = nfs_fhtovp(fhp, exi);
 378  475                  if (dvp == NULL) {
 379  476                          dr->dr_status = NFSERR_STALE;
 380  477                          return;
 381  478                  }
 382  479          }
 383  480  
      481 +        exi_hold(exi);
      482 +
 384  483          /*
 385  484           * Not allow lookup beyond root.
 386  485           * If the filehandle matches a filehandle of the exi,
 387  486           * then the ".." refers beyond the root of an exported filesystem.
 388  487           */
 389  488          if (strcmp(da->da_name, "..") == 0 &&
 390  489              EQFID(&exi->exi_fid, (fid_t *)&fhp->fh_len)) {
 391      -                VN_RELE(dvp);
 392      -                dr->dr_status = NFSERR_NOENT;
 393      -                return;
      490 +                if ((exi->exi_export.ex_flags & EX_NOHIDE) &&
      491 +                    (dvp->v_flag & VROOT)) {
      492 +                        /*
      493 +                         * special case for ".." and a 'nohide' exported root
      494 +                         */
      495 +                        if (rfs_climb_crossmnt(&dvp, &exi, cr) != 0) {
      496 +                                error = NFSERR_ACCES;
      497 +                                goto out;
      498 +                        }
      499 +                } else {
      500 +                        error = NFSERR_NOENT;
      501 +                        goto out;
      502 +                }
 394  503          }
 395  504  
 396  505          ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
 397  506          name = nfscmd_convname(ca, exi, da->da_name, NFSCMD_CONV_INBOUND,
 398  507              MAXPATHLEN);
 399  508  
 400  509          if (name == NULL) {
 401      -                dr->dr_status = NFSERR_ACCES;
 402      -                return;
      510 +                error = NFSERR_ACCES;
      511 +                goto out;
 403  512          }
 404  513  
 405  514          /*
 406  515           * If the public filehandle is used then allow
 407  516           * a multi-component lookup, i.e. evaluate
 408  517           * a pathname and follow symbolic links if
 409  518           * necessary.
 410  519           *
 411  520           * This may result in a vnode in another filesystem
 412  521           * which is OK as long as the filesystem is exported.
 413  522           */
 414  523          if (PUBLIC_FH2(fhp)) {
 415  524                  publicfh_flag = TRUE;
      525 +
      526 +                exi_rele(&exi);
      527 +
 416  528                  error = rfs_publicfh_mclookup(name, dvp, cr, &vp, &exi,
 417  529                      &sec);
 418  530          } else {
 419  531                  /*
 420  532                   * Do a normal single component lookup.
 421  533                   */
 422  534                  error = VOP_LOOKUP(dvp, name, &vp, NULL, 0, NULL, cr,
 423  535                      NULL, NULL, NULL);
 424  536          }
 425  537  
 426  538          if (name != da->da_name)
 427  539                  kmem_free(name, MAXPATHLEN);
 428  540  
      541 +        if (error == 0 && vn_ismntpt(vp)) {
      542 +                error = rfs_cross_mnt(&vp, &exi);
      543 +                if (error)
      544 +                        VN_RELE(vp);
      545 +        }
 429  546  
 430  547          if (!error) {
 431  548                  va.va_mask = AT_ALL;    /* we want everything */
 432  549  
 433  550                  error = rfs4_delegated_getattr(vp, &va, 0, cr);
 434  551  
 435  552                  /* check for overflows */
 436  553                  if (!error) {
 437  554                          acl_perm(vp, exi, &va, cr);
 438  555                          error = vattr_to_nattr(&va, &dr->dr_attr);
... 6 lines elided ...
 445  562                                              exi);
 446  563                                          if (!error && publicfh_flag &&
 447  564                                              !chk_clnt_sec(exi, req))
 448  565                                                  auth_weak = TRUE;
 449  566                                  }
 450  567                          }
 451  568                  }
 452  569                  VN_RELE(vp);
 453  570          }
 454  571  
      572 +out:
 455  573          VN_RELE(dvp);
 456  574  
 457      -        /*
 458      -         * If publicfh_flag is true then we have called rfs_publicfh_mclookup
 459      -         * and have obtained a new exportinfo in exi which needs to be
 460      -         * released. Note the the original exportinfo pointed to by exi
 461      -         * will be released by the caller, comon_dispatch.
 462      -         */
 463      -        if (publicfh_flag && exi != NULL)
 464      -                exi_rele(exi);
      575 +        if (exi != NULL)
      576 +                exi_rele(&exi);
 465  577  
 466  578          /*
 467  579           * If it's public fh, no 0x81, and client's flavor is
 468  580           * invalid, set WebNFS status to WNFSERR_CLNT_FLAVOR now.
 469  581           * Then set RPC status to AUTH_TOOWEAK in common_dispatch.
 470  582           */
 471  583          if (auth_weak)
 472  584                  dr->dr_status = (enum nfsstat)WNFSERR_CLNT_FLAVOR;
 473  585          else
 474  586                  dr->dr_status = puterrno(error);
... 203 lines elided ...
 678  790                          rr->rr_status = NFSERR_ACCES;
 679  791                          return;
 680  792                  }
 681  793                  in_crit = 1;
 682  794          }
 683  795  
 684  796          error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);
 685  797  
 686  798          /* check if a monitor detected a delegation conflict */
 687  799          if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
      800 +                if (in_crit)
      801 +                        nbl_end_crit(vp);
 688  802                  VN_RELE(vp);
 689  803                  /* mark as wouldblock so response is dropped */
 690  804                  curthread->t_flag |= T_WOULDBLOCK;
 691  805  
 692  806                  rr->rr_data = NULL;
 693  807                  return;
 694  808          }
 695  809  
 696  810          va.va_mask = AT_ALL;
 697  811  
... 305 lines elided ...
1003 1117                      wa->wa_count, 0, NULL)) {
1004 1118                          error = EACCES;
1005 1119                          goto out;
1006 1120                  }
1007 1121          }
1008 1122  
1009 1123          error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1010 1124  
1011 1125          /* check if a monitor detected a delegation conflict */
1012 1126          if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1013      -                VN_RELE(vp);
1014      -                /* mark as wouldblock so response is dropped */
1015      -                curthread->t_flag |= T_WOULDBLOCK;
1016      -                return;
     1127 +                goto out;
1017 1128          }
1018 1129  
1019 1130          if (wa->wa_data || wa->wa_rlist) {
1020 1131                  /* Do the RDMA thing if necessary */
1021 1132                  if (wa->wa_rlist) {
1022 1133                          iov[0].iov_base = (char *)((wa->wa_rlist)->u.c_daddr3);
1023 1134                          iov[0].iov_len = wa->wa_count;
1024 1135                  } else  {
1025 1136                          iov[0].iov_base = wa->wa_data;
1026 1137                          iov[0].iov_len = wa->wa_count;
... 19 lines elided ...
1046 1157                  /*
1047 1158                   * We're changing creds because VM may fault and we need
1048 1159                   * the cred of the current thread to be used if quota
1049 1160                   * checking is enabled.
1050 1161                   */
1051 1162                  savecred = curthread->t_cred;
1052 1163                  curthread->t_cred = cr;
1053 1164                  error = VOP_WRITE(vp, &uio, FSYNC, cr, &ct);
1054 1165                  curthread->t_cred = savecred;
1055 1166          } else {
     1167 +
1056 1168                  iovcnt = 0;
1057 1169                  for (m = wa->wa_mblk; m != NULL; m = m->b_cont)
1058 1170                          iovcnt++;
1059 1171                  if (iovcnt <= MAX_IOVECS) {
1060 1172  #ifdef DEBUG
1061 1173                          rfs_write_sync_hits++;
1062 1174  #endif
1063 1175                          iovp = iov;
1064 1176                  } else {
1065 1177  #ifdef DEBUG
... 78 lines elided ...
1144 1256  
1145 1257  struct rfs_async_write_list {
1146 1258          fhandle_t *fhp;
1147 1259          kcondvar_t cv;
1148 1260          struct rfs_async_write *list;
1149 1261          struct rfs_async_write_list *next;
1150 1262  };
1151 1263  
1152 1264  static struct rfs_async_write_list *rfs_async_write_head = NULL;
1153 1265  static kmutex_t rfs_async_write_lock;
1154      -static int rfs_write_async = 1; /* enables write clustering if == 1 */
     1266 +volatile int rfs_write_async = 1;       /* enables write clustering if == 1 */
1155 1267  
1156 1268  #define MAXCLIOVECS     42
1157 1269  #define RFSWRITE_INITVAL (enum nfsstat) -1
1158 1270  
1159 1271  #ifdef DEBUG
1160 1272  static int rfs_write_hits = 0;
1161 1273  static int rfs_write_misses = 0;
1162 1274  #endif
1163 1275  
1164 1276  /*
... 24 lines elided ...
1189 1301          int count;
1190 1302          int rcount;
1191 1303          uint_t off;
1192 1304          uint_t len;
1193 1305          struct rfs_async_write nrpsp;
1194 1306          struct rfs_async_write_list nlpsp;
1195 1307          ushort_t t_flag;
1196 1308          cred_t *savecred;
1197 1309          int in_crit = 0;
1198 1310          caller_context_t ct;
     1311 +        nfs_srv_t *nsrv;
1199 1312  
1200      -        if (!rfs_write_async) {
     1313 +        nsrv = zone_getspecific(rfs_zone_key, curzone);
     1314 +        if (!nsrv->write_async) {
1201 1315                  rfs_write_sync(wa, ns, exi, req, cr, ro);
1202 1316                  return;
1203 1317          }
1204 1318  
1205 1319          /*
1206 1320           * Initialize status to RFSWRITE_INITVAL instead of 0, since value of 0
1207 1321           * is considered an OK.
1208 1322           */
1209 1323          ns->ns_status = RFSWRITE_INITVAL;
1210 1324  
... 4 lines elided ...
1215 1329          nrp->cr = cr;
1216 1330          nrp->ro = ro;
1217 1331          nrp->thread = curthread;
1218 1332  
1219 1333          ASSERT(curthread->t_schedflag & TS_DONT_SWAP);
1220 1334  
1221 1335          /*
1222 1336           * Look to see if there is already a cluster started
1223 1337           * for this file.
1224 1338           */
1225      -        mutex_enter(&rfs_async_write_lock);
1226      -        for (lp = rfs_async_write_head; lp != NULL; lp = lp->next) {
     1339 +        mutex_enter(&nsrv->async_write_lock);
     1340 +        for (lp = nsrv->async_write_head; lp != NULL; lp = lp->next) {
1227 1341                  if (bcmp(&wa->wa_fhandle, lp->fhp,
1228 1342                      sizeof (fhandle_t)) == 0)
1229 1343                          break;
1230 1344          }
1231 1345  
1232 1346          /*
1233 1347           * If lp is non-NULL, then there is already a cluster
1234 1348           * started.  We need to place ourselves in the cluster
1235 1349           * list in the right place as determined by starting
1236 1350           * offset.  Conflicts with non-blocking mandatory locked
... 5 lines elided ...
1242 1356                  while (rp != NULL && rp->wa->wa_offset < wa->wa_offset) {
1243 1357                          trp = rp;
1244 1358                          rp = rp->list;
1245 1359                  }
1246 1360                  nrp->list = rp;
1247 1361                  if (trp == NULL)
1248 1362                          lp->list = nrp;
1249 1363                  else
1250 1364                          trp->list = nrp;
1251 1365                  while (nrp->ns->ns_status == RFSWRITE_INITVAL)
1252      -                        cv_wait(&lp->cv, &rfs_async_write_lock);
1253      -                mutex_exit(&rfs_async_write_lock);
     1366 +                        cv_wait(&lp->cv, &nsrv->async_write_lock);
     1367 +                mutex_exit(&nsrv->async_write_lock);
1254 1368  
1255 1369                  return;
1256 1370          }
1257 1371  
1258 1372          /*
1259 1373           * No cluster started yet, start one and add ourselves
1260 1374           * to the list of clusters.
1261 1375           */
1262 1376          nrp->list = NULL;
1263 1377  
1264 1378          nlp = &nlpsp;
1265 1379          nlp->fhp = &wa->wa_fhandle;
1266 1380          cv_init(&nlp->cv, NULL, CV_DEFAULT, NULL);
1267 1381          nlp->list = nrp;
1268 1382          nlp->next = NULL;
1269 1383  
1270      -        if (rfs_async_write_head == NULL) {
1271      -                rfs_async_write_head = nlp;
     1384 +        if (nsrv->async_write_head == NULL) {
     1385 +                nsrv->async_write_head = nlp;
1272 1386          } else {
1273      -                lp = rfs_async_write_head;
     1387 +                lp = nsrv->async_write_head;
1274 1388                  while (lp->next != NULL)
1275 1389                          lp = lp->next;
1276 1390                  lp->next = nlp;
1277 1391          }
1278      -        mutex_exit(&rfs_async_write_lock);
     1392 +        mutex_exit(&nsrv->async_write_lock);
1279 1393  
1280 1394          /*
1281 1395           * Convert the file handle common to all of the requests
1282 1396           * in this cluster to a vnode.
1283 1397           */
1284 1398          vp = nfs_fhtovp(&wa->wa_fhandle, exi);
1285 1399          if (vp == NULL) {
1286      -                mutex_enter(&rfs_async_write_lock);
1287      -                if (rfs_async_write_head == nlp)
1288      -                        rfs_async_write_head = nlp->next;
     1400 +                mutex_enter(&nsrv->async_write_lock);
     1401 +                if (nsrv->async_write_head == nlp)
     1402 +                        nsrv->async_write_head = nlp->next;
1289 1403                  else {
1290      -                        lp = rfs_async_write_head;
     1404 +                        lp = nsrv->async_write_head;
1291 1405                          while (lp->next != nlp)
1292 1406                                  lp = lp->next;
1293 1407                          lp->next = nlp->next;
1294 1408                  }
1295 1409                  t_flag = curthread->t_flag & T_WOULDBLOCK;
1296 1410                  for (rp = nlp->list; rp != NULL; rp = rp->list) {
1297 1411                          rp->ns->ns_status = NFSERR_STALE;
1298 1412                          rp->thread->t_flag |= t_flag;
1299 1413                  }
1300 1414                  cv_broadcast(&nlp->cv);
1301      -                mutex_exit(&rfs_async_write_lock);
     1415 +                mutex_exit(&nsrv->async_write_lock);
1302 1416  
1303 1417                  return;
1304 1418          }
1305 1419  
1306 1420          /*
1307 1421           * Can only write regular files.  Attempts to write any
1308 1422           * other file types fail with EISDIR.
1309 1423           */
1310 1424          if (vp->v_type != VREG) {
1311 1425                  VN_RELE(vp);
1312      -                mutex_enter(&rfs_async_write_lock);
1313      -                if (rfs_async_write_head == nlp)
1314      -                        rfs_async_write_head = nlp->next;
     1426 +                mutex_enter(&nsrv->async_write_lock);
     1427 +                if (nsrv->async_write_head == nlp)
     1428 +                        nsrv->async_write_head = nlp->next;
1315 1429                  else {
1316      -                        lp = rfs_async_write_head;
     1430 +                        lp = nsrv->async_write_head;
1317 1431                          while (lp->next != nlp)
1318 1432                                  lp = lp->next;
1319 1433                          lp->next = nlp->next;
1320 1434                  }
1321 1435                  t_flag = curthread->t_flag & T_WOULDBLOCK;
1322 1436                  for (rp = nlp->list; rp != NULL; rp = rp->list) {
1323 1437                          rp->ns->ns_status = NFSERR_ISDIR;
1324 1438                          rp->thread->t_flag |= t_flag;
1325 1439                  }
1326 1440                  cv_broadcast(&nlp->cv);
1327      -                mutex_exit(&rfs_async_write_lock);
     1441 +                mutex_exit(&nsrv->async_write_lock);
1328 1442  
1329 1443                  return;
1330 1444          }
1331 1445  
1332 1446          /*
1333 1447           * Enter the critical region before calling VOP_RWLOCK, to avoid a
1334 1448           * deadlock with ufs.
1335 1449           */
1336 1450          if (nbl_need_check(vp)) {
1337 1451                  nbl_start_crit(vp, RW_READER);
... 11 lines elided ...
1349 1463           */
1350 1464          error = VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct);
1351 1465  
1352 1466          /* check if a monitor detected a delegation conflict */
1353 1467          if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) {
1354 1468                  if (in_crit)
1355 1469                          nbl_end_crit(vp);
1356 1470                  VN_RELE(vp);
1357 1471                  /* mark as wouldblock so response is dropped */
1358 1472                  curthread->t_flag |= T_WOULDBLOCK;
1359      -                mutex_enter(&rfs_async_write_lock);
1360      -                if (rfs_async_write_head == nlp)
1361      -                        rfs_async_write_head = nlp->next;
     1473 +                mutex_enter(&nsrv->async_write_lock);
     1474 +                if (nsrv->async_write_head == nlp)
     1475 +                        nsrv->async_write_head = nlp->next;
1362 1476                  else {
1363      -                        lp = rfs_async_write_head;
     1477 +                        lp = nsrv->async_write_head;
1364 1478                          while (lp->next != nlp)
1365 1479                                  lp = lp->next;
1366 1480                          lp->next = nlp->next;
1367 1481                  }
1368 1482                  for (rp = nlp->list; rp != NULL; rp = rp->list) {
1369 1483                          if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1370 1484                                  rp->ns->ns_status = puterrno(error);
1371 1485                                  rp->thread->t_flag |= T_WOULDBLOCK;
1372 1486                          }
1373 1487                  }
1374 1488                  cv_broadcast(&nlp->cv);
1375      -                mutex_exit(&rfs_async_write_lock);
     1489 +                mutex_exit(&nsrv->async_write_lock);
1376 1490  
1377 1491                  return;
1378 1492          }
1379 1493  
1380 1494          /*
1381 1495           * Disconnect this cluster from the list of clusters.
1382 1496           * The cluster that is being dealt with must be fixed
1383 1497           * in size after this point, so there is no reason
1384 1498           * to leave it on the list so that new requests can
1385 1499           * find it.
... 1 line elided ...
1387 1501           * The algorithm is that the first write request will
1388 1502           * create a cluster, convert the file handle to a
1389 1503           * vnode pointer, and then lock the file for writing.
1390 1504           * This request is not likely to be clustered with
1391 1505           * any others.  However, the next request will create
1392 1506           * a new cluster and be blocked in VOP_RWLOCK while
1393 1507           * the first request is being processed.  This delay
1394 1508           * will allow more requests to be clustered in this
1395 1509           * second cluster.
1396 1510           */
1397      -        mutex_enter(&rfs_async_write_lock);
1398      -        if (rfs_async_write_head == nlp)
1399      -                rfs_async_write_head = nlp->next;
     1511 +        mutex_enter(&nsrv->async_write_lock);
     1512 +        if (nsrv->async_write_head == nlp)
     1513 +                nsrv->async_write_head = nlp->next;
1400 1514          else {
1401      -                lp = rfs_async_write_head;
     1515 +                lp = nsrv->async_write_head;
1402 1516                  while (lp->next != nlp)
1403 1517                          lp = lp->next;
1404 1518                  lp->next = nlp->next;
1405 1519          }
1406      -        mutex_exit(&rfs_async_write_lock);
     1520 +        mutex_exit(&nsrv->async_write_lock);
1407 1521  
1408 1522          /*
1409 1523           * Step through the list of requests in this cluster.
1410 1524           * We need to check permissions to make sure that all
1411 1525           * of the requests have sufficient permission to write
1412 1526           * the file.  A cluster can be composed of requests
1413 1527           * from different clients and different users on each
1414 1528           * client.
1415 1529           *
1416 1530           * As a side effect, we also calculate the size of the
... 224 lines elided ...
1641 1755                  }
1642 1756          }
1643 1757  
1644 1758          VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct);
1645 1759  
1646 1760          if (in_crit)
1647 1761                  nbl_end_crit(vp);
1648 1762          VN_RELE(vp);
1649 1763  
1650 1764          t_flag = curthread->t_flag & T_WOULDBLOCK;
1651      -        mutex_enter(&rfs_async_write_lock);
     1765 +        mutex_enter(&nsrv->async_write_lock);
1652 1766          for (rp = nlp->list; rp != NULL; rp = rp->list) {
1653 1767                  if (rp->ns->ns_status == RFSWRITE_INITVAL) {
1654 1768                          rp->ns->ns_status = puterrno(error);
1655 1769                          rp->thread->t_flag |= t_flag;
1656 1770                  }
1657 1771          }
1658 1772          cv_broadcast(&nlp->cv);
1659      -        mutex_exit(&rfs_async_write_lock);
     1773 +        mutex_exit(&nsrv->async_write_lock);
1660 1774  
1661 1775  }
1662 1776  
1663 1777  void *
1664 1778  rfs_write_getfh(struct nfswriteargs *wa)
1665 1779  {
1666 1780          return (&wa->wa_fhandle);
1667 1781  }
1668 1782  
1669 1783  /*
... 41 lines elided ...
1711 1825  
1712 1826          /*
1713 1827           * Must specify the mode.
1714 1828           */
1715 1829          if (!(va.va_mask & AT_MODE)) {
1716 1830                  VN_RELE(dvp);
1717 1831                  dr->dr_status = NFSERR_INVAL;
1718 1832                  return;
1719 1833          }
1720 1834  
     1835 +        if (protect_zfs_mntpt(dvp) != 0) {
     1836 +                VN_RELE(dvp);
     1837 +                dr->dr_status = NFSERR_ACCES;
     1838 +                return;
     1839 +        }
     1840 +
1721 1841          /*
1722 1842           * This is a completely gross hack to make mknod
1723 1843           * work over the wire until we can wack the protocol
1724 1844           */
1725 1845          if ((va.va_mode & IFMT) == IFCHR) {
1726 1846                  if (args->ca_sa->sa_size == (uint_t)NFS_FIFO_DEV)
1727 1847                          va.va_type = VFIFO;     /* xtra kludge for named pipe */
1728 1848                  else {
1729 1849                          va.va_type = VCHR;
1730 1850                          /*
... 319 lines elided ...
2050 2170                  return;
2051 2171          }
2052 2172  
2053 2173          fh = args->rna_to.da_fhandle;
2054 2174          to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2055 2175          if (to_exi == NULL) {
2056 2176                  VN_RELE(fromvp);
2057 2177                  *status = NFSERR_ACCES;
2058 2178                  return;
2059 2179          }
2060      -        exi_rele(to_exi);
     2180 +        exi_rele(&to_exi);
2061 2181  
2062 2182          if (to_exi != exi) {
2063 2183                  VN_RELE(fromvp);
2064 2184                  *status = NFSERR_XDEV;
2065 2185                  return;
2066 2186          }
2067 2187  
2068 2188          tovp = nfs_fhtovp(args->rna_to.da_fhandle, exi);
2069 2189          if (tovp == NULL) {
2070 2190                  VN_RELE(fromvp);
... 19 lines elided ...
2090 2210                  return;
2091 2211          }
2092 2212  
2093 2213          if (rdonly(ro, tovp)) {
2094 2214                  VN_RELE(tovp);
2095 2215                  VN_RELE(fromvp);
2096 2216                  *status = NFSERR_ROFS;
2097 2217                  return;
2098 2218          }
2099 2219  
     2220 +        if (protect_zfs_mntpt(tovp) != 0) {
     2221 +                VN_RELE(tovp);
     2222 +                VN_RELE(fromvp);
     2223 +                *status = NFSERR_ACCES;
     2224 +                return;
     2225 +        }
     2226 +
2100 2227          /*
2101 2228           * Check for a conflict with a non-blocking mandatory share reservation.
2102 2229           */
2103 2230          error = VOP_LOOKUP(fromvp, args->rna_from.da_name, &srcvp, NULL, 0,
2104 2231              NULL, cr, NULL, NULL, NULL);
2105 2232          if (error != 0) {
2106 2233                  VN_RELE(tovp);
2107 2234                  VN_RELE(fromvp);
2108 2235                  *status = puterrno(error);
2109 2236                  return;
... 4 lines elided ...
2114 2241          if (rfs4_check_delegated(FWRITE, srcvp, FALSE)) {
2115 2242                  VN_RELE(tovp);
2116 2243                  VN_RELE(fromvp);
2117 2244                  VN_RELE(srcvp);
2118 2245                  curthread->t_flag |= T_WOULDBLOCK;
2119 2246                  return;
2120 2247          }
2121 2248  
2122 2249          /* Check for delegation on the file being renamed over, if it exists */
2123 2250  
2124      -        if (rfs4_deleg_policy != SRV_NEVER_DELEGATE &&
     2251 +        if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE &&
2125 2252              VOP_LOOKUP(tovp, args->rna_to.da_name, &targvp, NULL, 0, NULL, cr,
2126 2253              NULL, NULL, NULL) == 0) {
2127 2254  
2128 2255                  if (rfs4_check_delegated(FWRITE, targvp, TRUE)) {
2129 2256                          VN_RELE(tovp);
2130 2257                          VN_RELE(fromvp);
2131 2258                          VN_RELE(srcvp);
2132 2259                          VN_RELE(targvp);
2133 2260                          curthread->t_flag |= T_WOULDBLOCK;
2134 2261                          return;
... 61 lines elided ...
2196 2323                  return;
2197 2324          }
2198 2325  
2199 2326          fh = args->la_to.da_fhandle;
2200 2327          to_exi = checkexport(&fh->fh_fsid, (fid_t *)&fh->fh_xlen);
2201 2328          if (to_exi == NULL) {
2202 2329                  VN_RELE(fromvp);
2203 2330                  *status = NFSERR_ACCES;
2204 2331                  return;
2205 2332          }
2206      -        exi_rele(to_exi);
     2333 +        exi_rele(&to_exi);
2207 2334  
2208 2335          if (to_exi != exi) {
2209 2336                  VN_RELE(fromvp);
2210 2337                  *status = NFSERR_XDEV;
2211 2338                  return;
2212 2339          }
2213 2340  
2214 2341          tovp = nfs_fhtovp(args->la_to.da_fhandle, exi);
2215 2342          if (tovp == NULL) {
2216 2343                  VN_RELE(fromvp);
... 17 lines elided ...
2234 2361                  return;
2235 2362          }
2236 2363  
2237 2364          if (rdonly(ro, tovp)) {
2238 2365                  VN_RELE(tovp);
2239 2366                  VN_RELE(fromvp);
2240 2367                  *status = NFSERR_ROFS;
2241 2368                  return;
2242 2369          }
2243 2370  
     2371 +        if (protect_zfs_mntpt(tovp) != 0) {
     2372 +                VN_RELE(tovp);
     2373 +                VN_RELE(fromvp);
     2374 +                *status = NFSERR_ACCES;
     2375 +                return;
     2376 +        }
     2377 +
2244 2378          error = VOP_LINK(tovp, fromvp, args->la_to.da_name, cr, NULL, 0);
2245 2379  
2246 2380          /*
2247 2381           * Force modified data and metadata out to stable storage.
2248 2382           */
2249 2383          (void) VOP_FSYNC(tovp, 0, cr, NULL);
2250 2384          (void) VOP_FSYNC(fromvp, FNODSYNC, cr, NULL);
2251 2385  
2252 2386          VN_RELE(tovp);
2253 2387          VN_RELE(fromvp);
... 2 lines elided ...
2256 2390  
2257 2391  }
2258 2392  void *
2259 2393  rfs_link_getfh(struct nfslinkargs *args)
2260 2394  {
2261 2395          return (args->la_from);
2262 2396  }
2263 2397  
2264 2398  /*
2265 2399   * Symbolicly link to a file.
2266      - * Create a file (to) with the given attributes which is a symbolic link
     2400 + * Create a file (from) with the given attributes which is a symbolic link
2267 2401   * to the given path name (to).
2268 2402   */
2269 2403  void
2270 2404  rfs_symlink(struct nfsslargs *args, enum nfsstat *status,
2271 2405      struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2272 2406  {
2273 2407          int error;
2274 2408          struct vattr va;
2275 2409          vnode_t *vp;
2276 2410          vnode_t *svp;
... 27 lines elided ...
2304 2438                  *status = puterrno(error);
2305 2439                  return;
2306 2440          }
2307 2441  
2308 2442          if (!(va.va_mask & AT_MODE)) {
2309 2443                  VN_RELE(vp);
2310 2444                  *status = NFSERR_INVAL;
2311 2445                  return;
2312 2446          }
2313 2447  
     2448 +        if (protect_zfs_mntpt(vp) != 0) {
     2449 +                VN_RELE(vp);
     2450 +                *status = NFSERR_ACCES;
     2451 +                return;
     2452 +        }
     2453 +
2314 2454          ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2315 2455          name = nfscmd_convname(ca, exi, args->sla_tnm,
2316 2456              NFSCMD_CONV_INBOUND, MAXPATHLEN);
2317 2457  
2318 2458          if (name == NULL) {
2319 2459                  *status = NFSERR_ACCES;
2320 2460                  return;
2321 2461          }
2322 2462  
2323 2463          va.va_type = VLNK;
... 72 lines elided ...
2396 2536                  dr->dr_status = puterrno(error);
2397 2537                  return;
2398 2538          }
2399 2539  
2400 2540          if (!(va.va_mask & AT_MODE)) {
2401 2541                  VN_RELE(vp);
2402 2542                  dr->dr_status = NFSERR_INVAL;
2403 2543                  return;
2404 2544          }
2405 2545  
     2546 +        if (protect_zfs_mntpt(vp) != 0) {
     2547 +                VN_RELE(vp);
     2548 +                dr->dr_status = NFSERR_ACCES;
     2549 +                return;
     2550 +        }
     2551 +
2406 2552          va.va_type = VDIR;
2407 2553          va.va_mask |= AT_TYPE;
2408 2554  
2409 2555          error = VOP_MKDIR(vp, name, &va, &dvp, cr, NULL, 0, NULL);
2410 2556  
2411 2557          if (!error) {
2412 2558                  /*
2413 2559                   * Attribtutes of the newly created directory should
2414 2560                   * be returned to the client.
2415 2561                   */
... 65 lines elided ...
2481 2627  
2482 2628          /*
2483 2629           * VOP_RMDIR takes a third argument (the current
2484 2630           * directory of the process).  That's because someone
2485 2631           * wants to return EINVAL if one tries to remove ".".
2486 2632           * Of course, NFS servers have no idea what their
2487 2633           * clients' current directories are.  We fake it by
2488 2634           * supplying a vnode known to exist and illegal to
2489 2635           * remove.
2490 2636           */
2491      -        error = VOP_RMDIR(vp, da->da_name, rootdir, cr, NULL, 0);
     2637 +        error = VOP_RMDIR(vp, da->da_name, ZONE_ROOTVP(), cr, NULL, 0);
2492 2638  
2493 2639          /*
2494 2640           * Force modified data and metadata out to stable storage.
2495 2641           */
2496 2642          (void) VOP_FSYNC(vp, 0, cr, NULL);
2497 2643  
2498 2644          VN_RELE(vp);
2499 2645  
2500 2646          /*
2501 2647           * System V defines rmdir to return EEXIST, not ENOTEMPTY,
... 6 lines elided ...
2508 2654          else
2509 2655                  *status = puterrno(error);
2510 2656  
2511 2657  }
2512 2658  void *
2513 2659  rfs_rmdir_getfh(struct nfsdiropargs *da)
2514 2660  {
2515 2661          return (da->da_fhandle);
2516 2662  }
2517 2663  
     2664 +#ifdef nextdp
     2665 +#undef nextdp
     2666 +#endif
     2667 +#define nextdp(dp)      ((struct dirent64 *)((char *)(dp) + (dp)->d_reclen))
     2668 +
2518 2669  /* ARGSUSED */
2519 2670  void
2520 2671  rfs_readdir(struct nfsrddirargs *rda, struct nfsrddirres *rd,
2521 2672      struct exportinfo *exi, struct svc_req *req, cred_t *cr, bool_t ro)
2522 2673  {
2523 2674          int error;
2524      -        int iseof;
     2675 +        vnode_t *vp;
2525 2676          struct iovec iov;
2526 2677          struct uio uio;
2527      -        vnode_t *vp;
2528      -        char *ndata = NULL;
     2678 +        int iseof;
     2679 +
     2680 +        uint32_t count = rda->rda_count;
     2681 +        uint32_t size;          /* size of the readdirres structure */
     2682 +        int overflow = 0;
     2683 +
     2684 +        size_t datasz;
     2685 +        char *data = NULL;
     2686 +        dirent64_t *dp;
     2687 +
2529 2688          struct sockaddr *ca;
2530      -        size_t nents;
2531      -        int ret;
     2689 +        struct nfsentry **eptr;
     2690 +        struct nfsentry *entry;
2532 2691  
2533 2692          vp = nfs_fhtovp(&rda->rda_fh, exi);
2534 2693          if (vp == NULL) {
2535      -                rd->rd_entries = NULL;
2536 2694                  rd->rd_status = NFSERR_STALE;
2537 2695                  return;
2538 2696          }
2539 2697  
2540 2698          if (vp->v_type != VDIR) {
2541 2699                  VN_RELE(vp);
2542      -                rd->rd_entries = NULL;
2543 2700                  rd->rd_status = NFSERR_NOTDIR;
2544 2701                  return;
2545 2702          }
2546 2703  
2547 2704          (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
2548 2705  
2549 2706          error = VOP_ACCESS(vp, VREAD, 0, cr, NULL);
2550      -
2551      -        if (error) {
2552      -                rd->rd_entries = NULL;
     2707 +        if (error)
2553 2708                  goto bad;
2554      -        }
2555 2709  
2556      -        if (rda->rda_count == 0) {
2557      -                rd->rd_entries = NULL;
2558      -                rd->rd_size = 0;
2559      -                rd->rd_eof = FALSE;
2560      -                goto bad;
2561      -        }
     2710 +        /*
     2711 +         * Don't allow arbitrary counts for allocation
     2712 +         */
     2713 +        if (count > NFS_MAXDATA)
     2714 +                count = NFS_MAXDATA;
2562 2715  
2563      -        rda->rda_count = MIN(rda->rda_count, NFS_MAXDATA);
     2716 +        /*
     2717 +         * struct readdirres:
     2718 +         *   status:            1
     2719 +         *   entries (bool):    1
     2720 +         *   eof:               1
     2721 +         */
     2722 +        size = (1 + 1 + 1) * BYTES_PER_XDR_UNIT;
2564 2723  
     2724 +        if (size > count) {
     2725 +                eptr = &rd->rd_entries;
     2726 +                iseof = 0;
     2727 +                size = 0;
     2728 +
     2729 +                goto done;
     2730 +        }
     2731 +
2565 2732          /*
2566      -         * Allocate data for entries.  This will be freed by rfs_rddirfree.
     2733 +         * This is a simplification.  The dirent64_t size is not the same as
     2734 +         * the size of the XDR representation of an entry, but the sizes are
     2735 +         * similar, so we'll assume they are the same.  This assumption should
     2736 +         * not cause any harm; at worst we will issue VOP_READDIR() once more.
2567 2737           */
2568      -        rd->rd_bufsize = (uint_t)rda->rda_count;
2569      -        rd->rd_entries = kmem_alloc(rd->rd_bufsize, KM_SLEEP);
     2738 +        datasz = count;
2570 2739  
2571 2740          /*
2572      -         * Set up io vector to read directory data
     2741 +         * Make sure that there is room to read at least one entry
     2742 +         * if any are available.
2573 2743           */
2574      -        iov.iov_base = (caddr_t)rd->rd_entries;
2575      -        iov.iov_len = rda->rda_count;
     2744 +        if (datasz < DIRENT64_RECLEN(MAXNAMELEN))
     2745 +                datasz = DIRENT64_RECLEN(MAXNAMELEN);
     2746 +
     2747 +        data = kmem_alloc(datasz, KM_NOSLEEP);
     2748 +        if (data == NULL) {
     2749 +                /* The allocation failed; downsize and wait for it this time */
     2750 +                if (datasz > MAXBSIZE)
     2751 +                        datasz = MAXBSIZE;
     2752 +                data = kmem_alloc(datasz, KM_SLEEP);
     2753 +        }
     2754 +
2576 2755          uio.uio_iov = &iov;
2577 2756          uio.uio_iovcnt = 1;
2578 2757          uio.uio_segflg = UIO_SYSSPACE;
2579 2758          uio.uio_extflg = UIO_COPY_CACHED;
2580 2759          uio.uio_loffset = (offset_t)rda->rda_offset;
2581      -        uio.uio_resid = rda->rda_count;
     2760 +        uio.uio_resid = datasz;
2582 2761  
2583      -        /*
2584      -         * read directory
2585      -         */
     2762 +        ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
     2763 +        eptr = &rd->rd_entries;
     2764 +        entry = NULL;
     2765 +
     2766 +getmoredents:
     2767 +        iov.iov_base = data;
     2768 +        iov.iov_len = datasz;
     2769 +
2586 2770          error = VOP_READDIR(vp, &uio, cr, &iseof, NULL, 0);
     2771 +        if (error) {
     2772 +                iseof = 0;
     2773 +                goto done;
     2774 +        }
2587 2775  
2588      -        /*
2589      -         * Clean up
2590      -         */
2591      -        if (!error) {
2592      -                /*
2593      -                 * set size and eof
2594      -                 */
2595      -                if (uio.uio_resid == rda->rda_count) {
2596      -                        rd->rd_size = 0;
2597      -                        rd->rd_eof = TRUE;
2598      -                } else {
2599      -                        rd->rd_size = (uint32_t)(rda->rda_count -
2600      -                            uio.uio_resid);
2601      -                        rd->rd_eof = iseof ? TRUE : FALSE;
     2776 +        if (iov.iov_len == datasz)
     2777 +                goto done;
     2778 +
     2779 +        for (dp = (dirent64_t *)data;
     2780 +            (char *)dp - data < datasz - iov.iov_len && !overflow;
     2781 +            dp = nextdp(dp)) {
     2782 +                char *name;
     2783 +                uint32_t esize;
     2784 +                uint32_t cookie;
     2785 +
     2786 +                overflow = (uint64_t)dp->d_off > UINT32_MAX;
     2787 +                if (overflow) {
     2788 +                        cookie = 0;
     2789 +                        iseof = 1;
     2790 +                } else
     2791 +                        cookie = (uint32_t)dp->d_off;
     2792 +
     2793 +                if (dp->d_ino == 0 || (uint64_t)dp->d_ino > UINT32_MAX) {
     2794 +                        if (entry != NULL)
     2795 +                                entry->cookie = cookie;
     2796 +                        continue;
2602 2797                  }
2603      -        }
2604 2798  
2605      -        ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
2606      -        nents = nfscmd_countents((char *)rd->rd_entries, rd->rd_size);
2607      -        ret = nfscmd_convdirplus(ca, exi, (char *)rd->rd_entries, nents,
2608      -            rda->rda_count, &ndata);
     2799 +                name = nfscmd_convname(ca, exi, dp->d_name,
     2800 +                    NFSCMD_CONV_OUTBOUND, NFS_MAXPATHLEN + 1);
     2801 +                if (name == NULL) {
     2802 +                        if (entry != NULL)
     2803 +                                entry->cookie = cookie;
     2804 +                        continue;
     2805 +                }
2609 2806  
2610      -        if (ret != 0) {
2611      -                size_t dropbytes;
2612 2807                  /*
2613      -                 * We had to drop one or more entries in order to fit
2614      -                 * during the character conversion.  We need to patch
2615      -                 * up the size and eof info.
     2808 +                 * struct entry:
     2809 +                 *   fileid:            1
     2810 +                 *   name (length):     1
     2811 +                 *   name (data):       length (rounded up)
     2812 +                 *   cookie:            1
     2813 +                 *   nextentry (bool):  1
2616 2814                   */
2617      -                if (rd->rd_eof)
2618      -                        rd->rd_eof = FALSE;
2619      -                dropbytes = nfscmd_dropped_entrysize(
2620      -                    (struct dirent64 *)rd->rd_entries, nents, ret);
2621      -                rd->rd_size -= dropbytes;
     2815 +                esize = (1 + 1 + 1 + 1) * BYTES_PER_XDR_UNIT +
     2816 +                    RNDUP(strlen(name));
     2817 +
     2818 +                /* If the new entry does not fit, discard it */
     2819 +                if (esize > count - size) {
     2820 +                        if (name != dp->d_name)
     2821 +                                kmem_free(name, NFS_MAXPATHLEN + 1);
     2822 +                        iseof = 0;
     2823 +                        goto done;
     2824 +                }
     2825 +
     2826 +                entry = kmem_alloc(sizeof (struct nfsentry), KM_SLEEP);
     2827 +
     2828 +                entry->fileid = (uint32_t)dp->d_ino;
     2829 +                entry->name = strdup(name);
     2830 +                if (name != dp->d_name)
     2831 +                        kmem_free(name, NFS_MAXPATHLEN + 1);
     2832 +                entry->cookie = cookie;
     2833 +
     2834 +                size += esize;
     2835 +
     2836 +                /* Add the entry to the linked list */
     2837 +                *eptr = entry;
     2838 +                eptr = &entry->nextentry;
2622 2839          }
2623      -        if (ndata == NULL) {
2624      -                ndata = (char *)rd->rd_entries;
2625      -        } else if (ndata != (char *)rd->rd_entries) {
2626      -                kmem_free(rd->rd_entries, rd->rd_bufsize);
2627      -                rd->rd_entries = (void *)ndata;
2628      -                rd->rd_bufsize = rda->rda_count;
     2840 +
     2841 +        if (!iseof && size < count) {
     2842 +                uio.uio_resid = MIN(datasz, MAXBSIZE);
     2843 +                goto getmoredents;
2629 2844          }
2630 2845  
     2846 +done:
     2847 +        *eptr = NULL;
     2848 +
     2849 +        if (iseof || rd->rd_entries != NULL || !error) {
     2850 +                error = 0;
     2851 +                rd->rd_eof = iseof ? TRUE : FALSE;
     2852 +
     2853 +                /* This is for nfslog only */
     2854 +                rd->rd_offset = rda->rda_offset;
     2855 +                rd->rd_size = size;
     2856 +        }
     2857 +
2631 2858  bad:
2632 2859          VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
2633 2860  
2634 2861  #if 0 /* notyet */
2635 2862          /*
2636 2863           * Don't do this.  It causes local disk writes when just
2637 2864           * reading the file and the overhead is deemed larger
2638 2865           * than the benefit.
2639 2866           */
2640 2867          /*
2641 2868           * Force modified metadata out to stable storage.
2642 2869           */
2643 2870          (void) VOP_FSYNC(vp, FNODSYNC, cr, NULL);
2644 2871  #endif
2645 2872  
2646 2873          VN_RELE(vp);
2647 2874  
2648 2875          rd->rd_status = puterrno(error);
2649 2876  
     2877 +        if (data != NULL)
     2878 +                kmem_free(data, datasz);
2650 2879  }
2651 2880  void *
2652 2881  rfs_readdir_getfh(struct nfsrddirargs *rda)
2653 2882  {
2654 2883          return (&rda->rda_fh);
2655 2884  }
2656 2885  void
2657 2886  rfs_rddirfree(struct nfsrddirres *rd)
2658 2887  {
2659      -        if (rd->rd_entries != NULL)
2660      -                kmem_free(rd->rd_entries, rd->rd_bufsize);
     2888 +        if (rd->rd_status == NFS_OK) {
     2889 +                struct nfsentry *entry, *nentry;
     2890 +
     2891 +                for (entry = rd->rd_entries; entry != NULL; entry = nentry) {
     2892 +                        nentry = entry->nextentry;
     2893 +                        strfree(entry->name);
     2894 +                        kmem_free(entry, sizeof (struct nfsentry));
     2895 +                }
     2896 +        }
2661 2897  }
2662 2898  
2663 2899  /* ARGSUSED */
2664 2900  void
2665 2901  rfs_statfs(fhandle_t *fh, struct nfsstatfs *fs, struct exportinfo *exi,
2666 2902      struct svc_req *req, cred_t *cr, bool_t ro)
2667 2903  {
2668 2904          int error;
2669 2905          struct statvfs64 sb;
2670 2906          vnode_t *vp;
... 85 lines elided ...
2756 2992                  /*
2757 2993                   * nfs protocol defines times as unsigned so don't extend sign,
2758 2994                   * unless sysadmin set nfs_allow_preepoch_time.
2759 2995                   */
2760 2996                  NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, sa->sa_mtime.tv_sec);
2761 2997                  vap->va_mtime.tv_nsec = (uint32_t)(sa->sa_mtime.tv_usec * 1000);
2762 2998          }
2763 2999          return (0);
2764 3000  }
2765 3001  
2766      -static enum nfsftype vt_to_nf[] = {
     3002 +static const enum nfsftype vt_to_nf[] = {
2767 3003          0, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, 0, 0, 0, NFSOC, 0
2768 3004  };
2769 3005  
2770 3006  /*
2771 3007   * check the following fields for overflow: nodeid, size, and time.
2772 3008   * There could be a problem when converting 64-bit LP64 fields
2773 3009   * into 32-bit ones.  Return an error if there is an overflow.
2774 3010   */
2775 3011  int
2776 3012  vattr_to_nattr(struct vattr *vap, struct nfsfattr *na)
... 198 lines elided ...
2975 3211                  }
2976 3212                  if (vsa.vsa_aclcnt)
2977 3213                          kmem_free(vsa.vsa_aclentp,
2978 3214                              vsa.vsa_aclcnt * sizeof (aclent_t));
2979 3215          }
2980 3216  }
2981 3217  
2982 3218  void
2983 3219  rfs_srvrinit(void)
2984 3220  {
2985      -        mutex_init(&rfs_async_write_lock, NULL, MUTEX_DEFAULT, NULL);
2986 3221          nfs2_srv_caller_id = fs_new_caller_id();
     3222 +        zone_key_create(&rfs_zone_key, rfs_zone_init, NULL, rfs_zone_fini);
2987 3223  }
2988 3224  
2989 3225  void
2990 3226  rfs_srvrfini(void)
2991 3227  {
2992      -        mutex_destroy(&rfs_async_write_lock);
2993 3228  }
2994 3229  
     3230 +/* ARGSUSED */
     3231 +static void *
     3232 +rfs_zone_init(zoneid_t zoneid)
     3233 +{
     3234 +        nfs_srv_t *ns;
     3235 +
     3236 +        ns = kmem_zalloc(sizeof (*ns), KM_SLEEP);
     3237 +
     3238 +        mutex_init(&ns->async_write_lock, NULL, MUTEX_DEFAULT, NULL);
     3239 +        ns->write_async = 1;
     3240 +
     3241 +        return (ns);
     3242 +}
     3243 +
     3244 +/* ARGSUSED */
     3245 +static void
     3246 +rfs_zone_fini(zoneid_t zoneid, void *data)
     3247 +{
     3248 +        nfs_srv_t *ns;
     3249 +
     3250 +        ns = (nfs_srv_t *)data;
     3251 +        mutex_destroy(&ns->async_write_lock);
     3252 +        kmem_free(ns, sizeof (*ns));
     3253 +}
     3254 +
2995 3255  static int
2996 3256  rdma_setup_read_data2(struct nfsreadargs *ra, struct nfsrdresult *rr)
2997 3257  {
2998 3258          struct clist    *wcl;
2999 3259          int             wlist_len;
3000 3260          uint32_t        count = rr->rr_count;
3001 3261  
3002 3262          wcl = ra->ra_wlist;
3003 3263  
3004 3264          if (rdma_setup_read_chunks(wcl, count, &wlist_len) == FALSE) {
3005 3265                  return (FALSE);
3006 3266          }
3007 3267  
3008 3268          wcl = ra->ra_wlist;
3009 3269          rr->rr_ok.rrok_wlist_len = wlist_len;
3010 3270          rr->rr_ok.rrok_wlist = wcl;
3011 3271  
3012 3272          return (TRUE);
3013 3273  }
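
For reference on NEX-17125 ("nbmand lock entered but not exited on error path"): the change above adds the missing nbl_end_crit() on rfs_read()'s delegation-conflict exit. The sketch below is not part of the webrev; it only illustrates the pairing rule that the fix restores, assuming a hypothetical my_read_op(). nbl_need_check(), nbl_start_crit(), nbl_conflict(), nbl_end_crit(), and VOP_RWLOCK() are the illumos interfaces used in nfs_srv.c.

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/vnode.h>
#include <sys/rwlock.h>
#include <sys/nbmlock.h>

/* ARGSUSED */
static int
my_read_op(vnode_t *vp, u_offset_t off, ssize_t len, cred_t *cr)
{
        caller_context_t ct;
        int in_crit = 0;
        int error;

        bzero(&ct, sizeof (ct));

        if (nbl_need_check(vp)) {
                /* Enter the nbmand critical region, as rfs_read() does. */
                nbl_start_crit(vp, RW_READER);
                in_crit = 1;
                if (nbl_conflict(vp, NBL_READ, off, len, 0, NULL)) {
                        error = EACCES;
                        goto out;
                }
        }

        error = VOP_RWLOCK(vp, V_WRITELOCK_FALSE, &ct);

        /* Delegation conflict: this error path still exits the critical region. */
        if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK))
                goto out;

        /* A real implementation would do the VOP_READ() here. */
        error = 0;

        VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, &ct);
out:
        if (in_crit)
                nbl_end_crit(vp);       /* always paired with nbl_start_crit() */
        return (error);
}

The clustered rfs_write() path in the listing already follows this rule; the NEX-17125 change brings rfs_read() in line with it.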
    