1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 
  26 /*
  27  * Copyright 2018 Nexenta Systems, Inc.
  28  */
  29 
  30 #include <sys/systm.h>
  31 #include <sys/kmem.h>
  32 #include <sys/cmn_err.h>
  33 #include <sys/atomic.h>
  34 #include <sys/clconf.h>
  35 #include <sys/cladm.h>
  36 #include <sys/flock.h>
  37 #include <nfs/export.h>
  38 #include <nfs/nfs.h>
  39 #include <nfs/nfs4.h>
  40 #include <nfs/nfssys.h>
  41 #include <nfs/lm.h>
  42 #include <sys/pathname.h>
  43 #include <sys/sdt.h>
  44 #include <sys/nvpair.h>
  45 
  46 extern u_longlong_t nfs4_srv_caller_id;
  47 
  48 extern uint_t nfs4_srv_vkey;
  49 
  50 stateid4 special0 = {
  51         0,
  52         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
  53 };
  54 
  55 stateid4 special1 = {
  56         0xffffffff,
  57         {
  58                 (char)0xff, (char)0xff, (char)0xff, (char)0xff,
  59                 (char)0xff, (char)0xff, (char)0xff, (char)0xff,
  60                 (char)0xff, (char)0xff, (char)0xff, (char)0xff
  61         }
  62 };
  63 
  64 
  65 #define ISSPECIAL(id)  (stateid4_cmp(id, &special0) || \
  66                         stateid4_cmp(id, &special1))
  67 
  68 /* For embedding the cluster nodeid into our clientid */
  69 #define CLUSTER_NODEID_SHIFT    24
  70 #define CLUSTER_MAX_NODEID      255
  71 
  72 #ifdef DEBUG
  73 int rfs4_debug;
  74 #endif
  75 
  76 static uint32_t rfs4_database_debug = 0x00;
  77 
  78 /* CSTYLED */
  79 static void rfs4_ss_clid_write(nfs4_srv_t *nsrv4, rfs4_client_t *cp, char *leaf);
  80 static void rfs4_ss_clid_write_one(rfs4_client_t *cp, char *dir, char *leaf);
  81 static void rfs4_dss_clear_oldstate(rfs4_servinst_t *sip);
  82 static void rfs4_ss_chkclid_sip(rfs4_client_t *cp, rfs4_servinst_t *sip);
  83 
  84 /*
  85  * Couple of simple init/destroy functions for a general waiter
  86  */
  87 void
  88 rfs4_sw_init(rfs4_state_wait_t *swp)
  89 {
  90         mutex_init(swp->sw_cv_lock, NULL, MUTEX_DEFAULT, NULL);
  91         cv_init(swp->sw_cv, NULL, CV_DEFAULT, NULL);
  92         swp->sw_active = FALSE;
  93         swp->sw_wait_count = 0;
  94 }
  95 
  96 void
  97 rfs4_sw_destroy(rfs4_state_wait_t *swp)
  98 {
  99         mutex_destroy(swp->sw_cv_lock);
 100         cv_destroy(swp->sw_cv);
 101 }
 102 
 103 void
 104 rfs4_sw_enter(rfs4_state_wait_t *swp)
 105 {
 106         mutex_enter(swp->sw_cv_lock);
 107         while (swp->sw_active) {
 108                 swp->sw_wait_count++;
 109                 cv_wait(swp->sw_cv, swp->sw_cv_lock);
 110                 swp->sw_wait_count--;
 111         }
 112         ASSERT(swp->sw_active == FALSE);
 113         swp->sw_active = TRUE;
 114         mutex_exit(swp->sw_cv_lock);
 115 }
 116 
 117 void
 118 rfs4_sw_exit(rfs4_state_wait_t *swp)
 119 {
 120         mutex_enter(swp->sw_cv_lock);
 121         ASSERT(swp->sw_active == TRUE);
 122         swp->sw_active = FALSE;
 123         if (swp->sw_wait_count != 0)
 124                 cv_broadcast(swp->sw_cv);
 125         mutex_exit(swp->sw_cv_lock);
 126 }
 127 
 128 static void
 129 deep_lock_copy(LOCK4res *dres, LOCK4res *sres)
 130 {
 131         lock_owner4 *slo = &sres->LOCK4res_u.denied.owner;
 132         lock_owner4 *dlo = &dres->LOCK4res_u.denied.owner;
 133 
 134         if (sres->status == NFS4ERR_DENIED) {
 135                 dlo->owner_val = kmem_alloc(slo->owner_len, KM_SLEEP);
 136                 bcopy(slo->owner_val, dlo->owner_val, slo->owner_len);
 137         }
 138 }
 139 
 140 /*
 141  * CPR callback id -- not related to v4 callbacks
 142  */
 143 static callb_id_t cpr_id = 0;
 144 
 145 static void
 146 deep_lock_free(LOCK4res *res)
 147 {
 148         lock_owner4 *lo = &res->LOCK4res_u.denied.owner;
 149 
 150         if (res->status == NFS4ERR_DENIED)
 151                 kmem_free(lo->owner_val, lo->owner_len);
 152 }
 153 
 154 static void
 155 deep_open_copy(OPEN4res *dres, OPEN4res *sres)
 156 {
 157         nfsace4 *sacep, *dacep;
 158 
 159         if (sres->status != NFS4_OK) {
 160                 return;
 161         }
 162 
 163         dres->attrset = sres->attrset;
 164 
 165         switch (sres->delegation.delegation_type) {
 166         case OPEN_DELEGATE_NONE:
 167                 return;
 168         case OPEN_DELEGATE_READ:
 169                 sacep = &sres->delegation.open_delegation4_u.read.permissions;
 170                 dacep = &dres->delegation.open_delegation4_u.read.permissions;
 171                 break;
 172         case OPEN_DELEGATE_WRITE:
 173                 sacep = &sres->delegation.open_delegation4_u.write.permissions;
 174                 dacep = &dres->delegation.open_delegation4_u.write.permissions;
 175                 break;
 176         }
 177         dacep->who.utf8string_val =
 178             kmem_alloc(sacep->who.utf8string_len, KM_SLEEP);
 179         bcopy(sacep->who.utf8string_val, dacep->who.utf8string_val,
 180             sacep->who.utf8string_len);
 181 }
 182 
 183 static void
 184 deep_open_free(OPEN4res *res)
 185 {
 186         nfsace4 *acep;
 187         if (res->status != NFS4_OK)
 188                 return;
 189 
 190         switch (res->delegation.delegation_type) {
 191         case OPEN_DELEGATE_NONE:
 192                 return;
 193         case OPEN_DELEGATE_READ:
 194                 acep = &res->delegation.open_delegation4_u.read.permissions;
 195                 break;
 196         case OPEN_DELEGATE_WRITE:
 197                 acep = &res->delegation.open_delegation4_u.write.permissions;
 198                 break;
 199         }
 200 
 201         if (acep->who.utf8string_val) {
 202                 kmem_free(acep->who.utf8string_val, acep->who.utf8string_len);
 203                 acep->who.utf8string_val = NULL;
 204         }
 205 }
 206 
 207 void
 208 rfs4_free_reply(nfs_resop4 *rp)
 209 {
 210         switch (rp->resop) {
 211         case OP_LOCK:
 212                 deep_lock_free(&rp->nfs_resop4_u.oplock);
 213                 break;
 214         case OP_OPEN:
 215                 deep_open_free(&rp->nfs_resop4_u.opopen);
 216         default:
 217                 break;
 218         }
 219 }
 220 
 221 void
 222 rfs4_copy_reply(nfs_resop4 *dst, nfs_resop4 *src)
 223 {
 224         *dst = *src;
 225 
 226         /* Handle responses that need deep copy */
 227         switch (src->resop) {
 228         case OP_LOCK:
 229                 deep_lock_copy(&dst->nfs_resop4_u.oplock,
 230                     &src->nfs_resop4_u.oplock);
 231                 break;
 232         case OP_OPEN:
 233                 deep_open_copy(&dst->nfs_resop4_u.opopen,
 234                     &src->nfs_resop4_u.opopen);
 235                 break;
 236         default:
 237                 break;
 238         };
 239 }
 240 
 241 /*
 242  * This is the implementation of the underlying state engine. The
 243  * public interface to this engine is described by
 244  * nfs4_state.h. Callers to the engine should hold no state engine
 245  * locks when they call in to it. If the protocol needs to lock data
 246  * structures it should do so after acquiring all references to them
 247  * first and then follow the following lock order:
 248  *
 249  *      client > openowner > state > lo_state > lockowner > file.
 250  *
 251  * Internally we only allow a thread to hold one hash bucket lock at a
 252  * time and the lock is higher in the lock order (must be acquired
 253  * first) than the data structure that is on that hash list.
 254  *
 255  * If a new reference was acquired by the caller, that reference needs
 256  * to be released after releasing all acquired locks with the
 257  * corresponding rfs4_*_rele routine.
 258  */
 259 
/*
 * This code is somewhat prototypical for now. Its purpose currently is to
 * implement the interfaces sufficiently to finish the higher protocol
 * elements. This will be replaced by dynamically resizeable tables
 * backed by the kmem_cache allocator. However, synchronization is handled
 * correctly (I hope) and will not change by much.  The mutexes for
 * the hash buckets that can be used to create new instances of data
 * structures might be good candidates to evolve into reader writer
 * locks. If it has to do a creation, it would be holding the
 * mutex across a kmem_alloc with KM_SLEEP specified.
 */
 271 
 272 #ifdef DEBUG
 273 #define TABSIZE 17
 274 #else
 275 #define TABSIZE 2047
 276 #endif
 277 
 278 #define ADDRHASH(key) ((unsigned long)(key) >> 3)
 279 
 280 #define MAXTABSZ 1024*1024
 281 
 282 /* The values below are rfs4_lease_time units */
 283 
 284 #ifdef DEBUG
 285 #define CLIENT_CACHE_TIME 1
 286 #define OPENOWNER_CACHE_TIME 1
 287 #define STATE_CACHE_TIME 1
 288 #define LO_STATE_CACHE_TIME 1
 289 #define LOCKOWNER_CACHE_TIME 1
 290 #define FILE_CACHE_TIME 3
 291 #define DELEG_STATE_CACHE_TIME 1
 292 #else
 293 #define CLIENT_CACHE_TIME 10
 294 #define OPENOWNER_CACHE_TIME 5
 295 #define STATE_CACHE_TIME 1
 296 #define LO_STATE_CACHE_TIME 1
 297 #define LOCKOWNER_CACHE_TIME 3
 298 #define FILE_CACHE_TIME 40
 299 #define DELEG_STATE_CACHE_TIME 1
 300 #endif
 301 
/*
 * NFSv4 server state databases
 *
 * Initialized when the module is loaded and used by NFSv4 state tables.
 * These kmem_cache databases are global; the tables that make use of them
 * are per zone.
 */
 309 kmem_cache_t *rfs4_client_mem_cache;
 310 kmem_cache_t *rfs4_clntIP_mem_cache;
 311 kmem_cache_t *rfs4_openown_mem_cache;
 312 kmem_cache_t *rfs4_openstID_mem_cache;
 313 kmem_cache_t *rfs4_lockstID_mem_cache;
 314 kmem_cache_t *rfs4_lockown_mem_cache;
 315 kmem_cache_t *rfs4_file_mem_cache;
 316 kmem_cache_t *rfs4_delegstID_mem_cache;
 317 
 318 /*
 319  * NFSv4 state table functions
 320  */
 321 static bool_t rfs4_client_create(rfs4_entry_t, void *);
 322 static void rfs4_dss_remove_cpleaf(rfs4_client_t *);
 323 static void rfs4_dss_remove_leaf(rfs4_servinst_t *, char *, char *);
 324 static void rfs4_client_destroy(rfs4_entry_t);
 325 static bool_t rfs4_client_expiry(rfs4_entry_t);
 326 static uint32_t clientid_hash(void *);
 327 static bool_t clientid_compare(rfs4_entry_t, void *);
 328 static void *clientid_mkkey(rfs4_entry_t);
 329 static uint32_t nfsclnt_hash(void *);
 330 static bool_t nfsclnt_compare(rfs4_entry_t, void *);
 331 static void *nfsclnt_mkkey(rfs4_entry_t);
 332 static bool_t rfs4_clntip_expiry(rfs4_entry_t);
 333 static void rfs4_clntip_destroy(rfs4_entry_t);
 334 static bool_t rfs4_clntip_create(rfs4_entry_t, void *);
 335 static uint32_t clntip_hash(void *);
 336 static bool_t clntip_compare(rfs4_entry_t, void *);
 337 static void *clntip_mkkey(rfs4_entry_t);
 338 static bool_t rfs4_openowner_create(rfs4_entry_t, void *);
 339 static void rfs4_openowner_destroy(rfs4_entry_t);
 340 static bool_t rfs4_openowner_expiry(rfs4_entry_t);
 341 static uint32_t openowner_hash(void *);
 342 static bool_t openowner_compare(rfs4_entry_t, void *);
 343 static void *openowner_mkkey(rfs4_entry_t);
 344 static bool_t rfs4_state_create(rfs4_entry_t, void *);
 345 static void rfs4_state_destroy(rfs4_entry_t);
 346 static bool_t rfs4_state_expiry(rfs4_entry_t);
 347 static uint32_t state_hash(void *);
 348 static bool_t state_compare(rfs4_entry_t, void *);
 349 static void *state_mkkey(rfs4_entry_t);
 350 static uint32_t state_owner_file_hash(void *);
 351 static bool_t state_owner_file_compare(rfs4_entry_t, void *);
 352 static void *state_owner_file_mkkey(rfs4_entry_t);
 353 static uint32_t state_file_hash(void *);
 354 static bool_t state_file_compare(rfs4_entry_t, void *);
 355 static void *state_file_mkkey(rfs4_entry_t);
 356 static bool_t rfs4_lo_state_create(rfs4_entry_t, void *);
 357 static void rfs4_lo_state_destroy(rfs4_entry_t);
 358 static bool_t rfs4_lo_state_expiry(rfs4_entry_t);
 359 static uint32_t lo_state_hash(void *);
 360 static bool_t lo_state_compare(rfs4_entry_t, void *);
 361 static void *lo_state_mkkey(rfs4_entry_t);
 362 static uint32_t lo_state_lo_hash(void *);
 363 static bool_t lo_state_lo_compare(rfs4_entry_t, void *);
 364 static void *lo_state_lo_mkkey(rfs4_entry_t);
 365 static bool_t rfs4_lockowner_create(rfs4_entry_t, void *);
 366 static void rfs4_lockowner_destroy(rfs4_entry_t);
 367 static bool_t rfs4_lockowner_expiry(rfs4_entry_t);
 368 static uint32_t lockowner_hash(void *);
 369 static bool_t lockowner_compare(rfs4_entry_t, void *);
 370 static void *lockowner_mkkey(rfs4_entry_t);
 371 static uint32_t pid_hash(void *);
 372 static bool_t pid_compare(rfs4_entry_t, void *);
 373 static void *pid_mkkey(rfs4_entry_t);
 374 static bool_t rfs4_file_create(rfs4_entry_t, void *);
 375 static void rfs4_file_destroy(rfs4_entry_t);
 376 static uint32_t file_hash(void *);
 377 static bool_t file_compare(rfs4_entry_t, void *);
 378 static void *file_mkkey(rfs4_entry_t);
 379 static bool_t rfs4_deleg_state_create(rfs4_entry_t, void *);
 380 static void rfs4_deleg_state_destroy(rfs4_entry_t);
 381 static bool_t rfs4_deleg_state_expiry(rfs4_entry_t);
 382 static uint32_t deleg_hash(void *);
 383 static bool_t deleg_compare(rfs4_entry_t, void *);
 384 static void *deleg_mkkey(rfs4_entry_t);
 385 static uint32_t deleg_state_hash(void *);
 386 static bool_t deleg_state_compare(rfs4_entry_t, void *);
 387 static void *deleg_state_mkkey(rfs4_entry_t);
 388 
 389 static void rfs4_state_rele_nounlock(rfs4_state_t *);
 390 
 391 static int rfs4_ss_enabled = 0;
 392 
 393 extern void (*rfs4_client_clrst)(struct nfs4clrst_args *);
 394 
 395 void
 396 rfs4_ss_pnfree(rfs4_ss_pn_t *ss_pn)
 397 {
 398         kmem_free(ss_pn, sizeof (rfs4_ss_pn_t));
 399 }
 400 
 401 static rfs4_ss_pn_t *
 402 rfs4_ss_pnalloc(char *dir, char *leaf)
 403 {
 404         rfs4_ss_pn_t *ss_pn;
 405         int     dir_len, leaf_len;
 406 
 407         /*
 408          * validate we have a resonable path
 409          * (account for the '/' and trailing null)
 410          */
 411         if ((dir_len = strlen(dir)) > MAXPATHLEN ||
 412             (leaf_len = strlen(leaf)) > MAXNAMELEN ||
 413             (dir_len + leaf_len + 2) > MAXPATHLEN) {
 414                 return (NULL);
 415         }
 416 
 417         ss_pn = kmem_alloc(sizeof (rfs4_ss_pn_t), KM_SLEEP);
 418 
 419         (void) snprintf(ss_pn->pn, MAXPATHLEN, "%s/%s", dir, leaf);
 420         /* Handy pointer to just the leaf name */
 421         ss_pn->leaf = ss_pn->pn + dir_len + 1;
 422         return (ss_pn);
 423 }
 424 
 425 
 426 /*
 427  * Move the "leaf" filename from "sdir" directory
 428  * to the "ddir" directory. Return the pathname of
 429  * the destination unless the rename fails in which
 430  * case we need to return the source pathname.
 431  */
 432 static rfs4_ss_pn_t *
 433 rfs4_ss_movestate(char *sdir, char *ddir, char *leaf)
 434 {
 435         rfs4_ss_pn_t *src, *dst;
 436 
 437         if ((src = rfs4_ss_pnalloc(sdir, leaf)) == NULL)
 438                 return (NULL);
 439 
 440         if ((dst = rfs4_ss_pnalloc(ddir, leaf)) == NULL) {
 441                 rfs4_ss_pnfree(src);
 442                 return (NULL);
 443         }
 444 
 445         /*
 446          * If the rename fails we shall return the src
 447          * pathname and free the dst. Otherwise we need
 448          * to free the src and return the dst pathanme.
 449          */
 450         if (vn_rename(src->pn, dst->pn, UIO_SYSSPACE)) {
 451                 rfs4_ss_pnfree(dst);
 452                 return (src);
 453         }
 454         rfs4_ss_pnfree(src);
 455         return (dst);
 456 }
 457 
 458 
 459 static rfs4_oldstate_t *
 460 rfs4_ss_getstate(vnode_t *dvp, rfs4_ss_pn_t *ss_pn)
 461 {
 462         struct uio uio;
 463         struct iovec iov[3];
 464 
 465         rfs4_oldstate_t *cl_ss = NULL;
 466         vnode_t *vp;
 467         vattr_t va;
 468         uint_t id_len;
 469         int err, kill_file, file_vers;
 470 
 471         if (ss_pn == NULL)
 472                 return (NULL);
 473 
 474         /*
 475          * open the state file.
 476          */
 477         if (vn_open(ss_pn->pn, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0) != 0) {
 478                 return (NULL);
 479         }
 480 
 481         if (vp->v_type != VREG) {
 482                 (void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
 483                 VN_RELE(vp);
 484                 return (NULL);
 485         }
 486 
 487         err = VOP_ACCESS(vp, VREAD, 0, CRED(), NULL);
 488         if (err) {
 489                 /*
 490                  * We don't have read access? better get the heck out.
 491                  */
 492                 (void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
 493                 VN_RELE(vp);
 494                 return (NULL);
 495         }
 496 
 497         (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
 498         /*
 499          * get the file size to do some basic validation
 500          */
 501         va.va_mask = AT_SIZE;
 502         err = VOP_GETATTR(vp, &va, 0, CRED(), NULL);
 503 
 504         kill_file = (va.va_size == 0 || va.va_size <
 505             (NFS4_VERIFIER_SIZE + sizeof (uint_t)+1));
 506 
 507         if (err || kill_file) {
 508                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
 509                 (void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
 510                 VN_RELE(vp);
 511                 if (kill_file) {
 512                         (void) VOP_REMOVE(dvp, ss_pn->leaf, CRED(), NULL, 0);
 513                 }
 514                 return (NULL);
 515         }
 516 
 517         cl_ss = kmem_alloc(sizeof (rfs4_oldstate_t), KM_SLEEP);
 518 
 519         /*
 520          * build iovecs to read in the file_version, verifier and id_len
 521          */
 522         iov[0].iov_base = (caddr_t)&file_vers;
 523         iov[0].iov_len = sizeof (int);
 524         iov[1].iov_base = (caddr_t)&cl_ss->cl_id4.verifier;
 525         iov[1].iov_len = NFS4_VERIFIER_SIZE;
 526         iov[2].iov_base = (caddr_t)&id_len;
 527         iov[2].iov_len = sizeof (uint_t);
 528 
 529         uio.uio_iov = iov;
 530         uio.uio_iovcnt = 3;
 531         uio.uio_segflg = UIO_SYSSPACE;
 532         uio.uio_loffset = 0;
 533         uio.uio_resid = sizeof (int) + NFS4_VERIFIER_SIZE + sizeof (uint_t);
 534 
 535         if (err = VOP_READ(vp, &uio, FREAD, CRED(), NULL)) {
 536                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
 537                 (void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
 538                 VN_RELE(vp);
 539                 kmem_free(cl_ss, sizeof (rfs4_oldstate_t));
 540                 return (NULL);
 541         }
 542 
 543         /*
 544          * if the file_version doesn't match or if the
 545          * id_len is zero or the combination of the verifier,
 546          * id_len and id_val is bigger than the file we have
 547          * a problem. If so ditch the file.
 548          */
 549         kill_file = (file_vers != NFS4_SS_VERSION || id_len == 0 ||
 550             (id_len + NFS4_VERIFIER_SIZE + sizeof (uint_t)) > va.va_size);
 551 
 552         if (err || kill_file) {
 553                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
 554                 (void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
 555                 VN_RELE(vp);
 556                 kmem_free(cl_ss, sizeof (rfs4_oldstate_t));
 557                 if (kill_file) {
 558                         (void) VOP_REMOVE(dvp, ss_pn->leaf, CRED(), NULL, 0);
 559                 }
 560                 return (NULL);
 561         }
 562 
 563         /*
 564          * now get the client id value
 565          */
 566         cl_ss->cl_id4.id_val = kmem_alloc(id_len, KM_SLEEP);
 567         iov[0].iov_base = cl_ss->cl_id4.id_val;
 568         iov[0].iov_len = id_len;
 569 
 570         uio.uio_iov = iov;
 571         uio.uio_iovcnt = 1;
 572         uio.uio_segflg = UIO_SYSSPACE;
 573         uio.uio_resid = cl_ss->cl_id4.id_len = id_len;
 574 
 575         if (err = VOP_READ(vp, &uio, FREAD, CRED(), NULL)) {
 576                 VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
 577                 (void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
 578                 VN_RELE(vp);
 579                 kmem_free(cl_ss->cl_id4.id_val, id_len);
 580                 kmem_free(cl_ss, sizeof (rfs4_oldstate_t));
 581                 return (NULL);
 582         }
 583 
 584         VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
 585         (void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
 586         VN_RELE(vp);
 587         return (cl_ss);
 588 }
 589 
 590 #ifdef  nextdp
 591 #undef nextdp
 592 #endif
 593 #define nextdp(dp)      ((struct dirent64 *)((char *)(dp) + (dp)->d_reclen))
 594 
 595 /*
 596  * Add entries from statedir to supplied oldstate list.
 597  * Optionally, move all entries from statedir -> destdir.
 598  */
 599 void
 600 rfs4_ss_oldstate(rfs4_oldstate_t *oldstate, char *statedir, char *destdir)
 601 {
 602         rfs4_ss_pn_t *ss_pn;
 603         rfs4_oldstate_t *cl_ss = NULL;
 604         char    *dirt = NULL;
 605         int     err, dir_eof = 0, size = 0;
 606         vnode_t *dvp;
 607         struct iovec iov;
 608         struct uio uio;
 609         struct dirent64 *dep;
 610         offset_t dirchunk_offset = 0;
 611 
 612         /*
 613          * open the state directory
 614          */
 615         if (vn_open(statedir, UIO_SYSSPACE, FREAD, 0, &dvp, 0, 0))
 616                 return;
 617 
 618         if (dvp->v_type != VDIR || VOP_ACCESS(dvp, VREAD, 0, CRED(), NULL))
 619                 goto out;
 620 
 621         dirt = kmem_alloc(RFS4_SS_DIRSIZE, KM_SLEEP);
 622 
 623         /*
 624          * Get and process the directory entries
 625          */
 626         while (!dir_eof) {
 627                 (void) VOP_RWLOCK(dvp, V_WRITELOCK_FALSE, NULL);
 628                 iov.iov_base = dirt;
 629                 iov.iov_len = RFS4_SS_DIRSIZE;
 630                 uio.uio_iov = &iov;
 631                 uio.uio_iovcnt = 1;
 632                 uio.uio_segflg = UIO_SYSSPACE;
 633                 uio.uio_loffset = dirchunk_offset;
 634                 uio.uio_resid = RFS4_SS_DIRSIZE;
 635 
 636                 err = VOP_READDIR(dvp, &uio, CRED(), &dir_eof, NULL, 0);
 637                 VOP_RWUNLOCK(dvp, V_WRITELOCK_FALSE, NULL);
 638                 if (err)
 639                         goto out;
 640 
 641                 size = RFS4_SS_DIRSIZE - uio.uio_resid;
 642 
 643                 /*
 644                  * Process all the directory entries in this
 645                  * readdir chunk
 646                  */
 647                 for (dep = (struct dirent64 *)dirt; size > 0;
 648                     dep = nextdp(dep)) {
 649 
 650                         size -= dep->d_reclen;
 651                         dirchunk_offset = dep->d_off;
 652 
 653                         /*
 654                          * Skip '.' and '..'
 655                          */
 656                         if (NFS_IS_DOTNAME(dep->d_name))
 657                                 continue;
 658 
 659                         ss_pn = rfs4_ss_pnalloc(statedir, dep->d_name);
 660                         if (ss_pn == NULL)
 661                                 continue;
 662 
 663                         if (cl_ss = rfs4_ss_getstate(dvp, ss_pn)) {
 664                                 if (destdir != NULL) {
 665                                         rfs4_ss_pnfree(ss_pn);
 666                                         cl_ss->ss_pn = rfs4_ss_movestate(
 667                                             statedir, destdir, dep->d_name);
 668                                 } else {
 669                                         cl_ss->ss_pn = ss_pn;
 670                                 }
 671                                 insque(cl_ss, oldstate);
 672                         } else {
 673                                 rfs4_ss_pnfree(ss_pn);
 674                         }
 675                 }
 676         }
 677 
 678 out:
 679         (void) VOP_CLOSE(dvp, FREAD, 1, (offset_t)0, CRED(), NULL);
 680         VN_RELE(dvp);
 681         if (dirt)
 682                 kmem_free((caddr_t)dirt, RFS4_SS_DIRSIZE);
 683 }
 684 
 685 static void
 686 rfs4_ss_init(nfs4_srv_t *nsrv4)
 687 {
 688         int npaths = 1;
 689         char *default_dss_path = NFS4_DSS_VAR_DIR;
 690 
 691         /* read the default stable storage state */
 692         rfs4_dss_readstate(nsrv4, npaths, &default_dss_path);
 693 
 694         rfs4_ss_enabled = 1;
 695 }
 696 
 697 static void
 698 rfs4_ss_fini(nfs4_srv_t *nsrv4)
 699 {
 700         rfs4_servinst_t *sip;
 701 
 702         mutex_enter(&nsrv4->servinst_lock);
 703         sip = nsrv4->nfs4_cur_servinst;
 704         while (sip != NULL) {
 705                 rfs4_dss_clear_oldstate(sip);
 706                 sip = sip->next;
 707         }
 708         mutex_exit(&nsrv4->servinst_lock);
 709 }
 710 
 711 /*
 712  * Remove all oldstate files referenced by this servinst.
 713  */
 714 static void
 715 rfs4_dss_clear_oldstate(rfs4_servinst_t *sip)
 716 {
 717         rfs4_oldstate_t *os_head, *osp;
 718 
 719         rw_enter(&sip->oldstate_lock, RW_WRITER);
 720         os_head = sip->oldstate;
 721 
 722         if (os_head == NULL) {
 723                 rw_exit(&sip->oldstate_lock);
 724                 return;
 725         }
 726 
 727         /* skip dummy entry */
 728         osp = os_head->next;
 729         while (osp != os_head) {
 730                 char *leaf = osp->ss_pn->leaf;
 731                 rfs4_oldstate_t *os_next;
 732 
 733                 rfs4_dss_remove_leaf(sip, NFS4_DSS_OLDSTATE_LEAF, leaf);
 734 
 735                 if (osp->cl_id4.id_val)
 736                         kmem_free(osp->cl_id4.id_val, osp->cl_id4.id_len);
 737                 rfs4_ss_pnfree(osp->ss_pn);
 738 
 739                 os_next = osp->next;
 740                 remque(osp);
 741                 kmem_free(osp, sizeof (rfs4_oldstate_t));
 742                 osp = os_next;
 743         }
 744 
 745         rw_exit(&sip->oldstate_lock);
 746 }
 747 
 748 /*
 749  * Form the state and oldstate paths, and read in the stable storage files.
 750  */
 751 void
 752 rfs4_dss_readstate(nfs4_srv_t *nsrv4, int npaths, char **paths)
 753 {
 754         int i;
 755         char *state, *oldstate;
 756 
 757         state = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 758         oldstate = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 759 
 760         for (i = 0; i < npaths; i++) {
 761                 char *path = paths[i];
 762 
 763                 (void) sprintf(state, "%s/%s", path, NFS4_DSS_STATE_LEAF);
 764                 (void) sprintf(oldstate, "%s/%s", path, NFS4_DSS_OLDSTATE_LEAF);
 765 
 766                 /*
 767                  * Populate the current server instance's oldstate list.
 768                  *
 769                  * 1. Read stable storage data from old state directory,
 770                  *    leaving its contents alone.
 771                  *
 772                  * 2. Read stable storage data from state directory,
 773                  *    and move the latter's contents to old state
 774                  *    directory.
 775                  */
 776                 /* CSTYLED */
 777                 rfs4_ss_oldstate(nsrv4->nfs4_cur_servinst->oldstate, oldstate, NULL);
 778                 /* CSTYLED */
 779                 rfs4_ss_oldstate(nsrv4->nfs4_cur_servinst->oldstate, state, oldstate);
 780         }
 781 
 782         kmem_free(state, MAXPATHLEN);
 783         kmem_free(oldstate, MAXPATHLEN);
 784 }
 785 
 786 
 787 /*
 788  * Check if we are still in grace and if the client can be
 789  * granted permission to perform reclaims.
 790  */
 791 void
 792 rfs4_ss_chkclid(nfs4_srv_t *nsrv4, rfs4_client_t *cp)
 793 {
 794         rfs4_servinst_t *sip;
 795 
 796         /*
 797          * It should be sufficient to check the oldstate data for just
 798          * this client's instance. However, since our per-instance
 799          * client grouping is solely temporal, HA-NFSv4 RG failover
 800          * might result in clients of the same RG being partitioned into
 801          * separate instances.
 802          *
 803          * Until the client grouping is improved, we must check the
 804          * oldstate data for all instances with an active grace period.
 805          *
 806          * This also serves as the mechanism to remove stale oldstate data.
 807          * The first time we check an instance after its grace period has
 808          * expired, the oldstate data should be cleared.
 809          *
 810          * Start at the current instance, and walk the list backwards
 811          * to the first.
 812          */
 813         mutex_enter(&nsrv4->servinst_lock);
 814         for (sip = nsrv4->nfs4_cur_servinst; sip != NULL; sip = sip->prev) {
 815                 rfs4_ss_chkclid_sip(cp, sip);
 816 
 817                 /* if the above check found this client, we're done */
 818                 if (cp->rc_can_reclaim)
 819                         break;
 820         }
 821         mutex_exit(&nsrv4->servinst_lock);
 822 }
 823 
 824 static void
 825 rfs4_ss_chkclid_sip(rfs4_client_t *cp, rfs4_servinst_t *sip)
 826 {
 827         rfs4_oldstate_t *osp, *os_head;
 828 
 829         /* short circuit everything if this server instance has no oldstate */
 830         rw_enter(&sip->oldstate_lock, RW_READER);
 831         os_head = sip->oldstate;
 832         rw_exit(&sip->oldstate_lock);
 833         if (os_head == NULL)
 834                 return;
 835 
 836         /*
 837          * If this server instance is no longer in a grace period then
 838          * the client won't be able to reclaim. No further need for this
 839          * instance's oldstate data, so it can be cleared.
 840          */
 841         if (!rfs4_servinst_in_grace(sip))
 842                 return;
 843 
 844         /* this instance is still in grace; search for the clientid */
 845 
 846         rw_enter(&sip->oldstate_lock, RW_READER);
 847 
 848         os_head = sip->oldstate;
 849         /* skip dummy entry */
 850         osp = os_head->next;
 851         while (osp != os_head) {
 852                 if (osp->cl_id4.id_len == cp->rc_nfs_client.id_len) {
 853                         if (bcmp(osp->cl_id4.id_val, cp->rc_nfs_client.id_val,
 854                             osp->cl_id4.id_len) == 0) {
 855                                 cp->rc_can_reclaim = 1;
 856                                 break;
 857                         }
 858                 }
 859                 osp = osp->next;
 860         }
 861 
 862         rw_exit(&sip->oldstate_lock);
 863 }
 864 
 865 /*
 866  * Place client information into stable storage: 1/3.
 867  * First, generate the leaf filename, from the client's IP address and
 868  * the server-generated short-hand clientid.
 869  */
 870 void
 871 rfs4_ss_clid(nfs4_srv_t *nsrv4, rfs4_client_t *cp)
 872 {
 873         const char *kinet_ntop6(uchar_t *, char *, size_t);
 874         char leaf[MAXNAMELEN], buf[INET6_ADDRSTRLEN];
 875         struct sockaddr *ca;
 876         uchar_t *b;
 877 
 878         if (rfs4_ss_enabled == 0) {
 879                 return;
 880         }
 881 
 882         buf[0] = 0;
 883 
 884         ca = (struct sockaddr *)&cp->rc_addr;
 885 
 886         /*
 887          * Convert the caller's IP address to a dotted string
 888          */
 889         if (ca->sa_family == AF_INET) {
 890                 b = (uchar_t *)&((struct sockaddr_in *)ca)->sin_addr;
 891                 (void) sprintf(buf, "%03d.%03d.%03d.%03d", b[0] & 0xFF,
 892                     b[1] & 0xFF, b[2] & 0xFF, b[3] & 0xFF);
 893         } else if (ca->sa_family == AF_INET6) {
 894                 struct sockaddr_in6 *sin6;
 895 
 896                 sin6 = (struct sockaddr_in6 *)ca;
 897                 (void) kinet_ntop6((uchar_t *)&sin6->sin6_addr,
 898                     buf, INET6_ADDRSTRLEN);
 899         }
 900 
 901         (void) snprintf(leaf, MAXNAMELEN, "%s-%llx", buf,
 902             (longlong_t)cp->rc_clientid);
 903         rfs4_ss_clid_write(nsrv4, cp, leaf);
 904 }
 905 
 906 /*
 907  * Place client information into stable storage: 2/3.
 908  * DSS: distributed stable storage: the file may need to be written to
 909  * multiple directories.
 910  */
 911 static void
 912 rfs4_ss_clid_write(nfs4_srv_t *nsrv4, rfs4_client_t *cp, char *leaf)
 913 {
 914         rfs4_servinst_t *sip;
 915 
 916         /*
 917          * It should be sufficient to write the leaf file to (all) DSS paths
 918          * associated with just this client's instance. However, since our
 919          * per-instance client grouping is solely temporal, HA-NFSv4 RG
 920          * failover might result in us losing DSS data.
 921          *
 922          * Until the client grouping is improved, we must write the DSS data
 923          * to all instances' paths. Start at the current instance, and
 924          * walk the list backwards to the first.
 925          */
 926         mutex_enter(&nsrv4->servinst_lock);
 927         for (sip = nsrv4->nfs4_cur_servinst; sip != NULL; sip = sip->prev) {
 928                 int i, npaths = sip->dss_npaths;
 929 
 930                 /* write the leaf file to all DSS paths */
 931                 for (i = 0; i < npaths; i++) {
 932                         rfs4_dss_path_t *dss_path = sip->dss_paths[i];
 933 
 934                         /* HA-NFSv4 path might have been failed-away from us */
 935                         if (dss_path == NULL)
 936                                 continue;
 937 
 938                         rfs4_ss_clid_write_one(cp, dss_path->path, leaf);
 939                 }
 940         }
 941         mutex_exit(&nsrv4->servinst_lock);
 942 }
 943 
 944 /*
 945  * Place client information into stable storage: 3/3.
 946  * Write the stable storage data to the requested file.
 947  */
 948 static void
 949 rfs4_ss_clid_write_one(rfs4_client_t *cp, char *dss_path, char *leaf)
 950 {
 951         int ioflag;
 952         int file_vers = NFS4_SS_VERSION;
 953         size_t dirlen;
 954         struct uio uio;
 955         struct iovec iov[4];
 956         char *dir;
 957         rfs4_ss_pn_t *ss_pn;
 958         vnode_t *vp;
 959         nfs_client_id4 *cl_id4 = &(cp->rc_nfs_client);
 960 
 961         /* allow 2 extra bytes for '/' & NUL */
 962         dirlen = strlen(dss_path) + strlen(NFS4_DSS_STATE_LEAF) + 2;
 963         dir = kmem_alloc(dirlen, KM_SLEEP);
 964         (void) sprintf(dir, "%s/%s", dss_path, NFS4_DSS_STATE_LEAF);
 965 
 966         ss_pn = rfs4_ss_pnalloc(dir, leaf);
 967         /* rfs4_ss_pnalloc takes its own copy */
 968         kmem_free(dir, dirlen);
 969         if (ss_pn == NULL)
 970                 return;
 971 
 972         if (vn_open(ss_pn->pn, UIO_SYSSPACE, FCREAT|FWRITE, 0600, &vp,
 973             CRCREAT, 0)) {
 974                 rfs4_ss_pnfree(ss_pn);
 975                 return;
 976         }
 977 
 978         /*
 979          * We need to record leaf - i.e. the filename - so that we know
 980          * what to remove, in the future. However, the dir part of cp->ss_pn
 981          * should never be referenced directly, since it's potentially only
 982          * one of several paths with this leaf in it.
 983          */
 984         if (cp->rc_ss_pn != NULL) {
 985                 if (strcmp(cp->rc_ss_pn->leaf, leaf) == 0) {
 986                         /* we've already recorded *this* leaf */
 987                         rfs4_ss_pnfree(ss_pn);
 988                 } else {
 989                         /* replace with this leaf */
 990                         rfs4_ss_pnfree(cp->rc_ss_pn);
 991                         cp->rc_ss_pn = ss_pn;
 992                 }
 993         } else {
 994                 cp->rc_ss_pn = ss_pn;
 995         }
 996 
 997         /*
 998          * Build a scatter list that points to the nfs_client_id4
 999          */
1000         iov[0].iov_base = (caddr_t)&file_vers;
1001         iov[0].iov_len = sizeof (int);
1002         iov[1].iov_base = (caddr_t)&(cl_id4->verifier);
1003         iov[1].iov_len = NFS4_VERIFIER_SIZE;
1004         iov[2].iov_base = (caddr_t)&(cl_id4->id_len);
1005         iov[2].iov_len = sizeof (uint_t);
1006         iov[3].iov_base = (caddr_t)cl_id4->id_val;
1007         iov[3].iov_len = cl_id4->id_len;
1008 
1009         uio.uio_iov = iov;
1010         uio.uio_iovcnt = 4;
1011         uio.uio_loffset = 0;
1012         uio.uio_segflg = UIO_SYSSPACE;
1013         uio.uio_llimit = (rlim64_t)MAXOFFSET_T;
1014         uio.uio_resid = cl_id4->id_len + sizeof (int) +
1015             NFS4_VERIFIER_SIZE + sizeof (uint_t);
1016 
1017         ioflag = uio.uio_fmode = (FWRITE|FSYNC);
1018         uio.uio_extflg = UIO_COPY_DEFAULT;
1019 
1020         (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
1021         /* write the full client id to the file. */
1022         (void) VOP_WRITE(vp, &uio, ioflag, CRED(), NULL);
1023         VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
1024 
1025         (void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED(), NULL);
1026         VN_RELE(vp);
1027 }
1028 
1029 /*
1030  * DSS: distributed stable storage.
1031  * Unpack the list of paths passed by nfsd.
1032  * Use nvlist_alloc(9F) to manage the data.
1033  * The caller is responsible for allocating and freeing the buffer.
1034  */
1035 int
1036 rfs4_dss_setpaths(char *buf, size_t buflen)
1037 {
1038         int error;
1039 
1040         /*
1041          * If this is a "warm start", i.e. we previously had DSS paths,
1042          * preserve the old paths.
1043          */
1044         if (rfs4_dss_paths != NULL) {
1045                 /*
1046                  * Before we lose the ptr, destroy the nvlist and pathnames
1047                  * array from the warm start before this one.
1048                  */
1049                 nvlist_free(rfs4_dss_oldpaths);
1050                 rfs4_dss_oldpaths = rfs4_dss_paths;
1051         }
1052 
1053         /* unpack the buffer into a searchable nvlist */
1054         error = nvlist_unpack(buf, buflen, &rfs4_dss_paths, KM_SLEEP);
1055         if (error)
1056                 return (error);
1057 
1058         /*
1059          * Search the nvlist for the pathnames nvpair (which is the only nvpair
1060          * in the list, and record its location.
1061          */
1062         error = nvlist_lookup_string_array(rfs4_dss_paths, NFS4_DSS_NVPAIR_NAME,
1063             &rfs4_dss_newpaths, &rfs4_dss_numnewpaths);
1064         return (error);
1065 }
1066 
1067 /*
1068  * Ultimately the nfssys() call NFS4_CLR_STATE endsup here
1069  * to find and mark the client for forced expire.
1070  */
1071 static void
1072 rfs4_client_scrub(rfs4_entry_t ent, void *arg)
1073 {
1074         rfs4_client_t *cp = (rfs4_client_t *)ent;
1075         struct nfs4clrst_args *clr = arg;
1076         struct sockaddr_in6 *ent_sin6;
1077         struct in6_addr  clr_in6;
1078         struct sockaddr_in  *ent_sin;
1079         struct in_addr   clr_in;
1080 
1081         if (clr->addr_type != cp->rc_addr.ss_family) {
1082                 return;
1083         }
1084 
1085         switch (clr->addr_type) {
1086 
1087         case AF_INET6:
1088                 /* copyin the address from user space */
1089                 if (copyin(clr->ap, &clr_in6, sizeof (clr_in6))) {
1090                         break;
1091                 }
1092 
1093                 ent_sin6 = (struct sockaddr_in6 *)&cp->rc_addr;
1094 
1095                 /*
1096                  * now compare, and if equivalent mark entry
1097                  * for forced expiration
1098                  */
1099                 if (IN6_ARE_ADDR_EQUAL(&ent_sin6->sin6_addr, &clr_in6)) {
1100                         cp->rc_forced_expire = 1;
1101                 }
1102                 break;
1103 
1104         case AF_INET:
1105                 /* copyin the address from user space */
1106                 if (copyin(clr->ap, &clr_in, sizeof (clr_in))) {
1107                         break;
1108                 }
1109 
1110                 ent_sin = (struct sockaddr_in *)&cp->rc_addr;
1111 
1112                 /*
1113                  * now compare, and if equivalent mark entry
1114                  * for forced expiration
1115                  */
1116                 if (ent_sin->sin_addr.s_addr == clr_in.s_addr) {
1117                         cp->rc_forced_expire = 1;
1118                 }
1119                 break;
1120 
1121         default:
1122                 /* force this assert to fail */
1123                 ASSERT(clr->addr_type != clr->addr_type);
1124         }
1125 }
1126 
1127 /*
1128  * This is called from nfssys() in order to clear server state
1129  * for the specified client IP Address.
1130  */
1131 void
1132 rfs4_clear_client_state(struct nfs4clrst_args *clr)
1133 {
1134         nfs4_srv_t *nsrv4;
1135         nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
1136         (void) rfs4_dbe_walk(nsrv4->rfs4_client_tab, rfs4_client_scrub, clr);
1137 }
1138 
1139 /*
1140  * Used to initialize the NFSv4 server's state or database.  All of
1141  * the tables are created and timers are set.
1142  */
1143 void
1144 rfs4_state_g_init()
1145 {
1146         extern boolean_t rfs4_cpr_callb(void *, int);
1147         /*
1148          * Add a CPR callback so that we can update client
1149          * access times to extend the lease after a suspend
1150          * and resume (using the same class as rpcmod/connmgr)
1151          */
1152         cpr_id = callb_add(rfs4_cpr_callb, 0, CB_CL_CPR_RPC, "rfs4");
1153 
1154         /*
1155          * NFSv4 server state databases
1156          *
1157          * Initilized when the module is loaded and used by NFSv4 state tables.
1158          * These kmem_cache free pools are used globally, the NFSv4 state
1159          * tables which make use of these kmem_cache free pools are per zone.
1160          *
1161          * initialize the global kmem_cache free pools which will be used by
1162          * the NFSv4 state tables.
1163          */
1164         /* CSTYLED */
1165         rfs4_client_mem_cache = nfs4_init_mem_cache("Client_entry_cache", 2, sizeof (rfs4_client_t), 0);
1166         /* CSTYLED */
1167         rfs4_clntIP_mem_cache = nfs4_init_mem_cache("ClntIP_entry_cache", 1, sizeof (rfs4_clntip_t), 1);
1168         /* CSTYLED */
1169         rfs4_openown_mem_cache = nfs4_init_mem_cache("OpenOwner_entry_cache", 1, sizeof (rfs4_openowner_t), 2);
1170         /* CSTYLED */
1171         rfs4_openstID_mem_cache = nfs4_init_mem_cache("OpenStateID_entry_cache", 3, sizeof (rfs4_state_t), 3);
1172         /* CSTYLED */
1173         rfs4_lockstID_mem_cache = nfs4_init_mem_cache("LockStateID_entry_cache", 3, sizeof (rfs4_lo_state_t), 4);
1174         /* CSTYLED */
1175         rfs4_lockown_mem_cache = nfs4_init_mem_cache("Lockowner_entry_cache", 2, sizeof (rfs4_lockowner_t), 5);
1176         /* CSTYLED */
1177         rfs4_file_mem_cache = nfs4_init_mem_cache("File_entry_cache", 1, sizeof (rfs4_file_t), 6);
1178         /* CSTYLED */
1179         rfs4_delegstID_mem_cache = nfs4_init_mem_cache("DelegStateID_entry_cache", 2, sizeof (rfs4_deleg_state_t), 7);
1180 
1181         rfs4_client_clrst = rfs4_clear_client_state;
1182 }
1183 
1184 
1185 /*
1186  * Used at server shutdown to cleanup all of the NFSv4 server's structures
1187  * and other state.
1188  */
1189 void
1190 rfs4_state_g_fini()
1191 {
1192         int i;
1193         /*
1194          * Cleanup the CPR callback.
1195          */
1196         if (cpr_id)
1197                 (void) callb_delete(cpr_id);
1198 
1199         rfs4_client_clrst = NULL;
1200 
1201         /* free the NFSv4 state databases */
1202         for (i = 0; i < RFS4_DB_MEM_CACHE_NUM; i++) {
1203                 kmem_cache_destroy(rfs4_db_mem_cache_table[i].r_db_mem_cache);
1204                 rfs4_db_mem_cache_table[i].r_db_mem_cache = NULL;
1205         }
1206 
1207         rfs4_client_mem_cache = NULL;
1208         rfs4_clntIP_mem_cache = NULL;
1209         rfs4_openown_mem_cache = NULL;
1210         rfs4_openstID_mem_cache = NULL;
1211         rfs4_lockstID_mem_cache = NULL;
1212         rfs4_lockown_mem_cache = NULL;
1213         rfs4_file_mem_cache = NULL;
1214         rfs4_delegstID_mem_cache = NULL;
1215 
1216         /* DSS: distributed stable storage */
1217         nvlist_free(rfs4_dss_oldpaths);
1218         nvlist_free(rfs4_dss_paths);
1219         rfs4_dss_paths = rfs4_dss_oldpaths = NULL;
1220 }
1221 
1222 /*
1223  * Used to initialize the per zone NFSv4 server's state
1224  */
1225 void
1226 rfs4_state_zone_init(nfs4_srv_t *nsrv4)
1227 {
1228         time_t start_time;
1229         int start_grace;
1230         char *dss_path = NFS4_DSS_VAR_DIR;
1231 
1232         /* DSS: distributed stable storage: initialise served paths list */
1233         nsrv4->dss_pathlist = NULL;
1234 
1235         /*
1236          * Set the boot time.  If the server
1237          * has been restarted quickly and has had the opportunity to
1238          * service clients, then the start_time needs to be bumped
1239          * regardless.  A small window but it exists...
1240          */
1241         start_time = gethrestime_sec();
1242         if (nsrv4->rfs4_start_time < start_time)
1243                 nsrv4->rfs4_start_time = start_time;
1244         else
1245                 nsrv4->rfs4_start_time++;
1246 
1247         /*
1248          * Create the first server instance, or a new one if the server has
1249          * been restarted; see above comments on rfs4_start_time. Don't
1250          * start its grace period; that will be done later, to maximise the
1251          * clients' recovery window.
1252          */
1253         start_grace = 0;
1254         rfs4_servinst_create(nsrv4, start_grace, 1, &dss_path);
1255 
1256         /* reset the "first NFSv4 request" status */
1257         nsrv4->seen_first_compound = 0;
1258 
1259         mutex_enter(&nsrv4->state_lock);
1260 
1261         /*
1262          * If the server state database has already been initialized,
1263          * skip it
1264          */
1265         if (nsrv4->nfs4_server_state != NULL) {
1266                 mutex_exit(&nsrv4->state_lock);
1267                 return;
1268         }
1269 
1270         rw_init(&nsrv4->rfs4_findclient_lock, NULL, RW_DEFAULT, NULL);
1271 
1272         /* set the various cache timers for table creation */
1273         if (nsrv4->rfs4_client_cache_time == 0)
1274                 nsrv4->rfs4_client_cache_time = CLIENT_CACHE_TIME;
1275         if (nsrv4->rfs4_openowner_cache_time == 0)
1276                 nsrv4->rfs4_openowner_cache_time = OPENOWNER_CACHE_TIME;
1277         if (nsrv4->rfs4_state_cache_time == 0)
1278                 nsrv4->rfs4_state_cache_time = STATE_CACHE_TIME;
1279         if (nsrv4->rfs4_lo_state_cache_time == 0)
1280                 nsrv4->rfs4_lo_state_cache_time = LO_STATE_CACHE_TIME;
1281         if (nsrv4->rfs4_lockowner_cache_time == 0)
1282                 nsrv4->rfs4_lockowner_cache_time = LOCKOWNER_CACHE_TIME;
1283         if (nsrv4->rfs4_file_cache_time == 0)
1284                 nsrv4->rfs4_file_cache_time = FILE_CACHE_TIME;
1285         if (nsrv4->rfs4_deleg_state_cache_time == 0)
1286                 nsrv4->rfs4_deleg_state_cache_time = DELEG_STATE_CACHE_TIME;
1287 
1288         /* Create the overall database to hold all server state */
1289         nsrv4->nfs4_server_state = rfs4_database_create(rfs4_database_debug);
1290 
1291         /* Now create the individual tables */
1292         nsrv4->rfs4_client_cache_time *= rfs4_lease_time;
1293         nsrv4->rfs4_client_tab = rfs4_table_create(nsrv4->nfs4_server_state,
1294             "Client",
1295             nsrv4->rfs4_client_cache_time,
1296             2,
1297             rfs4_client_create,
1298             rfs4_client_destroy,
1299             rfs4_client_expiry,
1300             sizeof (rfs4_client_t),
1301             TABSIZE,
1302             MAXTABSZ/8, 100);
1303         nsrv4->rfs4_nfsclnt_idx = rfs4_index_create(nsrv4->rfs4_client_tab,
1304             "nfs_client_id4", nfsclnt_hash,
1305             nfsclnt_compare, nfsclnt_mkkey,
1306             TRUE);
1307         nsrv4->rfs4_clientid_idx = rfs4_index_create(nsrv4->rfs4_client_tab,
1308             "client_id", clientid_hash,
1309             clientid_compare, clientid_mkkey,
1310             FALSE);
1311 
1312         nsrv4->rfs4_clntip_cache_time = 86400 * 365; /* about a year */
1313         nsrv4->rfs4_clntip_tab = rfs4_table_create(nsrv4->nfs4_server_state,
1314             "ClntIP",
1315             nsrv4->rfs4_clntip_cache_time,
1316             1,
1317             rfs4_clntip_create,
1318             rfs4_clntip_destroy,
1319             rfs4_clntip_expiry,
1320             sizeof (rfs4_clntip_t),
1321             TABSIZE,
1322             MAXTABSZ, 100);
1323         nsrv4->rfs4_clntip_idx = rfs4_index_create(nsrv4->rfs4_clntip_tab,
1324             "client_ip", clntip_hash,
1325             clntip_compare, clntip_mkkey,
1326             TRUE);
1327 
1328         nsrv4->rfs4_openowner_cache_time *= rfs4_lease_time;
1329         nsrv4->rfs4_openowner_tab = rfs4_table_create(nsrv4->nfs4_server_state,
1330             "OpenOwner",
1331             nsrv4->rfs4_openowner_cache_time,
1332             1,
1333             rfs4_openowner_create,
1334             rfs4_openowner_destroy,
1335             rfs4_openowner_expiry,
1336             sizeof (rfs4_openowner_t),
1337             TABSIZE,
1338             MAXTABSZ, 100);
1339         nsrv4->rfs4_openowner_idx = rfs4_index_create(nsrv4->rfs4_openowner_tab,
1340             "open_owner4", openowner_hash,
1341             openowner_compare,
1342             openowner_mkkey, TRUE);
1343 
1344         nsrv4->rfs4_state_cache_time *= rfs4_lease_time;
1345         nsrv4->rfs4_state_tab = rfs4_table_create(nsrv4->nfs4_server_state,
1346             "OpenStateID",
1347             nsrv4->rfs4_state_cache_time,
1348             3,
1349             rfs4_state_create,
1350             rfs4_state_destroy,
1351             rfs4_state_expiry,
1352             sizeof (rfs4_state_t),
1353             TABSIZE,
1354             MAXTABSZ, 100);
1355 
1356         /* CSTYLED */
1357         nsrv4->rfs4_state_owner_file_idx = rfs4_index_create(nsrv4->rfs4_state_tab,
1358             "Openowner-File",
1359             state_owner_file_hash,
1360             state_owner_file_compare,
1361             state_owner_file_mkkey, TRUE);
1362 
1363         nsrv4->rfs4_state_idx = rfs4_index_create(nsrv4->rfs4_state_tab,
1364             "State-id", state_hash,
1365             state_compare, state_mkkey, FALSE);
1366 
1367         nsrv4->rfs4_state_file_idx = rfs4_index_create(nsrv4->rfs4_state_tab,
1368             "File", state_file_hash,
1369             state_file_compare, state_file_mkkey,
1370             FALSE);
1371 
1372         nsrv4->rfs4_lo_state_cache_time *= rfs4_lease_time;
1373         nsrv4->rfs4_lo_state_tab = rfs4_table_create(nsrv4->nfs4_server_state,
1374             "LockStateID",
1375             nsrv4->rfs4_lo_state_cache_time,
1376             2,
1377             rfs4_lo_state_create,
1378             rfs4_lo_state_destroy,
1379             rfs4_lo_state_expiry,
1380             sizeof (rfs4_lo_state_t),
1381             TABSIZE,
1382             MAXTABSZ, 100);
1383 
1384         /* CSTYLED */
1385         nsrv4->rfs4_lo_state_owner_idx = rfs4_index_create(nsrv4->rfs4_lo_state_tab,
1386             "lockownerxstate",
1387             lo_state_lo_hash,
1388             lo_state_lo_compare,
1389             lo_state_lo_mkkey, TRUE);
1390 
1391         nsrv4->rfs4_lo_state_idx = rfs4_index_create(nsrv4->rfs4_lo_state_tab,
1392             "State-id",
1393             lo_state_hash, lo_state_compare,
1394             lo_state_mkkey, FALSE);
1395 
1396         nsrv4->rfs4_lockowner_cache_time *= rfs4_lease_time;
1397 
1398         nsrv4->rfs4_lockowner_tab = rfs4_table_create(nsrv4->nfs4_server_state,
1399             "Lockowner",
1400             nsrv4->rfs4_lockowner_cache_time,
1401             2,
1402             rfs4_lockowner_create,
1403             rfs4_lockowner_destroy,
1404             rfs4_lockowner_expiry,
1405             sizeof (rfs4_lockowner_t),
1406             TABSIZE,
1407             MAXTABSZ, 100);
1408 
1409         nsrv4->rfs4_lockowner_idx = rfs4_index_create(nsrv4->rfs4_lockowner_tab,
1410             "lock_owner4", lockowner_hash,
1411             lockowner_compare,
1412             lockowner_mkkey, TRUE);
1413 
1414         /* CSTYLED */
1415         nsrv4->rfs4_lockowner_pid_idx = rfs4_index_create(nsrv4->rfs4_lockowner_tab,
1416             "pid", pid_hash,
1417             pid_compare, pid_mkkey,
1418             FALSE);
1419 
1420         nsrv4->rfs4_file_cache_time *= rfs4_lease_time;
1421         nsrv4->rfs4_file_tab = rfs4_table_create(nsrv4->nfs4_server_state,
1422             "File",
1423             nsrv4->rfs4_file_cache_time,
1424             1,
1425             rfs4_file_create,
1426             rfs4_file_destroy,
1427             NULL,
1428             sizeof (rfs4_file_t),
1429             TABSIZE,
1430             MAXTABSZ, -1);
1431 
1432         nsrv4->rfs4_file_idx = rfs4_index_create(nsrv4->rfs4_file_tab,
1433             "Filehandle", file_hash,
1434             file_compare, file_mkkey, TRUE);
1435 
1436         nsrv4->rfs4_deleg_state_cache_time *= rfs4_lease_time;
1437         /* CSTYLED */
1438         nsrv4->rfs4_deleg_state_tab = rfs4_table_create(nsrv4->nfs4_server_state,
1439             "DelegStateID",
1440             nsrv4->rfs4_deleg_state_cache_time,
1441             2,
1442             rfs4_deleg_state_create,
1443             rfs4_deleg_state_destroy,
1444             rfs4_deleg_state_expiry,
1445             sizeof (rfs4_deleg_state_t),
1446             TABSIZE,
1447             MAXTABSZ, 100);
1448         nsrv4->rfs4_deleg_idx = rfs4_index_create(nsrv4->rfs4_deleg_state_tab,
1449             "DelegByFileClient",
1450             deleg_hash,
1451             deleg_compare,
1452             deleg_mkkey, TRUE);
1453 
1454         /* CSTYLED */
1455         nsrv4->rfs4_deleg_state_idx = rfs4_index_create(nsrv4->rfs4_deleg_state_tab,
1456             "DelegState",
1457             deleg_state_hash,
1458             deleg_state_compare,
1459             deleg_state_mkkey, FALSE);
1460 
1461         mutex_exit(&nsrv4->state_lock);
1462 
1463         /*
1464          * Init the stable storage.
1465          */
1466         rfs4_ss_init(nsrv4);
1467 }
1468 
1469 /*
1470  * Used at server shutdown to cleanup all of NFSv4 server's zone structures
1471  * and state.
1472  */
1473 void
1474 rfs4_state_zone_fini()
1475 {
1476         rfs4_database_t *dbp;
1477         nfs4_srv_t *nsrv4;
1478         nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
1479 
1480         rfs4_set_deleg_policy(nsrv4, SRV_NEVER_DELEGATE);
1481 
1482         mutex_enter(&nsrv4->state_lock);
1483 
1484         if (nsrv4->nfs4_server_state == NULL) {
1485                 mutex_exit(&nsrv4->state_lock);
1486                 return;
1487         }
1488 
1489         /* destroy server instances and current instance ptr */
1490         rfs4_servinst_destroy_all(nsrv4);
1491 
1492         /* reset the "first NFSv4 request" status */
1493         nsrv4->seen_first_compound = 0;
1494 
1495         dbp = nsrv4->nfs4_server_state;
1496         nsrv4->nfs4_server_state = NULL;
1497 
1498         rw_destroy(&nsrv4->rfs4_findclient_lock);
1499 
1500         /* First stop all of the reaper threads in the database */
1501         rfs4_database_shutdown(dbp);
1502         /*
1503          * XXX workaround
1504          * Skip destrying the state database yet just in case there
1505          * are unfinished operations depending on it.
1506          */
1507         /* Now destroy/release the database tables */
1508         /* rfs4_database_destroy(dbp); */
1509 
1510         /* Reset the cache timers for next time */
1511         nsrv4->rfs4_client_cache_time = 0;
1512         nsrv4->rfs4_openowner_cache_time = 0;
1513         nsrv4->rfs4_state_cache_time = 0;
1514         nsrv4->rfs4_lo_state_cache_time = 0;
1515         nsrv4->rfs4_lockowner_cache_time = 0;
1516         nsrv4->rfs4_file_cache_time = 0;
1517         nsrv4->rfs4_deleg_state_cache_time = 0;
1518 
1519         mutex_exit(&nsrv4->state_lock);
1520 
1521         /* clean up any dangling stable storage structures */
1522         rfs4_ss_fini(nsrv4);
1523 }
1524 
/*
 * Overlay giving access to the two 32-bit halves of a clientid4.
 * rfs4_client_create() fills impl_id.start_time from the zone's
 * rfs4_start_time and impl_id.c_id from rfs4_dbe_getid();
 * clientid_hash() hashes on impl_id.c_id alone.
 */
typedef union {
        struct {
                uint32_t start_time;
                uint32_t c_id;
        } impl_id;
        clientid4 id4;
} cid;
1532 
/*
 * Prototypes for file-local clientid/stateid helpers. embed_nodeid() is
 * called by rfs4_client_create() when booted as a cluster node.
 */
static int foreign_stateid(stateid_t *id);
static int foreign_clientid(cid *cidp);
static void embed_nodeid(cid *cidp);
1536 
/*
 * Overlay for the SETCLIENTID_CONFIRM verifier. rfs4_client_create()
 * seeds cv_impl.c_id with the client's c_id and cv_impl.gen_num with 0.
 */
typedef union {
        struct {
                uint32_t c_id;
                uint32_t gen_num;
        } cv_impl;
        verifier4       confirm_verf;
} scid_confirm_verf;
1544 
1545 static uint32_t
1546 clientid_hash(void *key)
1547 {
1548         cid *idp = key;
1549 
1550         return (idp->impl_id.c_id);
1551 }
1552 
1553 static bool_t
1554 clientid_compare(rfs4_entry_t entry, void *key)
1555 {
1556         rfs4_client_t *cp = (rfs4_client_t *)entry;
1557         clientid4 *idp = key;
1558 
1559         return (*idp == cp->rc_clientid);
1560 }
1561 
1562 static void *
1563 clientid_mkkey(rfs4_entry_t entry)
1564 {
1565         rfs4_client_t *cp = (rfs4_client_t *)entry;
1566 
1567         return (&cp->rc_clientid);
1568 }
1569 
1570 static uint32_t
1571 nfsclnt_hash(void *key)
1572 {
1573         nfs_client_id4 *client = key;
1574         int i;
1575         uint32_t hash = 0;
1576 
1577         for (i = 0; i < client->id_len; i++) {
1578                 hash <<= 1;
1579                 hash += (uint_t)client->id_val[i];
1580         }
1581         return (hash);
1582 }
1583 
1584 
1585 static bool_t
1586 nfsclnt_compare(rfs4_entry_t entry, void *key)
1587 {
1588         rfs4_client_t *cp = (rfs4_client_t *)entry;
1589         nfs_client_id4 *nfs_client = key;
1590 
1591         if (cp->rc_nfs_client.id_len != nfs_client->id_len)
1592                 return (FALSE);
1593 
1594         return (bcmp(cp->rc_nfs_client.id_val, nfs_client->id_val,
1595             nfs_client->id_len) == 0);
1596 }
1597 
1598 static void *
1599 nfsclnt_mkkey(rfs4_entry_t entry)
1600 {
1601         rfs4_client_t *cp = (rfs4_client_t *)entry;
1602 
1603         return (&cp->rc_nfs_client);
1604 }
1605 
1606 static bool_t
1607 rfs4_client_expiry(rfs4_entry_t u_entry)
1608 {
1609         rfs4_client_t *cp = (rfs4_client_t *)u_entry;
1610         bool_t cp_expired;
1611 
1612         if (rfs4_dbe_is_invalid(cp->rc_dbe)) {
1613                 cp->rc_ss_remove = 1;
1614                 return (TRUE);
1615         }
1616         /*
1617          * If the sysadmin has used clear_locks for this
1618          * entry then forced_expire will be set and we
1619          * want this entry to be reaped. Or the entry
1620          * has exceeded its lease period.
1621          */
1622         cp_expired = (cp->rc_forced_expire ||
1623             (gethrestime_sec() - cp->rc_last_access
1624             > rfs4_lease_time));
1625 
1626         if (!cp->rc_ss_remove && cp_expired)
1627                 cp->rc_ss_remove = 1;
1628         return (cp_expired);
1629 }
1630 
1631 /*
1632  * Remove the leaf file from all distributed stable storage paths.
1633  */
1634 static void
1635 rfs4_dss_remove_cpleaf(rfs4_client_t *cp)
1636 {
1637         nfs4_srv_t *nsrv4;
1638         rfs4_servinst_t *sip;
1639         char *leaf = cp->rc_ss_pn->leaf;
1640 
1641         /*
1642          * since the state files are written to all DSS
1643          * paths we must remove this leaf file instance
1644          * from all server instances.
1645          */
1646 
1647         nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
1648         mutex_enter(&nsrv4->servinst_lock);
1649         for (sip = nsrv4->nfs4_cur_servinst; sip != NULL; sip = sip->prev) {
1650                 /* remove the leaf file associated with this server instance */
1651                 rfs4_dss_remove_leaf(sip, NFS4_DSS_STATE_LEAF, leaf);
1652         }
1653         mutex_exit(&nsrv4->servinst_lock);
1654 }
1655 
1656 static void
1657 rfs4_dss_remove_leaf(rfs4_servinst_t *sip, char *dir_leaf, char *leaf)
1658 {
1659         int i, npaths = sip->dss_npaths;
1660 
1661         for (i = 0; i < npaths; i++) {
1662                 rfs4_dss_path_t *dss_path = sip->dss_paths[i];
1663                 char *path, *dir;
1664                 size_t pathlen;
1665 
1666                 /* the HA-NFSv4 path might have been failed-over away from us */
1667                 if (dss_path == NULL)
1668                         continue;
1669 
1670                 dir = dss_path->path;
1671 
1672                 /* allow 3 extra bytes for two '/' & a NUL */
1673                 pathlen = strlen(dir) + strlen(dir_leaf) + strlen(leaf) + 3;
1674                 path = kmem_alloc(pathlen, KM_SLEEP);
1675                 (void) sprintf(path, "%s/%s/%s", dir, dir_leaf, leaf);
1676 
1677                 (void) vn_remove(path, UIO_SYSSPACE, RMFILE);
1678 
1679                 kmem_free(path, pathlen);
1680         }
1681 }
1682 
1683 static void
1684 rfs4_client_destroy(rfs4_entry_t u_entry)
1685 {
1686         rfs4_client_t *cp = (rfs4_client_t *)u_entry;
1687 
1688         mutex_destroy(cp->rc_cbinfo.cb_lock);
1689         cv_destroy(cp->rc_cbinfo.cb_cv);
1690         cv_destroy(cp->rc_cbinfo.cb_cv_nullcaller);
1691         list_destroy(&cp->rc_openownerlist);
1692 
1693         /* free callback info */
1694         rfs4_cbinfo_free(&cp->rc_cbinfo);
1695 
1696         if (cp->rc_cp_confirmed)
1697                 rfs4_client_rele(cp->rc_cp_confirmed);
1698 
1699         if (cp->rc_ss_pn) {
1700                 /* check if the stable storage files need to be removed */
1701                 if (cp->rc_ss_remove)
1702                         rfs4_dss_remove_cpleaf(cp);
1703                 rfs4_ss_pnfree(cp->rc_ss_pn);
1704         }
1705 
1706         /* Free the client supplied client id */
1707         kmem_free(cp->rc_nfs_client.id_val, cp->rc_nfs_client.id_len);
1708 
1709         if (cp->rc_sysidt != LM_NOSYSID)
1710                 lm_free_sysidt(cp->rc_sysidt);
1711 }
1712 
1713 static bool_t
1714 rfs4_client_create(rfs4_entry_t u_entry, void *arg)
1715 {
1716         rfs4_client_t *cp = (rfs4_client_t *)u_entry;
1717         nfs_client_id4 *client = (nfs_client_id4 *)arg;
1718         struct sockaddr *ca;
1719         cid *cidp;
1720         scid_confirm_verf *scvp;
1721         nfs4_srv_t *nsrv4;
1722 
1723         nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
1724 
1725         /* Get a clientid to give to the client */
1726         cidp = (cid *)&cp->rc_clientid;
1727         cidp->impl_id.start_time = nsrv4->rfs4_start_time;
1728         cidp->impl_id.c_id = (uint32_t)rfs4_dbe_getid(cp->rc_dbe);
1729 
1730         /* If we are booted as a cluster node, embed our nodeid */
1731         if (cluster_bootflags & CLUSTER_BOOTED)
1732                 embed_nodeid(cidp);
1733 
1734         /* Allocate and copy client's client id value */
1735         cp->rc_nfs_client.id_val = kmem_alloc(client->id_len, KM_SLEEP);
1736         cp->rc_nfs_client.id_len = client->id_len;
1737         bcopy(client->id_val, cp->rc_nfs_client.id_val, client->id_len);
1738         cp->rc_nfs_client.verifier = client->verifier;
1739 
1740         /* Copy client's IP address */
1741         ca = client->cl_addr;
1742         if (ca->sa_family == AF_INET)
1743                 bcopy(ca, &cp->rc_addr, sizeof (struct sockaddr_in));
1744         else if (ca->sa_family == AF_INET6)
1745                 bcopy(ca, &cp->rc_addr, sizeof (struct sockaddr_in6));
1746         cp->rc_nfs_client.cl_addr = (struct sockaddr *)&cp->rc_addr;
1747 
1748         /* Init the value for the SETCLIENTID_CONFIRM verifier */
1749         scvp = (scid_confirm_verf *)&cp->rc_confirm_verf;
1750         scvp->cv_impl.c_id = cidp->impl_id.c_id;
1751         scvp->cv_impl.gen_num = 0;
1752 
1753         /* An F_UNLKSYS has been done for this client */
1754         cp->rc_unlksys_completed = FALSE;
1755 
1756         /* We need the client to ack us */
1757         cp->rc_need_confirm = TRUE;
1758         cp->rc_cp_confirmed = NULL;
1759 
1760         /* TRUE all the time until the callback path actually fails */
1761         cp->rc_cbinfo.cb_notified_of_cb_path_down = TRUE;
1762 
1763         /* Initialize the access time to now */
1764         cp->rc_last_access = gethrestime_sec();
1765 
1766         cp->rc_cr_set = NULL;
1767 
1768         cp->rc_sysidt = LM_NOSYSID;
1769 
1770         list_create(&cp->rc_openownerlist, sizeof (rfs4_openowner_t),
1771             offsetof(rfs4_openowner_t, ro_node));
1772 
1773         /* set up the callback control structure */
1774         cp->rc_cbinfo.cb_state = CB_UNINIT;
1775         mutex_init(cp->rc_cbinfo.cb_lock, NULL, MUTEX_DEFAULT, NULL);
1776         cv_init(cp->rc_cbinfo.cb_cv, NULL, CV_DEFAULT, NULL);
1777         cv_init(cp->rc_cbinfo.cb_cv_nullcaller, NULL, CV_DEFAULT, NULL);
1778 
1779         /*
1780          * Associate the client_t with the current server instance.
1781          * The hold is solely to satisfy the calling requirement of
1782          * rfs4_servinst_assign(). In this case it's not strictly necessary.
1783          */
1784         rfs4_dbe_hold(cp->rc_dbe);
1785         rfs4_servinst_assign(nsrv4, cp, nsrv4->nfs4_cur_servinst);
1786         rfs4_dbe_rele(cp->rc_dbe);
1787 
1788         return (TRUE);
1789 }
1790 
1791 /*
1792  * Caller wants to generate/update the setclientid_confirm verifier
1793  * associated with a client.  This is done during the SETCLIENTID
1794  * processing.
1795  */
1796 void
1797 rfs4_client_scv_next(rfs4_client_t *cp)
1798 {
1799         scid_confirm_verf *scvp;
1800 
1801         /* Init the value for the SETCLIENTID_CONFIRM verifier */
1802         scvp = (scid_confirm_verf *)&cp->rc_confirm_verf;
1803         scvp->cv_impl.gen_num++;
1804 }
1805 
1806 void
1807 rfs4_client_rele(rfs4_client_t *cp)
1808 {
1809         rfs4_dbe_rele(cp->rc_dbe);
1810 }
1811 
1812 rfs4_client_t *
1813 rfs4_findclient(nfs_client_id4 *client, bool_t *create, rfs4_client_t *oldcp)
1814 {
1815         rfs4_client_t *cp;
1816         nfs4_srv_t *nsrv4;
1817         nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
1818 
1819 
1820         if (oldcp) {
1821                 rw_enter(&nsrv4->rfs4_findclient_lock, RW_WRITER);
1822                 rfs4_dbe_hide(oldcp->rc_dbe);
1823         } else {
1824                 rw_enter(&nsrv4->rfs4_findclient_lock, RW_READER);
1825         }
1826 
1827         cp = (rfs4_client_t *)rfs4_dbsearch(nsrv4->rfs4_nfsclnt_idx, client,
1828             create, (void *)client, RFS4_DBS_VALID);
1829 
1830         if (oldcp)
1831                 rfs4_dbe_unhide(oldcp->rc_dbe);
1832 
1833         rw_exit(&nsrv4->rfs4_findclient_lock);
1834 
1835         return (cp);
1836 }
1837 
1838 rfs4_client_t *
1839 rfs4_findclient_by_id(clientid4 clientid, bool_t find_unconfirmed)
1840 {
1841         rfs4_client_t *cp;
1842         bool_t create = FALSE;
1843         cid *cidp = (cid *)&clientid;
1844         nfs4_srv_t *nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
1845 
1846         /* If we're a cluster and the nodeid isn't right, short-circuit */
1847         if (cluster_bootflags & CLUSTER_BOOTED && foreign_clientid(cidp))
1848                 return (NULL);
1849 
1850         rw_enter(&nsrv4->rfs4_findclient_lock, RW_READER);
1851 
1852         cp = (rfs4_client_t *)rfs4_dbsearch(nsrv4->rfs4_clientid_idx, &clientid,
1853             &create, NULL, RFS4_DBS_VALID);
1854 
1855         rw_exit(&nsrv4->rfs4_findclient_lock);
1856 
1857         if (cp && cp->rc_need_confirm && find_unconfirmed == FALSE) {
1858                 rfs4_client_rele(cp);
1859                 return (NULL);
1860         } else {
1861                 return (cp);
1862         }
1863 }
1864 
1865 static uint32_t
1866 clntip_hash(void *key)
1867 {
1868         struct sockaddr *addr = key;
1869         int i, len = 0;
1870         uint32_t hash = 0;
1871         char *ptr;
1872 
1873         if (addr->sa_family == AF_INET) {
1874                 struct sockaddr_in *a = (struct sockaddr_in *)addr;
1875                 len = sizeof (struct in_addr);
1876                 ptr = (char *)&a->sin_addr;
1877         } else if (addr->sa_family == AF_INET6) {
1878                 struct sockaddr_in6 *a = (struct sockaddr_in6 *)addr;
1879                 len = sizeof (struct in6_addr);
1880                 ptr = (char *)&a->sin6_addr;
1881         } else
1882                 return (0);
1883 
1884         for (i = 0; i < len; i++) {
1885                 hash <<= 1;
1886                 hash += (uint_t)ptr[i];
1887         }
1888         return (hash);
1889 }
1890 
1891 static bool_t
1892 clntip_compare(rfs4_entry_t entry, void *key)
1893 {
1894         rfs4_clntip_t *cp = (rfs4_clntip_t *)entry;
1895         struct sockaddr *addr = key;
1896         int len = 0;
1897         char *p1, *p2;
1898 
1899         if (addr->sa_family == AF_INET) {
1900                 struct sockaddr_in *a1 = (struct sockaddr_in *)&cp->ri_addr;
1901                 struct sockaddr_in *a2 = (struct sockaddr_in *)addr;
1902                 len = sizeof (struct in_addr);
1903                 p1 = (char *)&a1->sin_addr;
1904                 p2 = (char *)&a2->sin_addr;
1905         } else if (addr->sa_family == AF_INET6) {
1906                 struct sockaddr_in6 *a1 = (struct sockaddr_in6 *)&cp->ri_addr;
1907                 struct sockaddr_in6 *a2 = (struct sockaddr_in6 *)addr;
1908                 len = sizeof (struct in6_addr);
1909                 p1 = (char *)&a1->sin6_addr;
1910                 p2 = (char *)&a2->sin6_addr;
1911         } else
1912                 return (0);
1913 
1914         return (bcmp(p1, p2, len) == 0);
1915 }
1916 
1917 static void *
1918 clntip_mkkey(rfs4_entry_t entry)
1919 {
1920         rfs4_clntip_t *cp = (rfs4_clntip_t *)entry;
1921 
1922         return (&cp->ri_addr);
1923 }
1924 
1925 static bool_t
1926 rfs4_clntip_expiry(rfs4_entry_t u_entry)
1927 {
1928         rfs4_clntip_t *cp = (rfs4_clntip_t *)u_entry;
1929 
1930         if (rfs4_dbe_is_invalid(cp->ri_dbe))
1931                 return (TRUE);
1932         return (FALSE);
1933 }
1934 
1935 /* ARGSUSED */
1936 static void
1937 rfs4_clntip_destroy(rfs4_entry_t u_entry)
1938 {
1939 }
1940 
1941 static bool_t
1942 rfs4_clntip_create(rfs4_entry_t u_entry, void *arg)
1943 {
1944         rfs4_clntip_t *cp = (rfs4_clntip_t *)u_entry;
1945         struct sockaddr *ca = (struct sockaddr *)arg;
1946 
1947         /* Copy client's IP address */
1948         if (ca->sa_family == AF_INET)
1949                 bcopy(ca, &cp->ri_addr, sizeof (struct sockaddr_in));
1950         else if (ca->sa_family == AF_INET6)
1951                 bcopy(ca, &cp->ri_addr, sizeof (struct sockaddr_in6));
1952         else
1953                 return (FALSE);
1954         cp->ri_no_referrals = 1;
1955 
1956         return (TRUE);
1957 }
1958 
1959 rfs4_clntip_t *
1960 rfs4_find_clntip(struct sockaddr *addr, bool_t *create)
1961 {
1962         rfs4_clntip_t *cp;
1963         nfs4_srv_t *nsrv4;
1964 
1965         nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
1966 
1967         rw_enter(&nsrv4->rfs4_findclient_lock, RW_READER);
1968 
1969         cp = (rfs4_clntip_t *)rfs4_dbsearch(nsrv4->rfs4_clntip_idx, addr,
1970             create, addr, RFS4_DBS_VALID);
1971 
1972         rw_exit(&nsrv4->rfs4_findclient_lock);
1973 
1974         return (cp);
1975 }
1976 
1977 void
1978 rfs4_invalidate_clntip(struct sockaddr *addr)
1979 {
1980         rfs4_clntip_t *cp;
1981         bool_t create = FALSE;
1982         nfs4_srv_t *nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
1983 
1984         rw_enter(&nsrv4->rfs4_findclient_lock, RW_READER);
1985 
1986         cp = (rfs4_clntip_t *)rfs4_dbsearch(nsrv4->rfs4_clntip_idx, addr,
1987             &create, NULL, RFS4_DBS_VALID);
1988         if (cp == NULL) {
1989                 rw_exit(&nsrv4->rfs4_findclient_lock);
1990                 return;
1991         }
1992         rfs4_dbe_invalidate(cp->ri_dbe);
1993         rfs4_dbe_rele(cp->ri_dbe);
1994 
1995         rw_exit(&nsrv4->rfs4_findclient_lock);
1996 }
1997 
1998 bool_t
1999 rfs4_lease_expired(rfs4_client_t *cp)
2000 {
2001         bool_t rc;
2002 
2003         rfs4_dbe_lock(cp->rc_dbe);
2004 
2005         /*
2006          * If the admin has executed clear_locks for this
2007          * client id, force expire will be set, so no need
2008          * to calculate anything because it's "outa here".
2009          */
2010         if (cp->rc_forced_expire) {
2011                 rc = TRUE;
2012         } else {
2013                 rc = (gethrestime_sec() - cp->rc_last_access > rfs4_lease_time);
2014         }
2015 
2016         /*
2017          * If the lease has expired we will also want
2018          * to remove any stable storage state data. So
2019          * mark the client id accordingly.
2020          */
2021         if (!cp->rc_ss_remove)
2022                 cp->rc_ss_remove = (rc == TRUE);
2023 
2024         rfs4_dbe_unlock(cp->rc_dbe);
2025 
2026         return (rc);
2027 }
2028 
2029 void
2030 rfs4_update_lease(rfs4_client_t *cp)
2031 {
2032         rfs4_dbe_lock(cp->rc_dbe);
2033         if (!cp->rc_forced_expire)
2034                 cp->rc_last_access = gethrestime_sec();
2035         rfs4_dbe_unlock(cp->rc_dbe);
2036 }
2037 
2038 
2039 static bool_t
2040 EQOPENOWNER(open_owner4 *a, open_owner4 *b)
2041 {
2042         bool_t rc;
2043 
2044         if (a->clientid != b->clientid)
2045                 return (FALSE);
2046 
2047         if (a->owner_len != b->owner_len)
2048                 return (FALSE);
2049 
2050         rc = (bcmp(a->owner_val, b->owner_val, a->owner_len) == 0);
2051 
2052         return (rc);
2053 }
2054 
2055 static uint_t
2056 openowner_hash(void *key)
2057 {
2058         int i;
2059         open_owner4 *openowner = key;
2060         uint_t hash = 0;
2061 
2062         for (i = 0; i < openowner->owner_len; i++) {
2063                 hash <<= 4;
2064                 hash += (uint_t)openowner->owner_val[i];
2065         }
2066         hash += (uint_t)openowner->clientid;
2067         hash |= (openowner->clientid >> 32);
2068 
2069         return (hash);
2070 }
2071 
2072 static bool_t
2073 openowner_compare(rfs4_entry_t u_entry, void *key)
2074 {
2075         rfs4_openowner_t *oo = (rfs4_openowner_t *)u_entry;
2076         open_owner4 *arg = key;
2077 
2078         return (EQOPENOWNER(&oo->ro_owner, arg));
2079 }
2080 
2081 void *
2082 openowner_mkkey(rfs4_entry_t u_entry)
2083 {
2084         rfs4_openowner_t *oo = (rfs4_openowner_t *)u_entry;
2085 
2086         return (&oo->ro_owner);
2087 }
2088 
2089 /* ARGSUSED */
2090 static bool_t
2091 rfs4_openowner_expiry(rfs4_entry_t u_entry)
2092 {
2093         /* openstateid held us and did all needed delay */
2094         return (TRUE);
2095 }
2096 
2097 static void
2098 rfs4_openowner_destroy(rfs4_entry_t u_entry)
2099 {
2100         rfs4_openowner_t *oo = (rfs4_openowner_t *)u_entry;
2101 
2102         /* Remove open owner from client's lists of open owners */
2103         rfs4_dbe_lock(oo->ro_client->rc_dbe);
2104         list_remove(&oo->ro_client->rc_openownerlist, oo);
2105         rfs4_dbe_unlock(oo->ro_client->rc_dbe);
2106 
2107         /* One less reference to the client */
2108         rfs4_client_rele(oo->ro_client);
2109         oo->ro_client = NULL;
2110 
2111         /* Free the last reply for this lock owner */
2112         rfs4_free_reply(&oo->ro_reply);
2113 
2114         if (oo->ro_reply_fh.nfs_fh4_val) {
2115                 kmem_free(oo->ro_reply_fh.nfs_fh4_val,
2116                     oo->ro_reply_fh.nfs_fh4_len);
2117                 oo->ro_reply_fh.nfs_fh4_val = NULL;
2118                 oo->ro_reply_fh.nfs_fh4_len = 0;
2119         }
2120 
2121         rfs4_sw_destroy(&oo->ro_sw);
2122         list_destroy(&oo->ro_statelist);
2123 
2124         /* Free the lock owner id */
2125         kmem_free(oo->ro_owner.owner_val, oo->ro_owner.owner_len);
2126 }
2127 
2128 void
2129 rfs4_openowner_rele(rfs4_openowner_t *oo)
2130 {
2131         rfs4_dbe_rele(oo->ro_dbe);
2132 }
2133 
2134 static bool_t
2135 rfs4_openowner_create(rfs4_entry_t u_entry, void *arg)
2136 {
2137         rfs4_openowner_t *oo = (rfs4_openowner_t *)u_entry;
2138         rfs4_openowner_t *argp = (rfs4_openowner_t *)arg;
2139         open_owner4 *openowner = &argp->ro_owner;
2140         seqid4 seqid = argp->ro_open_seqid;
2141         rfs4_client_t *cp;
2142         bool_t create = FALSE;
2143         nfs4_srv_t *nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
2144 
2145         rw_enter(&nsrv4->rfs4_findclient_lock, RW_READER);
2146 
2147         cp = (rfs4_client_t *)rfs4_dbsearch(nsrv4->rfs4_clientid_idx,
2148             &openowner->clientid,
2149             &create, NULL, RFS4_DBS_VALID);
2150 
2151         rw_exit(&nsrv4->rfs4_findclient_lock);
2152 
2153         if (cp == NULL)
2154                 return (FALSE);
2155 
2156         oo->ro_reply_fh.nfs_fh4_len = 0;
2157         oo->ro_reply_fh.nfs_fh4_val = NULL;
2158 
2159         oo->ro_owner.clientid = openowner->clientid;
2160         oo->ro_owner.owner_val =
2161             kmem_alloc(openowner->owner_len, KM_SLEEP);
2162 
2163         bcopy(openowner->owner_val,
2164             oo->ro_owner.owner_val, openowner->owner_len);
2165 
2166         oo->ro_owner.owner_len = openowner->owner_len;
2167 
2168         oo->ro_need_confirm = TRUE;
2169 
2170         rfs4_sw_init(&oo->ro_sw);
2171 
2172         oo->ro_open_seqid = seqid;
2173         bzero(&oo->ro_reply, sizeof (nfs_resop4));
2174         oo->ro_client = cp;
2175         oo->ro_cr_set = NULL;
2176 
2177         list_create(&oo->ro_statelist, sizeof (rfs4_state_t),
2178             offsetof(rfs4_state_t, rs_node));
2179 
2180         /* Insert openowner into client's open owner list */
2181         rfs4_dbe_lock(cp->rc_dbe);
2182         list_insert_tail(&cp->rc_openownerlist, oo);
2183         rfs4_dbe_unlock(cp->rc_dbe);
2184 
2185         return (TRUE);
2186 }
2187 
2188 rfs4_openowner_t *
2189 rfs4_findopenowner(open_owner4 *openowner, bool_t *create, seqid4 seqid)
2190 {
2191         rfs4_openowner_t *oo;
2192         rfs4_openowner_t arg;
2193         nfs4_srv_t *nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
2194 
2195         arg.ro_owner = *openowner;
2196         arg.ro_open_seqid = seqid;
2197         /* CSTYLED */
2198         oo = (rfs4_openowner_t *)rfs4_dbsearch(nsrv4->rfs4_openowner_idx, openowner,
2199             create, &arg, RFS4_DBS_VALID);
2200 
2201         return (oo);
2202 }
2203 
2204 void
2205 rfs4_update_open_sequence(rfs4_openowner_t *oo)
2206 {
2207 
2208         rfs4_dbe_lock(oo->ro_dbe);
2209 
2210         oo->ro_open_seqid++;
2211 
2212         rfs4_dbe_unlock(oo->ro_dbe);
2213 }
2214 
2215 void
2216 rfs4_update_open_resp(rfs4_openowner_t *oo, nfs_resop4 *resp, nfs_fh4 *fh)
2217 {
2218 
2219         rfs4_dbe_lock(oo->ro_dbe);
2220 
2221         rfs4_free_reply(&oo->ro_reply);
2222 
2223         rfs4_copy_reply(&oo->ro_reply, resp);
2224 
2225         /* Save the filehandle if provided and free if not used */
2226         if (resp->nfs_resop4_u.opopen.status == NFS4_OK &&
2227             fh && fh->nfs_fh4_len) {
2228                 if (oo->ro_reply_fh.nfs_fh4_val == NULL)
2229                         oo->ro_reply_fh.nfs_fh4_val =
2230                             kmem_alloc(fh->nfs_fh4_len, KM_SLEEP);
2231                 nfs_fh4_copy(fh, &oo->ro_reply_fh);
2232         } else {
2233                 if (oo->ro_reply_fh.nfs_fh4_val) {
2234                         kmem_free(oo->ro_reply_fh.nfs_fh4_val,
2235                             oo->ro_reply_fh.nfs_fh4_len);
2236                         oo->ro_reply_fh.nfs_fh4_val = NULL;
2237                         oo->ro_reply_fh.nfs_fh4_len = 0;
2238                 }
2239         }
2240 
2241         rfs4_dbe_unlock(oo->ro_dbe);
2242 }
2243 
2244 static bool_t
2245 lockowner_compare(rfs4_entry_t u_entry, void *key)
2246 {
2247         rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2248         lock_owner4 *b = (lock_owner4 *)key;
2249 
2250         if (lo->rl_owner.clientid != b->clientid)
2251                 return (FALSE);
2252 
2253         if (lo->rl_owner.owner_len != b->owner_len)
2254                 return (FALSE);
2255 
2256         return (bcmp(lo->rl_owner.owner_val, b->owner_val,
2257             lo->rl_owner.owner_len) == 0);
2258 }
2259 
2260 void *
2261 lockowner_mkkey(rfs4_entry_t u_entry)
2262 {
2263         rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2264 
2265         return (&lo->rl_owner);
2266 }
2267 
2268 static uint32_t
2269 lockowner_hash(void *key)
2270 {
2271         int i;
2272         lock_owner4 *lockowner = key;
2273         uint_t hash = 0;
2274 
2275         for (i = 0; i < lockowner->owner_len; i++) {
2276                 hash <<= 4;
2277                 hash += (uint_t)lockowner->owner_val[i];
2278         }
2279         hash += (uint_t)lockowner->clientid;
2280         hash |= (lockowner->clientid >> 32);
2281 
2282         return (hash);
2283 }
2284 
/*
 * Hash for the pid index.  The key is not a pointer to anything: it is
 * the pid value itself carried in a void *, so the hash is simply that
 * value truncated to 32 bits.
 */
static uint32_t
pid_hash(void *key)
{
        uintptr_t raw = (uintptr_t)key;

        return ((uint32_t)raw);
}
2290 
2291 static void *
2292 pid_mkkey(rfs4_entry_t u_entry)
2293 {
2294         rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2295 
2296         return ((void *)(uintptr_t)lo->rl_pid);
2297 }
2298 
2299 static bool_t
2300 pid_compare(rfs4_entry_t u_entry, void *key)
2301 {
2302         rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2303 
2304         return (lo->rl_pid == (pid_t)(uintptr_t)key);
2305 }
2306 
2307 static void
2308 rfs4_lockowner_destroy(rfs4_entry_t u_entry)
2309 {
2310         rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2311 
2312         /* Free the lock owner id */
2313         kmem_free(lo->rl_owner.owner_val, lo->rl_owner.owner_len);
2314         rfs4_client_rele(lo->rl_client);
2315 }
2316 
2317 void
2318 rfs4_lockowner_rele(rfs4_lockowner_t *lo)
2319 {
2320         rfs4_dbe_rele(lo->rl_dbe);
2321 }
2322 
2323 /* ARGSUSED */
2324 static bool_t
2325 rfs4_lockowner_expiry(rfs4_entry_t u_entry)
2326 {
2327         /*
2328          * Since expiry is called with no other references on
2329          * this struct, go ahead and have it removed.
2330          */
2331         return (TRUE);
2332 }
2333 
2334 static bool_t
2335 rfs4_lockowner_create(rfs4_entry_t u_entry, void *arg)
2336 {
2337         rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2338         lock_owner4 *lockowner = (lock_owner4 *)arg;
2339         rfs4_client_t *cp;
2340         bool_t create = FALSE;
2341         nfs4_srv_t *nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
2342 
2343         rw_enter(&nsrv4->rfs4_findclient_lock, RW_READER);
2344 
2345         cp = (rfs4_client_t *)rfs4_dbsearch(nsrv4->rfs4_clientid_idx,
2346             &lockowner->clientid,
2347             &create, NULL, RFS4_DBS_VALID);
2348 
2349         rw_exit(&nsrv4->rfs4_findclient_lock);
2350 
2351         if (cp == NULL)
2352                 return (FALSE);
2353 
2354         /* Reference client */
2355         lo->rl_client = cp;
2356         lo->rl_owner.clientid = lockowner->clientid;
2357         lo->rl_owner.owner_val = kmem_alloc(lockowner->owner_len, KM_SLEEP);
2358         bcopy(lockowner->owner_val, lo->rl_owner.owner_val,
2359             lockowner->owner_len);
2360         lo->rl_owner.owner_len = lockowner->owner_len;
2361         lo->rl_pid = rfs4_dbe_getid(lo->rl_dbe);
2362 
2363         return (TRUE);
2364 }
2365 
2366 rfs4_lockowner_t *
2367 rfs4_findlockowner(lock_owner4 *lockowner, bool_t *create)
2368 {
2369         rfs4_lockowner_t *lo;
2370         nfs4_srv_t *nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
2371 
2372         /* CSTYLED */
2373         lo = (rfs4_lockowner_t *)rfs4_dbsearch(nsrv4->rfs4_lockowner_idx, lockowner,
2374             create, lockowner, RFS4_DBS_VALID);
2375 
2376         return (lo);
2377 }
2378 
2379 rfs4_lockowner_t *
2380 rfs4_findlockowner_by_pid(pid_t pid)
2381 {
2382         rfs4_lockowner_t *lo;
2383         bool_t create = FALSE;
2384         nfs4_srv_t *nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
2385 
2386         lo = (rfs4_lockowner_t *)rfs4_dbsearch(nsrv4->rfs4_lockowner_pid_idx,
2387             (void *)(uintptr_t)pid, &create, NULL, RFS4_DBS_VALID);
2388 
2389         return (lo);
2390 }
2391 
2392 
/*
 * Hash for the file index.  The key is the vnode pointer (see
 * file_mkkey()), hashed with ADDRHASH().
 */
static uint32_t
file_hash(void *key)
{
        return (ADDRHASH(key));
}
2398 
2399 static void *
2400 file_mkkey(rfs4_entry_t u_entry)
2401 {
2402         rfs4_file_t *fp = (rfs4_file_t *)u_entry;
2403 
2404         return (fp->rf_vp);
2405 }
2406 
2407 static bool_t
2408 file_compare(rfs4_entry_t u_entry, void *key)
2409 {
2410         rfs4_file_t *fp = (rfs4_file_t *)u_entry;
2411 
2412         return (fp->rf_vp == (vnode_t *)key);
2413 }
2414 
2415 static void
2416 rfs4_file_destroy(rfs4_entry_t u_entry)
2417 {
2418         rfs4_file_t *fp = (rfs4_file_t *)u_entry;
2419 
2420         list_destroy(&fp->rf_delegstatelist);
2421 
2422         if (fp->rf_filehandle.nfs_fh4_val)
2423                 kmem_free(fp->rf_filehandle.nfs_fh4_val,
2424                     fp->rf_filehandle.nfs_fh4_len);
2425         cv_destroy(fp->rf_dinfo.rd_recall_cv);
2426         if (fp->rf_vp) {
2427                 vnode_t *vp = fp->rf_vp;
2428 
2429                 mutex_enter(&vp->v_vsd_lock);
2430                 (void) vsd_set(vp, nfs4_srv_vkey, NULL);
2431                 mutex_exit(&vp->v_vsd_lock);
2432                 VN_RELE(vp);
2433                 fp->rf_vp = NULL;
2434         }
2435         rw_destroy(&fp->rf_file_rwlock);
2436 }
2437 
2438 /*
2439  * Used to unlock the underlying dbe struct only
2440  */
2441 void
2442 rfs4_file_rele(rfs4_file_t *fp)
2443 {
2444         rfs4_dbe_rele(fp->rf_dbe);
2445 }
2446 
2447 typedef struct {
2448     vnode_t *vp;
2449     nfs_fh4 *fh;
2450 } rfs4_fcreate_arg;
2451 
2452 static bool_t
2453 rfs4_file_create(rfs4_entry_t u_entry, void *arg)
2454 {
2455         rfs4_file_t *fp = (rfs4_file_t *)u_entry;
2456         rfs4_fcreate_arg *ap = (rfs4_fcreate_arg *)arg;
2457         vnode_t *vp = ap->vp;
2458         nfs_fh4 *fh = ap->fh;
2459 
2460         VN_HOLD(vp);
2461 
2462         fp->rf_filehandle.nfs_fh4_len = 0;
2463         fp->rf_filehandle.nfs_fh4_val = NULL;
2464         ASSERT(fh && fh->nfs_fh4_len);
2465         if (fh && fh->nfs_fh4_len) {
2466                 fp->rf_filehandle.nfs_fh4_val =
2467                     kmem_alloc(fh->nfs_fh4_len, KM_SLEEP);
2468                 nfs_fh4_copy(fh, &fp->rf_filehandle);
2469         }
2470         fp->rf_vp = vp;
2471 
2472         list_create(&fp->rf_delegstatelist, sizeof (rfs4_deleg_state_t),
2473             offsetof(rfs4_deleg_state_t, rds_node));
2474 
2475         fp->rf_share_deny = fp->rf_share_access = fp->rf_access_read = 0;
2476         fp->rf_access_write = fp->rf_deny_read = fp->rf_deny_write = 0;
2477 
2478         mutex_init(fp->rf_dinfo.rd_recall_lock, NULL, MUTEX_DEFAULT, NULL);
2479         cv_init(fp->rf_dinfo.rd_recall_cv, NULL, CV_DEFAULT, NULL);
2480 
2481         fp->rf_dinfo.rd_dtype = OPEN_DELEGATE_NONE;
2482 
2483         rw_init(&fp->rf_file_rwlock, NULL, RW_DEFAULT, NULL);
2484 
2485         mutex_enter(&vp->v_vsd_lock);
2486         VERIFY(vsd_set(vp, nfs4_srv_vkey, (void *)fp) == 0);
2487         mutex_exit(&vp->v_vsd_lock);
2488 
2489         return (TRUE);
2490 }
2491 
2492 rfs4_file_t *
2493 rfs4_findfile(vnode_t *vp, nfs_fh4 *fh, bool_t *create)
2494 {
2495         rfs4_file_t *fp;
2496         rfs4_fcreate_arg arg;
2497         nfs4_srv_t *nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
2498 
2499         arg.vp = vp;
2500         arg.fh = fh;
2501 
2502         if (*create == TRUE)
2503                 /* CSTYLED */
2504                 fp = (rfs4_file_t *)rfs4_dbsearch(nsrv4->rfs4_file_idx, vp, create,
2505                     &arg, RFS4_DBS_VALID);
2506         else {
2507                 mutex_enter(&vp->v_vsd_lock);
2508                 fp = (rfs4_file_t *)vsd_get(vp, nfs4_srv_vkey);
2509                 if (fp) {
2510                         rfs4_dbe_lock(fp->rf_dbe);
2511                         if (rfs4_dbe_is_invalid(fp->rf_dbe) ||
2512                             (rfs4_dbe_refcnt(fp->rf_dbe) == 0)) {
2513                                 rfs4_dbe_unlock(fp->rf_dbe);
2514                                 fp = NULL;
2515                         } else {
2516                                 rfs4_dbe_hold(fp->rf_dbe);
2517                                 rfs4_dbe_unlock(fp->rf_dbe);
2518                         }
2519                 }
2520                 mutex_exit(&vp->v_vsd_lock);
2521         }
2522         return (fp);
2523 }
2524 
2525 /*
2526  * Find a file in the db and once it is located, take the rw lock.
2527  * Need to check the vnode pointer and if it does not exist (it was
2528  * removed between the db location and check) redo the find.  This
2529  * assumes that a file struct that has a NULL vnode pointer is marked
2530  * at 'invalid' and will not be found in the db the second time
2531  * around.
2532  */
2533 rfs4_file_t *
2534 rfs4_findfile_withlock(vnode_t *vp, nfs_fh4 *fh, bool_t *create)
2535 {
2536         rfs4_file_t *fp;
2537         rfs4_fcreate_arg arg;
2538         bool_t screate = *create;
2539         nfs4_srv_t *nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
2540 
2541         if (screate == FALSE) {
2542                 mutex_enter(&vp->v_vsd_lock);
2543                 fp = (rfs4_file_t *)vsd_get(vp, nfs4_srv_vkey);
2544                 if (fp) {
2545                         rfs4_dbe_lock(fp->rf_dbe);
2546                         if (rfs4_dbe_is_invalid(fp->rf_dbe) ||
2547                             (rfs4_dbe_refcnt(fp->rf_dbe) == 0)) {
2548                                 rfs4_dbe_unlock(fp->rf_dbe);
2549                                 mutex_exit(&vp->v_vsd_lock);
2550                                 fp = NULL;
2551                         } else {
2552                                 rfs4_dbe_hold(fp->rf_dbe);
2553                                 rfs4_dbe_unlock(fp->rf_dbe);
2554                                 mutex_exit(&vp->v_vsd_lock);
2555                                 rw_enter(&fp->rf_file_rwlock, RW_WRITER);
2556                                 if (fp->rf_vp == NULL) {
2557                                         rw_exit(&fp->rf_file_rwlock);
2558                                         rfs4_file_rele(fp);
2559                                         fp = NULL;
2560                                 }
2561                         }
2562                 } else {
2563                         mutex_exit(&vp->v_vsd_lock);
2564                 }
2565         } else {
2566 retry:
2567                 arg.vp = vp;
2568                 arg.fh = fh;
2569 
2570                 fp = (rfs4_file_t *)rfs4_dbsearch(nsrv4->rfs4_file_idx, vp,
2571                     create, &arg, RFS4_DBS_VALID);
2572                 if (fp != NULL) {
2573                         rw_enter(&fp->rf_file_rwlock, RW_WRITER);
2574                         if (fp->rf_vp == NULL) {
2575                                 rw_exit(&fp->rf_file_rwlock);
2576                                 rfs4_file_rele(fp);
2577                                 *create = screate;
2578                                 goto retry;
2579                         }
2580                 }
2581         }
2582 
2583         return (fp);
2584 }
2585 
2586 static uint32_t
2587 lo_state_hash(void *key)
2588 {
2589         stateid_t *id = key;
2590 
2591         return (id->bits.ident+id->bits.pid);
2592 }
2593 
2594 static bool_t
2595 lo_state_compare(rfs4_entry_t u_entry, void *key)
2596 {
2597         rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2598         stateid_t *id = key;
2599         bool_t rc;
2600 
2601         rc = (lsp->rls_lockid.bits.boottime == id->bits.boottime &&
2602             lsp->rls_lockid.bits.type == id->bits.type &&
2603             lsp->rls_lockid.bits.ident == id->bits.ident &&
2604             lsp->rls_lockid.bits.pid == id->bits.pid);
2605 
2606         return (rc);
2607 }
2608 
2609 static void *
2610 lo_state_mkkey(rfs4_entry_t u_entry)
2611 {
2612         rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2613 
2614         return (&lsp->rls_lockid);
2615 }
2616 
2617 static bool_t
2618 rfs4_lo_state_expiry(rfs4_entry_t u_entry)
2619 {
2620         rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2621 
2622         if (rfs4_dbe_is_invalid(lsp->rls_dbe))
2623                 return (TRUE);
2624         if (lsp->rls_state->rs_closed)
2625                 return (TRUE);
2626         return ((gethrestime_sec() -
2627             lsp->rls_state->rs_owner->ro_client->rc_last_access
2628             > rfs4_lease_time));
2629 }
2630 
2631 static void
2632 rfs4_lo_state_destroy(rfs4_entry_t u_entry)
2633 {
2634         rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2635 
2636         rfs4_dbe_lock(lsp->rls_state->rs_dbe);
2637         list_remove(&lsp->rls_state->rs_lostatelist, lsp);
2638         rfs4_dbe_unlock(lsp->rls_state->rs_dbe);
2639 
2640         rfs4_sw_destroy(&lsp->rls_sw);
2641 
2642         /* Make sure to release the file locks */
2643         if (lsp->rls_locks_cleaned == FALSE) {
2644                 lsp->rls_locks_cleaned = TRUE;
2645                 if (lsp->rls_locker->rl_client->rc_sysidt != LM_NOSYSID) {
2646                         /* Is the PxFS kernel module loaded? */
2647                         if (lm_remove_file_locks != NULL) {
2648                                 int new_sysid;
2649 
2650                                 /* Encode the cluster nodeid in new sysid */
2651                                 new_sysid =
2652                                     lsp->rls_locker->rl_client->rc_sysidt;
2653                                 lm_set_nlmid_flk(&new_sysid);
2654 
2655                                 /*
2656                                  * This PxFS routine removes file locks for a
2657                                  * client over all nodes of a cluster.
2658                                  */
2659                                 DTRACE_PROBE1(nfss_i_clust_rm_lck,
2660                                     int, new_sysid);
2661                                 (*lm_remove_file_locks)(new_sysid);
2662                         } else {
2663                                 (void) cleanlocks(
2664                                     lsp->rls_state->rs_finfo->rf_vp,
2665                                     lsp->rls_locker->rl_pid,
2666                                     lsp->rls_locker->rl_client->rc_sysidt);
2667                         }
2668                 }
2669         }
2670 
2671         /* Free the last reply for this state */
2672         rfs4_free_reply(&lsp->rls_reply);
2673 
2674         rfs4_lockowner_rele(lsp->rls_locker);
2675         lsp->rls_locker = NULL;
2676 
2677         rfs4_state_rele_nounlock(lsp->rls_state);
2678         lsp->rls_state = NULL;
2679 }
2680 
2681 static bool_t
2682 rfs4_lo_state_create(rfs4_entry_t u_entry, void *arg)
2683 {
2684         rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2685         rfs4_lo_state_t *argp = (rfs4_lo_state_t *)arg;
2686         rfs4_lockowner_t *lo = argp->rls_locker;
2687         rfs4_state_t *sp = argp->rls_state;
2688 
2689         lsp->rls_state = sp;
2690 
2691         lsp->rls_lockid = sp->rs_stateid;
2692         lsp->rls_lockid.bits.type = LOCKID;
2693         lsp->rls_lockid.bits.chgseq = 0;
2694         lsp->rls_lockid.bits.pid = lo->rl_pid;
2695 
2696         lsp->rls_locks_cleaned = FALSE;
2697         lsp->rls_lock_completed = FALSE;
2698 
2699         rfs4_sw_init(&lsp->rls_sw);
2700 
2701         /* Attached the supplied lock owner */
2702         rfs4_dbe_hold(lo->rl_dbe);
2703         lsp->rls_locker = lo;
2704 
2705         rfs4_dbe_lock(sp->rs_dbe);
2706         list_insert_tail(&sp->rs_lostatelist, lsp);
2707         rfs4_dbe_hold(sp->rs_dbe);
2708         rfs4_dbe_unlock(sp->rs_dbe);
2709 
2710         return (TRUE);
2711 }
2712 
2713 void
2714 rfs4_lo_state_rele(rfs4_lo_state_t *lsp, bool_t unlock_fp)
2715 {
2716         if (unlock_fp == TRUE)
2717                 rw_exit(&lsp->rls_state->rs_finfo->rf_file_rwlock);
2718         rfs4_dbe_rele(lsp->rls_dbe);
2719 }
2720 
2721 static rfs4_lo_state_t *
2722 rfs4_findlo_state(stateid_t *id, bool_t lock_fp)
2723 {
2724         rfs4_lo_state_t *lsp;
2725         bool_t create = FALSE;
2726         nfs4_srv_t *nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
2727 
2728         lsp = (rfs4_lo_state_t *)rfs4_dbsearch(nsrv4->rfs4_lo_state_idx, id,
2729             &create, NULL, RFS4_DBS_VALID);
2730         if (lock_fp == TRUE && lsp != NULL)
2731                 rw_enter(&lsp->rls_state->rs_finfo->rf_file_rwlock, RW_READER);
2732 
2733         return (lsp);
2734 }
2735 
2736 
2737 static uint32_t
2738 lo_state_lo_hash(void *key)
2739 {
2740         rfs4_lo_state_t *lsp = key;
2741 
2742         return (ADDRHASH(lsp->rls_locker) ^ ADDRHASH(lsp->rls_state));
2743 }
2744 
2745 static bool_t
2746 lo_state_lo_compare(rfs4_entry_t u_entry, void *key)
2747 {
2748         rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2749         rfs4_lo_state_t *keyp = key;
2750 
2751         return (keyp->rls_locker == lsp->rls_locker &&
2752             keyp->rls_state == lsp->rls_state);
2753 }
2754 
2755 static void *
2756 lo_state_lo_mkkey(rfs4_entry_t u_entry)
2757 {
2758         return (u_entry);
2759 }
2760 
2761 rfs4_lo_state_t *
2762 rfs4_findlo_state_by_owner(rfs4_lockowner_t *lo, rfs4_state_t *sp,
2763     bool_t *create)
2764 {
2765         rfs4_lo_state_t *lsp;
2766         rfs4_lo_state_t arg;
2767         nfs4_srv_t *nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
2768 
2769         arg.rls_locker = lo;
2770         arg.rls_state = sp;
2771 
2772         lsp = (rfs4_lo_state_t *)rfs4_dbsearch(nsrv4->rfs4_lo_state_owner_idx,
2773             &arg, create, &arg, RFS4_DBS_VALID);
2774 
2775         return (lsp);
2776 }
2777 
2778 static stateid_t
2779 get_stateid(id_t eid)
2780 {
2781         stateid_t id;
2782         nfs4_srv_t *nsrv4;
2783 
2784         nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
2785 
2786         id.bits.boottime = nsrv4->rfs4_start_time;
2787         id.bits.ident = eid;
2788         id.bits.chgseq = 0;
2789         id.bits.type = 0;
2790         id.bits.pid = 0;
2791 
2792         /*
2793          * If we are booted as a cluster node, embed our nodeid.
2794          * We've already done sanity checks in rfs4_client_create() so no
2795          * need to repeat them here.
2796          */
2797         id.bits.clnodeid = (cluster_bootflags & CLUSTER_BOOTED) ?
2798             clconf_get_nodeid() : 0;
2799 
2800         return (id);
2801 }
2802 
2803 /*
2804  * For use only when booted as a cluster node.
2805  * Returns TRUE if the embedded nodeid indicates that this stateid was
2806  * generated on another node.
2807  */
2808 static int
2809 foreign_stateid(stateid_t *id)
2810 {
2811         ASSERT(cluster_bootflags & CLUSTER_BOOTED);
2812         return (id->bits.clnodeid != (uint32_t)clconf_get_nodeid());
2813 }
2814 
2815 /*
2816  * For use only when booted as a cluster node.
2817  * Returns TRUE if the embedded nodeid indicates that this clientid was
2818  * generated on another node.
2819  */
2820 static int
2821 foreign_clientid(cid *cidp)
2822 {
2823         ASSERT(cluster_bootflags & CLUSTER_BOOTED);
2824         return (cidp->impl_id.c_id >> CLUSTER_NODEID_SHIFT !=
2825             (uint32_t)clconf_get_nodeid());
2826 }
2827 
2828 /*
2829  * For use only when booted as a cluster node.
2830  * Embed our cluster nodeid into the clientid.
2831  */
2832 static void
2833 embed_nodeid(cid *cidp)
2834 {
2835         int clnodeid;
2836         /*
2837          * Currently, our state tables are small enough that their
2838          * ids will leave enough bits free for the nodeid. If the
2839          * tables become larger, we mustn't overwrite the id.
2840          * Equally, we only have room for so many bits of nodeid, so
2841          * must check that too.
2842          */
2843         ASSERT(cluster_bootflags & CLUSTER_BOOTED);
2844         ASSERT(cidp->impl_id.c_id >> CLUSTER_NODEID_SHIFT == 0);
2845         clnodeid = clconf_get_nodeid();
2846         ASSERT(clnodeid <= CLUSTER_MAX_NODEID);
2847         ASSERT(clnodeid != NODEID_UNKNOWN);
2848         cidp->impl_id.c_id |= (clnodeid << CLUSTER_NODEID_SHIFT);
2849 }
2850 
2851 static uint32_t
2852 state_hash(void *key)
2853 {
2854         stateid_t *ip = (stateid_t *)key;
2855 
2856         return (ip->bits.ident);
2857 }
2858 
2859 static bool_t
2860 state_compare(rfs4_entry_t u_entry, void *key)
2861 {
2862         rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2863         stateid_t *id = (stateid_t *)key;
2864         bool_t rc;
2865 
2866         rc = (sp->rs_stateid.bits.boottime == id->bits.boottime &&
2867             sp->rs_stateid.bits.ident == id->bits.ident);
2868 
2869         return (rc);
2870 }
2871 
2872 static void *
2873 state_mkkey(rfs4_entry_t u_entry)
2874 {
2875         rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2876 
2877         return (&sp->rs_stateid);
2878 }
2879 
2880 static void
2881 rfs4_state_destroy(rfs4_entry_t u_entry)
2882 {
2883         rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2884 
2885         /* remove from openowner list */
2886         rfs4_dbe_lock(sp->rs_owner->ro_dbe);
2887         list_remove(&sp->rs_owner->ro_statelist, sp);
2888         rfs4_dbe_unlock(sp->rs_owner->ro_dbe);
2889 
2890         list_destroy(&sp->rs_lostatelist);
2891 
2892         /* release any share locks for this stateid if it's still open */
2893         if (!sp->rs_closed) {
2894                 rfs4_dbe_lock(sp->rs_dbe);
2895                 (void) rfs4_unshare(sp);
2896                 rfs4_dbe_unlock(sp->rs_dbe);
2897         }
2898 
2899         /* Were done with the file */
2900         rfs4_file_rele(sp->rs_finfo);
2901         sp->rs_finfo = NULL;
2902 
2903         /* And now with the openowner */
2904         rfs4_openowner_rele(sp->rs_owner);
2905         sp->rs_owner = NULL;
2906 }
2907 
2908 static void
2909 rfs4_state_rele_nounlock(rfs4_state_t *sp)
2910 {
2911         rfs4_dbe_rele(sp->rs_dbe);
2912 }
2913 
2914 void
2915 rfs4_state_rele(rfs4_state_t *sp)
2916 {
2917         rw_exit(&sp->rs_finfo->rf_file_rwlock);
2918         rfs4_dbe_rele(sp->rs_dbe);
2919 }
2920 
2921 static uint32_t
2922 deleg_hash(void *key)
2923 {
2924         rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)key;
2925 
2926         return (ADDRHASH(dsp->rds_client) ^ ADDRHASH(dsp->rds_finfo));
2927 }
2928 
2929 static bool_t
2930 deleg_compare(rfs4_entry_t u_entry, void *key)
2931 {
2932         rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2933         rfs4_deleg_state_t *kdsp = (rfs4_deleg_state_t *)key;
2934 
2935         return (dsp->rds_client == kdsp->rds_client &&
2936             dsp->rds_finfo == kdsp->rds_finfo);
2937 }
2938 
2939 static void *
2940 deleg_mkkey(rfs4_entry_t u_entry)
2941 {
2942         return (u_entry);
2943 }
2944 
2945 static uint32_t
2946 deleg_state_hash(void *key)
2947 {
2948         stateid_t *ip = (stateid_t *)key;
2949 
2950         return (ip->bits.ident);
2951 }
2952 
2953 static bool_t
2954 deleg_state_compare(rfs4_entry_t u_entry, void *key)
2955 {
2956         rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2957         stateid_t *id = (stateid_t *)key;
2958         bool_t rc;
2959 
2960         if (id->bits.type != DELEGID)
2961                 return (FALSE);
2962 
2963         rc = (dsp->rds_delegid.bits.boottime == id->bits.boottime &&
2964             dsp->rds_delegid.bits.ident == id->bits.ident);
2965 
2966         return (rc);
2967 }
2968 
2969 static void *
2970 deleg_state_mkkey(rfs4_entry_t u_entry)
2971 {
2972         rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2973 
2974         return (&dsp->rds_delegid);
2975 }
2976 
2977 static bool_t
2978 rfs4_deleg_state_expiry(rfs4_entry_t u_entry)
2979 {
2980         rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2981 
2982         if (rfs4_dbe_is_invalid(dsp->rds_dbe))
2983                 return (TRUE);
2984 
2985         if (dsp->rds_dtype == OPEN_DELEGATE_NONE)
2986                 return (TRUE);
2987 
2988         if ((gethrestime_sec() - dsp->rds_client->rc_last_access
2989             > rfs4_lease_time)) {
2990                 rfs4_dbe_invalidate(dsp->rds_dbe);
2991                 return (TRUE);
2992         }
2993 
2994         return (FALSE);
2995 }
2996 
2997 static bool_t
2998 rfs4_deleg_state_create(rfs4_entry_t u_entry, void *argp)
2999 {
3000         rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
3001         rfs4_file_t *fp = ((rfs4_deleg_state_t *)argp)->rds_finfo;
3002         rfs4_client_t *cp = ((rfs4_deleg_state_t *)argp)->rds_client;
3003 
3004         rfs4_dbe_hold(fp->rf_dbe);
3005         rfs4_dbe_hold(cp->rc_dbe);
3006 
3007         dsp->rds_delegid = get_stateid(rfs4_dbe_getid(dsp->rds_dbe));
3008         dsp->rds_delegid.bits.type = DELEGID;
3009         dsp->rds_finfo = fp;
3010         dsp->rds_client = cp;
3011         dsp->rds_dtype = OPEN_DELEGATE_NONE;
3012 
3013         dsp->rds_time_granted = gethrestime_sec();   /* observability */
3014         dsp->rds_time_revoked = 0;
3015 
3016         list_link_init(&dsp->rds_node);
3017 
3018         return (TRUE);
3019 }
3020 
3021 static void
3022 rfs4_deleg_state_destroy(rfs4_entry_t u_entry)
3023 {
3024         rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
3025 
3026         /* return delegation if necessary */
3027         rfs4_return_deleg(dsp, FALSE);
3028 
3029         /* Were done with the file */
3030         rfs4_file_rele(dsp->rds_finfo);
3031         dsp->rds_finfo = NULL;
3032 
3033         /* And now with the openowner */
3034         rfs4_client_rele(dsp->rds_client);
3035         dsp->rds_client = NULL;
3036 }
3037 
3038 rfs4_deleg_state_t *
3039 rfs4_finddeleg(rfs4_state_t *sp, bool_t *create)
3040 {
3041         rfs4_deleg_state_t ds, *dsp;
3042         nfs4_srv_t *nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
3043 
3044         ds.rds_client = sp->rs_owner->ro_client;
3045         ds.rds_finfo = sp->rs_finfo;
3046 
3047         dsp = (rfs4_deleg_state_t *)rfs4_dbsearch(nsrv4->rfs4_deleg_idx, &ds,
3048             create, &ds, RFS4_DBS_VALID);
3049 
3050         return (dsp);
3051 }
3052 
3053 rfs4_deleg_state_t *
3054 rfs4_finddelegstate(stateid_t *id)
3055 {
3056         rfs4_deleg_state_t *dsp;
3057         bool_t create = FALSE;
3058         nfs4_srv_t *nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
3059 
3060         dsp = (rfs4_deleg_state_t *)rfs4_dbsearch(nsrv4->rfs4_deleg_state_idx,
3061             id, &create, NULL, RFS4_DBS_VALID);
3062 
3063         return (dsp);
3064 }
3065 
3066 void
3067 rfs4_deleg_state_rele(rfs4_deleg_state_t *dsp)
3068 {
3069         rfs4_dbe_rele(dsp->rds_dbe);
3070 }
3071 
3072 void
3073 rfs4_update_lock_sequence(rfs4_lo_state_t *lsp)
3074 {
3075 
3076         rfs4_dbe_lock(lsp->rls_dbe);
3077 
3078         /*
3079          * If we are skipping sequence id checking, this means that
3080          * this is the first lock request and therefore the sequence
3081          * id does not need to be updated.  This only happens on the
3082          * first lock request for a lockowner
3083          */
3084         if (!lsp->rls_skip_seqid_check)
3085                 lsp->rls_seqid++;
3086 
3087         rfs4_dbe_unlock(lsp->rls_dbe);
3088 }
3089 
3090 void
3091 rfs4_update_lock_resp(rfs4_lo_state_t *lsp, nfs_resop4 *resp)
3092 {
3093 
3094         rfs4_dbe_lock(lsp->rls_dbe);
3095 
3096         rfs4_free_reply(&lsp->rls_reply);
3097 
3098         rfs4_copy_reply(&lsp->rls_reply, resp);
3099 
3100         rfs4_dbe_unlock(lsp->rls_dbe);
3101 }
3102 
3103 void
3104 rfs4_free_opens(rfs4_openowner_t *oo, bool_t invalidate,
3105     bool_t close_of_client)
3106 {
3107         rfs4_state_t *sp;
3108 
3109         rfs4_dbe_lock(oo->ro_dbe);
3110 
3111         for (sp = list_head(&oo->ro_statelist); sp != NULL;
3112             sp = list_next(&oo->ro_statelist, sp)) {
3113                 rfs4_state_close(sp, FALSE, close_of_client, CRED());
3114                 if (invalidate == TRUE)
3115                         rfs4_dbe_invalidate(sp->rs_dbe);
3116         }
3117 
3118         rfs4_dbe_invalidate(oo->ro_dbe);
3119         rfs4_dbe_unlock(oo->ro_dbe);
3120 }
3121 
3122 static uint32_t
3123 state_owner_file_hash(void *key)
3124 {
3125         rfs4_state_t *sp = key;
3126 
3127         return (ADDRHASH(sp->rs_owner) ^ ADDRHASH(sp->rs_finfo));
3128 }
3129 
3130 static bool_t
3131 state_owner_file_compare(rfs4_entry_t u_entry, void *key)
3132 {
3133         rfs4_state_t *sp = (rfs4_state_t *)u_entry;
3134         rfs4_state_t *arg = key;
3135 
3136         if (sp->rs_closed == TRUE)
3137                 return (FALSE);
3138 
3139         return (arg->rs_owner == sp->rs_owner && arg->rs_finfo == sp->rs_finfo);
3140 }
3141 
3142 static void *
3143 state_owner_file_mkkey(rfs4_entry_t u_entry)
3144 {
3145         return (u_entry);
3146 }
3147 
3148 static uint32_t
3149 state_file_hash(void *key)
3150 {
3151         return (ADDRHASH(key));
3152 }
3153 
3154 static bool_t
3155 state_file_compare(rfs4_entry_t u_entry, void *key)
3156 {
3157         rfs4_state_t *sp = (rfs4_state_t *)u_entry;
3158         rfs4_file_t *fp = key;
3159 
3160         if (sp->rs_closed == TRUE)
3161                 return (FALSE);
3162 
3163         return (fp == sp->rs_finfo);
3164 }
3165 
3166 static void *
3167 state_file_mkkey(rfs4_entry_t u_entry)
3168 {
3169         rfs4_state_t *sp = (rfs4_state_t *)u_entry;
3170 
3171         return (sp->rs_finfo);
3172 }
3173 
3174 rfs4_state_t *
3175 rfs4_findstate_by_owner_file(rfs4_openowner_t *oo, rfs4_file_t *fp,
3176     bool_t *create)
3177 {
3178         rfs4_state_t *sp;
3179         rfs4_state_t key;
3180         nfs4_srv_t *nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
3181 
3182         key.rs_owner = oo;
3183         key.rs_finfo = fp;
3184 
3185         sp = (rfs4_state_t *)rfs4_dbsearch(nsrv4->rfs4_state_owner_file_idx,
3186             &key, create, &key, RFS4_DBS_VALID);
3187 
3188         return (sp);
3189 }
3190 
3191 /* This returns ANY state struct that refers to this file */
3192 static rfs4_state_t *
3193 rfs4_findstate_by_file(rfs4_file_t *fp)
3194 {
3195         bool_t create = FALSE;
3196         nfs4_srv_t *nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
3197 
3198         return ((rfs4_state_t *)rfs4_dbsearch(nsrv4->rfs4_state_file_idx, fp,
3199             &create, fp, RFS4_DBS_VALID));
3200 }
3201 
3202 static bool_t
3203 rfs4_state_expiry(rfs4_entry_t u_entry)
3204 {
3205         rfs4_state_t *sp = (rfs4_state_t *)u_entry;
3206 
3207         if (rfs4_dbe_is_invalid(sp->rs_dbe))
3208                 return (TRUE);
3209 
3210         if (sp->rs_closed == TRUE &&
3211             ((gethrestime_sec() - rfs4_dbe_get_timerele(sp->rs_dbe))
3212             > rfs4_lease_time))
3213                 return (TRUE);
3214 
3215         return ((gethrestime_sec() - sp->rs_owner->ro_client->rc_last_access
3216             > rfs4_lease_time));
3217 }
3218 
3219 static bool_t
3220 rfs4_state_create(rfs4_entry_t u_entry, void *argp)
3221 {
3222         rfs4_state_t *sp = (rfs4_state_t *)u_entry;
3223         rfs4_file_t *fp = ((rfs4_state_t *)argp)->rs_finfo;
3224         rfs4_openowner_t *oo = ((rfs4_state_t *)argp)->rs_owner;
3225 
3226         rfs4_dbe_hold(fp->rf_dbe);
3227         rfs4_dbe_hold(oo->ro_dbe);
3228         sp->rs_stateid = get_stateid(rfs4_dbe_getid(sp->rs_dbe));
3229         sp->rs_stateid.bits.type = OPENID;
3230         sp->rs_owner = oo;
3231         sp->rs_finfo = fp;
3232 
3233         list_create(&sp->rs_lostatelist, sizeof (rfs4_lo_state_t),
3234             offsetof(rfs4_lo_state_t, rls_node));
3235 
3236         /* Insert state on per open owner's list */
3237         rfs4_dbe_lock(oo->ro_dbe);
3238         list_insert_tail(&oo->ro_statelist, sp);
3239         rfs4_dbe_unlock(oo->ro_dbe);
3240 
3241         return (TRUE);
3242 }
3243 
3244 static rfs4_state_t *
3245 rfs4_findstate(stateid_t *id, rfs4_dbsearch_type_t find_invalid, bool_t lock_fp)
3246 {
3247         rfs4_state_t *sp;
3248         bool_t create = FALSE;
3249         nfs4_srv_t *nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
3250 
3251         sp = (rfs4_state_t *)rfs4_dbsearch(nsrv4->rfs4_state_idx, id,
3252             &create, NULL, find_invalid);
3253         if (lock_fp == TRUE && sp != NULL)
3254                 rw_enter(&sp->rs_finfo->rf_file_rwlock, RW_READER);
3255 
3256         return (sp);
3257 }
3258 
3259 void
3260 rfs4_state_close(rfs4_state_t *sp, bool_t lock_held, bool_t close_of_client,
3261     cred_t *cr)
3262 {
3263         /* Remove the associated lo_state owners */
3264         if (!lock_held)
3265                 rfs4_dbe_lock(sp->rs_dbe);
3266 
3267         /*
3268          * If refcnt == 0, the dbe is about to be destroyed.
3269          * lock state will be released by the reaper thread.
3270          */
3271 
3272         if (rfs4_dbe_refcnt(sp->rs_dbe) > 0) {
3273                 if (sp->rs_closed == FALSE) {
3274                         rfs4_release_share_lock_state(sp, cr, close_of_client);
3275                         sp->rs_closed = TRUE;
3276                 }
3277         }
3278 
3279         if (!lock_held)
3280                 rfs4_dbe_unlock(sp->rs_dbe);
3281 }
3282 
3283 /*
3284  * Remove all state associated with the given client.
3285  */
3286 void
3287 rfs4_client_state_remove(rfs4_client_t *cp)
3288 {
3289         rfs4_openowner_t *oo;
3290 
3291         rfs4_dbe_lock(cp->rc_dbe);
3292 
3293         for (oo = list_head(&cp->rc_openownerlist); oo != NULL;
3294             oo = list_next(&cp->rc_openownerlist, oo)) {
3295                 rfs4_free_opens(oo, TRUE, TRUE);
3296         }
3297 
3298         rfs4_dbe_unlock(cp->rc_dbe);
3299 }
3300 
3301 void
3302 rfs4_client_close(rfs4_client_t *cp)
3303 {
3304         /* Mark client as going away. */
3305         rfs4_dbe_lock(cp->rc_dbe);
3306         rfs4_dbe_invalidate(cp->rc_dbe);
3307         rfs4_dbe_unlock(cp->rc_dbe);
3308 
3309         rfs4_client_state_remove(cp);
3310 
3311         /* Release the client */
3312         rfs4_client_rele(cp);
3313 }
3314 
3315 nfsstat4
3316 rfs4_check_clientid(clientid4 *cp, int setclid_confirm)
3317 {
3318         cid *cidp = (cid *) cp;
3319         nfs4_srv_t *nsrv4;
3320 
3321         nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
3322 
3323         /*
3324          * If we are booted as a cluster node, check the embedded nodeid.
3325          * If it indicates that this clientid was generated on another node,
3326          * inform the client accordingly.
3327          */
3328         if (cluster_bootflags & CLUSTER_BOOTED && foreign_clientid(cidp))
3329                 return (NFS4ERR_STALE_CLIENTID);
3330 
3331         /*
3332          * If the server start time matches the time provided
3333          * by the client (via the clientid) and this is NOT a
3334          * setclientid_confirm then return EXPIRED.
3335          */
3336         if (!setclid_confirm &&
3337             cidp->impl_id.start_time == nsrv4->rfs4_start_time)
3338                 return (NFS4ERR_EXPIRED);
3339 
3340         return (NFS4ERR_STALE_CLIENTID);
3341 }
3342 
3343 /*
3344  * This is used when a stateid has not been found amongst the
3345  * current server's state.  Check the stateid to see if it
3346  * was from this server instantiation or not.
3347  */
3348 static nfsstat4
3349 what_stateid_error(stateid_t *id, stateid_type_t type)
3350 {
3351         nfs4_srv_t *nsrv4;
3352 
3353         nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
3354 
3355         /* If we are booted as a cluster node, was stateid locally generated? */
3356         if ((cluster_bootflags & CLUSTER_BOOTED) && foreign_stateid(id))
3357                 return (NFS4ERR_STALE_STATEID);
3358 
3359         /* If types don't match then no use checking further */
3360         if (type != id->bits.type)
3361                 return (NFS4ERR_BAD_STATEID);
3362 
3363         /* From a different server instantiation, return STALE */
3364         if (id->bits.boottime != nsrv4->rfs4_start_time)
3365                 return (NFS4ERR_STALE_STATEID);
3366 
3367         /*
3368          * From this server but the state is most likely beyond lease
3369          * timeout: return NFS4ERR_EXPIRED.  However, there is the
3370          * case of a delegation stateid.  For delegations, there is a
3371          * case where the state can be removed without the client's
3372          * knowledge/consent: revocation.  In the case of delegation
3373          * revocation, the delegation state will be removed and will
3374          * not be found.  If the client does something like a
3375          * DELEGRETURN or even a READ/WRITE with a delegatoin stateid
3376          * that has been revoked, the server should return BAD_STATEID
3377          * instead of the more common EXPIRED error.
3378          */
3379         if (id->bits.boottime == nsrv4->rfs4_start_time) {
3380                 if (type == DELEGID)
3381                         return (NFS4ERR_BAD_STATEID);
3382                 else
3383                         return (NFS4ERR_EXPIRED);
3384         }
3385 
3386         return (NFS4ERR_BAD_STATEID);
3387 }
3388 
3389 /*
3390  * Used later on to find the various state structs.  When called from
3391  * rfs4_check_stateid()->rfs4_get_all_state(), no file struct lock is
3392  * taken (it is not needed) and helps on the read/write path with
3393  * respect to performance.
3394  */
3395 static nfsstat4
3396 rfs4_get_state_lockit(stateid4 *stateid, rfs4_state_t **spp,
3397     rfs4_dbsearch_type_t find_invalid, bool_t lock_fp)
3398 {
3399         stateid_t *id = (stateid_t *)stateid;
3400         rfs4_state_t *sp;
3401 
3402         *spp = NULL;
3403 
3404         /* If we are booted as a cluster node, was stateid locally generated? */
3405         if ((cluster_bootflags & CLUSTER_BOOTED) && foreign_stateid(id))
3406                 return (NFS4ERR_STALE_STATEID);
3407 
3408         sp = rfs4_findstate(id, find_invalid, lock_fp);
3409         if (sp == NULL) {
3410                 return (what_stateid_error(id, OPENID));
3411         }
3412 
3413         if (rfs4_lease_expired(sp->rs_owner->ro_client)) {
3414                 if (lock_fp == TRUE)
3415                         rfs4_state_rele(sp);
3416                 else
3417                         rfs4_state_rele_nounlock(sp);
3418                 return (NFS4ERR_EXPIRED);
3419         }
3420 
3421         *spp = sp;
3422 
3423         return (NFS4_OK);
3424 }
3425 
3426 nfsstat4
3427 rfs4_get_state(stateid4 *stateid, rfs4_state_t **spp,
3428     rfs4_dbsearch_type_t find_invalid)
3429 {
3430         return (rfs4_get_state_lockit(stateid, spp, find_invalid, TRUE));
3431 }
3432 
3433 int
3434 rfs4_check_stateid_seqid(rfs4_state_t *sp, stateid4 *stateid)
3435 {
3436         stateid_t *id = (stateid_t *)stateid;
3437 
3438         if (rfs4_lease_expired(sp->rs_owner->ro_client))
3439                 return (NFS4_CHECK_STATEID_EXPIRED);
3440 
3441         /* Stateid is some time in the future - that's bad */
3442         if (sp->rs_stateid.bits.chgseq < id->bits.chgseq)
3443                 return (NFS4_CHECK_STATEID_BAD);
3444 
3445         if (sp->rs_stateid.bits.chgseq == id->bits.chgseq + 1)
3446                 return (NFS4_CHECK_STATEID_REPLAY);
3447 
3448         /* Stateid is some time in the past - that's old */
3449         if (sp->rs_stateid.bits.chgseq > id->bits.chgseq)
3450                 return (NFS4_CHECK_STATEID_OLD);
3451 
3452         /* Caller needs to know about confirmation before closure */
3453         if (sp->rs_owner->ro_need_confirm)
3454                 return (NFS4_CHECK_STATEID_UNCONFIRMED);
3455 
3456         if (sp->rs_closed == TRUE)
3457                 return (NFS4_CHECK_STATEID_CLOSED);
3458 
3459         return (NFS4_CHECK_STATEID_OKAY);
3460 }
3461 
3462 int
3463 rfs4_check_lo_stateid_seqid(rfs4_lo_state_t *lsp, stateid4 *stateid)
3464 {
3465         stateid_t *id = (stateid_t *)stateid;
3466 
3467         if (rfs4_lease_expired(lsp->rls_state->rs_owner->ro_client))
3468                 return (NFS4_CHECK_STATEID_EXPIRED);
3469 
3470         /* Stateid is some time in the future - that's bad */
3471         if (lsp->rls_lockid.bits.chgseq < id->bits.chgseq)
3472                 return (NFS4_CHECK_STATEID_BAD);
3473 
3474         if (lsp->rls_lockid.bits.chgseq == id->bits.chgseq + 1)
3475                 return (NFS4_CHECK_STATEID_REPLAY);
3476 
3477         /* Stateid is some time in the past - that's old */
3478         if (lsp->rls_lockid.bits.chgseq > id->bits.chgseq)
3479                 return (NFS4_CHECK_STATEID_OLD);
3480 
3481         if (lsp->rls_state->rs_closed == TRUE)
3482                 return (NFS4_CHECK_STATEID_CLOSED);
3483 
3484         return (NFS4_CHECK_STATEID_OKAY);
3485 }
3486 
3487 nfsstat4
3488 rfs4_get_deleg_state(stateid4 *stateid, rfs4_deleg_state_t **dspp)
3489 {
3490         stateid_t *id = (stateid_t *)stateid;
3491         rfs4_deleg_state_t *dsp;
3492 
3493         *dspp = NULL;
3494 
3495         /* If we are booted as a cluster node, was stateid locally generated? */
3496         if ((cluster_bootflags & CLUSTER_BOOTED) && foreign_stateid(id))
3497                 return (NFS4ERR_STALE_STATEID);
3498 
3499         dsp = rfs4_finddelegstate(id);
3500         if (dsp == NULL) {
3501                 return (what_stateid_error(id, DELEGID));
3502         }
3503 
3504         if (rfs4_lease_expired(dsp->rds_client)) {
3505                 rfs4_deleg_state_rele(dsp);
3506                 return (NFS4ERR_EXPIRED);
3507         }
3508 
3509         *dspp = dsp;
3510 
3511         return (NFS4_OK);
3512 }
3513 
3514 nfsstat4
3515 rfs4_get_lo_state(stateid4 *stateid, rfs4_lo_state_t **lspp, bool_t lock_fp)
3516 {
3517         stateid_t *id = (stateid_t *)stateid;
3518         rfs4_lo_state_t *lsp;
3519 
3520         *lspp = NULL;
3521 
3522         /* If we are booted as a cluster node, was stateid locally generated? */
3523         if ((cluster_bootflags & CLUSTER_BOOTED) && foreign_stateid(id))
3524                 return (NFS4ERR_STALE_STATEID);
3525 
3526         lsp = rfs4_findlo_state(id, lock_fp);
3527         if (lsp == NULL) {
3528                 return (what_stateid_error(id, LOCKID));
3529         }
3530 
3531         if (rfs4_lease_expired(lsp->rls_state->rs_owner->ro_client)) {
3532                 rfs4_lo_state_rele(lsp, lock_fp);
3533                 return (NFS4ERR_EXPIRED);
3534         }
3535 
3536         *lspp = lsp;
3537 
3538         return (NFS4_OK);
3539 }
3540 
3541 static nfsstat4
3542 rfs4_get_all_state(stateid4 *sid, rfs4_state_t **spp,
3543     rfs4_deleg_state_t **dspp, rfs4_lo_state_t **lspp)
3544 {
3545         rfs4_state_t *sp = NULL;
3546         rfs4_deleg_state_t *dsp = NULL;
3547         rfs4_lo_state_t *lsp = NULL;
3548         stateid_t *id;
3549         nfsstat4 status;
3550 
3551         *spp = NULL; *dspp = NULL; *lspp = NULL;
3552 
3553         id = (stateid_t *)sid;
3554         switch (id->bits.type) {
3555         case OPENID:
3556                 status = rfs4_get_state_lockit(sid, &sp, FALSE, FALSE);
3557                 break;
3558         case DELEGID:
3559                 status = rfs4_get_deleg_state(sid, &dsp);
3560                 break;
3561         case LOCKID:
3562                 status = rfs4_get_lo_state(sid, &lsp, FALSE);
3563                 if (status == NFS4_OK) {
3564                         sp = lsp->rls_state;
3565                         rfs4_dbe_hold(sp->rs_dbe);
3566                 }
3567                 break;
3568         default:
3569                 status = NFS4ERR_BAD_STATEID;
3570         }
3571 
3572         if (status == NFS4_OK) {
3573                 *spp = sp;
3574                 *dspp = dsp;
3575                 *lspp = lsp;
3576         }
3577 
3578         return (status);
3579 }
3580 
3581 /*
3582  * Given the I/O mode (FREAD or FWRITE), this checks whether the
3583  * rfs4_state_t struct has access to do this operation and if so
3584  * return NFS4_OK; otherwise the proper NFSv4 error is returned.
3585  */
3586 nfsstat4
3587 rfs4_state_has_access(rfs4_state_t *sp, int mode, vnode_t *vp)
3588 {
3589         nfsstat4 stat = NFS4_OK;
3590         rfs4_file_t *fp;
3591         bool_t create = FALSE;
3592 
3593         rfs4_dbe_lock(sp->rs_dbe);
3594         if (mode == FWRITE) {
3595                 if (!(sp->rs_share_access & OPEN4_SHARE_ACCESS_WRITE)) {
3596                         stat = NFS4ERR_OPENMODE;
3597                 }
3598         } else if (mode == FREAD) {
3599                 if (!(sp->rs_share_access & OPEN4_SHARE_ACCESS_READ)) {
3600                         /*
3601                          * If we have OPENed the file with DENYing access
3602                          * to both READ and WRITE then no one else could
3603                          * have OPENed the file, hence no conflicting READ
3604                          * deny.  This check is merely an optimization.
3605                          */
3606                         if (sp->rs_share_deny == OPEN4_SHARE_DENY_BOTH)
3607                                 goto out;
3608 
3609                         /* Check against file struct's DENY mode */
3610                         fp = rfs4_findfile(vp, NULL, &create);
3611                         if (fp != NULL) {
3612                                 int deny_read = 0;
3613                                 rfs4_dbe_lock(fp->rf_dbe);
3614                                 /*
3615                                  * Check if any other open owner has the file
3616                                  * OPENed with deny READ.
3617                                  */
3618                                 if (sp->rs_share_deny & OPEN4_SHARE_DENY_READ)
3619                                         deny_read = 1;
3620                                 ASSERT(fp->rf_deny_read >= deny_read);
3621                                 if (fp->rf_deny_read > deny_read)
3622                                         stat = NFS4ERR_OPENMODE;
3623                                 rfs4_dbe_unlock(fp->rf_dbe);
3624                                 rfs4_file_rele(fp);
3625                         }
3626                 }
3627         } else {
3628                 /* Illegal I/O mode */
3629                 stat = NFS4ERR_INVAL;
3630         }
3631 out:
3632         rfs4_dbe_unlock(sp->rs_dbe);
3633         return (stat);
3634 }
3635 
3636 /*
3637  * Given the I/O mode (FREAD or FWRITE), the vnode, the stateid and whether
3638  * the file is being truncated, return NFS4_OK if allowed or appropriate
3639  * V4 error if not. Note NFS4ERR_DELAY will be returned and a recall on
3640  * the associated file will be done if the I/O is not consistent with any
3641  * delegation in effect on the file. Should be holding VOP_RWLOCK, either
3642  * as reader or writer as appropriate. rfs4_op_open will acquire the
3643  * VOP_RWLOCK as writer when setting up delegation. If the stateid is bad
3644  * this routine will return NFS4ERR_BAD_STATEID. In addition, through the
3645  * deleg parameter, we will return whether a write delegation is held by
3646  * the client associated with this stateid.
3647  * If the server instance associated with the relevant client is in its
3648  * grace period, return NFS4ERR_GRACE.
3649  */
3650 
3651 nfsstat4
3652 rfs4_check_stateid(int mode, vnode_t *vp,
3653     stateid4 *stateid, bool_t trunc, bool_t *deleg,
3654     bool_t do_access, caller_context_t *ct)
3655 {
3656         rfs4_file_t *fp;
3657         bool_t create = FALSE;
3658         rfs4_state_t *sp;
3659         rfs4_deleg_state_t *dsp;
3660         rfs4_lo_state_t *lsp;
3661         stateid_t *id = (stateid_t *)stateid;
3662         nfsstat4 stat = NFS4_OK;
3663 
3664         if (ct != NULL) {
3665                 ct->cc_sysid = 0;
3666                 ct->cc_pid = 0;
3667                 ct->cc_caller_id = nfs4_srv_caller_id;
3668                 ct->cc_flags = CC_DONTBLOCK;
3669         }
3670 
3671         if (ISSPECIAL(stateid)) {
3672                 fp = rfs4_findfile(vp, NULL, &create);
3673                 if (fp == NULL)
3674                         return (NFS4_OK);
3675                 if (fp->rf_dinfo.rd_dtype == OPEN_DELEGATE_NONE) {
3676                         rfs4_file_rele(fp);
3677                         return (NFS4_OK);
3678                 }
3679                 if (mode == FWRITE ||
3680                     fp->rf_dinfo.rd_dtype == OPEN_DELEGATE_WRITE) {
3681                         rfs4_recall_deleg(fp, trunc, NULL);
3682                         rfs4_file_rele(fp);
3683                         return (NFS4ERR_DELAY);
3684                 }
3685                 rfs4_file_rele(fp);
3686                 return (NFS4_OK);
3687         } else {
3688                 stat = rfs4_get_all_state(stateid, &sp, &dsp, &lsp);
3689                 if (stat != NFS4_OK)
3690                         return (stat);
3691                 if (lsp != NULL) {
3692                         /* Is associated server instance in its grace period? */
3693                         if (rfs4_clnt_in_grace(lsp->rls_locker->rl_client)) {
3694                                 rfs4_lo_state_rele(lsp, FALSE);
3695                                 if (sp != NULL)
3696                                         rfs4_state_rele_nounlock(sp);
3697                                 return (NFS4ERR_GRACE);
3698                         }
3699                         if (id->bits.type == LOCKID) {
3700                                 /* Seqid in the future? - that's bad */
3701                                 if (lsp->rls_lockid.bits.chgseq <
3702                                     id->bits.chgseq) {
3703                                         rfs4_lo_state_rele(lsp, FALSE);
3704                                         if (sp != NULL)
3705                                                 rfs4_state_rele_nounlock(sp);
3706                                         return (NFS4ERR_BAD_STATEID);
3707                                 }
3708                                 /* Seqid in the past? - that's old */
3709                                 if (lsp->rls_lockid.bits.chgseq >
3710                                     id->bits.chgseq) {
3711                                         rfs4_lo_state_rele(lsp, FALSE);
3712                                         if (sp != NULL)
3713                                                 rfs4_state_rele_nounlock(sp);
3714                                         return (NFS4ERR_OLD_STATEID);
3715                                 }
3716                                 /* Ensure specified filehandle matches */
3717                                 if (lsp->rls_state->rs_finfo->rf_vp != vp) {
3718                                         rfs4_lo_state_rele(lsp, FALSE);
3719                                         if (sp != NULL)
3720                                                 rfs4_state_rele_nounlock(sp);
3721                                         return (NFS4ERR_BAD_STATEID);
3722                                 }
3723                         }
3724                         if (ct != NULL) {
3725                                 ct->cc_sysid =
3726                                     lsp->rls_locker->rl_client->rc_sysidt;
3727                                 ct->cc_pid = lsp->rls_locker->rl_pid;
3728                         }
3729                         rfs4_lo_state_rele(lsp, FALSE);
3730                 }
3731 
3732                 /* Stateid provided was an "open" stateid */
3733                 if (sp != NULL) {
3734                         /* Is associated server instance in its grace period? */
3735                         if (rfs4_clnt_in_grace(sp->rs_owner->ro_client)) {
3736                                 rfs4_state_rele_nounlock(sp);
3737                                 return (NFS4ERR_GRACE);
3738                         }
3739                         if (id->bits.type == OPENID) {
3740                                 /* Seqid in the future? - that's bad */
3741                                 if (sp->rs_stateid.bits.chgseq <
3742                                     id->bits.chgseq) {
3743                                         rfs4_state_rele_nounlock(sp);
3744                                         return (NFS4ERR_BAD_STATEID);
3745                                 }
3746                                 /* Seqid in the past - that's old */
3747                                 if (sp->rs_stateid.bits.chgseq >
3748                                     id->bits.chgseq) {
3749                                         rfs4_state_rele_nounlock(sp);
3750                                         return (NFS4ERR_OLD_STATEID);
3751                                 }
3752                         }
3753                         /* Ensure specified filehandle matches */
3754                         if (sp->rs_finfo->rf_vp != vp) {
3755                                 rfs4_state_rele_nounlock(sp);
3756                                 return (NFS4ERR_BAD_STATEID);
3757                         }
3758 
3759                         if (sp->rs_owner->ro_need_confirm) {
3760                                 rfs4_state_rele_nounlock(sp);
3761                                 return (NFS4ERR_BAD_STATEID);
3762                         }
3763 
3764                         if (sp->rs_closed == TRUE) {
3765                                 rfs4_state_rele_nounlock(sp);
3766                                 return (NFS4ERR_OLD_STATEID);
3767                         }
3768 
3769                         if (do_access)
3770                                 stat = rfs4_state_has_access(sp, mode, vp);
3771                         else
3772                                 stat = NFS4_OK;
3773 
3774                         /*
3775                          * Return whether this state has write
3776                          * delegation if desired
3777                          */
3778                         if (deleg && (sp->rs_finfo->rf_dinfo.rd_dtype ==
3779                             OPEN_DELEGATE_WRITE))
3780                                 *deleg = TRUE;
3781 
3782                         /*
3783                          * We got a valid stateid, so we update the
3784                          * lease on the client. Ideally we would like
3785                          * to do this after the calling op succeeds,
3786                          * but for now this will be good
3787                          * enough. Callers of this routine are
3788                          * currently insulated from the state stuff.
3789                          */
3790                         rfs4_update_lease(sp->rs_owner->ro_client);
3791 
3792                         /*
3793                          * If a delegation is present on this file and
3794                          * this is a WRITE, then update the lastwrite
3795                          * time to indicate that activity is present.
3796                          */
3797                         if (sp->rs_finfo->rf_dinfo.rd_dtype ==
3798                             OPEN_DELEGATE_WRITE &&
3799                             mode == FWRITE) {
3800                                 sp->rs_finfo->rf_dinfo.rd_time_lastwrite =
3801                                     gethrestime_sec();
3802                         }
3803 
3804                         rfs4_state_rele_nounlock(sp);
3805 
3806                         return (stat);
3807                 }
3808 
3809                 if (dsp != NULL) {
3810                         /* Is associated server instance in its grace period? */
3811                         if (rfs4_clnt_in_grace(dsp->rds_client)) {
3812                                 rfs4_deleg_state_rele(dsp);
3813                                 return (NFS4ERR_GRACE);
3814                         }
3815                         if (dsp->rds_delegid.bits.chgseq != id->bits.chgseq) {
3816                                 rfs4_deleg_state_rele(dsp);
3817                                 return (NFS4ERR_BAD_STATEID);
3818                         }
3819 
3820                         /* Ensure specified filehandle matches */
3821                         if (dsp->rds_finfo->rf_vp != vp) {
3822                                 rfs4_deleg_state_rele(dsp);
3823                                 return (NFS4ERR_BAD_STATEID);
3824                         }
3825                         /*
3826                          * Return whether this state has write
3827                          * delegation if desired
3828                          */
3829                         if (deleg && (dsp->rds_finfo->rf_dinfo.rd_dtype ==
3830                             OPEN_DELEGATE_WRITE))
3831                                 *deleg = TRUE;
3832 
3833                         rfs4_update_lease(dsp->rds_client);
3834 
3835                         /*
3836                          * If a delegation is present on this file and
3837                          * this is a WRITE, then update the lastwrite
3838                          * time to indicate that activity is present.
3839                          */
3840                         if (dsp->rds_finfo->rf_dinfo.rd_dtype ==
3841                             OPEN_DELEGATE_WRITE && mode == FWRITE) {
3842                                 dsp->rds_finfo->rf_dinfo.rd_time_lastwrite =
3843                                     gethrestime_sec();
3844                         }
3845 
3846                         /*
3847                          * XXX - what happens if this is a WRITE and the
3848                          * delegation type of for READ.
3849                          */
3850                         rfs4_deleg_state_rele(dsp);
3851 
3852                         return (stat);
3853                 }
3854                 /*
3855                  * If we got this far, something bad happened
3856                  */
3857                 return (NFS4ERR_BAD_STATEID);
3858         }
3859 }
3860 
3861 
3862 /*
3863  * This is a special function in that for the file struct provided the
3864  * server wants to remove/close all current state associated with the
3865  * file.  The prime use of this would be with OP_REMOVE to force the
3866  * release of state and particularly of file locks.
3867  *
3868  * There is an assumption that there is no delegations outstanding on
3869  * this file at this point.  The caller should have waited for those
3870  * to be returned or revoked.
3871  */
3872 void
3873 rfs4_close_all_state(rfs4_file_t *fp)
3874 {
3875         rfs4_state_t *sp;
3876 
3877         rfs4_dbe_lock(fp->rf_dbe);
3878 
3879 #ifdef DEBUG
3880         /* only applies when server is handing out delegations */
3881         if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE)
3882                 ASSERT(fp->rf_dinfo.rd_hold_grant > 0);
3883 #endif
3884 
3885         /* No delegations for this file */
3886         ASSERT(list_is_empty(&fp->rf_delegstatelist));
3887 
3888         /* Make sure that it can not be found */
3889         rfs4_dbe_invalidate(fp->rf_dbe);
3890 
3891         if (fp->rf_vp == NULL) {
3892                 rfs4_dbe_unlock(fp->rf_dbe);
3893                 return;
3894         }
3895         rfs4_dbe_unlock(fp->rf_dbe);
3896 
3897         /*
3898          * Hold as writer to prevent other server threads from
3899          * processing requests related to the file while all state is
3900          * being removed.
3901          */
3902         rw_enter(&fp->rf_file_rwlock, RW_WRITER);
3903 
3904         /* Remove ALL state from the file */
3905         while (sp = rfs4_findstate_by_file(fp)) {
3906                 rfs4_state_close(sp, FALSE, FALSE, CRED());
3907                 rfs4_state_rele_nounlock(sp);
3908         }
3909 
3910         /*
3911          * This is only safe since there are no further references to
3912          * the file.
3913          */
3914         rfs4_dbe_lock(fp->rf_dbe);
3915         if (fp->rf_vp) {
3916                 vnode_t *vp = fp->rf_vp;
3917 
3918                 mutex_enter(&vp->v_vsd_lock);
3919                 (void) vsd_set(vp, nfs4_srv_vkey, NULL);
3920                 mutex_exit(&vp->v_vsd_lock);
3921                 VN_RELE(vp);
3922                 fp->rf_vp = NULL;
3923         }
3924         rfs4_dbe_unlock(fp->rf_dbe);
3925 
3926         /* Finally let other references to proceed */
3927         rw_exit(&fp->rf_file_rwlock);
3928 }
3929 
3930 /*
3931  * This function is used as a target for the rfs4_dbe_walk() call
3932  * below.  The purpose of this function is to see if the
3933  * lockowner_state refers to a file that resides within the exportinfo
3934  * export.  If so, then remove the lock_owner state (file locks and
3935  * share "locks") for this object since the intent is the server is
3936  * unexporting the specified directory.  Be sure to invalidate the
3937  * object after the state has been released
3938  */
3939 static void
3940 rfs4_lo_state_walk_callout(rfs4_entry_t u_entry, void *e)
3941 {
3942         rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
3943         struct exportinfo *exi = (struct exportinfo *)e;
3944         nfs_fh4_fmt_t   fhfmt4, *exi_fhp, *finfo_fhp;
3945         fhandle_t *efhp;
3946 
3947         efhp = (fhandle_t *)&exi->exi_fh;
3948         exi_fhp = (nfs_fh4_fmt_t *)&fhfmt4;
3949 
3950         FH_TO_FMT4(efhp, exi_fhp);
3951 
3952         finfo_fhp = (nfs_fh4_fmt_t *)lsp->rls_state->rs_finfo->
3953             rf_filehandle.nfs_fh4_val;
3954 
3955         if (EQFSID(&finfo_fhp->fh4_fsid, &exi_fhp->fh4_fsid) &&
3956             bcmp(&finfo_fhp->fh4_xdata, &exi_fhp->fh4_xdata,
3957             exi_fhp->fh4_xlen) == 0) {
3958                 rfs4_state_close(lsp->rls_state, FALSE, FALSE, CRED());
3959                 rfs4_dbe_invalidate(lsp->rls_dbe);
3960                 rfs4_dbe_invalidate(lsp->rls_state->rs_dbe);
3961         }
3962 }
3963 
3964 /*
3965  * This function is used as a target for the rfs4_dbe_walk() call
3966  * below.  The purpose of this function is to see if the state refers
3967  * to a file that resides within the exportinfo export.  If so, then
3968  * remove the open state for this object since the intent is the
3969  * server is unexporting the specified directory.  The main result for
3970  * this type of entry is to invalidate it such it will not be found in
3971  * the future.
3972  */
3973 static void
3974 rfs4_state_walk_callout(rfs4_entry_t u_entry, void *e)
3975 {
3976         rfs4_state_t *sp = (rfs4_state_t *)u_entry;
3977         struct exportinfo *exi = (struct exportinfo *)e;
3978         nfs_fh4_fmt_t   fhfmt4, *exi_fhp, *finfo_fhp;
3979         fhandle_t *efhp;
3980 
3981         efhp = (fhandle_t *)&exi->exi_fh;
3982         exi_fhp = (nfs_fh4_fmt_t *)&fhfmt4;
3983 
3984         FH_TO_FMT4(efhp, exi_fhp);
3985 
3986         finfo_fhp =
3987             (nfs_fh4_fmt_t *)sp->rs_finfo->rf_filehandle.nfs_fh4_val;
3988 
3989         if (EQFSID(&finfo_fhp->fh4_fsid, &exi_fhp->fh4_fsid) &&
3990             bcmp(&finfo_fhp->fh4_xdata, &exi_fhp->fh4_xdata,
3991             exi_fhp->fh4_xlen) == 0) {
3992                 rfs4_state_close(sp, TRUE, FALSE, CRED());
3993                 rfs4_dbe_invalidate(sp->rs_dbe);
3994         }
3995 }
3996 
3997 /*
3998  * This function is used as a target for the rfs4_dbe_walk() call
3999  * below.  The purpose of this function is to see if the state refers
4000  * to a file that resides within the exportinfo export.  If so, then
4001  * remove the deleg state for this object since the intent is the
4002  * server is unexporting the specified directory.  The main result for
4003  * this type of entry is to invalidate it such it will not be found in
4004  * the future.
4005  */
4006 static void
4007 rfs4_deleg_state_walk_callout(rfs4_entry_t u_entry, void *e)
4008 {
4009         rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
4010         struct exportinfo *exi = (struct exportinfo *)e;
4011         nfs_fh4_fmt_t   fhfmt4, *exi_fhp, *finfo_fhp;
4012         fhandle_t *efhp;
4013 
4014         efhp = (fhandle_t *)&exi->exi_fh;
4015         exi_fhp = (nfs_fh4_fmt_t *)&fhfmt4;
4016 
4017         FH_TO_FMT4(efhp, exi_fhp);
4018 
4019         finfo_fhp =
4020             (nfs_fh4_fmt_t *)dsp->rds_finfo->rf_filehandle.nfs_fh4_val;
4021 
4022         if (EQFSID(&finfo_fhp->fh4_fsid, &exi_fhp->fh4_fsid) &&
4023             bcmp(&finfo_fhp->fh4_xdata, &exi_fhp->fh4_xdata,
4024             exi_fhp->fh4_xlen) == 0) {
4025                 rfs4_dbe_invalidate(dsp->rds_dbe);
4026         }
4027 }
4028 
4029 /*
4030  * This function is used as a target for the rfs4_dbe_walk() call
4031  * below.  The purpose of this function is to see if the state refers
4032  * to a file that resides within the exportinfo export.  If so, then
4033  * release vnode hold for this object since the intent is the server
4034  * is unexporting the specified directory.  Invalidation will prevent
4035  * this struct from being found in the future.
4036  */
4037 static void
4038 rfs4_file_walk_callout(rfs4_entry_t u_entry, void *e)
4039 {
4040         rfs4_file_t *fp = (rfs4_file_t *)u_entry;
4041         struct exportinfo *exi = (struct exportinfo *)e;
4042         nfs_fh4_fmt_t   fhfmt4, *exi_fhp, *finfo_fhp;
4043         fhandle_t *efhp;
4044 
4045         efhp = (fhandle_t *)&exi->exi_fh;
4046         exi_fhp = (nfs_fh4_fmt_t *)&fhfmt4;
4047 
4048         FH_TO_FMT4(efhp, exi_fhp);
4049 
4050         finfo_fhp = (nfs_fh4_fmt_t *)fp->rf_filehandle.nfs_fh4_val;
4051 
4052         if (EQFSID(&finfo_fhp->fh4_fsid, &exi_fhp->fh4_fsid) &&
4053             bcmp(&finfo_fhp->fh4_xdata, &exi_fhp->fh4_xdata,
4054             exi_fhp->fh4_xlen) == 0) {
4055                 if (fp->rf_vp) {
4056                         vnode_t *vp = fp->rf_vp;
4057 
4058                         /*
4059                          * don't leak monitors and remove the reference
4060                          * put on the vnode when the delegation was granted.
4061                          */
4062                         if (fp->rf_dinfo.rd_dtype == OPEN_DELEGATE_READ) {
4063                                 (void) fem_uninstall(vp, deleg_rdops,
4064                                     (void *)fp);
4065                                 vn_open_downgrade(vp, FREAD);
4066                         } else if (fp->rf_dinfo.rd_dtype ==
4067                             OPEN_DELEGATE_WRITE) {
4068                                 (void) fem_uninstall(vp, deleg_wrops,
4069                                     (void *)fp);
4070                                 vn_open_downgrade(vp, FREAD|FWRITE);
4071                         }
4072                         mutex_enter(&vp->v_vsd_lock);
4073                         (void) vsd_set(vp, nfs4_srv_vkey, NULL);
4074                         mutex_exit(&vp->v_vsd_lock);
4075                         VN_RELE(vp);
4076                         fp->rf_vp = NULL;
4077                 }
4078                 rfs4_dbe_invalidate(fp->rf_dbe);
4079         }
4080 }
4081 
4082 /*
4083  * Given a directory that is being unexported, cleanup/release all
4084  * state in the server that refers to objects residing underneath this
4085  * particular export.  The ordering of the release is important.
4086  * Lock_owner, then state and then file.
4087  */
4088 void
4089 rfs4_clean_state_exi(struct exportinfo *exi)
4090 {
4091         nfs4_srv_t *nsrv4;
4092 
4093         nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
4094         mutex_enter(&nsrv4->state_lock);
4095 
4096         if (nsrv4->nfs4_server_state == NULL) {
4097                 mutex_exit(&nsrv4->state_lock);
4098                 return;
4099         }
4100 
4101         /* CSTYLED */
4102         rfs4_dbe_walk(nsrv4->rfs4_lo_state_tab, rfs4_lo_state_walk_callout, exi);
4103         rfs4_dbe_walk(nsrv4->rfs4_state_tab, rfs4_state_walk_callout, exi);
4104         /* CSTYLED */
4105         rfs4_dbe_walk(nsrv4->rfs4_deleg_state_tab, rfs4_deleg_state_walk_callout, exi);
4106         rfs4_dbe_walk(nsrv4->rfs4_file_tab, rfs4_file_walk_callout, exi);
4107 
4108         mutex_exit(&nsrv4->state_lock);
4109 }