1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*
  28  * Copyright 2018 Nexenta Systems, Inc.
  29  */
  30 
  31 #include <sys/systm.h>
  32 #include <sys/sdt.h>
  33 #include <rpc/types.h>
  34 #include <rpc/auth.h>
  35 #include <rpc/auth_unix.h>
  36 #include <rpc/auth_des.h>
  37 #include <rpc/svc.h>
  38 #include <rpc/xdr.h>
  39 #include <nfs/nfs4.h>
  40 #include <nfs/nfs_dispatch.h>
  41 #include <nfs/nfs4_drc.h>
  42 
/*
 * Highest NFSv4 minor version accepted by the version check in this file:
 * rfs4_minorvers_mismatch() rejects any COMPOUND whose minorversion is
 * greater than this value.
 */
#define NFS4_MAX_MINOR_VERSION  0

/*
 * The default size of the duplicate request cache (DRC).
 *
 * NOTE(review): presumably handed to rfs4_init_drc() as drc_size, i.e. the
 * maximum number of cached entries -- the caller is not in this file, so
 * confirm there.
 */
uint32_t nfs4_drc_max = 8 * 1024;

/*
 * The number of buckets we'd like to hash the replies into.
 * Do not change this on the fly.
 *
 * NOTE(review): presumably handed to rfs4_init_drc() as drc_hash_size --
 * confirm against the caller.
 */
uint32_t nfs4_drc_hash = 541;

/*
 * Sends an error-only COMPOUND reply for a request the duplicate request
 * cache could not accommodate; defined at the end of this file.
 */
static void rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp);

/*
 * Zone key passed to zone_getspecific() below to look up the nfs4_srv_t
 * (and through it the DRC) of the current zone.  Defined elsewhere.
 */
extern zone_key_t rfs4_zone_key;
  59 
  60 /*
  61  * Initialize a duplicate request cache.
  62  */
  63 rfs4_drc_t *
  64 rfs4_init_drc(uint32_t drc_size, uint32_t drc_hash_size)
  65 {
  66         rfs4_drc_t *drc;
  67         uint32_t   bki;
  68 
  69         ASSERT(drc_size);
  70         ASSERT(drc_hash_size);
  71 
  72         drc = kmem_alloc(sizeof (rfs4_drc_t), KM_SLEEP);
  73 
  74         drc->max_size = drc_size;
  75         drc->in_use = 0;
  76 
  77         mutex_init(&drc->lock, NULL, MUTEX_DEFAULT, NULL);
  78 
  79         drc->dr_hash = drc_hash_size;
  80 
  81         drc->dr_buckets = kmem_alloc(sizeof (list_t)*drc_hash_size, KM_SLEEP);
  82 
  83         for (bki = 0; bki < drc_hash_size; bki++) {
  84                 list_create(&drc->dr_buckets[bki], sizeof (rfs4_dupreq_t),
  85                     offsetof(rfs4_dupreq_t, dr_bkt_next));
  86         }
  87 
  88         list_create(&(drc->dr_cache), sizeof (rfs4_dupreq_t),
  89             offsetof(rfs4_dupreq_t, dr_next));
  90 
  91         return (drc);
  92 }
  93 
  94 /*
  95  * Destroy a duplicate request cache.
  96  */
  97 void
  98 rfs4_fini_drc(void)
  99 {
 100         nfs4_srv_t *nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
 101         rfs4_drc_t *drc = nsrv4->nfs4_drc;
 102         rfs4_dupreq_t *drp, *drp_next;
 103 
 104         /* iterate over the dr_cache and free the enties */
 105         for (drp = list_head(&(drc->dr_cache)); drp != NULL; drp = drp_next) {
 106 
 107                 if (drp->dr_state == NFS4_DUP_REPLAY)
 108                         rfs4_compound_free(&(drp->dr_res));
 109 
 110                 if (drp->dr_addr.buf != NULL)
 111                         kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
 112 
 113                 drp_next = list_next(&(drc->dr_cache), drp);
 114 
 115                 kmem_free(drp, sizeof (rfs4_dupreq_t));
 116         }
 117 
 118         mutex_destroy(&drc->lock);
 119         kmem_free(drc->dr_buckets,
 120             sizeof (list_t)*drc->dr_hash);
 121         kmem_free(drc, sizeof (rfs4_drc_t));
 122 }
 123 
 124 /*
 125  * rfs4_dr_chstate:
 126  *
 127  * Change the state of a rfs4_dupreq. If it's not in transition
 128  * to the FREE state, return. If we are moving to the FREE state
 129  * then we need to clean up the compound results and move the entry
 130  * to the end of the list.
 131  */
 132 void
 133 rfs4_dr_chstate(rfs4_dupreq_t *drp, int new_state)
 134 {
 135         rfs4_drc_t *drc;
 136 
 137         ASSERT(drp);
 138         ASSERT(drp->drc);
 139         ASSERT(drp->dr_bkt);
 140         ASSERT(MUTEX_HELD(&drp->drc->lock));
 141 
 142         drp->dr_state = new_state;
 143 
 144         if (new_state != NFS4_DUP_FREE)
 145                 return;
 146 
 147         drc = drp->drc;
 148 
 149         /*
 150          * Remove entry from the bucket and
 151          * dr_cache list, free compound results.
 152          */
 153         list_remove(drp->dr_bkt, drp);
 154         list_remove(&(drc->dr_cache), drp);
 155         rfs4_compound_free(&(drp->dr_res));
 156 }
 157 
 158 /*
 159  * rfs4_alloc_dr:
 160  *
 161  * Malloc a new one if we have not reached our maximum cache
 162  * limit, otherwise pick an entry off the tail -- Use if it
 163  * is marked as NFS4_DUP_FREE, or is an entry in the
 164  * NFS4_DUP_REPLAY state.
 165  */
 166 rfs4_dupreq_t *
 167 rfs4_alloc_dr(rfs4_drc_t *drc)
 168 {
 169         rfs4_dupreq_t *drp_tail, *drp = NULL;
 170 
 171         ASSERT(drc);
 172         ASSERT(MUTEX_HELD(&drc->lock));
 173 
 174         /*
 175          * Have we hit the cache limit yet ?
 176          */
 177         if (drc->in_use < drc->max_size) {
 178                 /*
 179                  * nope, so let's malloc a new one
 180                  */
 181                 drp = kmem_zalloc(sizeof (rfs4_dupreq_t), KM_SLEEP);
 182                 drp->drc = drc;
 183                 drc->in_use++;
 184                 DTRACE_PROBE1(nfss__i__drc_new, rfs4_dupreq_t *, drp);
 185                 return (drp);
 186         }
 187 
 188         /*
 189          * Cache is all allocated now traverse the list
 190          * backwards to find one we can reuse.
 191          */
 192         for (drp_tail = list_tail(&drc->dr_cache); drp_tail != NULL;
 193             drp_tail = list_prev(&drc->dr_cache, drp_tail)) {
 194 
 195                 switch (drp_tail->dr_state) {
 196 
 197                 case NFS4_DUP_FREE:
 198                         list_remove(&(drc->dr_cache), drp_tail);
 199                         DTRACE_PROBE1(nfss__i__drc_freeclaim,
 200                             rfs4_dupreq_t *, drp_tail);
 201                         return (drp_tail);
 202                         /* NOTREACHED */
 203 
 204                 case NFS4_DUP_REPLAY:
 205                         /* grab it. */
 206                         rfs4_dr_chstate(drp_tail, NFS4_DUP_FREE);
 207                         DTRACE_PROBE1(nfss__i__drc_replayclaim,
 208                             rfs4_dupreq_t *, drp_tail);
 209                         return (drp_tail);
 210                         /* NOTREACHED */
 211                 }
 212         }
 213         DTRACE_PROBE1(nfss__i__drc_full, rfs4_drc_t *, drc);
 214         return (NULL);
 215 }
 216 
 217 /*
 218  * rfs4_find_dr:
 219  *
 220  * Search for an entry in the duplicate request cache by
 221  * calculating the hash index based on the XID, and examining
 222  * the entries in the hash bucket. If we find a match, return.
 223  * Once we have searched the bucket we call rfs4_alloc_dr() to
 224  * allocate a new entry, or reuse one that is available.
 225  */
 226 int
 227 rfs4_find_dr(struct svc_req *req, rfs4_drc_t *drc, rfs4_dupreq_t **dup)
 228 {
 229 
 230         uint32_t        the_xid;
 231         list_t          *dr_bkt;
 232         rfs4_dupreq_t   *drp;
 233         int             bktdex;
 234 
 235         /*
 236          * Get the XID, calculate the bucket and search to
 237          * see if we need to replay from the cache.
 238          */
 239         the_xid = req->rq_xprt->xp_xid;
 240         bktdex = the_xid % drc->dr_hash;
 241 
 242         dr_bkt = (list_t *)
 243             &(drc->dr_buckets[(the_xid % drc->dr_hash)]);
 244 
 245         DTRACE_PROBE3(nfss__i__drc_bktdex,
 246             int, bktdex,
 247             uint32_t, the_xid,
 248             list_t *, dr_bkt);
 249 
 250         *dup = NULL;
 251 
 252         mutex_enter(&drc->lock);
 253         /*
 254          * Search the bucket for a matching xid and address.
 255          */
 256         for (drp = list_head(dr_bkt); drp != NULL;
 257             drp = list_next(dr_bkt, drp)) {
 258 
 259                 if (drp->dr_xid == the_xid &&
 260                     drp->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
 261                     bcmp((caddr_t)drp->dr_addr.buf,
 262                     (caddr_t)req->rq_xprt->xp_rtaddr.buf,
 263                     drp->dr_addr.len) == 0) {
 264 
 265                         /*
 266                          * Found a match so REPLAY the Reply
 267                          */
 268                         if (drp->dr_state == NFS4_DUP_REPLAY) {
 269                                 rfs4_dr_chstate(drp, NFS4_DUP_INUSE);
 270                                 mutex_exit(&drc->lock);
 271                                 *dup = drp;
 272                                 DTRACE_PROBE1(nfss__i__drc_replay,
 273                                     rfs4_dupreq_t *, drp);
 274                                 return (NFS4_DUP_REPLAY);
 275                         }
 276 
 277                         /*
 278                          * This entry must be in transition, so return
 279                          * the 'pending' status.
 280                          */
 281                         mutex_exit(&drc->lock);
 282                         return (NFS4_DUP_PENDING);
 283                 }
 284         }
 285 
 286         drp = rfs4_alloc_dr(drc);
 287         mutex_exit(&drc->lock);
 288 
 289         /*
 290          * The DRC is full and all entries are in use. Upper function
 291          * should error out this request and force the client to
 292          * retransmit -- effectively this is a resource issue. NFSD
 293          * threads tied up with native File System, or the cache size
 294          * is too small for the server load.
 295          */
 296         if (drp == NULL)
 297                 return (NFS4_DUP_ERROR);
 298 
 299         /*
 300          * Init the state to NEW.
 301          */
 302         drp->dr_state = NFS4_DUP_NEW;
 303 
 304         /*
 305          * If needed, resize the address buffer
 306          */
 307         if (drp->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
 308                 if (drp->dr_addr.buf != NULL)
 309                         kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
 310                 drp->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
 311                 drp->dr_addr.buf = kmem_alloc(drp->dr_addr.maxlen, KM_NOSLEEP);
 312                 if (drp->dr_addr.buf == NULL) {
 313                         /*
 314                          * If the malloc fails, mark the entry
 315                          * as free and put on the tail.
 316                          */
 317                         drp->dr_addr.maxlen = 0;
 318                         drp->dr_state = NFS4_DUP_FREE;
 319                         mutex_enter(&drc->lock);
 320                         list_insert_tail(&(drc->dr_cache), drp);
 321                         mutex_exit(&drc->lock);
 322                         return (NFS4_DUP_ERROR);
 323                 }
 324         }
 325 
 326 
 327         /*
 328          * Copy the address.
 329          */
 330         drp->dr_addr.len = req->rq_xprt->xp_rtaddr.len;
 331 
 332         bcopy((caddr_t)req->rq_xprt->xp_rtaddr.buf,
 333             (caddr_t)drp->dr_addr.buf,
 334             drp->dr_addr.len);
 335 
 336         drp->dr_xid = the_xid;
 337         drp->dr_bkt = dr_bkt;
 338 
 339         /*
 340          * Insert at the head of the bucket and
 341          * the drc lists..
 342          */
 343         mutex_enter(&drc->lock);
 344         list_insert_head(&drc->dr_cache, drp);
 345         list_insert_head(dr_bkt, drp);
 346         mutex_exit(&drc->lock);
 347 
 348         *dup = drp;
 349 
 350         return (NFS4_DUP_NEW);
 351 }
 352 
 353 /*
 354  *
 355  * This function handles the duplicate request cache,
 356  * NULL_PROC and COMPOUND procedure calls for NFSv4;
 357  *
 358  * Passed into this function are:-
 359  *
 360  *      disp    A pointer to our dispatch table entry
 361  *      req     The request to process
 362  *      xprt    The server transport handle
 363  *      ap      A pointer to the arguments
 364  *      rlen    A pointer to the reply length (output)
 365  *
 366  *
 367  * When appropriate this function is responsible for inserting
 368  * the reply into the duplicate cache or replaying an existing
 369  * cached reply.
 370  *
 371  * dr_stat      reflects the state of the duplicate request that
 372  *              has been inserted into or retrieved from the cache
 373  *
 374  * drp          is the duplicate request entry
 375  *
 376  */
 377 int
 378 rfs4_dispatch(struct rpcdisp *disp, struct svc_req *req,
 379     SVCXPRT *xprt, char *ap, size_t *rlen)
 380 {
 381 
 382         COMPOUND4res     res_buf;
 383         COMPOUND4res    *rbp;
 384         COMPOUND4args   *cap;
 385         cred_t          *cr = NULL;
 386         int              error = 0;
 387         int              dis_flags = 0;
 388         int              dr_stat = NFS4_NOT_DUP;
 389         rfs4_dupreq_t   *drp = NULL;
 390         int              rv;
 391         nfs4_srv_t *nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
 392         rfs4_drc_t *nfs4_drc = nsrv4->nfs4_drc;
 393 
 394         ASSERT(disp);
 395 
 396         /*
 397          * Short circuit the RPC_NULL proc.
 398          */
 399         if (disp->dis_proc == rpc_null) {
 400                 DTRACE_NFSV4_1(null__start, struct svc_req *, req);
 401                 if (!svc_sendreply(xprt, xdr_void, NULL)) {
 402                         DTRACE_NFSV4_1(null__done, struct svc_req *, req);
 403                         svcerr_systemerr(xprt);
 404                         return (1);
 405                 }
 406                 DTRACE_NFSV4_1(null__done, struct svc_req *, req);
 407                 *rlen = xdr_sizeof(xdr_void, NULL);
 408                 return (0);
 409         }
 410 
 411         /* Only NFSv4 Compounds from this point onward */
 412 
 413         rbp = &res_buf;
 414         cap = (COMPOUND4args *)ap;
 415 
 416         /*
 417          * Update kstats
 418          */
 419         rfs4_compound_kstat_args(cap);
 420 
 421         /*
 422          * Figure out the disposition of the whole COMPOUND
 423          * and record it's IDEMPOTENTCY.
 424          */
 425         rfs4_compound_flagproc(cap, &dis_flags);
 426 
 427         /*
 428          * If NON-IDEMPOTENT then we need to figure out if this
 429          * request can be replied from the duplicate cache.
 430          *
 431          * If this is a new request then we need to insert the
 432          * reply into the duplicate cache.
 433          */
 434         if (!(dis_flags & RPC_IDEMPOTENT)) {
 435                 /* look for a replay from the cache or allocate */
 436                 dr_stat = rfs4_find_dr(req, nfs4_drc, &drp);
 437 
 438                 switch (dr_stat) {
 439 
 440                 case NFS4_DUP_ERROR:
 441                         rfs4_resource_err(req, cap);
 442                         return (1);
 443                         /* NOTREACHED */
 444 
 445                 case NFS4_DUP_PENDING:
 446                         /*
 447                          * reply has previously been inserted into the
 448                          * duplicate cache, however the reply has
 449                          * not yet been sent via svc_sendreply()
 450                          */
 451                         return (1);
 452                         /* NOTREACHED */
 453 
 454                 case NFS4_DUP_NEW:
 455                         curthread->t_flag |= T_DONTPEND;
 456                         /* NON-IDEMPOTENT proc call */
 457                         rfs4_compound(cap, rbp, NULL, req, cr, &rv);
 458                         curthread->t_flag &= ~T_DONTPEND;
 459 
 460                         if (rv)         /* short ckt sendreply on error */
 461                                 return (rv);
 462 
 463                         /*
 464                          * dr_res must be initialized before calling
 465                          * rfs4_dr_chstate (it frees the reply).
 466                          */
 467                         drp->dr_res = res_buf;
 468                         if (curthread->t_flag & T_WOULDBLOCK) {
 469                                 curthread->t_flag &= ~T_WOULDBLOCK;
 470                                 /*
 471                                  * mark this entry as FREE and plop
 472                                  * on the end of the cache list
 473                                  */
 474                                 mutex_enter(&drp->drc->lock);
 475                                 rfs4_dr_chstate(drp, NFS4_DUP_FREE);
 476                                 list_insert_tail(&(drp->drc->dr_cache), drp);
 477                                 mutex_exit(&drp->drc->lock);
 478                                 return (1);
 479                         }
 480                         break;
 481 
 482                 case NFS4_DUP_REPLAY:
 483                         /* replay from the cache */
 484                         rbp = &(drp->dr_res);
 485                         break;
 486                 }
 487         } else {
 488                 curthread->t_flag |= T_DONTPEND;
 489                 /* IDEMPOTENT proc call */
 490                 rfs4_compound(cap, rbp, NULL, req, cr, &rv);
 491                 curthread->t_flag &= ~T_DONTPEND;
 492 
 493                 if (rv)         /* short ckt sendreply on error */
 494                         return (rv);
 495 
 496                 if (curthread->t_flag & T_WOULDBLOCK) {
 497                         curthread->t_flag &= ~T_WOULDBLOCK;
 498                         return (1);
 499                 }
 500         }
 501 
 502         /*
 503          * Send out the replayed reply or the 'real' one.
 504          */
 505         if (!svc_sendreply(xprt, xdr_COMPOUND4res_srv, (char *)rbp)) {
 506                 DTRACE_PROBE2(nfss__e__dispatch_sendfail,
 507                     struct svc_req *, xprt,
 508                     char *, rbp);
 509                 svcerr_systemerr(xprt);
 510                 error++;
 511         } else {
 512                 /*
 513                  * Update kstats
 514                  */
 515                 rfs4_compound_kstat_res(rbp);
 516                 *rlen = xdr_sizeof(xdr_COMPOUND4res_srv, rbp);
 517         }
 518 
 519         /*
 520          * If this reply was just inserted into the duplicate cache
 521          * or it was replayed from the dup cache; (re)mark it as
 522          * available for replay
 523          *
 524          * At first glance, this 'if' statement seems a little strange;
 525          * testing for NFS4_DUP_REPLAY, and then calling...
 526          *
 527          *      rfs4_dr_chatate(NFS4_DUP_REPLAY)
 528          *
 529          * ... but notice that we are checking dr_stat, and not the
 530          * state of the entry itself, the entry will be NFS4_DUP_INUSE,
 531          * we do that so that we know not to prematurely reap it whilst
 532          * we resent it to the client.
 533          *
 534          */
 535         if (dr_stat == NFS4_DUP_NEW || dr_stat == NFS4_DUP_REPLAY) {
 536                 mutex_enter(&drp->drc->lock);
 537                 rfs4_dr_chstate(drp, NFS4_DUP_REPLAY);
 538                 mutex_exit(&drp->drc->lock);
 539         } else if (dr_stat == NFS4_NOT_DUP) {
 540                 rfs4_compound_free(rbp);
 541         }
 542 
 543         return (error);
 544 }
 545 
 546 bool_t
 547 rfs4_minorvers_mismatch(struct svc_req *req, SVCXPRT *xprt, void *args)
 548 {
 549         COMPOUND4args *argsp;
 550         COMPOUND4res res_buf, *resp;
 551 
 552         if (req->rq_vers != 4)
 553                 return (FALSE);
 554 
 555         argsp = (COMPOUND4args *)args;
 556 
 557         if (argsp->minorversion <= NFS4_MAX_MINOR_VERSION)
 558                 return (FALSE);
 559 
 560         resp = &res_buf;
 561 
 562         /*
 563          * Form a reply tag by copying over the reqeuest tag.
 564          */
 565         resp->tag.utf8string_val =
 566             kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
 567         resp->tag.utf8string_len = argsp->tag.utf8string_len;
 568         bcopy(argsp->tag.utf8string_val, resp->tag.utf8string_val,
 569             resp->tag.utf8string_len);
 570         resp->array_len = 0;
 571         resp->array = NULL;
 572         resp->status = NFS4ERR_MINOR_VERS_MISMATCH;
 573         if (!svc_sendreply(xprt,  xdr_COMPOUND4res_srv, (char *)resp)) {
 574                 DTRACE_PROBE2(nfss__e__minorvers_mismatch,
 575                     SVCXPRT *, xprt, char *, resp);
 576                 svcerr_systemerr(xprt);
 577         }
 578         rfs4_compound_free(resp);
 579         return (TRUE);
 580 }
 581 
 582 void
 583 rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp)
 584 {
 585         COMPOUND4res res_buf, *rbp;
 586         nfs_resop4 *resop;
 587         PUTFH4res *resp;
 588 
 589         rbp = &res_buf;
 590 
 591         /*
 592          * Form a reply tag by copying over the request tag.
 593          */
 594         rbp->tag.utf8string_val =
 595             kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
 596         rbp->tag.utf8string_len = argsp->tag.utf8string_len;
 597         bcopy(argsp->tag.utf8string_val, rbp->tag.utf8string_val,
 598             rbp->tag.utf8string_len);
 599 
 600         rbp->array_len = 1;
 601         rbp->array = kmem_zalloc(rbp->array_len * sizeof (nfs_resop4),
 602             KM_SLEEP);
 603         resop = &rbp->array[0];
 604         resop->resop = argsp->array[0].argop;     /* copy first op over */
 605 
 606         /* Any op will do, just need to access status field */
 607         resp = &resop->nfs_resop4_u.opputfh;
 608 
 609         /*
 610          * NFS4ERR_RESOURCE is allowed for all ops, except OP_ILLEGAL.
 611          * Note that all op numbers in the compound array were already
 612          * validated by the XDR decoder (xdr_COMPOUND4args_srv()).
 613          */
 614         resp->status = (resop->resop == OP_ILLEGAL ?
 615             NFS4ERR_OP_ILLEGAL : NFS4ERR_RESOURCE);
 616 
 617         /* compound status is same as first op status */
 618         rbp->status = resp->status;
 619 
 620         if (!svc_sendreply(req->rq_xprt, xdr_COMPOUND4res_srv, (char *)rbp)) {
 621                 DTRACE_PROBE2(nfss__rsrc_err__sendfail,
 622                     struct svc_req *, req->rq_xprt, char *, rbp);
 623                 svcerr_systemerr(req->rq_xprt);
 624         }
 625 
 626         UTF8STRING_FREE(rbp->tag);
 627         kmem_free(rbp->array, rbp->array_len * sizeof (nfs_resop4));
 628 }