1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*
  28  * Copyright 2018 Nexenta Systems, Inc.
  29  */
  30 
  31 #include <sys/systm.h>
  32 #include <sys/sdt.h>
  33 #include <rpc/types.h>
  34 #include <rpc/auth.h>
  35 #include <rpc/auth_unix.h>
  36 #include <rpc/auth_des.h>
  37 #include <rpc/svc.h>
  38 #include <rpc/xdr.h>
  39 #include <nfs/nfs4.h>
  40 #include <nfs/nfs_dispatch.h>
  41 #include <nfs/nfs4_drc.h>
  42 
/*
 * Highest NFSv4 minor version accepted by this file;
 * rfs4_minorvers_mismatch() rejects any COMPOUND that asks for more.
 */
#define NFS4_MAX_MINOR_VERSION  0

/*
 * The default size of the duplicate request cache.
 * NOTE(review): presumably handed to rfs4_init_drc() as drc_size by
 * code outside this file -- confirm against the caller.
 */
uint32_t nfs4_drc_max = 8 * 1024;

/*
 * The number of buckets we'd like to hash the
 * replies into. Do not change this on the fly.
 */
uint32_t nfs4_drc_hash = 541;

/* Defined at the bottom of this file; used by rfs4_dispatch(). */
static void rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp);

/*
 * Zone key used with zone_getspecific() to look up the nfs4_srv_t
 * (and through it the duplicate request cache) of the current zone.
 */
extern zone_key_t rfs4_zone_key;
  59 
  60 /*
  61  * Initialize a duplicate request cache.
  62  */
  63 rfs4_drc_t *
  64 rfs4_init_drc(uint32_t drc_size, uint32_t drc_hash_size)
  65 {
  66         rfs4_drc_t *drc;
  67         uint32_t   bki;
  68 
  69         ASSERT(drc_size);
  70         ASSERT(drc_hash_size);
  71 
  72         drc = kmem_alloc(sizeof (rfs4_drc_t), KM_SLEEP);
  73 
  74         drc->max_size = drc_size;
  75         drc->in_use = 0;
  76 
  77         mutex_init(&drc->lock, NULL, MUTEX_DEFAULT, NULL);
  78 
  79         drc->dr_hash = drc_hash_size;
  80 
  81         drc->dr_buckets = kmem_alloc(sizeof (list_t)*drc_hash_size, KM_SLEEP);
  82 
  83         for (bki = 0; bki < drc_hash_size; bki++) {
  84                 list_create(&drc->dr_buckets[bki], sizeof (rfs4_dupreq_t),
  85                     offsetof(rfs4_dupreq_t, dr_bkt_next));
  86         }
  87 
  88         list_create(&(drc->dr_cache), sizeof (rfs4_dupreq_t),
  89             offsetof(rfs4_dupreq_t, dr_next));
  90 
  91         return (drc);
  92 }
  93 
  94 /*
  95  * Destroy a duplicate request cache.
  96  */
  97 void
  98 rfs4_fini_drc(void)
  99 {
 100         nfs4_srv_t *nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
 101         rfs4_drc_t *drc = nsrv4->nfs4_drc;
 102         rfs4_dupreq_t *drp, *drp_next;
 103 
 104         /* iterate over the dr_cache and free the enties */
 105         for (drp = list_head(&(drc->dr_cache)); drp != NULL; drp = drp_next) {
 106 
 107                 if (drp->dr_state == NFS4_DUP_REPLAY)
 108                         rfs4_compound_free(&(drp->dr_res));
 109 
 110                 if (drp->dr_addr.buf != NULL)
 111                         kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
 112 
 113                 drp_next = list_next(&(drc->dr_cache), drp);
 114 
 115                 kmem_free(drp, sizeof (rfs4_dupreq_t));
 116         }
 117 
 118         mutex_destroy(&drc->lock);
 119         kmem_free(drc->dr_buckets,
 120             sizeof (list_t)*drc->dr_hash);
 121         kmem_free(drc, sizeof (rfs4_drc_t));
 122 }
 123 
 124 /*
 125  * rfs4_dr_chstate:
 126  *
 127  * Change the state of a rfs4_dupreq. If it's not in transition
 128  * to the FREE state, return. If we are moving to the FREE state
 129  * then we need to clean up the compound results and move the entry
 130  * to the end of the list.
 131  */
 132 void
 133 rfs4_dr_chstate(rfs4_dupreq_t *drp, int new_state)
 134 {
 135         rfs4_drc_t *drc;
 136 
 137         ASSERT(drp);
 138         ASSERT(drp->drc);
 139         ASSERT(drp->dr_bkt);
 140         ASSERT(MUTEX_HELD(&drp->drc->lock));
 141 
 142         drp->dr_state = new_state;
 143 
 144         if (new_state != NFS4_DUP_FREE)
 145                 return;
 146 
 147         drc = drp->drc;
 148 
 149         /*
 150          * Remove entry from the bucket and
 151          * dr_cache list, free compound results.
 152          */
 153         list_remove(drp->dr_bkt, drp);
 154         list_remove(&(drc->dr_cache), drp);
 155         rfs4_compound_free(&(drp->dr_res));
 156 }
 157 
 158 /*
 159  * rfs4_alloc_dr:
 160  *
 161  * Malloc a new one if we have not reached our maximum cache
 162  * limit, otherwise pick an entry off the tail -- Use if it
 163  * is marked as NFS4_DUP_FREE, or is an entry in the
 164  * NFS4_DUP_REPLAY state.
 165  */
 166 rfs4_dupreq_t *
 167 rfs4_alloc_dr(rfs4_drc_t *drc)
 168 {
 169         rfs4_dupreq_t *drp_tail, *drp = NULL;
 170 
 171         ASSERT(drc);
 172         ASSERT(MUTEX_HELD(&drc->lock));
 173 
 174         /*
 175          * Have we hit the cache limit yet ?
 176          */
 177         if (drc->in_use < drc->max_size) {
 178                 /*
 179                  * nope, so let's malloc a new one
 180                  */
 181                 drp = kmem_zalloc(sizeof (rfs4_dupreq_t), KM_SLEEP);
 182                 drp->drc = drc;
 183                 drc->in_use++;
 184                 DTRACE_PROBE1(nfss__i__drc_new, rfs4_dupreq_t *, drp);
 185                 return (drp);
 186         }
 187 
 188         /*
 189          * Cache is all allocated now traverse the list
 190          * backwards to find one we can reuse.
 191          */
 192         for (drp_tail = list_tail(&drc->dr_cache); drp_tail != NULL;
 193             drp_tail = list_prev(&drc->dr_cache, drp_tail)) {
 194 
 195                 switch (drp_tail->dr_state) {
 196 
 197                 case NFS4_DUP_FREE:
 198                         list_remove(&(drc->dr_cache), drp_tail);
 199                         DTRACE_PROBE1(nfss__i__drc_freeclaim,
 200                             rfs4_dupreq_t *, drp_tail);
 201                         return (drp_tail);
 202                         /* NOTREACHED */
 203 
 204                 case NFS4_DUP_REPLAY:
 205                         /* grab it. */
 206                         rfs4_dr_chstate(drp_tail, NFS4_DUP_FREE);
 207                         DTRACE_PROBE1(nfss__i__drc_replayclaim,
 208                             rfs4_dupreq_t *, drp_tail);
 209                         return (drp_tail);
 210                         /* NOTREACHED */
 211                 }
 212         }
 213         DTRACE_PROBE1(nfss__i__drc_full, rfs4_drc_t *, drc);
 214         return (NULL);
 215 }
 216 
 217 /*
 218  * rfs4_find_dr:
 219  *
 220  * Search for an entry in the duplicate request cache by
 221  * calculating the hash index based on the XID, and examining
 222  * the entries in the hash bucket. If we find a match, return.
 223  * Once we have searched the bucket we call rfs4_alloc_dr() to
 224  * allocate a new entry, or reuse one that is available.
 225  */
 226 int
 227 rfs4_find_dr(struct svc_req *req, rfs4_drc_t *drc, rfs4_dupreq_t **dup)
 228 {
 229 
 230         uint32_t        the_xid;
 231         list_t          *dr_bkt;
 232         rfs4_dupreq_t   *drp;
 233         int             bktdex;
 234 
 235         /*
 236          * Get the XID, calculate the bucket and search to
 237          * see if we need to replay from the cache.
 238          */
 239         the_xid = req->rq_xprt->xp_xid;
 240         bktdex = the_xid % drc->dr_hash;
 241 
 242         dr_bkt = (list_t *)
 243             &(drc->dr_buckets[(the_xid % drc->dr_hash)]);
 244 
 245         DTRACE_PROBE3(nfss__i__drc_bktdex,
 246             int, bktdex,
 247             uint32_t, the_xid,
 248             list_t *, dr_bkt);
 249 
 250         *dup = NULL;
 251 
 252         mutex_enter(&drc->lock);
 253         /*
 254          * Search the bucket for a matching xid and address.
 255          */
 256         for (drp = list_head(dr_bkt); drp != NULL;
 257             drp = list_next(dr_bkt, drp)) {
 258 
 259                 if (drp->dr_xid == the_xid &&
 260                     drp->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
 261                     bcmp((caddr_t)drp->dr_addr.buf,
 262                     (caddr_t)req->rq_xprt->xp_rtaddr.buf,
 263                     drp->dr_addr.len) == 0) {
 264 
 265                         /*
 266                          * Found a match so REPLAY the Reply
 267                          */
 268                         if (drp->dr_state == NFS4_DUP_REPLAY) {
 269                                 rfs4_dr_chstate(drp, NFS4_DUP_INUSE);
 270                                 mutex_exit(&drc->lock);
 271                                 *dup = drp;
 272                                 DTRACE_PROBE1(nfss__i__drc_replay,
 273                                     rfs4_dupreq_t *, drp);
 274                                 return (NFS4_DUP_REPLAY);
 275                         }
 276 
 277                         /*
 278                          * This entry must be in transition, so return
 279                          * the 'pending' status.
 280                          */
 281                         mutex_exit(&drc->lock);
 282                         return (NFS4_DUP_PENDING);
 283                 }
 284         }
 285 
 286         drp = rfs4_alloc_dr(drc);
 287         mutex_exit(&drc->lock);
 288 
 289         /*
 290          * The DRC is full and all entries are in use. Upper function
 291          * should error out this request and force the client to
 292          * retransmit -- effectively this is a resource issue. NFSD
 293          * threads tied up with native File System, or the cache size
 294          * is too small for the server load.
 295          */
 296         if (drp == NULL)
 297                 return (NFS4_DUP_ERROR);
 298 
 299         /*
 300          * Init the state to NEW.
 301          */
 302         drp->dr_state = NFS4_DUP_NEW;
 303 
 304         /*
 305          * If needed, resize the address buffer
 306          */
 307         if (drp->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
 308                 if (drp->dr_addr.buf != NULL)
 309                         kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
 310                 drp->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
 311                 drp->dr_addr.buf = kmem_alloc(drp->dr_addr.maxlen, KM_NOSLEEP);
 312                 if (drp->dr_addr.buf == NULL) {
 313                         /*
 314                          * If the malloc fails, mark the entry
 315                          * as free and put on the tail.
 316                          */
 317                         drp->dr_addr.maxlen = 0;
 318                         drp->dr_state = NFS4_DUP_FREE;
 319                         mutex_enter(&drc->lock);
 320                         list_insert_tail(&(drc->dr_cache), drp);
 321                         mutex_exit(&drc->lock);
 322                         return (NFS4_DUP_ERROR);
 323                 }
 324         }
 325 
 326 
 327         /*
 328          * Copy the address.
 329          */
 330         drp->dr_addr.len = req->rq_xprt->xp_rtaddr.len;
 331 
 332         bcopy((caddr_t)req->rq_xprt->xp_rtaddr.buf,
 333             (caddr_t)drp->dr_addr.buf,
 334             drp->dr_addr.len);
 335 
 336         drp->dr_xid = the_xid;
 337         drp->dr_bkt = dr_bkt;
 338 
 339         /*
 340          * Insert at the head of the bucket and
 341          * the drc lists..
 342          */
 343         mutex_enter(&drc->lock);
 344         list_insert_head(&drc->dr_cache, drp);
 345         list_insert_head(dr_bkt, drp);
 346         mutex_exit(&drc->lock);
 347 
 348         *dup = drp;
 349 
 350         return (NFS4_DUP_NEW);
 351 }
 352 
 353 /*
 354  *
 355  * This function handles the duplicate request cache,
 356  * NULL_PROC and COMPOUND procedure calls for NFSv4;
 357  *
 358  * Passed into this function are:-
 359  *
 360  *      disp    A pointer to our dispatch table entry
 361  *      req     The request to process
 362  *      xprt    The server transport handle
 363  *      ap      A pointer to the arguments
 364  *
 365  *
 366  * When appropriate this function is responsible for inserting
 367  * the reply into the duplicate cache or replaying an existing
 368  * cached reply.
 369  *
 370  * dr_stat      reflects the state of the duplicate request that
 371  *              has been inserted into or retrieved from the cache
 372  *
 373  * drp          is the duplicate request entry
 374  *
 375  */
 376 int
 377 rfs4_dispatch(struct rpcdisp *disp, struct svc_req *req,
 378                 SVCXPRT *xprt, char *ap)
 379 {
 380 
 381         COMPOUND4res     res_buf;
 382         COMPOUND4res    *rbp;
 383         COMPOUND4args   *cap;
 384         cred_t          *cr = NULL;
 385         int              error = 0;
 386         int              dis_flags = 0;
 387         int              dr_stat = NFS4_NOT_DUP;
 388         rfs4_dupreq_t   *drp = NULL;
 389         int              rv;
 390         nfs4_srv_t *nsrv4 = zone_getspecific(rfs4_zone_key, curzone);
 391         rfs4_drc_t *nfs4_drc = nsrv4->nfs4_drc;
 392 
 393         ASSERT(disp);
 394 
 395         /*
 396          * Short circuit the RPC_NULL proc.
 397          */
 398         if (disp->dis_proc == rpc_null) {
 399                 DTRACE_NFSV4_1(null__start, struct svc_req *, req);
 400                 if (!svc_sendreply(xprt, xdr_void, NULL)) {
 401                         DTRACE_NFSV4_1(null__done, struct svc_req *, req);
 402                         svcerr_systemerr(xprt);
 403                         return (1);
 404                 }
 405                 DTRACE_NFSV4_1(null__done, struct svc_req *, req);
 406                 return (0);
 407         }
 408 
 409         /* Only NFSv4 Compounds from this point onward */
 410 
 411         rbp = &res_buf;
 412         cap = (COMPOUND4args *)ap;
 413 
 414         /*
 415          * Figure out the disposition of the whole COMPOUND
 416          * and record it's IDEMPOTENTCY.
 417          */
 418         rfs4_compound_flagproc(cap, &dis_flags);
 419 
 420         /*
 421          * If NON-IDEMPOTENT then we need to figure out if this
 422          * request can be replied from the duplicate cache.
 423          *
 424          * If this is a new request then we need to insert the
 425          * reply into the duplicate cache.
 426          */
 427         if (!(dis_flags & RPC_IDEMPOTENT)) {
 428                 /* look for a replay from the cache or allocate */
 429                 dr_stat = rfs4_find_dr(req, nfs4_drc, &drp);
 430 
 431                 switch (dr_stat) {
 432 
 433                 case NFS4_DUP_ERROR:
 434                         rfs4_resource_err(req, cap);
 435                         return (1);
 436                         /* NOTREACHED */
 437 
 438                 case NFS4_DUP_PENDING:
 439                         /*
 440                          * reply has previously been inserted into the
 441                          * duplicate cache, however the reply has
 442                          * not yet been sent via svc_sendreply()
 443                          */
 444                         return (1);
 445                         /* NOTREACHED */
 446 
 447                 case NFS4_DUP_NEW:
 448                         curthread->t_flag |= T_DONTPEND;
 449                         /* NON-IDEMPOTENT proc call */
 450                         rfs4_compound(cap, rbp, NULL, req, cr, &rv);
 451                         curthread->t_flag &= ~T_DONTPEND;
 452 
 453                         if (rv)         /* short ckt sendreply on error */
 454                                 return (rv);
 455 
 456                         /*
 457                          * dr_res must be initialized before calling
 458                          * rfs4_dr_chstate (it frees the reply).
 459                          */
 460                         drp->dr_res = res_buf;
 461                         if (curthread->t_flag & T_WOULDBLOCK) {
 462                                 curthread->t_flag &= ~T_WOULDBLOCK;
 463                                 /*
 464                                  * mark this entry as FREE and plop
 465                                  * on the end of the cache list
 466                                  */
 467                                 mutex_enter(&drp->drc->lock);
 468                                 rfs4_dr_chstate(drp, NFS4_DUP_FREE);
 469                                 list_insert_tail(&(drp->drc->dr_cache), drp);
 470                                 mutex_exit(&drp->drc->lock);
 471                                 return (1);
 472                         }
 473                         break;
 474 
 475                 case NFS4_DUP_REPLAY:
 476                         /* replay from the cache */
 477                         rbp = &(drp->dr_res);
 478                         break;
 479                 }
 480         } else {
 481                 curthread->t_flag |= T_DONTPEND;
 482                 /* IDEMPOTENT proc call */
 483                 rfs4_compound(cap, rbp, NULL, req, cr, &rv);
 484                 curthread->t_flag &= ~T_DONTPEND;
 485 
 486                 if (rv)         /* short ckt sendreply on error */
 487                         return (rv);
 488 
 489                 if (curthread->t_flag & T_WOULDBLOCK) {
 490                         curthread->t_flag &= ~T_WOULDBLOCK;
 491                         return (1);
 492                 }
 493         }
 494 
 495         /*
 496          * Send out the replayed reply or the 'real' one.
 497          */
 498         if (!svc_sendreply(xprt,  xdr_COMPOUND4res_srv, (char *)rbp)) {
 499                 DTRACE_PROBE2(nfss__e__dispatch_sendfail,
 500                     struct svc_req *, xprt,
 501                     char *, rbp);
 502                 svcerr_systemerr(xprt);
 503                 error++;
 504         }
 505 
 506         /*
 507          * If this reply was just inserted into the duplicate cache
 508          * or it was replayed from the dup cache; (re)mark it as
 509          * available for replay
 510          *
 511          * At first glance, this 'if' statement seems a little strange;
 512          * testing for NFS4_DUP_REPLAY, and then calling...
 513          *
 514          *      rfs4_dr_chatate(NFS4_DUP_REPLAY)
 515          *
 516          * ... but notice that we are checking dr_stat, and not the
 517          * state of the entry itself, the entry will be NFS4_DUP_INUSE,
 518          * we do that so that we know not to prematurely reap it whilst
 519          * we resent it to the client.
 520          *
 521          */
 522         if (dr_stat == NFS4_DUP_NEW || dr_stat == NFS4_DUP_REPLAY) {
 523                 mutex_enter(&drp->drc->lock);
 524                 rfs4_dr_chstate(drp, NFS4_DUP_REPLAY);
 525                 mutex_exit(&drp->drc->lock);
 526         } else if (dr_stat == NFS4_NOT_DUP) {
 527                 rfs4_compound_free(rbp);
 528         }
 529 
 530         return (error);
 531 }
 532 
 533 bool_t
 534 rfs4_minorvers_mismatch(struct svc_req *req, SVCXPRT *xprt, void *args)
 535 {
 536         COMPOUND4args *argsp;
 537         COMPOUND4res res_buf, *resp;
 538 
 539         if (req->rq_vers != 4)
 540                 return (FALSE);
 541 
 542         argsp = (COMPOUND4args *)args;
 543 
 544         if (argsp->minorversion <= NFS4_MAX_MINOR_VERSION)
 545                 return (FALSE);
 546 
 547         resp = &res_buf;
 548 
 549         /*
 550          * Form a reply tag by copying over the reqeuest tag.
 551          */
 552         resp->tag.utf8string_val =
 553             kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
 554         resp->tag.utf8string_len = argsp->tag.utf8string_len;
 555         bcopy(argsp->tag.utf8string_val, resp->tag.utf8string_val,
 556             resp->tag.utf8string_len);
 557         resp->array_len = 0;
 558         resp->array = NULL;
 559         resp->status = NFS4ERR_MINOR_VERS_MISMATCH;
 560         if (!svc_sendreply(xprt,  xdr_COMPOUND4res_srv, (char *)resp)) {
 561                 DTRACE_PROBE2(nfss__e__minorvers_mismatch,
 562                     SVCXPRT *, xprt, char *, resp);
 563                 svcerr_systemerr(xprt);
 564         }
 565         rfs4_compound_free(resp);
 566         return (TRUE);
 567 }
 568 
 569 void
 570 rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp)
 571 {
 572         COMPOUND4res res_buf, *rbp;
 573         nfs_resop4 *resop;
 574         PUTFH4res *resp;
 575 
 576         rbp = &res_buf;
 577 
 578         /*
 579          * Form a reply tag by copying over the request tag.
 580          */
 581         rbp->tag.utf8string_val =
 582             kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
 583         rbp->tag.utf8string_len = argsp->tag.utf8string_len;
 584         bcopy(argsp->tag.utf8string_val, rbp->tag.utf8string_val,
 585             rbp->tag.utf8string_len);
 586 
 587         rbp->array_len = 1;
 588         rbp->array = kmem_zalloc(rbp->array_len * sizeof (nfs_resop4),
 589             KM_SLEEP);
 590         resop = &rbp->array[0];
 591         resop->resop = argsp->array[0].argop;     /* copy first op over */
 592 
 593         /* Any op will do, just need to access status field */
 594         resp = &resop->nfs_resop4_u.opputfh;
 595 
 596         /*
 597          * NFS4ERR_RESOURCE is allowed for all ops, except OP_ILLEGAL.
 598          * Note that all op numbers in the compound array were already
 599          * validated by the XDR decoder (xdr_COMPOUND4args_srv()).
 600          */
 601         resp->status = (resop->resop == OP_ILLEGAL ?
 602             NFS4ERR_OP_ILLEGAL : NFS4ERR_RESOURCE);
 603 
 604         /* compound status is same as first op status */
 605         rbp->status = resp->status;
 606 
 607         if (!svc_sendreply(req->rq_xprt, xdr_COMPOUND4res_srv, (char *)rbp)) {
 608                 DTRACE_PROBE2(nfss__rsrc_err__sendfail,
 609                     struct svc_req *, req->rq_xprt, char *, rbp);
 610                 svcerr_systemerr(req->rq_xprt);
 611         }
 612 
 613         UTF8STRING_FREE(rbp->tag);
 614         kmem_free(rbp->array, rbp->array_len * sizeof (nfs_resop4));
 615 }