1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*
  28  * Copyright 2018 Nexenta Systems, Inc.
  29  */
  30 
  31 #include <sys/systm.h>
  32 #include <sys/sdt.h>
  33 #include <rpc/types.h>
  34 #include <rpc/auth.h>
  35 #include <rpc/auth_unix.h>
  36 #include <rpc/auth_des.h>
  37 #include <rpc/svc.h>
  38 #include <rpc/xdr.h>
  39 #include <nfs/nfs4.h>
  40 #include <nfs/nfs_dispatch.h>
  41 #include <nfs/nfs4_drc.h>
  42 
/*
 * Highest NFSv4 minor version this dispatcher supports; COMPOUND
 * requests with a larger minorversion are rejected by
 * rfs4_minorvers_mismatch() below.
 */
#define NFS4_MAX_MINOR_VERSION  0

/*
 * The default size of the duplicate request cache
 * (maximum number of cached entries; see rfs4_alloc_dr()).
 */
uint32_t nfs4_drc_max = 8 * 1024;

/*
 * The number of buckets we'd like to hash the
 * replies into.. do not change this on the fly.
 * (The bucket array is sized from this at rfs4_init_drc() time.)
 */
uint32_t nfs4_drc_hash = 541;

/* Sends an NFS4ERR_RESOURCE reply when the DRC is exhausted. */
static void rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp);
  57 
  58 /*
  59  * Initialize a duplicate request cache.
  60  */
  61 rfs4_drc_t *
  62 rfs4_init_drc(uint32_t drc_size, uint32_t drc_hash_size)
  63 {
  64         rfs4_drc_t *drc;
  65         uint32_t   bki;
  66 
  67         ASSERT(drc_size);
  68         ASSERT(drc_hash_size);
  69 
  70         drc = kmem_alloc(sizeof (rfs4_drc_t), KM_SLEEP);
  71 
  72         drc->max_size = drc_size;
  73         drc->in_use = 0;
  74 
  75         mutex_init(&drc->lock, NULL, MUTEX_DEFAULT, NULL);
  76 
  77         drc->dr_hash = drc_hash_size;
  78 
  79         drc->dr_buckets = kmem_alloc(sizeof (list_t)*drc_hash_size, KM_SLEEP);
  80 
  81         for (bki = 0; bki < drc_hash_size; bki++) {
  82                 list_create(&drc->dr_buckets[bki], sizeof (rfs4_dupreq_t),
  83                     offsetof(rfs4_dupreq_t, dr_bkt_next));
  84         }
  85 
  86         list_create(&(drc->dr_cache), sizeof (rfs4_dupreq_t),
  87             offsetof(rfs4_dupreq_t, dr_next));
  88 
  89         return (drc);
  90 }
  91 
  92 /*
  93  * Destroy a duplicate request cache.
  94  */
  95 void
  96 rfs4_fini_drc(void)
  97 {
  98         nfs4_srv_t *nsrv4 = nfs4_get_srv();
  99         rfs4_drc_t *drc = nsrv4->nfs4_drc;
 100         rfs4_dupreq_t *drp, *drp_next;
 101 
 102         /* iterate over the dr_cache and free the enties */
 103         for (drp = list_head(&(drc->dr_cache)); drp != NULL; drp = drp_next) {
 104 
 105                 if (drp->dr_state == NFS4_DUP_REPLAY)
 106                         rfs4_compound_free(&(drp->dr_res));
 107 
 108                 if (drp->dr_addr.buf != NULL)
 109                         kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
 110 
 111                 drp_next = list_next(&(drc->dr_cache), drp);
 112 
 113                 kmem_free(drp, sizeof (rfs4_dupreq_t));
 114         }
 115 
 116         mutex_destroy(&drc->lock);
 117         kmem_free(drc->dr_buckets,
 118             sizeof (list_t)*drc->dr_hash);
 119         kmem_free(drc, sizeof (rfs4_drc_t));
 120 }
 121 
 122 /*
 123  * rfs4_dr_chstate:
 124  *
 125  * Change the state of a rfs4_dupreq. If it's not in transition
 126  * to the FREE state, return. If we are moving to the FREE state
 127  * then we need to clean up the compound results and move the entry
 128  * to the end of the list.
 129  */
 130 void
 131 rfs4_dr_chstate(rfs4_dupreq_t *drp, int new_state)
 132 {
 133         rfs4_drc_t *drc;
 134 
 135         ASSERT(drp);
 136         ASSERT(drp->drc);
 137         ASSERT(drp->dr_bkt);
 138         ASSERT(MUTEX_HELD(&drp->drc->lock));
 139 
 140         drp->dr_state = new_state;
 141 
 142         if (new_state != NFS4_DUP_FREE)
 143                 return;
 144 
 145         drc = drp->drc;
 146 
 147         /*
 148          * Remove entry from the bucket and
 149          * dr_cache list, free compound results.
 150          */
 151         list_remove(drp->dr_bkt, drp);
 152         list_remove(&(drc->dr_cache), drp);
 153         rfs4_compound_free(&(drp->dr_res));
 154 }
 155 
 156 /*
 157  * rfs4_alloc_dr:
 158  *
 159  * Malloc a new one if we have not reached our maximum cache
 160  * limit, otherwise pick an entry off the tail -- Use if it
 161  * is marked as NFS4_DUP_FREE, or is an entry in the
 162  * NFS4_DUP_REPLAY state.
 163  */
 164 rfs4_dupreq_t *
 165 rfs4_alloc_dr(rfs4_drc_t *drc)
 166 {
 167         rfs4_dupreq_t *drp_tail, *drp = NULL;
 168 
 169         ASSERT(drc);
 170         ASSERT(MUTEX_HELD(&drc->lock));
 171 
 172         /*
 173          * Have we hit the cache limit yet ?
 174          */
 175         if (drc->in_use < drc->max_size) {
 176                 /*
 177                  * nope, so let's malloc a new one
 178                  */
 179                 drp = kmem_zalloc(sizeof (rfs4_dupreq_t), KM_SLEEP);
 180                 drp->drc = drc;
 181                 drc->in_use++;
 182                 DTRACE_PROBE1(nfss__i__drc_new, rfs4_dupreq_t *, drp);
 183                 return (drp);
 184         }
 185 
 186         /*
 187          * Cache is all allocated now traverse the list
 188          * backwards to find one we can reuse.
 189          */
 190         for (drp_tail = list_tail(&drc->dr_cache); drp_tail != NULL;
 191             drp_tail = list_prev(&drc->dr_cache, drp_tail)) {
 192 
 193                 switch (drp_tail->dr_state) {
 194 
 195                 case NFS4_DUP_FREE:
 196                         list_remove(&(drc->dr_cache), drp_tail);
 197                         DTRACE_PROBE1(nfss__i__drc_freeclaim,
 198                             rfs4_dupreq_t *, drp_tail);
 199                         return (drp_tail);
 200                         /* NOTREACHED */
 201 
 202                 case NFS4_DUP_REPLAY:
 203                         /* grab it. */
 204                         rfs4_dr_chstate(drp_tail, NFS4_DUP_FREE);
 205                         DTRACE_PROBE1(nfss__i__drc_replayclaim,
 206                             rfs4_dupreq_t *, drp_tail);
 207                         return (drp_tail);
 208                         /* NOTREACHED */
 209                 }
 210         }
 211         DTRACE_PROBE1(nfss__i__drc_full, rfs4_drc_t *, drc);
 212         return (NULL);
 213 }
 214 
 215 /*
 216  * rfs4_find_dr:
 217  *
 218  * Search for an entry in the duplicate request cache by
 219  * calculating the hash index based on the XID, and examining
 220  * the entries in the hash bucket. If we find a match, return.
 221  * Once we have searched the bucket we call rfs4_alloc_dr() to
 222  * allocate a new entry, or reuse one that is available.
 223  */
 224 int
 225 rfs4_find_dr(struct svc_req *req, rfs4_drc_t *drc, rfs4_dupreq_t **dup)
 226 {
 227 
 228         uint32_t        the_xid;
 229         list_t          *dr_bkt;
 230         rfs4_dupreq_t   *drp;
 231         int             bktdex;
 232 
 233         /*
 234          * Get the XID, calculate the bucket and search to
 235          * see if we need to replay from the cache.
 236          */
 237         the_xid = req->rq_xprt->xp_xid;
 238         bktdex = the_xid % drc->dr_hash;
 239 
 240         dr_bkt = (list_t *)
 241             &(drc->dr_buckets[(the_xid % drc->dr_hash)]);
 242 
 243         DTRACE_PROBE3(nfss__i__drc_bktdex,
 244             int, bktdex,
 245             uint32_t, the_xid,
 246             list_t *, dr_bkt);
 247 
 248         *dup = NULL;
 249 
 250         mutex_enter(&drc->lock);
 251         /*
 252          * Search the bucket for a matching xid and address.
 253          */
 254         for (drp = list_head(dr_bkt); drp != NULL;
 255             drp = list_next(dr_bkt, drp)) {
 256 
 257                 if (drp->dr_xid == the_xid &&
 258                     drp->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
 259                     bcmp((caddr_t)drp->dr_addr.buf,
 260                     (caddr_t)req->rq_xprt->xp_rtaddr.buf,
 261                     drp->dr_addr.len) == 0) {
 262 
 263                         /*
 264                          * Found a match so REPLAY the Reply
 265                          */
 266                         if (drp->dr_state == NFS4_DUP_REPLAY) {
 267                                 rfs4_dr_chstate(drp, NFS4_DUP_INUSE);
 268                                 mutex_exit(&drc->lock);
 269                                 *dup = drp;
 270                                 DTRACE_PROBE1(nfss__i__drc_replay,
 271                                     rfs4_dupreq_t *, drp);
 272                                 return (NFS4_DUP_REPLAY);
 273                         }
 274 
 275                         /*
 276                          * This entry must be in transition, so return
 277                          * the 'pending' status.
 278                          */
 279                         mutex_exit(&drc->lock);
 280                         return (NFS4_DUP_PENDING);
 281                 }
 282         }
 283 
 284         drp = rfs4_alloc_dr(drc);
 285         mutex_exit(&drc->lock);
 286 
 287         /*
 288          * The DRC is full and all entries are in use. Upper function
 289          * should error out this request and force the client to
 290          * retransmit -- effectively this is a resource issue. NFSD
 291          * threads tied up with native File System, or the cache size
 292          * is too small for the server load.
 293          */
 294         if (drp == NULL)
 295                 return (NFS4_DUP_ERROR);
 296 
 297         /*
 298          * Init the state to NEW.
 299          */
 300         drp->dr_state = NFS4_DUP_NEW;
 301 
 302         /*
 303          * If needed, resize the address buffer
 304          */
 305         if (drp->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
 306                 if (drp->dr_addr.buf != NULL)
 307                         kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
 308                 drp->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
 309                 drp->dr_addr.buf = kmem_alloc(drp->dr_addr.maxlen, KM_NOSLEEP);
 310                 if (drp->dr_addr.buf == NULL) {
 311                         /*
 312                          * If the malloc fails, mark the entry
 313                          * as free and put on the tail.
 314                          */
 315                         drp->dr_addr.maxlen = 0;
 316                         drp->dr_state = NFS4_DUP_FREE;
 317                         mutex_enter(&drc->lock);
 318                         list_insert_tail(&(drc->dr_cache), drp);
 319                         mutex_exit(&drc->lock);
 320                         return (NFS4_DUP_ERROR);
 321                 }
 322         }
 323 
 324 
 325         /*
 326          * Copy the address.
 327          */
 328         drp->dr_addr.len = req->rq_xprt->xp_rtaddr.len;
 329 
 330         bcopy((caddr_t)req->rq_xprt->xp_rtaddr.buf,
 331             (caddr_t)drp->dr_addr.buf,
 332             drp->dr_addr.len);
 333 
 334         drp->dr_xid = the_xid;
 335         drp->dr_bkt = dr_bkt;
 336 
 337         /*
 338          * Insert at the head of the bucket and
 339          * the drc lists..
 340          */
 341         mutex_enter(&drc->lock);
 342         list_insert_head(&drc->dr_cache, drp);
 343         list_insert_head(dr_bkt, drp);
 344         mutex_exit(&drc->lock);
 345 
 346         *dup = drp;
 347 
 348         return (NFS4_DUP_NEW);
 349 }
 350 
 351 /*
 352  *
 353  * This function handles the duplicate request cache,
 354  * NULL_PROC and COMPOUND procedure calls for NFSv4;
 355  *
 356  * Passed into this function are:-
 357  *
 358  *      disp    A pointer to our dispatch table entry
 359  *      req     The request to process
 360  *      xprt    The server transport handle
 361  *      ap      A pointer to the arguments
 362  *
 363  *
 364  * When appropriate this function is responsible for inserting
 365  * the reply into the duplicate cache or replaying an existing
 366  * cached reply.
 367  *
 368  * dr_stat      reflects the state of the duplicate request that
 369  *              has been inserted into or retrieved from the cache
 370  *
 371  * drp          is the duplicate request entry
 372  *
 373  */
 374 int
 375 rfs4_dispatch(struct rpcdisp *disp, struct svc_req *req,
 376                 SVCXPRT *xprt, char *ap)
 377 {
 378 
 379         COMPOUND4res     res_buf;
 380         COMPOUND4res    *rbp;
 381         COMPOUND4args   *cap;
 382         cred_t          *cr = NULL;
 383         int              error = 0;
 384         int              dis_flags = 0;
 385         int              dr_stat = NFS4_NOT_DUP;
 386         rfs4_dupreq_t   *drp = NULL;
 387         int              rv;
 388         nfs4_srv_t *nsrv4 = nfs4_get_srv();
 389         rfs4_drc_t *nfs4_drc = nsrv4->nfs4_drc;
 390 
 391         ASSERT(disp);
 392 
 393         /*
 394          * Short circuit the RPC_NULL proc.
 395          */
 396         if (disp->dis_proc == rpc_null) {
 397                 DTRACE_NFSV4_1(null__start, struct svc_req *, req);
 398                 if (!svc_sendreply(xprt, xdr_void, NULL)) {
 399                         DTRACE_NFSV4_1(null__done, struct svc_req *, req);
 400                         svcerr_systemerr(xprt);
 401                         return (1);
 402                 }
 403                 DTRACE_NFSV4_1(null__done, struct svc_req *, req);
 404                 return (0);
 405         }
 406 
 407         /* Only NFSv4 Compounds from this point onward */
 408 
 409         rbp = &res_buf;
 410         cap = (COMPOUND4args *)ap;
 411 
 412         /*
 413          * Figure out the disposition of the whole COMPOUND
 414          * and record it's IDEMPOTENTCY.
 415          */
 416         rfs4_compound_flagproc(cap, &dis_flags);
 417 
 418         /*
 419          * If NON-IDEMPOTENT then we need to figure out if this
 420          * request can be replied from the duplicate cache.
 421          *
 422          * If this is a new request then we need to insert the
 423          * reply into the duplicate cache.
 424          */
 425         if (!(dis_flags & RPC_IDEMPOTENT)) {
 426                 /* look for a replay from the cache or allocate */
 427                 dr_stat = rfs4_find_dr(req, nfs4_drc, &drp);
 428 
 429                 switch (dr_stat) {
 430 
 431                 case NFS4_DUP_ERROR:
 432                         rfs4_resource_err(req, cap);
 433                         return (1);
 434                         /* NOTREACHED */
 435 
 436                 case NFS4_DUP_PENDING:
 437                         /*
 438                          * reply has previously been inserted into the
 439                          * duplicate cache, however the reply has
 440                          * not yet been sent via svc_sendreply()
 441                          */
 442                         return (1);
 443                         /* NOTREACHED */
 444 
 445                 case NFS4_DUP_NEW:
 446                         curthread->t_flag |= T_DONTPEND;
 447                         /* NON-IDEMPOTENT proc call */
 448                         rfs4_compound(cap, rbp, NULL, req, cr, &rv);
 449                         curthread->t_flag &= ~T_DONTPEND;
 450 
 451                         if (rv)         /* short ckt sendreply on error */
 452                                 return (rv);
 453 
 454                         /*
 455                          * dr_res must be initialized before calling
 456                          * rfs4_dr_chstate (it frees the reply).
 457                          */
 458                         drp->dr_res = res_buf;
 459                         if (curthread->t_flag & T_WOULDBLOCK) {
 460                                 curthread->t_flag &= ~T_WOULDBLOCK;
 461                                 /*
 462                                  * mark this entry as FREE and plop
 463                                  * on the end of the cache list
 464                                  */
 465                                 mutex_enter(&drp->drc->lock);
 466                                 rfs4_dr_chstate(drp, NFS4_DUP_FREE);
 467                                 list_insert_tail(&(drp->drc->dr_cache), drp);
 468                                 mutex_exit(&drp->drc->lock);
 469                                 return (1);
 470                         }
 471                         break;
 472 
 473                 case NFS4_DUP_REPLAY:
 474                         /* replay from the cache */
 475                         rbp = &(drp->dr_res);
 476                         break;
 477                 }
 478         } else {
 479                 curthread->t_flag |= T_DONTPEND;
 480                 /* IDEMPOTENT proc call */
 481                 rfs4_compound(cap, rbp, NULL, req, cr, &rv);
 482                 curthread->t_flag &= ~T_DONTPEND;
 483 
 484                 if (rv)         /* short ckt sendreply on error */
 485                         return (rv);
 486 
 487                 if (curthread->t_flag & T_WOULDBLOCK) {
 488                         curthread->t_flag &= ~T_WOULDBLOCK;
 489                         return (1);
 490                 }
 491         }
 492 
 493         /*
 494          * Send out the replayed reply or the 'real' one.
 495          */
 496         if (!svc_sendreply(xprt,  xdr_COMPOUND4res_srv, (char *)rbp)) {
 497                 DTRACE_PROBE2(nfss__e__dispatch_sendfail,
 498                     struct svc_req *, xprt,
 499                     char *, rbp);
 500                 svcerr_systemerr(xprt);
 501                 error++;
 502         }
 503 
 504         /*
 505          * If this reply was just inserted into the duplicate cache
 506          * or it was replayed from the dup cache; (re)mark it as
 507          * available for replay
 508          *
 509          * At first glance, this 'if' statement seems a little strange;
 510          * testing for NFS4_DUP_REPLAY, and then calling...
 511          *
 512          *      rfs4_dr_chatate(NFS4_DUP_REPLAY)
 513          *
 514          * ... but notice that we are checking dr_stat, and not the
 515          * state of the entry itself, the entry will be NFS4_DUP_INUSE,
 516          * we do that so that we know not to prematurely reap it whilst
 517          * we resent it to the client.
 518          *
 519          */
 520         if (dr_stat == NFS4_DUP_NEW || dr_stat == NFS4_DUP_REPLAY) {
 521                 mutex_enter(&drp->drc->lock);
 522                 rfs4_dr_chstate(drp, NFS4_DUP_REPLAY);
 523                 mutex_exit(&drp->drc->lock);
 524         } else if (dr_stat == NFS4_NOT_DUP) {
 525                 rfs4_compound_free(rbp);
 526         }
 527 
 528         return (error);
 529 }
 530 
/*
 * rfs4_minorvers_mismatch:
 *
 * Check an incoming COMPOUND for an unsupported minor version.
 * Returns FALSE when the request is not NFSv4 or the minorversion
 * is supported (<= NFS4_MAX_MINOR_VERSION).  Otherwise sends an
 * empty NFS4ERR_MINOR_VERS_MISMATCH reply (echoing the request
 * tag, per the protocol) and returns TRUE, meaning the request
 * has been fully handled.
 */
bool_t
rfs4_minorvers_mismatch(struct svc_req *req, SVCXPRT *xprt, void *args)
{
        COMPOUND4args *argsp;
        COMPOUND4res res_buf, *resp;

        if (req->rq_vers != 4)
                return (FALSE);

        argsp = (COMPOUND4args *)args;

        if (argsp->minorversion <= NFS4_MAX_MINOR_VERSION)
                return (FALSE);

        resp = &res_buf;

        /*
         * Form a reply tag by copying over the request tag.
         */
        resp->tag.utf8string_val =
            kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
        resp->tag.utf8string_len = argsp->tag.utf8string_len;
        bcopy(argsp->tag.utf8string_val, resp->tag.utf8string_val,
            resp->tag.utf8string_len);
        /* no operation results accompany a minorversion mismatch */
        resp->array_len = 0;
        resp->array = NULL;
        resp->status = NFS4ERR_MINOR_VERS_MISMATCH;
        if (!svc_sendreply(xprt,  xdr_COMPOUND4res_srv, (char *)resp)) {
                DTRACE_PROBE2(nfss__e__minorvers_mismatch,
                    SVCXPRT *, xprt, char *, resp);
                svcerr_systemerr(xprt);
        }
        /* releases the tag copy allocated above */
        rfs4_compound_free(resp);
        return (TRUE);
}
 566 
/*
 * rfs4_resource_err:
 *
 * Send an NFS4ERR_RESOURCE (or NFS4ERR_OP_ILLEGAL) reply for a
 * COMPOUND that cannot be processed because the duplicate request
 * cache is exhausted.  The reply echoes the request tag and carries
 * a single result whose op number matches the first op of the
 * request.  Assumes argsp->array holds at least one op; presumably
 * guaranteed by the XDR decoder -- TODO confirm for array_len == 0.
 */
void
rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp)
{
        COMPOUND4res res_buf, *rbp;
        nfs_resop4 *resop;
        PUTFH4res *resp;

        rbp = &res_buf;

        /*
         * Form a reply tag by copying over the request tag.
         */
        rbp->tag.utf8string_val =
            kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
        rbp->tag.utf8string_len = argsp->tag.utf8string_len;
        bcopy(argsp->tag.utf8string_val, rbp->tag.utf8string_val,
            rbp->tag.utf8string_len);

        rbp->array_len = 1;
        rbp->array = kmem_zalloc(rbp->array_len * sizeof (nfs_resop4),
            KM_SLEEP);
        resop = &rbp->array[0];
        resop->resop = argsp->array[0].argop;     /* copy first op over */

        /* Any op will do, just need to access status field */
        resp = &resop->nfs_resop4_u.opputfh;

        /*
         * NFS4ERR_RESOURCE is allowed for all ops, except OP_ILLEGAL.
         * Note that all op numbers in the compound array were already
         * validated by the XDR decoder (xdr_COMPOUND4args_srv()).
         */
        resp->status = (resop->resop == OP_ILLEGAL ?
            NFS4ERR_OP_ILLEGAL : NFS4ERR_RESOURCE);

        /* compound status is same as first op status */
        rbp->status = resp->status;

        if (!svc_sendreply(req->rq_xprt, xdr_COMPOUND4res_srv, (char *)rbp)) {
                DTRACE_PROBE2(nfss__rsrc_err__sendfail,
                    struct svc_req *, req->rq_xprt, char *, rbp);
                svcerr_systemerr(req->rq_xprt);
        }

        /* free the tag copy and the single-entry result array */
        UTF8STRING_FREE(rbp->tag);
        kmem_free(rbp->array, rbp->array_len * sizeof (nfs_resop4));
}