1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 #include <sys/systm.h>
  28 #include <sys/sdt.h>
  29 #include <rpc/types.h>
  30 #include <rpc/auth.h>
  31 #include <rpc/auth_unix.h>
  32 #include <rpc/auth_des.h>
  33 #include <rpc/svc.h>
  34 #include <rpc/xdr.h>
  35 #include <nfs/nfs4.h>
  36 #include <nfs/nfs_dispatch.h>
  37 #include <nfs/nfs4_drc.h>
  38 
/*
 * Highest NFSv4 minor version this file accepts. rfs4_minorvers_mismatch()
 * answers any version 4 COMPOUND whose minorversion is greater than this
 * with NFS4ERR_MINOR_VERS_MISMATCH.
 */
#define NFS4_MAX_MINOR_VERSION  0

/*
 * This is the duplicate request cache for NFSv4. rfs4_dispatch() hands
 * it to rfs4_find_dr() for every non-idempotent COMPOUND.
 * NOTE(review): it is not assigned anywhere in this file; presumably it
 * is set from rfs4_init_drc() during server start-up -- confirm.
 */
rfs4_drc_t *nfs4_drc = NULL;

/*
 * The default size of the duplicate request cache.
 * NOTE(review): presumably passed as drc_size to rfs4_init_drc(), i.e.
 * a count of cache entries -- confirm against the caller.
 */
uint32_t nfs4_drc_max = 8 * 1024;

/*
 * The number of buckets we'd like to hash the replies into.
 * Do not change this on the fly.
 * NOTE(review): presumably passed as drc_hash_size to rfs4_init_drc();
 * the bucket array is sized once at creation -- confirm against the caller.
 */
uint32_t nfs4_drc_hash = 541;

/* Reply to a COMPOUND with a resource error; defined at the end of the file. */
static void rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp);
  58 
  59 /*
  60  * Initialize a duplicate request cache.
  61  */
  62 rfs4_drc_t *
  63 rfs4_init_drc(uint32_t drc_size, uint32_t drc_hash_size)
  64 {
  65         rfs4_drc_t *drc;
  66         uint32_t   bki;
  67 
  68         ASSERT(drc_size);
  69         ASSERT(drc_hash_size);
  70 
  71         drc = kmem_alloc(sizeof (rfs4_drc_t), KM_SLEEP);
  72 
  73         drc->max_size = drc_size;
  74         drc->in_use = 0;
  75 
  76         mutex_init(&drc->lock, NULL, MUTEX_DEFAULT, NULL);
  77 
  78         drc->dr_hash = drc_hash_size;
  79 
  80         drc->dr_buckets = kmem_alloc(sizeof (list_t)*drc_hash_size, KM_SLEEP);
  81 
  82         for (bki = 0; bki < drc_hash_size; bki++) {
  83                 list_create(&drc->dr_buckets[bki], sizeof (rfs4_dupreq_t),
  84                     offsetof(rfs4_dupreq_t, dr_bkt_next));
  85         }
  86 
  87         list_create(&(drc->dr_cache), sizeof (rfs4_dupreq_t),
  88             offsetof(rfs4_dupreq_t, dr_next));
  89 
  90         return (drc);
  91 }
  92 
  93 /*
  94  * Destroy a duplicate request cache.
  95  */
  96 void
  97 rfs4_fini_drc(rfs4_drc_t *drc)
  98 {
  99         rfs4_dupreq_t *drp, *drp_next;
 100 
 101         ASSERT(drc);
 102 
 103         /* iterate over the dr_cache and free the enties */
 104         for (drp = list_head(&(drc->dr_cache)); drp != NULL; drp = drp_next) {
 105 
 106                 if (drp->dr_state == NFS4_DUP_REPLAY)
 107                         rfs4_compound_free(&(drp->dr_res));
 108 
 109                 if (drp->dr_addr.buf != NULL)
 110                         kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
 111 
 112                 drp_next = list_next(&(drc->dr_cache), drp);
 113 
 114                 kmem_free(drp, sizeof (rfs4_dupreq_t));
 115         }
 116 
 117         mutex_destroy(&drc->lock);
 118         kmem_free(drc->dr_buckets,
 119             sizeof (list_t)*drc->dr_hash);
 120         kmem_free(drc, sizeof (rfs4_drc_t));
 121 }
 122 
 123 /*
 124  * rfs4_dr_chstate:
 125  *
 126  * Change the state of a rfs4_dupreq. If it's not in transition
 127  * to the FREE state, return. If we are moving to the FREE state
 128  * then we need to clean up the compound results and move the entry
 129  * to the end of the list.
 130  */
 131 void
 132 rfs4_dr_chstate(rfs4_dupreq_t *drp, int new_state)
 133 {
 134         rfs4_drc_t *drc;
 135 
 136         ASSERT(drp);
 137         ASSERT(drp->drc);
 138         ASSERT(drp->dr_bkt);
 139         ASSERT(MUTEX_HELD(&drp->drc->lock));
 140 
 141         drp->dr_state = new_state;
 142 
 143         if (new_state != NFS4_DUP_FREE)
 144                 return;
 145 
 146         drc = drp->drc;
 147 
 148         /*
 149          * Remove entry from the bucket and
 150          * dr_cache list, free compound results.
 151          */
 152         list_remove(drp->dr_bkt, drp);
 153         list_remove(&(drc->dr_cache), drp);
 154         rfs4_compound_free(&(drp->dr_res));
 155 }
 156 
 157 /*
 158  * rfs4_alloc_dr:
 159  *
 160  * Malloc a new one if we have not reached our maximum cache
 161  * limit, otherwise pick an entry off the tail -- Use if it
 162  * is marked as NFS4_DUP_FREE, or is an entry in the
 163  * NFS4_DUP_REPLAY state.
 164  */
 165 rfs4_dupreq_t *
 166 rfs4_alloc_dr(rfs4_drc_t *drc)
 167 {
 168         rfs4_dupreq_t *drp_tail, *drp = NULL;
 169 
 170         ASSERT(drc);
 171         ASSERT(MUTEX_HELD(&drc->lock));
 172 
 173         /*
 174          * Have we hit the cache limit yet ?
 175          */
 176         if (drc->in_use < drc->max_size) {
 177                 /*
 178                  * nope, so let's malloc a new one
 179                  */
 180                 drp = kmem_zalloc(sizeof (rfs4_dupreq_t), KM_SLEEP);
 181                 drp->drc = drc;
 182                 drc->in_use++;
 183                 DTRACE_PROBE1(nfss__i__drc_new, rfs4_dupreq_t *, drp);
 184                 return (drp);
 185         }
 186 
 187         /*
 188          * Cache is all allocated now traverse the list
 189          * backwards to find one we can reuse.
 190          */
 191         for (drp_tail = list_tail(&drc->dr_cache); drp_tail != NULL;
 192             drp_tail = list_prev(&drc->dr_cache, drp_tail)) {
 193 
 194                 switch (drp_tail->dr_state) {
 195 
 196                 case NFS4_DUP_FREE:
 197                         list_remove(&(drc->dr_cache), drp_tail);
 198                         DTRACE_PROBE1(nfss__i__drc_freeclaim,
 199                             rfs4_dupreq_t *, drp_tail);
 200                         return (drp_tail);
 201                         /* NOTREACHED */
 202 
 203                 case NFS4_DUP_REPLAY:
 204                         /* grab it. */
 205                         rfs4_dr_chstate(drp_tail, NFS4_DUP_FREE);
 206                         DTRACE_PROBE1(nfss__i__drc_replayclaim,
 207                             rfs4_dupreq_t *, drp_tail);
 208                         return (drp_tail);
 209                         /* NOTREACHED */
 210                 }
 211         }
 212         DTRACE_PROBE1(nfss__i__drc_full, rfs4_drc_t *, drc);
 213         return (NULL);
 214 }
 215 
 216 /*
 217  * rfs4_find_dr:
 218  *
 219  * Search for an entry in the duplicate request cache by
 220  * calculating the hash index based on the XID, and examining
 221  * the entries in the hash bucket. If we find a match, return.
 222  * Once we have searched the bucket we call rfs4_alloc_dr() to
 223  * allocate a new entry, or reuse one that is available.
 224  */
 225 int
 226 rfs4_find_dr(struct svc_req *req, rfs4_drc_t *drc, rfs4_dupreq_t **dup)
 227 {
 228 
 229         uint32_t        the_xid;
 230         list_t          *dr_bkt;
 231         rfs4_dupreq_t   *drp;
 232         int             bktdex;
 233 
 234         /*
 235          * Get the XID, calculate the bucket and search to
 236          * see if we need to replay from the cache.
 237          */
 238         the_xid = req->rq_xprt->xp_xid;
 239         bktdex = the_xid % drc->dr_hash;
 240 
 241         dr_bkt = (list_t *)
 242             &(drc->dr_buckets[(the_xid % drc->dr_hash)]);
 243 
 244         DTRACE_PROBE3(nfss__i__drc_bktdex,
 245             int, bktdex,
 246             uint32_t, the_xid,
 247             list_t *, dr_bkt);
 248 
 249         *dup = NULL;
 250 
 251         mutex_enter(&drc->lock);
 252         /*
 253          * Search the bucket for a matching xid and address.
 254          */
 255         for (drp = list_head(dr_bkt); drp != NULL;
 256             drp = list_next(dr_bkt, drp)) {
 257 
 258                 if (drp->dr_xid == the_xid &&
 259                     drp->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
 260                     bcmp((caddr_t)drp->dr_addr.buf,
 261                     (caddr_t)req->rq_xprt->xp_rtaddr.buf,
 262                     drp->dr_addr.len) == 0) {
 263 
 264                         /*
 265                          * Found a match so REPLAY the Reply
 266                          */
 267                         if (drp->dr_state == NFS4_DUP_REPLAY) {
 268                                 rfs4_dr_chstate(drp, NFS4_DUP_INUSE);
 269                                 mutex_exit(&drc->lock);
 270                                 *dup = drp;
 271                                 DTRACE_PROBE1(nfss__i__drc_replay,
 272                                     rfs4_dupreq_t *, drp);
 273                                 return (NFS4_DUP_REPLAY);
 274                         }
 275 
 276                         /*
 277                          * This entry must be in transition, so return
 278                          * the 'pending' status.
 279                          */
 280                         mutex_exit(&drc->lock);
 281                         return (NFS4_DUP_PENDING);
 282                 }
 283         }
 284 
 285         drp = rfs4_alloc_dr(drc);
 286         mutex_exit(&drc->lock);
 287 
 288         /*
 289          * The DRC is full and all entries are in use. Upper function
 290          * should error out this request and force the client to
 291          * retransmit -- effectively this is a resource issue. NFSD
 292          * threads tied up with native File System, or the cache size
 293          * is too small for the server load.
 294          */
 295         if (drp == NULL)
 296                 return (NFS4_DUP_ERROR);
 297 
 298         /*
 299          * Init the state to NEW.
 300          */
 301         drp->dr_state = NFS4_DUP_NEW;
 302 
 303         /*
 304          * If needed, resize the address buffer
 305          */
 306         if (drp->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
 307                 if (drp->dr_addr.buf != NULL)
 308                         kmem_free(drp->dr_addr.buf, drp->dr_addr.maxlen);
 309                 drp->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
 310                 drp->dr_addr.buf = kmem_alloc(drp->dr_addr.maxlen, KM_NOSLEEP);
 311                 if (drp->dr_addr.buf == NULL) {
 312                         /*
 313                          * If the malloc fails, mark the entry
 314                          * as free and put on the tail.
 315                          */
 316                         drp->dr_addr.maxlen = 0;
 317                         drp->dr_state = NFS4_DUP_FREE;
 318                         mutex_enter(&drc->lock);
 319                         list_insert_tail(&(drc->dr_cache), drp);
 320                         mutex_exit(&drc->lock);
 321                         return (NFS4_DUP_ERROR);
 322                 }
 323         }
 324 
 325 
 326         /*
 327          * Copy the address.
 328          */
 329         drp->dr_addr.len = req->rq_xprt->xp_rtaddr.len;
 330 
 331         bcopy((caddr_t)req->rq_xprt->xp_rtaddr.buf,
 332             (caddr_t)drp->dr_addr.buf,
 333             drp->dr_addr.len);
 334 
 335         drp->dr_xid = the_xid;
 336         drp->dr_bkt = dr_bkt;
 337 
 338         /*
 339          * Insert at the head of the bucket and
 340          * the drc lists..
 341          */
 342         mutex_enter(&drc->lock);
 343         list_insert_head(&drc->dr_cache, drp);
 344         list_insert_head(dr_bkt, drp);
 345         mutex_exit(&drc->lock);
 346 
 347         *dup = drp;
 348 
 349         return (NFS4_DUP_NEW);
 350 }
 351 
 352 /*
 353  *
 354  * This function handles the duplicate request cache,
 355  * NULL_PROC and COMPOUND procedure calls for NFSv4;
 356  *
 357  * Passed into this function are:-
 358  *
 359  *      disp    A pointer to our dispatch table entry
 360  *      req     The request to process
 361  *      xprt    The server transport handle
 362  *      ap      A pointer to the arguments
 363  *
 364  *
 365  * When appropriate this function is responsible for inserting
 366  * the reply into the duplicate cache or replaying an existing
 367  * cached reply.
 368  *
 369  * dr_stat      reflects the state of the duplicate request that
 370  *              has been inserted into or retrieved from the cache
 371  *
 372  * drp          is the duplicate request entry
 373  *
 374  */
 375 int
 376 rfs4_dispatch(struct rpcdisp *disp, struct svc_req *req,
 377                 SVCXPRT *xprt, char *ap)
 378 {
 379 
 380         COMPOUND4res     res_buf;
 381         COMPOUND4res    *rbp;
 382         COMPOUND4args   *cap;
 383         cred_t          *cr = NULL;
 384         int              error = 0;
 385         int              dis_flags = 0;
 386         int              dr_stat = NFS4_NOT_DUP;
 387         rfs4_dupreq_t   *drp = NULL;
 388         int              rv;
 389 
 390         ASSERT(disp);
 391 
 392         /*
 393          * Short circuit the RPC_NULL proc.
 394          */
 395         if (disp->dis_proc == rpc_null) {
 396                 DTRACE_NFSV4_1(null__start, struct svc_req *, req);
 397                 if (!svc_sendreply(xprt, xdr_void, NULL)) {
 398                         DTRACE_NFSV4_1(null__done, struct svc_req *, req);
 399                         svcerr_systemerr(xprt);
 400                         return (1);
 401                 }
 402                 DTRACE_NFSV4_1(null__done, struct svc_req *, req);
 403                 return (0);
 404         }
 405 
 406         /* Only NFSv4 Compounds from this point onward */
 407 
 408         rbp = &res_buf;
 409         cap = (COMPOUND4args *)ap;
 410 
 411         /*
 412          * Figure out the disposition of the whole COMPOUND
 413          * and record it's IDEMPOTENTCY.
 414          */
 415         rfs4_compound_flagproc(cap, &dis_flags);
 416 
 417         /*
 418          * If NON-IDEMPOTENT then we need to figure out if this
 419          * request can be replied from the duplicate cache.
 420          *
 421          * If this is a new request then we need to insert the
 422          * reply into the duplicate cache.
 423          */
 424         if (!(dis_flags & RPC_IDEMPOTENT)) {
 425                 /* look for a replay from the cache or allocate */
 426                 dr_stat = rfs4_find_dr(req, nfs4_drc, &drp);
 427 
 428                 switch (dr_stat) {
 429 
 430                 case NFS4_DUP_ERROR:
 431                         rfs4_resource_err(req, cap);
 432                         return (1);
 433                         /* NOTREACHED */
 434 
 435                 case NFS4_DUP_PENDING:
 436                         /*
 437                          * reply has previously been inserted into the
 438                          * duplicate cache, however the reply has
 439                          * not yet been sent via svc_sendreply()
 440                          */
 441                         return (1);
 442                         /* NOTREACHED */
 443 
 444                 case NFS4_DUP_NEW:
 445                         curthread->t_flag |= T_DONTPEND;
 446                         /* NON-IDEMPOTENT proc call */
 447                         rfs4_compound(cap, rbp, NULL, req, cr, &rv);
 448                         curthread->t_flag &= ~T_DONTPEND;
 449 
 450                         if (rv)         /* short ckt sendreply on error */
 451                                 return (rv);
 452 
 453                         /*
 454                          * dr_res must be initialized before calling
 455                          * rfs4_dr_chstate (it frees the reply).
 456                          */
 457                         drp->dr_res = res_buf;
 458                         if (curthread->t_flag & T_WOULDBLOCK) {
 459                                 curthread->t_flag &= ~T_WOULDBLOCK;
 460                                 /*
 461                                  * mark this entry as FREE and plop
 462                                  * on the end of the cache list
 463                                  */
 464                                 mutex_enter(&drp->drc->lock);
 465                                 rfs4_dr_chstate(drp, NFS4_DUP_FREE);
 466                                 list_insert_tail(&(drp->drc->dr_cache), drp);
 467                                 mutex_exit(&drp->drc->lock);
 468                                 return (1);
 469                         }
 470                         break;
 471 
 472                 case NFS4_DUP_REPLAY:
 473                         /* replay from the cache */
 474                         rbp = &(drp->dr_res);
 475                         break;
 476                 }
 477         } else {
 478                 curthread->t_flag |= T_DONTPEND;
 479                 /* IDEMPOTENT proc call */
 480                 rfs4_compound(cap, rbp, NULL, req, cr, &rv);
 481                 curthread->t_flag &= ~T_DONTPEND;
 482 
 483                 if (rv)         /* short ckt sendreply on error */
 484                         return (rv);
 485 
 486                 if (curthread->t_flag & T_WOULDBLOCK) {
 487                         curthread->t_flag &= ~T_WOULDBLOCK;
 488                         return (1);
 489                 }
 490         }
 491 
 492         /*
 493          * Send out the replayed reply or the 'real' one.
 494          */
 495         if (!svc_sendreply(xprt,  xdr_COMPOUND4res_srv, (char *)rbp)) {
 496                 DTRACE_PROBE2(nfss__e__dispatch_sendfail,
 497                     struct svc_req *, xprt,
 498                     char *, rbp);
 499                 svcerr_systemerr(xprt);
 500                 error++;
 501         }
 502 
 503         /*
 504          * If this reply was just inserted into the duplicate cache
 505          * or it was replayed from the dup cache; (re)mark it as
 506          * available for replay
 507          *
 508          * At first glance, this 'if' statement seems a little strange;
 509          * testing for NFS4_DUP_REPLAY, and then calling...
 510          *
 511          *      rfs4_dr_chatate(NFS4_DUP_REPLAY)
 512          *
 513          * ... but notice that we are checking dr_stat, and not the
 514          * state of the entry itself, the entry will be NFS4_DUP_INUSE,
 515          * we do that so that we know not to prematurely reap it whilst
 516          * we resent it to the client.
 517          *
 518          */
 519         if (dr_stat == NFS4_DUP_NEW || dr_stat == NFS4_DUP_REPLAY) {
 520                 mutex_enter(&drp->drc->lock);
 521                 rfs4_dr_chstate(drp, NFS4_DUP_REPLAY);
 522                 mutex_exit(&drp->drc->lock);
 523         } else if (dr_stat == NFS4_NOT_DUP) {
 524                 rfs4_compound_free(rbp);
 525         }
 526 
 527         return (error);
 528 }
 529 
 530 bool_t
 531 rfs4_minorvers_mismatch(struct svc_req *req, SVCXPRT *xprt, void *args)
 532 {
 533         COMPOUND4args *argsp;
 534         COMPOUND4res res_buf, *resp;
 535 
 536         if (req->rq_vers != 4)
 537                 return (FALSE);
 538 
 539         argsp = (COMPOUND4args *)args;
 540 
 541         if (argsp->minorversion <= NFS4_MAX_MINOR_VERSION)
 542                 return (FALSE);
 543 
 544         resp = &res_buf;
 545 
 546         /*
 547          * Form a reply tag by copying over the reqeuest tag.
 548          */
 549         resp->tag.utf8string_val =
 550             kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
 551         resp->tag.utf8string_len = argsp->tag.utf8string_len;
 552         bcopy(argsp->tag.utf8string_val, resp->tag.utf8string_val,
 553             resp->tag.utf8string_len);
 554         resp->array_len = 0;
 555         resp->array = NULL;
 556         resp->status = NFS4ERR_MINOR_VERS_MISMATCH;
 557         if (!svc_sendreply(xprt,  xdr_COMPOUND4res_srv, (char *)resp)) {
 558                 DTRACE_PROBE2(nfss__e__minorvers_mismatch,
 559                     SVCXPRT *, xprt, char *, resp);
 560                 svcerr_systemerr(xprt);
 561         }
 562         rfs4_compound_free(resp);
 563         return (TRUE);
 564 }
 565 
 566 void
 567 rfs4_resource_err(struct svc_req *req, COMPOUND4args *argsp)
 568 {
 569         COMPOUND4res res_buf, *rbp;
 570         nfs_resop4 *resop;
 571         PUTFH4res *resp;
 572 
 573         rbp = &res_buf;
 574 
 575         /*
 576          * Form a reply tag by copying over the request tag.
 577          */
 578         rbp->tag.utf8string_val =
 579             kmem_alloc(argsp->tag.utf8string_len, KM_SLEEP);
 580         rbp->tag.utf8string_len = argsp->tag.utf8string_len;
 581         bcopy(argsp->tag.utf8string_val, rbp->tag.utf8string_val,
 582             rbp->tag.utf8string_len);
 583 
 584         rbp->array_len = 1;
 585         rbp->array = kmem_zalloc(rbp->array_len * sizeof (nfs_resop4),
 586             KM_SLEEP);
 587         resop = &rbp->array[0];
 588         resop->resop = argsp->array[0].argop;     /* copy first op over */
 589 
 590         /* Any op will do, just need to access status field */
 591         resp = &resop->nfs_resop4_u.opputfh;
 592 
 593         /*
 594          * NFS4ERR_RESOURCE is allowed for all ops, except OP_ILLEGAL.
 595          * Note that all op numbers in the compound array were already
 596          * validated by the XDR decoder (xdr_COMPOUND4args_srv()).
 597          */
 598         resp->status = (resop->resop == OP_ILLEGAL ?
 599             NFS4ERR_OP_ILLEGAL : NFS4ERR_RESOURCE);
 600 
 601         /* compound status is same as first op status */
 602         rbp->status = resp->status;
 603 
 604         if (!svc_sendreply(req->rq_xprt, xdr_COMPOUND4res_srv, (char *)rbp)) {
 605                 DTRACE_PROBE2(nfss__rsrc_err__sendfail,
 606                     struct svc_req *, req->rq_xprt, char *, rbp);
 607                 svcerr_systemerr(req->rq_xprt);
 608         }
 609 
 610         UTF8STRING_FREE(rbp->tag);
 611         kmem_free(rbp->array, rbp->array_len * sizeof (nfs_resop4));
 612 }