1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  24  *  Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
  25  */
  26 
  27 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
  28 /*        All Rights Reserved   */
  29 
  30 /*
  31  * Portions of this source code were derived from Berkeley 4.3 BSD
  32  * under license from the Regents of the University of California.
  33  */
  34 
  35 /*
  36  * svc_cots.c
  37  * Server side for connection-oriented RPC in the kernel.
  38  *
  39  */
  40 
  41 #include <sys/param.h>
  42 #include <sys/types.h>
  43 #include <sys/sysmacros.h>
  44 #include <sys/file.h>
  45 #include <sys/stream.h>
  46 #include <sys/strsubr.h>
  47 #include <sys/strsun.h>
  48 #include <sys/stropts.h>
  49 #include <sys/tiuser.h>
  50 #include <sys/timod.h>
  51 #include <sys/tihdr.h>
  52 #include <sys/fcntl.h>
  53 #include <sys/errno.h>
  54 #include <sys/kmem.h>
  55 #include <sys/systm.h>
  56 #include <sys/debug.h>
  57 #include <sys/cmn_err.h>
  58 #include <sys/kstat.h>
  59 #include <sys/vtrace.h>
  60 
  61 #include <rpc/types.h>
  62 #include <rpc/xdr.h>
  63 #include <rpc/auth.h>
  64 #include <rpc/rpc_msg.h>
  65 #include <rpc/svc.h>
  66 #include <inet/ip.h>
  67 
/*
 * Sizing constants for reply messages.
 * COTS_MAX_ALLOCSIZE is the xp_msg_size svc_cots_kcreate() falls back to
 * when the transport's TIDU size is non-positive or larger than this.
 * MSG_OFFSET and RM_HDR_SIZE are the room reserved at the front of a
 * freshly allocated reply mblk before any RPC data is encoded.
 */
#define COTS_MAX_ALLOCSIZE      2048
#define MSG_OFFSET              128     /* offset of call into the mblk */
#define RM_HDR_SIZE             4       /* record mark header size */
  71 
/*
 * Routines exported through ops vector.
 * Each is static to this file and is installed in svc_cots_op.
 */
static bool_t           svc_cots_krecv(SVCXPRT *, mblk_t *, struct rpc_msg *);
static bool_t           svc_cots_ksend(SVCXPRT *, struct rpc_msg *);
static bool_t           svc_cots_kgetargs(SVCXPRT *, xdrproc_t, caddr_t);
static bool_t           svc_cots_kfreeargs(SVCXPRT *, xdrproc_t, caddr_t);
static void             svc_cots_kdestroy(SVCMASTERXPRT *);
static int              svc_cots_kdup(struct svc_req *, caddr_t, int,
                                struct dupreq **, bool_t *);
static void             svc_cots_kdupdone(struct dupreq *, caddr_t,
                                void (*)(), int, int);
static int32_t          *svc_cots_kgetres(SVCXPRT *, int);
static void             svc_cots_kfreeres(SVCXPRT *);
static void             svc_cots_kclone_destroy(SVCXPRT *);
static void             svc_cots_kstart(SVCMASTERXPRT *);
static void             svc_cots_ktattrs(SVCXPRT *, int, void **);
  89 
/*
 * Server transport operations vector.
 * svc_cots_kcreate() installs this as xp_ops on every master transport
 * it creates.  The entries are positional, so their order must match
 * the declaration of struct svc_ops.
 */
struct svc_ops svc_cots_op = {
        svc_cots_krecv,         /* Get requests */
        svc_cots_kgetargs,      /* Deserialize arguments */
        svc_cots_ksend,         /* Send reply */
        svc_cots_kfreeargs,     /* Free argument data space */
        svc_cots_kdestroy,      /* Destroy transport handle */
        svc_cots_kdup,          /* Check entry in dup req cache */
        svc_cots_kdupdone,      /* Mark entry in dup req cache as done */
        svc_cots_kgetres,       /* Get pointer to response buffer */
        svc_cots_kfreeres,      /* Destroy pre-serialized response header */
        svc_cots_kclone_destroy, /* Destroy a clone xprt */
        svc_cots_kstart,        /* Tell `ready-to-receive' to rpcmod */
        NULL,                   /* Transport specific clone xprt (none) */
        svc_cots_ktattrs        /* Transport Attributes */
};
 108 
/*
 * Master transport private data.
 * Kept in xprt->xp_p2.
 *
 * svc_cots_kcreate() allocates this structure with a struct T_addr_ack
 * and room for two sin6_t addresses appended to it; svc_cots_kdestroy()
 * frees the same combined size.
 */
struct cots_master_data {
        char    *cmd_src_addr;  /* client's address (same as xp_rtaddr.buf) */
        int     cmd_xprt_started; /* nonzero once svc_cots_kstart() has */
                                /* called rpcmod's start routine. */
        struct rpc_cots_server *cmd_stats;      /* stats for zone */
};
 119 
/*
 * Transport private data.
 * Kept in clone_xprt->xp_p2buf.
 *
 * cd_mp is set by svc_cots_kgetres() and consumed by svc_cots_ksend()
 * or released by svc_cots_kfreeres().  cd_req_mp is set by
 * svc_cots_krecv() and released by svc_cots_kfreeargs() or
 * svc_cots_kclone_destroy().
 */
typedef struct cots_data {
        mblk_t  *cd_mp;         /* pre-allocated reply message */
        mblk_t  *cd_req_mp;     /* request message */
} cots_data_t;
 128 
/*
 * Server statistics
 * NOTE: This structure type is duplicated in the NFS fast path.
 *
 * cots_rsstat_tmpl supplies the kstat name and data type (always
 * KSTAT_DATA_UINT64) for each counter.  The counters actually updated
 * by the code are reached through cmd_stats in the master transport's
 * private data.
 * NOTE(review): no update site for rsnullrecv or rsbadlen is visible in
 * this part of the file - confirm whether they are still maintained.
 */
static const struct rpc_cots_server {
        kstat_named_t   rscalls;        /* requests given to svc_cots_krecv */
        kstat_named_t   rsbadcalls;     /* requests svc_cots_krecv rejected */
        kstat_named_t   rsnullrecv;
        kstat_named_t   rsbadlen;
        kstat_named_t   rsxdrcall;      /* xdr_callmsg() decode failures */
        kstat_named_t   rsdupchecks;    /* dup request cache lookups */
        kstat_named_t   rsdupreqs;      /* lookups that found a duplicate */
} cots_rsstat_tmpl = {
        { "calls",      KSTAT_DATA_UINT64 },
        { "badcalls",   KSTAT_DATA_UINT64 },
        { "nullrecv",   KSTAT_DATA_UINT64 },
        { "badlen",     KSTAT_DATA_UINT64 },
        { "xdrcall",    KSTAT_DATA_UINT64 },
        { "dupchecks",  KSTAT_DATA_UINT64 },
        { "dupreqs",    KSTAT_DATA_UINT64 }
};
 150 
/*
 * CLONE2STATS(clone_xprt): fetch the statistics pointer (cmd_stats) from
 * the private data of the clone transport's master transport.
 * RSSTAT_INCR(s, x): atomically increment the 64-bit counter named x in
 * the statistics structure pointed to by s.
 */
#define CLONE2STATS(clone_xprt) \
        ((struct cots_master_data *)(clone_xprt)->xp_master->xp_p2)->cmd_stats
#define RSSTAT_INCR(s, x)       \
        atomic_inc_64(&(s)->x.value.ui64)
 155 
/*
 * Pointer to a transport specific `ready to receive' function in rpcmod
 * (set from rpcmod).  svc_cots_kstart() calls it with the master
 * transport's write queue.
 */
void    (*mir_start)(queue_t *);

/*
 * Pointer to the message size sanity limit kept in rpcmod; it may be
 * NULL.  svc_cots_kcreate() raises the pointed-to value when a transport
 * is created with a larger max_msgsize.
 * NOTE(review): presumably set from rpcmod like mir_start - confirm.
 */
uint_t  *svc_max_msg_sizep;
 162 
/*
 * The address size of the underlying transport can sometimes be
 * unknown (tinfo->ADDR_size == -1).  For this case, it is
 * necessary to figure out what the size is so the correct amount
 * of data is allocated.  This is an iterative process:
 *      1. take a good guess (use T_MINADDRSIZE)
 *      2. try it.
 *      3. if it works then everything is ok
 *      4. if the error is ENAMETOOLONG, double the guess
 *      5. go back to step 2.
 */
#define T_UNKNOWNADDRSIZE       (-1)
#define T_MINADDRSIZE   32
 176 
 177 /*
 178  * Create a transport record.
 179  * The transport record, output buffer, and private data structure
 180  * are allocated.  The output buffer is serialized into using xdrmem.
 181  * There is one transport record per user process which implements a
 182  * set of services.
 183  */
 184 static kmutex_t cots_kcreate_lock;
 185 
 186 int
 187 svc_cots_kcreate(file_t *fp, uint_t max_msgsize, struct T_info_ack *tinfo,
 188     SVCMASTERXPRT **nxprt)
 189 {
 190         struct cots_master_data *cmd;
 191         int err, retval;
 192         SVCMASTERXPRT *xprt;
 193         struct rpcstat *rpcstat;
 194         struct T_addr_ack *ack_p;
 195         struct strioctl getaddr;
 196 
 197         if (nxprt == NULL)
 198                 return (EINVAL);
 199 
 200         rpcstat = zone_getspecific(rpcstat_zone_key, curproc->p_zone);
 201         ASSERT(rpcstat != NULL);
 202 
 203         xprt = kmem_zalloc(sizeof (SVCMASTERXPRT), KM_SLEEP);
 204 
 205         cmd = kmem_zalloc(sizeof (*cmd) + sizeof (*ack_p)
 206             + (2 * sizeof (sin6_t)), KM_SLEEP);
 207 
 208         ack_p = (struct T_addr_ack *)&cmd[1];
 209 
 210         if ((tinfo->TIDU_size > COTS_MAX_ALLOCSIZE) ||
 211             (tinfo->TIDU_size <= 0))
 212                 xprt->xp_msg_size = COTS_MAX_ALLOCSIZE;
 213         else {
 214                 xprt->xp_msg_size = tinfo->TIDU_size -
 215                     (tinfo->TIDU_size % BYTES_PER_XDR_UNIT);
 216         }
 217 
 218         xprt->xp_ops = &svc_cots_op;
 219         xprt->xp_p2 = (caddr_t)cmd;
 220         cmd->cmd_xprt_started = 0;
 221         cmd->cmd_stats = rpcstat->rpc_cots_server;
 222 
 223         getaddr.ic_cmd = TI_GETINFO;
 224         getaddr.ic_timout = -1;
 225         getaddr.ic_len = sizeof (*ack_p) + (2 * sizeof (sin6_t));
 226         getaddr.ic_dp = (char *)ack_p;
 227         ack_p->PRIM_type = T_ADDR_REQ;
 228 
 229         err = strioctl(fp->f_vnode, I_STR, (intptr_t)&getaddr,
 230             0, K_TO_K, CRED(), &retval);
 231         if (err) {
 232                 kmem_free(cmd, sizeof (*cmd) + sizeof (*ack_p) +
 233                     (2 * sizeof (sin6_t)));
 234                 kmem_free(xprt, sizeof (SVCMASTERXPRT));
 235                 return (err);
 236         }
 237 
 238         xprt->xp_rtaddr.maxlen = ack_p->REMADDR_length;
 239         xprt->xp_rtaddr.len = ack_p->REMADDR_length;
 240         cmd->cmd_src_addr = xprt->xp_rtaddr.buf =
 241             (char *)ack_p + ack_p->REMADDR_offset;
 242 
 243         xprt->xp_lcladdr.maxlen = ack_p->LOCADDR_length;
 244         xprt->xp_lcladdr.len = ack_p->LOCADDR_length;
 245         xprt->xp_lcladdr.buf = (char *)ack_p + ack_p->LOCADDR_offset;
 246 
 247         /*
 248          * If the current sanity check size in rpcmod is smaller
 249          * than the size needed for this xprt, then increase
 250          * the sanity check.
 251          */
 252         if (max_msgsize != 0 && svc_max_msg_sizep &&
 253             max_msgsize > *svc_max_msg_sizep) {
 254 
 255                 /* This check needs a lock */
 256                 mutex_enter(&cots_kcreate_lock);
 257                 if (svc_max_msg_sizep && max_msgsize > *svc_max_msg_sizep)
 258                         *svc_max_msg_sizep = max_msgsize;
 259                 mutex_exit(&cots_kcreate_lock);
 260         }
 261 
 262         *nxprt = xprt;
 263 
 264         return (0);
 265 }
 266 
 267 /*
 268  * Destroy a master transport record.
 269  * Frees the space allocated for a transport record.
 270  */
 271 static void
 272 svc_cots_kdestroy(SVCMASTERXPRT *xprt)
 273 {
 274         struct cots_master_data *cmd = (struct cots_master_data *)xprt->xp_p2;
 275 
 276         ASSERT(cmd);
 277 
 278         if (xprt->xp_netid)
 279                 kmem_free(xprt->xp_netid, strlen(xprt->xp_netid) + 1);
 280         if (xprt->xp_addrmask.maxlen)
 281                 kmem_free(xprt->xp_addrmask.buf, xprt->xp_addrmask.maxlen);
 282 
 283         mutex_destroy(&xprt->xp_req_lock);
 284         mutex_destroy(&xprt->xp_thread_lock);
 285 
 286         kmem_free(cmd, sizeof (*cmd) + sizeof (struct T_addr_ack) +
 287             (2 * sizeof (sin6_t)));
 288 
 289         kmem_free(xprt, sizeof (SVCMASTERXPRT));
 290 }
 291 
 292 /*
 293  * svc_tli_kcreate() calls this function at the end to tell
 294  * rpcmod that the transport is ready to receive requests.
 295  */
 296 static void
 297 svc_cots_kstart(SVCMASTERXPRT *xprt)
 298 {
 299         struct cots_master_data *cmd = (struct cots_master_data *)xprt->xp_p2;
 300 
 301         if (cmd->cmd_xprt_started == 0) {
 302                 /*
 303                  * Acquire the xp_req_lock in order to use xp_wq
 304                  * safely (we don't want to qenable a queue that has
 305                  * already been closed).
 306                  */
 307                 mutex_enter(&xprt->xp_req_lock);
 308                 if (cmd->cmd_xprt_started == 0 &&
 309                     xprt->xp_wq != NULL) {
 310                         (*mir_start)(xprt->xp_wq);
 311                         cmd->cmd_xprt_started = 1;
 312                 }
 313                 mutex_exit(&xprt->xp_req_lock);
 314         }
 315 }
 316 
 317 /*
 318  * Transport-type specific part of svc_xprt_cleanup().
 319  */
 320 static void
 321 svc_cots_kclone_destroy(SVCXPRT *clone_xprt)
 322 {
 323         cots_data_t *cd = (cots_data_t *)clone_xprt->xp_p2buf;
 324 
 325         if (cd->cd_req_mp) {
 326                 freemsg(cd->cd_req_mp);
 327                 cd->cd_req_mp = (mblk_t *)0;
 328         }
 329         ASSERT(cd->cd_mp == NULL);
 330 }
 331 
 332 /*
 333  * Transport Attributes.
 334  */
 335 static void
 336 svc_cots_ktattrs(SVCXPRT *clone_xprt, int attrflag, void **tattr)
 337 {
 338         *tattr = NULL;
 339 
 340         switch (attrflag) {
 341         case SVC_TATTR_ADDRMASK:
 342                 *tattr = (void *)&clone_xprt->xp_master->xp_addrmask;
 343         }
 344 }
 345 
 346 /*
 347  * Receive rpc requests.
 348  * Checks if the message is intact, and deserializes the call packet.
 349  */
 350 static bool_t
 351 svc_cots_krecv(SVCXPRT *clone_xprt, mblk_t *mp, struct rpc_msg *msg)
 352 {
 353         cots_data_t *cd = (cots_data_t *)clone_xprt->xp_p2buf;
 354         XDR *xdrs = &clone_xprt->xp_xdrin;
 355         struct rpc_cots_server *stats = CLONE2STATS(clone_xprt);
 356 
 357         TRACE_0(TR_FAC_KRPC, TR_SVC_COTS_KRECV_START,
 358             "svc_cots_krecv_start:");
 359         RPCLOG(4, "svc_cots_krecv_start clone_xprt = %p:\n",
 360             (void *)clone_xprt);
 361 
 362         RSSTAT_INCR(stats, rscalls);
 363 
 364         if (mp->b_datap->db_type != M_DATA) {
 365                 RPCLOG(16, "svc_cots_krecv bad db_type %d\n",
 366                     mp->b_datap->db_type);
 367                 goto bad;
 368         }
 369 
 370         xdrmblk_init(xdrs, mp, XDR_DECODE, 0);
 371 
 372         TRACE_0(TR_FAC_KRPC, TR_XDR_CALLMSG_START,
 373             "xdr_callmsg_start:");
 374         RPCLOG0(4, "xdr_callmsg_start:\n");
 375         if (!xdr_callmsg(xdrs, msg)) {
 376                 XDR_DESTROY(xdrs);
 377                 TRACE_1(TR_FAC_KRPC, TR_XDR_CALLMSG_END,
 378                     "xdr_callmsg_end:(%S)", "bad");
 379                 RPCLOG0(1, "svc_cots_krecv xdr_callmsg failure\n");
 380                 RSSTAT_INCR(stats, rsxdrcall);
 381                 goto bad;
 382         }
 383         TRACE_1(TR_FAC_KRPC, TR_XDR_CALLMSG_END,
 384             "xdr_callmsg_end:(%S)", "good");
 385 
 386         clone_xprt->xp_xid = msg->rm_xid;
 387         cd->cd_req_mp = mp;
 388 
 389         TRACE_1(TR_FAC_KRPC, TR_SVC_COTS_KRECV_END,
 390             "svc_cots_krecv_end:(%S)", "good");
 391         RPCLOG0(4, "svc_cots_krecv_end:good\n");
 392         return (TRUE);
 393 
 394 bad:
 395         if (mp)
 396                 freemsg(mp);
 397 
 398         RSSTAT_INCR(stats, rsbadcalls);
 399         TRACE_1(TR_FAC_KRPC, TR_SVC_COTS_KRECV_END,
 400             "svc_cots_krecv_end:(%S)", "bad");
 401         return (FALSE);
 402 }
 403 
 404 /*
 405  * Send rpc reply.
 406  */
 407 static bool_t
 408 svc_cots_ksend(SVCXPRT *clone_xprt, struct rpc_msg *msg)
 409 {
 410         /* LINTED pointer alignment */
 411         cots_data_t *cd = (cots_data_t *)clone_xprt->xp_p2buf;
 412         XDR *xdrs = &(clone_xprt->xp_xdrout);
 413         int retval = FALSE;
 414         mblk_t *mp;
 415         xdrproc_t xdr_results;
 416         caddr_t xdr_location;
 417         bool_t has_args;
 418 
 419         TRACE_0(TR_FAC_KRPC, TR_SVC_COTS_KSEND_START,
 420             "svc_cots_ksend_start:");
 421 
 422         /*
 423          * If there is a result procedure specified in the reply message,
 424          * it will be processed in the xdr_replymsg and SVCAUTH_WRAP.
 425          * We need to make sure it won't be processed twice, so we null
 426          * it for xdr_replymsg here.
 427          */
 428         has_args = FALSE;
 429         if (msg->rm_reply.rp_stat == MSG_ACCEPTED &&
 430             msg->rm_reply.rp_acpt.ar_stat == SUCCESS) {
 431                 if ((xdr_results = msg->acpted_rply.ar_results.proc) != NULL) {
 432                         has_args = TRUE;
 433                         xdr_location = msg->acpted_rply.ar_results.where;
 434                         msg->acpted_rply.ar_results.proc = xdr_void;
 435                         msg->acpted_rply.ar_results.where = NULL;
 436                 }
 437         }
 438 
 439         mp = cd->cd_mp;
 440         if (mp) {
 441                 /*
 442                  * The program above pre-allocated an mblk and put
 443                  * the data in place.
 444                  */
 445                 cd->cd_mp = (mblk_t *)NULL;
 446                 if (!(xdr_replymsg_body(xdrs, msg) &&
 447                     (!has_args || SVCAUTH_WRAP(&clone_xprt->xp_auth, xdrs,
 448                     xdr_results, xdr_location)))) {
 449                         XDR_DESTROY(xdrs);
 450                         RPCLOG0(1, "svc_cots_ksend: "
 451                             "xdr_replymsg_body/SVCAUTH_WRAP failed\n");
 452                         freemsg(mp);
 453                         goto out;
 454                 }
 455         } else {
 456                 int     len;
 457                 int     mpsize;
 458 
 459                 /*
 460                  * Leave space for protocol headers.
 461                  */
 462                 len = MSG_OFFSET + clone_xprt->xp_msg_size;
 463 
 464                 /*
 465                  * Allocate an initial mblk for the response data.
 466                  */
 467                 while (!(mp = allocb(len, BPRI_LO))) {
 468                         RPCLOG0(16, "svc_cots_ksend: allocb failed failed\n");
 469                         if (strwaitbuf(len, BPRI_LO)) {
 470                                 TRACE_1(TR_FAC_KRPC, TR_SVC_COTS_KSEND_END,
 471                                     "svc_cots_ksend_end:(%S)", "strwaitbuf");
 472                                 RPCLOG0(1,
 473                                     "svc_cots_ksend: strwaitbuf failed\n");
 474                                 goto out;
 475                         }
 476                 }
 477 
 478                 /*
 479                  * Initialize the XDR encode stream.  Additional mblks
 480                  * will be allocated if necessary.  They will be TIDU
 481                  * sized.
 482                  */
 483                 xdrmblk_init(xdrs, mp, XDR_ENCODE, clone_xprt->xp_msg_size);
 484                 mpsize = MBLKSIZE(mp);
 485                 ASSERT(mpsize >= len);
 486                 ASSERT(mp->b_rptr == mp->b_datap->db_base);
 487 
 488                 /*
 489                  * If the size of mblk is not appreciably larger than what we
 490                  * asked, then resize the mblk to exactly len bytes. Reason for
 491                  * this: suppose len is 1600 bytes, the tidu is 1460 bytes
 492                  * (from TCP over ethernet), and the arguments to RPC require
 493                  * 2800 bytes. Ideally we want the protocol to render two
 494                  * ~1400 byte segments over the wire. If allocb() gives us a 2k
 495                  * mblk, and we allocate a second mblk for the rest, the
 496                  * protocol module may generate 3 segments over the wire:
 497                  * 1460 bytes for the first, 448 (2048 - 1600) for the 2nd, and
 498                  * 892 for the 3rd. If we "waste" 448 bytes in the first mblk,
 499                  * the XDR encoding will generate two ~1400 byte mblks, and the
 500                  * protocol module is more likely to produce properly sized
 501                  * segments.
 502                  */
 503                 if ((mpsize >> 1) <= len) {
 504                         mp->b_rptr += (mpsize - len);
 505                 }
 506 
 507                 /*
 508                  * Adjust b_rptr to reserve space for the non-data protocol
 509                  * headers that any downstream modules might like to add, and
 510                  * for the record marking header.
 511                  */
 512                 mp->b_rptr += (MSG_OFFSET + RM_HDR_SIZE);
 513 
 514                 XDR_SETPOS(xdrs, (uint_t)(mp->b_rptr - mp->b_datap->db_base));
 515                 ASSERT(mp->b_wptr == mp->b_rptr);
 516 
 517                 msg->rm_xid = clone_xprt->xp_xid;
 518 
 519                 TRACE_0(TR_FAC_KRPC, TR_XDR_REPLYMSG_START,
 520                     "xdr_replymsg_start:");
 521                 if (!(xdr_replymsg(xdrs, msg) &&
 522                     (!has_args || SVCAUTH_WRAP(&clone_xprt->xp_auth, xdrs,
 523                     xdr_results, xdr_location)))) {
 524                         XDR_DESTROY(xdrs);
 525                         TRACE_1(TR_FAC_KRPC, TR_XDR_REPLYMSG_END,
 526                             "xdr_replymsg_end:(%S)", "bad");
 527                         freemsg(mp);
 528                         RPCLOG0(1, "svc_cots_ksend: xdr_replymsg/SVCAUTH_WRAP "
 529                             "failed\n");
 530                         goto out;
 531                 }
 532                 TRACE_1(TR_FAC_KRPC, TR_XDR_REPLYMSG_END,
 533                     "xdr_replymsg_end:(%S)", "good");
 534         }
 535 
 536         XDR_DESTROY(xdrs);
 537 
 538         put(clone_xprt->xp_wq, mp);
 539         retval = TRUE;
 540 
 541 out:
 542         /*
 543          * This is completely disgusting.  If public is set it is
 544          * a pointer to a structure whose first field is the address
 545          * of the function to free that structure and any related
 546          * stuff.  (see rrokfree in nfs_xdr.c).
 547          */
 548         if (xdrs->x_public) {
 549                 /* LINTED pointer alignment */
 550                 (**((int (**)())xdrs->x_public))(xdrs->x_public);
 551         }
 552 
 553         TRACE_1(TR_FAC_KRPC, TR_SVC_COTS_KSEND_END,
 554             "svc_cots_ksend_end:(%S)", "done");
 555         return (retval);
 556 }
 557 
 558 /*
 559  * Deserialize arguments.
 560  */
 561 static bool_t
 562 svc_cots_kgetargs(SVCXPRT *clone_xprt, xdrproc_t xdr_args,
 563     caddr_t args_ptr)
 564 {
 565         return (SVCAUTH_UNWRAP(&clone_xprt->xp_auth, &clone_xprt->xp_xdrin,
 566             xdr_args, args_ptr));
 567 }
 568 
 569 static bool_t
 570 svc_cots_kfreeargs(SVCXPRT *clone_xprt, xdrproc_t xdr_args,
 571     caddr_t args_ptr)
 572 {
 573         cots_data_t *cd = (cots_data_t *)clone_xprt->xp_p2buf;
 574         /* LINTED pointer alignment */
 575         XDR *xdrs = &clone_xprt->xp_xdrin;
 576         mblk_t *mp;
 577         bool_t retval;
 578 
 579         /*
 580          * It is important to call the XDR routine before
 581          * freeing the request mblk.  Structures in the
 582          * XDR data may point into the mblk and require that
 583          * the memory be intact during the free routine.
 584          */
 585         if (args_ptr) {
 586                 xdrs->x_op = XDR_FREE;
 587                 retval = (*xdr_args)(xdrs, args_ptr);
 588         } else
 589                 retval = TRUE;
 590 
 591         XDR_DESTROY(xdrs);
 592 
 593         if ((mp = cd->cd_req_mp) != NULL) {
 594                 cd->cd_req_mp = (mblk_t *)0;
 595                 freemsg(mp);
 596         }
 597 
 598         return (retval);
 599 }
 600 
 601 static int32_t *
 602 svc_cots_kgetres(SVCXPRT *clone_xprt, int size)
 603 {
 604         /* LINTED pointer alignment */
 605         cots_data_t *cd = (cots_data_t *)clone_xprt->xp_p2buf;
 606         XDR *xdrs = &clone_xprt->xp_xdrout;
 607         mblk_t *mp;
 608         int32_t *buf;
 609         struct rpc_msg rply;
 610         int len;
 611         int mpsize;
 612 
 613         /*
 614          * Leave space for protocol headers.
 615          */
 616         len = MSG_OFFSET + clone_xprt->xp_msg_size;
 617 
 618         /*
 619          * Allocate an initial mblk for the response data.
 620          */
 621         while ((mp = allocb(len, BPRI_LO)) == NULL) {
 622                 if (strwaitbuf(len, BPRI_LO))
 623                         return (NULL);
 624         }
 625 
 626         /*
 627          * Initialize the XDR encode stream.  Additional mblks
 628          * will be allocated if necessary.  They will be TIDU
 629          * sized.
 630          */
 631         xdrmblk_init(xdrs, mp, XDR_ENCODE, clone_xprt->xp_msg_size);
 632         mpsize = MBLKSIZE(mp);
 633         ASSERT(mpsize >= len);
 634         ASSERT(mp->b_rptr == mp->b_datap->db_base);
 635 
 636         /*
 637          * If the size of mblk is not appreciably larger than what we
 638          * asked, then resize the mblk to exactly len bytes. Reason for
 639          * this: suppose len is 1600 bytes, the tidu is 1460 bytes
 640          * (from TCP over ethernet), and the arguments to RPC require
 641          * 2800 bytes. Ideally we want the protocol to render two
 642          * ~1400 byte segments over the wire. If allocb() gives us a 2k
 643          * mblk, and we allocate a second mblk for the rest, the
 644          * protocol module may generate 3 segments over the wire:
 645          * 1460 bytes for the first, 448 (2048 - 1600) for the 2nd, and
 646          * 892 for the 3rd. If we "waste" 448 bytes in the first mblk,
 647          * the XDR encoding will generate two ~1400 byte mblks, and the
 648          * protocol module is more likely to produce properly sized
 649          * segments.
 650          */
 651         if ((mpsize >> 1) <= len) {
 652                 mp->b_rptr += (mpsize - len);
 653         }
 654 
 655         /*
 656          * Adjust b_rptr to reserve space for the non-data protocol
 657          * headers that any downstream modules might like to add, and
 658          * for the record marking header.
 659          */
 660         mp->b_rptr += (MSG_OFFSET + RM_HDR_SIZE);
 661 
 662         XDR_SETPOS(xdrs, (uint_t)(mp->b_rptr - mp->b_datap->db_base));
 663         ASSERT(mp->b_wptr == mp->b_rptr);
 664 
 665         /*
 666          * Assume a successful RPC since most of them are.
 667          */
 668         rply.rm_xid = clone_xprt->xp_xid;
 669         rply.rm_direction = REPLY;
 670         rply.rm_reply.rp_stat = MSG_ACCEPTED;
 671         rply.acpted_rply.ar_verf = clone_xprt->xp_verf;
 672         rply.acpted_rply.ar_stat = SUCCESS;
 673 
 674         if (!xdr_replymsg_hdr(xdrs, &rply)) {
 675                 XDR_DESTROY(xdrs);
 676                 freeb(mp);
 677                 return (NULL);
 678         }
 679 
 680         buf = XDR_INLINE(xdrs, size);
 681         if (buf == NULL) {
 682                 XDR_DESTROY(xdrs);
 683                 ASSERT(cd->cd_mp == NULL);
 684                 freemsg(mp);
 685         } else {
 686                 cd->cd_mp = mp;
 687         }
 688         return (buf);
 689 }
 690 
 691 static void
 692 svc_cots_kfreeres(SVCXPRT *clone_xprt)
 693 {
 694         cots_data_t *cd;
 695         mblk_t *mp;
 696 
 697         cd = (cots_data_t *)clone_xprt->xp_p2buf;
 698         if ((mp = cd->cd_mp) != NULL) {
 699                 XDR_DESTROY(&clone_xprt->xp_xdrout);
 700                 cd->cd_mp = (mblk_t *)NULL;
 701                 freemsg(mp);
 702         }
 703 }
 704 
/*
 * The dup caching routines below provide a cache of non-failure
 * transaction id's.  RPC service routines can use this to detect
 * retransmissions and re-send a non-failure response.
 */

/*
 * MAXDUPREQS is the number of cached items.  It should be adjusted
 * to the service load so that there is likely to be a response entry
 * when the first retransmission comes in.
 */
#define MAXDUPREQS      1024

/*
 * This should be appropriately scaled to MAXDUPREQS.
 */
#define DRHASHSZ        257

/*
 * Hash a transaction id to a bucket index.  A mask is used when DRHASHSZ
 * is a power of two and a modulo otherwise; with DRHASHSZ at 257 the
 * modulo form is the one compiled.
 */
#if ((DRHASHSZ & (DRHASHSZ - 1)) == 0)
#define XIDHASH(xid)    ((xid) & (DRHASHSZ - 1))
#else
#define XIDHASH(xid)    ((xid) % DRHASHSZ)
#endif
#define DRHASH(dr)      XIDHASH((dr)->dr_xid)
#define REQTOXID(req)   ((req)->rq_xprt->xp_xid)

static int      cotsndupreqs = 0;       /* cache entries allocated so far */
int     cotsmaxdupreqs = MAXDUPREQS;    /* limit on allocated entries */
static kmutex_t cotsdupreq_lock;        /* held while using the cache */
static struct dupreq *cotsdrhashtbl[DRHASHSZ];  /* chains, by XIDHASH(xid) */
/* NOTE(review): presumably per-bucket statistics - confirm against users. */
static int      cotsdrhashstat[DRHASHSZ];

static void unhash(struct dupreq *);

/*
 * cotsdrmru points to the head of a circular linked list in lru order.
 * cotsdrmru->dr_next == drlru
 */
struct dupreq *cotsdrmru;
 744 
 745 /*
 746  * PSARC 2003/523 Contract Private Interface
 747  * svc_cots_kdup
 748  * Changes must be reviewed by Solaris File Sharing
 749  * Changes must be communicated to contract-2003-523@sun.com
 750  *
 751  * svc_cots_kdup searches the request cache and returns 0 if the
 752  * request is not found in the cache.  If it is found, then it
 753  * returns the state of the request (in progress or done) and
 754  * the status or attributes that were part of the original reply.
 755  *
 756  * If DUP_DONE (there is a duplicate) svc_cots_kdup copies over the
 757  * value of the response. In that case, also return in *dupcachedp
 758  * whether the response free routine is cached in the dupreq - in which case
 759  * the caller should not be freeing it, because it will be done later
 760  * in the svc_cots_kdup code when the dupreq is reused.
 761  */
 762 static int
 763 svc_cots_kdup(struct svc_req *req, caddr_t res, int size, struct dupreq **drpp,
 764         bool_t *dupcachedp)
 765 {
 766         struct rpc_cots_server *stats = CLONE2STATS(req->rq_xprt);
 767         struct dupreq *dr;
 768         uint32_t xid;
 769         uint32_t drhash;
 770         int status;
 771 
 772         xid = REQTOXID(req);
 773         mutex_enter(&cotsdupreq_lock);
 774         RSSTAT_INCR(stats, rsdupchecks);
 775         /*
 776          * Check to see whether an entry already exists in the cache.
 777          */
 778         dr = cotsdrhashtbl[XIDHASH(xid)];
 779         while (dr != NULL) {
 780                 if (dr->dr_xid == xid &&
 781                     dr->dr_proc == req->rq_proc &&
 782                     dr->dr_prog == req->rq_prog &&
 783                     dr->dr_vers == req->rq_vers &&
 784                     dr->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
 785                     bcmp((caddr_t)dr->dr_addr.buf,
 786                     (caddr_t)req->rq_xprt->xp_rtaddr.buf,
 787                     dr->dr_addr.len) == 0) {
 788                         status = dr->dr_status;
 789                         if (status == DUP_DONE) {
 790                                 bcopy(dr->dr_resp.buf, res, size);
 791                                 if (dupcachedp != NULL)
 792                                         *dupcachedp = (dr->dr_resfree != NULL);
 793                                 TRACE_0(TR_FAC_KRPC, TR_SVC_COTS_KDUP_DONE,
 794                                     "svc_cots_kdup: DUP_DONE");
 795                         } else {
 796                                 dr->dr_status = DUP_INPROGRESS;
 797                                 *drpp = dr;
 798                                 TRACE_0(TR_FAC_KRPC,
 799                                     TR_SVC_COTS_KDUP_INPROGRESS,
 800                                     "svc_cots_kdup: DUP_INPROGRESS");
 801                         }
 802                         RSSTAT_INCR(stats, rsdupreqs);
 803                         mutex_exit(&cotsdupreq_lock);
 804                         return (status);
 805                 }
 806                 dr = dr->dr_chain;
 807         }
 808 
 809         /*
 810          * There wasn't an entry, either allocate a new one or recycle
 811          * an old one.
 812          */
 813         if (cotsndupreqs < cotsmaxdupreqs) {
 814                 dr = kmem_alloc(sizeof (*dr), KM_NOSLEEP);
 815                 if (dr == NULL) {
 816                         mutex_exit(&cotsdupreq_lock);
 817                         return (DUP_ERROR);
 818                 }
 819                 dr->dr_resp.buf = NULL;
 820                 dr->dr_resp.maxlen = 0;
 821                 dr->dr_addr.buf = NULL;
 822                 dr->dr_addr.maxlen = 0;
 823                 if (cotsdrmru) {
 824                         dr->dr_next = cotsdrmru->dr_next;
 825                         cotsdrmru->dr_next = dr;
 826                 } else {
 827                         dr->dr_next = dr;
 828                 }
 829                 cotsndupreqs++;
 830         } else {
 831                 dr = cotsdrmru->dr_next;
 832                 while (dr->dr_status == DUP_INPROGRESS) {
 833                         dr = dr->dr_next;
 834                         if (dr == cotsdrmru->dr_next) {
 835                                 cmn_err(CE_WARN, "svc_cots_kdup no slots free");
 836                                 mutex_exit(&cotsdupreq_lock);
 837                                 return (DUP_ERROR);
 838                         }
 839                 }
 840                 unhash(dr);
 841                 if (dr->dr_resfree) {
 842                         (*dr->dr_resfree)(dr->dr_resp.buf);
 843                 }
 844         }
 845         dr->dr_resfree = NULL;
 846         cotsdrmru = dr;
 847 
 848         dr->dr_xid = REQTOXID(req);
 849         dr->dr_prog = req->rq_prog;
 850         dr->dr_vers = req->rq_vers;
 851         dr->dr_proc = req->rq_proc;
 852         if (dr->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
 853                 if (dr->dr_addr.buf != NULL)
 854                         kmem_free(dr->dr_addr.buf, dr->dr_addr.maxlen);
 855                 dr->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
 856                 dr->dr_addr.buf = kmem_alloc(dr->dr_addr.maxlen, KM_NOSLEEP);
 857                 if (dr->dr_addr.buf == NULL) {
 858                         dr->dr_addr.maxlen = 0;
 859                         dr->dr_status = DUP_DROP;
 860                         mutex_exit(&cotsdupreq_lock);
 861                         return (DUP_ERROR);
 862                 }
 863         }
 864         dr->dr_addr.len = req->rq_xprt->xp_rtaddr.len;
 865         bcopy(req->rq_xprt->xp_rtaddr.buf, dr->dr_addr.buf, dr->dr_addr.len);
 866         if (dr->dr_resp.maxlen < size) {
 867                 if (dr->dr_resp.buf != NULL)
 868                         kmem_free(dr->dr_resp.buf, dr->dr_resp.maxlen);
 869                 dr->dr_resp.maxlen = (unsigned int)size;
 870                 dr->dr_resp.buf = kmem_alloc(size, KM_NOSLEEP);
 871                 if (dr->dr_resp.buf == NULL) {
 872                         dr->dr_resp.maxlen = 0;
 873                         dr->dr_status = DUP_DROP;
 874                         mutex_exit(&cotsdupreq_lock);
 875                         return (DUP_ERROR);
 876                 }
 877         }
 878         dr->dr_status = DUP_INPROGRESS;
 879 
 880         drhash = (uint32_t)DRHASH(dr);
 881         dr->dr_chain = cotsdrhashtbl[drhash];
 882         cotsdrhashtbl[drhash] = dr;
 883         cotsdrhashstat[drhash]++;
 884         mutex_exit(&cotsdupreq_lock);
 885         *drpp = dr;
 886         return (DUP_NEW);
 887 }
 888 
 889 /*
 890  * PSARC 2003/523 Contract Private Interface
 891  * svc_cots_kdupdone
 892  * Changes must be reviewed by Solaris File Sharing
 893  * Changes must be communicated to contract-2003-523@sun.com
 894  *
 895  * svc_cots_kdupdone marks the request done (DUP_DONE or DUP_DROP)
 896  * and stores the response.
 897  */
 898 static void
 899 svc_cots_kdupdone(struct dupreq *dr, caddr_t res, void (*dis_resfree)(),
 900         int size, int status)
 901 {
 902         ASSERT(dr->dr_resfree == NULL);
 903         if (status == DUP_DONE) {
 904                 bcopy(res, dr->dr_resp.buf, size);
 905                 dr->dr_resfree = dis_resfree;
 906         }
 907         dr->dr_status = status;
 908 }
 909 
 910 /*
 911  * This routine expects that the mutex, cotsdupreq_lock, is already held.
 912  */
 913 static void
 914 unhash(struct dupreq *dr)
 915 {
 916         struct dupreq *drt;
 917         struct dupreq *drtprev = NULL;
 918         uint32_t drhash;
 919 
 920         ASSERT(MUTEX_HELD(&cotsdupreq_lock));
 921 
 922         drhash = (uint32_t)DRHASH(dr);
 923         drt = cotsdrhashtbl[drhash];
 924         while (drt != NULL) {
 925                 if (drt == dr) {
 926                         cotsdrhashstat[drhash]--;
 927                         if (drtprev == NULL) {
 928                                 cotsdrhashtbl[drhash] = drt->dr_chain;
 929                         } else {
 930                                 drtprev->dr_chain = drt->dr_chain;
 931                         }
 932                         return;
 933                 }
 934                 drtprev = drt;
 935                 drt = drt->dr_chain;
 936         }
 937 }
 938 
 939 void
 940 svc_cots_stats_init(zoneid_t zoneid, struct rpc_cots_server **statsp)
 941 {
 942         *statsp = (struct rpc_cots_server *)rpcstat_zone_init_common(zoneid,
 943             "unix", "rpc_cots_server", (const kstat_named_t *)&cots_rsstat_tmpl,
 944             sizeof (cots_rsstat_tmpl));
 945 }
 946 
 947 void
 948 svc_cots_stats_fini(zoneid_t zoneid, struct rpc_cots_server **statsp)
 949 {
 950         rpcstat_zone_fini_common(zoneid, "unix", "rpc_cots_server");
 951         kmem_free(*statsp, sizeof (cots_rsstat_tmpl));
 952 }
 953 
 954 void
 955 svc_cots_init(void)
 956 {
 957         /*
 958          * Check to make sure that the cots private data will fit into
 959          * the stack buffer allocated by svc_run.  The ASSERT is a safety
 960          * net if the cots_data_t structure ever changes.
 961          */
 962         /*CONSTANTCONDITION*/
 963         ASSERT(sizeof (cots_data_t) <= SVC_P2LEN);
 964 
 965         mutex_init(&cots_kcreate_lock, NULL, MUTEX_DEFAULT, NULL);
 966         mutex_init(&cotsdupreq_lock, NULL, MUTEX_DEFAULT, NULL);
 967 }