1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 /* Copyright (c) 1990 Mentat Inc. */
  26 
  27 /*
  28  * Copyright 2019 Joyent, Inc.
  29  */
  30 
  31 #include <sys/types.h>
  32 #include <sys/stream.h>
  33 #include <sys/strsun.h>
  34 #include <sys/zone.h>
  35 #include <sys/ddi.h>
  36 #include <sys/sunddi.h>
  37 #include <sys/cmn_err.h>
  38 #include <sys/debug.h>
  39 #include <sys/atomic.h>
  40 
  41 #include <sys/systm.h>
  42 #include <sys/param.h>
  43 #include <sys/kmem.h>
  44 #include <sys/sdt.h>
  45 #include <sys/socket.h>
  46 #include <sys/mac.h>
  47 #include <net/if.h>
  48 #include <net/if_arp.h>
  49 #include <net/route.h>
  50 #include <sys/sockio.h>
  51 #include <netinet/in.h>
  52 #include <net/if_dl.h>
  53 
  54 #include <inet/common.h>
  55 #include <inet/mi.h>
  56 #include <inet/mib2.h>
  57 #include <inet/nd.h>
  58 #include <inet/arp.h>
  59 #include <inet/snmpcom.h>
  60 #include <inet/kstatcom.h>
  61 
  62 #include <netinet/igmp_var.h>
  63 #include <netinet/ip6.h>
  64 #include <netinet/icmp6.h>
  65 #include <netinet/sctp.h>
  66 
  67 #include <inet/ip.h>
  68 #include <inet/ip_impl.h>
  69 #include <inet/ip6.h>
  70 #include <inet/ip6_asp.h>
  71 #include <inet/tcp.h>
  72 #include <inet/ip_multi.h>
  73 #include <inet/ip_if.h>
  74 #include <inet/ip_ire.h>
  75 #include <inet/ip_ftable.h>
  76 #include <inet/ip_rts.h>
  77 #include <inet/optcom.h>
  78 #include <inet/ip_ndp.h>
  79 #include <inet/ip_listutils.h>
  80 #include <netinet/igmp.h>
  81 #include <netinet/ip_mroute.h>
  82 #include <inet/ipp_common.h>
  83 
  84 #include <net/pfkeyv2.h>
  85 #include <inet/sadb.h>
  86 #include <inet/ipsec_impl.h>
  87 #include <inet/ipdrop.h>
  88 #include <inet/ip_netinfo.h>
  89 #include <sys/squeue_impl.h>
  90 #include <sys/squeue.h>
  91 
  92 #include <inet/ipclassifier.h>
  93 #include <inet/sctp_ip.h>
  94 #include <inet/sctp/sctp_impl.h>
  95 #include <inet/udp_impl.h>
  96 #include <sys/sunddi.h>
  97 
  98 #include <sys/tsol/label.h>
  99 #include <sys/tsol/tnet.h>
 100 
 101 /*
 102  * Release a reference on ip_xmit_attr.
 103  * The reference is acquired by conn_get_ixa()
 104  *
 105  * This macro has a lowercase function-call version for callers outside
 106  * this file.
 107  */
 108 #define IXA_REFRELE(ixa)                                        \
 109 {                                                               \
 110         if (atomic_dec_32_nv(&(ixa)->ixa_refcnt) == 0)   \
 111                 ixa_inactive(ixa);                              \
 112 }
 113 
 114 #define IXA_REFHOLD(ixa)                                        \
 115 {                                                               \
 116         ASSERT3U((ixa)->ixa_refcnt, !=, 0);                  \
 117         atomic_inc_32(&(ixa)->ixa_refcnt);                       \
 118 }
 119 
/*
 * When we need to handle a transmit side asynchronous operation, then we need
 * to save sufficient information so that we can call the fragment and postfrag
 * functions. That information is captured in an mblk containing this structure.
 *
 * Since this is currently only used for IPsec, we include information for
 * the kernel crypto framework.
 *
 * ip_xmit_attr_to_mblk() fills this in from an ip_xmit_attr_t,
 * ip_xmit_attr_from_mblk() turns it back into one, and
 * ip_xmit_attr_free_mblk() drops the references it holds. Unless noted
 * otherwise each ixm_ member is a copy of the ixa_ member with the same
 * suffix.
 */
typedef struct ixamblk_s {
        boolean_t       ixm_inbound;    /* B_FALSE; ASSERTed by consumers */
        iaflags_t       ixm_flags;      /* ixa_flags */
        netstackid_t    ixm_stackid;    /* Verify it didn't go away */
        uint_t          ixm_ifindex;    /* Of nce_ill; used to find the nce */
        in6_addr_t      ixm_nceaddr_v6; /* nce_addr; used to find nce */
#define ixm_nceaddr_v4  V4_PART_OF_V6(ixm_nceaddr_v6)
        uint32_t        ixm_fragsize;   /* ixa_fragsize */
        uint_t          ixm_pktlen;     /* ixa_pktlen */
        uint16_t        ixm_ip_hdr_length; /* Points to ULP header */
        uint8_t         ixm_protocol;   /* Protocol number for ULP cksum */
        pfirepostfrag_t ixm_postfragfn; /* ixa_postfragfn */

        zoneid_t        ixm_zoneid;             /* Needed for ipobs */
        zoneid_t        ixm_no_loop_zoneid;     /* IXAF_NO_LOOP_ZONEID_SET */

        uint_t          ixm_scopeid;            /* For IPv6 link-locals */

        uint32_t        ixm_ident;              /* For IPv6 fragment header */
        uint32_t        ixm_xmit_hint;          /* ixa_xmit_hint */

        uint64_t        ixm_conn_id;            /* Used by DTrace */
        cred_t          *ixm_cred;      /* For getpeerucred - refhold if set */
        pid_t           ixm_cpid;       /* For getpeerucred */

        ts_label_t      *ixm_tsl;       /* Refhold if set. */

        /*
         * Everything from here on is only filled in by
         * ip_xmit_attr_to_mblk() when IXAF_IPSEC_SECURE is set in the
         * ixa_flags; otherwise it stays zeroed.
         *
         * When the pointers below are set they have a refhold on the struct.
         * ip_xmit_attr_from_mblk() hands those holds over to the ixa;
         * ip_xmit_attr_free_mblk() releases them.
         */
        ipsec_latch_t           *ixm_ipsec_latch;
        struct ipsa_s           *ixm_ipsec_ah_sa;       /* SA for AH */
        struct ipsa_s           *ixm_ipsec_esp_sa;      /* SA for ESP */
        struct ipsec_policy_s   *ixm_ipsec_policy;      /* why are we here? */
        struct ipsec_action_s   *ixm_ipsec_action; /* For reflected packets */

        ipsa_ref_t              ixm_ipsec_ref[2]; /* Soft reference to SA */

        /* Need these while waiting for SA */
        uint16_t ixm_ipsec_src_port;    /* Source port number of d-gram. */
        uint16_t ixm_ipsec_dst_port;    /* Destination port number of d-gram. */
        uint8_t  ixm_ipsec_icmp_type;   /* ICMP type of d-gram */
        uint8_t  ixm_ipsec_icmp_code;   /* ICMP code of d-gram */

        sa_family_t ixm_ipsec_inaf;     /* Inner address family */
        uint32_t ixm_ipsec_insrc[IXA_MAX_ADDRLEN];      /* Inner src address */
        uint32_t ixm_ipsec_indst[IXA_MAX_ADDRLEN];      /* Inner dest address */
        uint8_t  ixm_ipsec_insrcpfx;    /* Inner source prefix */
        uint8_t  ixm_ipsec_indstpfx;    /* Inner destination prefix */

        uint8_t ixm_ipsec_proto;        /* IP protocol number for d-gram. */
} ixamblk_t;
 180 
 181 
/*
 * When we need to handle a receive side asynchronous operation, then we need
 * to save sufficient information so that we can call ip_fanout.
 * That information is captured in an mblk containing this structure.
 *
 * Since this is currently only used for IPsec, we include information for
 * the kernel crypto framework.
 *
 * ip_recv_attr_to_mblk() fills this in from an ip_recv_attr_t,
 * ip_recv_attr_from_mblk() turns it back into one, and
 * ip_recv_attr_free_mblk() drops the references it holds. Unless noted
 * otherwise each irm_ member is a copy of the ira_ member with the same
 * suffix.
 */
typedef struct iramblk_s {
        boolean_t       irm_inbound;    /* B_TRUE; ASSERTed by consumers */
        iaflags_t       irm_flags;      /* ira_flags */
        netstackid_t    irm_stackid;    /* Verify it didn't go away; */
                                        /* -1 if ira_ill was NULL */
        uint_t          irm_ifindex;    /* To find ira_ill */

        uint_t          irm_rifindex;   /* ira_rifindex; to find ira_rill */
        uint_t          irm_ruifindex;  /* ira_ruifindex */
        uint_t          irm_pktlen;     /* ira_pktlen */
        uint16_t        irm_ip_hdr_length; /* Points to ULP header */
        uint8_t         irm_protocol;   /* Protocol number for ULP cksum */
        zoneid_t        irm_zoneid;     /* ALL_ZONES unless local delivery */

        /* Both are copied as plain pointers; no hold is taken for them. */
        squeue_t        *irm_sqp;
        ill_rx_ring_t   *irm_ring;

        ipaddr_t        irm_mroute_tunnel;      /* IRAF_MROUTE_TUNNEL_SET */
        zoneid_t        irm_no_loop_zoneid;     /* IRAF_NO_LOOP_ZONEID_SET */
        uint32_t        irm_esp_udp_ports;      /* IRAF_ESP_UDP_PORTS */

        char            irm_l2src[IRA_L2SRC_SIZE];      /* If IRAF_L2SRC_SET */

        cred_t          *irm_cred;      /* For getpeerucred - refhold if set */
        pid_t           irm_cpid;       /* For getpeerucred */

        ts_label_t      *irm_tsl;       /* Refhold if set. */

        /*
         * Only filled in by ip_recv_attr_to_mblk() when IRAF_IPSEC_SECURE
         * is set in the ira_flags.
         *
         * When set these correspond to a refhold on the object.
         */
        struct ipsa_s           *irm_ipsec_ah_sa;       /* SA for AH */
        struct ipsa_s           *irm_ipsec_esp_sa;      /* SA for ESP */
        struct ipsec_action_s   *irm_ipsec_action; /* For reflected packets */
} iramblk_t;
 224 
 225 
 226 /*
 227  * Take the information in ip_xmit_attr_t and stick it in an mblk
 228  * that can later be passed to ip_xmit_attr_from_mblk to recreate the
 229  * ip_xmit_attr_t.
 230  *
 231  * Returns NULL on memory allocation failure.
 232  */
 233 mblk_t *
 234 ip_xmit_attr_to_mblk(ip_xmit_attr_t *ixa)
 235 {
 236         mblk_t          *ixamp;
 237         ixamblk_t       *ixm;
 238         nce_t           *nce = ixa->ixa_nce;
 239 
 240         ASSERT(nce != NULL);
 241         ixamp = allocb(sizeof (*ixm), BPRI_MED);
 242         if (ixamp == NULL)
 243                 return (NULL);
 244 
 245         ixamp->b_datap->db_type = M_BREAK;
 246         ixamp->b_wptr += sizeof (*ixm);
 247         ixm = (ixamblk_t *)ixamp->b_rptr;
 248 
 249         bzero(ixm, sizeof (*ixm));
 250         ixm->ixm_inbound = B_FALSE;
 251         ixm->ixm_flags = ixa->ixa_flags;
 252         ixm->ixm_stackid = ixa->ixa_ipst->ips_netstack->netstack_stackid;
 253         ixm->ixm_ifindex = nce->nce_ill->ill_phyint->phyint_ifindex;
 254         ixm->ixm_nceaddr_v6 = nce->nce_addr;
 255         ixm->ixm_fragsize = ixa->ixa_fragsize;
 256         ixm->ixm_pktlen = ixa->ixa_pktlen;
 257         ixm->ixm_ip_hdr_length = ixa->ixa_ip_hdr_length;
 258         ixm->ixm_protocol = ixa->ixa_protocol;
 259         ixm->ixm_postfragfn = ixa->ixa_postfragfn;
 260         ixm->ixm_zoneid = ixa->ixa_zoneid;
 261         ixm->ixm_no_loop_zoneid = ixa->ixa_no_loop_zoneid;
 262         ixm->ixm_scopeid = ixa->ixa_scopeid;
 263         ixm->ixm_ident = ixa->ixa_ident;
 264         ixm->ixm_xmit_hint = ixa->ixa_xmit_hint;
 265 
 266         if (ixa->ixa_tsl != NULL) {
 267                 ixm->ixm_tsl = ixa->ixa_tsl;
 268                 label_hold(ixm->ixm_tsl);
 269         }
 270         if (ixa->ixa_cred != NULL) {
 271                 ixm->ixm_cred = ixa->ixa_cred;
 272                 crhold(ixa->ixa_cred);
 273         }
 274         ixm->ixm_cpid = ixa->ixa_cpid;
 275         ixm->ixm_conn_id = ixa->ixa_conn_id;
 276 
 277         if (ixa->ixa_flags & IXAF_IPSEC_SECURE) {
 278                 if (ixa->ixa_ipsec_ah_sa != NULL) {
 279                         ixm->ixm_ipsec_ah_sa = ixa->ixa_ipsec_ah_sa;
 280                         IPSA_REFHOLD(ixa->ixa_ipsec_ah_sa);
 281                 }
 282                 if (ixa->ixa_ipsec_esp_sa != NULL) {
 283                         ixm->ixm_ipsec_esp_sa = ixa->ixa_ipsec_esp_sa;
 284                         IPSA_REFHOLD(ixa->ixa_ipsec_esp_sa);
 285                 }
 286                 if (ixa->ixa_ipsec_policy != NULL) {
 287                         ixm->ixm_ipsec_policy = ixa->ixa_ipsec_policy;
 288                         IPPOL_REFHOLD(ixa->ixa_ipsec_policy);
 289                 }
 290                 if (ixa->ixa_ipsec_action != NULL) {
 291                         ixm->ixm_ipsec_action = ixa->ixa_ipsec_action;
 292                         IPACT_REFHOLD(ixa->ixa_ipsec_action);
 293                 }
 294                 if (ixa->ixa_ipsec_latch != NULL) {
 295                         ixm->ixm_ipsec_latch = ixa->ixa_ipsec_latch;
 296                         IPLATCH_REFHOLD(ixa->ixa_ipsec_latch);
 297                 }
 298                 ixm->ixm_ipsec_ref[0] = ixa->ixa_ipsec_ref[0];
 299                 ixm->ixm_ipsec_ref[1] = ixa->ixa_ipsec_ref[1];
 300                 ixm->ixm_ipsec_src_port = ixa->ixa_ipsec_src_port;
 301                 ixm->ixm_ipsec_dst_port = ixa->ixa_ipsec_dst_port;
 302                 ixm->ixm_ipsec_icmp_type = ixa->ixa_ipsec_icmp_type;
 303                 ixm->ixm_ipsec_icmp_code = ixa->ixa_ipsec_icmp_code;
 304                 ixm->ixm_ipsec_inaf = ixa->ixa_ipsec_inaf;
 305                 ixm->ixm_ipsec_insrc[0] = ixa->ixa_ipsec_insrc[0];
 306                 ixm->ixm_ipsec_insrc[1] = ixa->ixa_ipsec_insrc[1];
 307                 ixm->ixm_ipsec_insrc[2] = ixa->ixa_ipsec_insrc[2];
 308                 ixm->ixm_ipsec_insrc[3] = ixa->ixa_ipsec_insrc[3];
 309                 ixm->ixm_ipsec_indst[0] = ixa->ixa_ipsec_indst[0];
 310                 ixm->ixm_ipsec_indst[1] = ixa->ixa_ipsec_indst[1];
 311                 ixm->ixm_ipsec_indst[2] = ixa->ixa_ipsec_indst[2];
 312                 ixm->ixm_ipsec_indst[3] = ixa->ixa_ipsec_indst[3];
 313                 ixm->ixm_ipsec_insrcpfx = ixa->ixa_ipsec_insrcpfx;
 314                 ixm->ixm_ipsec_indstpfx = ixa->ixa_ipsec_indstpfx;
 315                 ixm->ixm_ipsec_proto = ixa->ixa_ipsec_proto;
 316         }
 317         return (ixamp);
 318 }
 319 
 320 /*
 321  * Extract the ip_xmit_attr_t from the mblk, checking that the
 322  * ip_stack_t, ill_t, and nce_t still exist. Returns B_FALSE if that is
 323  * not the case.
 324  *
 325  * Otherwise ixa is updated.
 326  * Caller needs to release references on the ixa by calling ixa_refrele()
 327  * which will imediately call ixa_inactive to release the references.
 328  */
 329 boolean_t
 330 ip_xmit_attr_from_mblk(mblk_t *ixamp, ip_xmit_attr_t *ixa)
 331 {
 332         ixamblk_t       *ixm;
 333         netstack_t      *ns;
 334         ip_stack_t      *ipst;
 335         ill_t           *ill;
 336         nce_t           *nce;
 337 
 338         /* We assume the caller hasn't initialized ixa */
 339         bzero(ixa, sizeof (*ixa));
 340 
 341         ASSERT(DB_TYPE(ixamp) == M_BREAK);
 342         ASSERT(ixamp->b_cont == NULL);
 343 
 344         ixm = (ixamblk_t *)ixamp->b_rptr;
 345         ASSERT(!ixm->ixm_inbound);
 346 
 347         /* Verify the netstack is still around */
 348         ns = netstack_find_by_stackid(ixm->ixm_stackid);
 349         if (ns == NULL) {
 350                 /* Disappeared on us */
 351                 (void) ip_xmit_attr_free_mblk(ixamp);
 352                 return (B_FALSE);
 353         }
 354         ipst = ns->netstack_ip;
 355 
 356         /* Verify the ill is still around */
 357         ill = ill_lookup_on_ifindex(ixm->ixm_ifindex,
 358             !(ixm->ixm_flags & IXAF_IS_IPV4), ipst);
 359 
 360         /* We have the ill, hence the netstack can't go away */
 361         netstack_rele(ns);
 362         if (ill == NULL) {
 363                 /* Disappeared on us */
 364                 (void) ip_xmit_attr_free_mblk(ixamp);
 365                 return (B_FALSE);
 366         }
 367         /*
 368          * Find the nce. We don't load-spread (only lookup nce's on the ill)
 369          * because we want to find the same nce as the one we had when
 370          * ip_xmit_attr_to_mblk was called.
 371          */
 372         if (ixm->ixm_flags & IXAF_IS_IPV4) {
 373                 nce = nce_lookup_v4(ill, &ixm->ixm_nceaddr_v4);
 374         } else {
 375                 nce = nce_lookup_v6(ill, &ixm->ixm_nceaddr_v6);
 376         }
 377 
 378         /* We have the nce, hence the ill can't go away */
 379         ill_refrele(ill);
 380         if (nce == NULL) {
 381                 /*
 382                  * Since this is unusual and we don't know what type of
 383                  * nce it was, we drop the packet.
 384                  */
 385                 (void) ip_xmit_attr_free_mblk(ixamp);
 386                 return (B_FALSE);
 387         }
 388 
 389         ixa->ixa_flags = ixm->ixm_flags;
 390         ixa->ixa_refcnt = 1;
 391         ixa->ixa_ipst = ipst;
 392         ixa->ixa_fragsize = ixm->ixm_fragsize;
 393         ixa->ixa_pktlen =  ixm->ixm_pktlen;
 394         ixa->ixa_ip_hdr_length = ixm->ixm_ip_hdr_length;
 395         ixa->ixa_protocol = ixm->ixm_protocol;
 396         ixa->ixa_nce = nce;
 397         ixa->ixa_postfragfn = ixm->ixm_postfragfn;
 398         ixa->ixa_zoneid = ixm->ixm_zoneid;
 399         ixa->ixa_no_loop_zoneid = ixm->ixm_no_loop_zoneid;
 400         ixa->ixa_scopeid = ixm->ixm_scopeid;
 401         ixa->ixa_ident = ixm->ixm_ident;
 402         ixa->ixa_xmit_hint = ixm->ixm_xmit_hint;
 403 
 404         if (ixm->ixm_tsl != NULL) {
 405                 ixa->ixa_tsl = ixm->ixm_tsl;
 406                 ixa->ixa_free_flags |= IXA_FREE_TSL;
 407                 ixm->ixm_tsl = NULL;
 408         }
 409         if (ixm->ixm_cred != NULL) {
 410                 ixa->ixa_cred = ixm->ixm_cred;
 411                 ixa->ixa_free_flags |= IXA_FREE_CRED;
 412                 ixm->ixm_cred = NULL;
 413         }
 414         ixa->ixa_cpid = ixm->ixm_cpid;
 415         ixa->ixa_conn_id = ixm->ixm_conn_id;
 416 
 417         ixa->ixa_ipsec_ah_sa = ixm->ixm_ipsec_ah_sa;
 418         ixa->ixa_ipsec_esp_sa = ixm->ixm_ipsec_esp_sa;
 419         ixa->ixa_ipsec_policy = ixm->ixm_ipsec_policy;
 420         ixa->ixa_ipsec_action = ixm->ixm_ipsec_action;
 421         ixa->ixa_ipsec_latch = ixm->ixm_ipsec_latch;
 422 
 423         ixa->ixa_ipsec_ref[0] = ixm->ixm_ipsec_ref[0];
 424         ixa->ixa_ipsec_ref[1] = ixm->ixm_ipsec_ref[1];
 425         ixa->ixa_ipsec_src_port = ixm->ixm_ipsec_src_port;
 426         ixa->ixa_ipsec_dst_port = ixm->ixm_ipsec_dst_port;
 427         ixa->ixa_ipsec_icmp_type = ixm->ixm_ipsec_icmp_type;
 428         ixa->ixa_ipsec_icmp_code = ixm->ixm_ipsec_icmp_code;
 429         ixa->ixa_ipsec_inaf = ixm->ixm_ipsec_inaf;
 430         ixa->ixa_ipsec_insrc[0] = ixm->ixm_ipsec_insrc[0];
 431         ixa->ixa_ipsec_insrc[1] = ixm->ixm_ipsec_insrc[1];
 432         ixa->ixa_ipsec_insrc[2] = ixm->ixm_ipsec_insrc[2];
 433         ixa->ixa_ipsec_insrc[3] = ixm->ixm_ipsec_insrc[3];
 434         ixa->ixa_ipsec_indst[0] = ixm->ixm_ipsec_indst[0];
 435         ixa->ixa_ipsec_indst[1] = ixm->ixm_ipsec_indst[1];
 436         ixa->ixa_ipsec_indst[2] = ixm->ixm_ipsec_indst[2];
 437         ixa->ixa_ipsec_indst[3] = ixm->ixm_ipsec_indst[3];
 438         ixa->ixa_ipsec_insrcpfx = ixm->ixm_ipsec_insrcpfx;
 439         ixa->ixa_ipsec_indstpfx = ixm->ixm_ipsec_indstpfx;
 440         ixa->ixa_ipsec_proto = ixm->ixm_ipsec_proto;
 441 
 442         freeb(ixamp);
 443         return (B_TRUE);
 444 }
 445 
 446 /*
 447  * Free the ixm mblk and any references it holds
 448  * Returns b_cont.
 449  */
 450 mblk_t *
 451 ip_xmit_attr_free_mblk(mblk_t *ixamp)
 452 {
 453         ixamblk_t       *ixm;
 454         mblk_t          *mp;
 455 
 456         /* Consume mp */
 457         ASSERT(DB_TYPE(ixamp) == M_BREAK);
 458         mp = ixamp->b_cont;
 459 
 460         ixm = (ixamblk_t *)ixamp->b_rptr;
 461         ASSERT(!ixm->ixm_inbound);
 462 
 463         if (ixm->ixm_ipsec_ah_sa != NULL) {
 464                 IPSA_REFRELE(ixm->ixm_ipsec_ah_sa);
 465                 ixm->ixm_ipsec_ah_sa = NULL;
 466         }
 467         if (ixm->ixm_ipsec_esp_sa != NULL) {
 468                 IPSA_REFRELE(ixm->ixm_ipsec_esp_sa);
 469                 ixm->ixm_ipsec_esp_sa = NULL;
 470         }
 471         if (ixm->ixm_ipsec_policy != NULL) {
 472                 IPPOL_REFRELE(ixm->ixm_ipsec_policy);
 473                 ixm->ixm_ipsec_policy = NULL;
 474         }
 475         if (ixm->ixm_ipsec_action != NULL) {
 476                 IPACT_REFRELE(ixm->ixm_ipsec_action);
 477                 ixm->ixm_ipsec_action = NULL;
 478         }
 479         if (ixm->ixm_ipsec_latch) {
 480                 IPLATCH_REFRELE(ixm->ixm_ipsec_latch);
 481                 ixm->ixm_ipsec_latch = NULL;
 482         }
 483 
 484         if (ixm->ixm_tsl != NULL) {
 485                 label_rele(ixm->ixm_tsl);
 486                 ixm->ixm_tsl = NULL;
 487         }
 488         if (ixm->ixm_cred != NULL) {
 489                 crfree(ixm->ixm_cred);
 490                 ixm->ixm_cred = NULL;
 491         }
 492         freeb(ixamp);
 493         return (mp);
 494 }
 495 
 496 /*
 497  * Take the information in ip_recv_attr_t and stick it in an mblk
 498  * that can later be passed to ip_recv_attr_from_mblk to recreate the
 499  * ip_recv_attr_t.
 500  *
 501  * Returns NULL on memory allocation failure.
 502  */
 503 mblk_t *
 504 ip_recv_attr_to_mblk(ip_recv_attr_t *ira)
 505 {
 506         mblk_t          *iramp;
 507         iramblk_t       *irm;
 508         ill_t           *ill = ira->ira_ill;
 509 
 510         ASSERT(ira->ira_ill != NULL || ira->ira_ruifindex != 0);
 511 
 512         iramp = allocb(sizeof (*irm), BPRI_MED);
 513         if (iramp == NULL)
 514                 return (NULL);
 515 
 516         iramp->b_datap->db_type = M_BREAK;
 517         iramp->b_wptr += sizeof (*irm);
 518         irm = (iramblk_t *)iramp->b_rptr;
 519 
 520         bzero(irm, sizeof (*irm));
 521         irm->irm_inbound = B_TRUE;
 522         irm->irm_flags = ira->ira_flags;
 523         if (ill != NULL) {
 524                 /* Internal to IP - preserve ip_stack_t, ill and rill */
 525                 irm->irm_stackid =
 526                     ill->ill_ipst->ips_netstack->netstack_stackid;
 527                 irm->irm_ifindex = ira->ira_ill->ill_phyint->phyint_ifindex;
 528                 ASSERT(ira->ira_rill->ill_phyint->phyint_ifindex ==
 529                     ira->ira_rifindex);
 530         } else {
 531                 /* Let ip_recv_attr_from_stackid know there isn't one */
 532                 irm->irm_stackid = -1;
 533         }
 534         irm->irm_rifindex = ira->ira_rifindex;
 535         irm->irm_ruifindex = ira->ira_ruifindex;
 536         irm->irm_pktlen = ira->ira_pktlen;
 537         irm->irm_ip_hdr_length = ira->ira_ip_hdr_length;
 538         irm->irm_protocol = ira->ira_protocol;
 539 
 540         irm->irm_sqp = ira->ira_sqp;
 541         irm->irm_ring = ira->ira_ring;
 542 
 543         irm->irm_zoneid = ira->ira_zoneid;
 544         irm->irm_mroute_tunnel = ira->ira_mroute_tunnel;
 545         irm->irm_no_loop_zoneid = ira->ira_no_loop_zoneid;
 546         irm->irm_esp_udp_ports = ira->ira_esp_udp_ports;
 547 
 548         if (ira->ira_tsl != NULL) {
 549                 irm->irm_tsl = ira->ira_tsl;
 550                 label_hold(irm->irm_tsl);
 551         }
 552         if (ira->ira_cred != NULL) {
 553                 irm->irm_cred = ira->ira_cred;
 554                 crhold(ira->ira_cred);
 555         }
 556         irm->irm_cpid = ira->ira_cpid;
 557 
 558         if (ira->ira_flags & IRAF_L2SRC_SET)
 559                 bcopy(ira->ira_l2src, irm->irm_l2src, IRA_L2SRC_SIZE);
 560 
 561         if (ira->ira_flags & IRAF_IPSEC_SECURE) {
 562                 if (ira->ira_ipsec_ah_sa != NULL) {
 563                         irm->irm_ipsec_ah_sa = ira->ira_ipsec_ah_sa;
 564                         IPSA_REFHOLD(ira->ira_ipsec_ah_sa);
 565                 }
 566                 if (ira->ira_ipsec_esp_sa != NULL) {
 567                         irm->irm_ipsec_esp_sa = ira->ira_ipsec_esp_sa;
 568                         IPSA_REFHOLD(ira->ira_ipsec_esp_sa);
 569                 }
 570                 if (ira->ira_ipsec_action != NULL) {
 571                         irm->irm_ipsec_action = ira->ira_ipsec_action;
 572                         IPACT_REFHOLD(ira->ira_ipsec_action);
 573                 }
 574         }
 575         return (iramp);
 576 }
 577 
 578 /*
 579  * Extract the ip_recv_attr_t from the mblk. If we are used inside IP
 580  * then irm_stackid is not -1, in which case we check that the
 581  * ip_stack_t and ill_t still exist. Returns B_FALSE if that is
 582  * not the case.
 583  * If irm_stackid is zero then we are used by an ULP (e.g., squeue_enter)
 584  * and we just proceed with ira_ill and ira_rill as NULL.
 585  *
 586  * The caller needs to release any references on the pointers inside the ire
 587  * by calling ira_cleanup.
 588  */
 589 boolean_t
 590 ip_recv_attr_from_mblk(mblk_t *iramp, ip_recv_attr_t *ira)
 591 {
 592         iramblk_t       *irm;
 593         netstack_t      *ns;
 594         ip_stack_t      *ipst = NULL;
 595         ill_t           *ill = NULL, *rill = NULL;
 596 
 597         /* We assume the caller hasn't initialized ira */
 598         bzero(ira, sizeof (*ira));
 599 
 600         ASSERT(DB_TYPE(iramp) == M_BREAK);
 601         ASSERT(iramp->b_cont == NULL);
 602 
 603         irm = (iramblk_t *)iramp->b_rptr;
 604         ASSERT(irm->irm_inbound);
 605 
 606         if (irm->irm_stackid != -1) {
 607                 /* Verify the netstack is still around */
 608                 ns = netstack_find_by_stackid(irm->irm_stackid);
 609                 if (ns == NULL) {
 610                         /* Disappeared on us */
 611                         (void) ip_recv_attr_free_mblk(iramp);
 612                         return (B_FALSE);
 613                 }
 614                 ipst = ns->netstack_ip;
 615 
 616                 /* Verify the ill is still around */
 617                 ill = ill_lookup_on_ifindex(irm->irm_ifindex,
 618                     !(irm->irm_flags & IRAF_IS_IPV4), ipst);
 619 
 620                 if (irm->irm_ifindex == irm->irm_rifindex) {
 621                         rill = ill;
 622                 } else {
 623                         rill = ill_lookup_on_ifindex(irm->irm_rifindex,
 624                             !(irm->irm_flags & IRAF_IS_IPV4), ipst);
 625                 }
 626 
 627                 /* We have the ill, hence the netstack can't go away */
 628                 netstack_rele(ns);
 629                 if (ill == NULL || rill == NULL) {
 630                         /* Disappeared on us */
 631                         if (ill != NULL)
 632                                 ill_refrele(ill);
 633                         if (rill != NULL && rill != ill)
 634                                 ill_refrele(rill);
 635                         (void) ip_recv_attr_free_mblk(iramp);
 636                         return (B_FALSE);
 637                 }
 638         }
 639 
 640         ira->ira_flags = irm->irm_flags;
 641         /* Caller must ill_refele(ira_ill) by using ira_cleanup() */
 642         ira->ira_ill = ill;
 643         ira->ira_rill = rill;
 644 
 645         ira->ira_rifindex = irm->irm_rifindex;
 646         ira->ira_ruifindex = irm->irm_ruifindex;
 647         ira->ira_pktlen = irm->irm_pktlen;
 648         ira->ira_ip_hdr_length = irm->irm_ip_hdr_length;
 649         ira->ira_protocol = irm->irm_protocol;
 650 
 651         ira->ira_sqp = irm->irm_sqp;
 652         /* The rest of IP assumes that the rings never go away. */
 653         ira->ira_ring = irm->irm_ring;
 654 
 655         ira->ira_zoneid = irm->irm_zoneid;
 656         ira->ira_mroute_tunnel = irm->irm_mroute_tunnel;
 657         ira->ira_no_loop_zoneid = irm->irm_no_loop_zoneid;
 658         ira->ira_esp_udp_ports = irm->irm_esp_udp_ports;
 659 
 660         if (irm->irm_tsl != NULL) {
 661                 ira->ira_tsl = irm->irm_tsl;
 662                 ira->ira_free_flags |= IRA_FREE_TSL;
 663                 irm->irm_tsl = NULL;
 664         }
 665         if (irm->irm_cred != NULL) {
 666                 ira->ira_cred = irm->irm_cred;
 667                 ira->ira_free_flags |= IRA_FREE_CRED;
 668                 irm->irm_cred = NULL;
 669         }
 670         ira->ira_cpid = irm->irm_cpid;
 671 
 672         if (ira->ira_flags & IRAF_L2SRC_SET)
 673                 bcopy(irm->irm_l2src, ira->ira_l2src, IRA_L2SRC_SIZE);
 674 
 675         ira->ira_ipsec_ah_sa = irm->irm_ipsec_ah_sa;
 676         ira->ira_ipsec_esp_sa = irm->irm_ipsec_esp_sa;
 677         ira->ira_ipsec_action = irm->irm_ipsec_action;
 678 
 679         freeb(iramp);
 680         return (B_TRUE);
 681 }
 682 
 683 /*
 684  * Free the irm mblk and any references it holds
 685  * Returns b_cont.
 686  */
 687 mblk_t *
 688 ip_recv_attr_free_mblk(mblk_t *iramp)
 689 {
 690         iramblk_t       *irm;
 691         mblk_t          *mp;
 692 
 693         /* Consume mp */
 694         ASSERT(DB_TYPE(iramp) == M_BREAK);
 695         mp = iramp->b_cont;
 696 
 697         irm = (iramblk_t *)iramp->b_rptr;
 698         ASSERT(irm->irm_inbound);
 699 
 700         if (irm->irm_ipsec_ah_sa != NULL) {
 701                 IPSA_REFRELE(irm->irm_ipsec_ah_sa);
 702                 irm->irm_ipsec_ah_sa = NULL;
 703         }
 704         if (irm->irm_ipsec_esp_sa != NULL) {
 705                 IPSA_REFRELE(irm->irm_ipsec_esp_sa);
 706                 irm->irm_ipsec_esp_sa = NULL;
 707         }
 708         if (irm->irm_ipsec_action != NULL) {
 709                 IPACT_REFRELE(irm->irm_ipsec_action);
 710                 irm->irm_ipsec_action = NULL;
 711         }
 712         if (irm->irm_tsl != NULL) {
 713                 label_rele(irm->irm_tsl);
 714                 irm->irm_tsl = NULL;
 715         }
 716         if (irm->irm_cred != NULL) {
 717                 crfree(irm->irm_cred);
 718                 irm->irm_cred = NULL;
 719         }
 720 
 721         freeb(iramp);
 722         return (mp);
 723 }
 724 
 725 /*
 726  * Returns true if the mblk contains an ip_recv_attr_t
 727  * For now we just check db_type.
 728  */
 729 boolean_t
 730 ip_recv_attr_is_mblk(mblk_t *mp)
 731 {
 732         /*
 733          * Need to handle the various forms of tcp_timermp which are tagged
 734          * with b_wptr and might have a NULL b_datap.
 735          */
 736         if (mp->b_wptr == NULL || mp->b_wptr == (uchar_t *)-1)
 737                 return (B_FALSE);
 738 
 739 #ifdef  DEBUG
 740         iramblk_t       *irm;
 741 
 742         if (DB_TYPE(mp) != M_BREAK)
 743                 return (B_FALSE);
 744 
 745         irm = (iramblk_t *)mp->b_rptr;
 746         ASSERT(irm->irm_inbound);
 747         return (B_TRUE);
 748 #else
 749         return (DB_TYPE(mp) == M_BREAK);
 750 #endif
 751 }
 752 
 753 static ip_xmit_attr_t *
 754 conn_get_ixa_impl(conn_t *connp, boolean_t replace, int kmflag)
 755 {
 756         ip_xmit_attr_t  *oldixa;        /* Already attached to conn_t */
 757         ip_xmit_attr_t  *ixa;           /* New one, which we return. */
 758 
 759         /*
 760          * NOTE: If the marked-below common case isn't, move the
 761          * kmem_alloc() up here and put a free in what was marked as the
 762          * (not really) common case instead.
 763          */
 764 
 765         mutex_enter(&connp->conn_lock);
 766         oldixa = connp->conn_ixa;
 767 
 768         /* At least one reference for the conn_t */
 769         ASSERT3U(oldixa->ixa_refcnt, >=, 1);
 770         if (atomic_inc_32_nv(&oldixa->ixa_refcnt) == 2) {
 771                 /* No other thread using conn_ixa (common case) */
 772                 mutex_exit(&connp->conn_lock);
 773                 return (oldixa);
 774         }
 775         /* Do allocation inside-the-conn_lock because it's less common. */
 776         ixa = kmem_alloc(sizeof (*ixa), kmflag);
 777         if (ixa == NULL) {
 778                 mutex_exit(&connp->conn_lock);
 779                 IXA_REFRELE(oldixa);
 780                 return (NULL);
 781         }
 782         ixa_safe_copy(oldixa, ixa);
 783 
 784         /* Make sure we drop conn_lock before any refrele */
 785         if (replace) {
 786                 ixa->ixa_refcnt++;   /* No atomic needed - not visible */
 787                 connp->conn_ixa = ixa;
 788                 mutex_exit(&connp->conn_lock);
 789                 IXA_REFRELE(oldixa);    /* Undo refcnt from conn_t */
 790         } else {
 791                 mutex_exit(&connp->conn_lock);
 792         }
 793         IXA_REFRELE(oldixa);    /* Undo above atomic_add_32_nv */
 794 
 795         return (ixa);
 796 }
 797 
 798 /*
 799  * Return an ip_xmit_attr_t to use with a conn_t that ensures that only
 800  * the caller can access the ip_xmit_attr_t.
 801  *
 802  * If nobody else is using conn_ixa we return it.
 803  * Otherwise we make a "safe" copy of conn_ixa
 804  * and return it. The "safe" copy has the pointers set to NULL
 805  * (since the pointers might be changed by another thread using
 806  * conn_ixa). The caller needs to check for NULL pointers to see
 807  * if ip_set_destination needs to be called to re-establish the pointers.
 808  *
 809  * If 'replace' is set then we replace conn_ixa with the new ip_xmit_attr_t.
 810  * That is used when we connect() the ULP.
 811  */
 812 ip_xmit_attr_t *
 813 conn_get_ixa(conn_t *connp, boolean_t replace)
 814 {
 815         return (conn_get_ixa_impl(connp, replace, KM_NOSLEEP));
 816 }
 817 
 818 /*
 819  * Used only when the option is to have the kernel hang due to not
 820  * cleaning up ixa references on ills etc.
 821  */
 822 ip_xmit_attr_t *
 823 conn_get_ixa_tryhard(conn_t *connp, boolean_t replace)
 824 {
 825         return (conn_get_ixa_impl(connp, replace, KM_SLEEP));
 826 }
 827 
 828 /*
 829  * Replace conn_ixa with the ixa argument.
 830  *
 831  * The caller must hold conn_lock.
 832  *
 833  * We return the old ixa; the caller must ixa_refrele that after conn_lock
 834  * has been dropped.
 835  */
 836 ip_xmit_attr_t *
 837 conn_replace_ixa(conn_t *connp, ip_xmit_attr_t *ixa)
 838 {
 839         ip_xmit_attr_t  *oldixa;
 840 
 841         ASSERT(MUTEX_HELD(&connp->conn_lock));
 842 
 843         oldixa = connp->conn_ixa;
 844         IXA_REFHOLD(ixa);
 845         ixa->ixa_conn_id = oldixa->ixa_conn_id;
 846         connp->conn_ixa = ixa;
 847         return (oldixa);
 848 }
 849 
 850 /*
 851  * Return a ip_xmit_attr_t to use with a conn_t that is based on but
 852  * separate from conn_ixa.
 853  *
 854  * This "safe" copy has the pointers set to NULL
 855  * (since the pointers might be changed by another thread using
 856  * conn_ixa). The caller needs to check for NULL pointers to see
 857  * if ip_set_destination needs to be called to re-establish the pointers.
 858  */
 859 ip_xmit_attr_t *
 860 conn_get_ixa_exclusive(conn_t *connp)
 861 {
 862         ip_xmit_attr_t *oldixa;
 863         ip_xmit_attr_t *ixa;
 864 
 865         ixa = kmem_alloc(sizeof (*ixa), KM_NOSLEEP_LAZY);
 866         if (ixa == NULL)
 867                 return (NULL);
 868 
 869         mutex_enter(&connp->conn_lock);
 870 
 871         oldixa = connp->conn_ixa;
 872         IXA_REFHOLD(oldixa);
 873 
 874         ixa_safe_copy(oldixa, ixa);
 875         mutex_exit(&connp->conn_lock);
 876         IXA_REFRELE(oldixa);
 877         return (ixa);
 878 }
 879 
 880 void
 881 ixa_safe_copy(ip_xmit_attr_t *src, ip_xmit_attr_t *ixa)
 882 {
 883         bcopy(src, ixa, sizeof (*ixa));
 884         ixa->ixa_refcnt = 1;
 885         /*
 886          * Clear any pointers that have references and might be changed
 887          * by ip_set_destination or the ULP
 888          */
 889         ixa->ixa_ire = NULL;
 890         ixa->ixa_nce = NULL;
 891         ixa->ixa_dce = NULL;
 892         ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
 893         ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
 894 #ifdef DEBUG
 895         ixa->ixa_curthread = NULL;
 896 #endif
 897         /* Clear all the IPsec pointers and the flag as well. */
 898         ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
 899 
 900         ixa->ixa_ipsec_latch = NULL;
 901         ixa->ixa_ipsec_ah_sa = NULL;
 902         ixa->ixa_ipsec_esp_sa = NULL;
 903         ixa->ixa_ipsec_policy = NULL;
 904         ixa->ixa_ipsec_action = NULL;
 905 
 906         /*
 907          * We leave ixa_tsl unchanged, but if it has a refhold we need
 908          * to get an extra refhold.
 909          */
 910         if (ixa->ixa_free_flags & IXA_FREE_TSL)
 911                 label_hold(ixa->ixa_tsl);
 912 
 913         /*
 914          * We leave ixa_cred unchanged, but if it has a refhold we need
 915          * to get an extra refhold.
 916          */
 917         if (ixa->ixa_free_flags & IXA_FREE_CRED)
 918                 crhold(ixa->ixa_cred);
 919 
 920         /*
 921          * There is no cleanup in progress on this new copy.
 922          */
 923         ixa->ixa_tcpcleanup = IXATC_IDLE;
 924 }
 925 
 926 /*
 927  * Duplicate an ip_xmit_attr_t.
 928  * Assumes that the caller controls the ixa, hence we do not need to use
 929  * a safe copy. We just have to increase the refcnt on any pointers.
 930  */
 931 ip_xmit_attr_t *
 932 ip_xmit_attr_duplicate(ip_xmit_attr_t *src_ixa)
 933 {
 934         ip_xmit_attr_t *ixa;
 935 
 936         ixa = kmem_alloc(sizeof (*ixa), KM_NOSLEEP);
 937         if (ixa == NULL)
 938                 return (NULL);
 939         bcopy(src_ixa, ixa, sizeof (*ixa));
 940         ixa->ixa_refcnt = 1;
 941 
 942         if (ixa->ixa_ire != NULL)
 943                 ire_refhold_notr(ixa->ixa_ire);
 944         if (ixa->ixa_nce != NULL)
 945                 nce_refhold(ixa->ixa_nce);
 946         if (ixa->ixa_dce != NULL)
 947                 dce_refhold_notr(ixa->ixa_dce);
 948 
 949 #ifdef DEBUG
 950         ixa->ixa_curthread = NULL;
 951 #endif
 952 
 953         if (ixa->ixa_ipsec_latch != NULL)
 954                 IPLATCH_REFHOLD(ixa->ixa_ipsec_latch);
 955         if (ixa->ixa_ipsec_ah_sa != NULL)
 956                 IPSA_REFHOLD(ixa->ixa_ipsec_ah_sa);
 957         if (ixa->ixa_ipsec_esp_sa != NULL)
 958                 IPSA_REFHOLD(ixa->ixa_ipsec_esp_sa);
 959         if (ixa->ixa_ipsec_policy != NULL)
 960                 IPPOL_REFHOLD(ixa->ixa_ipsec_policy);
 961         if (ixa->ixa_ipsec_action != NULL)
 962                 IPACT_REFHOLD(ixa->ixa_ipsec_action);
 963 
 964         if (ixa->ixa_tsl != NULL) {
 965                 label_hold(ixa->ixa_tsl);
 966                 ixa->ixa_free_flags |= IXA_FREE_TSL;
 967         }
 968         if (ixa->ixa_cred != NULL) {
 969                 crhold(ixa->ixa_cred);
 970                 ixa->ixa_free_flags |= IXA_FREE_CRED;
 971         }
 972         return (ixa);
 973 }
 974 
 975 /*
 976  * Used to replace the ixa_label field.
 977  * The caller should have a reference on the label, which we transfer to
 978  * the attributes so that when the attribute is freed/cleaned up
 979  * we will release that reference.
 980  */
 981 void
 982 ip_xmit_attr_replace_tsl(ip_xmit_attr_t *ixa, ts_label_t *tsl)
 983 {
 984         ASSERT(tsl != NULL);
 985 
 986         if (ixa->ixa_free_flags & IXA_FREE_TSL) {
 987                 ASSERT(ixa->ixa_tsl != NULL);
 988                 label_rele(ixa->ixa_tsl);
 989         } else {
 990                 ixa->ixa_free_flags |= IXA_FREE_TSL;
 991         }
 992         ixa->ixa_tsl = tsl;
 993 }
 994 
 995 /*
 996  * Replace the ip_recv_attr_t's label.
 997  * Due to kernel RPC's use of db_credp we also need to replace ira_cred;
 998  * TCP/UDP uses ira_cred to set db_credp for non-socket users.
 999  * This can fail (and return B_FALSE) due to lack of memory.
1000  */
1001 boolean_t
1002 ip_recv_attr_replace_label(ip_recv_attr_t *ira, ts_label_t *tsl)
1003 {
1004         cred_t  *newcr;
1005 
1006         if (ira->ira_free_flags & IRA_FREE_TSL) {
1007                 ASSERT(ira->ira_tsl != NULL);
1008                 label_rele(ira->ira_tsl);
1009         }
1010         label_hold(tsl);
1011         ira->ira_tsl = tsl;
1012         ira->ira_free_flags |= IRA_FREE_TSL;
1013 
1014         /*
1015          * Reset zoneid if we have a shared address. That allows
1016          * ip_fanout_tx_v4/v6 to determine the zoneid again.
1017          */
1018         if (ira->ira_flags & IRAF_TX_SHARED_ADDR)
1019                 ira->ira_zoneid = ALL_ZONES;
1020 
1021         /* We update ira_cred for RPC */
1022         newcr = copycred_from_tslabel(ira->ira_cred, ira->ira_tsl, KM_NOSLEEP);
1023         if (newcr == NULL)
1024                 return (B_FALSE);
1025         if (ira->ira_free_flags & IRA_FREE_CRED)
1026                 crfree(ira->ira_cred);
1027         ira->ira_cred = newcr;
1028         ira->ira_free_flags |= IRA_FREE_CRED;
1029         return (B_TRUE);
1030 }
1031 
1032 /*
1033  * This needs to be called after ip_set_destination/tsol_check_dest might
1034  * have changed ixa_tsl to be specific for a destination, and we now want to
1035  * send to a different destination.
1036  * We have to restart with crgetlabel() since ip_set_destination/
1037  * tsol_check_dest will start with ixa_tsl.
1038  */
1039 void
1040 ip_xmit_attr_restore_tsl(ip_xmit_attr_t *ixa, cred_t *cr)
1041 {
1042         if (!is_system_labeled())
1043                 return;
1044 
1045         if (ixa->ixa_free_flags & IXA_FREE_TSL) {
1046                 ASSERT(ixa->ixa_tsl != NULL);
1047                 label_rele(ixa->ixa_tsl);
1048                 ixa->ixa_free_flags &= ~IXA_FREE_TSL;
1049         }
1050         ixa->ixa_tsl = crgetlabel(cr);
1051 }
1052 
1053 void
1054 ixa_refrele(ip_xmit_attr_t *ixa)
1055 {
1056         IXA_REFRELE(ixa);
1057 }
1058 
1059 void
1060 ixa_inactive(ip_xmit_attr_t *ixa)
1061 {
1062         ASSERT(ixa->ixa_refcnt == 0);
1063 
1064         ixa_cleanup(ixa);
1065         kmem_free(ixa, sizeof (*ixa));
1066 }
1067 
1068 /*
1069  * Release any references contained in the ixa.
1070  * Also clear any fields that are not controlled by ixa_flags.
1071  */
1072 void
1073 ixa_cleanup(ip_xmit_attr_t *ixa)
1074 {
1075         if (ixa->ixa_ire != NULL) {
1076                 ire_refrele_notr(ixa->ixa_ire);
1077                 ixa->ixa_ire = NULL;
1078         }
1079         if (ixa->ixa_dce != NULL) {
1080                 dce_refrele_notr(ixa->ixa_dce);
1081                 ixa->ixa_dce = NULL;
1082         }
1083         if (ixa->ixa_nce != NULL) {
1084                 nce_refrele(ixa->ixa_nce);
1085                 ixa->ixa_nce = NULL;
1086         }
1087         ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
1088         ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
1089         if (ixa->ixa_flags & IXAF_IPSEC_SECURE) {
1090                 ipsec_out_release_refs(ixa);
1091         }
1092         if (ixa->ixa_free_flags & IXA_FREE_TSL) {
1093                 ASSERT(ixa->ixa_tsl != NULL);
1094                 label_rele(ixa->ixa_tsl);
1095                 ixa->ixa_free_flags &= ~IXA_FREE_TSL;
1096         }
1097         ixa->ixa_tsl = NULL;
1098         if (ixa->ixa_free_flags & IXA_FREE_CRED) {
1099                 ASSERT(ixa->ixa_cred != NULL);
1100                 crfree(ixa->ixa_cred);
1101                 ixa->ixa_free_flags &= ~IXA_FREE_CRED;
1102         }
1103         ixa->ixa_cred = NULL;
1104         ixa->ixa_src_preferences = 0;
1105         ixa->ixa_ifindex = 0;
1106         ixa->ixa_multicast_ifindex = 0;
1107         ixa->ixa_multicast_ifaddr = INADDR_ANY;
1108 }
1109 
1110 /*
1111  * Release any references contained in the ira.
1112  * Callers which use ip_recv_attr_from_mblk() would pass B_TRUE as the second
1113  * argument.
1114  */
1115 void
1116 ira_cleanup(ip_recv_attr_t *ira, boolean_t refrele_ill)
1117 {
1118         if (ira->ira_ill != NULL) {
1119                 if (ira->ira_rill != ira->ira_ill) {
1120                         /* Caused by async processing */
1121                         ill_refrele(ira->ira_rill);
1122                 }
1123                 if (refrele_ill)
1124                         ill_refrele(ira->ira_ill);
1125         }
1126         if (ira->ira_flags & IRAF_IPSEC_SECURE) {
1127                 ipsec_in_release_refs(ira);
1128         }
1129         if (ira->ira_free_flags & IRA_FREE_TSL) {
1130                 ASSERT(ira->ira_tsl != NULL);
1131                 label_rele(ira->ira_tsl);
1132                 ira->ira_free_flags &= ~IRA_FREE_TSL;
1133         }
1134         ira->ira_tsl = NULL;
1135         if (ira->ira_free_flags & IRA_FREE_CRED) {
1136                 ASSERT(ira->ira_cred != NULL);
1137                 crfree(ira->ira_cred);
1138                 ira->ira_free_flags &= ~IRA_FREE_CRED;
1139         }
1140         ira->ira_cred = NULL;
1141 }
1142 
1143 /*
1144  * Function to help release any IRE, NCE, or DCEs that
1145  * have been deleted and are marked as condemned.
1146  * The caller is responsible for any serialization which is different
1147  * for TCP, SCTP, and others.
1148  */
1149 static void
1150 ixa_cleanup_stale(ip_xmit_attr_t *ixa)
1151 {
1152         ire_t           *ire;
1153         nce_t           *nce;
1154         dce_t           *dce;
1155 
1156         ire = ixa->ixa_ire;
1157         nce = ixa->ixa_nce;
1158         dce = ixa->ixa_dce;
1159 
1160         if (ire != NULL && IRE_IS_CONDEMNED(ire)) {
1161                 ire_refrele_notr(ire);
1162                 ire = ire_blackhole(ixa->ixa_ipst,
1163                     !(ixa->ixa_flags & IXAF_IS_IPV4));
1164                 ASSERT(ire != NULL);
1165 #ifdef DEBUG
1166                 ire_refhold_notr(ire);
1167                 ire_refrele(ire);
1168 #endif
1169                 ixa->ixa_ire = ire;
1170                 ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
1171         }
1172         if (nce != NULL && nce->nce_is_condemned) {
1173                 /* Can make it NULL as long as we set IRE_GENERATION_VERIFY */
1174                 nce_refrele(nce);
1175                 ixa->ixa_nce = NULL;
1176                 ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
1177         }
1178         if (dce != NULL && DCE_IS_CONDEMNED(dce)) {
1179                 dce_refrele_notr(dce);
1180                 dce = dce_get_default(ixa->ixa_ipst);
1181                 ASSERT(dce != NULL);
1182 #ifdef DEBUG
1183                 dce_refhold_notr(dce);
1184                 dce_refrele(dce);
1185 #endif
1186                 ixa->ixa_dce = dce;
1187                 ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
1188         }
1189 }
1190 
1191 static mblk_t *
1192 tcp_ixa_cleanup_getmblk(conn_t *connp)
1193 {
1194         tcp_stack_t *tcps = connp->conn_netstack->netstack_tcp;
1195         int need_retry;
1196         mblk_t *mp;
1197 
1198         mutex_enter(&tcps->tcps_ixa_cleanup_lock);
1199 
1200         /*
1201          * It's possible that someone else came in and started cleaning up
1202          * another connection between the time we verified this one is not being
1203          * cleaned up and the time we actually get the shared mblk.  If that's
1204          * the case, we've dropped the lock, and some other thread may have
1205          * cleaned up this connection again, and is still waiting for
1206          * notification of that cleanup's completion.  Therefore we need to
1207          * recheck.
1208          */
1209         do {
1210                 need_retry = 0;
1211                 while (connp->conn_ixa->ixa_tcpcleanup != IXATC_IDLE) {
1212                         cv_wait(&tcps->tcps_ixa_cleanup_done_cv,
1213                             &tcps->tcps_ixa_cleanup_lock);
1214                 }
1215 
1216                 while ((mp = tcps->tcps_ixa_cleanup_mp) == NULL) {
1217                         /*
1218                          * Multiple concurrent cleanups; need to have the last
1219                          * one run since it could be an unplumb.
1220                          */
1221                         need_retry = 1;
1222                         cv_wait(&tcps->tcps_ixa_cleanup_ready_cv,
1223                             &tcps->tcps_ixa_cleanup_lock);
1224                 }
1225         } while (need_retry);
1226 
1227         /*
1228          * We now have the lock and the mblk; now make sure that no one else can
1229          * try to clean up this connection or enqueue it for cleanup, clear the
1230          * mblk pointer for this stack, drop the lock, and return the mblk.
1231          */
1232         ASSERT(MUTEX_HELD(&tcps->tcps_ixa_cleanup_lock));
1233         ASSERT(connp->conn_ixa->ixa_tcpcleanup == IXATC_IDLE);
1234         ASSERT(tcps->tcps_ixa_cleanup_mp == mp);
1235         ASSERT(mp != NULL);
1236 
1237         connp->conn_ixa->ixa_tcpcleanup = IXATC_INPROGRESS;
1238         tcps->tcps_ixa_cleanup_mp = NULL;
1239         mutex_exit(&tcps->tcps_ixa_cleanup_lock);
1240 
1241         return (mp);
1242 }
1243 
1244 /*
1245  * Used to run ixa_cleanup_stale inside the tcp squeue.
1246  * When done we hand the mp back by assigning it to tcps_ixa_cleanup_mp
1247  * and waking up the caller.
1248  */
1249 /* ARGSUSED2 */
1250 static void
1251 tcp_ixa_cleanup(void *arg, mblk_t *mp, void *arg2,
1252     ip_recv_attr_t *dummy)
1253 {
1254         conn_t  *connp = (conn_t *)arg;
1255         tcp_stack_t     *tcps;
1256 
1257         tcps = connp->conn_netstack->netstack_tcp;
1258 
1259         ixa_cleanup_stale(connp->conn_ixa);
1260 
1261         mutex_enter(&tcps->tcps_ixa_cleanup_lock);
1262         ASSERT(tcps->tcps_ixa_cleanup_mp == NULL);
1263         connp->conn_ixa->ixa_tcpcleanup = IXATC_COMPLETE;
1264         tcps->tcps_ixa_cleanup_mp = mp;
1265         cv_signal(&tcps->tcps_ixa_cleanup_ready_cv);
1266         /*
1267          * It is possible for any number of threads to be waiting for cleanup of
1268          * different connections.  Absent a per-connection (or per-IXA) CV, we
1269          * need to wake them all up even though only one can be waiting on this
1270          * particular cleanup.
1271          */
1272         cv_broadcast(&tcps->tcps_ixa_cleanup_done_cv);
1273         mutex_exit(&tcps->tcps_ixa_cleanup_lock);
1274 }
1275 
1276 static void
1277 tcp_ixa_cleanup_wait_and_finish(conn_t *connp)
1278 {
1279         tcp_stack_t *tcps = connp->conn_netstack->netstack_tcp;
1280 
1281         mutex_enter(&tcps->tcps_ixa_cleanup_lock);
1282 
1283         ASSERT(connp->conn_ixa->ixa_tcpcleanup != IXATC_IDLE);
1284 
1285         while (connp->conn_ixa->ixa_tcpcleanup == IXATC_INPROGRESS) {
1286                 cv_wait(&tcps->tcps_ixa_cleanup_done_cv,
1287                     &tcps->tcps_ixa_cleanup_lock);
1288         }
1289 
1290         ASSERT(connp->conn_ixa->ixa_tcpcleanup == IXATC_COMPLETE);
1291         connp->conn_ixa->ixa_tcpcleanup = IXATC_IDLE;
1292         cv_broadcast(&tcps->tcps_ixa_cleanup_done_cv);
1293 
1294         mutex_exit(&tcps->tcps_ixa_cleanup_lock);
1295 }
1296 
1297 /*
1298  * ipcl_walk() function to help release any IRE, NCE, or DCEs that
1299  * have been deleted and are marked as condemned.
1300  * Note that we can't cleanup the pointers since there can be threads
1301  * in conn_ip_output() sending while we are called.
1302  */
1303 void
1304 conn_ixa_cleanup(conn_t *connp, void *arg)
1305 {
1306         boolean_t tryhard = (boolean_t)arg;
1307 
1308         if (IPCL_IS_TCP(connp)) {
1309                 mblk_t          *mp;
1310 
1311                 mp = tcp_ixa_cleanup_getmblk(connp);
1312 
1313                 if (connp->conn_sqp->sq_run == curthread) {
1314                         /* Already on squeue */
1315                         tcp_ixa_cleanup(connp, mp, NULL, NULL);
1316                 } else {
1317                         CONN_INC_REF(connp);
1318                         SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_ixa_cleanup,
1319                             connp, NULL, SQ_PROCESS, SQTAG_TCP_IXA_CLEANUP);
1320                 }
1321                 tcp_ixa_cleanup_wait_and_finish(connp);
1322         } else if (IPCL_IS_SCTP(connp)) {
1323                 sctp_t  *sctp;
1324                 sctp_faddr_t *fp;
1325 
1326                 sctp = CONN2SCTP(connp);
1327                 RUN_SCTP(sctp);
1328                 ixa_cleanup_stale(connp->conn_ixa);
1329                 for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->sf_next)
1330                         ixa_cleanup_stale(fp->sf_ixa);
1331                 WAKE_SCTP(sctp);
1332         } else {
1333                 ip_xmit_attr_t  *ixa;
1334 
1335                 /*
1336                  * If there is a different thread using conn_ixa then we get a
1337                  * new copy and cut the old one loose from conn_ixa. Otherwise
1338                  * we use conn_ixa and prevent any other thread from
1339                  * using/changing it. Anybody using conn_ixa (e.g., a thread in
1340                  * conn_ip_output) will do an ixa_refrele which will remove any
1341                  * references on the ire etc.
1342                  *
1343                  * Once we are done other threads can use conn_ixa since the
1344                  * refcnt will be back at one.
1345                  *
1346                  * We are called either because an ill is going away, or
1347                  * due to memory reclaim. In the former case we wait for
1348                  * memory since we must remove the refcnts on the ill.
1349                  */
1350                 if (tryhard) {
1351                         ixa = conn_get_ixa_tryhard(connp, B_TRUE);
1352                         ASSERT(ixa != NULL);
1353                 } else {
1354                         ixa = conn_get_ixa(connp, B_TRUE);
1355                         if (ixa == NULL) {
1356                                 /*
1357                                  * Somebody else was using it and kmem_alloc
1358                                  * failed! Next memory reclaim will try to
1359                                  * clean up.
1360                                  */
1361                                 DTRACE_PROBE1(conn__ixa__cleanup__bail,
1362                                     conn_t *, connp);
1363                                 return;
1364                         }
1365                 }
1366                 ixa_cleanup_stale(ixa);
1367                 IXA_REFRELE(ixa);
1368         }
1369 }
1370 
1371 /*
1372  * ixa needs to be an exclusive copy so that no one changes the cookie
1373  * or the ixa_nce.
1374  */
1375 boolean_t
1376 ixa_check_drain_insert(conn_t *connp, ip_xmit_attr_t *ixa)
1377 {
1378         uintptr_t cookie = ixa->ixa_cookie;
1379         ill_dld_direct_t *idd;
1380         idl_tx_list_t *idl_txl;
1381         ill_t *ill = ixa->ixa_nce->nce_ill;
1382         boolean_t inserted = B_FALSE;
1383 
1384         idd = &(ill)->ill_dld_capab->idc_direct;
1385         idl_txl = &ixa->ixa_ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)];
1386         mutex_enter(&idl_txl->txl_lock);
1387 
1388         /*
1389          * If `cookie' is zero, ip_xmit() -> canputnext() failed -- i.e., flow
1390          * control is asserted on an ill that does not support direct calls.
1391          * Jump to insert.
1392          */
1393         if (cookie == 0)
1394                 goto tryinsert;
1395 
1396         ASSERT(ILL_DIRECT_CAPABLE(ill));
1397 
1398         if (idd->idd_tx_fctl_df(idd->idd_tx_fctl_dh, cookie) == 0) {
1399                 DTRACE_PROBE1(ill__tx__not__blocked, uintptr_t, cookie);
1400         } else if (idl_txl->txl_cookie != (uintptr_t)NULL &&
1401             idl_txl->txl_cookie != ixa->ixa_cookie) {
1402                 DTRACE_PROBE2(ill__tx__cookie__collision, uintptr_t, cookie,
1403                     uintptr_t, idl_txl->txl_cookie);
1404                 /* TODO: bump kstat for cookie collision */
1405         } else {
1406                 /*
1407                  * Check/set conn_blocked under conn_lock.  Note that txl_lock
1408                  * will not suffice since two separate UDP threads may be
1409                  * racing to send to different destinations that are
1410                  * associated with different cookies and thus may not be
1411                  * holding the same txl_lock.  Further, since a given conn_t
1412                  * can only be on a single drain list, the conn_t will be
1413                  * enqueued on whichever thread wins this race.
1414                  */
1415 tryinsert:      mutex_enter(&connp->conn_lock);
1416                 if (connp->conn_blocked) {
1417                         DTRACE_PROBE1(ill__tx__conn__already__blocked,
1418                             conn_t *, connp);
1419                         mutex_exit(&connp->conn_lock);
1420                 } else {
1421                         connp->conn_blocked = B_TRUE;
1422                         mutex_exit(&connp->conn_lock);
1423                         idl_txl->txl_cookie = cookie;
1424                         conn_drain_insert(connp, idl_txl);
1425                         if (!IPCL_IS_NONSTR(connp))
1426                                 noenable(connp->conn_wq);
1427                         inserted = B_TRUE;
1428                 }
1429         }
1430         mutex_exit(&idl_txl->txl_lock);
1431         return (inserted);
1432 }