1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2013 by Delphix. All rights reserved.
  24  * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
  25  * Copyright 2016 Joyent, Inc.
  26  */
  27 /* Copyright (c) 1990 Mentat Inc. */
  28 
  29 #include <sys/types.h>
  30 #include <sys/stream.h>
  31 #include <sys/stropts.h>
  32 #include <sys/strlog.h>
  33 #include <sys/strsun.h>
  34 #define _SUN_TPI_VERSION 2
  35 #include <sys/tihdr.h>
  36 #include <sys/timod.h>
  37 #include <sys/ddi.h>
  38 #include <sys/sunddi.h>
  39 #include <sys/strsubr.h>
  40 #include <sys/suntpi.h>
  41 #include <sys/xti_inet.h>
  42 #include <sys/cmn_err.h>
  43 #include <sys/kmem.h>
  44 #include <sys/cred.h>
  45 #include <sys/policy.h>
  46 #include <sys/priv.h>
  47 #include <sys/ucred.h>
  48 #include <sys/zone.h>
  49 
  50 #include <sys/sockio.h>
  51 #include <sys/socket.h>
  52 #include <sys/socketvar.h>
  53 #include <sys/vtrace.h>
  54 #include <sys/sdt.h>
  55 #include <sys/debug.h>
  56 #include <sys/isa_defs.h>
  57 #include <sys/random.h>
  58 #include <netinet/in.h>
  59 #include <netinet/ip6.h>
  60 #include <netinet/icmp6.h>
  61 #include <netinet/udp.h>
  62 
  63 #include <inet/common.h>
  64 #include <inet/ip.h>
  65 #include <inet/ip_impl.h>
  66 #include <inet/ipsec_impl.h>
  67 #include <inet/ip6.h>
  68 #include <inet/ip_ire.h>
  69 #include <inet/ip_if.h>
  70 #include <inet/ip_multi.h>
  71 #include <inet/ip_ndp.h>
  72 #include <inet/proto_set.h>
  73 #include <inet/mib2.h>
  74 #include <inet/nd.h>
  75 #include <inet/optcom.h>
  76 #include <inet/snmpcom.h>
  77 #include <inet/kstatcom.h>
  78 #include <inet/ipclassifier.h>
  79 
  80 #include <sys/tsol/label.h>
  81 #include <sys/tsol/tnet.h>
  82 
  83 #include <inet/rawip_impl.h>
  84 #include <net/bpf.h>
  85 
  86 #include <sys/disp.h>
  87 
  88 /*
  89  * Synchronization notes:
  90  *
  91  * RAWIP is MT and uses the usual kernel synchronization primitives. We use
  92  * conn_lock to protect the icmp_t.
  93  *
  94  * Plumbing notes:
  95  * ICMP is always a device driver. For compatibility with mibopen() code
  96  * it is possible to I_PUSH "icmp", but that results in pushing a passthrough
  97  * dummy module.
  98  */
/*
 * Forward declarations.
 *
 * Everything declared static here is private to this file.  The few
 * prototypes without static (icmp_opt_set, icmp_opt_get, rawip_getsockname
 * and rawip_getpeername) have external linkage.
 */

/* icmp_* routines */
static void     icmp_addr_req(queue_t *q, mblk_t *mp);
static void     icmp_tpi_bind(queue_t *q, mblk_t *mp);
static void     icmp_bind_proto(icmp_t *icmp);
static int      icmp_build_hdr_template(conn_t *, const in6_addr_t *,
    const in6_addr_t *, uint32_t);
static void     icmp_capability_req(queue_t *q, mblk_t *mp);
static int      icmp_close(queue_t *q, int flags);
static void     icmp_close_free(conn_t *);
static void     icmp_tpi_connect(queue_t *q, mblk_t *mp);
static void     icmp_tpi_disconnect(queue_t *q, mblk_t *mp);
static void     icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
    int sys_error);
static void     icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
    t_scalar_t tlierr, int sys_error);
static void     icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2,
    ip_recv_attr_t *);
static void     icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp,
    ip_recv_attr_t *);
static void     icmp_info_req(queue_t *q, mblk_t *mp);
static void     icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
static conn_t   *icmp_open(int family, cred_t *credp, int *err, int flags);
static int      icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag,
                    cred_t *credp);
static int      icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag,
                    cred_t *credp);
static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name);
/* Not static: option set/get routines visible outside this file. */
int             icmp_opt_set(conn_t *connp, uint_t optset_context,
                    int level, int name, uint_t inlen,
                    uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
                    void *thisdg_attrs, cred_t *cr);
int             icmp_opt_get(conn_t *connp, int level, int name,
                    uchar_t *ptr);
static int      icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin,
                    sin6_t *sin6, cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa);
static mblk_t   *icmp_prepend_hdr(conn_t *, ip_xmit_attr_t *, const ip_pkt_t *,
    const in6_addr_t *, const in6_addr_t *, uint32_t, mblk_t *, int *);
static mblk_t   *icmp_prepend_header_template(conn_t *, ip_xmit_attr_t *,
    mblk_t *, const in6_addr_t *, uint32_t, int *);
static int      icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
                    uchar_t *ptr, int len);
static void     icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err);
static void     icmp_tpi_unbind(queue_t *q, mblk_t *mp);
static void     icmp_wput(queue_t *q, mblk_t *mp);
static void     icmp_wput_fallback(queue_t *q, mblk_t *mp);
static void     icmp_wput_other(queue_t *q, mblk_t *mp);
static void     icmp_wput_iocdata(queue_t *q, mblk_t *mp);
static void     icmp_wput_restricted(queue_t *q, mblk_t *mp);
static void     icmp_ulp_recv(conn_t *, mblk_t *, uint_t);

/* Per-netstack setup and teardown */
static void     *rawip_stack_init(netstackid_t stackid, netstack_t *ns);
static void     rawip_stack_fini(netstackid_t stackid, void *arg);

/* Per-netstack kstat handling */
static void     *rawip_kstat_init(netstackid_t stackid);
static void     rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp);
static int      rawip_kstat_update(kstat_t *kp, int rw);
static void     rawip_stack_shutdown(netstackid_t stackid, void *arg);

/* Common routines for TPI and socket module */
static conn_t   *rawip_do_open(int, cred_t *, int *, int);
static void     rawip_do_close(conn_t *);
static int      rawip_do_bind(conn_t *, struct sockaddr *, socklen_t);
static int      rawip_do_unbind(conn_t *);
static int      rawip_do_connect(conn_t *, const struct sockaddr *, socklen_t,
    cred_t *, pid_t);

/* Not static: name lookup routines taking a sock_lower_handle_t. */
int             rawip_getsockname(sock_lower_handle_t, struct sockaddr *,
                    socklen_t *, cred_t *);
int             rawip_getpeername(sock_lower_handle_t, struct sockaddr *,
                    socklen_t *, cred_t *);
 168 
/*
 * STREAMS module information referenced by every qinit table in this file.
 * NOTE(review): the positional initializers presumably follow the
 * struct module_info field order (module id, module name, minimum packet
 * size, maximum packet size, high-water mark, low-water mark) - confirm
 * against <sys/stream.h>.
 */
static struct module_info icmp_mod_info =  {
        5707, "icmp", 1, INFPSZ, 512, 128
};
 172 
 173 /*
 174  * Entry points for ICMP as a device.
 175  * We have separate open functions for the /dev/icmp and /dev/icmp6 devices.
 176  */
/*
 * qinit for the /dev/icmp device; the only routines supplied are
 * icmp_openv4 and icmp_close.
 * NOTE(review): the slots presumably follow the struct qinit order
 * (put, service, open, close, admin, module info) - confirm against
 * <sys/stream.h>.
 */
static struct qinit icmprinitv4 = {
        NULL, NULL, icmp_openv4, icmp_close, NULL, &icmp_mod_info
};

/* Same as icmprinitv4 except that it opens through icmp_openv6. */
static struct qinit icmprinitv6 = {
        NULL, NULL, icmp_openv6, icmp_close, NULL, &icmp_mod_info
};

/*
 * qinit shared by both streamtabs below; it supplies icmp_wput and ip_wsrv
 * and has no open or close routine.
 */
static struct qinit icmpwinit = {
        (pfi_t)icmp_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL, &icmp_mod_info
};

/* ICMP entry point during fallback; only icmp_wput_fallback is supplied. */
static struct qinit icmp_fallback_sock_winit = {
        (pfi_t)icmp_wput_fallback, NULL, NULL, NULL, NULL, &icmp_mod_info
};

/* For AF_INET aka /dev/icmp */
struct streamtab icmpinfov4 = {
        &icmprinitv4, &icmpwinit
};

/* For AF_INET6 aka /dev/icmp6 */
struct streamtab icmpinfov6 = {
        &icmprinitv6, &icmpwinit
};
 203 
/*
 * Default structure copied into T_INFO_ACK messages.  The address and
 * option sizes are left as zero here; per the field comments they are
 * filled in elsewhere.
 */
static struct T_info_ack icmp_g_t_info_ack = {
        T_INFO_ACK,     /* PRIM_type. */
        IP_MAXPACKET,   /* TSDU_size.  icmp allows maximum size messages. */
        T_INVALID,      /* ETSDU_size.  icmp does not support expedited data. */
        T_INVALID,      /* CDATA_size. icmp does not support connect data. */
        T_INVALID,      /* DDATA_size. icmp does not support disconnect data. */
        0,              /* ADDR_size - filled in later. */
        0,              /* OPT_size - not initialized here */
        IP_MAXPACKET,   /* TIDU_size.  icmp allows maximum size messages. */
        T_CLTS,         /* SERV_type.  icmp supports connection-less. */
        TS_UNBND,       /* CURRENT_state.  This is set from icmp_state. */
        (XPG4_1|SENDZERO) /* PROVIDER_flag */
};
 218 
 219 static int
 220 icmp_set_buf_prop(netstack_t *stack, cred_t *cr, mod_prop_info_t *pinfo,
 221     const char *ifname, const void *pval, uint_t flags)
 222 {
 223         return (mod_set_buf_prop(stack->netstack_icmp->is_propinfo_tbl,
 224             stack, cr, pinfo, ifname, pval, flags));
 225 }
 226 
 227 static int
 228 icmp_get_buf_prop(netstack_t *stack, mod_prop_info_t *pinfo, const char *ifname,
 229     void *val, uint_t psize, uint_t flags)
 230 {
 231         return (mod_get_buf_prop(stack->netstack_icmp->is_propinfo_tbl, stack,
 232             pinfo, ifname, val, psize, flags));
 233 }
 234 
 235 /*
 236  * All of these are alterable, within the min/max values given, at run time.
 237  *
 * Note: All those tunables which do not start with "_" are Committed and
 * therefore are public. See PSARC 2010/080.
 240  */
/*
 * ICMP property table.  The position of each entry matters: the is_*
 * accessor macros defined after this table index is_propinfo_tbl by number
 * (presumably a per-stack copy of this table - confirm in the stack init
 * code), so new entries must not be inserted ahead of existing ones without
 * renumbering those macros.
 *
 * NOTE(review): each entry looks like { name, protocol, set handler,
 * get handler, {min, max, default}, {current} } for integer properties and
 * { ..., {default}, {current} } for booleans - confirm against the
 * mod_prop_info_t definition.
 */
static mod_prop_info_t icmp_propinfo_tbl[] = {
        /* tunable - 0 (is_wroff_extra) */
        { "_wroff_extra", MOD_PROTO_RAWIP,
            mod_set_uint32, mod_get_uint32,
            {0, 128, 32}, {32} },

        /* tunable - 1 (is_ipv4_ttl) */
        { "_ipv4_ttl", MOD_PROTO_RAWIP,
            mod_set_uint32, mod_get_uint32,
            {1, 255, 255}, {255} },

        /* tunable - 2 (is_ipv6_hoplimit) */
        { "_ipv6_hoplimit", MOD_PROTO_RAWIP,
            mod_set_uint32, mod_get_uint32,
            {0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS},
            {IPV6_DEFAULT_HOPS} },

        /* tunable - 3 (is_bsd_compat) */
        { "_bsd_compat", MOD_PROTO_RAWIP,
            mod_set_boolean, mod_get_boolean,
            {B_TRUE}, {B_TRUE} },

        /* tunable - 4 (is_xmit_hiwat) */
        { "send_buf", MOD_PROTO_RAWIP,
            icmp_set_buf_prop, icmp_get_buf_prop,
            {4096, 65536, 8192}, {8192} },

        /* tunable - 5 (is_xmit_lowat) */
        { "_xmit_lowat", MOD_PROTO_RAWIP,
            mod_set_uint32, mod_get_uint32,
            {0, 65536, 1024}, {1024} },

        /* tunable - 6 (is_recv_hiwat) */
        { "recv_buf", MOD_PROTO_RAWIP,
            icmp_set_buf_prop, icmp_get_buf_prop,
            {4096, 65536, 8192}, {8192} },

        /* tunable - 7 (is_max_buf) */
        { "max_buf", MOD_PROTO_RAWIP,
            mod_set_uint32, mod_get_uint32,
            {65536, ULP_MAX_BUF, 256*1024}, {256*1024} },

        /* tunable - 8 (is_pmtu_discovery) */
        { "_pmtu_discovery", MOD_PROTO_RAWIP,
            mod_set_boolean, mod_get_boolean,
            {B_FALSE}, {B_FALSE} },

        /* tunable - 9 (is_sendto_ignerr) */
        { "_sendto_ignerr", MOD_PROTO_RAWIP,
            mod_set_boolean, mod_get_boolean,
            {B_FALSE}, {B_FALSE} },

        /* Get-only entry (no set handler) served by mod_get_allprop. */
        { "?", MOD_PROTO_RAWIP, NULL, mod_get_allprop, {0}, {0} },

        /* Terminator: NULL name. */
        { NULL, 0, NULL, NULL, {0}, {0} }
};
 288 
/*
 * Shorthand for the current value of each property, selected by its
 * position in is_propinfo_tbl.  The index used here must match the position
 * of the corresponding entry in icmp_propinfo_tbl; note that "send_buf" is
 * reached as is_xmit_hiwat and "recv_buf" as is_recv_hiwat.
 */
#define is_wroff_extra                  is_propinfo_tbl[0].prop_cur_uval
#define is_ipv4_ttl                     is_propinfo_tbl[1].prop_cur_uval
#define is_ipv6_hoplimit                is_propinfo_tbl[2].prop_cur_uval
#define is_bsd_compat                   is_propinfo_tbl[3].prop_cur_bval
#define is_xmit_hiwat                   is_propinfo_tbl[4].prop_cur_uval
#define is_xmit_lowat                   is_propinfo_tbl[5].prop_cur_uval
#define is_recv_hiwat                   is_propinfo_tbl[6].prop_cur_uval
#define is_max_buf                      is_propinfo_tbl[7].prop_cur_uval
#define is_pmtu_discovery               is_propinfo_tbl[8].prop_cur_bval
#define is_sendto_ignerr                is_propinfo_tbl[9].prop_cur_bval

/* Pointer to the union of all TPI primitives. */
typedef union T_primitives *t_primp_t;
 301 
 302 /*
 303  * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message
 304  * passed to icmp_wput.
 305  * It calls IP to verify the local IP address, and calls IP to insert
 306  * the conn_t in the fanout table.
 307  * If everything is ok it then sends the T_BIND_ACK back up.
 308  */
 309 static void
 310 icmp_tpi_bind(queue_t *q, mblk_t *mp)
 311 {
 312         int     error;
 313         struct sockaddr *sa;
 314         struct T_bind_req *tbr;
 315         socklen_t       len;
 316         sin_t   *sin;
 317         sin6_t  *sin6;
 318         icmp_t          *icmp;
 319         conn_t  *connp = Q_TO_CONN(q);
 320         mblk_t *mp1;
 321         cred_t *cr;
 322 
 323         /*
 324          * All Solaris components should pass a db_credp
 325          * for this TPI message, hence we ASSERT.
 326          * But in case there is some other M_PROTO that looks
 327          * like a TPI message sent by some other kernel
 328          * component, we check and return an error.
 329          */
 330         cr = msg_getcred(mp, NULL);
 331         ASSERT(cr != NULL);
 332         if (cr == NULL) {
 333                 icmp_err_ack(q, mp, TSYSERR, EINVAL);
 334                 return;
 335         }
 336 
 337         icmp = connp->conn_icmp;
 338         if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
 339                 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
 340                     "icmp_bind: bad req, len %u",
 341                     (uint_t)(mp->b_wptr - mp->b_rptr));
 342                 icmp_err_ack(q, mp, TPROTO, 0);
 343                 return;
 344         }
 345 
 346         if (icmp->icmp_state != TS_UNBND) {
 347                 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
 348                     "icmp_bind: bad state, %u", icmp->icmp_state);
 349                 icmp_err_ack(q, mp, TOUTSTATE, 0);
 350                 return;
 351         }
 352 
 353         /*
 354          * Reallocate the message to make sure we have enough room for an
 355          * address.
 356          */
 357         mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1);
 358         if (mp1 == NULL) {
 359                 icmp_err_ack(q, mp, TSYSERR, ENOMEM);
 360                 return;
 361         }
 362         mp = mp1;
 363 
 364         /* Reset the message type in preparation for shipping it back. */
 365         DB_TYPE(mp) = M_PCPROTO;
 366         tbr = (struct T_bind_req *)mp->b_rptr;
 367         len = tbr->ADDR_length;
 368         switch (len) {
 369         case 0: /* request for a generic port */
 370                 tbr->ADDR_offset = sizeof (struct T_bind_req);
 371                 if (connp->conn_family == AF_INET) {
 372                         tbr->ADDR_length = sizeof (sin_t);
 373                         sin = (sin_t *)&tbr[1];
 374                         *sin = sin_null;
 375                         sin->sin_family = AF_INET;
 376                         mp->b_wptr = (uchar_t *)&sin[1];
 377                         sa = (struct sockaddr *)sin;
 378                         len = sizeof (sin_t);
 379                 } else {
 380                         ASSERT(connp->conn_family == AF_INET6);
 381                         tbr->ADDR_length = sizeof (sin6_t);
 382                         sin6 = (sin6_t *)&tbr[1];
 383                         *sin6 = sin6_null;
 384                         sin6->sin6_family = AF_INET6;
 385                         mp->b_wptr = (uchar_t *)&sin6[1];
 386                         sa = (struct sockaddr *)sin6;
 387                         len = sizeof (sin6_t);
 388                 }
 389                 break;
 390 
 391         case sizeof (sin_t):    /* Complete IPv4 address */
 392                 sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
 393                     sizeof (sin_t));
 394                 break;
 395 
 396         case sizeof (sin6_t):   /* Complete IPv6 address */
 397                 sa = (struct sockaddr *)mi_offset_param(mp,
 398                     tbr->ADDR_offset, sizeof (sin6_t));
 399                 break;
 400 
 401         default:
 402                 (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
 403                     "icmp_bind: bad ADDR_length %u", tbr->ADDR_length);
 404                 icmp_err_ack(q, mp, TBADADDR, 0);
 405                 return;
 406         }
 407 
 408         error = rawip_do_bind(connp, sa, len);
 409         if (error != 0) {
 410                 if (error > 0) {
 411                         icmp_err_ack(q, mp, TSYSERR, error);
 412                 } else {
 413                         icmp_err_ack(q, mp, -error, 0);
 414                 }
 415         } else {
 416                 tbr->PRIM_type = T_BIND_ACK;
 417                 qreply(q, mp);
 418         }
 419 }
 420 
/*
 * Bind connp to the local address in sa.
 *
 * connp - the conn_t being bound; its icmp_t must be in TS_UNBND.
 * sa    - a sin_t or sin6_t, selected by len; must be non-NULL and 32-bit
 *         aligned.
 * len   - sizeof (sin_t) or sizeof (sin6_t).
 *
 * Returns 0 on success.  A positive return value is an errno (EINVAL,
 * EAFNOSUPPORT, EADDRNOTAVAIL, or whatever icmp_build_hdr_template() or
 * ip_laddr_fanout_insert() returned); a negative return value is a negated
 * TLI error (-TOUTSTATE).
 *
 * conn_lock is taken while the conn_t/icmp_t fields are updated and is
 * dropped before ip_laddr_fanout_insert() is called.  If anything fails
 * after the fields have been updated, the late_error path puts the
 * addresses, port and state back to their unbound values and rebuilds the
 * header template.
 */
static int
rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len)
{
        sin_t           *sin;
        sin6_t          *sin6;
        icmp_t          *icmp = connp->conn_icmp;
        int             error = 0;
        ip_laddr_t      laddr_type = IPVL_UNICAST_UP;   /* INADDR_ANY */
        in_port_t       lport;          /* Network byte order */
        ipaddr_t        v4src;          /* Set if AF_INET */
        in6_addr_t      v6src;
        uint_t          scopeid = 0;
        zoneid_t        zoneid = IPCL_ZONEID(connp);
        ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;

        if (sa == NULL || !OK_32PTR((char *)sa)) {
                return (EINVAL);
        }

        switch (len) {
        case sizeof (sin_t):    /* Complete IPv4 address */
                sin = (sin_t *)sa;
                if (sin->sin_family != AF_INET ||
                    connp->conn_family != AF_INET) {
                        /* TSYSERR, EAFNOSUPPORT */
                        return (EAFNOSUPPORT);
                }
                v4src = sin->sin_addr.s_addr;
                /* The address is kept in v4-mapped form from here on. */
                IN6_IPADDR_TO_V4MAPPED(v4src, &v6src);
                /* INADDR_ANY keeps the IPVL_UNICAST_UP default. */
                if (v4src != INADDR_ANY) {
                        laddr_type = ip_laddr_verify_v4(v4src, zoneid, ipst,
                            B_TRUE);
                }
                lport = sin->sin_port;
                break;
        case sizeof (sin6_t): /* Complete IPv6 address */
                sin6 = (sin6_t *)sa;
                if (sin6->sin6_family != AF_INET6 ||
                    connp->conn_family != AF_INET6) {
                        /* TSYSERR, EAFNOSUPPORT */
                        return (EAFNOSUPPORT);
                }
                /* No support for mapped addresses on raw sockets */
                if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
                        /* TSYSERR, EADDRNOTAVAIL */
                        return (EADDRNOTAVAIL);
                }
                v6src = sin6->sin6_addr;
                /* The unspecified address keeps the IPVL_UNICAST_UP default. */
                if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
                        /* sin6_scope_id is only honored for link-scope. */
                        if (IN6_IS_ADDR_LINKSCOPE(&v6src))
                                scopeid = sin6->sin6_scope_id;
                        laddr_type = ip_laddr_verify_v6(&v6src, zoneid, ipst,
                            B_TRUE, scopeid);
                }
                lport = sin6->sin6_port;
                break;

        default:
                /* TBADADDR */
                return (EADDRNOTAVAIL);
        }

        /* Is the local address a valid unicast, multicast, or broadcast? */
        if (laddr_type == IPVL_BAD)
                return (EADDRNOTAVAIL);

        /*
         * The state must be TS_UNBND.
         */
        mutex_enter(&connp->conn_lock);
        if (icmp->icmp_state != TS_UNBND) {
                mutex_exit(&connp->conn_lock);
                return (-TOUTSTATE);
        }

        /*
         * Copy the source address into our icmp structure.  This address
         * may still be zero; if so, ip will fill in the correct address
         * each time an outbound packet is passed to it.
         * If we are binding to a broadcast or multicast address then
         * we just set the conn_bound_addr since we don't want to use
         * that as the source address when sending.
         */
        connp->conn_bound_addr_v6 = v6src;
        connp->conn_laddr_v6 = v6src;
        if (scopeid != 0) {
                connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET;
                connp->conn_ixa->ixa_scopeid = scopeid;
                connp->conn_incoming_ifindex = scopeid;
        } else {
                connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
                connp->conn_incoming_ifindex = connp->conn_bound_if;
        }

        /* IPVL_BAD was rejected above, so only these four values remain. */
        switch (laddr_type) {
        case IPVL_UNICAST_UP:
        case IPVL_UNICAST_DOWN:
                connp->conn_saddr_v6 = v6src;
                connp->conn_mcbc_bind = B_FALSE;
                break;
        case IPVL_MCAST:
        case IPVL_BCAST:
                /* ip_set_destination will pick a source address later */
                connp->conn_saddr_v6 = ipv6_all_zeros;
                connp->conn_mcbc_bind = B_TRUE;
                break;
        }

        /* Any errors after this point should use late_error */

        /*
         * Use sin_port/sin6_port since applications like psh use SOCK_RAW
         * with IPPROTO_TCP.
         */
        connp->conn_lport = lport;
        connp->conn_fport = 0;

        if (connp->conn_family == AF_INET) {
                ASSERT(connp->conn_ipversion == IPV4_VERSION);
        } else {
                ASSERT(connp->conn_ipversion == IPV6_VERSION);
        }

        icmp->icmp_state = TS_IDLE;

        /*
         * We create an initial header template here to make a subsequent
         * sendto have a starting point. Since conn_v6lastdst is zero the
         * first sendto will always follow the 'dst changed' code path.
         * Note that we defer massaging options and the related checksum
         * adjustment until we have a destination address.
         */
        error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
            &connp->conn_faddr_v6, connp->conn_flowinfo);
        if (error != 0) {
                /* late_error re-takes conn_lock, so drop it first. */
                mutex_exit(&connp->conn_lock);
                goto late_error;
        }
        /* Just in case */
        connp->conn_faddr_v6 = ipv6_all_zeros;
        connp->conn_v6lastdst = ipv6_all_zeros;
        mutex_exit(&connp->conn_lock);

        /* Called without conn_lock held. */
        error = ip_laddr_fanout_insert(connp);
        if (error != 0)
                goto late_error;

        /* Bind succeeded */
        return (0);

late_error:
        /* Entered without conn_lock held; put back the unbound values. */
        mutex_enter(&connp->conn_lock);
        connp->conn_saddr_v6 = ipv6_all_zeros;
        connp->conn_bound_addr_v6 = ipv6_all_zeros;
        connp->conn_laddr_v6 = ipv6_all_zeros;
        if (scopeid != 0) {
                connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
                connp->conn_incoming_ifindex = connp->conn_bound_if;
        }
        icmp->icmp_state = TS_UNBND;
        connp->conn_v6lastdst = ipv6_all_zeros;
        connp->conn_lport = 0;

        /* Restore the header that was built above - different source address */
        (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
            &connp->conn_faddr_v6, connp->conn_flowinfo);
        mutex_exit(&connp->conn_lock);
        return (error);
}
 590 
 591 /*
 592  * Tell IP to just bind to the protocol.
 593  */
 594 static void
 595 icmp_bind_proto(icmp_t *icmp)
 596 {
 597         conn_t  *connp = icmp->icmp_connp;
 598 
 599         mutex_enter(&connp->conn_lock);
 600         connp->conn_saddr_v6 = ipv6_all_zeros;
 601         connp->conn_laddr_v6 = ipv6_all_zeros;
 602         connp->conn_faddr_v6 = ipv6_all_zeros;
 603         connp->conn_v6lastdst = ipv6_all_zeros;
 604         mutex_exit(&connp->conn_lock);
 605 
 606         (void) ip_laddr_fanout_insert(connp);
 607 }
 608 
 609 /*
 610  * This routine handles each T_CONN_REQ message passed to icmp.  It
 611  * associates a default destination address with the stream.
 612  *
 613  * After various error checks are completed, icmp_connect() lays
 614  * the target address and port into the composite header template.
 615  * Then we ask IP for information, including a source address if we didn't
 616  * already have one. Finally we send up the T_OK_ACK reply message.
 617  */
 618 static void
 619 icmp_tpi_connect(queue_t *q, mblk_t *mp)
 620 {
 621         conn_t  *connp = Q_TO_CONN(q);
 622         struct T_conn_req       *tcr;
 623         struct sockaddr *sa;
 624         socklen_t len;
 625         int error;
 626         cred_t *cr;
 627         pid_t pid;
 628         /*
 629          * All Solaris components should pass a db_credp
 630          * for this TPI message, hence we ASSERT.
 631          * But in case there is some other M_PROTO that looks
 632          * like a TPI message sent by some other kernel
 633          * component, we check and return an error.
 634          */
 635         cr = msg_getcred(mp, &pid);
 636         ASSERT(cr != NULL);
 637         if (cr == NULL) {
 638                 icmp_err_ack(q, mp, TSYSERR, EINVAL);
 639                 return;
 640         }
 641 
 642         tcr = (struct T_conn_req *)mp->b_rptr;
 643         /* Sanity checks */
 644         if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) {
 645                 icmp_err_ack(q, mp, TPROTO, 0);
 646                 return;
 647         }
 648 
 649         if (tcr->OPT_length != 0) {
 650                 icmp_err_ack(q, mp, TBADOPT, 0);
 651                 return;
 652         }
 653 
 654         len = tcr->DEST_length;
 655 
 656         switch (len) {
 657         default:
 658                 icmp_err_ack(q, mp, TBADADDR, 0);
 659                 return;
 660         case sizeof (sin_t):
 661                 sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
 662                     sizeof (sin_t));
 663                 break;
 664         case sizeof (sin6_t):
 665                 sa = (struct sockaddr *)mi_offset_param(mp,
 666                     tcr->DEST_offset, sizeof (sin6_t));
 667                 break;
 668         }
 669 
 670         error = proto_verify_ip_addr(connp->conn_family, sa, len);
 671         if (error != 0) {
 672                 icmp_err_ack(q, mp, TSYSERR, error);
 673                 return;
 674         }
 675 
 676         error = rawip_do_connect(connp, sa, len, cr, pid);
 677         if (error != 0) {
 678                 if (error < 0) {
 679                         icmp_err_ack(q, mp, -error, 0);
 680                 } else {
 681                         icmp_err_ack(q, mp, 0, error);
 682                 }
 683         } else {
 684                 mblk_t *mp1;
 685 
 686                 /*
 687                  * We have to send a connection confirmation to
 688                  * keep TLI happy.
 689                  */
 690                 if (connp->conn_family == AF_INET) {
 691                         mp1 = mi_tpi_conn_con(NULL, (char *)sa,
 692                             sizeof (sin_t), NULL, 0);
 693                 } else {
 694                         ASSERT(connp->conn_family == AF_INET6);
 695                         mp1 = mi_tpi_conn_con(NULL, (char *)sa,
 696                             sizeof (sin6_t), NULL, 0);
 697                 }
 698                 if (mp1 == NULL) {
 699                         icmp_err_ack(q, mp, TSYSERR, ENOMEM);
 700                         return;
 701                 }
 702 
 703                 /*
 704                  * Send ok_ack for T_CONN_REQ
 705                  */
 706                 mp = mi_tpi_ok_ack_alloc(mp);
 707                 if (mp == NULL) {
 708                         /* Unable to reuse the T_CONN_REQ for the ack. */
 709                         icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM);
 710                         return;
 711                 }
 712                 putnext(connp->conn_rq, mp);
 713                 putnext(connp->conn_rq, mp1);
 714         }
 715 }
 716 
 717 static int
 718 rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len,
 719     cred_t *cr, pid_t pid)
 720 {
 721         icmp_t          *icmp;
 722         sin_t           *sin;
 723         sin6_t          *sin6;
 724         int             error;
 725         uint16_t        dstport;
 726         ipaddr_t        v4dst;
 727         in6_addr_t      v6dst;
 728         uint32_t        flowinfo;
 729         ip_xmit_attr_t  *ixa;
 730         ip_xmit_attr_t  *oldixa;
 731         uint_t          scopeid = 0;
 732         uint_t          srcid = 0;
 733         in6_addr_t      v6src = connp->conn_saddr_v6;
 734 
 735         icmp = connp->conn_icmp;
 736 
 737         if (sa == NULL || !OK_32PTR((char *)sa)) {
 738                 return (EINVAL);
 739         }
 740 
 741         ASSERT(sa != NULL && len != 0);
 742 
 743         /*
 744          * Determine packet type based on type of address passed in
 745          * the request should contain an IPv4 or IPv6 address.
 746          * Make sure that address family matches the type of
 747          * family of the address passed down.
 748          */
 749         switch (len) {
 750         case sizeof (sin_t):
 751                 sin = (sin_t *)sa;
 752 
 753                 v4dst = sin->sin_addr.s_addr;
 754                 dstport = sin->sin_port;
 755                 IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
 756                 ASSERT(connp->conn_ipversion == IPV4_VERSION);
 757                 break;
 758 
 759         case sizeof (sin6_t):
 760                 sin6 = (sin6_t *)sa;
 761 
 762                 /* No support for mapped addresses on raw sockets */
 763                 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 764                         return (EADDRNOTAVAIL);
 765                 }
 766                 v6dst = sin6->sin6_addr;
 767                 dstport = sin6->sin6_port;
 768                 ASSERT(connp->conn_ipversion == IPV6_VERSION);
 769                 flowinfo = sin6->sin6_flowinfo;
 770                 if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr))
 771                         scopeid = sin6->sin6_scope_id;
 772                 srcid = sin6->__sin6_src_id;
 773                 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
 774                         /* Due to check above, we know sin6_addr is v6-only. */
 775                         if (!ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
 776                             B_FALSE, connp->conn_netstack)) {
 777                                 /* Mismatch - v6src would be v4mapped. */
 778                                 return (EADDRNOTAVAIL);
 779                         }
 780                 }
 781                 break;
 782         }
 783 
 784         /*
 785          * If there is a different thread using conn_ixa then we get a new
 786          * copy and cut the old one loose from conn_ixa. Otherwise we use
 787          * conn_ixa and prevent any other thread from using/changing it.
 788          * Once connect() is done other threads can use conn_ixa since the
 789          * refcnt will be back at one.
 790          * We defer updating conn_ixa until later to handle any concurrent
 791          * conn_ixa_cleanup thread.
 792          */
 793         ixa = conn_get_ixa(connp, B_FALSE);
 794         if (ixa == NULL)
 795                 return (ENOMEM);
 796 
 797         mutex_enter(&connp->conn_lock);
 798         /*
 799          * This icmp_t must have bound already before doing a connect.
 800          * Reject if a connect is in progress (we drop conn_lock during
 801          * rawip_do_connect).
 802          */
 803         if (icmp->icmp_state == TS_UNBND || icmp->icmp_state == TS_WCON_CREQ) {
 804                 mutex_exit(&connp->conn_lock);
 805                 ixa_refrele(ixa);
 806                 return (-TOUTSTATE);
 807         }
 808 
 809         if (icmp->icmp_state == TS_DATA_XFER) {
 810                 /* Already connected - clear out state */
 811                 if (connp->conn_mcbc_bind)
 812                         connp->conn_saddr_v6 = ipv6_all_zeros;
 813                 else
 814                         connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
 815                 connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
 816                 connp->conn_faddr_v6 = ipv6_all_zeros;
 817                 icmp->icmp_state = TS_IDLE;
 818         }
 819 
 820         /*
 821          * Use sin_port/sin6_port since applications like psh use SOCK_RAW
 822          * with IPPROTO_TCP.
 823          */
 824         connp->conn_fport = dstport;
 825         if (connp->conn_ipversion == IPV4_VERSION) {
 826                 /*
 827                  * Interpret a zero destination to mean loopback.
 828                  * Update the T_CONN_REQ (sin/sin6) since it is used to
 829                  * generate the T_CONN_CON.
 830                  */
 831                 if (v4dst == INADDR_ANY) {
 832                         v4dst = htonl(INADDR_LOOPBACK);
 833                         IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
 834                         ASSERT(connp->conn_family == AF_INET);
 835                         sin->sin_addr.s_addr = v4dst;
 836                 }
 837                 connp->conn_faddr_v6 = v6dst;
 838                 connp->conn_flowinfo = 0;
 839         } else {
 840                 ASSERT(connp->conn_ipversion == IPV6_VERSION);
 841                 /*
 842                  * Interpret a zero destination to mean loopback.
 843                  * Update the T_CONN_REQ (sin/sin6) since it is used to
 844                  * generate the T_CONN_CON.
 845                  */
 846                 if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) {
 847                         v6dst = ipv6_loopback;
 848                         sin6->sin6_addr = v6dst;
 849                 }
 850                 connp->conn_faddr_v6 = v6dst;
 851                 connp->conn_flowinfo = flowinfo;
 852         }
 853 
 854         /*
 855          * We update our cred/cpid based on the caller of connect
 856          */
 857         if (connp->conn_cred != cr) {
 858                 crhold(cr);
 859                 crfree(connp->conn_cred);
 860                 connp->conn_cred = cr;
 861         }
 862         connp->conn_cpid = pid;
 863         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
 864         ixa->ixa_cred = cr;
 865         ixa->ixa_cpid = pid;
 866         if (is_system_labeled()) {
 867                 /* We need to restart with a label based on the cred */
 868                 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
 869         }
 870 
 871         if (scopeid != 0) {
 872                 ixa->ixa_flags |= IXAF_SCOPEID_SET;
 873                 ixa->ixa_scopeid = scopeid;
 874                 connp->conn_incoming_ifindex = scopeid;
 875         } else {
 876                 ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
 877                 connp->conn_incoming_ifindex = connp->conn_bound_if;
 878         }
 879 
 880         /*
 881          * conn_connect will drop conn_lock and reacquire it.
 882          * To prevent a send* from messing with this icmp_t while the lock
 883          * is dropped we set icmp_state and clear conn_v6lastdst.
 884          * That will make all send* fail with EISCONN.
 885          */
 886         connp->conn_v6lastdst = ipv6_all_zeros;
 887         icmp->icmp_state = TS_WCON_CREQ;
 888 
 889         error = conn_connect(connp, NULL, IPDF_ALLOW_MCBC);
 890         mutex_exit(&connp->conn_lock);
 891         if (error != 0)
 892                 goto connect_failed;
 893 
 894         /*
 895          * The addresses have been verified. Time to insert in
 896          * the correct fanout list.
 897          */
 898         error = ipcl_conn_insert(connp);
 899         if (error != 0)
 900                 goto connect_failed;
 901 
 902         mutex_enter(&connp->conn_lock);
 903         error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
 904             &connp->conn_faddr_v6, connp->conn_flowinfo);
 905         if (error != 0) {
 906                 mutex_exit(&connp->conn_lock);
 907                 goto connect_failed;
 908         }
 909 
 910         icmp->icmp_state = TS_DATA_XFER;
 911         /* Record this as the "last" send even though we haven't sent any */
 912         connp->conn_v6lastdst = connp->conn_faddr_v6;
 913         connp->conn_lastipversion = connp->conn_ipversion;
 914         connp->conn_lastdstport = connp->conn_fport;
 915         connp->conn_lastflowinfo = connp->conn_flowinfo;
 916         connp->conn_lastscopeid = scopeid;
 917         connp->conn_lastsrcid = srcid;
 918         /* Also remember a source to use together with lastdst */
 919         connp->conn_v6lastsrc = v6src;
 920 
 921         oldixa = conn_replace_ixa(connp, ixa);
 922         mutex_exit(&connp->conn_lock);
 923         ixa_refrele(oldixa);
 924 
 925         ixa_refrele(ixa);
 926         return (0);
 927 
 928 connect_failed:
 929         if (ixa != NULL)
 930                 ixa_refrele(ixa);
 931         mutex_enter(&connp->conn_lock);
 932         icmp->icmp_state = TS_IDLE;
 933         /* In case the source address was set above */
 934         if (connp->conn_mcbc_bind)
 935                 connp->conn_saddr_v6 = ipv6_all_zeros;
 936         else
 937                 connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
 938         connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
 939         connp->conn_faddr_v6 = ipv6_all_zeros;
 940         connp->conn_v6lastdst = ipv6_all_zeros;
 941         connp->conn_flowinfo = 0;
 942 
 943         (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
 944             &connp->conn_faddr_v6, connp->conn_flowinfo);
 945         mutex_exit(&connp->conn_lock);
 946         return (error);
 947 }
 948 
 949 static void
 950 rawip_do_close(conn_t *connp)
 951 {
 952         ASSERT(connp != NULL && IPCL_IS_RAWIP(connp));
 953 
 954         ip_quiesce_conn(connp);
 955 
 956         if (!IPCL_IS_NONSTR(connp)) {
 957                 qprocsoff(connp->conn_rq);
 958         }
 959 
 960         icmp_close_free(connp);
 961 
 962         /*
 963          * Now we are truly single threaded on this stream, and can
 964          * delete the things hanging off the connp, and finally the connp.
 965          * We removed this connp from the fanout list, it cannot be
 966          * accessed thru the fanouts, and we already waited for the
 967          * conn_ref to drop to 0. We are already in close, so
 968          * there cannot be any other thread from the top. qprocsoff
 969          * has completed, and service has completed or won't run in
 970          * future.
 971          */
 972         ASSERT(connp->conn_ref == 1);
 973 
 974         if (!IPCL_IS_NONSTR(connp)) {
 975                 inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
 976         } else {
 977                 ip_free_helper_stream(connp);
 978         }
 979 
 980         connp->conn_ref--;
 981         ipcl_conn_destroy(connp);
 982 }
 983 
 984 static int
 985 icmp_close(queue_t *q, int flags)
 986 {
 987         conn_t  *connp;
 988 
 989         if (flags & SO_FALLBACK) {
 990                 /*
 991                  * stream is being closed while in fallback
 992                  * simply free the resources that were allocated
 993                  */
 994                 inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
 995                 qprocsoff(q);
 996                 goto done;
 997         }
 998 
 999         connp = Q_TO_CONN(q);
1000         (void) rawip_do_close(connp);
1001 done:
1002         q->q_ptr = WR(q)->q_ptr = NULL;
1003         return (0);
1004 }
1005 
1006 static void
1007 icmp_close_free(conn_t *connp)
1008 {
1009         icmp_t *icmp = connp->conn_icmp;
1010 
1011         if (icmp->icmp_filter != NULL) {
1012                 kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t));
1013                 icmp->icmp_filter = NULL;
1014         }
1015 
1016         if (icmp->icmp_bpf_len != 0) {
1017                 kmem_free(icmp->icmp_bpf_prog, icmp->icmp_bpf_len);
1018                 icmp->icmp_bpf_len = 0;
1019                 icmp->icmp_bpf_prog = NULL;
1020         }
1021 
1022         /*
1023          * Clear any fields which the kmem_cache constructor clears.
1024          * Only icmp_connp needs to be preserved.
1025          * TBD: We should make this more efficient to avoid clearing
1026          * everything.
1027          */
1028         ASSERT(icmp->icmp_connp == connp);
1029         bzero(icmp, sizeof (icmp_t));
1030         icmp->icmp_connp = connp;
1031 }
1032 
1033 /*
1034  * This routine handles each T_DISCON_REQ message passed to icmp
1035  * as an indicating that ICMP is no longer connected. This results
1036  * in telling IP to restore the binding to just the local address.
1037  */
1038 static int
1039 icmp_do_disconnect(conn_t *connp)
1040 {
1041         icmp_t  *icmp = connp->conn_icmp;
1042         int     error;
1043 
1044         mutex_enter(&connp->conn_lock);
1045         if (icmp->icmp_state != TS_DATA_XFER) {
1046                 mutex_exit(&connp->conn_lock);
1047                 return (-TOUTSTATE);
1048         }
1049         if (connp->conn_mcbc_bind)
1050                 connp->conn_saddr_v6 = ipv6_all_zeros;
1051         else
1052                 connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
1053         connp->conn_laddr_v6 = connp->conn_bound_addr_v6;
1054         connp->conn_faddr_v6 = ipv6_all_zeros;
1055         icmp->icmp_state = TS_IDLE;
1056 
1057         connp->conn_v6lastdst = ipv6_all_zeros;
1058         error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
1059             &connp->conn_faddr_v6, connp->conn_flowinfo);
1060         mutex_exit(&connp->conn_lock);
1061         if (error != 0)
1062                 return (error);
1063 
1064         /*
1065          * Tell IP to remove the full binding and revert
1066          * to the local address binding.
1067          */
1068         return (ip_laddr_fanout_insert(connp));
1069 }
1070 
1071 static void
1072 icmp_tpi_disconnect(queue_t *q, mblk_t *mp)
1073 {
1074         conn_t  *connp = Q_TO_CONN(q);
1075         int     error;
1076 
1077         /*
1078          * Allocate the largest primitive we need to send back
1079          * T_error_ack is > than T_ok_ack
1080          */
1081         mp = reallocb(mp, sizeof (struct T_error_ack), 1);
1082         if (mp == NULL) {
1083                 /* Unable to reuse the T_DISCON_REQ for the ack. */
1084                 icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, ENOMEM);
1085                 return;
1086         }
1087 
1088         error = icmp_do_disconnect(connp);
1089 
1090         if (error != 0) {
1091                 if (error > 0) {
1092                         icmp_err_ack(q, mp, 0, error);
1093                 } else {
1094                         icmp_err_ack(q, mp, -error, 0);
1095                 }
1096         } else {
1097                 mp = mi_tpi_ok_ack_alloc(mp);
1098                 ASSERT(mp != NULL);
1099                 qreply(q, mp);
1100         }
1101 }
1102 
1103 static int
1104 icmp_disconnect(conn_t *connp)
1105 {
1106         int     error;
1107 
1108         connp->conn_dgram_errind = B_FALSE;
1109 
1110         error = icmp_do_disconnect(connp);
1111 
1112         if (error < 0)
1113                 error = proto_tlitosyserr(-error);
1114         return (error);
1115 }
1116 
1117 /* This routine creates a T_ERROR_ACK message and passes it upstream. */
1118 static void
1119 icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
1120 {
1121         if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
1122                 qreply(q, mp);
1123 }
1124 
1125 /* Shorthand to generate and send TPI error acks to our client */
1126 static void
1127 icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
1128     t_scalar_t t_error, int sys_error)
1129 {
1130         struct T_error_ack      *teackp;
1131 
1132         if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
1133             M_PCPROTO, T_ERROR_ACK)) != NULL) {
1134                 teackp = (struct T_error_ack *)mp->b_rptr;
1135                 teackp->ERROR_prim = primitive;
1136                 teackp->TLI_error = t_error;
1137                 teackp->UNIX_error = sys_error;
1138                 qreply(q, mp);
1139         }
1140 }
1141 
1142 /*
1143  * icmp_icmp_input is called as conn_recvicmp to process ICMP messages.
1144  * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors.
1145  * Assumes that IP has pulled up everything up to and including the ICMP header.
1146  */
1147 /* ARGSUSED2 */
1148 static void
1149 icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
1150 {
1151         conn_t          *connp = (conn_t *)arg1;
1152         icmp_t          *icmp = connp->conn_icmp;
1153         icmph_t         *icmph;
1154         ipha_t          *ipha;
1155         int             iph_hdr_length;
1156         sin_t           sin;
1157         mblk_t          *mp1;
1158         int             error = 0;
1159 
1160         ipha = (ipha_t *)mp->b_rptr;
1161 
1162         ASSERT(OK_32PTR(mp->b_rptr));
1163 
1164         if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
1165                 ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
1166                 icmp_icmp_error_ipv6(connp, mp, ira);
1167                 return;
1168         }
1169         ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
1170 
1171         /* Skip past the outer IP and ICMP headers */
1172         ASSERT(IPH_HDR_LENGTH(ipha) == ira->ira_ip_hdr_length);
1173         iph_hdr_length = ira->ira_ip_hdr_length;
1174         icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
1175         ipha = (ipha_t *)&icmph[1]; /* Inner IP header */
1176 
1177         iph_hdr_length = IPH_HDR_LENGTH(ipha);
1178 
1179         switch (icmph->icmph_type) {
1180         case ICMP_DEST_UNREACHABLE:
1181                 switch (icmph->icmph_code) {
1182                 case ICMP_FRAGMENTATION_NEEDED: {
1183                         ipha_t          *ipha;
1184                         ip_xmit_attr_t  *ixa;
1185                         /*
1186                          * IP has already adjusted the path MTU.
1187                          * But we need to adjust DF for IPv4.
1188                          */
1189                         if (connp->conn_ipversion != IPV4_VERSION)
1190                                 break;
1191 
1192                         ixa = conn_get_ixa(connp, B_FALSE);
1193                         if (ixa == NULL || ixa->ixa_ire == NULL) {
1194                                 /*
1195                                  * Some other thread holds conn_ixa. We will
1196                                  * redo this on the next ICMP too big.
1197                                  */
1198                                 if (ixa != NULL)
1199                                         ixa_refrele(ixa);
1200                                 break;
1201                         }
1202                         (void) ip_get_pmtu(ixa);
1203 
1204                         mutex_enter(&connp->conn_lock);
1205                         ipha = (ipha_t *)connp->conn_ht_iphc;
1206                         if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) {
1207                                 ipha->ipha_fragment_offset_and_flags |=
1208                                     IPH_DF_HTONS;
1209                         } else {
1210                                 ipha->ipha_fragment_offset_and_flags &=
1211                                     ~IPH_DF_HTONS;
1212                         }
1213                         mutex_exit(&connp->conn_lock);
1214                         ixa_refrele(ixa);
1215                         break;
1216                 }
1217                 case ICMP_PORT_UNREACHABLE:
1218                 case ICMP_PROTOCOL_UNREACHABLE:
1219                         error = ECONNREFUSED;
1220                         break;
1221                 default:
1222                         /* Transient errors */
1223                         break;
1224                 }
1225                 break;
1226         default:
1227                 /* Transient errors */
1228                 break;
1229         }
1230         if (error == 0) {
1231                 freemsg(mp);
1232                 return;
1233         }
1234 
1235         /*
1236          * Deliver T_UDERROR_IND when the application has asked for it.
1237          * The socket layer enables this automatically when connected.
1238          */
1239         if (!connp->conn_dgram_errind) {
1240                 freemsg(mp);
1241                 return;
1242         }
1243 
1244         sin = sin_null;
1245         sin.sin_family = AF_INET;
1246         sin.sin_addr.s_addr = ipha->ipha_dst;
1247 
1248         if (IPCL_IS_NONSTR(connp)) {
1249                 mutex_enter(&connp->conn_lock);
1250                 if (icmp->icmp_state == TS_DATA_XFER) {
1251                         if (sin.sin_addr.s_addr == connp->conn_faddr_v4) {
1252                                 mutex_exit(&connp->conn_lock);
1253                                 (*connp->conn_upcalls->su_set_error)
1254                                     (connp->conn_upper_handle, error);
1255                                 goto done;
1256                         }
1257                 } else {
1258                         icmp->icmp_delayed_error = error;
1259                         *((sin_t *)&icmp->icmp_delayed_addr) = sin;
1260                 }
1261                 mutex_exit(&connp->conn_lock);
1262         } else {
1263                 mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0,
1264                     error);
1265                 if (mp1 != NULL)
1266                         putnext(connp->conn_rq, mp1);
1267         }
1268 done:
1269         freemsg(mp);
1270 }
1271 
1272 /*
1273  * icmp_icmp_error_ipv6 is called by icmp_icmp_error to process ICMP for IPv6.
1274  * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors.
1275  * Assumes that IP has pulled up all the extension headers as well as the
1276  * ICMPv6 header.
1277  */
1278 static void
1279 icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira)
1280 {
1281         icmp6_t         *icmp6;
1282         ip6_t           *ip6h, *outer_ip6h;
1283         uint16_t        iph_hdr_length;
1284         uint8_t         *nexthdrp;
1285         sin6_t          sin6;
1286         mblk_t          *mp1;
1287         int             error = 0;
1288         icmp_t          *icmp = connp->conn_icmp;
1289 
1290         outer_ip6h = (ip6_t *)mp->b_rptr;
1291 #ifdef DEBUG
1292         if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6)
1293                 iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h);
1294         else
1295                 iph_hdr_length = IPV6_HDR_LEN;
1296         ASSERT(iph_hdr_length == ira->ira_ip_hdr_length);
1297 #endif
1298         /* Skip past the outer IP and ICMP headers */
1299         iph_hdr_length = ira->ira_ip_hdr_length;
1300         icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];
1301 
1302         ip6h = (ip6_t *)&icmp6[1];  /* Inner IP header */
1303         if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) {
1304                 freemsg(mp);
1305                 return;
1306         }
1307 
1308         switch (icmp6->icmp6_type) {
1309         case ICMP6_DST_UNREACH:
1310                 switch (icmp6->icmp6_code) {
1311                 case ICMP6_DST_UNREACH_NOPORT:
1312                         error = ECONNREFUSED;
1313                         break;
1314                 case ICMP6_DST_UNREACH_ADMIN:
1315                 case ICMP6_DST_UNREACH_NOROUTE:
1316                 case ICMP6_DST_UNREACH_BEYONDSCOPE:
1317                 case ICMP6_DST_UNREACH_ADDR:
1318                         /* Transient errors */
1319                         break;
1320                 default:
1321                         break;
1322                 }
1323                 break;
1324         case ICMP6_PACKET_TOO_BIG: {
1325                 struct T_unitdata_ind   *tudi;
1326                 struct T_opthdr         *toh;
1327                 size_t                  udi_size;
1328                 mblk_t                  *newmp;
1329                 t_scalar_t              opt_length = sizeof (struct T_opthdr) +
1330                     sizeof (struct ip6_mtuinfo);
1331                 sin6_t                  *sin6;
1332                 struct ip6_mtuinfo      *mtuinfo;
1333 
1334                 /*
1335                  * If the application has requested to receive path mtu
1336                  * information, send up an empty message containing an
1337                  * IPV6_PATHMTU ancillary data item.
1338                  */
1339                 if (!connp->conn_ipv6_recvpathmtu)
1340                         break;
1341 
1342                 udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) +
1343                     opt_length;
1344                 if ((newmp = allocb(udi_size, BPRI_MED)) == NULL) {
1345                         BUMP_MIB(&icmp->icmp_is->is_rawip_mib, rawipInErrors);
1346                         break;
1347                 }
1348 
1349                 /*
1350                  * newmp->b_cont is left to NULL on purpose.  This is an
1351                  * empty message containing only ancillary data.
1352                  */
1353                 newmp->b_datap->db_type = M_PROTO;
1354                 tudi = (struct T_unitdata_ind *)newmp->b_rptr;
1355                 newmp->b_wptr = (uchar_t *)tudi + udi_size;
1356                 tudi->PRIM_type = T_UNITDATA_IND;
1357                 tudi->SRC_length = sizeof (sin6_t);
1358                 tudi->SRC_offset = sizeof (struct T_unitdata_ind);
1359                 tudi->OPT_offset = tudi->SRC_offset + sizeof (sin6_t);
1360                 tudi->OPT_length = opt_length;
1361 
1362                 sin6 = (sin6_t *)&tudi[1];
1363                 bzero(sin6, sizeof (sin6_t));
1364                 sin6->sin6_family = AF_INET6;
1365                 sin6->sin6_addr = connp->conn_faddr_v6;
1366 
1367                 toh = (struct T_opthdr *)&sin6[1];
1368                 toh->level = IPPROTO_IPV6;
1369                 toh->name = IPV6_PATHMTU;
1370                 toh->len = opt_length;
1371                 toh->status = 0;
1372 
1373                 mtuinfo = (struct ip6_mtuinfo *)&toh[1];
1374                 bzero(mtuinfo, sizeof (struct ip6_mtuinfo));
1375                 mtuinfo->ip6m_addr.sin6_family = AF_INET6;
1376                 mtuinfo->ip6m_addr.sin6_addr = ip6h->ip6_dst;
1377                 mtuinfo->ip6m_mtu = icmp6->icmp6_mtu;
1378                 /*
1379                  * We've consumed everything we need from the original
1380                  * message.  Free it, then send our empty message.
1381                  */
1382                 freemsg(mp);
1383                 icmp_ulp_recv(connp, newmp, msgdsize(newmp));
1384                 return;
1385         }
1386         case ICMP6_TIME_EXCEEDED:
1387                 /* Transient errors */
1388                 break;
1389         case ICMP6_PARAM_PROB:
1390                 /* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
1391                 if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
1392                     (uchar_t *)ip6h + icmp6->icmp6_pptr ==
1393                     (uchar_t *)nexthdrp) {
1394                         error = ECONNREFUSED;
1395                         break;
1396                 }
1397                 break;
1398         }
1399         if (error == 0) {
1400                 freemsg(mp);
1401                 return;
1402         }
1403 
1404         /*
1405          * Deliver T_UDERROR_IND when the application has asked for it.
1406          * The socket layer enables this automatically when connected.
1407          */
1408         if (!connp->conn_dgram_errind) {
1409                 freemsg(mp);
1410                 return;
1411         }
1412 
1413         sin6 = sin6_null;
1414         sin6.sin6_family = AF_INET6;
1415         sin6.sin6_addr = ip6h->ip6_dst;
1416         sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
1417         if (IPCL_IS_NONSTR(connp)) {
1418                 mutex_enter(&connp->conn_lock);
1419                 if (icmp->icmp_state == TS_DATA_XFER) {
1420                         if (IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr,
1421                             &connp->conn_faddr_v6)) {
1422                                 mutex_exit(&connp->conn_lock);
1423                                 (*connp->conn_upcalls->su_set_error)
1424                                     (connp->conn_upper_handle, error);
1425                                 goto done;
1426                         }
1427                 } else {
1428                         icmp->icmp_delayed_error = error;
1429                         *((sin6_t *)&icmp->icmp_delayed_addr) = sin6;
1430                 }
1431                 mutex_exit(&connp->conn_lock);
1432         } else {
1433                 mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
1434                     NULL, 0, error);
1435                 if (mp1 != NULL)
1436                         putnext(connp->conn_rq, mp1);
1437         }
1438 done:
1439         freemsg(mp);
1440 }
1441 
1442 /*
1443  * This routine responds to T_ADDR_REQ messages.  It is called by icmp_wput.
1444  * The local address is filled in if endpoint is bound. The remote address
1445  * is filled in if remote address has been precified ("connected endpoint")
1446  * (The concept of connected CLTS sockets is alien to published TPI
1447  *  but we support it anyway).
1448  */
1449 static void
1450 icmp_addr_req(queue_t *q, mblk_t *mp)
1451 {
1452         struct sockaddr *sa;
1453         mblk_t  *ackmp;
1454         struct T_addr_ack *taa;
1455         icmp_t  *icmp = Q_TO_ICMP(q);
1456         conn_t  *connp = icmp->icmp_connp;
1457         uint_t  addrlen;
1458 
1459         /* Make it large enough for worst case */
1460         ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
1461             2 * sizeof (sin6_t), 1);
1462         if (ackmp == NULL) {
1463                 icmp_err_ack(q, mp, TSYSERR, ENOMEM);
1464                 return;
1465         }
1466         taa = (struct T_addr_ack *)ackmp->b_rptr;
1467 
1468         bzero(taa, sizeof (struct T_addr_ack));
1469         ackmp->b_wptr = (uchar_t *)&taa[1];
1470 
1471         taa->PRIM_type = T_ADDR_ACK;
1472         ackmp->b_datap->db_type = M_PCPROTO;
1473 
1474         if (connp->conn_family == AF_INET)
1475                 addrlen = sizeof (sin_t);
1476         else
1477                 addrlen = sizeof (sin6_t);
1478 
1479         mutex_enter(&connp->conn_lock);
1480         /*
1481          * Note: Following code assumes 32 bit alignment of basic
1482          * data structures like sin_t and struct T_addr_ack.
1483          */
1484         if (icmp->icmp_state != TS_UNBND) {
1485                 /*
1486                  * Fill in local address first
1487                  */
1488                 taa->LOCADDR_offset = sizeof (*taa);
1489                 taa->LOCADDR_length = addrlen;
1490                 sa = (struct sockaddr *)&taa[1];
1491                 (void) conn_getsockname(connp, sa, &addrlen);
1492                 ackmp->b_wptr += addrlen;
1493         }
1494         if (icmp->icmp_state == TS_DATA_XFER) {
1495                 /*
1496                  * connected, fill remote address too
1497                  */
1498                 taa->REMADDR_length = addrlen;
1499                 /* assumed 32-bit alignment */
1500                 taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length;
1501                 sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset);
1502                 (void) conn_getpeername(connp, sa, &addrlen);
1503                 ackmp->b_wptr += addrlen;
1504         }
1505         mutex_exit(&connp->conn_lock);
1506         ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
1507         qreply(q, ackmp);
1508 }
1509 
1510 static void
1511 icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp)
1512 {
1513         conn_t          *connp = icmp->icmp_connp;
1514 
1515         *tap = icmp_g_t_info_ack;
1516 
1517         if (connp->conn_family == AF_INET6)
1518                 tap->ADDR_size = sizeof (sin6_t);
1519         else
1520                 tap->ADDR_size = sizeof (sin_t);
1521         tap->CURRENT_state = icmp->icmp_state;
1522         tap->OPT_size = icmp_max_optsize;
1523 }
1524 
1525 static void
1526 icmp_do_capability_ack(icmp_t *icmp, struct T_capability_ack *tcap,
1527     t_uscalar_t cap_bits1)
1528 {
1529         tcap->CAP_bits1 = 0;
1530 
1531         if (cap_bits1 & TC1_INFO) {
1532                 icmp_copy_info(&tcap->INFO_ack, icmp);
1533                 tcap->CAP_bits1 |= TC1_INFO;
1534         }
1535 }
1536 
1537 /*
1538  * This routine responds to T_CAPABILITY_REQ messages.  It is called by
1539  * icmp_wput.  Much of the T_CAPABILITY_ACK information is copied from
1540  * icmp_g_t_info_ack.  The current state of the stream is copied from
1541  * icmp_state.
1542  */
1543 static void
1544 icmp_capability_req(queue_t *q, mblk_t *mp)
1545 {
1546         icmp_t                  *icmp = Q_TO_ICMP(q);
1547         t_uscalar_t             cap_bits1;
1548         struct T_capability_ack *tcap;
1549 
1550         cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
1551 
1552         mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
1553             mp->b_datap->db_type, T_CAPABILITY_ACK);
1554         if (!mp)
1555                 return;
1556 
1557         tcap = (struct T_capability_ack *)mp->b_rptr;
1558 
1559         icmp_do_capability_ack(icmp, tcap, cap_bits1);
1560 
1561         qreply(q, mp);
1562 }
1563 
1564 /*
1565  * This routine responds to T_INFO_REQ messages.  It is called by icmp_wput.
1566  * Most of the T_INFO_ACK information is copied from icmp_g_t_info_ack.
1567  * The current state of the stream is copied from icmp_state.
1568  */
1569 static void
1570 icmp_info_req(queue_t *q, mblk_t *mp)
1571 {
1572         icmp_t  *icmp = Q_TO_ICMP(q);
1573 
1574         /* Create a T_INFO_ACK message. */
1575         mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
1576             T_INFO_ACK);
1577         if (!mp)
1578                 return;
1579         icmp_copy_info((struct T_info_ack *)mp->b_rptr, icmp);
1580         qreply(q, mp);
1581 }
1582 
1583 static int
1584 icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
1585     int family)
1586 {
1587         conn_t *connp;
1588         dev_t   conn_dev;
1589         int     error;
1590 
1591         /* If the stream is already open, return immediately. */
1592         if (q->q_ptr != NULL)
1593                 return (0);
1594 
1595         if (sflag == MODOPEN)
1596                 return (EINVAL);
1597 
1598         /*
1599          * Since ICMP is not used so heavily, allocating from the small
1600          * arena should be sufficient.
1601          */
1602         if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) {
1603                 return (EBUSY);
1604         }
1605 
1606         if (flag & SO_FALLBACK) {
1607                 /*
1608                  * Non streams socket needs a stream to fallback to
1609                  */
1610                 RD(q)->q_ptr = (void *)conn_dev;
1611                 WR(q)->q_qinfo = &icmp_fallback_sock_winit;
1612                 WR(q)->q_ptr = (void *)ip_minor_arena_sa;
1613                 qprocson(q);
1614                 return (0);
1615         }
1616 
1617         connp = rawip_do_open(family, credp, &error, KM_SLEEP);
1618         if (connp == NULL) {
1619                 ASSERT(error != 0);
1620                 inet_minor_free(ip_minor_arena_sa, conn_dev);
1621                 return (error);
1622         }
1623 
1624         *devp = makedevice(getemajor(*devp), (minor_t)conn_dev);
1625         connp->conn_dev = conn_dev;
1626         connp->conn_minor_arena = ip_minor_arena_sa;
1627 
1628         /*
1629          * Initialize the icmp_t structure for this stream.
1630          */
1631         q->q_ptr = connp;
1632         WR(q)->q_ptr = connp;
1633         connp->conn_rq = q;
1634         connp->conn_wq = WR(q);
1635 
1636         WR(q)->q_hiwat = connp->conn_sndbuf;
1637         WR(q)->q_lowat = connp->conn_sndlowat;
1638 
1639         qprocson(q);
1640 
1641         /* Set the Stream head write offset. */
1642         (void) proto_set_tx_wroff(q, connp, connp->conn_wroff);
1643         (void) proto_set_rx_hiwat(connp->conn_rq, connp, connp->conn_rcvbuf);
1644 
1645         mutex_enter(&connp->conn_lock);
1646         connp->conn_state_flags &= ~CONN_INCIPIENT;
1647         mutex_exit(&connp->conn_lock);
1648 
1649         icmp_bind_proto(connp->conn_icmp);
1650 
1651         return (0);
1652 }
1653 
1654 /* For /dev/icmp aka AF_INET open */
1655 static int
1656 icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1657 {
1658         return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET));
1659 }
1660 
1661 /* For /dev/icmp6 aka AF_INET6 open */
1662 static int
1663 icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
1664 {
1665         return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET6));
1666 }
1667 
1668 /*
1669  * This is the open routine for icmp.  It allocates a icmp_t structure for
1670  * the stream and, on the first open of the module, creates an ND table.
1671  */
1672 static conn_t *
1673 rawip_do_open(int family, cred_t *credp, int *err, int flags)
1674 {
1675         icmp_t  *icmp;
1676         conn_t *connp;
1677         zoneid_t zoneid;
1678         netstack_t *ns;
1679         icmp_stack_t *is;
1680         int len;
1681         boolean_t isv6 = B_FALSE;
1682 
1683         *err = secpolicy_net_icmpaccess(credp);
1684         if (*err != 0)
1685                 return (NULL);
1686 
1687         if (family == AF_INET6)
1688                 isv6 = B_TRUE;
1689 
1690         ns = netstack_find_by_cred(credp);
1691         ASSERT(ns != NULL);
1692         is = ns->netstack_icmp;
1693         ASSERT(is != NULL);
1694 
1695         /*
1696          * For exclusive stacks we set the zoneid to zero
1697          * to make ICMP operate as if in the global zone.
1698          */
1699         if (ns->netstack_stackid != GLOBAL_NETSTACKID)
1700                 zoneid = GLOBAL_ZONEID;
1701         else
1702                 zoneid = crgetzoneid(credp);
1703 
1704         ASSERT(flags == KM_SLEEP || flags == KM_NOSLEEP);
1705 
1706         connp = ipcl_conn_create(IPCL_RAWIPCONN, flags, ns);
1707         icmp = connp->conn_icmp;
1708 
1709         /*
1710          * ipcl_conn_create did a netstack_hold. Undo the hold that was
1711          * done by netstack_find_by_cred()
1712          */
1713         netstack_rele(ns);
1714 
1715         /*
1716          * Since this conn_t/icmp_t is not yet visible to anybody else we don't
1717          * need to lock anything.
1718          */
1719         ASSERT(connp->conn_proto == IPPROTO_ICMP);
1720         ASSERT(connp->conn_icmp == icmp);
1721         ASSERT(icmp->icmp_connp == connp);
1722 
1723         /* Set the initial state of the stream and the privilege status. */
1724         icmp->icmp_state = TS_UNBND;
1725         connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
1726         if (isv6) {
1727                 connp->conn_family = AF_INET6;
1728                 connp->conn_ipversion = IPV6_VERSION;
1729                 connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4;
1730                 connp->conn_proto = IPPROTO_ICMPV6;
1731                 /* May be changed by a SO_PROTOTYPE socket option. */
1732                 connp->conn_proto = IPPROTO_ICMPV6;
1733                 connp->conn_ixa->ixa_protocol = connp->conn_proto;
1734                 connp->conn_ixa->ixa_raw_cksum_offset = 2;
1735                 connp->conn_default_ttl = is->is_ipv6_hoplimit;
1736                 len = sizeof (ip6_t);
1737         } else {
1738                 connp->conn_family = AF_INET;
1739                 connp->conn_ipversion = IPV4_VERSION;
1740                 connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4;
1741                 /* May be changed by a SO_PROTOTYPE socket option. */
1742                 connp->conn_proto = IPPROTO_ICMP;
1743                 connp->conn_ixa->ixa_protocol = connp->conn_proto;
1744                 connp->conn_default_ttl = is->is_ipv4_ttl;
1745                 len = sizeof (ipha_t);
1746         }
1747         connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl;
1748 
1749         connp->conn_ixa->ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1750 
1751         /*
1752          * For the socket of protocol IPPROTO_RAW or when IP_HDRINCL is set,
1753          * the checksum is provided in the pre-built packet. We clear
1754          * IXAF_SET_ULP_CKSUM to tell IP that the application has sent a
1755          * complete IP header and not to compute the transport checksum.
1756          */
1757         connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM;
1758         /* conn_allzones can not be set this early, hence no IPCL_ZONEID */
1759         connp->conn_ixa->ixa_zoneid = zoneid;
1760 
1761         connp->conn_zoneid = zoneid;
1762 
1763         /*
1764          * If the caller has the process-wide flag set, then default to MAC
1765          * exempt mode.  This allows read-down to unlabeled hosts.
1766          */
1767         if (getpflags(NET_MAC_AWARE, credp) != 0)
1768                 connp->conn_mac_mode = CONN_MAC_AWARE;
1769 
1770         connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID);
1771 
1772         icmp->icmp_is = is;
1773 
1774         connp->conn_rcvbuf = is->is_recv_hiwat;
1775         connp->conn_sndbuf = is->is_xmit_hiwat;
1776         connp->conn_sndlowat = is->is_xmit_lowat;
1777         connp->conn_rcvlowat = icmp_mod_info.mi_lowat;
1778 
1779         connp->conn_wroff = len + is->is_wroff_extra;
1780         connp->conn_so_type = SOCK_RAW;
1781 
1782         connp->conn_recv = icmp_input;
1783         connp->conn_recvicmp = icmp_icmp_input;
1784         crhold(credp);
1785         connp->conn_cred = credp;
1786         connp->conn_cpid = curproc->p_pid;
1787         connp->conn_open_time = ddi_get_lbolt64();
1788         /* Cache things in ixa without an extra refhold */
1789         ASSERT(!(connp->conn_ixa->ixa_free_flags & IXA_FREE_CRED));
1790         connp->conn_ixa->ixa_cred = connp->conn_cred;
1791         connp->conn_ixa->ixa_cpid = connp->conn_cpid;
1792         if (is_system_labeled())
1793                 connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred);
1794 
1795         connp->conn_flow_cntrld = B_FALSE;
1796 
1797         if (is->is_pmtu_discovery)
1798                 connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
1799 
1800         return (connp);
1801 }
1802 
1803 /*
1804  * Which ICMP options OK to set through T_UNITDATA_REQ...
1805  */
1806 /* ARGSUSED */
1807 static boolean_t
1808 icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name)
1809 {
1810         return (B_TRUE);
1811 }
1812 
1813 /*
1814  * This routine gets default values of certain options whose default
1815  * values are maintained by protcol specific code
1816  */
1817 int
1818 icmp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
1819 {
1820         icmp_t *icmp = Q_TO_ICMP(q);
1821         icmp_stack_t *is = icmp->icmp_is;
1822         int *i1 = (int *)ptr;
1823 
1824         switch (level) {
1825         case IPPROTO_IP:
1826                 switch (name) {
1827                 case IP_MULTICAST_TTL:
1828                         *ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL;
1829                         return (sizeof (uchar_t));
1830                 case IP_MULTICAST_LOOP:
1831                         *ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP;
1832                         return (sizeof (uchar_t));
1833                 }
1834                 break;
1835         case IPPROTO_IPV6:
1836                 switch (name) {
1837                 case IPV6_MULTICAST_HOPS:
1838                         *i1 = IP_DEFAULT_MULTICAST_TTL;
1839                         return (sizeof (int));
1840                 case IPV6_MULTICAST_LOOP:
1841                         *i1 = IP_DEFAULT_MULTICAST_LOOP;
1842                         return (sizeof (int));
1843                 case IPV6_UNICAST_HOPS:
1844                         *i1 = is->is_ipv6_hoplimit;
1845                         return (sizeof (int));
1846                 }
1847                 break;
1848         case IPPROTO_ICMPV6:
1849                 switch (name) {
1850                 case ICMP6_FILTER:
1851                         /* Make it look like "pass all" */
1852                         ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
1853                         return (sizeof (icmp6_filter_t));
1854                 }
1855                 break;
1856         }
1857         return (-1);
1858 }
1859 
1860 /*
1861  * This routine retrieves the current status of socket options.
1862  * It returns the size of the option retrieved, or -1.
1863  */
1864 int
1865 icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
1866 {
1867         icmp_t          *icmp = connp->conn_icmp;
1868         int             *i1 = (int *)ptr;
1869         conn_opt_arg_t  coas;
1870         int             retval;
1871 
1872         coas.coa_connp = connp;
1873         coas.coa_ixa = connp->conn_ixa;
1874         coas.coa_ipp = &connp->conn_xmit_ipp;
1875         coas.coa_ancillary = B_FALSE;
1876         coas.coa_changed = 0;
1877 
1878         /*
1879          * We assume that the optcom framework has checked for the set
1880          * of levels and names that are supported, hence we don't worry
1881          * about rejecting based on that.
1882          * First check for ICMP specific handling, then pass to common routine.
1883          */
1884         switch (level) {
1885         case IPPROTO_IP:
1886                 /*
1887                  * Only allow IPv4 option processing on IPv4 sockets.
1888                  */
1889                 if (connp->conn_family != AF_INET)
1890                         return (-1);
1891 
1892                 switch (name) {
1893                 case IP_OPTIONS:
1894                 case T_IP_OPTIONS:
1895                         /* Options are passed up with each packet */
1896                         return (0);
1897                 case IP_HDRINCL:
1898                         mutex_enter(&connp->conn_lock);
1899                         *i1 = (int)icmp->icmp_hdrincl;
1900                         mutex_exit(&connp->conn_lock);
1901                         return (sizeof (int));
1902                 }
1903                 break;
1904 
1905         case IPPROTO_IPV6:
1906                 /*
1907                  * Only allow IPv6 option processing on native IPv6 sockets.
1908                  */
1909                 if (connp->conn_family != AF_INET6)
1910                         return (-1);
1911 
1912                 switch (name) {
1913                 case IPV6_CHECKSUM:
1914                         /*
1915                          * Return offset or -1 if no checksum offset.
1916                          * Does not apply to IPPROTO_ICMPV6
1917                          */
1918                         if (connp->conn_proto == IPPROTO_ICMPV6)
1919                                 return (-1);
1920 
1921                         mutex_enter(&connp->conn_lock);
1922                         if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM)
1923                                 *i1 = connp->conn_ixa->ixa_raw_cksum_offset;
1924                         else
1925                                 *i1 = -1;
1926                         mutex_exit(&connp->conn_lock);
1927                         return (sizeof (int));
1928                 }
1929                 break;
1930 
1931         case IPPROTO_ICMPV6:
1932                 /*
1933                  * Only allow IPv6 option processing on native IPv6 sockets.
1934                  */
1935                 if (connp->conn_family != AF_INET6)
1936                         return (-1);
1937 
1938                 if (connp->conn_proto != IPPROTO_ICMPV6)
1939                         return (-1);
1940 
1941                 switch (name) {
1942                 case ICMP6_FILTER:
1943                         mutex_enter(&connp->conn_lock);
1944                         if (icmp->icmp_filter == NULL) {
1945                                 /* Make it look like "pass all" */
1946                                 ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr);
1947                         } else {
1948                                 (void) bcopy(icmp->icmp_filter, ptr,
1949                                     sizeof (icmp6_filter_t));
1950                         }
1951                         mutex_exit(&connp->conn_lock);
1952                         return (sizeof (icmp6_filter_t));
1953                 }
1954         }
1955         mutex_enter(&connp->conn_lock);
1956         retval = conn_opt_get(&coas, level, name, ptr);
1957         mutex_exit(&connp->conn_lock);
1958         return (retval);
1959 }
1960 
1961 /*
1962  * This routine retrieves the current status of socket options.
1963  * It returns the size of the option retrieved, or -1.
1964  */
1965 int
1966 icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
1967 {
1968         conn_t          *connp = Q_TO_CONN(q);
1969         int             err;
1970 
1971         err = icmp_opt_get(connp, level, name, ptr);
1972         return (err);
1973 }
1974 
1975 static int
1976 icmp_attach_filter(icmp_t *icmp, uint_t inlen, const uchar_t *invalp)
1977 {
1978         struct bpf_program prog;
1979         ip_bpf_insn_t *insns = NULL;
1980         unsigned int size;
1981 
1982 #ifdef _LP64
1983         if (get_udatamodel() != DATAMODEL_NATIVE) {
1984                 struct bpf_program32 *prog32;
1985 
1986                 if (inlen != sizeof (struct bpf_program32)) {
1987                         return (EINVAL);
1988                 }
1989                 prog32 = (struct bpf_program32 *)invalp;
1990                 prog.bf_len = prog32->bf_len;
1991                 prog.bf_insns = (void *)(uint64_t)prog32->bf_insns;
1992         } else
1993 #endif
1994         if (inlen == sizeof (struct bpf_program)) {
1995                 bcopy(invalp, &prog, sizeof (prog));
1996         } else {
1997                 return (EINVAL);
1998         }
1999 
2000         if (prog.bf_len > BPF_MAXINSNS || prog.bf_len == 0) {
2001                 return (EINVAL);
2002         }
2003         size = prog.bf_len * sizeof (struct bpf_insn);
2004         insns = kmem_alloc(size, KM_SLEEP);
2005         if (copyin(prog.bf_insns, insns, size) != 0) {
2006                 kmem_free(insns, size);
2007                 return (EFAULT);
2008         }
2009         if (!ip_bpf_validate(insns, prog.bf_len)) {
2010                 kmem_free(insns, size);
2011                 return (EINVAL);
2012         }
2013 
2014         rw_enter(&icmp->icmp_bpf_lock, RW_WRITER);
2015         if (icmp->icmp_bpf_len != 0) {
2016                 ASSERT(icmp->icmp_bpf_prog != NULL);
2017 
2018                 kmem_free(icmp->icmp_bpf_prog, icmp->icmp_bpf_len);
2019         }
2020         icmp->icmp_bpf_len = size;
2021         icmp->icmp_bpf_prog = insns;
2022         rw_exit(&icmp->icmp_bpf_lock);
2023         return (0);
2024 }
2025 
2026 static int
2027 icmp_detach_filter(icmp_t *icmp)
2028 {
2029         int error;
2030 
2031         rw_enter(&icmp->icmp_bpf_lock, RW_WRITER);
2032         if (icmp->icmp_bpf_len == 0) {
2033                 ASSERT(icmp->icmp_bpf_prog == NULL);
2034                 error = ENOENT;
2035         } else {
2036                 kmem_free(icmp->icmp_bpf_prog,
2037                     icmp->icmp_bpf_len);
2038                 icmp->icmp_bpf_len = 0;
2039                 icmp->icmp_bpf_prog = NULL;
2040                 error = 0;
2041         }
2042         rw_exit(&icmp->icmp_bpf_lock);
2043         return (error);
2044 }
2045 
2046 static boolean_t
2047 icmp_eval_filter(icmp_t *icmp, mblk_t *mp, ip_recv_attr_t *ira)
2048 {
2049         boolean_t res;
2050         uchar_t *buf = mp->b_rptr;
2051         uint_t wirelen, len = MBLKL(mp);
2052 
2053         rw_enter(&icmp->icmp_bpf_lock, RW_READER);
2054         if (icmp->icmp_bpf_len == 0) {
2055                 rw_exit(&icmp->icmp_bpf_lock);
2056                 return (B_FALSE);
2057         }
2058         if (ira->ira_flags & IRAF_IS_IPV4) {
2059                 ipha_t *ipha = (ipha_t *)buf;
2060 
2061                 wirelen = ntohs(ipha->ipha_length);
2062         } else {
2063                 ip6_t *ip6h = (ip6_t *)buf;
2064 
2065                 wirelen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
2066         }
2067         res = !ip_bpf_filter(icmp->icmp_bpf_prog, buf, wirelen, len);
2068         rw_exit(&icmp->icmp_bpf_lock);
2069 
2070         return (res);
2071 }
2072 
2073 /*
2074  * This routine sets socket options.
2075  */
2076 int
2077 icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name,
2078     uint_t inlen, uchar_t *invalp, cred_t *cr, boolean_t checkonly)
2079 {
2080         conn_t          *connp = coa->coa_connp;
2081         ip_xmit_attr_t  *ixa = coa->coa_ixa;
2082         icmp_t          *icmp = connp->conn_icmp;
2083         icmp_stack_t    *is = icmp->icmp_is;
2084         int             *i1 = (int *)invalp;
2085         boolean_t       onoff = (*i1 == 0) ? 0 : 1;
2086         int             error;
2087 
2088         ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock));
2089 
2090         /*
2091          * For fixed length options, no sanity check
2092          * of passed in length is done. It is assumed *_optcom_req()
2093          * routines do the right thing.
2094          */
2095 
2096         switch (level) {
2097         case SOL_SOCKET:
2098                 switch (name) {
2099                 case SO_PROTOTYPE:
2100                         if ((*i1 & 0xFF) != IPPROTO_ICMP &&
2101                             (*i1 & 0xFF) != IPPROTO_ICMPV6 &&
2102                             secpolicy_net_rawaccess(cr) != 0) {
2103                                 return (EACCES);
2104                         }
2105                         if (checkonly)
2106                                 break;
2107 
2108                         mutex_enter(&connp->conn_lock);
2109                         connp->conn_proto = *i1 & 0xFF;
2110                         ixa->ixa_protocol = connp->conn_proto;
2111                         if ((connp->conn_proto == IPPROTO_RAW ||
2112                             connp->conn_proto == IPPROTO_IGMP) &&
2113                             connp->conn_family == AF_INET) {
2114                                 icmp->icmp_hdrincl = 1;
2115                                 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
2116                         } else if (connp->conn_proto == IPPROTO_UDP ||
2117                             connp->conn_proto == IPPROTO_TCP ||
2118                             connp->conn_proto == IPPROTO_SCTP) {
2119                                 /* Used by test applications like psh */
2120                                 icmp->icmp_hdrincl = 0;
2121                                 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
2122                         } else {
2123                                 icmp->icmp_hdrincl = 0;
2124                                 ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
2125                         }
2126 
2127                         if (connp->conn_family == AF_INET6 &&
2128                             connp->conn_proto == IPPROTO_ICMPV6) {
2129                                 /* Set offset for icmp6_cksum */
2130                                 ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM;
2131                                 ixa->ixa_raw_cksum_offset = 2;
2132                         }
2133                         if (icmp->icmp_filter != NULL &&
2134                             connp->conn_proto != IPPROTO_ICMPV6) {
2135                                 kmem_free(icmp->icmp_filter,
2136                                     sizeof (icmp6_filter_t));
2137                                 icmp->icmp_filter = NULL;
2138                         }
2139                         mutex_exit(&connp->conn_lock);
2140 
2141                         coa->coa_changed |= COA_HEADER_CHANGED;
2142                         /*
2143                          * For SCTP, we don't use icmp_bind_proto() for
2144                          * raw socket binding.
2145                          */
2146                         if (connp->conn_proto == IPPROTO_SCTP)
2147                                 return (0);
2148 
2149                         coa->coa_changed |= COA_ICMP_BIND_NEEDED;
2150                         return (0);
2151 
2152                 case SO_SNDBUF:
2153                         if (*i1 > is->is_max_buf) {
2154                                 return (ENOBUFS);
2155                         }
2156                         break;
2157                 case SO_RCVBUF:
2158                         if (*i1 > is->is_max_buf) {
2159                                 return (ENOBUFS);
2160                         }
2161                         break;
2162                 case SO_ATTACH_FILTER:
2163                         return (icmp_attach_filter(icmp, inlen, invalp));
2164                 case SO_DETACH_FILTER:
2165                         return (icmp_detach_filter(icmp));
2166                 }
2167                 break;
2168 
2169         case IPPROTO_IP:
2170                 /*
2171                  * Only allow IPv4 option processing on IPv4 sockets.
2172                  */
2173                 if (connp->conn_family != AF_INET)
2174                         return (EINVAL);
2175 
2176                 switch (name) {
2177                 case IP_HDRINCL:
2178                         if (!checkonly) {
2179                                 mutex_enter(&connp->conn_lock);
2180                                 icmp->icmp_hdrincl = onoff;
2181                                 if (onoff)
2182                                         ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
2183                                 else
2184                                         ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
2185                                 mutex_exit(&connp->conn_lock);
2186                         }
2187                         break;
2188                 }
2189                 break;
2190 
2191         case IPPROTO_IPV6:
2192                 if (connp->conn_family != AF_INET6)
2193                         return (EINVAL);
2194 
2195                 switch (name) {
2196                 case IPV6_CHECKSUM:
2197                         /*
2198                          * Integer offset into the user data of where the
2199                          * checksum is located.
2200                          * Offset of -1 disables option.
2201                          * Does not apply to IPPROTO_ICMPV6.
2202                          */
2203                         if (connp->conn_proto == IPPROTO_ICMPV6 ||
2204                             coa->coa_ancillary) {
2205                                 return (EINVAL);
2206                         }
2207                         if ((*i1 != -1) && ((*i1 < 0) || (*i1 & 0x1) != 0)) {
2208                                 /* Negative or not 16 bit aligned offset */
2209                                 return (EINVAL);
2210                         }
2211                         if (checkonly)
2212                                 break;
2213 
2214                         mutex_enter(&connp->conn_lock);
2215                         if (*i1 == -1) {
2216                                 ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM;
2217                                 ixa->ixa_raw_cksum_offset = 0;
2218                                 ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM;
2219                         } else {
2220                                 ixa->ixa_flags |= IXAF_SET_RAW_CKSUM;
2221                                 ixa->ixa_raw_cksum_offset = *i1;
2222                                 ixa->ixa_flags |= IXAF_SET_ULP_CKSUM;
2223                         }
2224                         mutex_exit(&connp->conn_lock);
2225                         break;
2226                 }
2227                 break;
2228 
2229         case IPPROTO_ICMPV6:
2230                 /*
2231                  * Only allow IPv6 option processing on IPv6 sockets.
2232                  */
2233                 if (connp->conn_family != AF_INET6)
2234                         return (EINVAL);
2235                 if (connp->conn_proto != IPPROTO_ICMPV6)
2236                         return (EINVAL);
2237 
2238                 switch (name) {
2239                 case ICMP6_FILTER:
2240                         if (checkonly)
2241                                 break;
2242 
2243                         if ((inlen != 0) &&
2244                             (inlen != sizeof (icmp6_filter_t)))
2245                                 return (EINVAL);
2246 
2247                         mutex_enter(&connp->conn_lock);
2248                         if (inlen == 0) {
2249                                 if (icmp->icmp_filter != NULL) {
2250                                         kmem_free(icmp->icmp_filter,
2251                                             sizeof (icmp6_filter_t));
2252                                         icmp->icmp_filter = NULL;
2253                                 }
2254                         } else {
2255                                 if (icmp->icmp_filter == NULL) {
2256                                         icmp->icmp_filter = kmem_alloc(
2257                                             sizeof (icmp6_filter_t),
2258                                             KM_NOSLEEP);
2259                                         if (icmp->icmp_filter == NULL) {
2260                                                 mutex_exit(&connp->conn_lock);
2261                                                 return (ENOBUFS);
2262                                         }
2263                                 }
2264                                 (void) bcopy(invalp, icmp->icmp_filter, inlen);
2265                         }
2266                         mutex_exit(&connp->conn_lock);
2267                         break;
2268                 }
2269                 break;
2270         }
2271         error = conn_opt_set(coa, level, name, inlen, invalp,
2272             checkonly, cr);
2273         return (error);
2274 }
2275 
2276 /*
2277  * This routine sets socket options.
2278  */
2279 int
2280 icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
2281     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
2282     void *thisdg_attrs, cred_t *cr)
2283 {
2284         icmp_t          *icmp = connp->conn_icmp;
2285         int             err;
2286         conn_opt_arg_t  coas, *coa;
2287         boolean_t       checkonly;
2288         icmp_stack_t    *is = icmp->icmp_is;
2289 
2290         switch (optset_context) {
2291         case SETFN_OPTCOM_CHECKONLY:
2292                 checkonly = B_TRUE;
2293                 /*
2294                  * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
2295                  * inlen != 0 implies value supplied and
2296                  *      we have to "pretend" to set it.
2297                  * inlen == 0 implies that there is no
2298                  *      value part in T_CHECK request and just validation
2299                  * done elsewhere should be enough, we just return here.
2300                  */
2301                 if (inlen == 0) {
2302                         *outlenp = 0;
2303                         return (0);
2304                 }
2305                 break;
2306         case SETFN_OPTCOM_NEGOTIATE:
2307                 checkonly = B_FALSE;
2308                 break;
2309         case SETFN_UD_NEGOTIATE:
2310         case SETFN_CONN_NEGOTIATE:
2311                 checkonly = B_FALSE;
2312                 /*
2313                  * Negotiating local and "association-related" options
2314                  * through T_UNITDATA_REQ.
2315                  *
2316                  * Following routine can filter out ones we do not
2317                  * want to be "set" this way.
2318                  */
2319                 if (!icmp_opt_allow_udr_set(level, name)) {
2320                         *outlenp = 0;
2321                         return (EINVAL);
2322                 }
2323                 break;
2324         default:
2325                 /*
2326                  * We should never get here
2327                  */
2328                 *outlenp = 0;
2329                 return (EINVAL);
2330         }
2331 
2332         ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
2333             (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
2334 
2335         if (thisdg_attrs != NULL) {
2336                 /* Options from T_UNITDATA_REQ */
2337                 coa = (conn_opt_arg_t *)thisdg_attrs;
2338                 ASSERT(coa->coa_connp == connp);
2339                 ASSERT(coa->coa_ixa != NULL);
2340                 ASSERT(coa->coa_ipp != NULL);
2341                 ASSERT(coa->coa_ancillary);
2342         } else {
2343                 coa = &coas;
2344                 coas.coa_connp = connp;
2345                 /* Get a reference on conn_ixa to prevent concurrent mods */
2346                 coas.coa_ixa = conn_get_ixa(connp, B_TRUE);
2347                 if (coas.coa_ixa == NULL) {
2348                         *outlenp = 0;
2349                         return (ENOMEM);
2350                 }
2351                 coas.coa_ipp = &connp->conn_xmit_ipp;
2352                 coas.coa_ancillary = B_FALSE;
2353                 coas.coa_changed = 0;
2354         }
2355 
2356         err = icmp_do_opt_set(coa, level, name, inlen, invalp,
2357             cr, checkonly);
2358         if (err != 0) {
2359 errout:
2360                 if (!coa->coa_ancillary)
2361                         ixa_refrele(coa->coa_ixa);
2362                 *outlenp = 0;
2363                 return (err);
2364         }
2365 
2366         /*
2367          * Common case of OK return with outval same as inval.
2368          */
2369         if (invalp != outvalp) {
2370                 /* don't trust bcopy for identical src/dst */
2371                 (void) bcopy(invalp, outvalp, inlen);
2372         }
2373         *outlenp = inlen;
2374 
2375         /*
2376          * If this was not ancillary data, then we rebuild the headers,
2377          * update the IRE/NCE, and IPsec as needed.
2378          * Since the label depends on the destination we go through
2379          * ip_set_destination first.
2380          */
2381         if (coa->coa_ancillary) {
2382                 return (0);
2383         }
2384 
2385         if (coa->coa_changed & COA_ROUTE_CHANGED) {
2386                 in6_addr_t saddr, faddr, nexthop;
2387                 in_port_t fport;
2388 
2389                 /*
2390                  * We clear lastdst to make sure we pick up the change
2391                  * next time sending.
2392                  * If we are connected we re-cache the information.
2393                  * We ignore errors to preserve BSD behavior.
2394                  * Note that we don't redo IPsec policy lookup here
2395                  * since the final destination (or source) didn't change.
2396                  */
2397                 mutex_enter(&connp->conn_lock);
2398                 connp->conn_v6lastdst = ipv6_all_zeros;
2399 
2400                 ip_attr_nexthop(coa->coa_ipp, coa->coa_ixa,
2401                     &connp->conn_faddr_v6, &nexthop);
2402                 saddr = connp->conn_saddr_v6;
2403                 faddr = connp->conn_faddr_v6;
2404                 fport = connp->conn_fport;
2405                 mutex_exit(&connp->conn_lock);
2406 
2407                 if (!IN6_IS_ADDR_UNSPECIFIED(&faddr) &&
2408                     !IN6_IS_ADDR_V4MAPPED_ANY(&faddr)) {
2409                         (void) ip_attr_connect(connp, coa->coa_ixa,
2410                             &saddr, &faddr, &nexthop, fport, NULL, NULL,
2411                             IPDF_ALLOW_MCBC | IPDF_VERIFY_DST);
2412                 }
2413         }
2414 
2415         ixa_refrele(coa->coa_ixa);
2416 
2417         if (coa->coa_changed & COA_HEADER_CHANGED) {
2418                 /*
2419                  * Rebuild the header template if we are connected.
2420                  * Otherwise clear conn_v6lastdst so we rebuild the header
2421                  * in the data path.
2422                  */
2423                 mutex_enter(&connp->conn_lock);
2424                 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
2425                     !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
2426                         err = icmp_build_hdr_template(connp,
2427                             &connp->conn_saddr_v6, &connp->conn_faddr_v6,
2428                             connp->conn_flowinfo);
2429                         if (err != 0) {
2430                                 mutex_exit(&connp->conn_lock);
2431                                 return (err);
2432                         }
2433                 } else {
2434                         connp->conn_v6lastdst = ipv6_all_zeros;
2435                 }
2436                 mutex_exit(&connp->conn_lock);
2437         }
2438         if (coa->coa_changed & COA_RCVBUF_CHANGED) {
2439                 (void) proto_set_rx_hiwat(connp->conn_rq, connp,
2440                     connp->conn_rcvbuf);
2441         }
2442         if ((coa->coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
2443                 connp->conn_wq->q_hiwat = connp->conn_sndbuf;
2444         }
2445         if (coa->coa_changed & COA_WROFF_CHANGED) {
2446                 /* Increase wroff if needed */
2447                 uint_t wroff;
2448 
2449                 mutex_enter(&connp->conn_lock);
2450                 wroff = connp->conn_ht_iphc_allocated + is->is_wroff_extra;
2451                 if (wroff > connp->conn_wroff) {
2452                         connp->conn_wroff = wroff;
2453                         mutex_exit(&connp->conn_lock);
2454                         (void) proto_set_tx_wroff(connp->conn_rq, connp, wroff);
2455                 } else {
2456                         mutex_exit(&connp->conn_lock);
2457                 }
2458         }
2459         if (coa->coa_changed & COA_ICMP_BIND_NEEDED) {
2460                 icmp_bind_proto(icmp);
2461         }
2462         return (err);
2463 }
2464 
2465 /* This routine sets socket options. */
2466 int
2467 icmp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
2468     uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
2469     void *thisdg_attrs, cred_t *cr)
2470 {
2471         conn_t  *connp = Q_TO_CONN(q);
2472         int error;
2473 
2474         error = icmp_opt_set(connp, optset_context, level, name, inlen, invalp,
2475             outlenp, outvalp, thisdg_attrs, cr);
2476         return (error);
2477 }
2478 
2479 /*
2480  * Setup IP headers.
2481  *
2482  * Note that IP_HDRINCL has ipha_protocol that is different than conn_proto,
2483  * but icmp_output_hdrincl restores ipha_protocol once we return.
2484  */
2485 mblk_t *
2486 icmp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp,
2487     const in6_addr_t *v6src, const in6_addr_t *v6dst, uint32_t flowinfo,
2488     mblk_t *data_mp, int *errorp)
2489 {
2490         mblk_t          *mp;
2491         icmp_stack_t    *is = connp->conn_netstack->netstack_icmp;
2492         uint_t          data_len;
2493         uint32_t        cksum;
2494 
2495         data_len = msgdsize(data_mp);
2496         mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, connp->conn_proto,
2497             flowinfo, 0, data_mp, data_len, is->is_wroff_extra, &cksum, errorp);
2498         if (mp == NULL) {
2499                 ASSERT(*errorp != 0);
2500                 return (NULL);
2501         }
2502 
2503         ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length;
2504 
2505         /*
2506          * If there was a routing option/header then conn_prepend_hdr
2507          * has massaged it and placed the pseudo-header checksum difference
2508          * in the cksum argument.
2509          *
2510          * Prepare for ICMPv6 checksum done in IP.
2511          *
2512          * We make it easy for IP to include our pseudo header
2513          * by putting our length (and any routing header adjustment)
2514          * in the ICMPv6 checksum field.
2515          * The IP source, destination, and length have already been set by
2516          * conn_prepend_hdr.
2517          */
2518         cksum += data_len;
2519         cksum = (cksum >> 16) + (cksum & 0xFFFF);
2520         ASSERT(cksum < 0x10000);
2521 
2522         if (ixa->ixa_flags & IXAF_IS_IPV4) {
2523                 ipha_t  *ipha = (ipha_t *)mp->b_rptr;
2524 
2525                 ASSERT(ntohs(ipha->ipha_length) == ixa->ixa_pktlen);
2526         } else {
2527                 ip6_t   *ip6h = (ip6_t *)mp->b_rptr;
2528                 uint_t  cksum_offset = 0;
2529 
2530                 ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == ixa->ixa_pktlen);
2531 
2532                 if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) {
2533                         if (connp->conn_proto == IPPROTO_ICMPV6) {
2534                                 cksum_offset = ixa->ixa_ip_hdr_length +
2535                                     offsetof(icmp6_t, icmp6_cksum);
2536                         } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
2537                                 cksum_offset = ixa->ixa_ip_hdr_length +
2538                                     ixa->ixa_raw_cksum_offset;
2539                         }
2540                 }
2541                 if (cksum_offset != 0) {
2542                         uint16_t *ptr;
2543 
2544                         /* Make sure the checksum fits in the first mblk */
2545                         if (cksum_offset + sizeof (short) > MBLKL(mp)) {
2546                                 mblk_t *mp1;
2547 
2548                                 mp1 = msgpullup(mp,
2549                                     cksum_offset + sizeof (short));
2550                                 freemsg(mp);
2551                                 if (mp1 == NULL) {
2552                                         *errorp = ENOMEM;
2553                                         return (NULL);
2554                                 }
2555                                 mp = mp1;
2556                                 ip6h = (ip6_t *)mp->b_rptr;
2557                         }
2558                         ptr = (uint16_t *)(mp->b_rptr + cksum_offset);
2559                         *ptr = htons(cksum);
2560                 }
2561         }
2562 
2563         /* Note that we don't try to update wroff due to ancillary data */
2564         return (mp);
2565 }
2566 
2567 static int
2568 icmp_build_hdr_template(conn_t *connp, const in6_addr_t *v6src,
2569     const in6_addr_t *v6dst, uint32_t flowinfo)
2570 {
2571         int             error;
2572 
2573         ASSERT(MUTEX_HELD(&connp->conn_lock));
2574         /*
2575          * We clear lastdst to make sure we don't use the lastdst path
2576          * next time sending since we might not have set v6dst yet.
2577          */
2578         connp->conn_v6lastdst = ipv6_all_zeros;
2579 
2580         error = conn_build_hdr_template(connp, 0, 0, v6src, v6dst, flowinfo);
2581         if (error != 0)
2582                 return (error);
2583 
2584         /*
2585          * Any routing header/option has been massaged. The checksum difference
2586          * is stored in conn_sum.
2587          */
2588         return (0);
2589 }
2590 
2591 static mblk_t *
2592 icmp_queue_fallback(icmp_t *icmp, mblk_t *mp)
2593 {
2594         ASSERT(MUTEX_HELD(&icmp->icmp_recv_lock));
2595         if (IPCL_IS_NONSTR(icmp->icmp_connp)) {
2596                 /*
2597                  * fallback has started but messages have not been moved yet
2598                  */
2599                 if (icmp->icmp_fallback_queue_head == NULL) {
2600                         ASSERT(icmp->icmp_fallback_queue_tail == NULL);
2601                         icmp->icmp_fallback_queue_head = mp;
2602                         icmp->icmp_fallback_queue_tail = mp;
2603                 } else {
2604                         ASSERT(icmp->icmp_fallback_queue_tail != NULL);
2605                         icmp->icmp_fallback_queue_tail->b_next = mp;
2606                         icmp->icmp_fallback_queue_tail = mp;
2607                 }
2608                 return (NULL);
2609         } else {
2610                 /*
2611                  * Fallback completed, let the caller putnext() the mblk.
2612                  */
2613                 return (mp);
2614         }
2615 }
2616 
2617 /*
2618  * Deliver data to ULP. In case we have a socket, and it's falling back to
2619  * TPI, then we'll queue the mp for later processing.
2620  */
2621 static void
2622 icmp_ulp_recv(conn_t *connp, mblk_t *mp, uint_t len)
2623 {
2624         if (IPCL_IS_NONSTR(connp)) {
2625                 icmp_t *icmp = connp->conn_icmp;
2626                 int error;
2627 
2628                 ASSERT(len == msgdsize(mp));
2629                 if ((*connp->conn_upcalls->su_recv)
2630                     (connp->conn_upper_handle, mp, len, 0, &error, NULL) < 0) {
2631                         mutex_enter(&icmp->icmp_recv_lock);
2632                         if (error == ENOSPC) {
2633                                 /*
2634                                  * let's confirm while holding the lock
2635                                  */
2636                                 if ((*connp->conn_upcalls->su_recv)
2637                                     (connp->conn_upper_handle, NULL, 0, 0,
2638                                     &error, NULL) < 0) {
2639                                         ASSERT(error == ENOSPC);
2640                                         if (error == ENOSPC) {
2641                                                 connp->conn_flow_cntrld =
2642                                                     B_TRUE;
2643                                         }
2644                                 }
2645                                 mutex_exit(&icmp->icmp_recv_lock);
2646                         } else {
2647                                 ASSERT(error == EOPNOTSUPP);
2648                                 mp = icmp_queue_fallback(icmp, mp);
2649                                 mutex_exit(&icmp->icmp_recv_lock);
2650                                 if (mp != NULL)
2651                                         putnext(connp->conn_rq, mp);
2652                         }
2653                 }
2654                 ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock));
2655         } else {
2656                 putnext(connp->conn_rq, mp);
2657         }
2658 }
2659 
/*
 * This is the inbound data path.
 * IP has already pulled up the IP headers and verified alignment
 * etc.
 *
 * arg1 is the conn_t of the receiving raw socket, mp is the M_DATA packet
 * starting at the IP header, and ira carries the receive attributes (packet
 * length, IP header length, flags) for it.  arg2 is unused.
 *
 * The packet is wrapped in a T_UNITDATA_IND that carries the source address
 * plus any requested ancillary data and is handed to icmp_ulp_recv().
 * IPv4 packets are passed up with their IP header still attached; for IPv6
 * the IP headers are skipped.  The message is freed here when it is dropped:
 * by the socket filter, by the ICMPv6 type filter, on a failed IPv6 raw
 * checksum, or when an allocation fails.
 */
/* ARGSUSED2 */
static void
icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
{
        conn_t                  *connp = (conn_t *)arg1;
        struct T_unitdata_ind   *tudi;
        uchar_t                 *rptr;          /* Pointer to IP header */
        int                     ip_hdr_length;
        int                     udi_size;       /* Size of T_unitdata_ind */
        int                     pkt_len;
        icmp_t                  *icmp;
        ip_pkt_t                ipps;
        ip6_t                   *ip6h;
        mblk_t                  *mp1;
        crb_t                   recv_ancillary;
        icmp_stack_t            *is;
        sin_t                   *sin;
        sin6_t                  *sin6;
        ipha_t                  *ipha;

        ASSERT(connp->conn_flags & IPCL_RAWIPCONN);

        icmp = connp->conn_icmp;
        is = icmp->icmp_is;
        rptr = mp->b_rptr;

        ASSERT(DB_TYPE(mp) == M_DATA);
        ASSERT(OK_32PTR(rptr));
        ASSERT(ira->ira_pktlen == msgdsize(mp));
        pkt_len = ira->ira_pktlen;

        /*
         * Get a snapshot of these and allow other threads to change
         * them after that. We need the same recv_ancillary when determining
         * the size as when adding the ancillary data items.
         */
        mutex_enter(&connp->conn_lock);
        recv_ancillary = connp->conn_recv_ancillary;
        mutex_exit(&connp->conn_lock);

        ip_hdr_length = ira->ira_ip_hdr_length;
        ASSERT(MBLKL(mp) >= ip_hdr_length);  /* IP did a pullup */

        /* Initialize regardless of IP version */
        ipps.ipp_fields = 0;

        /*
         * Apply socket filter, if needed.  A packet rejected by the filter
         * is dropped silently; no MIB counter is bumped.
         */
        if (icmp->icmp_bpf_len != 0) {
                if (icmp_eval_filter(icmp, mp, ira)) {
                        freemsg(mp);
                        return;
                }
        }

        if (ira->ira_flags & IRAF_IS_IPV4) {
                ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION);
                ASSERT(MBLKL(mp) >= sizeof (ipha_t));
                ASSERT(ira->ira_ip_hdr_length == IPH_HDR_LENGTH(rptr));

                ipha = (ipha_t *)mp->b_rptr;
                /* Header details are only parsed when ancillary data is on */
                if (recv_ancillary.crb_all != 0)
                        (void) ip_find_hdr_v4(ipha, &ipps, B_FALSE);

                /*
                 * BSD for some reason adjusts ipha_length to exclude the
                 * IP header length. We do the same.
                 */
                if (is->is_bsd_compat) {
                        ushort_t len;

                        len = ntohs(ipha->ipha_length);
                        if (mp->b_datap->db_ref > 1) {
                                /*
                                 * The data block has more than one
                                 * reference, so it is not modified in place.
                                 * Allocate a new IP header so that we can
                                 * modify ipha_length.
                                 *
                                 * Note that this local mp1 shadows the
                                 * function-level mp1 declared above.
                                 */
                                mblk_t  *mp1;

                                mp1 = allocb(ip_hdr_length, BPRI_MED);
                                if (mp1 == NULL) {
                                        freemsg(mp);
                                        BUMP_MIB(&is->is_rawip_mib,
                                            rawipInErrors);
                                        return;
                                }
                                /*
                                 * Copy the header into the new mblk, advance
                                 * the original past its header, and chain
                                 * the original behind the copy.
                                 */
                                bcopy(rptr, mp1->b_rptr, ip_hdr_length);
                                mp->b_rptr = rptr + ip_hdr_length;
                                rptr = mp1->b_rptr;
                                ipha = (ipha_t *)rptr;
                                mp1->b_cont = mp;
                                mp1->b_wptr = rptr + ip_hdr_length;
                                mp = mp1;
                        }
                        len -= ip_hdr_length;
                        ipha->ipha_length = htons(len);
                }

                /*
                 * For RAW sockets we do not pass ICMP/IPv4 packets to
                 * AF_INET6 sockets. This is ensured by icmp_bind and the
                 * IP fanout code.
                 */
                ASSERT(connp->conn_family == AF_INET);

                /*
                 * This is the inbound data path.  Packets are passed upstream
                 * as T_UNITDATA_IND messages with full IPv4 headers still
                 * attached.
                 */

                /*
                 * Normally only send up the source address.
                 * If any ancillary data items are wanted we add those.
                 */
                udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t);
                if (recv_ancillary.crb_all != 0) {
                        udi_size += conn_recvancillary_size(connp,
                            recv_ancillary, ira, mp, &ipps);
                }

                /* Allocate a message block for the T_UNITDATA_IND structure. */
                mp1 = allocb(udi_size, BPRI_MED);
                if (mp1 == NULL) {
                        freemsg(mp);
                        BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
                        return;
                }
                mp1->b_cont = mp;
                tudi = (struct T_unitdata_ind *)mp1->b_rptr;
                mp1->b_datap->db_type = M_PROTO;
                mp1->b_wptr = (uchar_t *)tudi + udi_size;
                tudi->PRIM_type = T_UNITDATA_IND;
                tudi->SRC_length = sizeof (sin_t);
                tudi->SRC_offset = sizeof (struct T_unitdata_ind);
                /* The source sockaddr immediately follows the TPI header */
                sin = (sin_t *)&tudi[1];
                *sin = sin_null;
                sin->sin_family = AF_INET;
                sin->sin_addr.s_addr = ipha->ipha_src;
                /* Clear sin_zero as two 32-bit words */
                *(uint32_t *)&sin->sin_zero[0] = 0;
                *(uint32_t *)&sin->sin_zero[4] = 0;
                tudi->OPT_offset =  sizeof (struct T_unitdata_ind) +
                    sizeof (sin_t);
                /* From here on udi_size is just the size of the options */
                udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t));
                tudi->OPT_length = udi_size;

                /*
                 * Add options if IP_RECVIF etc is set
                 */
                if (udi_size != 0) {
                        conn_recvancillary_add(connp, recv_ancillary, ira,
                            &ipps, (uchar_t *)&sin[1], udi_size);
                }
                /* pkt_len is left unchanged: the IPv4 header goes up too */
                goto deliver;
        }

        ASSERT(IPH_HDR_VERSION(rptr) == IPV6_VERSION);
        /*
         * IPv6 packets can only be received by applications
         * that are prepared to receive IPv6 addresses.
         * The IP fanout must ensure this.
         */
        ASSERT(connp->conn_family == AF_INET6);

        /*
         * Handle IPv6 packets. We don't pass up the IP headers with the
         * payload for IPv6.
         */

        ip6h = (ip6_t *)rptr;
        if (recv_ancillary.crb_all != 0) {
                /*
                 * Call on ip_find_hdr_v6 which gets individual lengths of
                 * extension headers (and pointers to them).
                 */
                uint8_t         nexthdr;

                /* We don't care about the length or nextheader. */
                (void) ip_find_hdr_v6(mp, ip6h, B_TRUE, &ipps, &nexthdr);

                /*
                 * We do not pass up hop-by-hop options or any other
                 * extension header as part of the packet. Applications
                 * that want to see them have to specify IPV6_RECV* socket
                 * options. And conn_recvancillary_size/add explicitly
                 * drops the TX option from IPV6_HOPOPTS as it does for UDP.
                 *
                 * If we had multilevel ICMP sockets, then we'd want to
                 * modify conn_recvancillary_size/add to
                 * allow the user to see the label.
                 */
        }

        /*
         * Check a filter for ICMPv6 types if needed.
         * Verify raw checksums if needed.
         * Both checks run with conn_lock held.
         */
        mutex_enter(&connp->conn_lock);
        if (icmp->icmp_filter != NULL) {
                int type;

                /*
                 * Assumes that IP has done the pullupmsg.
                 * The type is the first byte after the IP headers.
                 */
                type = mp->b_rptr[ip_hdr_length];

                /*
                 * NOTE(review): this assertion runs after the read above and
                 * uses <=, so it would not catch ip_hdr_length == MBLKL(mp),
                 * i.e. no byte present at that offset -- confirm that IP
                 * guarantees at least one byte follows the headers here.
                 */
                ASSERT(mp->b_rptr + ip_hdr_length <= mp->b_wptr);
                if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) {
                        /* Filtered types are dropped without a MIB bump */
                        mutex_exit(&connp->conn_lock);
                        freemsg(mp);
                        return;
                }
        }
        if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
                /* Checksum */
                uint16_t        *up;
                uint32_t        sum;
                int             remlen;

                /*
                 * up[0..15] below are the sixteen 16-bit words starting at
                 * ip6_src (presumably the source address followed by the
                 * destination address, as in the standard ip6_t layout).
                 */
                up = (uint16_t *)&ip6h->ip6_src;

                /*
                 * Pseudo-header sum: protocol plus the length after the IP
                 * headers, plus the address words, folded once before being
                 * passed to IP_CSUM as the initial sum.
                 */
                remlen = msgdsize(mp) - ip_hdr_length;
                sum = htons(connp->conn_proto + remlen)
                    + up[0] + up[1] + up[2] + up[3]
                    + up[4] + up[5] + up[6] + up[7]
                    + up[8] + up[9] + up[10] + up[11]
                    + up[12] + up[13] + up[14] + up[15];
                sum = (sum & 0xffff) + (sum >> 16);
                sum = IP_CSUM(mp, ip_hdr_length, sum);
                if (sum != 0) {
                        /* IPv6 RAW checksum failed */
                        ip0dbg(("icmp_rput: RAW checksum failed %x\n", sum));
                        mutex_exit(&connp->conn_lock);
                        freemsg(mp);
                        BUMP_MIB(&is->is_rawip_mib, rawipInCksumErrs);
                        return;
                }
        }
        mutex_exit(&connp->conn_lock);

        /* Source address always goes up; ancillary data only if requested */
        udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);

        if (recv_ancillary.crb_all != 0) {
                udi_size += conn_recvancillary_size(connp,
                    recv_ancillary, ira, mp, &ipps);
        }

        mp1 = allocb(udi_size, BPRI_MED);
        if (mp1 == NULL) {
                freemsg(mp);
                BUMP_MIB(&is->is_rawip_mib, rawipInErrors);
                return;
        }
        mp1->b_cont = mp;
        mp1->b_datap->db_type = M_PROTO;
        tudi = (struct T_unitdata_ind *)mp1->b_rptr;
        mp1->b_wptr = (uchar_t *)tudi + udi_size;
        tudi->PRIM_type = T_UNITDATA_IND;
        tudi->SRC_length = sizeof (sin6_t);
        tudi->SRC_offset = sizeof (struct T_unitdata_ind);
        tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin6_t);
        /* From here on udi_size is just the size of the options */
        udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t));
        tudi->OPT_length = udi_size;
        /* The source sockaddr immediately follows the TPI header */
        sin6 = (sin6_t *)&tudi[1];
        *sin6 = sin6_null;
        sin6->sin6_port = 0;
        sin6->sin6_family = AF_INET6;

        sin6->sin6_addr = ip6h->ip6_src;
        /* No sin6_flowinfo per API */
        sin6->sin6_flowinfo = 0;
        /* For link-scope pass up scope id */
        if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src))
                sin6->sin6_scope_id = ira->ira_ruifindex;
        else
                sin6->sin6_scope_id = 0;
        sin6->__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst,
            IPCL_ZONEID(connp), is->is_netstack);

        if (udi_size != 0) {
                conn_recvancillary_add(connp, recv_ancillary, ira,
                    &ipps, (uchar_t *)&sin6[1], udi_size);
        }

        /* Skip all the IPv6 headers per API */
        mp->b_rptr += ip_hdr_length;
        pkt_len -= ip_hdr_length;

deliver:
        /* mp1 is the T_UNITDATA_IND with the packet chained behind it */
        BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
        icmp_ulp_recv(connp, mp1, pkt_len);
}
2953 
2954 /*
2955  * return SNMP stuff in buffer in mpdata. We don't hold any lock and report
2956  * information that can be changing beneath us.
2957  */
2958 mblk_t *
2959 icmp_snmp_get(queue_t *q, mblk_t *mpctl)
2960 {
2961         mblk_t                  *mpdata;
2962         struct opthdr           *optp;
2963         conn_t                  *connp = Q_TO_CONN(q);
2964         icmp_stack_t            *is = connp->conn_netstack->netstack_icmp;
2965         mblk_t                  *mp2ctl;
2966 
2967         /*
2968          * make a copy of the original message
2969          */
2970         mp2ctl = copymsg(mpctl);
2971 
2972         if (mpctl == NULL ||
2973             (mpdata = mpctl->b_cont) == NULL) {
2974                 freemsg(mpctl);
2975                 freemsg(mp2ctl);
2976                 return (0);
2977         }
2978 
2979         /* fixed length structure for IPv4 and IPv6 counters */
2980         optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
2981         optp->level = EXPER_RAWIP;
2982         optp->name = 0;
2983         (void) snmp_append_data(mpdata, (char *)&is->is_rawip_mib,
2984             sizeof (is->is_rawip_mib));
2985         optp->len = msgdsize(mpdata);
2986         qreply(q, mpctl);
2987 
2988         return (mp2ctl);
2989 }
2990 
2991 /*
2992  * Return 0 if invalid set request, 1 otherwise, including non-rawip requests.
2993  * TODO:  If this ever actually tries to set anything, it needs to be
2994  * to do the appropriate locking.
2995  */
2996 /* ARGSUSED */
2997 int
2998 icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
2999     uchar_t *ptr, int len)
3000 {
3001         switch (level) {
3002         case EXPER_RAWIP:
3003                 return (0);
3004         default:
3005                 return (1);
3006         }
3007 }
3008 
3009 /*
3010  * This routine creates a T_UDERROR_IND message and passes it upstream.
3011  * The address and options are copied from the T_UNITDATA_REQ message
3012  * passed in mp.  This message is freed.
3013  */
3014 static void
3015 icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err)
3016 {
3017         struct T_unitdata_req *tudr;
3018         mblk_t  *mp1;
3019         uchar_t *destaddr;
3020         t_scalar_t destlen;
3021         uchar_t *optaddr;
3022         t_scalar_t optlen;
3023 
3024         if ((mp->b_wptr < mp->b_rptr) ||
3025             (MBLKL(mp)) < sizeof (struct T_unitdata_req)) {
3026                 goto done;
3027         }
3028         tudr = (struct T_unitdata_req *)mp->b_rptr;
3029         destaddr = mp->b_rptr + tudr->DEST_offset;
3030         if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr ||
3031             destaddr + tudr->DEST_length < mp->b_rptr ||
3032             destaddr + tudr->DEST_length > mp->b_wptr) {
3033                 goto done;
3034         }
3035         optaddr = mp->b_rptr + tudr->OPT_offset;
3036         if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr ||
3037             optaddr + tudr->OPT_length < mp->b_rptr ||
3038             optaddr + tudr->OPT_length > mp->b_wptr) {
3039                 goto done;
3040         }
3041         destlen = tudr->DEST_length;
3042         optlen = tudr->OPT_length;
3043 
3044         mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen,
3045             (char *)optaddr, optlen, err);
3046         if (mp1 != NULL)
3047                 qreply(q, mp1);
3048 
3049 done:
3050         freemsg(mp);
3051 }
3052 
3053 static int
3054 rawip_do_unbind(conn_t *connp)
3055 {
3056         icmp_t  *icmp = connp->conn_icmp;
3057 
3058         mutex_enter(&connp->conn_lock);
3059         /* If a bind has not been done, we can't unbind. */
3060         if (icmp->icmp_state == TS_UNBND) {
3061                 mutex_exit(&connp->conn_lock);
3062                 return (-TOUTSTATE);
3063         }
3064         connp->conn_saddr_v6 = ipv6_all_zeros;
3065         connp->conn_bound_addr_v6 = ipv6_all_zeros;
3066         connp->conn_laddr_v6 = ipv6_all_zeros;
3067         connp->conn_mcbc_bind = B_FALSE;
3068         connp->conn_lport = 0;
3069         connp->conn_fport = 0;
3070         /* In case we were also connected */
3071         connp->conn_faddr_v6 = ipv6_all_zeros;
3072         connp->conn_v6lastdst = ipv6_all_zeros;
3073 
3074         icmp->icmp_state = TS_UNBND;
3075 
3076         (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6,
3077             &connp->conn_faddr_v6, connp->conn_flowinfo);
3078         mutex_exit(&connp->conn_lock);
3079 
3080         ip_unbind(connp);
3081         return (0);
3082 }
3083 
3084 /*
3085  * This routine is called by icmp_wput to handle T_UNBIND_REQ messages.
3086  * After some error checking, the message is passed downstream to ip.
3087  */
3088 static void
3089 icmp_tpi_unbind(queue_t *q, mblk_t *mp)
3090 {
3091         conn_t  *connp = Q_TO_CONN(q);
3092         int     error;
3093 
3094         ASSERT(mp->b_cont == NULL);
3095         error = rawip_do_unbind(connp);
3096         if (error) {
3097                 if (error < 0) {
3098                         icmp_err_ack(q, mp, -error, 0);
3099                 } else {
3100                         icmp_err_ack(q, mp, 0, error);
3101                 }
3102                 return;
3103         }
3104 
3105         /*
3106          * Convert mp into a T_OK_ACK
3107          */
3108 
3109         mp = mi_tpi_ok_ack_alloc(mp);
3110 
3111         /*
3112          * should not happen in practice... T_OK_ACK is smaller than the
3113          * original message.
3114          */
3115         ASSERT(mp != NULL);
3116         ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK);
3117         qreply(q, mp);
3118 }
3119 
3120 /*
3121  * Process IPv4 packets that already include an IP header.
3122  * Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and
3123  * IPPROTO_IGMP).
3124  * In this case we ignore the address and any options in the T_UNITDATA_REQ.
3125  *
3126  * The packet is assumed to have a base (20 byte) IP header followed
3127  * by the upper-layer protocol. We include any IP_OPTIONS including a
3128  * CIPSO label but otherwise preserve the base IP header.
3129  */
3130 static int
3131 icmp_output_hdrincl(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid)
3132 {
3133         icmp_t          *icmp = connp->conn_icmp;
3134         icmp_stack_t    *is = icmp->icmp_is;
3135         ipha_t          iphas;
3136         ipha_t          *ipha;
3137         int             ip_hdr_length;
3138         int             tp_hdr_len;
3139         ip_xmit_attr_t  *ixa;
3140         ip_pkt_t        *ipp;
3141         in6_addr_t      v6src;
3142         in6_addr_t      v6dst;
3143         in6_addr_t      v6nexthop;
3144         int             error;
3145         boolean_t       do_ipsec;
3146 
3147         /*
3148          * We need an exclusive copy of conn_ixa since the included IP
3149          * header could have any destination.
3150          * That copy has no pointers hence we
3151          * need to set them up once we've parsed the ancillary data.
3152          */
3153         ixa = conn_get_ixa_exclusive(connp);
3154         if (ixa == NULL) {
3155                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3156                 freemsg(mp);
3157                 return (ENOMEM);
3158         }
3159         ASSERT(cr != NULL);
3160         /*
3161          * Caller has a reference on cr; from db_credp or because we
3162          * are running in process context.
3163          */
3164         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3165         ixa->ixa_cred = cr;
3166         ixa->ixa_cpid = pid;
3167         if (is_system_labeled()) {
3168                 /* We need to restart with a label based on the cred */
3169                 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
3170         }
3171 
3172         /* In case previous destination was multicast or multirt */
3173         ip_attr_newdst(ixa);
3174 
3175         /* Get a copy of conn_xmit_ipp since the TX label might change it */
3176         ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP);
3177         if (ipp == NULL) {
3178                 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3179                 ixa->ixa_cred = connp->conn_cred; /* Restore */
3180                 ixa->ixa_cpid = connp->conn_cpid;
3181                 ixa_refrele(ixa);
3182                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3183                 freemsg(mp);
3184                 return (ENOMEM);
3185         }
3186         mutex_enter(&connp->conn_lock);
3187         error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP);
3188         mutex_exit(&connp->conn_lock);
3189         if (error != 0) {
3190                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3191                 freemsg(mp);
3192                 goto done;
3193         }
3194 
3195         /* Sanity check length of packet */
3196         ipha = (ipha_t *)mp->b_rptr;
3197 
3198         ip_hdr_length = IP_SIMPLE_HDR_LENGTH;
3199         if ((mp->b_wptr - mp->b_rptr) < IP_SIMPLE_HDR_LENGTH) {
3200                 if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
3201                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3202                         freemsg(mp);
3203                         goto done;
3204                 }
3205                 ipha = (ipha_t *)mp->b_rptr;
3206         }
3207         ipha->ipha_version_and_hdr_length =
3208             (IP_VERSION<<4) | (ip_hdr_length>>2);
3209 
3210         /*
3211          * We set IXAF_DONTFRAG if the application set DF which makes
3212          * IP not fragment.
3213          */
3214         ipha->ipha_fragment_offset_and_flags &= htons(IPH_DF);
3215         if (ipha->ipha_fragment_offset_and_flags & htons(IPH_DF))
3216                 ixa->ixa_flags |= (IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF);
3217         else
3218                 ixa->ixa_flags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF);
3219 
3220         /* Even for multicast and broadcast we honor the apps ttl */
3221         ixa->ixa_flags |= IXAF_NO_TTL_CHANGE;
3222 
3223         /*
3224          * No source verification for non-local addresses
3225          */
3226         if (ipha->ipha_src != INADDR_ANY &&
3227             ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid,
3228             is->is_netstack->netstack_ip, B_FALSE)
3229             != IPVL_UNICAST_UP) {
3230                 ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE;
3231         }
3232 
3233         if (ipha->ipha_dst == INADDR_ANY)
3234                 ipha->ipha_dst = htonl(INADDR_LOOPBACK);
3235 
3236         IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src);
3237         IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
3238 
3239         /* Defer IPsec if it might need to look at ICMP type/code */
3240         do_ipsec = ipha->ipha_protocol != IPPROTO_ICMP;
3241         ixa->ixa_flags |= IXAF_IS_IPV4;
3242 
3243         ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop);
3244         error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop,
3245             connp->conn_fport, &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
3246             (do_ipsec ? IPDF_IPSEC : 0));
3247         switch (error) {
3248         case 0:
3249                 break;
3250         case EADDRNOTAVAIL:
3251                 /*
3252                  * IXAF_VERIFY_SOURCE tells us to pick a better source.
3253                  * Don't have the application see that errno
3254                  */
3255                 error = ENETUNREACH;
3256                 goto failed;
3257         case ENETDOWN:
3258                 /*
3259                  * Have !ipif_addr_ready address; drop packet silently
3260                  * until we can get applications to not send until we
3261                  * are ready.
3262                  */
3263                 error = 0;
3264                 goto failed;
3265         case EHOSTUNREACH:
3266         case ENETUNREACH:
3267                 if (ixa->ixa_ire != NULL) {
3268                         /*
3269                          * Let conn_ip_output/ire_send_noroute return
3270                          * the error and send any local ICMP error.
3271                          */
3272                         error = 0;
3273                         break;
3274                 }
3275                 /* FALLTHRU */
3276         default:
3277         failed:
3278                 freemsg(mp);
3279                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3280                 goto done;
3281         }
3282         if (ipha->ipha_src == INADDR_ANY)
3283                 IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src);
3284 
3285         /*
3286          * We might be going to a different destination than last time,
3287          * thus check that TX allows the communication and compute any
3288          * needed label.
3289          *
3290          * TSOL Note: We have an exclusive ipp and ixa for this thread so we
3291          * don't have to worry about concurrent threads.
3292          */
3293         if (is_system_labeled()) {
3294                 /*
3295                  * Check whether Trusted Solaris policy allows communication
3296                  * with this host, and pretend that the destination is
3297                  * unreachable if not.
3298                  * Compute any needed label and place it in ipp_label_v4/v6.
3299                  *
3300                  * Later conn_build_hdr_template/conn_prepend_hdr takes
3301                  * ipp_label_v4/v6 to form the packet.
3302                  *
3303                  * Tsol note: We have ipp structure local to this thread so
3304                  * no locking is needed.
3305                  */
3306                 error = conn_update_label(connp, ixa, &v6dst, ipp);
3307                 if (error != 0) {
3308                         freemsg(mp);
3309                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3310                         goto done;
3311                 }
3312         }
3313 
3314         /*
3315          * Save away a copy of the IPv4 header the application passed down
3316          * and then prepend an IPv4 header complete with any IP options
3317          * including label.
3318          * We need a struct copy since icmp_prepend_hdr will reuse the available
3319          * space in the mblk.
3320          */
3321         iphas = *ipha;
3322         mp->b_rptr += IP_SIMPLE_HDR_LENGTH;
3323 
3324         mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, 0, mp, &error);
3325         if (mp == NULL) {
3326                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3327                 ASSERT(error != 0);
3328                 goto done;
3329         }
3330         if (ixa->ixa_pktlen > IP_MAXPACKET) {
3331                 error = EMSGSIZE;
3332                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3333                 freemsg(mp);
3334                 goto done;
3335         }
3336         /* Restore key parts of the header that the application passed down */
3337         ipha = (ipha_t *)mp->b_rptr;
3338         ipha->ipha_type_of_service = iphas.ipha_type_of_service;
3339         ipha->ipha_ident = iphas.ipha_ident;
3340         ipha->ipha_fragment_offset_and_flags =
3341             iphas.ipha_fragment_offset_and_flags;
3342         ipha->ipha_ttl = iphas.ipha_ttl;
3343         ipha->ipha_protocol = iphas.ipha_protocol;
3344         ipha->ipha_src = iphas.ipha_src;
3345         ipha->ipha_dst = iphas.ipha_dst;
3346 
3347         ixa->ixa_protocol = ipha->ipha_protocol;
3348 
3349         /*
3350          * Make sure that the IP header plus any transport header that is
3351          * checksumed by ip_output is in the first mblk. (ip_output assumes
3352          * that at least the checksum field is in the first mblk.)
3353          */
3354         switch (ipha->ipha_protocol) {
3355         case IPPROTO_UDP:
3356                 tp_hdr_len = 8;
3357                 break;
3358         case IPPROTO_TCP:
3359                 tp_hdr_len = 20;
3360                 break;
3361         default:
3362                 tp_hdr_len = 0;
3363                 break;
3364         }
3365         ip_hdr_length = IPH_HDR_LENGTH(ipha);
3366         if (mp->b_wptr - mp->b_rptr < ip_hdr_length + tp_hdr_len) {
3367                 if (!pullupmsg(mp, ip_hdr_length + tp_hdr_len)) {
3368                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3369                         if (mp->b_cont == NULL)
3370                                 error = EINVAL;
3371                         else
3372                                 error = ENOMEM;
3373                         freemsg(mp);
3374                         goto done;
3375                 }
3376         }
3377 
3378         if (!do_ipsec) {
3379                 /* Policy might differ for different ICMP type/code */
3380                 if (ixa->ixa_ipsec_policy != NULL) {
3381                         IPPOL_REFRELE(ixa->ixa_ipsec_policy);
3382                         ixa->ixa_ipsec_policy = NULL;
3383                         ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
3384                 }
3385                 mp = ip_output_attach_policy(mp, ipha, NULL, connp, ixa);
3386                 if (mp == NULL) {
3387                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3388                         error = EHOSTUNREACH;   /* IPsec policy failure */
3389                         goto done;
3390                 }
3391         }
3392 
3393         /* We're done.  Pass the packet to ip. */
3394         BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3395 
3396         error = conn_ip_output(mp, ixa);
3397         /* No rawipOutErrors if an error since IP increases its error counter */
3398         switch (error) {
3399         case 0:
3400                 break;
3401         case EWOULDBLOCK:
3402                 (void) ixa_check_drain_insert(connp, ixa);
3403                 error = 0;
3404                 break;
3405         case EADDRNOTAVAIL:
3406                 /*
3407                  * IXAF_VERIFY_SOURCE tells us to pick a better source.
3408                  * Don't have the application see that errno
3409                  */
3410                 error = ENETUNREACH;
3411                 break;
3412         }
3413 done:
3414         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3415         ixa->ixa_cred = connp->conn_cred; /* Restore */
3416         ixa->ixa_cpid = connp->conn_cpid;
3417         ixa_refrele(ixa);
3418         ip_pkt_free(ipp);
3419         kmem_free(ipp, sizeof (*ipp));
3420         return (error);
3421 }
3422 
3423 static mblk_t *
3424 icmp_output_attach_policy(mblk_t *mp, conn_t *connp, ip_xmit_attr_t *ixa)
3425 {
3426         ipha_t  *ipha = NULL;
3427         ip6_t   *ip6h = NULL;
3428 
3429         if (ixa->ixa_flags & IXAF_IS_IPV4)
3430                 ipha = (ipha_t *)mp->b_rptr;
3431         else
3432                 ip6h = (ip6_t *)mp->b_rptr;
3433 
3434         if (ixa->ixa_ipsec_policy != NULL) {
3435                 IPPOL_REFRELE(ixa->ixa_ipsec_policy);
3436                 ixa->ixa_ipsec_policy = NULL;
3437                 ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;
3438         }
3439         return (ip_output_attach_policy(mp, ipha, ip6h, connp, ixa));
3440 }
3441 
3442 /*
3443  * Handle T_UNITDATA_REQ with options. Both IPv4 and IPv6
3444  * Either tudr_mp or msg is set. If tudr_mp we take ancillary data from
3445  * the TPI options, otherwise we take them from msg_control.
3446  * If both sin and sin6 is set it is a connected socket and we use conn_faddr.
3447  * Always consumes mp; never consumes tudr_mp.
3448  */
3449 static int
3450 icmp_output_ancillary(conn_t *connp, sin_t *sin, sin6_t *sin6, mblk_t *mp,
3451     mblk_t *tudr_mp, struct nmsghdr *msg, cred_t *cr, pid_t pid)
3452 {
3453         icmp_t          *icmp = connp->conn_icmp;
3454         icmp_stack_t    *is = icmp->icmp_is;
3455         int             error;
3456         ip_xmit_attr_t  *ixa;
3457         ip_pkt_t        *ipp;
3458         in6_addr_t      v6src;
3459         in6_addr_t      v6dst;
3460         in6_addr_t      v6nexthop;
3461         in_port_t       dstport;
3462         uint32_t        flowinfo;
3463         int             is_absreq_failure = 0;
3464         conn_opt_arg_t  coas, *coa;
3465 
3466         ASSERT(tudr_mp != NULL || msg != NULL);
3467 
3468         /*
3469          * Get ixa before checking state to handle a disconnect race.
3470          *
3471          * We need an exclusive copy of conn_ixa since the ancillary data
3472          * options might modify it. That copy has no pointers hence we
3473          * need to set them up once we've parsed the ancillary data.
3474          */
3475         ixa = conn_get_ixa_exclusive(connp);
3476         if (ixa == NULL) {
3477                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3478                 freemsg(mp);
3479                 return (ENOMEM);
3480         }
3481         ASSERT(cr != NULL);
3482         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3483         ixa->ixa_cred = cr;
3484         ixa->ixa_cpid = pid;
3485         if (is_system_labeled()) {
3486                 /* We need to restart with a label based on the cred */
3487                 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
3488         }
3489 
3490         /* In case previous destination was multicast or multirt */
3491         ip_attr_newdst(ixa);
3492 
3493         /* Get a copy of conn_xmit_ipp since the options might change it */
3494         ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP);
3495         if (ipp == NULL) {
3496                 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3497                 ixa->ixa_cred = connp->conn_cred; /* Restore */
3498                 ixa->ixa_cpid = connp->conn_cpid;
3499                 ixa_refrele(ixa);
3500                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3501                 freemsg(mp);
3502                 return (ENOMEM);
3503         }
3504         mutex_enter(&connp->conn_lock);
3505         error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP);
3506         mutex_exit(&connp->conn_lock);
3507         if (error != 0) {
3508                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3509                 freemsg(mp);
3510                 goto done;
3511         }
3512 
3513         /*
3514          * Parse the options and update ixa and ipp as a result.
3515          */
3516 
3517         coa = &coas;
3518         coa->coa_connp = connp;
3519         coa->coa_ixa = ixa;
3520         coa->coa_ipp = ipp;
3521         coa->coa_ancillary = B_TRUE;
3522         coa->coa_changed = 0;
3523 
3524         if (msg != NULL) {
3525                 error = process_auxiliary_options(connp, msg->msg_control,
3526                     msg->msg_controllen, coa, &icmp_opt_obj, icmp_opt_set, cr);
3527         } else {
3528                 struct T_unitdata_req *tudr;
3529 
3530                 tudr = (struct T_unitdata_req *)tudr_mp->b_rptr;
3531                 ASSERT(tudr->PRIM_type == T_UNITDATA_REQ);
3532                 error = tpi_optcom_buf(connp->conn_wq, tudr_mp,
3533                     &tudr->OPT_length, tudr->OPT_offset, cr, &icmp_opt_obj,
3534                     coa, &is_absreq_failure);
3535         }
3536         if (error != 0) {
3537                 /*
3538                  * Note: No special action needed in this
3539                  * module for "is_absreq_failure"
3540                  */
3541                 freemsg(mp);
3542                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3543                 goto done;
3544         }
3545         ASSERT(is_absreq_failure == 0);
3546 
3547         mutex_enter(&connp->conn_lock);
3548         /*
3549          * If laddr is unspecified then we look at sin6_src_id.
3550          * We will give precedence to a source address set with IPV6_PKTINFO
3551          * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't
3552          * want ip_attr_connect to select a source (since it can fail) when
3553          * IPV6_PKTINFO is specified.
3554          * If this doesn't result in a source address then we get a source
3555          * from ip_attr_connect() below.
3556          */
3557         v6src = connp->conn_saddr_v6;
3558         if (sin != NULL) {
3559                 IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst);
3560                 dstport = sin->sin_port;
3561                 flowinfo = 0;
3562                 ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
3563                 ixa->ixa_flags |= IXAF_IS_IPV4;
3564         } else if (sin6 != NULL) {
3565                 boolean_t v4mapped;
3566                 uint_t srcid;
3567 
3568                 v6dst = sin6->sin6_addr;
3569                 dstport = sin6->sin6_port;
3570                 flowinfo = sin6->sin6_flowinfo;
3571                 srcid = sin6->__sin6_src_id;
3572                 if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) {
3573                         ixa->ixa_scopeid = sin6->sin6_scope_id;
3574                         ixa->ixa_flags |= IXAF_SCOPEID_SET;
3575                 } else {
3576                         ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
3577                 }
3578                 v4mapped = IN6_IS_ADDR_V4MAPPED(&v6dst);
3579                 if (v4mapped)
3580                         ixa->ixa_flags |= IXAF_IS_IPV4;
3581                 else
3582                         ixa->ixa_flags &= ~IXAF_IS_IPV4;
3583                 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
3584                         if (!ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
3585                             v4mapped, connp->conn_netstack)) {
3586                                 /* Mismatched v4mapped/v6 specified by srcid. */
3587                                 mutex_exit(&connp->conn_lock);
3588                                 error = EADDRNOTAVAIL;
3589                                 goto failed;    /* Does freemsg() and mib. */
3590                         }
3591                 }
3592         } else {
3593                 /* Connected case */
3594                 v6dst = connp->conn_faddr_v6;
3595                 flowinfo = connp->conn_flowinfo;
3596         }
3597         mutex_exit(&connp->conn_lock);
3598         /* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. */
3599         if (ipp->ipp_fields & IPPF_ADDR) {
3600                 if (ixa->ixa_flags & IXAF_IS_IPV4) {
3601                         if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
3602                                 v6src = ipp->ipp_addr;
3603                 } else {
3604                         if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
3605                                 v6src = ipp->ipp_addr;
3606                 }
3607         }
3608         /*
3609          * Allow source not assigned to the system
3610          * only if it is not a local addresses
3611          */
3612         if (!V6_OR_V4_INADDR_ANY(v6src)) {
3613                 ip_laddr_t laddr_type;
3614 
3615                 if (ixa->ixa_flags & IXAF_IS_IPV4) {
3616                         ipaddr_t v4src;
3617 
3618                         IN6_V4MAPPED_TO_IPADDR(&v6src, v4src);
3619                         laddr_type = ip_laddr_verify_v4(v4src, ixa->ixa_zoneid,
3620                             is->is_netstack->netstack_ip, B_FALSE);
3621                 } else {
3622                         laddr_type = ip_laddr_verify_v6(&v6src, ixa->ixa_zoneid,
3623                             is->is_netstack->netstack_ip, B_FALSE, B_FALSE);
3624                 }
3625                 if (laddr_type != IPVL_UNICAST_UP)
3626                         ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE;
3627         }
3628 
3629         ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop);
3630         error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport,
3631             &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST);
3632 
3633         switch (error) {
3634         case 0:
3635                 break;
3636         case EADDRNOTAVAIL:
3637                 /*
3638                  * IXAF_VERIFY_SOURCE tells us to pick a better source.
3639                  * Don't have the application see that errno
3640                  */
3641                 error = ENETUNREACH;
3642                 goto failed;
3643         case ENETDOWN:
3644                 /*
3645                  * Have !ipif_addr_ready address; drop packet silently
3646                  * until we can get applications to not send until we
3647                  * are ready.
3648                  */
3649                 error = 0;
3650                 goto failed;
3651         case EHOSTUNREACH:
3652         case ENETUNREACH:
3653                 if (ixa->ixa_ire != NULL) {
3654                         /*
3655                          * Let conn_ip_output/ire_send_noroute return
3656                          * the error and send any local ICMP error.
3657                          */
3658                         error = 0;
3659                         break;
3660                 }
3661                 /* FALLTHRU */
3662         default:
3663         failed:
3664                 freemsg(mp);
3665                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3666                 goto done;
3667         }
3668 
3669         /*
3670          * We might be going to a different destination than last time,
3671          * thus check that TX allows the communication and compute any
3672          * needed label.
3673          *
3674          * TSOL Note: We have an exclusive ipp and ixa for this thread so we
3675          * don't have to worry about concurrent threads.
3676          */
3677         if (is_system_labeled()) {
3678                 /*
3679                  * Check whether Trusted Solaris policy allows communication
3680                  * with this host, and pretend that the destination is
3681                  * unreachable if not.
3682                  * Compute any needed label and place it in ipp_label_v4/v6.
3683                  *
3684                  * Later conn_build_hdr_template/conn_prepend_hdr takes
3685                  * ipp_label_v4/v6 to form the packet.
3686                  *
3687                  * Tsol note: We have ipp structure local to this thread so
3688                  * no locking is needed.
3689                  */
3690                 error = conn_update_label(connp, ixa, &v6dst, ipp);
3691                 if (error != 0) {
3692                         freemsg(mp);
3693                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3694                         goto done;
3695                 }
3696         }
3697         mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, flowinfo, mp,
3698             &error);
3699         if (mp == NULL) {
3700                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3701                 ASSERT(error != 0);
3702                 goto done;
3703         }
3704         if (ixa->ixa_pktlen > IP_MAXPACKET) {
3705                 error = EMSGSIZE;
3706                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3707                 freemsg(mp);
3708                 goto done;
3709         }
3710 
3711         /* Policy might differ for different ICMP type/code */
3712         mp = icmp_output_attach_policy(mp, connp, ixa);
3713         if (mp == NULL) {
3714                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3715                 error = EHOSTUNREACH;   /* IPsec policy failure */
3716                 goto done;
3717         }
3718 
3719         /* We're done.  Pass the packet to ip. */
3720         BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3721 
3722         error = conn_ip_output(mp, ixa);
3723         if (!connp->conn_unspec_src)
3724                 ixa->ixa_flags |= IXAF_VERIFY_SOURCE;
3725         /* No rawipOutErrors if an error since IP increases its error counter */
3726         switch (error) {
3727         case 0:
3728                 break;
3729         case EWOULDBLOCK:
3730                 (void) ixa_check_drain_insert(connp, ixa);
3731                 error = 0;
3732                 break;
3733         case EADDRNOTAVAIL:
3734                 /*
3735                  * IXAF_VERIFY_SOURCE tells us to pick a better source.
3736                  * Don't have the application see that errno
3737                  */
3738                 error = ENETUNREACH;
3739                 /* FALLTHRU */
3740         default:
3741                 mutex_enter(&connp->conn_lock);
3742                 /*
3743                  * Clear the source and v6lastdst so we call ip_attr_connect
3744                  * for the next packet and try to pick a better source.
3745                  */
3746                 if (connp->conn_mcbc_bind)
3747                         connp->conn_saddr_v6 = ipv6_all_zeros;
3748                 else
3749                         connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
3750                 connp->conn_v6lastdst = ipv6_all_zeros;
3751                 mutex_exit(&connp->conn_lock);
3752                 break;
3753         }
3754 done:
3755         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3756         ixa->ixa_cred = connp->conn_cred; /* Restore */
3757         ixa->ixa_cpid = connp->conn_cpid;
3758         ixa_refrele(ixa);
3759         ip_pkt_free(ipp);
3760         kmem_free(ipp, sizeof (*ipp));
3761         return (error);
3762 }
3763 
3764 /*
3765  * Handle sending an M_DATA for a connected socket.
3766  * Handles both IPv4 and IPv6.
3767  */
3768 int
3769 icmp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid)
3770 {
3771         icmp_t          *icmp = connp->conn_icmp;
3772         icmp_stack_t    *is = icmp->icmp_is;
3773         int             error;
3774         ip_xmit_attr_t  *ixa;
3775         boolean_t       do_ipsec;
3776 
3777         /*
3778          * If no other thread is using conn_ixa this just gets a reference to
3779          * conn_ixa. Otherwise we get a safe copy of conn_ixa.
3780          */
3781         ixa = conn_get_ixa(connp, B_FALSE);
3782         if (ixa == NULL) {
3783                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3784                 freemsg(mp);
3785                 return (ENOMEM);
3786         }
3787 
3788         ASSERT(cr != NULL);
3789         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3790         ixa->ixa_cred = cr;
3791         ixa->ixa_cpid = pid;
3792 
3793         /* Defer IPsec if it might need to look at ICMP type/code */
3794         switch (ixa->ixa_protocol) {
3795         case IPPROTO_ICMP:
3796         case IPPROTO_ICMPV6:
3797                 do_ipsec = B_FALSE;
3798                 break;
3799         default:
3800                 do_ipsec = B_TRUE;
3801         }
3802 
3803         mutex_enter(&connp->conn_lock);
3804         mp = icmp_prepend_header_template(connp, ixa, mp,
3805             &connp->conn_saddr_v6, connp->conn_flowinfo, &error);
3806 
3807         if (mp == NULL) {
3808                 ASSERT(error != 0);
3809                 mutex_exit(&connp->conn_lock);
3810                 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3811                 ixa->ixa_cred = connp->conn_cred; /* Restore */
3812                 ixa->ixa_cpid = connp->conn_cpid;
3813                 ixa_refrele(ixa);
3814                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3815                 freemsg(mp);
3816                 return (error);
3817         }
3818 
3819         if (!do_ipsec) {
3820                 /* Policy might differ for different ICMP type/code */
3821                 mp = icmp_output_attach_policy(mp, connp, ixa);
3822                 if (mp == NULL) {
3823                         mutex_exit(&connp->conn_lock);
3824                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3825                         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3826                         ixa->ixa_cred = connp->conn_cred; /* Restore */
3827                         ixa->ixa_cpid = connp->conn_cpid;
3828                         ixa_refrele(ixa);
3829                         return (EHOSTUNREACH);  /* IPsec policy failure */
3830                 }
3831         }
3832 
3833         /*
3834          * In case we got a safe copy of conn_ixa, or if opt_set made us a new
3835          * safe copy, then we need to fill in any pointers in it.
3836          */
3837         if (ixa->ixa_ire == NULL) {
3838                 in6_addr_t      faddr, saddr;
3839                 in6_addr_t      nexthop;
3840                 in_port_t       fport;
3841 
3842                 saddr = connp->conn_saddr_v6;
3843                 faddr = connp->conn_faddr_v6;
3844                 fport = connp->conn_fport;
3845                 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &faddr, &nexthop);
3846                 mutex_exit(&connp->conn_lock);
3847 
3848                 error = ip_attr_connect(connp, ixa, &saddr, &faddr, &nexthop,
3849                     fport, NULL, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
3850                     (do_ipsec ? IPDF_IPSEC : 0));
3851                 switch (error) {
3852                 case 0:
3853                         break;
3854                 case EADDRNOTAVAIL:
3855                         /*
3856                          * IXAF_VERIFY_SOURCE tells us to pick a better source.
3857                          * Don't have the application see that errno
3858                          */
3859                         error = ENETUNREACH;
3860                         goto failed;
3861                 case ENETDOWN:
3862                         /*
3863                          * Have !ipif_addr_ready address; drop packet silently
3864                          * until we can get applications to not send until we
3865                          * are ready.
3866                          */
3867                         error = 0;
3868                         goto failed;
3869                 case EHOSTUNREACH:
3870                 case ENETUNREACH:
3871                         if (ixa->ixa_ire != NULL) {
3872                                 /*
3873                                  * Let conn_ip_output/ire_send_noroute return
3874                                  * the error and send any local ICMP error.
3875                                  */
3876                                 error = 0;
3877                                 break;
3878                         }
3879                         /* FALLTHRU */
3880                 default:
3881                 failed:
3882                         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3883                         ixa->ixa_cred = connp->conn_cred; /* Restore */
3884                         ixa->ixa_cpid = connp->conn_cpid;
3885                         ixa_refrele(ixa);
3886                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3887                         freemsg(mp);
3888                         return (error);
3889                 }
3890         } else {
3891                 /* Done with conn_t */
3892                 mutex_exit(&connp->conn_lock);
3893         }
3894 
3895         /* We're done.  Pass the packet to ip. */
3896         BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
3897 
3898         error = conn_ip_output(mp, ixa);
3899         /* No rawipOutErrors if an error since IP increases its error counter */
3900         switch (error) {
3901         case 0:
3902                 break;
3903         case EWOULDBLOCK:
3904                 (void) ixa_check_drain_insert(connp, ixa);
3905                 error = 0;
3906                 break;
3907         case EADDRNOTAVAIL:
3908                 /*
3909                  * IXAF_VERIFY_SOURCE tells us to pick a better source.
3910                  * Don't have the application see that errno
3911                  */
3912                 error = ENETUNREACH;
3913                 break;
3914         }
3915         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3916         ixa->ixa_cred = connp->conn_cred; /* Restore */
3917         ixa->ixa_cpid = connp->conn_cpid;
3918         ixa_refrele(ixa);
3919         return (error);
3920 }
3921 
3922 /*
3923  * Handle sending an M_DATA to the last destination.
3924  * Handles both IPv4 and IPv6.
3925  *
3926  * NOTE: The caller must hold conn_lock and we drop it here.
3927  */
3928 int
3929 icmp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid,
3930     ip_xmit_attr_t *ixa)
3931 {
3932         icmp_t          *icmp = connp->conn_icmp;
3933         icmp_stack_t    *is = icmp->icmp_is;
3934         int             error;
3935         boolean_t       do_ipsec;
3936 
3937         ASSERT(MUTEX_HELD(&connp->conn_lock));
3938         ASSERT(ixa != NULL);
3939 
3940         ASSERT(cr != NULL);
3941         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3942         ixa->ixa_cred = cr;
3943         ixa->ixa_cpid = pid;
3944 
3945         /* Defer IPsec if it might need to look at ICMP type/code */
3946         switch (ixa->ixa_protocol) {
3947         case IPPROTO_ICMP:
3948         case IPPROTO_ICMPV6:
3949                 do_ipsec = B_FALSE;
3950                 break;
3951         default:
3952                 do_ipsec = B_TRUE;
3953         }
3954 
3955 
3956         mp = icmp_prepend_header_template(connp, ixa, mp,
3957             &connp->conn_v6lastsrc, connp->conn_lastflowinfo, &error);
3958 
3959         if (mp == NULL) {
3960                 ASSERT(error != 0);
3961                 mutex_exit(&connp->conn_lock);
3962                 ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3963                 ixa->ixa_cred = connp->conn_cred; /* Restore */
3964                 ixa->ixa_cpid = connp->conn_cpid;
3965                 ixa_refrele(ixa);
3966                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3967                 freemsg(mp);
3968                 return (error);
3969         }
3970 
3971         if (!do_ipsec) {
3972                 /* Policy might differ for different ICMP type/code */
3973                 mp = icmp_output_attach_policy(mp, connp, ixa);
3974                 if (mp == NULL) {
3975                         mutex_exit(&connp->conn_lock);
3976                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
3977                         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
3978                         ixa->ixa_cred = connp->conn_cred; /* Restore */
3979                         ixa->ixa_cpid = connp->conn_cpid;
3980                         ixa_refrele(ixa);
3981                         return (EHOSTUNREACH);  /* IPsec policy failure */
3982                 }
3983         }
3984 
3985         /*
3986          * In case we got a safe copy of conn_ixa, or if opt_set made us a new
3987          * safe copy, then we need to fill in any pointers in it.
3988          */
3989         if (ixa->ixa_ire == NULL) {
3990                 in6_addr_t      lastdst, lastsrc;
3991                 in6_addr_t      nexthop;
3992                 in_port_t       lastport;
3993 
3994                 lastsrc = connp->conn_v6lastsrc;
3995                 lastdst = connp->conn_v6lastdst;
3996                 lastport = connp->conn_lastdstport;
3997                 ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &lastdst, &nexthop);
3998                 mutex_exit(&connp->conn_lock);
3999 
4000                 error = ip_attr_connect(connp, ixa, &lastsrc, &lastdst,
4001                     &nexthop, lastport, NULL, NULL, IPDF_ALLOW_MCBC |
4002                     IPDF_VERIFY_DST | (do_ipsec ? IPDF_IPSEC : 0));
4003                 switch (error) {
4004                 case 0:
4005                         break;
4006                 case EADDRNOTAVAIL:
4007                         /*
4008                          * IXAF_VERIFY_SOURCE tells us to pick a better source.
4009                          * Don't have the application see that errno
4010                          */
4011                         error = ENETUNREACH;
4012                         goto failed;
4013                 case ENETDOWN:
4014                         /*
4015                          * Have !ipif_addr_ready address; drop packet silently
4016                          * until we can get applications to not send until we
4017                          * are ready.
4018                          */
4019                         error = 0;
4020                         goto failed;
4021                 case EHOSTUNREACH:
4022                 case ENETUNREACH:
4023                         if (ixa->ixa_ire != NULL) {
4024                                 /*
4025                                  * Let conn_ip_output/ire_send_noroute return
4026                                  * the error and send any local ICMP error.
4027                                  */
4028                                 error = 0;
4029                                 break;
4030                         }
4031                         /* FALLTHRU */
4032                 default:
4033                 failed:
4034                         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
4035                         ixa->ixa_cred = connp->conn_cred; /* Restore */
4036                         ixa->ixa_cpid = connp->conn_cpid;
4037                         ixa_refrele(ixa);
4038                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4039                         freemsg(mp);
4040                         return (error);
4041                 }
4042         } else {
4043                 /* Done with conn_t */
4044                 mutex_exit(&connp->conn_lock);
4045         }
4046 
4047         /* We're done.  Pass the packet to ip. */
4048         BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
4049         error = conn_ip_output(mp, ixa);
4050         /* No rawipOutErrors if an error since IP increases its error counter */
4051         switch (error) {
4052         case 0:
4053                 break;
4054         case EWOULDBLOCK:
4055                 (void) ixa_check_drain_insert(connp, ixa);
4056                 error = 0;
4057                 break;
4058         case EADDRNOTAVAIL:
4059                 /*
4060                  * IXAF_VERIFY_SOURCE tells us to pick a better source.
4061                  * Don't have the application see that errno
4062                  */
4063                 error = ENETUNREACH;
4064                 /* FALLTHRU */
4065         default:
4066                 mutex_enter(&connp->conn_lock);
4067                 /*
4068                  * Clear the source and v6lastdst so we call ip_attr_connect
4069                  * for the next packet and try to pick a better source.
4070                  */
4071                 if (connp->conn_mcbc_bind)
4072                         connp->conn_saddr_v6 = ipv6_all_zeros;
4073                 else
4074                         connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
4075                 connp->conn_v6lastdst = ipv6_all_zeros;
4076                 mutex_exit(&connp->conn_lock);
4077                 break;
4078         }
4079         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
4080         ixa->ixa_cred = connp->conn_cred; /* Restore */
4081         ixa->ixa_cpid = connp->conn_cpid;
4082         ixa_refrele(ixa);
4083         return (error);
4084 }
4085 
4086 
4087 /*
4088  * Prepend the header template and then fill in the source and
4089  * flowinfo. The caller needs to handle the destination address since
4090  * it's setting is different if rthdr or source route.
4091  *
4092  * Returns NULL is allocation failed or if the packet would exceed IP_MAXPACKET.
4093  * When it returns NULL it sets errorp.
4094  */
4095 static mblk_t *
4096 icmp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp,
4097     const in6_addr_t *v6src, uint32_t flowinfo, int *errorp)
4098 {
4099         icmp_t          *icmp = connp->conn_icmp;
4100         icmp_stack_t    *is = icmp->icmp_is;
4101         uint_t          pktlen;
4102         uint_t          copylen;
4103         uint8_t         *iph;
4104         uint_t          ip_hdr_length;
4105         uint32_t        cksum;
4106         ip_pkt_t        *ipp;
4107 
4108         ASSERT(MUTEX_HELD(&connp->conn_lock));
4109 
4110         /*
4111          * Copy the header template.
4112          */
4113         copylen = connp->conn_ht_iphc_len;
4114         pktlen = copylen + msgdsize(mp);
4115         if (pktlen > IP_MAXPACKET) {
4116                 freemsg(mp);
4117                 *errorp = EMSGSIZE;
4118                 return (NULL);
4119         }
4120         ixa->ixa_pktlen = pktlen;
4121 
4122         /* check/fix buffer config, setup pointers into it */
4123         iph = mp->b_rptr - copylen;
4124         if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) {
4125                 mblk_t *mp1;
4126 
4127                 mp1 = allocb(copylen + is->is_wroff_extra, BPRI_MED);
4128                 if (mp1 == NULL) {
4129                         freemsg(mp);
4130                         *errorp = ENOMEM;
4131                         return (NULL);
4132                 }
4133                 mp1->b_wptr = DB_LIM(mp1);
4134                 mp1->b_cont = mp;
4135                 mp = mp1;
4136                 iph = (mp->b_wptr - copylen);
4137         }
4138         mp->b_rptr = iph;
4139         bcopy(connp->conn_ht_iphc, iph, copylen);
4140         ip_hdr_length = (uint_t)(connp->conn_ht_ulp - connp->conn_ht_iphc);
4141 
4142         ixa->ixa_ip_hdr_length = ip_hdr_length;
4143 
4144         /*
4145          * Prepare for ICMPv6 checksum done in IP.
4146          *
4147          * icmp_build_hdr_template has already massaged any routing header
4148          * and placed the result in conn_sum.
4149          *
4150          * We make it easy for IP to include our pseudo header
4151          * by putting our length (and any routing header adjustment)
4152          * in the ICMPv6 checksum field.
4153          */
4154         cksum = pktlen - ip_hdr_length;
4155 
4156         cksum += connp->conn_sum;
4157         cksum = (cksum >> 16) + (cksum & 0xFFFF);
4158         ASSERT(cksum < 0x10000);
4159 
4160         ipp = &connp->conn_xmit_ipp;
4161         if (ixa->ixa_flags & IXAF_IS_IPV4) {
4162                 ipha_t  *ipha = (ipha_t *)iph;
4163 
4164                 ipha->ipha_length = htons((uint16_t)pktlen);
4165 
4166                 /* if IP_PKTINFO specified an addres it wins over bind() */
4167                 if ((ipp->ipp_fields & IPPF_ADDR) &&
4168                     IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
4169                         ASSERT(ipp->ipp_addr_v4 != INADDR_ANY);
4170                         ipha->ipha_src = ipp->ipp_addr_v4;
4171                 } else {
4172                         IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src);
4173                 }
4174         } else {
4175                 ip6_t *ip6h = (ip6_t *)iph;
4176                 uint_t  cksum_offset = 0;
4177 
4178                 ip6h->ip6_plen =  htons((uint16_t)(pktlen - IPV6_HDR_LEN));
4179 
4180                 /* if IP_PKTINFO specified an addres it wins over bind() */
4181                 if ((ipp->ipp_fields & IPPF_ADDR) &&
4182                     !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) {
4183                         ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr));
4184                         ip6h->ip6_src = ipp->ipp_addr;
4185                 } else {
4186                         ip6h->ip6_src = *v6src;
4187                 }
4188                 ip6h->ip6_vcf =
4189                     (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
4190                     (flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
4191                 if (ipp->ipp_fields & IPPF_TCLASS) {
4192                         /* Overrides the class part of flowinfo */
4193                         ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
4194                             ipp->ipp_tclass);
4195                 }
4196 
4197                 if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) {
4198                         if (connp->conn_proto == IPPROTO_ICMPV6) {
4199                                 cksum_offset = ixa->ixa_ip_hdr_length +
4200                                     offsetof(icmp6_t, icmp6_cksum);
4201                         } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) {
4202                                 cksum_offset = ixa->ixa_ip_hdr_length +
4203                                     ixa->ixa_raw_cksum_offset;
4204                         }
4205                 }
4206                 if (cksum_offset != 0) {
4207                         uint16_t *ptr;
4208 
4209                         /* Make sure the checksum fits in the first mblk */
4210                         if (cksum_offset + sizeof (short) > MBLKL(mp)) {
4211                                 mblk_t *mp1;
4212 
4213                                 mp1 = msgpullup(mp,
4214                                     cksum_offset + sizeof (short));
4215                                 freemsg(mp);
4216                                 if (mp1 == NULL) {
4217                                         *errorp = ENOMEM;
4218                                         return (NULL);
4219                                 }
4220                                 mp = mp1;
4221                                 iph = mp->b_rptr;
4222                                 ip6h = (ip6_t *)iph;
4223                         }
4224                         ptr = (uint16_t *)(mp->b_rptr + cksum_offset);
4225                         *ptr = htons(cksum);
4226                 }
4227         }
4228 
4229         return (mp);
4230 }
4231 
4232 /*
4233  * This routine handles all messages passed downstream.  It either
4234  * consumes the message or passes it downstream; it never queues a
4235  * a message.
4236  */
4237 void
4238 icmp_wput(queue_t *q, mblk_t *mp)
4239 {
4240         sin6_t          *sin6;
4241         sin_t           *sin = NULL;
4242         uint_t          srcid;
4243         conn_t          *connp = Q_TO_CONN(q);
4244         icmp_t          *icmp = connp->conn_icmp;
4245         int             error = 0;
4246         struct sockaddr *addr = NULL;
4247         socklen_t       addrlen;
4248         icmp_stack_t    *is = icmp->icmp_is;
4249         struct T_unitdata_req *tudr;
4250         mblk_t          *data_mp;
4251         cred_t          *cr;
4252         pid_t           pid;
4253 
4254         /*
4255          * We directly handle several cases here: T_UNITDATA_REQ message
4256          * coming down as M_PROTO/M_PCPROTO and M_DATA messages for connected
4257          * socket.
4258          */
4259         switch (DB_TYPE(mp)) {
4260         case M_DATA:
4261                 /* sockfs never sends down M_DATA */
4262                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4263                 freemsg(mp);
4264                 return;
4265 
4266         case M_PROTO:
4267         case M_PCPROTO:
4268                 tudr = (struct T_unitdata_req *)mp->b_rptr;
4269                 if (MBLKL(mp) < sizeof (*tudr) ||
4270                     ((t_primp_t)mp->b_rptr)->type != T_UNITDATA_REQ) {
4271                         icmp_wput_other(q, mp);
4272                         return;
4273                 }
4274                 break;
4275 
4276         default:
4277                 icmp_wput_other(q, mp);
4278                 return;
4279         }
4280 
4281         /* Handle valid T_UNITDATA_REQ here */
4282         data_mp = mp->b_cont;
4283         if (data_mp == NULL) {
4284                 error = EPROTO;
4285                 goto ud_error2;
4286         }
4287         mp->b_cont = NULL;
4288 
4289         if (!MBLKIN(mp, 0, tudr->DEST_offset + tudr->DEST_length)) {
4290                 error = EADDRNOTAVAIL;
4291                 goto ud_error2;
4292         }
4293 
4294         /*
4295          * All Solaris components should pass a db_credp
4296          * for this message, hence we ASSERT.
4297          * On production kernels we return an error to be robust against
4298          * random streams modules sitting on top of us.
4299          */
4300         cr = msg_getcred(mp, &pid);
4301         ASSERT(cr != NULL);
4302         if (cr == NULL) {
4303                 error = EINVAL;
4304                 goto ud_error2;
4305         }
4306 
4307         /*
4308          * If a port has not been bound to the stream, fail.
4309          * This is not a problem when sockfs is directly
4310          * above us, because it will ensure that the socket
4311          * is first bound before allowing data to be sent.
4312          */
4313         if (icmp->icmp_state == TS_UNBND) {
4314                 error = EPROTO;
4315                 goto ud_error2;
4316         }
4317         addr = (struct sockaddr *)&mp->b_rptr[tudr->DEST_offset];
4318         addrlen = tudr->DEST_length;
4319 
4320         switch (connp->conn_family) {
4321         case AF_INET6:
4322                 sin6 = (sin6_t *)addr;
4323                 if (!OK_32PTR((char *)sin6) || (addrlen != sizeof (sin6_t)) ||
4324                     (sin6->sin6_family != AF_INET6)) {
4325                         error = EADDRNOTAVAIL;
4326                         goto ud_error2;
4327                 }
4328 
4329                 /* No support for mapped addresses on raw sockets */
4330                 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
4331                         error = EADDRNOTAVAIL;
4332                         goto ud_error2;
4333                 }
4334                 srcid = sin6->__sin6_src_id;
4335 
4336                 /*
4337                  * If the local address is a mapped address return
4338                  * an error.
4339                  * It would be possible to send an IPv6 packet but the
4340                  * response would never make it back to the application
4341                  * since it is bound to a mapped address.
4342                  */
4343                 if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) {
4344                         error = EADDRNOTAVAIL;
4345                         goto ud_error2;
4346                 }
4347 
4348                 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
4349                         sin6->sin6_addr = ipv6_loopback;
4350 
4351                 if (tudr->OPT_length != 0) {
4352                         /*
4353                          * If we are connected then the destination needs to be
4354                          * the same as the connected one.
4355                          */
4356                         if (icmp->icmp_state == TS_DATA_XFER &&
4357                             !conn_same_as_last_v6(connp, sin6)) {
4358                                 error = EISCONN;
4359                                 goto ud_error2;
4360                         }
4361                         error = icmp_output_ancillary(connp, NULL, sin6,
4362                             data_mp, mp, NULL, cr, pid);
4363                 } else {
4364                         ip_xmit_attr_t *ixa;
4365 
4366                         /*
4367                          * We have to allocate an ip_xmit_attr_t before we grab
4368                          * conn_lock and we need to hold conn_lock once we've
4369                          * checked conn_same_as_last_v6 to handle concurrent
4370                          * send* calls on a socket.
4371                          */
4372                         ixa = conn_get_ixa(connp, B_FALSE);
4373                         if (ixa == NULL) {
4374                                 error = ENOMEM;
4375                                 goto ud_error2;
4376                         }
4377                         mutex_enter(&connp->conn_lock);
4378 
4379                         if (conn_same_as_last_v6(connp, sin6) &&
4380                             connp->conn_lastsrcid == srcid &&
4381                             ipsec_outbound_policy_current(ixa)) {
4382                                 /* icmp_output_lastdst drops conn_lock */
4383                                 error = icmp_output_lastdst(connp, data_mp, cr,
4384                                     pid, ixa);
4385                         } else {
4386                                 /* icmp_output_newdst drops conn_lock */
4387                                 error = icmp_output_newdst(connp, data_mp, NULL,
4388                                     sin6, cr, pid, ixa);
4389                         }
4390                         ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
4391                 }
4392                 if (error == 0) {
4393                         freeb(mp);
4394                         return;
4395                 }
4396                 break;
4397 
4398         case AF_INET:
4399                 sin = (sin_t *)addr;
4400                 if ((!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t)) ||
4401                     (sin->sin_family != AF_INET)) {
4402                         error = EADDRNOTAVAIL;
4403                         goto ud_error2;
4404                 }
4405                 if (sin->sin_addr.s_addr == INADDR_ANY)
4406                         sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
4407 
4408                 /* Protocol 255 contains full IP headers */
4409                 /* Read without holding lock */
4410                 if (icmp->icmp_hdrincl) {
4411                         if (MBLKL(data_mp) < IP_SIMPLE_HDR_LENGTH) {
4412                                 if (!pullupmsg(data_mp, IP_SIMPLE_HDR_LENGTH)) {
4413                                         error = EINVAL;
4414                                         goto ud_error2;
4415                                 }
4416                         }
4417                         error = icmp_output_hdrincl(connp, data_mp, cr, pid);
4418                         if (error == 0) {
4419                                 freeb(mp);
4420                                 return;
4421                         }
4422                         /* data_mp consumed above */
4423                         data_mp = NULL;
4424                         goto ud_error2;
4425                 }
4426 
4427                 if (tudr->OPT_length != 0) {
4428                         /*
4429                          * If we are connected then the destination needs to be
4430                          * the same as the connected one.
4431                          */
4432                         if (icmp->icmp_state == TS_DATA_XFER &&
4433                             !conn_same_as_last_v4(connp, sin)) {
4434                                 error = EISCONN;
4435                                 goto ud_error2;
4436                         }
4437                         error = icmp_output_ancillary(connp, sin, NULL,
4438                             data_mp, mp, NULL, cr, pid);
4439                 } else {
4440                         ip_xmit_attr_t *ixa;
4441 
4442                         /*
4443                          * We have to allocate an ip_xmit_attr_t before we grab
4444                          * conn_lock and we need to hold conn_lock once we've
4445                          * checked conn_same_as_last_v4 to handle concurrent
4446                          * send* calls on a socket.
4447                          */
4448                         ixa = conn_get_ixa(connp, B_FALSE);
4449                         if (ixa == NULL) {
4450                                 error = ENOMEM;
4451                                 goto ud_error2;
4452                         }
4453                         mutex_enter(&connp->conn_lock);
4454 
4455                         if (conn_same_as_last_v4(connp, sin) &&
4456                             ipsec_outbound_policy_current(ixa)) {
4457                                 /* icmp_output_lastdst drops conn_lock */
4458                                 error = icmp_output_lastdst(connp, data_mp, cr,
4459                                     pid, ixa);
4460                         } else {
4461                                 /* icmp_output_newdst drops conn_lock */
4462                                 error = icmp_output_newdst(connp, data_mp, sin,
4463                                     NULL, cr, pid, ixa);
4464                         }
4465                         ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
4466                 }
4467                 if (error == 0) {
4468                         freeb(mp);
4469                         return;
4470                 }
4471                 break;
4472         }
4473         ASSERT(mp != NULL);
4474         /* mp is freed by the following routine */
4475         icmp_ud_err(q, mp, (t_scalar_t)error);
4476         return;
4477 
4478 ud_error2:
4479         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4480         freemsg(data_mp);
4481         ASSERT(mp != NULL);
4482         /* mp is freed by the following routine */
4483         icmp_ud_err(q, mp, (t_scalar_t)error);
4484 }
4485 
4486 /*
4487  * Handle the case of the IP address or flow label being different
4488  * for both IPv4 and IPv6.
4489  *
4490  * NOTE: The caller must hold conn_lock and we drop it here.
4491  */
4492 static int
4493 icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, sin6_t *sin6,
4494     cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa)
4495 {
4496         icmp_t          *icmp = connp->conn_icmp;
4497         icmp_stack_t    *is = icmp->icmp_is;
4498         int             error;
4499         ip_xmit_attr_t  *oldixa;
4500         boolean_t       do_ipsec;
4501         uint_t          srcid;
4502         uint32_t        flowinfo;
4503         in6_addr_t      v6src;
4504         in6_addr_t      v6dst;
4505         in6_addr_t      v6nexthop;
4506         in_port_t       dstport;
4507 
4508         ASSERT(MUTEX_HELD(&connp->conn_lock));
4509         ASSERT(ixa != NULL);
4510 
4511         /*
4512          * We hold conn_lock across all the use and modifications of
4513          * the conn_lastdst, conn_ixa, and conn_xmit_ipp to ensure that they
4514          * stay consistent.
4515          */
4516 
4517         ASSERT(cr != NULL);
4518         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
4519         ixa->ixa_cred = cr;
4520         ixa->ixa_cpid = pid;
4521         if (is_system_labeled()) {
4522                 /* We need to restart with a label based on the cred */
4523                 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
4524         }
4525         /*
4526          * If we are connected then the destination needs to be the
4527          * same as the connected one, which is not the case here since we
4528          * checked for that above.
4529          */
4530         if (icmp->icmp_state == TS_DATA_XFER) {
4531                 mutex_exit(&connp->conn_lock);
4532                 error = EISCONN;
4533                 goto ud_error;
4534         }
4535 
4536         /* In case previous destination was multicast or multirt */
4537         ip_attr_newdst(ixa);
4538 
4539         /*
4540          * If laddr is unspecified then we look at sin6_src_id.
4541          * We will give precedence to a source address set with IPV6_PKTINFO
4542          * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't
4543          * want ip_attr_connect to select a source (since it can fail) when
4544          * IPV6_PKTINFO is specified.
4545          * If this doesn't result in a source address then we get a source
4546          * from ip_attr_connect() below.
4547          */
4548         v6src = connp->conn_saddr_v6;
4549         if (sin != NULL) {
4550                 IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst);
4551                 dstport = sin->sin_port;
4552                 flowinfo = 0;
4553                 /* Don't bother with ip_srcid_find_id(), but indicate anyway. */
4554                 srcid = 0;
4555                 ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
4556                 ixa->ixa_flags |= IXAF_IS_IPV4;
4557         } else {
4558                 boolean_t v4mapped;
4559 
4560                 v6dst = sin6->sin6_addr;
4561                 dstport = sin6->sin6_port;
4562                 flowinfo = sin6->sin6_flowinfo;
4563                 srcid = sin6->__sin6_src_id;
4564                 if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) {
4565                         ixa->ixa_scopeid = sin6->sin6_scope_id;
4566                         ixa->ixa_flags |= IXAF_SCOPEID_SET;
4567                 } else {
4568                         ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
4569                 }
4570                 v4mapped = IN6_IS_ADDR_V4MAPPED(&v6dst);
4571                 if (v4mapped)
4572                         ixa->ixa_flags |= IXAF_IS_IPV4;
4573                 else
4574                         ixa->ixa_flags &= ~IXAF_IS_IPV4;
4575                 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) {
4576                         if (!ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp),
4577                             v4mapped, connp->conn_netstack)) {
4578                                 /* Mismatched v4mapped/v6 specified by srcid. */
4579                                 mutex_exit(&connp->conn_lock);
4580                                 error = EADDRNOTAVAIL;
4581                                 goto ud_error;
4582                         }
4583                 }
4584         }
4585         /* Handle IP_PKTINFO/IPV6_PKTINFO setting source address. */
4586         if (connp->conn_xmit_ipp.ipp_fields & IPPF_ADDR) {
4587                 ip_pkt_t *ipp = &connp->conn_xmit_ipp;
4588 
4589                 if (ixa->ixa_flags & IXAF_IS_IPV4) {
4590                         if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
4591                                 v6src = ipp->ipp_addr;
4592                 } else {
4593                         if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
4594                                 v6src = ipp->ipp_addr;
4595                 }
4596         }
4597 
4598         /* Defer IPsec if it might need to look at ICMP type/code */
4599         switch (ixa->ixa_protocol) {
4600         case IPPROTO_ICMP:
4601         case IPPROTO_ICMPV6:
4602                 do_ipsec = B_FALSE;
4603                 break;
4604         default:
4605                 do_ipsec = B_TRUE;
4606         }
4607 
4608         ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &v6dst, &v6nexthop);
4609         mutex_exit(&connp->conn_lock);
4610 
4611         error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport,
4612             &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST |
4613             (do_ipsec ? IPDF_IPSEC : 0));
4614         switch (error) {
4615         case 0:
4616                 break;
4617         case EADDRNOTAVAIL:
4618                 /*
4619                  * IXAF_VERIFY_SOURCE tells us to pick a better source.
4620                  * Don't have the application see that errno
4621                  */
4622                 error = ENETUNREACH;
4623                 goto failed;
4624         case ENETDOWN:
4625                 /*
4626                  * Have !ipif_addr_ready address; drop packet silently
4627                  * until we can get applications to not send until we
4628                  * are ready.
4629                  */
4630                 error = 0;
4631                 goto failed;
4632         case EHOSTUNREACH:
4633         case ENETUNREACH:
4634                 if (ixa->ixa_ire != NULL) {
4635                         /*
4636                          * Let conn_ip_output/ire_send_noroute return
4637                          * the error and send any local ICMP error.
4638                          */
4639                         error = 0;
4640                         break;
4641                 }
4642                 /* FALLTHRU */
4643         default:
4644         failed:
4645                 goto ud_error;
4646         }
4647 
4648         mutex_enter(&connp->conn_lock);
4649         /*
4650          * While we dropped the lock some other thread might have connected
4651          * this socket. If so we bail out with EISCONN to ensure that the
4652          * connecting thread is the one that updates conn_ixa, conn_ht_*
4653          * and conn_*last*.
4654          */
4655         if (icmp->icmp_state == TS_DATA_XFER) {
4656                 mutex_exit(&connp->conn_lock);
4657                 error = EISCONN;
4658                 goto ud_error;
4659         }
4660 
4661         /*
4662          * We need to rebuild the headers if
4663          *  - we are labeling packets (could be different for different
4664          *    destinations)
4665          *  - we have a source route (or routing header) since we need to
4666          *    massage that to get the pseudo-header checksum
4667          *  - a socket option with COA_HEADER_CHANGED has been set which
4668          *    set conn_v6lastdst to zero.
4669          *
4670          * Otherwise the prepend function will just update the src, dst,
4671          * and flow label.
4672          */
4673         if (is_system_labeled()) {
4674                 /* TX MLP requires SCM_UCRED and don't have that here */
4675                 if (connp->conn_mlp_type != mlptSingle) {
4676                         mutex_exit(&connp->conn_lock);
4677                         error = ECONNREFUSED;
4678                         goto ud_error;
4679                 }
4680                 /*
4681                  * Check whether Trusted Solaris policy allows communication
4682                  * with this host, and pretend that the destination is
4683                  * unreachable if not.
4684                  * Compute any needed label and place it in ipp_label_v4/v6.
4685                  *
4686                  * Later conn_build_hdr_template/conn_prepend_hdr takes
4687                  * ipp_label_v4/v6 to form the packet.
4688                  *
4689                  * Tsol note: Since we hold conn_lock we know no other
4690                  * thread manipulates conn_xmit_ipp.
4691                  */
4692                 error = conn_update_label(connp, ixa, &v6dst,
4693                     &connp->conn_xmit_ipp);
4694                 if (error != 0) {
4695                         mutex_exit(&connp->conn_lock);
4696                         goto ud_error;
4697                 }
4698                 /* Rebuild the header template */
4699                 error = icmp_build_hdr_template(connp, &v6src, &v6dst,
4700                     flowinfo);
4701                 if (error != 0) {
4702                         mutex_exit(&connp->conn_lock);
4703                         goto ud_error;
4704                 }
4705         } else if (connp->conn_xmit_ipp.ipp_fields &
4706             (IPPF_IPV4_OPTIONS|IPPF_RTHDR) ||
4707             IN6_IS_ADDR_UNSPECIFIED(&connp->conn_v6lastdst)) {
4708                 /* Rebuild the header template */
4709                 error = icmp_build_hdr_template(connp, &v6src, &v6dst,
4710                     flowinfo);
4711                 if (error != 0) {
4712                         mutex_exit(&connp->conn_lock);
4713                         goto ud_error;
4714                 }
4715         } else {
4716                 /* Simply update the destination address if no source route */
4717                 if (ixa->ixa_flags & IXAF_IS_IPV4) {
4718                         ipha_t  *ipha = (ipha_t *)connp->conn_ht_iphc;
4719 
4720                         IN6_V4MAPPED_TO_IPADDR(&v6dst, ipha->ipha_dst);
4721                         if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) {
4722                                 ipha->ipha_fragment_offset_and_flags |=
4723                                     IPH_DF_HTONS;
4724                         } else {
4725                                 ipha->ipha_fragment_offset_and_flags &=
4726                                     ~IPH_DF_HTONS;
4727                         }
4728                 } else {
4729                         ip6_t *ip6h = (ip6_t *)connp->conn_ht_iphc;
4730                         ip6h->ip6_dst = v6dst;
4731                 }
4732         }
4733 
4734         /*
4735          * Remember the dst etc which corresponds to the built header
4736          * template and conn_ixa.
4737          */
4738         oldixa = conn_replace_ixa(connp, ixa);
4739         connp->conn_v6lastdst = v6dst;
4740         connp->conn_lastflowinfo = flowinfo;
4741         connp->conn_lastscopeid = ixa->ixa_scopeid;
4742         connp->conn_lastsrcid = srcid;
4743         /* Also remember a source to use together with lastdst */
4744         connp->conn_v6lastsrc = v6src;
4745 
4746         data_mp = icmp_prepend_header_template(connp, ixa, data_mp, &v6src,
4747             flowinfo, &error);
4748 
4749         /* Done with conn_t */
4750         mutex_exit(&connp->conn_lock);
4751         ixa_refrele(oldixa);
4752 
4753         if (data_mp == NULL) {
4754                 ASSERT(error != 0);
4755                 goto ud_error;
4756         }
4757 
4758         if (!do_ipsec) {
4759                 /* Policy might differ for different ICMP type/code */
4760                 data_mp = icmp_output_attach_policy(data_mp, connp, ixa);
4761                 if (data_mp == NULL) {
4762                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4763                         error = EHOSTUNREACH;   /* IPsec policy failure */
4764                         goto done;
4765                 }
4766         }
4767 
4768         /* We're done.  Pass the packet to ip. */
4769         BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
4770 
4771         error = conn_ip_output(data_mp, ixa);
4772         /* No rawipOutErrors if an error since IP increases its error counter */
4773         switch (error) {
4774         case 0:
4775                 break;
4776         case EWOULDBLOCK:
4777                 (void) ixa_check_drain_insert(connp, ixa);
4778                 error = 0;
4779                 break;
4780         case EADDRNOTAVAIL:
4781                 /*
4782                  * IXAF_VERIFY_SOURCE tells us to pick a better source.
4783                  * Don't have the application see that errno
4784                  */
4785                 error = ENETUNREACH;
4786                 /* FALLTHRU */
4787         default:
4788                 mutex_enter(&connp->conn_lock);
4789                 /*
4790                  * Clear the source and v6lastdst so we call ip_attr_connect
4791                  * for the next packet and try to pick a better source.
4792                  */
4793                 if (connp->conn_mcbc_bind)
4794                         connp->conn_saddr_v6 = ipv6_all_zeros;
4795                 else
4796                         connp->conn_saddr_v6 = connp->conn_bound_addr_v6;
4797                 connp->conn_v6lastdst = ipv6_all_zeros;
4798                 mutex_exit(&connp->conn_lock);
4799                 break;
4800         }
4801 done:
4802         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
4803         ixa->ixa_cred = connp->conn_cred; /* Restore */
4804         ixa->ixa_cpid = connp->conn_cpid;
4805         ixa_refrele(ixa);
4806         return (error);
4807 
4808 ud_error:
4809         ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
4810         ixa->ixa_cred = connp->conn_cred; /* Restore */
4811         ixa->ixa_cpid = connp->conn_cpid;
4812         ixa_refrele(ixa);
4813 
4814         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
4815         freemsg(data_mp);
4816         return (error);
4817 }
4818 
     /*
      * Discard a message handed to the write side.  The message is freed
      * without being examined; DEBUG kernels additionally emit a
      * cmn_err(CE_CONT) notice.  The queue argument is not used.
      *
      * NOTE(review): the name suggests this is the write put procedure in
      * effect while a socket is falling back to STREAMS -- confirm against
      * the qinit definitions elsewhere in this file.
      */
4819 /* ARGSUSED */
4820 static void
4821 icmp_wput_fallback(queue_t *q, mblk_t *mp)
4822 {
4823 #ifdef DEBUG
4824         cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n");
4825 #endif
4826         freemsg(mp);
4827 }
4828 
     /*
      * Handle a write-side message according to its STREAMS message type
      * (db_type):
      *  - M_PROTO/M_PCPROTO: dispatch on the TPI primitive type to the
      *    matching handler; every primitive handled here returns directly.
      *  - M_FLUSH: flush data messages from q when FLUSHW is set.
      *  - M_IOCTL: for TI_GETMYNAME/TI_GETPEERNAME, start the copyin of the
      *    caller's strbuf.
      *  - M_IOCDATA: continue the ioctl in icmp_wput_iocdata().
      * Whatever is not consumed above (unrecognized primitives, ioctls and
      * message types, and M_FLUSH after the local flush) is handed to
      * ip_wput_nondata().
      */
4829 static void
4830 icmp_wput_other(queue_t *q, mblk_t *mp)
4831 {
4832         uchar_t *rptr = mp->b_rptr;
4833         struct iocblk *iocp;
4834         conn_t  *connp = Q_TO_CONN(q);
4835         icmp_t  *icmp = connp->conn_icmp;
4836         cred_t *cr;
4837 
4838         switch (mp->b_datap->db_type) {
4839         case M_PROTO:
4840         case M_PCPROTO:
4841                 if (mp->b_wptr - rptr < sizeof (t_scalar_t)) {
4842                         /*
4843                          * If the message does not contain a PRIM_type,
4844                          * throw it away.
4845                          */
4846                         freemsg(mp);
4847                         return;
4848                 }
                     /* Dispatch on the primitive type at the start of the message. */
4849                 switch (((t_primp_t)rptr)->type) {
4850                 case T_ADDR_REQ:
4851                         icmp_addr_req(q, mp);
4852                         return;
4853                 case O_T_BIND_REQ:
4854                 case T_BIND_REQ:
4855                         icmp_tpi_bind(q, mp);
4856                         return;
4857                 case T_CONN_REQ:
4858                         icmp_tpi_connect(q, mp);
4859                         return;
4860                 case T_CAPABILITY_REQ:
4861                         icmp_capability_req(q, mp);
4862                         return;
4863                 case T_INFO_REQ:
4864                         icmp_info_req(q, mp);
4865                         return;
4866                 case T_UNITDATA_REQ:
4867                         /*
4868                          * If a T_UNITDATA_REQ gets here, the address must
4869                          * be bad.  Valid T_UNITDATA_REQs are handled
4870                          * in icmp_wput.
4871                          */
4872                         icmp_ud_err(q, mp, EADDRNOTAVAIL);
4873                         return;
4874                 case T_UNBIND_REQ:
4875                         icmp_tpi_unbind(q, mp);
4876                         return;
4877                 case T_SVR4_OPTMGMT_REQ:
4878                         /*
4879                          * All Solaris components should pass a db_credp
4880                          * for this TPI message, hence we ASSERT.
4881                          * But in case there is some other M_PROTO that looks
4882                          * like a TPI message sent by some other kernel
4883                          * component, we check and return an error.
4884                          */
4885                         cr = msg_getcred(mp, NULL);
4886                         ASSERT(cr != NULL);
4887                         if (cr == NULL) {
4888                                 icmp_err_ack(q, mp, TSYSERR, EINVAL);
4889                                 return;
4890                         }
4891 
                             /*
                              * Offer the request to snmpcom_req() first; only
                              * when it returns false is the message processed
                              * as an ordinary option request.
                              */
4892                         if (!snmpcom_req(q, mp, icmp_snmp_set, ip_snmp_get,
4893                             cr)) {
4894                                 svr4_optcom_req(q, mp, cr, &icmp_opt_obj);
4895                         }
4896                         return;
4897 
4898                 case T_OPTMGMT_REQ:
4899                         /*
4900                          * All Solaris components should pass a db_credp
4901                          * for this TPI message, hence we ASSERT.
4902                          * But in case there is some other M_PROTO that looks
4903                          * like a TPI message sent by some other kernel
4904                          * component, we check and return an error.
4905                          */
4906                         cr = msg_getcred(mp, NULL);
4907                         ASSERT(cr != NULL);
4908                         if (cr == NULL) {
4909                                 icmp_err_ack(q, mp, TSYSERR, EINVAL);
4910                                 return;
4911                         }
4912                         tpi_optcom_req(q, mp, cr, &icmp_opt_obj);
4913                         return;
4914 
4915                 case T_DISCON_REQ:
4916                         icmp_tpi_disconnect(q, mp);
4917                         return;
4918 
4919                 /* The following TPI message is not supported by icmp. */
4920                 case O_T_CONN_RES:
4921                 case T_CONN_RES:
4922                         icmp_err_ack(q, mp, TNOTSUPPORT, 0);
4923                         return;
4924 
4925                 /* The following 3 TPI requests are illegal for icmp. */
4926                 case T_DATA_REQ:
4927                 case T_EXDATA_REQ:
4928                 case T_ORDREL_REQ:
4929                         icmp_err_ack(q, mp, TNOTSUPPORT, 0);
4930                         return;
4931                 default:
                             /* Not handled here; falls out to ip_wput_nondata(). */
4932                         break;
4933                 }
4934                 break;
4935         case M_FLUSH:
                     /*
                      * Flush locally, then leave the switch so the M_FLUSH is
                      * still passed to ip_wput_nondata() below.
                      */
4936                 if (*rptr & FLUSHW)
4937                         flushq(q, FLUSHDATA);
4938                 break;
4939         case M_IOCTL:
4940                 iocp = (struct iocblk *)mp->b_rptr;
4941                 switch (iocp->ioc_cmd) {
4942                 case TI_GETPEERNAME:
4943                         if (icmp->icmp_state != TS_DATA_XFER) {
4944                                 /*
4945                                  * If a default destination address has not
4946                                  * been associated with the stream, then we
4947                                  * don't know the peer's name.
4948                                  */
4949                                 iocp->ioc_error = ENOTCONN;
4950                                 iocp->ioc_count = 0;
4951                                 mp->b_datap->db_type = M_IOCACK;
4952                                 qreply(q, mp);
4953                                 return;
4954                         }
4955                         /* FALLTHRU */
4956                 case TI_GETMYNAME:
4957                         /*
4958                          * For TI_GETPEERNAME and TI_GETMYNAME, we first
4959                          * need to copyin the user's strbuf structure.
4960                          * Processing will continue in the M_IOCDATA case
4961                          * below.
4962                          */
4963                         mi_copyin(q, mp, NULL,
4964                             SIZEOF_STRUCT(strbuf, iocp->ioc_flag));
4965                         return;
4966                 default:
                             /* Other ioctls are left for ip_wput_nondata(). */
4967                         break;
4968                 }
4969                 break;
4970         case M_IOCDATA:
4971                 icmp_wput_iocdata(q, mp);
4972                 return;
4973         default:
4974                 /* Unrecognized messages are passed through without change. */
4975                 break;
4976         }
4977         ip_wput_nondata(q, mp);
4978 }
4979 
4980 /*
4981  * icmp_wput_iocdata is called by icmp_wput_other to handle all M_IOCDATA
4982  * messages.
      *
      * Only TI_GETMYNAME and TI_GETPEERNAME are processed here; an M_IOCDATA
      * for any other ioctl command is passed to ip_wput_nondata().  Progress
      * is driven by the state reported by mi_copy_state():
      *  - first copyin done: the caller's strbuf is available; validate it,
      *    allocate a buffer for the address, fill it in and start copying it
      *    out.
      *  - first copyout done: start the next copyout.
      *  - second copyout done: acknowledge the ioctl with success.
      * Any other state is acknowledged with EPROTO.  A strbuf whose maxlen is
      * smaller than the address (sin_t for AF_INET, sin6_t otherwise) fails
      * with EINVAL, and TI_GETPEERNAME on an endpoint that is not in
      * TS_DATA_XFER fails with ENOTCONN.
4983  */
4984 static void
4985 icmp_wput_iocdata(queue_t *q, mblk_t *mp)
4986 {
4987         mblk_t          *mp1;
4988         STRUCT_HANDLE(strbuf, sb);
4989         uint_t          addrlen;
4990         conn_t          *connp = Q_TO_CONN(q);
4991         icmp_t          *icmp = connp->conn_icmp;
4992 
4993         /* Make sure it is one of ours. */
4994         switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
4995         case TI_GETMYNAME:
4996         case TI_GETPEERNAME:
4997                 break;
4998         default:
4999                 ip_wput_nondata(q, mp);
5000                 return;
5001         }
5002 
5003         switch (mi_copy_state(q, mp, &mp1)) {
5004         case -1:
                     /*
                      * NOTE(review): presumably mi_copy_state() has already
                      * completed the ioctl in this case -- confirm.
                      */
5005                 return;
5006         case MI_COPY_CASE(MI_COPY_IN, 1):
                     /* The strbuf has been copied in; process it below. */
5007                 break;
5008         case MI_COPY_CASE(MI_COPY_OUT, 1):
5009                 /*
5010                  * The address has been copied out, so now
5011                  * copyout the strbuf.
5012                  */
5013                 mi_copyout(q, mp);
5014                 return;
5015         case MI_COPY_CASE(MI_COPY_OUT, 2):
5016                 /*
5017                  * The address and strbuf have been copied out.
5018                  * We're done, so just acknowledge the original
5019                  * M_IOCTL.
5020                  */
5021                 mi_copy_done(q, mp, 0);
5022                 return;
5023         default:
5024                 /*
5025                  * Something strange has happened, so acknowledge
5026                  * the original M_IOCTL with an EPROTO error.
5027                  */
5028                 mi_copy_done(q, mp, EPROTO);
5029                 return;
5030         }
5031 
5032         /*
5033          * Now we have the strbuf structure for TI_GETMYNAME
5034          * and TI_GETPEERNAME.  Next we copyout the requested
5035          * address and then we'll copyout the strbuf.
5036          */
5037         STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag,
5038             (void *)mp1->b_rptr);
5039 
             /* The address size depends only on the socket's address family. */
5040         if (connp->conn_family == AF_INET)
5041                 addrlen = sizeof (sin_t);
5042         else
5043                 addrlen = sizeof (sin6_t);
5044 
             /* The caller's buffer must be able to hold the whole address. */
5045         if (STRUCT_FGET(sb, maxlen) < addrlen) {
5046                 mi_copy_done(q, mp, EINVAL);
5047                 return;
5048         }
5049         switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
5050         case TI_GETMYNAME:
5051                 break;
5052         case TI_GETPEERNAME:
                     /* Checked again here, at the time the address is produced. */
5053                 if (icmp->icmp_state != TS_DATA_XFER) {
5054                         mi_copy_done(q, mp, ENOTCONN);
5055                         return;
5056                 }
5057                 break;
5058         default:
5059                 mi_copy_done(q, mp, EPROTO);
5060                 return;
5061         }
5062         mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
             /*
              * NOTE(review): presumably mi_copyout_alloc() deals with the ioctl
              * itself when it returns NULL -- confirm.
              */
5063         if (!mp1)
5064                 return;
5065 
             /* Record the address length in the strbuf. */
5066         STRUCT_FSET(sb, len, addrlen);
5067         switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
5068         case TI_GETMYNAME:
5069                 (void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr,
5070                     &addrlen);
5071                 break;
5072         case TI_GETPEERNAME:
5073                 (void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr,
5074                     &addrlen);
5075                 break;
5076         }
             /*
              * addrlen was passed by reference above, so advance by whatever
              * length it holds now.
              */
5077         mp1->b_wptr += addrlen;
5078         /* Copy out the address */
5079         mi_copyout(q, mp);
5080 }
5081 
     /*
      * Compute icmp_max_optsize from the option descriptor array in
      * icmp_opt_obj, then register the per-netstack callbacks for NS_ICMP.
      *
      * NOTE(review): presumably called once when the module is initialized;
      * the caller is not visible from here.
      */
5082 void
5083 icmp_ddi_g_init(void)
5084 {
5085         icmp_max_optsize = optcom_max_optsize(icmp_opt_obj.odb_opt_des_arr,
5086             icmp_opt_obj.odb_opt_arr_cnt);
5087 
5088         /*
5089          * We want to be informed each time a stack is created or
5090          * destroyed in the kernel, so we can maintain the
5091          * set of icmp_stack_t's.
5092          */
             /* Only the first and last callback slots are used; the middle is NULL. */
5093         netstack_register(NS_ICMP, rawip_stack_init, NULL, rawip_stack_fini);
5094 }
5095 
     /*
      * Remove the NS_ICMP netstack registration that icmp_ddi_g_init() made.
      */
5096 void
5097 icmp_ddi_g_destroy(void)
5098 {
5099         netstack_unregister(NS_ICMP);
5100 }
5101 
     /*
      * Driver name that rawip_stack_init() resolves to a major number in
      * order to obtain the per-stack LDI identity.
      */
5102 #define INET_NAME       "ip"
5103 
5104 /*
5105  * Netstack creation callback: build the icmp_stack_t for a new stack.
      *
      * stackid - id of the stack being created; used for the kstat
      * ns      - the netstack the new instance is linked to
      *
      * Returns the new icmp_stack_t.  Both allocations use KM_SLEEP.
5106  */
5107 static void *
5108 rawip_stack_init(netstackid_t stackid, netstack_t *ns)
5109 {
5110         const size_t    tblsz = sizeof (icmp_propinfo_tbl);
5111         icmp_stack_t    *stk = kmem_zalloc(sizeof (icmp_stack_t), KM_SLEEP);
5112         int             rc;
5113 
5114         stk->is_netstack = ns;
5115 
5116         /* Give the stack its own copy of the global property table. */
5117         stk->is_propinfo_tbl = kmem_alloc(tblsz, KM_SLEEP);
5118         bcopy(icmp_propinfo_tbl, stk->is_propinfo_tbl, tblsz);
5119 
5120         stk->is_ksp = rawip_kstat_init(stackid);
5121 
5122         /* The LDI identity is derived from the INET_NAME driver's major. */
5123         rc = ldi_ident_from_major(mod_name_to_major(INET_NAME),
5124             &stk->is_ldi_ident);
5125         ASSERT(rc == 0);
5126 
5127         return (stk);
5128 }
5129 
5130 /*
5131  * Netstack destruction callback: release everything rawip_stack_init()
5132  * set up for this stack, then the icmp_stack_t itself.
      *
      * stackid - id of the stack going away; passed on to rawip_kstat_fini()
      * arg     - the icmp_stack_t to tear down
      */
5133 static void
5134 rawip_stack_fini(netstackid_t stackid, void *arg)
5135 {
5136         icmp_stack_t *stk = arg;
5137 
5138         kmem_free(stk->is_propinfo_tbl, sizeof (icmp_propinfo_tbl));
5139         stk->is_propinfo_tbl = NULL;
5140 
5141         rawip_kstat_fini(stackid, stk->is_ksp);
5142         stk->is_ksp = NULL;
5143         ldi_ident_release(stk->is_ldi_ident);
5144         kmem_free(stk, sizeof (icmp_stack_t));
5145 }
5146 
     /*
      * Create and install the "rawip" named kstat for a netstack.
      *
      * The kstat's data area is seeded from a local template naming the five
      * counters, rawip_kstat_update() is set as its update routine, and the
      * stack id is stored in ks_private for that routine to use.
      *
      * Returns the installed kstat, or NULL if the kstat (or its data area)
      * is not available.
      */
5147 static void *
5148 rawip_kstat_init(netstackid_t stackid)
5149 {
5150         rawip_named_kstat_t names = {
5151                 { "inDatagrams",        KSTAT_DATA_UINT32, 0 },
5152                 { "inCksumErrs",        KSTAT_DATA_UINT32, 0 },
5153                 { "inErrors",           KSTAT_DATA_UINT32, 0 },
5154                 { "outDatagrams",       KSTAT_DATA_UINT32, 0 },
5155                 { "outErrors",          KSTAT_DATA_UINT32, 0 },
5156         };
5157         kstat_t *kp = kstat_create_netstack("icmp", 0, "rawip", "mib2",
5158             KSTAT_TYPE_NAMED, NUM_OF_FIELDS(rawip_named_kstat_t), 0, stackid);
5159 
5160         if (kp == NULL)
5161                 return (NULL);
5162         if (kp->ks_data == NULL)
5163                 return (NULL);
5164 
5165         kp->ks_private = (void *)(uintptr_t)stackid;
5166         kp->ks_update = rawip_kstat_update;
5167         bcopy(&names, kp->ks_data, sizeof (names));
5168 
5169         kstat_install(kp);
5170         return (kp);
5171 }
5172 
     /*
      * Delete the kstat created by rawip_kstat_init() for this stack.
      * A NULL ksp (kstat creation failed earlier) is silently ignored.
      */
5173 static void
5174 rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp)
5175 {
5176         if (ksp == NULL)
5177                 return;
5178         ASSERT((netstackid_t)(uintptr_t)ksp->ks_private == stackid);
5179         kstat_delete_netstack(ksp, stackid);
5180 }
5181 
     /*
      * kstat update routine for the "rawip" named kstat: copy the raw IP MIB
      * counters of the owning netstack into the kstat's data area.
      *
      * ksp - kstat being refreshed; ks_private carries the netstack id
      * rw  - KSTAT_WRITE requests are refused
      *
      * Returns 0 on success, EIO when ksp or its data area is missing,
      * EACCES for a write request, and -1 when the netstack or its ICMP
      * instance cannot be found.
      */
5182 static int
5183 rawip_kstat_update(kstat_t *ksp, int rw)
5184 {
5185         rawip_named_kstat_t *rawipkp;
5186         netstackid_t    stackid;
5187         netstack_t      *ns;
5188         icmp_stack_t    *is;
5189 
5190         if ((ksp == NULL) || (ksp->ks_data == NULL))
5191                 return (EIO);
5192 
5193         if (rw == KSTAT_WRITE)
5194                 return (EACCES);
5195 
5196         rawipkp = (rawip_named_kstat_t *)ksp->ks_data;
             /* ks_private is read only after ksp has been validated above. */
5197         stackid = (netstackid_t)(uintptr_t)ksp->ks_private;
5198         ns = netstack_find_by_stackid(stackid);
5199         if (ns == NULL)
5200                 return (-1);
5201         is = ns->netstack_icmp;
5202         if (is == NULL) {
                     /* Drop the hold taken by netstack_find_by_stackid(). */
5203                 netstack_rele(ns);
5204                 return (-1);
5205         }
5206         rawipkp->inDatagrams.value.ui32 = is->is_rawip_mib.rawipInDatagrams;
5207         rawipkp->inCksumErrs.value.ui32 = is->is_rawip_mib.rawipInCksumErrs;
5208         rawipkp->inErrors.value.ui32 = is->is_rawip_mib.rawipInErrors;
5209         rawipkp->outDatagrams.value.ui32 = is->is_rawip_mib.rawipOutDatagrams;
5210         rawipkp->outErrors.value.ui32 = is->is_rawip_mib.rawipOutErrors;
5211         netstack_rele(ns);
5212         return (0);
5213 }
5214 
     /*
      * Socket accept entry point.  Not supported for raw IP: every argument
      * is ignored and EOPNOTSUPP is always returned.
      */
5215 /* ARGSUSED */
5216 int
5217 rawip_accept(sock_lower_handle_t lproto_handle,
5218     sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
5219     cred_t *cr)
5220 {
5221         return (EOPNOTSUPP);
5222 }
5223 
     /*
      * Socket bind entry point for raw IP.
      *
      * proto_handle - the conn_t of the socket
      * sa, len      - address to bind to; a NULL sa requests an unbind
      * cr           - caller's credential (only asserted to be non-NULL)
      *
      * Returns 0 or an errno.  Negative results from the worker routines
      * are negated TLI codes: TOUTSTATE is reported as EINVAL, anything else
      * is converted with proto_tlitosyserr().
      */
5224 /* ARGSUSED */
5225 int
5226 rawip_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5227     socklen_t len, cred_t *cr)
5228 {
5229         conn_t  *conn = (conn_t *)proto_handle;
5230         int     rc;
5231 
5232         /* Every Solaris component is expected to supply a cred here. */
5233         ASSERT(cr != NULL);
5234 
5235         /* A NULL address is a request to drop the current binding. */
5236         rc = (sa == NULL) ? rawip_do_unbind(conn) :
5237             rawip_do_bind(conn, sa, len);
5238 
5239         /* Zero and positive values are returned unchanged. */
5240         if (rc >= 0)
5241                 return (rc);
5242 
5243         /* Negative values carry a negated TLI error code. */
5244         if (rc == -TOUTSTATE)
5245                 return (EINVAL);
5246 
5247         return (proto_tlitosyserr(-rc));
5248 }
5249 
     /*
      * Bind the endpoint to the unspecified address of its own family
      * (INADDR_ANY for AF_INET, the all-zero address for AF_INET6).
      *
      * A sin6_t on the stack serves as storage for either address form.
      * Returns 0 or an errno; a negative result from rawip_do_bind() is
      * converted with proto_tlitosyserr().
      */
5250 static int
5251 rawip_implicit_bind(conn_t *connp)
5252 {
5253         sin6_t          addrbuf;
5254         socklen_t       addrlen;
5255         int             rc;
5256 
5257         if (connp->conn_family != AF_INET) {
5258                 sin6_t *any6 = &addrbuf;
5259 
5260                 ASSERT(connp->conn_family == AF_INET6);
5261                 *any6 = sin6_null;
5262                 any6->sin6_family = AF_INET6;
5263                 V6_SET_ZERO(any6->sin6_addr);
5264                 addrlen = sizeof (sin6_t);
5265         } else {
5266                 sin_t *any4 = (sin_t *)&addrbuf;
5267 
5268                 *any4 = sin_null;
5269                 any4->sin_family = AF_INET;
5270                 any4->sin_addr.s_addr = INADDR_ANY;
5271                 addrlen = sizeof (struct sockaddr_in);
5272         }
5273 
5274         rc = rawip_do_bind(connp, (struct sockaddr *)&addrbuf, addrlen);
5275 
5276         return ((rc >= 0) ? rc : proto_tlitosyserr(-rc));
5277 }
5278 
     /*
      * Unbind the endpoint via rawip_do_unbind().
      * Returns 0 or an errno; a negative (negated TLI) result is converted
      * with proto_tlitosyserr().
      */
5279 static int
5280 rawip_unbind(conn_t *connp)
5281 {
5282         int rc = rawip_do_unbind(connp);
5283 
5284         /* Zero and positive values are returned unchanged. */
5285         if (rc >= 0)
5286                 return (rc);
5287 
5288         return (proto_tlitosyserr(-rc));
5289 }
5290 
     /*
      * Socket listen entry point.  Not supported for raw IP: every argument
      * is ignored and EOPNOTSUPP is always returned.
      */
5291 /* ARGSUSED */
5292 int
5293 rawip_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
5294 {
5295         return (EOPNOTSUPP);
5296 }
5297 
     /*
      * Socket connect entry point for raw IP.
      *
      * A NULL sa requests a disconnect: EINVAL is returned unless the
      * endpoint is in TS_DATA_XFER, otherwise the result of
      * icmp_disconnect().
      *
      * For a non-NULL sa the address is checked with proto_verify_ip_addr(),
      * an endpoint still in TS_UNBND is bound implicitly, conn_dgram_errind
      * is turned on, and rawip_do_connect() is called with the caller's cred
      * and the current process id.  If the connect fails after an implicit
      * bind was attempted, the endpoint is unbound again.
      *
      * On success *id is set to 0 and the su_connected upcall is made.
      * Returns 0 or an errno; a negative result from rawip_do_connect() is
      * converted with proto_tlitosyserr().
      */
5298 int
5299 rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
5300     socklen_t len, sock_connid_t *id, cred_t *cr)
5301 {
5302         conn_t  *connp = (conn_t *)proto_handle;
5303         icmp_t *icmp = connp->conn_icmp;
5304         int     error;
5305         boolean_t did_bind = B_FALSE;
5306         pid_t   pid = curproc->p_pid;
5307 
5308         /* All Solaris components should pass a cred for this operation. */
5309         ASSERT(cr != NULL);
5310 
5311         if (sa == NULL) {
5312                 /*
5313                  * Disconnect
5314                  * Make sure we are connected
5315                  */
5316                 if (icmp->icmp_state != TS_DATA_XFER)
5317                         return (EINVAL);
5318 
5319                 error = icmp_disconnect(connp);
5320                 return (error);
5321         }
5322 
5323         error = proto_verify_ip_addr(connp->conn_family, sa, len);
5324         if (error != 0)
5325                 return (error);
5326 
5327         /* do an implicit bind if necessary */
5328         if (icmp->icmp_state == TS_UNBND) {
5329                 error = rawip_implicit_bind(connp);
5330                 /*
5331                  * We could be racing with an actual bind, in which case
5332                  * we would see EPROTO. We cross our fingers and try
5333                  * to connect.
5334                  */
5335                 if (!(error == 0 || error == EPROTO))
5336                         return (error);
                     /*
                      * NOTE(review): did_bind is also set when the implicit
                      * bind returned EPROTO, so a failed connect below unbinds
                      * even if the binding was made elsewhere -- confirm this
                      * is intended.
                      */
5337                 did_bind = B_TRUE;
5338         }
5339 
5340         /*
5341          * set SO_DGRAM_ERRIND
5342          */
5343         connp->conn_dgram_errind = B_TRUE;
5344 
5345         error = rawip_do_connect(connp, sa, len, cr, pid);
             /* Undo the implicit bind so a failed connect leaves no binding. */
5346         if (error != 0 && did_bind) {
5347                 int unbind_err;
5348 
5349                 unbind_err = rawip_unbind(connp);
5350                 ASSERT(unbind_err == 0);
5351         }
5352 
5353         if (error == 0) {
5354                 *id = 0;
5355                 (*connp->conn_upcalls->su_connected)(connp->conn_upper_handle,
5356                     0, NULL, -1);
5357         } else if (error < 0) {
                     /* Negative values are negated TLI codes. */
5358                 error = proto_tlitosyserr(-error);
5359         }
5360         return (error);
5361 }
5362 
     /*
      * Switch a socket over to operating through the STREAMS queue pair q.
      *
      * Steps, in order:
      *  1. Point both sides of q at connp, install icmpwinit on the write
      *     side and record the queues in connp.
      *  2. Send an M_SETOPTS message upstream carrying the write offset and
      *     the receive high-water mark.
      *  3. Free the helper stream.
      *  4. Collect the capability ack, the local and peer addresses and the
      *     SO_DGRAM_ERRIND/SO_DONTROUTE option bits, and pass them to
      *     quiesced_cb.
      *  5. Under icmp_recv_lock, send upstream the message returned by the
      *     callback (if any) followed by everything on the fallback queue,
      *     then clear IPCL_NONSTR.
      *
      * Always returns 0.  The direct_sockfs argument is not used.
      */
5363 /* ARGSUSED2 */
5364 int
5365 rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q,
5366     boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb,
5367     sock_quiesce_arg_t *arg)
5368 {
5369         conn_t  *connp = (conn_t *)proto_handle;
5370         icmp_t  *icmp;
5371         struct T_capability_ack tca;
5372         struct sockaddr_in6 laddr, faddr;
5373         socklen_t laddrlen, faddrlen;
5374         short opts;
5375         struct stroptions *stropt;
5376         mblk_t *mp, *stropt_mp;
5377         int error;
5378 
5379         icmp = connp->conn_icmp;
5380 
             /* Allocated up front, before any state is changed. */
5381         stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL);
5382 
5383         /*
5384          * setup the fallback stream that was allocated
5385          */
             /*
              * Save the values found in the two q_ptr fields before they are
              * overwritten with connp just below.
              */
5386         connp->conn_dev = (dev_t)RD(q)->q_ptr;
5387         connp->conn_minor_arena = WR(q)->q_ptr;
5388 
5389         RD(q)->q_ptr = WR(q)->q_ptr = connp;
5390 
5391         WR(q)->q_qinfo = &icmpwinit;
5392 
5393         connp->conn_rq = RD(q);
5394         connp->conn_wq = WR(q);
5395 
5396         /* Notify stream head about options before sending up data */
5397         stropt_mp->b_datap->db_type = M_SETOPTS;
5398         stropt_mp->b_wptr += sizeof (*stropt);
5399         stropt = (struct stroptions *)stropt_mp->b_rptr;
5400         stropt->so_flags = SO_WROFF | SO_HIWAT;
5401         stropt->so_wroff = connp->conn_wroff;
5402         stropt->so_hiwat = connp->conn_rcvbuf;
5403         putnext(RD(q), stropt_mp);
5404 
5405         /*
5406          * free helper stream
5407          */
5408         ip_free_helper_stream(connp);
5409 
5410         /*
5411          * Collect the information needed to sync with the sonode
5412          */
5413         icmp_do_capability_ack(icmp, &tca, TC1_INFO);
5414 
5415         laddrlen = faddrlen = sizeof (sin6_t);
5416         (void) rawip_getsockname((sock_lower_handle_t)connp,
5417             (struct sockaddr *)&laddr, &laddrlen, CRED());
5418         error = rawip_getpeername((sock_lower_handle_t)connp,
5419             (struct sockaddr *)&faddr, &faddrlen, CRED());
             /* No peer address could be obtained: report it as zero length. */
5420         if (error != 0)
5421                 faddrlen = 0;
5422         opts = 0;
5423         if (connp->conn_dgram_errind)
5424                 opts |= SO_DGRAM_ERRIND;
5425         if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
5426                 opts |= SO_DONTROUTE;
5427 
5428         mp = (*quiesced_cb)(connp->conn_upper_handle, arg, &tca,
5429             (struct sockaddr *)&laddr, laddrlen,
5430             (struct sockaddr *)&faddr, faddrlen, opts);
5431 
5432         /*
5433          * Attempts to send data up during fallback will result in it being
5434          * queued in icmp_t. Now we push up any queued packets.
5435          */
5436         mutex_enter(&icmp->icmp_recv_lock);
             /* The callback's message goes to the front so it is sent first. */
5437         if (mp != NULL) {
5438                 mp->b_next = icmp->icmp_fallback_queue_head;
5439                 icmp->icmp_fallback_queue_head = mp;
5440         }
5441         while (icmp->icmp_fallback_queue_head != NULL) {
5442                 mp = icmp->icmp_fallback_queue_head;
5443                 icmp->icmp_fallback_queue_head = mp->b_next;
5444                 mp->b_next = NULL;
                     /* icmp_recv_lock is not held across putnext(). */
5445                 mutex_exit(&icmp->icmp_recv_lock);
5446                 putnext(RD(q), mp);
5447                 mutex_enter(&icmp->icmp_recv_lock);
5448         }
             /* The head is NULL once the loop exits, so this clears the tail. */
5449         icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head;
5450 
5451         /*
5452          * No longer a streams less socket
5453          */
             /* IPCL_NONSTR is cleared while icmp_recv_lock is still held. */
5454         mutex_enter(&connp->conn_lock);
5455         connp->conn_flags &= ~IPCL_NONSTR;
5456         mutex_exit(&connp->conn_lock);
5457 
5458         mutex_exit(&icmp->icmp_recv_lock);
5459 
5460         ASSERT(icmp->icmp_fallback_queue_head == NULL &&
5461             icmp->icmp_fallback_queue_tail == NULL);
5462 
5463         ASSERT(connp->conn_ref >= 1);
5464 
5465         return (0);
5466 }
5467 
5468 /* ARGSUSED2 */
5469 sock_lower_handle_t
5470 rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
5471     uint_t *smodep, int *errorp, int flags, cred_t *credp)
5472 {
5473         conn_t *connp;
5474 
5475         if (type != SOCK_RAW || (family != AF_INET && family != AF_INET6)) {
5476                 *errorp = EPROTONOSUPPORT;
5477                 return (NULL);
5478         }
5479 
5480         connp = rawip_do_open(family, credp, errorp, flags);
5481         if (connp != NULL) {
5482                 connp->conn_flags |= IPCL_NONSTR;
5483 
5484                 mutex_enter(&connp->conn_lock);
5485                 connp->conn_state_flags &= ~CONN_INCIPIENT;
5486                 mutex_exit(&connp->conn_lock);
5487                 *sock_downcalls = &sock_rawip_downcalls;
5488                 *smodep = SM_ATOMIC;
5489         } else {
5490                 ASSERT(*errorp != 0);
5491         }
5492 
5493         return ((sock_lower_handle_t)connp);
5494 }
5495 
5496 /* ARGSUSED3 */
5497 void
5498 rawip_activate(sock_lower_handle_t proto_handle,
5499     sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags,
5500     cred_t *cr)
5501 {
5502         conn_t                  *connp = (conn_t *)proto_handle;
5503         struct sock_proto_props sopp;
5504 
5505         /* All Solaris components should pass a cred for this operation. */
5506         ASSERT(cr != NULL);
5507 
5508         connp->conn_upcalls = sock_upcalls;
5509         connp->conn_upper_handle = sock_handle;
5510 
5511         sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
5512             SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ;
5513         sopp.sopp_wroff = connp->conn_wroff;
5514         sopp.sopp_rxhiwat = connp->conn_rcvbuf;
5515         sopp.sopp_rxlowat = connp->conn_rcvlowat;
5516         sopp.sopp_maxblk = INFPSZ;
5517         sopp.sopp_maxpsz = IP_MAXPACKET;
5518         sopp.sopp_minpsz = (icmp_mod_info.mi_minpsz == 1) ? 0 :
5519             icmp_mod_info.mi_minpsz;
5520 
5521         (*connp->conn_upcalls->su_set_proto_props)
5522             (connp->conn_upper_handle, &sopp);
5523 
5524         icmp_bind_proto(connp->conn_icmp);
5525 }
5526 
5527 /* ARGSUSED3 */
5528 int
5529 rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5530     socklen_t *salenp, cred_t *cr)
5531 {
5532         conn_t  *connp = (conn_t *)proto_handle;
5533         icmp_t  *icmp = connp->conn_icmp;
5534         int     error;
5535 
5536         /* All Solaris components should pass a cred for this operation. */
5537         ASSERT(cr != NULL);
5538 
5539         mutex_enter(&connp->conn_lock);
5540         if (icmp->icmp_state != TS_DATA_XFER)
5541                 error = ENOTCONN;
5542         else
5543                 error = conn_getpeername(connp, sa, salenp);
5544         mutex_exit(&connp->conn_lock);
5545         return (error);
5546 }
5547 
5548 /* ARGSUSED3 */
5549 int
5550 rawip_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa,
5551     socklen_t *salenp, cred_t *cr)
5552 {
5553         conn_t  *connp = (conn_t *)proto_handle;
5554         int     error;
5555 
5556         /* All Solaris components should pass a cred for this operation. */
5557         ASSERT(cr != NULL);
5558 
5559         mutex_enter(&connp->conn_lock);
5560         error = conn_getsockname(connp, sa, salenp);
5561         mutex_exit(&connp->conn_lock);
5562         return (error);
5563 }
5564 
5565 int
5566 rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
5567     const void *optvalp, socklen_t optlen, cred_t *cr)
5568 {
5569         conn_t  *connp = (conn_t *)proto_handle;
5570         int error;
5571 
5572         /* All Solaris components should pass a cred for this operation. */
5573         ASSERT(cr != NULL);
5574 
5575         error = proto_opt_check(level, option_name, optlen, NULL,
5576             icmp_opt_obj.odb_opt_des_arr,
5577             icmp_opt_obj.odb_opt_arr_cnt,
5578             B_TRUE, B_FALSE, cr);
5579 
5580         if (error != 0) {
5581                 /*
5582                  * option not recognized
5583                  */
5584                 if (error < 0) {
5585                         error = proto_tlitosyserr(-error);
5586                 }
5587                 return (error);
5588         }
5589 
5590         error = icmp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level,
5591             option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen,
5592             (uchar_t *)optvalp, NULL, cr);
5593 
5594         ASSERT(error >= 0);
5595 
5596         return (error);
5597 }
5598 
5599 int
5600 rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
5601     void *optvalp, socklen_t *optlen, cred_t *cr)
5602 {
5603         int             error;
5604         conn_t          *connp = (conn_t *)proto_handle;
5605         t_uscalar_t     max_optbuf_len;
5606         void            *optvalp_buf;
5607         int             len;
5608 
5609         /* All Solaris components should pass a cred for this operation. */
5610         ASSERT(cr != NULL);
5611 
5612         error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
5613             icmp_opt_obj.odb_opt_des_arr,
5614             icmp_opt_obj.odb_opt_arr_cnt,
5615             B_FALSE, B_TRUE, cr);
5616 
5617         if (error != 0) {
5618                 if (error < 0) {
5619                         error = proto_tlitosyserr(-error);
5620                 }
5621                 return (error);
5622         }
5623 
5624         optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
5625         len = icmp_opt_get(connp, level, option_name, optvalp_buf);
5626         if (len == -1) {
5627                 kmem_free(optvalp_buf, max_optbuf_len);
5628                 return (EINVAL);
5629         }
5630 
5631         /*
5632          * update optlen and copy option value
5633          */
5634         t_uscalar_t size = MIN(len, *optlen);
5635 
5636         bcopy(optvalp_buf, optvalp, size);
5637         bcopy(&size, optlen, sizeof (size));
5638 
5639         kmem_free(optvalp_buf, max_optbuf_len);
5640         return (0);
5641 }
5642 
5643 /* ARGSUSED1 */
5644 int
5645 rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
5646 {
5647         conn_t  *connp = (conn_t *)proto_handle;
5648 
5649         /* All Solaris components should pass a cred for this operation. */
5650         ASSERT(cr != NULL);
5651 
5652         (void) rawip_do_close(connp);
5653         return (0);
5654 }
5655 
5656 /* ARGSUSED2 */
5657 int
5658 rawip_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
5659 {
5660         conn_t  *connp = (conn_t *)proto_handle;
5661 
5662         /* All Solaris components should pass a cred for this operation. */
5663         ASSERT(cr != NULL);
5664 
5665         /* shut down the send side */
5666         if (how != SHUT_RD)
5667                 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
5668                     SOCK_OPCTL_SHUT_SEND, 0);
5669         /* shut down the recv side */
5670         if (how != SHUT_WR)
5671                 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
5672                     SOCK_OPCTL_SHUT_RECV, 0);
5673         return (0);
5674 }
5675 
5676 void
5677 rawip_clr_flowctrl(sock_lower_handle_t proto_handle)
5678 {
5679         conn_t  *connp = (conn_t *)proto_handle;
5680         icmp_t  *icmp = connp->conn_icmp;
5681 
5682         mutex_enter(&icmp->icmp_recv_lock);
5683         connp->conn_flow_cntrld = B_FALSE;
5684         mutex_exit(&icmp->icmp_recv_lock);
5685 }
5686 
5687 int
5688 rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
5689     int mode, int32_t *rvalp, cred_t *cr)
5690 {
5691         conn_t          *connp = (conn_t *)proto_handle;
5692         int             error;
5693 
5694         /* All Solaris components should pass a cred for this operation. */
5695         ASSERT(cr != NULL);
5696 
5697         /*
5698          * If we don't have a helper stream then create one.
5699          * ip_create_helper_stream takes care of locking the conn_t,
5700          * so this check for NULL is just a performance optimization.
5701          */
5702         if (connp->conn_helper_info == NULL) {
5703                 icmp_stack_t *is = connp->conn_icmp->icmp_is;
5704 
5705                 ASSERT(is->is_ldi_ident != NULL);
5706 
5707                 /*
5708                  * Create a helper stream for non-STREAMS socket.
5709                  */
5710                 error = ip_create_helper_stream(connp, is->is_ldi_ident);
5711                 if (error != 0) {
5712                         ip0dbg(("rawip_ioctl: create of IP helper stream "
5713                             "failed %d\n", error));
5714                         return (error);
5715                 }
5716         }
5717 
5718         switch (cmd) {
5719         case _SIOCSOCKFALLBACK:
5720         case TI_GETPEERNAME:
5721         case TI_GETMYNAME:
5722 #ifdef DEBUG
5723                 cmn_err(CE_CONT, "icmp_ioctl cmd 0x%x on non streams"
5724                     " socket", cmd);
5725 #endif
5726                 error = EINVAL;
5727                 break;
5728         default:
5729                 /*
5730                  * Pass on to IP using helper stream
5731                  */
5732                 error = ldi_ioctl(connp->conn_helper_info->iphs_handle,
5733                     cmd, arg, mode, cr, rvalp);
5734                 break;
5735         }
5736         return (error);
5737 }
5738 
5739 int
5740 rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
5741     cred_t *cr)
5742 {
5743         sin6_t          *sin6;
5744         sin_t           *sin = NULL;
5745         uint_t          srcid;
5746         conn_t          *connp = (conn_t *)proto_handle;
5747         icmp_t          *icmp = connp->conn_icmp;
5748         int             error = 0;
5749         icmp_stack_t    *is = icmp->icmp_is;
5750         pid_t           pid = curproc->p_pid;
5751         ip_xmit_attr_t  *ixa;
5752 
5753         ASSERT(DB_TYPE(mp) == M_DATA);
5754 
5755         /* All Solaris components should pass a cred for this operation. */
5756         ASSERT(cr != NULL);
5757 
5758         /* do an implicit bind if necessary */
5759         if (icmp->icmp_state == TS_UNBND) {
5760                 error = rawip_implicit_bind(connp);
5761                 /*
5762                  * We could be racing with an actual bind, in which case
5763                  * we would see EPROTO. We cross our fingers and try
5764                  * to connect.
5765                  */
5766                 if (!(error == 0 || error == EPROTO)) {
5767                         freemsg(mp);
5768                         return (error);
5769                 }
5770         }
5771 
5772         /* Protocol 255 contains full IP headers */
5773         /* Read without holding lock */
5774         if (icmp->icmp_hdrincl) {
5775                 ASSERT(connp->conn_ipversion == IPV4_VERSION);
5776                 if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) {
5777                         if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) {
5778                                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5779                                 freemsg(mp);
5780                                 return (EINVAL);
5781                         }
5782                 }
5783                 error = icmp_output_hdrincl(connp, mp, cr, pid);
5784                 if (is->is_sendto_ignerr)
5785                         return (0);
5786                 else
5787                         return (error);
5788         }
5789 
5790         /* Connected? */
5791         if (msg->msg_name == NULL) {
5792                 if (icmp->icmp_state != TS_DATA_XFER) {
5793                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5794                         return (EDESTADDRREQ);
5795                 }
5796                 if (msg->msg_controllen != 0) {
5797                         error = icmp_output_ancillary(connp, NULL, NULL, mp,
5798                             NULL, msg, cr, pid);
5799                 } else {
5800                         error = icmp_output_connected(connp, mp, cr, pid);
5801                 }
5802                 if (is->is_sendto_ignerr)
5803                         return (0);
5804                 else
5805                         return (error);
5806         }
5807         if (icmp->icmp_state == TS_DATA_XFER) {
5808                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5809                 return (EISCONN);
5810         }
5811         error = proto_verify_ip_addr(connp->conn_family,
5812             (struct sockaddr *)msg->msg_name, msg->msg_namelen);
5813         if (error != 0) {
5814                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5815                 return (error);
5816         }
5817         switch (connp->conn_family) {
5818         case AF_INET6:
5819                 sin6 = (sin6_t *)msg->msg_name;
5820 
5821                 /* No support for mapped addresses on raw sockets */
5822                 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
5823                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5824                         return (EADDRNOTAVAIL);
5825                 }
5826                 srcid = sin6->__sin6_src_id;
5827 
5828                 /*
5829                  * If the local address is a mapped address return
5830                  * an error.
5831                  * It would be possible to send an IPv6 packet but the
5832                  * response would never make it back to the application
5833                  * since it is bound to a mapped address.
5834                  */
5835                 if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) {
5836                         BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5837                         return (EADDRNOTAVAIL);
5838                 }
5839 
5840                 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
5841                         sin6->sin6_addr = ipv6_loopback;
5842 
5843                 /*
5844                  * We have to allocate an ip_xmit_attr_t before we grab
5845                  * conn_lock and we need to hold conn_lock once we've check
5846                  * conn_same_as_last_v6 to handle concurrent send* calls on a
5847                  * socket.
5848                  */
5849                 if (msg->msg_controllen == 0) {
5850                         ixa = conn_get_ixa(connp, B_FALSE);
5851                         if (ixa == NULL) {
5852                                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5853                                 return (ENOMEM);
5854                         }
5855                 } else {
5856                         ixa = NULL;
5857                 }
5858                 mutex_enter(&connp->conn_lock);
5859                 if (icmp->icmp_delayed_error != 0) {
5860                         sin6_t  *sin2 = (sin6_t *)&icmp->icmp_delayed_addr;
5861 
5862                         error = icmp->icmp_delayed_error;
5863                         icmp->icmp_delayed_error = 0;
5864 
5865                         /* Compare IP address and family */
5866 
5867                         if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr,
5868                             &sin2->sin6_addr) &&
5869                             sin6->sin6_family == sin2->sin6_family) {
5870                                 mutex_exit(&connp->conn_lock);
5871                                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5872                                 if (ixa != NULL)
5873                                         ixa_refrele(ixa);
5874                                 return (error);
5875                         }
5876                 }
5877                 if (msg->msg_controllen != 0) {
5878                         mutex_exit(&connp->conn_lock);
5879                         ASSERT(ixa == NULL);
5880                         error = icmp_output_ancillary(connp, NULL, sin6, mp,
5881                             NULL, msg, cr, pid);
5882                 } else if (conn_same_as_last_v6(connp, sin6) &&
5883                     connp->conn_lastsrcid == srcid &&
5884                     ipsec_outbound_policy_current(ixa)) {
5885                         /* icmp_output_lastdst drops conn_lock */
5886                         error = icmp_output_lastdst(connp, mp, cr, pid, ixa);
5887                 } else {
5888                         /* icmp_output_newdst drops conn_lock */
5889                         error = icmp_output_newdst(connp, mp, NULL, sin6, cr,
5890                             pid, ixa);
5891                 }
5892                 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
5893                 if (is->is_sendto_ignerr)
5894                         return (0);
5895                 else
5896                         return (error);
5897         case AF_INET:
5898                 sin = (sin_t *)msg->msg_name;
5899 
5900                 if (sin->sin_addr.s_addr == INADDR_ANY)
5901                         sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
5902 
5903                 /*
5904                  * We have to allocate an ip_xmit_attr_t before we grab
5905                  * conn_lock and we need to hold conn_lock once we've check
5906                  * conn_same_as_last_v6 to handle concurrent send* on a socket.
5907                  */
5908                 if (msg->msg_controllen == 0) {
5909                         ixa = conn_get_ixa(connp, B_FALSE);
5910                         if (ixa == NULL) {
5911                                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5912                                 return (ENOMEM);
5913                         }
5914                 } else {
5915                         ixa = NULL;
5916                 }
5917                 mutex_enter(&connp->conn_lock);
5918                 if (icmp->icmp_delayed_error != 0) {
5919                         sin_t  *sin2 = (sin_t *)&icmp->icmp_delayed_addr;
5920 
5921                         error = icmp->icmp_delayed_error;
5922                         icmp->icmp_delayed_error = 0;
5923 
5924                         /* Compare IP address */
5925 
5926                         if (sin->sin_addr.s_addr == sin2->sin_addr.s_addr) {
5927                                 mutex_exit(&connp->conn_lock);
5928                                 BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
5929                                 if (ixa != NULL)
5930                                         ixa_refrele(ixa);
5931                                 return (error);
5932                         }
5933                 }
5934 
5935                 if (msg->msg_controllen != 0) {
5936                         mutex_exit(&connp->conn_lock);
5937                         ASSERT(ixa == NULL);
5938                         error = icmp_output_ancillary(connp, sin, NULL, mp,
5939                             NULL, msg, cr, pid);
5940                 } else if (conn_same_as_last_v4(connp, sin) &&
5941                     ipsec_outbound_policy_current(ixa)) {
5942                         /* icmp_output_lastdst drops conn_lock */
5943                         error = icmp_output_lastdst(connp, mp, cr, pid, ixa);
5944                 } else {
5945                         /* icmp_output_newdst drops conn_lock */
5946                         error = icmp_output_newdst(connp, mp, sin, NULL, cr,
5947                             pid, ixa);
5948                 }
5949                 ASSERT(MUTEX_NOT_HELD(&connp->conn_lock));
5950                 if (is->is_sendto_ignerr)
5951                         return (0);
5952                 else
5953                         return (error);
5954         default:
5955                 return (EINVAL);
5956         }
5957 }
5958 
5959 sock_downcalls_t sock_rawip_downcalls = {
5960         rawip_activate,
5961         rawip_accept,
5962         rawip_bind,
5963         rawip_listen,
5964         rawip_connect,
5965         rawip_getpeername,
5966         rawip_getsockname,
5967         rawip_getsockopt,
5968         rawip_setsockopt,
5969         rawip_send,
5970         NULL,
5971         NULL,
5972         NULL,
5973         rawip_shutdown,
5974         rawip_clr_flowctrl,
5975         rawip_ioctl,
5976         rawip_close
5977 };