1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 1990 Mentat Inc.
  24  * Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved.
  25  */
  26 
  27 #include <sys/types.h>
  28 #include <sys/stream.h>
  29 #include <sys/dlpi.h>
  30 #include <sys/stropts.h>
  31 #include <sys/sysmacros.h>
  32 #include <sys/strsun.h>
  33 #include <sys/strlog.h>
  34 #include <sys/strsubr.h>
  35 #define _SUN_TPI_VERSION        2
  36 #include <sys/tihdr.h>
  37 #include <sys/ddi.h>
  38 #include <sys/sunddi.h>
  39 #include <sys/cmn_err.h>
  40 #include <sys/debug.h>
  41 #include <sys/sdt.h>
  42 #include <sys/kobj.h>
  43 #include <sys/zone.h>
  44 #include <sys/neti.h>
  45 #include <sys/hook.h>
  46 
  47 #include <sys/kmem.h>
  48 #include <sys/systm.h>
  49 #include <sys/param.h>
  50 #include <sys/socket.h>
  51 #include <sys/vtrace.h>
  52 #include <sys/isa_defs.h>
  53 #include <sys/atomic.h>
  54 #include <sys/policy.h>
  55 #include <sys/mac.h>
  56 #include <net/if.h>
  57 #include <net/if_types.h>
  58 #include <net/route.h>
  59 #include <net/if_dl.h>
  60 #include <sys/sockio.h>
  61 #include <netinet/in.h>
  62 #include <netinet/ip6.h>
  63 #include <netinet/icmp6.h>
  64 #include <netinet/sctp.h>
  65 
  66 #include <inet/common.h>
  67 #include <inet/mi.h>
  68 #include <inet/optcom.h>
  69 #include <inet/mib2.h>
  70 #include <inet/nd.h>
  71 #include <inet/arp.h>
  72 
  73 #include <inet/ip.h>
  74 #include <inet/ip_impl.h>
  75 #include <inet/ip6.h>
  76 #include <inet/ip6_asp.h>
  77 #include <inet/tcp.h>
  78 #include <inet/tcp_impl.h>
  79 #include <inet/udp_impl.h>
  80 #include <inet/ipp_common.h>
  81 
  82 #include <inet/ip_multi.h>
  83 #include <inet/ip_if.h>
  84 #include <inet/ip_ire.h>
  85 #include <inet/ip_rts.h>
  86 #include <inet/ip_ndp.h>
  87 #include <net/pfkeyv2.h>
  88 #include <inet/sadb.h>
  89 #include <inet/ipsec_impl.h>
  90 #include <inet/iptun/iptun_impl.h>
  91 #include <inet/sctp_ip.h>
  92 #include <sys/pattr.h>
  93 #include <inet/ipclassifier.h>
  94 #include <inet/ipsecah.h>
  95 #include <inet/rawip_impl.h>
  96 #include <inet/rts_impl.h>
  97 #include <sys/squeue_impl.h>
  98 #include <sys/squeue.h>
  99 
 100 #include <sys/tsol/label.h>
 101 #include <sys/tsol/tnet.h>
 102 
 103 /* Temporary; for CR 6451644 work-around */
 104 #include <sys/ethernet.h>
 105 
 106 /*
 107  * Naming conventions:
 108  *      These rules should be judiciously applied
 109  *      if there is a need to identify something as IPv6 versus IPv4
  110  *      IPv6 functions will end with _v6 in the ip module.
  111  *      IPv6 functions will end with _ipv6 in the transport modules.
 112  *      IPv6 macros:
 113  *              Some macros end with _V6; e.g. ILL_FRAG_HASH_V6
 114  *              Some macros start with V6_; e.g. V6_OR_V4_INADDR_ANY
 115  *              And then there are ..V4_PART_OF_V6.
 116  *              The intent is that macros in the ip module end with _V6.
 117  *      IPv6 global variables will start with ipv6_
 118  *      IPv6 structures will start with ipv6
 119  *      IPv6 defined constants should start with IPV6_
 120  *              (but then there are NDP_DEFAULT_VERS_PRI_AND_FLOW, etc)
 121  */
 122 
 123 /*
 124  * ip6opt_ls is used to enable IPv6 (via /etc/system on TX systems).
 125  * We need to do this because we didn't obtain the IP6OPT_LS (0x0a)
 126  * from IANA. This mechanism will remain in effect until an official
 127  * number is obtained.
 128  */
 129 uchar_t ip6opt_ls;
 130 
 131 const in6_addr_t ipv6_all_ones =
 132         { 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU };
 133 const in6_addr_t ipv6_all_zeros = { 0, 0, 0, 0 };
 134 
 135 #ifdef  _BIG_ENDIAN
 136 const in6_addr_t ipv6_unspecified_group = { 0xff000000U, 0, 0, 0 };
 137 #else   /* _BIG_ENDIAN */
 138 const in6_addr_t ipv6_unspecified_group = { 0x000000ffU, 0, 0, 0 };
 139 #endif  /* _BIG_ENDIAN */
 140 
 141 #ifdef  _BIG_ENDIAN
 142 const in6_addr_t ipv6_loopback = { 0, 0, 0, 0x00000001U };
 143 #else  /* _BIG_ENDIAN */
 144 const in6_addr_t ipv6_loopback = { 0, 0, 0, 0x01000000U };
 145 #endif /* _BIG_ENDIAN */
 146 
 147 #ifdef _BIG_ENDIAN
 148 const in6_addr_t ipv6_all_hosts_mcast = { 0xff020000U, 0, 0, 0x00000001U };
 149 #else  /* _BIG_ENDIAN */
 150 const in6_addr_t ipv6_all_hosts_mcast = { 0x000002ffU, 0, 0, 0x01000000U };
 151 #endif /* _BIG_ENDIAN */
 152 
 153 #ifdef _BIG_ENDIAN
 154 const in6_addr_t ipv6_all_rtrs_mcast = { 0xff020000U, 0, 0, 0x00000002U };
 155 #else  /* _BIG_ENDIAN */
 156 const in6_addr_t ipv6_all_rtrs_mcast = { 0x000002ffU, 0, 0, 0x02000000U };
 157 #endif /* _BIG_ENDIAN */
 158 
 159 #ifdef _BIG_ENDIAN
 160 const in6_addr_t ipv6_all_v2rtrs_mcast = { 0xff020000U, 0, 0, 0x00000016U };
 161 #else  /* _BIG_ENDIAN */
 162 const in6_addr_t ipv6_all_v2rtrs_mcast = { 0x000002ffU, 0, 0, 0x16000000U };
 163 #endif /* _BIG_ENDIAN */
 164 
 165 #ifdef _BIG_ENDIAN
 166 const in6_addr_t ipv6_solicited_node_mcast =
 167                         { 0xff020000U, 0, 0x00000001U, 0xff000000U };
 168 #else  /* _BIG_ENDIAN */
 169 const in6_addr_t ipv6_solicited_node_mcast =
 170                         { 0x000002ffU, 0, 0x01000000U, 0x000000ffU };
 171 #endif /* _BIG_ENDIAN */
 172 
 173 static boolean_t icmp_inbound_verify_v6(mblk_t *, icmp6_t *, ip_recv_attr_t *);
 174 static void     icmp_inbound_too_big_v6(icmp6_t *, ip_recv_attr_t *);
 175 static void     icmp_pkt_v6(mblk_t *, void *, size_t, const in6_addr_t *,
 176     ip_recv_attr_t *);
 177 static void     icmp_redirect_v6(mblk_t *, ip6_t *, nd_redirect_t *,
 178     ip_recv_attr_t *);
 179 static void     icmp_send_redirect_v6(mblk_t *, in6_addr_t *,
 180     in6_addr_t *, ip_recv_attr_t *);
 181 static void     icmp_send_reply_v6(mblk_t *, ip6_t *, icmp6_t *,
 182     ip_recv_attr_t *);
 183 static boolean_t        ip_source_routed_v6(ip6_t *, mblk_t *, ip_stack_t *);
 184 
 185 /*
 186  * icmp_inbound_v6 deals with ICMP messages that are handled by IP.
 187  * If the ICMP message is consumed by IP, i.e., it should not be delivered
 188  * to any IPPROTO_ICMP raw sockets, then it returns NULL.
 189  * Likewise, if the ICMP error is misformed (too short, etc), then it
 190  * returns NULL. The caller uses this to determine whether or not to send
 191  * to raw sockets.
 192  *
 193  * All error messages are passed to the matching transport stream.
 194  *
 195  * See comment for icmp_inbound_v4() on how IPsec is handled.
 196  */
 197 mblk_t *
 198 icmp_inbound_v6(mblk_t *mp, ip_recv_attr_t *ira)
 199 {
 200         icmp6_t         *icmp6;
 201         ip6_t           *ip6h;          /* Outer header */
 202         int             ip_hdr_length;  /* Outer header length */
 203         boolean_t       interested;
 204         ill_t           *ill = ira->ira_ill;
 205         ip_stack_t      *ipst = ill->ill_ipst;
 206         mblk_t          *mp_ret = NULL;
 207 
 208         ip6h = (ip6_t *)mp->b_rptr;
 209 
 210         BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInMsgs);
 211 
 212         /* Check for Martian packets  */
 213         if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_src)) {
 214                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
 215                 ip_drop_input("ipIfStatsInAddrErrors: mcast src", mp, ill);
 216                 freemsg(mp);
 217                 return (NULL);
 218         }
 219 
 220         /* Make sure ira_l2src is set for ndp_input */
 221         if (!(ira->ira_flags & IRAF_L2SRC_SET))
 222                 ip_setl2src(mp, ira, ira->ira_rill);
 223 
 224         ip_hdr_length = ira->ira_ip_hdr_length;
 225         if ((mp->b_wptr - mp->b_rptr) < (ip_hdr_length + ICMP6_MINLEN)) {
 226                 if (ira->ira_pktlen < (ip_hdr_length + ICMP6_MINLEN)) {
 227                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
 228                         ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
 229                         freemsg(mp);
 230                         return (NULL);
 231                 }
 232                 ip6h = ip_pullup(mp, ip_hdr_length + ICMP6_MINLEN, ira);
 233                 if (ip6h == NULL) {
 234                         BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
 235                         freemsg(mp);
 236                         return (NULL);
 237                 }
 238         }
 239 
 240         icmp6 = (icmp6_t *)(&mp->b_rptr[ip_hdr_length]);
 241         DTRACE_PROBE2(icmp__inbound__v6, ip6_t *, ip6h, icmp6_t *, icmp6);
 242         ip2dbg(("icmp_inbound_v6: type %d code %d\n", icmp6->icmp6_type,
 243             icmp6->icmp6_code));
 244 
 245         /*
 246          * We will set "interested" to "true" if we should pass a copy to
 247          * the transport i.e., if it is an error message.
 248          */
 249         interested = !(icmp6->icmp6_type & ICMP6_INFOMSG_MASK);
 250 
 251         switch (icmp6->icmp6_type) {
 252         case ICMP6_DST_UNREACH:
 253                 BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInDestUnreachs);
 254                 if (icmp6->icmp6_code == ICMP6_DST_UNREACH_ADMIN)
 255                         BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInAdminProhibs);
 256                 break;
 257 
 258         case ICMP6_TIME_EXCEEDED:
 259                 BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInTimeExcds);
 260                 break;
 261 
 262         case ICMP6_PARAM_PROB:
 263                 BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInParmProblems);
 264                 break;
 265 
 266         case ICMP6_PACKET_TOO_BIG:
 267                 BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInPktTooBigs);
 268                 break;
 269 
 270         case ICMP6_ECHO_REQUEST:
 271                 BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInEchos);
 272                 if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) &&
 273                     !ipst->ips_ipv6_resp_echo_mcast)
 274                         break;
 275 
 276                 /*
 277                  * We must have exclusive use of the mblk to convert it to
 278                  * a response.
 279                  * If not, we copy it.
 280                  */
 281                 if (mp->b_datap->db_ref > 1) {
 282                         mblk_t  *mp1;
 283 
 284                         mp1 = copymsg(mp);
 285                         if (mp1 == NULL) {
 286                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 287                                 ip_drop_input("ipIfStatsInDiscards - copymsg",
 288                                     mp, ill);
 289                                 freemsg(mp);
 290                                 return (NULL);
 291                         }
 292                         freemsg(mp);
 293                         mp = mp1;
 294                         ip6h = (ip6_t *)mp->b_rptr;
 295                         icmp6 = (icmp6_t *)(&mp->b_rptr[ip_hdr_length]);
 296                 }
 297 
 298                 icmp6->icmp6_type = ICMP6_ECHO_REPLY;
 299                 icmp_send_reply_v6(mp, ip6h, icmp6, ira);
 300                 return (NULL);
 301 
 302         case ICMP6_ECHO_REPLY:
 303                 BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInEchoReplies);
 304                 break;
 305 
 306         case ND_ROUTER_SOLICIT:
 307                 BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInRouterSolicits);
 308                 break;
 309 
 310         case ND_ROUTER_ADVERT:
 311                 BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInRouterAdvertisements);
 312                 break;
 313 
 314         case ND_NEIGHBOR_SOLICIT:
 315                 BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInNeighborSolicits);
 316                 ndp_input(mp, ira);
 317                 return (NULL);
 318 
 319         case ND_NEIGHBOR_ADVERT:
 320                 BUMP_MIB(ill->ill_icmp6_mib,
 321                     ipv6IfIcmpInNeighborAdvertisements);
 322                 ndp_input(mp, ira);
 323                 return (NULL);
 324 
 325         case ND_REDIRECT:
 326                 BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInRedirects);
 327 
 328                 if (ipst->ips_ipv6_ignore_redirect)
 329                         break;
 330 
 331                 /* We now allow a RAW socket to receive this. */
 332                 interested = B_TRUE;
 333                 break;
 334 
 335         /*
 336          * The next three icmp messages will be handled by MLD.
 337          * Pass all valid MLD packets up to any process(es)
 338          * listening on a raw ICMP socket.
 339          */
 340         case MLD_LISTENER_QUERY:
 341         case MLD_LISTENER_REPORT:
 342         case MLD_LISTENER_REDUCTION:
 343                 mp = mld_input(mp, ira);
 344                 return (mp);
 345         default:
 346                 break;
 347         }
 348         /*
 349          * See if there is an ICMP client to avoid an extra copymsg/freemsg
 350          * if there isn't one.
 351          */
 352         if (ipst->ips_ipcl_proto_fanout_v6[IPPROTO_ICMPV6].connf_head != NULL) {
 353                 /* If there is an ICMP client and we want one too, copy it. */
 354 
 355                 if (!interested) {
 356                         /* Caller will deliver to RAW sockets */
 357                         return (mp);
 358                 }
 359                 mp_ret = copymsg(mp);
 360                 if (mp_ret == NULL) {
 361                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 362                         ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
 363                 }
 364         } else if (!interested) {
 365                 /* Neither we nor raw sockets are interested. Drop packet now */
 366                 freemsg(mp);
 367                 return (NULL);
 368         }
 369 
 370         /*
 371          * ICMP error or redirect packet. Make sure we have enough of
 372          * the header and that db_ref == 1 since we might end up modifying
 373          * the packet.
 374          */
 375         if (mp->b_cont != NULL) {
 376                 if (ip_pullup(mp, -1, ira) == NULL) {
 377                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 378                         ip_drop_input("ipIfStatsInDiscards - ip_pullup",
 379                             mp, ill);
 380                         freemsg(mp);
 381                         return (mp_ret);
 382                 }
 383         }
 384 
 385         if (mp->b_datap->db_ref > 1) {
 386                 mblk_t  *mp1;
 387 
 388                 mp1 = copymsg(mp);
 389                 if (mp1 == NULL) {
 390                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 391                         ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
 392                         freemsg(mp);
 393                         return (mp_ret);
 394                 }
 395                 freemsg(mp);
 396                 mp = mp1;
 397         }
 398 
 399         /*
 400          * In case mp has changed, verify the message before any further
 401          * processes.
 402          */
 403         ip6h = (ip6_t *)mp->b_rptr;
 404         icmp6 = (icmp6_t *)(&mp->b_rptr[ip_hdr_length]);
 405         if (!icmp_inbound_verify_v6(mp, icmp6, ira)) {
 406                 freemsg(mp);
 407                 return (mp_ret);
 408         }
 409 
 410         switch (icmp6->icmp6_type) {
 411         case ND_REDIRECT:
 412                 icmp_redirect_v6(mp, ip6h, (nd_redirect_t *)icmp6, ira);
 413                 break;
 414         case ICMP6_PACKET_TOO_BIG:
 415                 /* Update DCE and adjust MTU is icmp header if needed */
 416                 icmp_inbound_too_big_v6(icmp6, ira);
 417                 /* FALLTHRU */
 418         default:
 419                 icmp_inbound_error_fanout_v6(mp, icmp6, ira);
 420                 break;
 421         }
 422 
 423         return (mp_ret);
 424 }
 425 
 426 /*
 427  * Send an ICMP echo reply.
 428  * The caller has already updated the payload part of the packet.
 429  * We handle the ICMP checksum, IP source address selection and feed
 430  * the packet into ip_output_simple.
 431  */
 432 static void
 433 icmp_send_reply_v6(mblk_t *mp, ip6_t *ip6h, icmp6_t *icmp6,
 434     ip_recv_attr_t *ira)
 435 {
 436         uint_t          ip_hdr_length = ira->ira_ip_hdr_length;
 437         ill_t           *ill = ira->ira_ill;
 438         ip_stack_t      *ipst = ill->ill_ipst;
 439         ip_xmit_attr_t  ixas;
 440         in6_addr_t      origsrc;
 441 
 442         /*
 443          * Remove any extension headers (do not reverse a source route)
 444          * and clear the flow id (keep traffic class for now).
 445          */
 446         if (ip_hdr_length != IPV6_HDR_LEN) {
 447                 int     i;
 448 
 449                 for (i = 0; i < IPV6_HDR_LEN; i++) {
 450                         mp->b_rptr[ip_hdr_length - i - 1] =
 451                             mp->b_rptr[IPV6_HDR_LEN - i - 1];
 452                 }
 453                 mp->b_rptr += (ip_hdr_length - IPV6_HDR_LEN);
 454                 ip6h = (ip6_t *)mp->b_rptr;
 455                 ip6h->ip6_nxt = IPPROTO_ICMPV6;
 456                 i = ntohs(ip6h->ip6_plen);
 457                 i -= (ip_hdr_length - IPV6_HDR_LEN);
 458                 ip6h->ip6_plen = htons(i);
 459                 ip_hdr_length = IPV6_HDR_LEN;
 460                 ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == msgdsize(mp));
 461         }
 462         ip6h->ip6_vcf &= ~IPV6_FLOWINFO_FLOWLABEL;
 463 
 464         /* Reverse the source and destination addresses. */
 465         origsrc = ip6h->ip6_src;
 466         ip6h->ip6_src = ip6h->ip6_dst;
 467         ip6h->ip6_dst = origsrc;
 468 
 469         /* set the hop limit */
 470         ip6h->ip6_hops = ipst->ips_ipv6_def_hops;
 471 
 472         /*
 473          * Prepare for checksum by putting icmp length in the icmp
 474          * checksum field. The checksum is calculated in ip_output
 475          */
 476         icmp6->icmp6_cksum = ip6h->ip6_plen;
 477 
 478         bzero(&ixas, sizeof (ixas));
 479         ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6;
 480         ixas.ixa_zoneid = ira->ira_zoneid;
 481         ixas.ixa_cred = kcred;
 482         ixas.ixa_cpid = NOPID;
 483         ixas.ixa_tsl = ira->ira_tsl; /* Behave as a multi-level responder */
 484         ixas.ixa_ifindex = 0;
 485         ixas.ixa_ipst = ipst;
 486         ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
 487 
 488         if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
 489                 /*
 490                  * This packet should go out the same way as it
 491                  * came in i.e in clear, independent of the IPsec
 492                  * policy for transmitting packets.
 493                  */
 494                 ixas.ixa_flags |= IXAF_NO_IPSEC;
 495         } else {
 496                 if (!ipsec_in_to_out(ira, &ixas, mp, NULL, ip6h)) {
 497                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 498                         /* Note: mp already consumed and ip_drop_packet done */
 499                         return;
 500                 }
 501         }
 502 
 503         /* Was the destination (now source) link-local? Send out same group */
 504         if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) {
 505                 ixas.ixa_flags |= IXAF_SCOPEID_SET;
 506                 if (IS_UNDER_IPMP(ill))
 507                         ixas.ixa_scopeid = ill_get_upper_ifindex(ill);
 508                 else
 509                         ixas.ixa_scopeid = ill->ill_phyint->phyint_ifindex;
 510         }
 511 
 512         if (ira->ira_flags & IRAF_MULTIBROADCAST) {
 513                 /*
 514                  * Not one or our addresses (IRE_LOCALs), thus we let
 515                  * ip_output_simple pick the source.
 516                  */
 517                 ip6h->ip6_src = ipv6_all_zeros;
 518                 ixas.ixa_flags |= IXAF_SET_SOURCE;
 519         }
 520 
 521         /* Should we send using dce_pmtu? */
 522         if (ipst->ips_ipv6_icmp_return_pmtu)
 523                 ixas.ixa_flags |= IXAF_PMTU_DISCOVERY;
 524 
 525         (void) ip_output_simple(mp, &ixas);
 526         ixa_cleanup(&ixas);
 527 
 528 }
 529 
 530 /*
 531  * Verify the ICMP messages for either for ICMP error or redirect packet.
 532  * The caller should have fully pulled up the message. If it's a redirect
 533  * packet, only basic checks on IP header will be done; otherwise, verify
 534  * the packet by looking at the included ULP header.
 535  *
 536  * Called before icmp_inbound_error_fanout_v6 is called.
 537  */
 538 static boolean_t
 539 icmp_inbound_verify_v6(mblk_t *mp, icmp6_t *icmp6, ip_recv_attr_t *ira)
 540 {
 541         ill_t           *ill = ira->ira_ill;
 542         uint16_t        hdr_length;
 543         uint8_t         *nexthdrp;
 544         uint8_t         nexthdr;
 545         ip_stack_t      *ipst = ill->ill_ipst;
 546         conn_t          *connp;
 547         ip6_t           *ip6h;  /* Inner header */
 548 
 549         ip6h = (ip6_t *)&icmp6[1];
 550         if ((uchar_t *)ip6h + IPV6_HDR_LEN > mp->b_wptr)
 551                 goto truncated;
 552 
 553         if (icmp6->icmp6_type == ND_REDIRECT) {
 554                 hdr_length = sizeof (nd_redirect_t);
 555         } else {
 556                 if ((IPH_HDR_VERSION(ip6h) != IPV6_VERSION))
 557                         goto discard_pkt;
 558                 hdr_length = IPV6_HDR_LEN;
 559         }
 560 
 561         if ((uchar_t *)ip6h + hdr_length > mp->b_wptr)
 562                 goto truncated;
 563 
 564         /*
 565          * Stop here for ICMP_REDIRECT.
 566          */
 567         if (icmp6->icmp6_type == ND_REDIRECT)
 568                 return (B_TRUE);
 569 
 570         /*
 571          * ICMP errors only.
 572          */
 573         if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_length, &nexthdrp))
 574                 goto discard_pkt;
 575         nexthdr = *nexthdrp;
 576 
 577         /* Try to pass the ICMP message to clients who need it */
 578         switch (nexthdr) {
 579         case IPPROTO_UDP:
 580                 /*
 581                  * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
 582                  * transport header.
 583                  */
 584                 if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_TP_HDR_LEN >
 585                     mp->b_wptr)
 586                         goto truncated;
 587                 break;
 588         case IPPROTO_TCP: {
 589                 tcpha_t         *tcpha;
 590 
 591                 /*
 592                  * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
 593                  * transport header.
 594                  */
 595                 if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_TP_HDR_LEN >
 596                     mp->b_wptr)
 597                         goto truncated;
 598 
 599                 tcpha = (tcpha_t *)((uchar_t *)ip6h + hdr_length);
 600                 /*
 601                  * With IPMP we need to match across group, which we do
 602                  * since we have the upper ill from ira_ill.
 603                  */
 604                 connp = ipcl_tcp_lookup_reversed_ipv6(ip6h, tcpha, TCPS_LISTEN,
 605                     ill->ill_phyint->phyint_ifindex, ipst);
 606                 if (connp == NULL)
 607                         goto discard_pkt;
 608 
 609                 if ((connp->conn_verifyicmp != NULL) &&
 610                     !connp->conn_verifyicmp(connp, tcpha, NULL, icmp6, ira)) {
 611                         CONN_DEC_REF(connp);
 612                         goto discard_pkt;
 613                 }
 614                 CONN_DEC_REF(connp);
 615                 break;
 616         }
 617         case IPPROTO_SCTP:
 618                 /*
 619                  * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
 620                  * transport header.
 621                  */
 622                 if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_TP_HDR_LEN >
 623                     mp->b_wptr)
 624                         goto truncated;
 625                 break;
 626         case IPPROTO_ESP:
 627         case IPPROTO_AH:
 628                 break;
 629         case IPPROTO_ENCAP:
 630         case IPPROTO_IPV6: {
 631                 /* Look for self-encapsulated packets that caused an error */
 632                 ip6_t *in_ip6h;
 633 
 634                 in_ip6h = (ip6_t *)((uint8_t *)ip6h + hdr_length);
 635                 if ((uint8_t *)in_ip6h + (nexthdr == IPPROTO_ENCAP ?
 636                     sizeof (ipha_t) : sizeof (ip6_t)) > mp->b_wptr)
 637                         goto truncated;
 638                 break;
 639         }
 640         default:
 641                 break;
 642         }
 643 
 644         return (B_TRUE);
 645 
 646 discard_pkt:
 647         /* Bogus ICMP error. */
 648         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 649         return (B_FALSE);
 650 
 651 truncated:
 652         /* We pulled up everthing already. Must be truncated */
 653         BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
 654         return (B_FALSE);
 655 }
 656 
 657 /*
 658  * Process received IPv6 ICMP Packet too big.
 659  * The caller is responsible for validating the packet before passing it in
 660  * and also to fanout the ICMP error to any matching transport conns. Assumes
 661  * the message has been fully pulled up.
 662  *
 663  * Before getting here, the caller has called icmp_inbound_verify_v6()
 664  * that should have verified with ULP to prevent undoing the changes we're
 665  * going to make to DCE. For example, TCP might have verified that the packet
 666  * which generated error is in the send window.
 667  *
 668  * In some cases modified this MTU in the ICMP header packet; the caller
 669  * should pass to the matching ULP after this returns.
 670  */
 671 static void
 672 icmp_inbound_too_big_v6(icmp6_t *icmp6, ip_recv_attr_t *ira)
 673 {
 674         uint32_t        mtu;
 675         dce_t           *dce;
 676         ill_t           *ill = ira->ira_ill; /* Upper ill if IPMP */
 677         ip_stack_t      *ipst = ill->ill_ipst;
 678         int             old_max_frag;
 679         in6_addr_t      final_dst;
 680         ip6_t           *ip6h;  /* Inner IP header */
 681 
 682         /* Caller has already pulled up everything. */
 683         ip6h = (ip6_t *)&icmp6[1];
 684         final_dst = ip_get_dst_v6(ip6h, NULL, NULL);
 685 
 686         /*
 687          * For link local destinations matching simply on address is not
 688          * sufficient. Same link local addresses for different ILL's is
 689          * possible.
 690          */
 691         if (IN6_IS_ADDR_LINKSCOPE(&final_dst)) {
 692                 dce = dce_lookup_and_add_v6(&final_dst,
 693                     ill->ill_phyint->phyint_ifindex, ipst);
 694         } else {
 695                 dce = dce_lookup_and_add_v6(&final_dst, 0, ipst);
 696         }
 697         if (dce == NULL) {
 698                 /* Couldn't add a unique one - ENOMEM */
 699                 if (ip_debug > 2) {
 700                         /* ip1dbg */
 701                         pr_addr_dbg("icmp_inbound_too_big_v6:"
 702                             "no dce for dst %s\n", AF_INET6,
 703                             &final_dst);
 704                 }
 705                 return;
 706         }
 707 
 708         mtu = ntohl(icmp6->icmp6_mtu);
 709 
 710         mutex_enter(&dce->dce_lock);
 711         if (dce->dce_flags & DCEF_PMTU)
 712                 old_max_frag = dce->dce_pmtu;
 713         else if (IN6_IS_ADDR_MULTICAST(&final_dst))
 714                 old_max_frag = ill->ill_mc_mtu;
 715         else
 716                 old_max_frag = ill->ill_mtu;
 717 
 718         if (mtu >= IPV6_MIN_MTU) {
 719                 ip1dbg(("Received mtu from router: %d\n", mtu));
 720                 DTRACE_PROBE1(icmp6__received__mtu, uint32_t, mtu);
 721                 dce->dce_pmtu = MIN(old_max_frag, mtu);
 722                 icmp6->icmp6_mtu = htonl(dce->dce_pmtu);
 723 
 724                 /* We now have a PMTU for sure */
 725                 dce->dce_flags |= DCEF_PMTU;
 726                 dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
 727         } else {
 728                 /*
 729                  * RFC 8021 suggests to ignore messages where mtu is
 730                  * less than the IPv6 minimum.
 731                  */
 732                 ip1dbg(("Received mtu less than IPv6 "
 733                     "min mtu %d: %d\n", IPV6_MIN_MTU, mtu));
 734                 DTRACE_PROBE1(icmp6__too__small__mtu, uint32_t, mtu);
 735         }
 736 
 737         mutex_exit(&dce->dce_lock);
 738         /*
 739          * After dropping the lock the new value is visible to everyone.
 740          * Then we bump the generation number so any cached values reinspect
 741          * the dce_t.
 742          */
 743         dce_increment_generation(dce);
 744         dce_refrele(dce);
 745 }
 746 
/*
 * Fanout received ICMPv6 error packets to the transports.
 * Assumes the IPv6 plus ICMPv6 headers have been pulled up but nothing else.
 *
 * The caller must have called icmp_inbound_verify_v6.
 *
 * Arguments:
 *      mp      The received message; asserted below to be a single mblk.
 *      icmp6   The ICMPv6 header inside mp.  The IPv6 header of the packet
 *              that triggered the error is taken to start right after it.
 *      ira     Receive attributes.  ira_protocol is overwritten with the
 *              offending packet's upper-layer protocol, and IRAF_ICMP_ERROR
 *              is set in ira_flags only around each hand-off.
 *
 * The upper-layer protocol of the offending packet selects the consumer:
 *      UDP             ip_fanout_udp_multi_v6()
 *      TCP             the conn found by a reversed lookup (TCP proper via
 *                      its squeue, anything else via conn_recv)
 *      SCTP            ip_fanout_sctp()
 *      ESP/AH          the matching ipsec ICMP error handler, after which
 *                      the resulting message is verified and fanned out
 *                      again by recursing
 *      IPv6-in-IPv6    a self-encapsulated packet has its 2nd header
 *                      stripped and is fanned out again by recursing;
 *                      otherwise handled like ENCAP
 *      ENCAP           an IP tunnel conn if one classifies, else as default
 *      default         ip_fanout_proto_v6()
 *
 * Nothing is returned.  After each hand-off this function returns without
 * touching mp again; packets it cannot place are freed at drop_pkt.
 */
void
icmp_inbound_error_fanout_v6(mblk_t *mp, icmp6_t *icmp6, ip_recv_attr_t *ira)
{
        uint16_t        *up;    /* Pointer to ports in ULP header */
        uint32_t        ports;  /* reversed ports for fanout */
        ip6_t           rip6h;  /* With reversed addresses */
        ip6_t           *ip6h;  /* Inner IP header */
        uint16_t        hdr_length; /* Inner IP header length */
        uint8_t         *nexthdrp;
        uint8_t         nexthdr;
        tcpha_t         *tcpha;
        conn_t          *connp;
        ill_t           *ill = ira->ira_ill; /* Upper in the case of IPMP */
        ip_stack_t      *ipst = ill->ill_ipst;
        ipsec_stack_t   *ipss = ipst->ips_netstack->netstack_ipsec;

        /* Caller has already pulled up everything. */
        ip6h = (ip6_t *)&icmp6[1];
        ASSERT(mp->b_cont == NULL);
        ASSERT((uchar_t *)&ip6h[1] <= mp->b_wptr);

        /*
         * Find the length of the inner IPv6 header(s) and the protocol that
         * follows them.  hdr_length is used below as the offset of the ULP
         * header from ip6h.
         */
        if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_length, &nexthdrp))
                goto drop_pkt;
        nexthdr = *nexthdrp;
        ira->ira_protocol = nexthdr;

        /*
         * We need a separate IP header with the source and destination
         * addresses reversed to do fanout/classification because the ip6h in
         * the ICMPv6 error is in the form we sent it out.
         * Only the addresses and next header of rip6h are filled in.
         */
        rip6h.ip6_src = ip6h->ip6_dst;
        rip6h.ip6_dst = ip6h->ip6_src;
        rip6h.ip6_nxt = nexthdr;

        /* Try to pass the ICMP message to clients who need it */
        switch (nexthdr) {
        case IPPROTO_UDP: {
                /* Attempt to find a client stream based on port. */
                up = (uint16_t *)((uchar_t *)ip6h + hdr_length);

                /* Note that we send error to all matches. */
                ira->ira_flags |= IRAF_ICMP_ERROR;
                ip_fanout_udp_multi_v6(mp, &rip6h, up[0], up[1], ira);
                ira->ira_flags &= ~IRAF_ICMP_ERROR;
                return;
        }
        case IPPROTO_TCP: {
                /*
                 * Attempt to find a client stream based on port.
                 * Note that we do a reverse lookup since the header is
                 * in the form we sent it out.
                 */
                tcpha = (tcpha_t *)((uchar_t *)ip6h + hdr_length);
                /*
                 * With IPMP we need to match across group, which we do
                 * since we have the upper ill from ira_ill.
                 */
                connp = ipcl_tcp_lookup_reversed_ipv6(ip6h, tcpha,
                    TCPS_LISTEN, ill->ill_phyint->phyint_ifindex, ipst);
                if (connp == NULL) {
                        goto drop_pkt;
                }

                /*
                 * Run the inbound IPsec policy check when the conn has
                 * inbound policy or the packet arrived under IPsec.  A NULL
                 * result means the message is gone, so only the conn
                 * reference is left to release.
                 */
                if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) ||
                    (ira->ira_flags & IRAF_IPSEC_SECURE)) {
                        mp = ipsec_check_inbound_policy(mp, connp,
                            NULL, ip6h, ira);
                        if (mp == NULL) {
                                BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
                                /* Note that mp is NULL */
                                ip_drop_input("ipIfStatsInDiscards", mp, ill);
                                CONN_DEC_REF(connp);
                                return;
                        }
                }

                ira->ira_flags |= IRAF_ICMP_ERROR;
                if (IPCL_IS_TCP(connp)) {
                        /*
                         * NOTE(review): no CONN_DEC_REF on this branch,
                         * unlike the raw-socket branch below; presumably
                         * the squeue path takes over the conn reference.
                         * Confirm against SQUEUE_ENTER_ONE.
                         */
                        SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
                            connp->conn_recvicmp, connp, ira, SQ_FILL,
                            SQTAG_TCP6_INPUT_ICMP_ERR);
                } else {
                        /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */
                        ill_t *rill = ira->ira_rill;

                        /*
                         * ira_ill and ira_rill are cleared for the duration
                         * of the conn_recv call and restored afterwards.
                         */
                        ira->ira_ill = ira->ira_rill = NULL;
                        (connp->conn_recv)(connp, mp, NULL, ira);
                        CONN_DEC_REF(connp);
                        ira->ira_ill = ill;
                        ira->ira_rill = rill;
                }
                ira->ira_flags &= ~IRAF_ICMP_ERROR;
                return;

        }
        case IPPROTO_SCTP:
                up = (uint16_t *)((uchar_t *)ip6h + hdr_length);
                /* Find a SCTP client stream for this packet. */
                /* The two 16-bit ports are stored swapped into ports. */
                ((uint16_t *)&ports)[0] = up[1];
                ((uint16_t *)&ports)[1] = up[0];

                ira->ira_flags |= IRAF_ICMP_ERROR;
                ip_fanout_sctp(mp, NULL, &rip6h, ports, ira);
                ira->ira_flags &= ~IRAF_ICMP_ERROR;
                return;

        case IPPROTO_ESP:
        case IPPROTO_AH:
                if (!ipsec_loaded(ipss)) {
                        ip_proto_not_sup(mp, ira);
                        return;
                }

                if (nexthdr == IPPROTO_ESP)
                        mp = ipsecesp_icmp_error(mp, ira);
                else
                        mp = ipsecah_icmp_error(mp, ira);
                if (mp == NULL)
                        return;

                /* Just in case ipsec didn't preserve the NULL b_cont */
                if (mp->b_cont != NULL) {
                        if (!pullupmsg(mp, -1))
                                goto drop_pkt;
                }

                /*
                 * If successful, the mp has been modified to not include
                 * the ESP/AH header so we can fanout to the ULP's icmp
                 * error handler.
                 */
                if (mp->b_wptr - mp->b_rptr < IPV6_HDR_LEN)
                        goto drop_pkt;

                /*
                 * From here on ip6h is the outer header at the start of the
                 * message and hdr_length is the offset of the ICMPv6 header.
                 */
                ip6h = (ip6_t *)mp->b_rptr;
                /* Don't call hdr_length_v6() unless you have to. */
                if (ip6h->ip6_nxt != IPPROTO_ICMPV6)
                        hdr_length = ip_hdr_length_v6(mp, ip6h);
                else
                        hdr_length = IPV6_HDR_LEN;

                /* Verify the modified message before any further processes. */
                icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]);
                if (!icmp_inbound_verify_v6(mp, icmp6, ira)) {
                        freemsg(mp);
                        return;
                }

                /* Fan out again on what was behind the ESP/AH header. */
                icmp_inbound_error_fanout_v6(mp, icmp6, ira);
                return;

        case IPPROTO_IPV6: {
                /* Look for self-encapsulated packets that caused an error */
                ip6_t *in_ip6h;

                in_ip6h = (ip6_t *)((uint8_t *)ip6h + hdr_length);

                /*
                 * "Self-encapsulated" here means the 2nd and 3rd IPv6
                 * headers carry identical source and destination addresses.
                 */
                if (IN6_ARE_ADDR_EQUAL(&in_ip6h->ip6_src, &ip6h->ip6_src) &&
                    IN6_ARE_ADDR_EQUAL(&in_ip6h->ip6_dst, &ip6h->ip6_dst)) {
                        /*
                         * Self-encapsulated case. As in the ipv4 case,
                         * we need to strip the 2nd IP header. Since mp
                         * is already pulled-up, we can simply bcopy
                         * the 3rd header + data over the 2nd header.
                         */
                        uint16_t unused_len;

                        /*
                         * Make sure we don't do recursion more than once.
                         */
                        if (!ip_hdr_length_nexthdr_v6(mp, in_ip6h,
                            &unused_len, &nexthdrp) ||
                            *nexthdrp == IPPROTO_IPV6) {
                                goto drop_pkt;
                        }

                        /*
                         * Copy the 3rd header + remaining data on top
                         * of the 2nd header.
                         */
                        bcopy(in_ip6h, ip6h, mp->b_wptr - (uchar_t *)in_ip6h);

                        /*
                         * Subtract length of the 2nd header.
                         */
                        mp->b_wptr -= hdr_length;

                        /*
                         * Re-locate the ICMPv6 header starting from the
                         * outer header at the front of the message.
                         */
                        ip6h = (ip6_t *)mp->b_rptr;
                        /* Don't call hdr_length_v6() unless you have to. */
                        if (ip6h->ip6_nxt != IPPROTO_ICMPV6)
                                hdr_length = ip_hdr_length_v6(mp, ip6h);
                        else
                                hdr_length = IPV6_HDR_LEN;

                        /*
                         * Verify the modified message before any further
                         * processes.
                         */
                        icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]);
                        if (!icmp_inbound_verify_v6(mp, icmp6, ira)) {
                                freemsg(mp);
                                return;
                        }

                        /*
                         * Now recurse, and see what I _really_ should be
                         * doing here.
                         */
                        icmp_inbound_error_fanout_v6(mp, icmp6, ira);
                        return;
                }
                /* FALLTHRU */
        }
        case IPPROTO_ENCAP:
                /*
                 * Classify on the reversed addresses; a matching tunnel
                 * conn gets the error and its reference is dropped here.
                 */
                if ((connp = ipcl_iptun_classify_v6(&rip6h.ip6_src,
                    &rip6h.ip6_dst, ipst)) != NULL) {
                        ira->ira_flags |= IRAF_ICMP_ERROR;
                        connp->conn_recvicmp(connp, mp, NULL, ira);
                        CONN_DEC_REF(connp);
                        ira->ira_flags &= ~IRAF_ICMP_ERROR;
                        return;
                }
                /*
                 * No IP tunnel is interested, fallthrough and see
                 * if a raw socket will want it.
                 */
                /* FALLTHRU */
        default:
                ira->ira_flags |= IRAF_ICMP_ERROR;
                ASSERT(ira->ira_protocol == nexthdr);
                ip_fanout_proto_v6(mp, &rip6h, ira);
                ira->ira_flags &= ~IRAF_ICMP_ERROR;
                return;
        }
        /* NOTREACHED */
drop_pkt:
        /* Common exit for packets we could not parse or place. */
        BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
        ip1dbg(("icmp_inbound_error_fanout_v6: drop pkt\n"));
        freemsg(mp);
}
 994 
/*
 * Process received IPv6 ICMP Redirect messages.
 * Assumes the caller has verified that the headers are in the pulled up mblk.
 * Consumes mp.
 *
 * Arguments:
 *      mp      The received message; freed on every path.
 *      ip6h    IPv6 header of the redirect (source and hop limit are checked).
 *      rd      The redirect itself; any options follow it up to b_wptr.
 *      ira     Receive attributes; ira_rill names the receiving ill.
 *
 * Outline:
 *      1. Validate the redirect (addresses, hop limit, code, length,
 *         gateway, options).
 *      2. Look up the route currently used for the destination and require
 *         that its gateway is the sender of the redirect.
 *      3. If a target link-layer address option is present, create or
 *         refresh the neighbor cache entry for the new first hop.
 *      4. Create and add a dynamic host route: through the new gateway when
 *         redirected to a router, otherwise an on-link entry.
 *      5. Notify routing sockets and remove a previous dynamic IRE_HOST
 *         found for the destination.
 *
 * Two ill pointers are kept.  "ill" stays the ira_rill we started with and
 * is only used for MIB counters and drop logging.  "rill" is swapped for the
 * IPMP upper ill when the receiving ill is under IPMP; that hold is released
 * at fail_redirect.
 */
/* ARGSUSED */
static void
icmp_redirect_v6(mblk_t *mp, ip6_t *ip6h, nd_redirect_t *rd,
    ip_recv_attr_t *ira)
{
        ire_t           *ire, *nire;
        ire_t           *prev_ire = NULL;
        ire_t           *redir_ire;
        in6_addr_t      *src, *dst, *gateway;
        nd_opt_hdr_t    *opt;
        nce_t           *nce;
        int             ncec_flags = 0;
        int             err = 0;
        boolean_t       redirect_to_router = B_FALSE;
        int             len;
        int             optlen;
        ill_t           *ill = ira->ira_rill;
        ill_t           *rill = ira->ira_rill;
        ip_stack_t      *ipst = ill->ill_ipst;

        /*
         * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill
         * and make it be the IPMP upper so avoid being confused by a packet
         * addressed to a unicast address on a different ill.
         */
        if (IS_UNDER_IPMP(rill)) {
                rill = ipmp_ill_hold_ipmp_ill(rill);
                if (rill == NULL) {
                        /*
                         * Nothing has been acquired yet, so free and return
                         * directly instead of going through fail_redirect
                         * (which would dereference the NULL rill).
                         */
                        BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects);
                        ip_drop_input("ipv6IfIcmpInBadRedirects - IPMP ill",
                            mp, ill);
                        freemsg(mp);
                        return;
                }
                ASSERT(rill != ira->ira_rill);
        }

        /* len covers the redirect header plus any trailing options. */
        len = mp->b_wptr - (uchar_t *)rd;
        src = &ip6h->ip6_src;
        dst = &rd->nd_rd_dst;
        gateway = &rd->nd_rd_target;

        /* Verify if it is a valid redirect */
        if (!IN6_IS_ADDR_LINKLOCAL(src) ||
            (ip6h->ip6_hops != IPV6_MAX_HOPS) ||
            (rd->nd_rd_code != 0) ||
            (len < sizeof (nd_redirect_t)) ||
            (IN6_IS_ADDR_V4MAPPED(dst)) ||
            (IN6_IS_ADDR_MULTICAST(dst))) {
                BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects);
                ip_drop_input("ipv6IfIcmpInBadRedirects - addr/len", mp, ill);
                goto fail_redirect;
        }

        /* The target must be link-local or the destination itself. */
        if (!(IN6_IS_ADDR_LINKLOCAL(gateway) ||
            IN6_ARE_ADDR_EQUAL(gateway, dst))) {
                BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects);
                ip_drop_input("ipv6IfIcmpInBadRedirects - bad gateway",
                    mp, ill);
                goto fail_redirect;
        }

        optlen = len - sizeof (nd_redirect_t);
        if (optlen != 0) {
                if (!ndp_verify_optlen((nd_opt_hdr_t *)&rd[1], optlen)) {
                        BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects);
                        ip_drop_input("ipv6IfIcmpInBadRedirects - options",
                            mp, ill);
                        goto fail_redirect;
                }
        }

        /*
         * A target different from the destination means we are being
         * pointed at another router; an identical target means the
         * destination is on-link.
         */
        if (!IN6_ARE_ADDR_EQUAL(gateway, dst)) {
                redirect_to_router = B_TRUE;
                ncec_flags |= NCE_F_ISROUTER;
        } else {
                gateway = dst;  /* Add nce for dst */
        }


        /*
         * Verify that the IP source address of the redirect is
         * the same as the current first-hop router for the specified
         * ICMP destination address.
         * Also, Make sure we had a route for the dest in question and
         * that route was pointing to the old gateway (the source of the
         * redirect packet.)
         * We do longest match and then compare ire_gateway_addr_v6 below.
         */
        prev_ire = ire_ftable_lookup_v6(dst, 0, 0, 0, rill,
            ALL_ZONES, NULL, MATCH_IRE_ILL, 0, ipst, NULL);

        /*
         * Check that
         *      the redirect was not from ourselves
         *      old gateway is still directly reachable
         */
        if (prev_ire == NULL ||
            (prev_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) ||
            (prev_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
            !IN6_ARE_ADDR_EQUAL(src, &prev_ire->ire_gateway_addr_v6)) {
                BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects);
                ip_drop_input("ipv6IfIcmpInBadRedirects - ire", mp, ill);
                goto fail_redirect;
        }

        ASSERT(prev_ire->ire_ill != NULL);
        if (prev_ire->ire_ill->ill_flags & ILLF_NONUD)
                ncec_flags |= NCE_F_NONUD;

        /*
         * If the redirect carries a target link-layer address option, use
         * it to create (in ND_STALE state) or update the neighbor cache
         * entry for the new first hop.
         */
        opt = (nd_opt_hdr_t *)&rd[1];
        opt = ndp_get_option(opt, optlen, ND_OPT_TARGET_LINKADDR);
        if (opt != NULL) {
                err = nce_lookup_then_add_v6(rill,
                    (uchar_t *)&opt[1],             /* Link layer address */
                    rill->ill_phys_addr_length,
                    gateway, ncec_flags, ND_STALE, &nce);
                switch (err) {
                case 0:
                        nce_refrele(nce);
                        break;
                case EEXIST:
                        /*
                         * Check to see if link layer address has changed and
                         * process the ncec_state accordingly.
                         */
                        nce_process(nce->nce_common,
                            (uchar_t *)&opt[1], 0, B_FALSE);
                        nce_refrele(nce);
                        break;
                default:
                        ip1dbg(("icmp_redirect_v6: NCE create failed %d\n",
                            err));
                        goto fail_redirect;
                }
        }
        if (redirect_to_router) {
                ASSERT(IN6_IS_ADDR_LINKLOCAL(gateway));

                /*
                 * Create a Route Association.  This will allow us to remember
                 * a router told us to use the particular gateway.
                 */
                ire = ire_create_v6(
                    dst,
                    &ipv6_all_ones,         /* mask */
                    gateway,                    /* gateway addr */
                    IRE_HOST,
                    prev_ire->ire_ill,
                    ALL_ZONES,
                    (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST),
                    NULL,
                    ipst);
        } else {
                ipif_t *ipif;
                in6_addr_t gw;

                /*
                 * Just create an on link entry, i.e. interface route.
                 * The gateway field is our link-local on the ill.
                 */
                /*
                 * The ipif list is walked under ill_lock, and the address
                 * is copied out before the lock is dropped.
                 */
                mutex_enter(&rill->ill_lock);
                for (ipif = rill->ill_ipif; ipif != NULL;
                    ipif = ipif->ipif_next) {
                        if (!(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
                            IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr))
                                break;
                }
                if (ipif == NULL) {
                        /* We have no link-local address! */
                        mutex_exit(&rill->ill_lock);
                        goto fail_redirect;
                }
                gw = ipif->ipif_v6lcl_addr;
                mutex_exit(&rill->ill_lock);

                ire = ire_create_v6(
                    dst,                                /* gateway == dst */
                    &ipv6_all_ones,                 /* mask */
                    &gw,                            /* gateway addr */
                    rill->ill_net_type,                      /* IF_[NO]RESOLVER */
                    prev_ire->ire_ill,
                    ALL_ZONES,
                    (RTF_DYNAMIC | RTF_HOST),
                    NULL,
                    ipst);
        }

        if (ire == NULL)
                goto fail_redirect;

        nire = ire_add(ire);
        /* Check if it was a duplicate entry */
        if (nire != NULL && nire != ire) {
                /*
                 * ire_add() gave back a different ire than the one we
                 * created.  Delete and release it, and treat the redirect
                 * as not installed (no routing socket message below).
                 */
                ASSERT(nire->ire_identical_ref > 1);
                ire_delete(nire);
                ire_refrele(nire);
                nire = NULL;
        }
        ire = nire;
        if (ire != NULL) {
                ire_refrele(ire);               /* Held in ire_add */

                /* tell routing sockets that we received a redirect */
                ip_rts_change_v6(RTM_REDIRECT,
                    &rd->nd_rd_dst,
                    &rd->nd_rd_target,
                    &ipv6_all_ones, 0, src,
                    (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), 0,
                    (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_AUTHOR), ipst);

                /*
                 * Delete any existing IRE_HOST type ires for this destination.
                 * This together with the added IRE has the effect of
                 * modifying an existing redirect.
                 */
                /*
                 * NOTE(review): src is passed together with MATCH_IRE_GW,
                 * so this presumably matches only an IRE_HOST whose gateway
                 * is the router that sent the redirect; confirm against
                 * ire_ftable_lookup_v6().  Only RTF_DYNAMIC entries are
                 * deleted.
                 */
                redir_ire = ire_ftable_lookup_v6(dst, 0, src, IRE_HOST,
                    prev_ire->ire_ill, ALL_ZONES, NULL,
                    (MATCH_IRE_GW | MATCH_IRE_TYPE | MATCH_IRE_ILL), 0, ipst,
                    NULL);

                if (redir_ire != NULL) {
                        if (redir_ire->ire_flags & RTF_DYNAMIC)
                                ire_delete(redir_ire);
                        ire_refrele(redir_ire);
                }
        }

        /*
         * Success: drop prev_ire here and clear the pointer so the shared
         * cleanup below, which the success path falls into, does not
         * release it a second time.
         */
        ire_refrele(prev_ire);
        prev_ire = NULL;

fail_redirect:
        /* Shared exit: release what is still held and consume mp. */
        if (prev_ire != NULL)
                ire_refrele(prev_ire);
        freemsg(mp);
        if (rill != ira->ira_rill)
                ill_refrele(rill);
}
1238 
1239 /*
1240  * Build and ship an IPv6 ICMP message using the packet data in mp,
1241  * and the ICMP header pointed to by "stuff".  (May be called as
1242  * writer.)
1243  * Note: assumes that icmp_pkt_err_ok_v6 has been called to
1244  * verify that an icmp error packet can be sent.
1245  *
1246  * If v6src_ptr is set use it as a source. Otherwise select a reasonable
1247  * source address (see above function).
1248  */
1249 static void
1250 icmp_pkt_v6(mblk_t *mp, void *stuff, size_t len,
1251     const in6_addr_t *v6src_ptr, ip_recv_attr_t *ira)
1252 {
1253         ip6_t           *ip6h;
1254         in6_addr_t      v6dst;
1255         size_t          len_needed;
1256         size_t          msg_len;
1257         mblk_t          *mp1;
1258         icmp6_t         *icmp6;
1259         in6_addr_t      v6src;
1260         ill_t           *ill = ira->ira_ill;
1261         ip_stack_t      *ipst = ill->ill_ipst;
1262         ip_xmit_attr_t  ixas;
1263 
1264         ip6h = (ip6_t *)mp->b_rptr;
1265 
1266         bzero(&ixas, sizeof (ixas));
1267         ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6;
1268         ixas.ixa_zoneid = ira->ira_zoneid;
1269         ixas.ixa_ifindex = 0;
1270         ixas.ixa_ipst = ipst;
1271         ixas.ixa_cred = kcred;
1272         ixas.ixa_cpid = NOPID;
1273         ixas.ixa_tsl = ira->ira_tsl; /* Behave as a multi-level responder */
1274         ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1275 
1276         /*
1277          * If the source of the original packet was link-local, then
1278          * make sure we send on the same ill (group) as we received it on.
1279          */
1280         if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) {
1281                 ixas.ixa_flags |= IXAF_SCOPEID_SET;
1282                 if (IS_UNDER_IPMP(ill))
1283                         ixas.ixa_scopeid = ill_get_upper_ifindex(ill);
1284                 else
1285                         ixas.ixa_scopeid = ill->ill_phyint->phyint_ifindex;
1286         }
1287 
1288         if (ira->ira_flags & IRAF_IPSEC_SECURE) {
1289                 /*
1290                  * Apply IPsec based on how IPsec was applied to
1291                  * the packet that had the error.
1292                  *
1293                  * If it was an outbound packet that caused the ICMP
1294                  * error, then the caller will have setup the IRA
1295                  * appropriately.
1296                  */
1297                 if (!ipsec_in_to_out(ira, &ixas, mp, NULL, ip6h)) {
1298                         BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
1299                         /* Note: mp already consumed and ip_drop_packet done */
1300                         return;
1301                 }
1302         } else {
1303                 /*
1304                  * This is in clear. The icmp message we are building
1305                  * here should go out in clear, independent of our policy.
1306                  */
1307                 ixas.ixa_flags |= IXAF_NO_IPSEC;
1308         }
1309 
1310         /*
1311          * If the caller specified the source we use that.
1312          * Otherwise, if the packet was for one of our unicast addresses, make
1313          * sure we respond with that as the source. Otherwise
1314          * have ip_output_simple pick the source address.
1315          */
1316         if (v6src_ptr != NULL) {
1317                 v6src = *v6src_ptr;
1318         } else {
1319                 ire_t *ire;
1320                 uint_t match_flags = MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY;
1321 
1322                 if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src) ||
1323                     IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst))
1324                         match_flags |= MATCH_IRE_ILL;
1325 
1326                 ire = ire_ftable_lookup_v6(&ip6h->ip6_dst, 0, 0,
1327                     (IRE_LOCAL|IRE_LOOPBACK), ill, ira->ira_zoneid, NULL,
1328                     match_flags, 0, ipst, NULL);
1329                 if (ire != NULL) {
1330                         v6src = ip6h->ip6_dst;
1331                         ire_refrele(ire);
1332                 } else {
1333                         v6src = ipv6_all_zeros;
1334                         ixas.ixa_flags |= IXAF_SET_SOURCE;
1335                 }
1336         }
1337         v6dst = ip6h->ip6_src;
1338         len_needed = ipst->ips_ipv6_icmp_return - IPV6_HDR_LEN - len;
1339         msg_len = msgdsize(mp);
1340         if (msg_len > len_needed) {
1341                 if (!adjmsg(mp, len_needed - msg_len)) {
1342                         BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutErrors);
1343                         freemsg(mp);
1344                         return;
1345                 }
1346                 msg_len = len_needed;
1347         }
1348         mp1 = allocb(IPV6_HDR_LEN + len, BPRI_MED);
1349         if (mp1 == NULL) {
1350                 BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutErrors);
1351                 freemsg(mp);
1352                 return;
1353         }
1354         mp1->b_cont = mp;
1355         mp = mp1;
1356 
1357         /*
1358          * Set IXAF_TRUSTED_ICMP so we can let the ICMP messages this
1359          * node generates be accepted in peace by all on-host destinations.
1360          * If we do NOT assume that all on-host destinations trust
1361          * self-generated ICMP messages, then rework here, ip6.c, and spd.c.
1362          * (Look for IXAF_TRUSTED_ICMP).
1363          */
1364         ixas.ixa_flags |= IXAF_TRUSTED_ICMP;
1365 
1366         ip6h = (ip6_t *)mp->b_rptr;
1367         mp1->b_wptr = (uchar_t *)ip6h + (IPV6_HDR_LEN + len);
1368 
1369         ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
1370         ip6h->ip6_nxt = IPPROTO_ICMPV6;
1371         ip6h->ip6_hops = ipst->ips_ipv6_def_hops;
1372         ip6h->ip6_dst = v6dst;
1373         ip6h->ip6_src = v6src;
1374         msg_len += IPV6_HDR_LEN + len;
1375         if (msg_len > IP_MAXPACKET + IPV6_HDR_LEN) {
1376                 (void) adjmsg(mp, IP_MAXPACKET + IPV6_HDR_LEN - msg_len);
1377                 msg_len = IP_MAXPACKET + IPV6_HDR_LEN;
1378         }
1379         ip6h->ip6_plen = htons((uint16_t)(msgdsize(mp) - IPV6_HDR_LEN));
1380         icmp6 = (icmp6_t *)&ip6h[1];
1381         bcopy(stuff, (char *)icmp6, len);
1382         /*
1383          * Prepare for checksum by putting icmp length in the icmp
1384          * checksum field. The checksum is calculated in ip_output_wire_v6.
1385          */
1386         icmp6->icmp6_cksum = ip6h->ip6_plen;
1387         if (icmp6->icmp6_type == ND_REDIRECT) {
1388                 ip6h->ip6_hops = IPV6_MAX_HOPS;
1389         }
1390 
1391         (void) ip_output_simple(mp, &ixas);
1392         ixa_cleanup(&ixas);
1393 }
1394 
1395 /*
1396  * Update the output mib when ICMPv6 packets are sent.
1397  */
1398 void
1399 icmp_update_out_mib_v6(ill_t *ill, icmp6_t *icmp6)
1400 {
1401         BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutMsgs);
1402 
1403         switch (icmp6->icmp6_type) {
1404         case ICMP6_DST_UNREACH:
1405                 BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutDestUnreachs);
1406                 if (icmp6->icmp6_code == ICMP6_DST_UNREACH_ADMIN)
1407                         BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutAdminProhibs);
1408                 break;
1409 
1410         case ICMP6_TIME_EXCEEDED:
1411                 BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutTimeExcds);
1412                 break;
1413 
1414         case ICMP6_PARAM_PROB:
1415                 BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutParmProblems);
1416                 break;
1417 
1418         case ICMP6_PACKET_TOO_BIG:
1419                 BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutPktTooBigs);
1420                 break;
1421 
1422         case ICMP6_ECHO_REQUEST:
1423                 BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutEchos);
1424                 break;
1425 
1426         case ICMP6_ECHO_REPLY:
1427                 BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutEchoReplies);
1428                 break;
1429 
1430         case ND_ROUTER_SOLICIT:
1431                 BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutRouterSolicits);
1432                 break;
1433 
1434         case ND_ROUTER_ADVERT:
1435                 BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutRouterAdvertisements);
1436                 break;
1437 
1438         case ND_NEIGHBOR_SOLICIT:
1439                 BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutNeighborSolicits);
1440                 break;
1441 
1442         case ND_NEIGHBOR_ADVERT:
1443                 BUMP_MIB(ill->ill_icmp6_mib,
1444                     ipv6IfIcmpOutNeighborAdvertisements);
1445                 break;
1446 
1447         case ND_REDIRECT:
1448                 BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutRedirects);
1449                 break;
1450 
1451         case MLD_LISTENER_QUERY:
1452                 BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutGroupMembQueries);
1453                 break;
1454 
1455         case MLD_LISTENER_REPORT:
1456         case MLD_V2_LISTENER_REPORT:
1457                 BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutGroupMembResponses);
1458                 break;
1459 
1460         case MLD_LISTENER_REDUCTION:
1461                 BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutGroupMembReductions);
1462                 break;
1463         }
1464 }
1465 
1466 /*
1467  * Check if it is ok to send an ICMPv6 error packet in
1468  * response to the IP packet in mp.
1469  * Free the message and return null if no
1470  * ICMP error packet should be sent.
1471  */
1472 static mblk_t *
1473 icmp_pkt_err_ok_v6(mblk_t *mp, boolean_t mcast_ok, ip_recv_attr_t *ira)
1474 {
1475         ill_t           *ill = ira->ira_ill;
1476         ip_stack_t      *ipst = ill->ill_ipst;
1477         boolean_t       llbcast;
1478         ip6_t           *ip6h;
1479 
1480         if (!mp)
1481                 return (NULL);
1482 
1483         /* We view multicast and broadcast as the same.. */
1484         llbcast = (ira->ira_flags &
1485             (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) != 0;
1486         ip6h = (ip6_t *)mp->b_rptr;
1487 
1488         /* Check if source address uniquely identifies the host */
1489 
1490         if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_src) ||
1491             IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_src) ||
1492             IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) {
1493                 freemsg(mp);
1494                 return (NULL);
1495         }
1496 
1497         if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
1498                 size_t  len_needed = IPV6_HDR_LEN + ICMP6_MINLEN;
1499                 icmp6_t         *icmp6;
1500 
1501                 if (mp->b_wptr - mp->b_rptr < len_needed) {
1502                         if (!pullupmsg(mp, len_needed)) {
1503                                 BUMP_MIB(ill->ill_icmp6_mib,
1504                                     ipv6IfIcmpInErrors);
1505                                 freemsg(mp);
1506                                 return (NULL);
1507                         }
1508                         ip6h = (ip6_t *)mp->b_rptr;
1509                 }
1510                 icmp6 = (icmp6_t *)&ip6h[1];
1511                 /* Explicitly do not generate errors in response to redirects */
1512                 if (ICMP6_IS_ERROR(icmp6->icmp6_type) ||
1513                     icmp6->icmp6_type == ND_REDIRECT) {
1514                         freemsg(mp);
1515                         return (NULL);
1516                 }
1517         }
1518         /*
1519          * Check that the destination is not multicast and that the packet
1520          * was not sent on link layer broadcast or multicast.  (Exception
1521          * is Packet too big message as per the draft - when mcast_ok is set.)
1522          */
1523         if (!mcast_ok &&
1524             (llbcast || IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))) {
1525                 freemsg(mp);
1526                 return (NULL);
1527         }
1528         /*
1529          * If this is a labeled system, then check to see if we're allowed to
1530          * send a response to this particular sender.  If not, then just drop.
1531          */
1532         if (is_system_labeled() && !tsol_can_reply_error(mp, ira)) {
1533                 BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutErrors);
1534                 freemsg(mp);
1535                 return (NULL);
1536         }
1537 
1538         if (icmp_err_rate_limit(ipst)) {
1539                 /*
1540                  * Only send ICMP error packets every so often.
1541                  * This should be done on a per port/source basis,
1542                  * but for now this will suffice.
1543                  */
1544                 freemsg(mp);
1545                 return (NULL);
1546         }
1547         return (mp);
1548 }
1549 
1550 /*
1551  * Called when a packet was sent out the same link that it arrived on.
1552  * Check if it is ok to send a redirect and then send it.
1553  */
1554 void
1555 ip_send_potential_redirect_v6(mblk_t *mp, ip6_t *ip6h, ire_t *ire,
1556     ip_recv_attr_t *ira)
1557 {
1558         ill_t           *ill = ira->ira_ill;
1559         ip_stack_t      *ipst = ill->ill_ipst;
1560         in6_addr_t      *v6targ;
1561         ire_t           *src_ire_v6 = NULL;
1562         mblk_t          *mp1;
1563         ire_t           *nhop_ire = NULL;
1564 
1565         /*
1566          * Don't send a redirect when forwarding a source
1567          * routed packet.
1568          */
1569         if (ip_source_routed_v6(ip6h, mp, ipst))
1570                 return;
1571 
1572         if (ire->ire_type & IRE_ONLINK) {
1573                 /* Target is directly connected */
1574                 v6targ = &ip6h->ip6_dst;
1575         } else {
1576                 /* Determine the most specific IRE used to send the packets */
1577                 nhop_ire = ire_nexthop(ire);
1578                 if (nhop_ire == NULL)
1579                         return;
1580 
1581                 /*
1582                  * We won't send redirects to a router
1583                  * that doesn't have a link local
1584                  * address, but will forward.
1585                  */
1586                 if (!IN6_IS_ADDR_LINKLOCAL(&nhop_ire->ire_addr_v6)) {
1587                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
1588                         ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
1589                         ire_refrele(nhop_ire);
1590                         return;
1591                 }
1592                 v6targ = &nhop_ire->ire_addr_v6;
1593         }
1594         src_ire_v6 = ire_ftable_lookup_v6(&ip6h->ip6_src,
1595             NULL, NULL, IRE_INTERFACE, ire->ire_ill, ALL_ZONES, NULL,
1596             MATCH_IRE_ILL | MATCH_IRE_TYPE, 0, ipst, NULL);
1597 
1598         if (src_ire_v6 == NULL) {
1599                 if (nhop_ire != NULL)
1600                         ire_refrele(nhop_ire);
1601                 return;
1602         }
1603 
1604         /*
1605          * The source is directly connected.
1606          */
1607         mp1 = copymsg(mp);
1608         if (mp1 != NULL)
1609                 icmp_send_redirect_v6(mp1, v6targ, &ip6h->ip6_dst, ira);
1610 
1611         if (nhop_ire != NULL)
1612                 ire_refrele(nhop_ire);
1613         ire_refrele(src_ire_v6);
1614 }
1615 
1616 /*
1617  * Generate an ICMPv6 redirect message.
1618  * Include target link layer address option if it exits.
1619  * Always include redirect header.
1620  */
1621 static void
1622 icmp_send_redirect_v6(mblk_t *mp, in6_addr_t *targetp, in6_addr_t *dest,
1623     ip_recv_attr_t *ira)
1624 {
1625         nd_redirect_t   *rd;
1626         nd_opt_rd_hdr_t *rdh;
1627         uchar_t         *buf;
1628         ncec_t          *ncec = NULL;
1629         nd_opt_hdr_t    *opt;
1630         int             len;
1631         int             ll_opt_len = 0;
1632         int             max_redir_hdr_data_len;
1633         int             pkt_len;
1634         in6_addr_t      *srcp;
1635         ill_t           *ill;
1636         boolean_t       need_refrele;
1637         ip_stack_t      *ipst = ira->ira_ill->ill_ipst;
1638 
1639         mp = icmp_pkt_err_ok_v6(mp, B_FALSE, ira);
1640         if (mp == NULL)
1641                 return;
1642 
1643         if (IS_UNDER_IPMP(ira->ira_ill)) {
1644                 ill = ipmp_ill_hold_ipmp_ill(ira->ira_ill);
1645                 if (ill == NULL) {
1646                         ill = ira->ira_ill;
1647                         BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects);
1648                         ip_drop_output("no IPMP ill for sending redirect",
1649                             mp, ill);
1650                         freemsg(mp);
1651                         return;
1652                 }
1653                 need_refrele = B_TRUE;
1654         } else {
1655                 ill = ira->ira_ill;
1656                 need_refrele = B_FALSE;
1657         }
1658 
1659         ncec = ncec_lookup_illgrp_v6(ill, targetp);
1660         if (ncec != NULL && ncec->ncec_state != ND_INCOMPLETE &&
1661             ncec->ncec_lladdr != NULL) {
1662                 ll_opt_len = (sizeof (nd_opt_hdr_t) +
1663                     ill->ill_phys_addr_length + 7)/8 * 8;
1664         }
1665         len = sizeof (nd_redirect_t) + sizeof (nd_opt_rd_hdr_t) + ll_opt_len;
1666         ASSERT(len % 4 == 0);
1667         buf = kmem_alloc(len, KM_NOSLEEP);
1668         if (buf == NULL) {
1669                 if (ncec != NULL)
1670                         ncec_refrele(ncec);
1671                 if (need_refrele)
1672                         ill_refrele(ill);
1673                 freemsg(mp);
1674                 return;
1675         }
1676 
1677         rd = (nd_redirect_t *)buf;
1678         rd->nd_rd_type = (uint8_t)ND_REDIRECT;
1679         rd->nd_rd_code = 0;
1680         rd->nd_rd_reserved = 0;
1681         rd->nd_rd_target = *targetp;
1682         rd->nd_rd_dst = *dest;
1683 
1684         opt = (nd_opt_hdr_t *)(buf + sizeof (nd_redirect_t));
1685         if (ncec != NULL && ll_opt_len != 0) {
1686                 opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
1687                 opt->nd_opt_len = ll_opt_len/8;
1688                 bcopy((char *)ncec->ncec_lladdr, &opt[1],
1689                     ill->ill_phys_addr_length);
1690         }
1691         if (ncec != NULL)
1692                 ncec_refrele(ncec);
1693         rdh = (nd_opt_rd_hdr_t *)(buf + sizeof (nd_redirect_t) + ll_opt_len);
1694         rdh->nd_opt_rh_type = (uint8_t)ND_OPT_REDIRECTED_HEADER;
1695         /* max_redir_hdr_data_len and nd_opt_rh_len must be multiple of 8 */
1696         max_redir_hdr_data_len =
1697             (ipst->ips_ipv6_icmp_return - IPV6_HDR_LEN - len)/8*8;
1698         pkt_len = msgdsize(mp);
1699         /* Make sure mp is 8 byte aligned */
1700         if (pkt_len > max_redir_hdr_data_len) {
1701                 rdh->nd_opt_rh_len = (max_redir_hdr_data_len +
1702                     sizeof (nd_opt_rd_hdr_t))/8;
1703                 (void) adjmsg(mp, max_redir_hdr_data_len - pkt_len);
1704         } else {
1705                 rdh->nd_opt_rh_len = (pkt_len + sizeof (nd_opt_rd_hdr_t))/8;
1706                 (void) adjmsg(mp, -(pkt_len % 8));
1707         }
1708         rdh->nd_opt_rh_reserved1 = 0;
1709         rdh->nd_opt_rh_reserved2 = 0;
1710         /* ipif_v6lcl_addr contains the link-local source address */
1711         srcp = &ill->ill_ipif->ipif_v6lcl_addr;
1712 
1713         /* Redirects sent by router, and router is global zone */
1714         ASSERT(ira->ira_zoneid == ALL_ZONES);
1715         ira->ira_zoneid = GLOBAL_ZONEID;
1716         icmp_pkt_v6(mp, buf, len, srcp, ira);
1717         kmem_free(buf, len);
1718         if (need_refrele)
1719                 ill_refrele(ill);
1720 }
1721 
1722 
1723 /* Generate an ICMP time exceeded message.  (May be called as writer.) */
1724 void
1725 icmp_time_exceeded_v6(mblk_t *mp, uint8_t code, boolean_t mcast_ok,
1726     ip_recv_attr_t *ira)
1727 {
1728         icmp6_t icmp6;
1729 
1730         mp = icmp_pkt_err_ok_v6(mp, mcast_ok, ira);
1731         if (mp == NULL)
1732                 return;
1733 
1734         bzero(&icmp6, sizeof (icmp6_t));
1735         icmp6.icmp6_type = ICMP6_TIME_EXCEEDED;
1736         icmp6.icmp6_code = code;
1737         icmp_pkt_v6(mp, &icmp6, sizeof (icmp6_t), NULL, ira);
1738 }
1739 
1740 /*
1741  * Generate an ICMP unreachable message.
1742  * When called from ip_output side a minimal ip_recv_attr_t needs to be
1743  * constructed by the caller.
1744  */
1745 void
1746 icmp_unreachable_v6(mblk_t *mp, uint8_t code, boolean_t mcast_ok,
1747     ip_recv_attr_t *ira)
1748 {
1749         icmp6_t icmp6;
1750 
1751         mp = icmp_pkt_err_ok_v6(mp, mcast_ok, ira);
1752         if (mp == NULL)
1753                 return;
1754 
1755         bzero(&icmp6, sizeof (icmp6_t));
1756         icmp6.icmp6_type = ICMP6_DST_UNREACH;
1757         icmp6.icmp6_code = code;
1758         icmp_pkt_v6(mp, &icmp6, sizeof (icmp6_t), NULL, ira);
1759 }
1760 
1761 /*
1762  * Generate an ICMP pkt too big message.
1763  * When called from ip_output side a minimal ip_recv_attr_t needs to be
1764  * constructed by the caller.
1765  */
1766 void
1767 icmp_pkt2big_v6(mblk_t *mp, uint32_t mtu, boolean_t mcast_ok,
1768     ip_recv_attr_t *ira)
1769 {
1770         icmp6_t icmp6;
1771 
1772         mp = icmp_pkt_err_ok_v6(mp, mcast_ok, ira);
1773         if (mp == NULL)
1774                 return;
1775 
1776         bzero(&icmp6, sizeof (icmp6_t));
1777         icmp6.icmp6_type = ICMP6_PACKET_TOO_BIG;
1778         icmp6.icmp6_code = 0;
1779         icmp6.icmp6_mtu = htonl(mtu);
1780 
1781         icmp_pkt_v6(mp, &icmp6, sizeof (icmp6_t), NULL, ira);
1782 }
1783 
1784 /*
1785  * Generate an ICMP parameter problem message. (May be called as writer.)
1786  * 'offset' is the offset from the beginning of the packet in error.
1787  * When called from ip_output side a minimal ip_recv_attr_t needs to be
1788  * constructed by the caller.
1789  */
1790 static void
1791 icmp_param_problem_v6(mblk_t *mp, uint8_t code, uint32_t offset,
1792     boolean_t mcast_ok, ip_recv_attr_t *ira)
1793 {
1794         icmp6_t icmp6;
1795 
1796         mp = icmp_pkt_err_ok_v6(mp, mcast_ok, ira);
1797         if (mp == NULL)
1798                 return;
1799 
1800         bzero((char *)&icmp6, sizeof (icmp6_t));
1801         icmp6.icmp6_type = ICMP6_PARAM_PROB;
1802         icmp6.icmp6_code = code;
1803         icmp6.icmp6_pptr = htonl(offset);
1804         icmp_pkt_v6(mp, &icmp6, sizeof (icmp6_t), NULL, ira);
1805 }
1806 
1807 void
1808 icmp_param_problem_nexthdr_v6(mblk_t *mp, boolean_t mcast_ok,
1809     ip_recv_attr_t *ira)
1810 {
1811         ip6_t           *ip6h = (ip6_t *)mp->b_rptr;
1812         uint16_t        hdr_length;
1813         uint8_t         *nexthdrp;
1814         uint32_t        offset;
1815         ill_t           *ill = ira->ira_ill;
1816 
1817         /* Determine the offset of the bad nexthdr value */
1818         if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_length, &nexthdrp)) {
1819                 /* Malformed packet */
1820                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1821                 ip_drop_input("ipIfStatsInDiscards", mp, ill);
1822                 freemsg(mp);
1823                 return;
1824         }
1825 
1826         offset = nexthdrp - mp->b_rptr;
1827         icmp_param_problem_v6(mp, ICMP6_PARAMPROB_NEXTHEADER, offset,
1828             mcast_ok, ira);
1829 }
1830 
1831 /*
1832  * Verify whether or not the IP address is a valid local address.
1833  * Could be a unicast, including one for a down interface.
1834  * If allow_mcbc then a multicast or broadcast address is also
1835  * acceptable.
1836  *
1837  * In the case of a multicast address, however, the
1838  * upper protocol is expected to reset the src address
1839  * to zero when we return IPVL_MCAST so that
1840  * no packets are emitted with multicast address as
1841  * source address.
1842  * The addresses valid for bind are:
1843  *      (1) - in6addr_any
1844  *      (2) - IP address of an UP interface
1845  *      (3) - IP address of a DOWN interface
1846  *      (4) - a multicast address. In this case
1847  *      the conn will only receive packets destined to
1848  *      the specified multicast address. Note: the
1849  *      application still has to issue an
1850  *      IPV6_JOIN_GROUP socket option.
1851  *
1852  * In all the above cases, the bound address must be valid in the current zone.
1853  * When the address is loopback or multicast, there might be many matching IREs
1854  * so bind has to look up based on the zone.
1855  */
1856 ip_laddr_t
1857 ip_laddr_verify_v6(const in6_addr_t *v6src, zoneid_t zoneid,
1858     ip_stack_t *ipst, boolean_t allow_mcbc, uint_t scopeid)
1859 {
1860         ire_t           *src_ire;
1861         uint_t          match_flags;
1862         ill_t           *ill = NULL;
1863 
1864         ASSERT(!IN6_IS_ADDR_V4MAPPED(v6src));
1865         ASSERT(!IN6_IS_ADDR_UNSPECIFIED(v6src));
1866 
1867         match_flags = MATCH_IRE_ZONEONLY;
1868         if (scopeid != 0) {
1869                 ill = ill_lookup_on_ifindex(scopeid, B_TRUE, ipst);
1870                 if (ill == NULL)
1871                         return (IPVL_BAD);
1872                 match_flags |= MATCH_IRE_ILL;
1873         }
1874 
1875         src_ire = ire_ftable_lookup_v6(v6src, NULL, NULL, 0,
1876             ill, zoneid, NULL, match_flags, 0, ipst, NULL);
1877         if (ill != NULL)
1878                 ill_refrele(ill);
1879 
1880         /*
1881          * If an address other than in6addr_any is requested,
1882          * we verify that it is a valid address for bind
1883          * Note: Following code is in if-else-if form for
1884          * readability compared to a condition check.
1885          */
1886         if (src_ire != NULL && (src_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK))) {
1887                 /*
1888                  * (2) Bind to address of local UP interface
1889                  */
1890                 ire_refrele(src_ire);
1891                 return (IPVL_UNICAST_UP);
1892         } else if (IN6_IS_ADDR_MULTICAST(v6src)) {
1893                 /* (4) bind to multicast address. */
1894                 if (src_ire != NULL)
1895                         ire_refrele(src_ire);
1896 
1897                 /*
1898                  * Note: caller should take IPV6_MULTICAST_IF
1899                  * into account when selecting a real source address.
1900                  */
1901                 if (allow_mcbc)
1902                         return (IPVL_MCAST);
1903                 else
1904                         return (IPVL_BAD);
1905         } else {
1906                 ipif_t *ipif;
1907 
1908                 /*
1909                  * (3) Bind to address of local DOWN interface?
1910                  * (ipif_lookup_addr() looks up all interfaces
1911                  * but we do not get here for UP interfaces
1912                  * - case (2) above)
1913                  */
1914                 if (src_ire != NULL)
1915                         ire_refrele(src_ire);
1916 
1917                 ipif = ipif_lookup_addr_v6(v6src, NULL, zoneid, ipst);
1918                 if (ipif == NULL)
1919                         return (IPVL_BAD);
1920 
1921                 /* Not a useful source? */
1922                 if (ipif->ipif_flags & (IPIF_NOLOCAL | IPIF_ANYCAST)) {
1923                         ipif_refrele(ipif);
1924                         return (IPVL_BAD);
1925                 }
1926                 ipif_refrele(ipif);
1927                 return (IPVL_UNICAST_DOWN);
1928         }
1929 }
1930 
/*
 * Verify that both the source and destination addresses are valid.  If
 * IPDF_VERIFY_DST is not set, then the destination address may be unreachable,
 * i.e. have no route to it.  Protocols like TCP want to verify destination
 * reachability, while tunnels do not.
 *
 * Determine the route, the interface, and (optionally) the source address
 * to use to reach a given destination.
 * Note that we allow connect to broadcast and multicast addresses when
 * IPDF_ALLOW_MCBC is set.
 * first_hop and dst_addr are normally the same, but if source routing
 * they will differ; in that case the first_hop is what we'll use for the
 * routing lookup but the dce and label checks will be done on dst_addr.
 *
 * If uinfo is set, then we fill in the best available information
 * we have for the destination. This is based on (in priority order) any
 * metrics and path MTU stored in a dce_t, route metrics, and finally the
 * ill_mtu/ill_mc_mtu.
 *
 * Tsol note: If we have a source route then dst_addr != firsthop. But we
 * always do the label check on dst_addr.
 *
 * Assumes that the caller has set ixa_scopeid for link-local communication.
 *
 * Returns 0 on success, otherwise an errno value: whatever
 * tsol_check_dest(), ip_select_route_v6() or ip_select_source_v6() reported,
 * ENETUNREACH/EHOSTUNREACH for reject/blackhole routes (only with
 * IPDF_VERIFY_DST) or for a disallowed broadcast/multicast route, or
 * EADDRNOTAVAIL for an unusable loopback source.
 *
 * On the ENETUNREACH/EHOSTUNREACH paths the function still runs to the end,
 * so ixa_ire, ixa_dce and the fragmentation size are filled in even though
 * an error is returned.  When IPDF_SELECT_SRC is set and no error sends us
 * to bad_addr, *src_addrp is overwritten with the chosen source.
 */
int
ip_set_destination_v6(in6_addr_t *src_addrp, const in6_addr_t *dst_addr,
    const in6_addr_t *firsthop, ip_xmit_attr_t *ixa, iulp_t *uinfo,
    uint32_t flags, uint_t mac_mode)
{
        ire_t           *ire;
        int             error = 0;
        in6_addr_t      setsrc;                         /* RTF_SETSRC */
        zoneid_t        zoneid = ixa->ixa_zoneid;    /* Honors SO_ALLZONES */
        ip_stack_t      *ipst = ixa->ixa_ipst;
        dce_t           *dce;
        uint_t          pmtu;
        uint_t          ifindex;
        uint_t          generation;
        nce_t           *nce;
        ill_t           *ill = NULL;
        boolean_t       multirt = B_FALSE;

        ASSERT(!IN6_IS_ADDR_V4MAPPED(dst_addr));

        ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4));

        /*
         * We never send to zero; the ULPs map it to the loopback address.
         * We can't allow it since we use zero to mean uninitialized in some
         * places.
         */
        ASSERT(!IN6_IS_ADDR_UNSPECIFIED(dst_addr));

        if (is_system_labeled()) {
                ts_label_t *tsl = NULL;

                error = tsol_check_dest(ixa->ixa_tsl, dst_addr, IPV6_VERSION,
                    mac_mode, (flags & IPDF_ZONE_IS_GLOBAL) != 0, &tsl);
                if (error != 0)
                        return (error);
                if (tsl != NULL) {
                        /* Update the label */
                        ip_xmit_attr_replace_tsl(ixa, tsl);
                }
        }

        setsrc = ipv6_all_zeros;
        /*
         * Select a route; For IPMP interfaces, we would only select
         * a "hidden" route (i.e., going through a specific under_ill)
         * if ixa_ifindex has been specified.
         */
        ire = ip_select_route_v6(firsthop, *src_addrp, ixa, &generation,
            &setsrc, &error, &multirt);
        ASSERT(ire != NULL);    /* IRE_NOROUTE if none found */
        if (error != 0)
                goto bad_addr;

        /*
         * ire can't be a broadcast or multicast unless IPDF_ALLOW_MCBC is set.
         * If IPDF_VERIFY_DST is set, the destination must be reachable.
         * Otherwise the destination needn't be reachable.
         *
         * If we match on a reject or black hole, then we've got a
         * local failure.  May as well fail out the connect() attempt,
         * since it's never going to succeed.
         */
        if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
                /*
                 * If we're verifying destination reachability, we always want
                 * to complain here.
                 *
                 * If we're not verifying destination reachability but the
                 * destination has a route, we still want to fail on the
                 * temporary address and broadcast address tests.
                 *
                 * In both cases we let the code continue so some reasonable
                 * information is returned to the caller. That enables the
                 * caller to use (and even cache) the IRE. conn_ip_output will
                 * use the generation mismatch path to check for the unreachable
                 * case thereby avoiding any specific check in the main path.
                 */
                ASSERT(generation == IRE_GENERATION_VERIFY);
                if (flags & IPDF_VERIFY_DST) {
                        /*
                         * Set errno but continue to set up ixa_ire to be
                         * the RTF_REJECT|RTF_BLACKHOLE IRE.
                         * That allows callers to use ip_output to get an
                         * ICMP error back.
                         */
                        if (!(ire->ire_type & IRE_HOST))
                                error = ENETUNREACH;
                        else
                                error = EHOSTUNREACH;
                }
        }

        /*
         * A broadcast/multicast route the caller did not ask for is swapped
         * for the reject IRE, and the generation is forced to "verify".
         */
        if ((ire->ire_type & (IRE_BROADCAST|IRE_MULTICAST)) &&
            !(flags & IPDF_ALLOW_MCBC)) {
                ire_refrele(ire);
                ire = ire_reject(ipst, B_FALSE);
                generation = IRE_GENERATION_VERIFY;
                error = ENETUNREACH;
        }

        /*
         * Cache things.  From here on the reference on ire belongs to
         * ixa->ixa_ire; the error paths below therefore set the local ire
         * to NULL before jumping to bad_addr so it is not released twice.
         */
        if (ixa->ixa_ire != NULL)
                ire_refrele_notr(ixa->ixa_ire);
#ifdef DEBUG
        /* Swap our reference for a _notr one to pair with the release above */
        ire_refhold_notr(ire);
        ire_refrele(ire);
#endif
        ixa->ixa_ire = ire;
        ixa->ixa_ire_generation = generation;

        /*
         * Ensure that ixa_dce is always set any time that ixa_ire is set,
         * since some callers will send a packet to conn_ip_output() even if
         * there's an error.
         */
        ifindex = 0;
        if (IN6_IS_ADDR_LINKSCOPE(dst_addr)) {
                /*
                 * If we are creating a DCE we'd better have an ifindex.
                 *
                 * NOTE(review): ill is initialized to NULL at its
                 * declaration and is not assigned until the IPDF_SELECT_SRC
                 * block further down, so as written the else branch is
                 * always taken here: ifindex stays 0 and IPDF_UNIQUE_DCE is
                 * cleared for every link-scope destination.  Confirm
                 * whether that is intended.
                 */
                if (ill != NULL)
                        ifindex = ill->ill_phyint->phyint_ifindex;
                else
                        flags &= ~IPDF_UNIQUE_DCE;
        }

        if (flags & IPDF_UNIQUE_DCE) {
                /* Fallback to the default dce if allocation fails */
                dce = dce_lookup_and_add_v6(dst_addr, ifindex, ipst);
                if (dce != NULL) {
                        generation = dce->dce_generation;
                } else {
                        dce = dce_lookup_v6(dst_addr, ifindex, ipst,
                            &generation);
                }
        } else {
                dce = dce_lookup_v6(dst_addr, ifindex, ipst, &generation);
        }
        ASSERT(dce != NULL);
        if (ixa->ixa_dce != NULL)
                dce_refrele_notr(ixa->ixa_dce);
#ifdef DEBUG
        /* Same reference swap as for ixa_ire above */
        dce_refhold_notr(dce);
        dce_refrele(dce);
#endif
        ixa->ixa_dce = dce;
        ixa->ixa_dce_generation = generation;


        /*
         * For multicast with multirt we have a flag passed back from
         * ire_lookup_multi_ill_v6 since we don't have an IRE for each
         * possible multicast address.
         * We also need a flag for multicast since we can't check
         * whether RTF_MULTIRT is set in ixa_ire for multicast.
         */
        if (multirt) {
                ixa->ixa_postfragfn = ip_postfrag_multirt_v6;
                ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
        } else {
                ixa->ixa_postfragfn = ire->ire_postfragfn;
                ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
        }
        if (!(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
                /* Get an nce to cache. */
                nce = ire_to_nce(ire, NULL, firsthop);
                if (nce == NULL) {
                        /*
                         * Allocation failure?  Any previously cached
                         * ixa_nce is left in place; the generation is
                         * forced to "verify" instead.
                         */
                        ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
                } else {
                        if (ixa->ixa_nce != NULL)
                                nce_refrele(ixa->ixa_nce);
                        ixa->ixa_nce = nce;
                }
        }

        /*
         * If the source address is a loopback address, the
         * destination had best be local or multicast.
         * If we are sending to an IRE_LOCAL using a loopback source then
         * it had better be the same zoneid.
         */
        if (IN6_IS_ADDR_LOOPBACK(src_addrp)) {
                if ((ire->ire_type & IRE_LOCAL) && ire->ire_zoneid != zoneid) {
                        ire = NULL;     /* Stored in ixa_ire */
                        error = EADDRNOTAVAIL;
                        goto bad_addr;
                }
                if (!(ire->ire_type & (IRE_LOOPBACK|IRE_LOCAL|IRE_MULTICAST))) {
                        ire = NULL;     /* Stored in ixa_ire */
                        error = EADDRNOTAVAIL;
                        goto bad_addr;
                }
        }

        /*
         * Does the caller want us to pick a source address?
         */
        if (flags & IPDF_SELECT_SRC) {
                in6_addr_t      src_addr;

                /*
                 * We use ire_nexthop_ill to avoid the under ipmp
                 * interface for source address selection. Note that for ipmp
                 * probe packets, ixa_ifindex would have been specified, and
                 * the ip_select_route() invocation would have picked an ire
                 * with ire_ill pointing at an under interface.
                 *
                 * The ill obtained here is released on both exits of this
                 * function (normal return and bad_addr).
                 */
                ill = ire_nexthop_ill(ire);

                /* If unreachable we have no ill but need some source */
                if (ill == NULL) {
                        src_addr = ipv6_loopback;
                        /* Make sure we look for a better source address */
                        generation = SRC_GENERATION_VERIFY;
                } else {
                        error = ip_select_source_v6(ill, &setsrc, dst_addr,
                            zoneid, ipst, B_FALSE, ixa->ixa_src_preferences,
                            &src_addr, &generation, NULL);
                        if (error != 0) {
                                ire = NULL;     /* Stored in ixa_ire */
                                goto bad_addr;
                        }
                }

                /*
                 * We allow the source address to be down.
                 * However, we check that we don't use the loopback address
                 * as a source when sending out on the wire.
                 */
                if (IN6_IS_ADDR_LOOPBACK(&src_addr) &&
                    !(ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK|IRE_MULTICAST)) &&
                    !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
                        ire = NULL;     /* Stored in ixa_ire */
                        error = EADDRNOTAVAIL;
                        goto bad_addr;
                }

                *src_addrp = src_addr;
                ixa->ixa_src_generation = generation;
        }

        /*
         * Make sure we don't leave an unreachable ixa_nce in place
         * since ip_select_route is used when we unplumb i.e., remove
         * references on ixa_ire, ixa_nce, and ixa_dce.
         */
        nce = ixa->ixa_nce;
        if (nce != NULL && nce->nce_is_condemned) {
                nce_refrele(nce);
                ixa->ixa_nce = NULL;
                ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
        }

        /*
         * Note that IPv6 multicast supports PMTU discovery unlike IPv4
         * multicast. But pmtu discovery is only enabled for connected
         * sockets in general.
         */

        /*
         * Set initial value for fragmentation limit.  Either conn_ip_output
         * or ULP might update it when there are routing changes.
         * Handles a NULL ixa_ire->ire_ill or a NULL ixa_nce for RTF_REJECT.
         */
        pmtu = ip_get_pmtu(ixa);
        ixa->ixa_fragsize = pmtu;
        /* Make sure ixa_fragsize and ixa_pmtu remain identical */
        if (ixa->ixa_flags & IXAF_VERIFY_PMTU)
                ixa->ixa_pmtu = pmtu;

        /*
         * Extract information useful for some transports.
         * First we look for DCE metrics. Then we take what we have in
         * the metrics in the route, where the offlink is used if we have
         * one.
         */
        if (uinfo != NULL) {
                bzero(uinfo, sizeof (*uinfo));

                if (dce->dce_flags & DCEF_UINFO)
                        *uinfo = dce->dce_uinfo;

                rts_merge_metrics(uinfo, &ire->ire_metrics);

                /* Allow ire_metrics to decrease the path MTU from above */
                if (uinfo->iulp_mtu == 0 || uinfo->iulp_mtu > pmtu)
                        uinfo->iulp_mtu = pmtu;

                uinfo->iulp_localnet = (ire->ire_type & IRE_ONLINK) != 0;
                uinfo->iulp_loopback = (ire->ire_type & IRE_LOOPBACK) != 0;
                uinfo->iulp_local = (ire->ire_type & IRE_LOCAL) != 0;
        }

        if (ill != NULL)
                ill_refrele(ill);

        /* May be non-zero (reject/blackhole or mc/bc refusal) - see above */
        return (error);

bad_addr:
        /*
         * ire is non-NULL here only when we arrive before the reference
         * was stored in ixa_ire (the ip_select_route_v6() failure).
         */
        if (ire != NULL)
                ire_refrele(ire);

        if (ill != NULL)
                ill_refrele(ill);

        /*
         * Make sure we don't leave an unreachable ixa_nce in place
         * since ip_select_route is used when we unplumb i.e., remove
         * references on ixa_ire, ixa_nce, and ixa_dce.
         */
        nce = ixa->ixa_nce;
        if (nce != NULL && nce->nce_is_condemned) {
                nce_refrele(nce);
                ixa->ixa_nce = NULL;
                ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
        }

        return (error);
}
2274 
2275 /*
2276  * Handle protocols with which IP is less intimate.  There
2277  * can be more than one stream bound to a particular
2278  * protocol.  When this is the case, normally each one gets a copy
2279  * of any incoming packets.
2280  *
2281  * Zones notes:
2282  * Packets will be distributed to conns in all zones. This is really only
2283  * useful for ICMPv6 as only applications in the global zone can create raw
2284  * sockets for other protocols.
2285  */
2286 void
2287 ip_fanout_proto_v6(mblk_t *mp, ip6_t *ip6h, ip_recv_attr_t *ira)
2288 {
2289         mblk_t          *mp1;
2290         in6_addr_t      laddr = ip6h->ip6_dst;
2291         conn_t          *connp, *first_connp, *next_connp;
2292         connf_t         *connfp;
2293         ill_t           *ill = ira->ira_ill;
2294         ip_stack_t      *ipst = ill->ill_ipst;
2295 
2296         connfp = &ipst->ips_ipcl_proto_fanout_v6[ira->ira_protocol];
2297         mutex_enter(&connfp->connf_lock);
2298         connp = connfp->connf_head;
2299         for (connp = connfp->connf_head; connp != NULL;
2300             connp = connp->conn_next) {
2301                 /* Note: IPCL_PROTO_MATCH_V6 includes conn_wantpacket */
2302                 if (IPCL_PROTO_MATCH_V6(connp, ira, ip6h) &&
2303                     (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
2304                     tsol_receive_local(mp, &laddr, IPV6_VERSION, ira, connp)))
2305                         break;
2306         }
2307 
2308         if (connp == NULL) {
2309                 /*
2310                  * No one bound to this port.  Is
2311                  * there a client that wants all
2312                  * unclaimed datagrams?
2313                  */
2314                 mutex_exit(&connfp->connf_lock);
2315                 ip_fanout_send_icmp_v6(mp, ICMP6_PARAM_PROB,
2316                     ICMP6_PARAMPROB_NEXTHEADER, ira);
2317                 return;
2318         }
2319 
2320         ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL);
2321 
2322         CONN_INC_REF(connp);
2323         first_connp = connp;
2324 
2325         /*
2326          * XXX: Fix the multiple protocol listeners case. We should not
2327          * be walking the conn->conn_next list here.
2328          */
2329         connp = connp->conn_next;
2330         for (;;) {
2331                 while (connp != NULL) {
2332                         /* Note: IPCL_PROTO_MATCH_V6 includes conn_wantpacket */
2333                         if (IPCL_PROTO_MATCH_V6(connp, ira, ip6h) &&
2334                             (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
2335                             tsol_receive_local(mp, &laddr, IPV6_VERSION,
2336                             ira, connp)))
2337                                 break;
2338                         connp = connp->conn_next;
2339                 }
2340 
2341                 if (connp == NULL) {
2342                         /* No more interested clients */
2343                         connp = first_connp;
2344                         break;
2345                 }
2346                 if (((mp1 = dupmsg(mp)) == NULL) &&
2347                     ((mp1 = copymsg(mp)) == NULL)) {
2348                         /* Memory allocation failed */
2349                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2350                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
2351                         connp = first_connp;
2352                         break;
2353                 }
2354 
2355                 CONN_INC_REF(connp);
2356                 mutex_exit(&connfp->connf_lock);
2357 
2358                 ip_fanout_proto_conn(connp, mp1, NULL, (ip6_t *)mp1->b_rptr,
2359                     ira);
2360 
2361                 mutex_enter(&connfp->connf_lock);
2362                 /* Follow the next pointer before releasing the conn. */
2363                 next_connp = connp->conn_next;
2364                 CONN_DEC_REF(connp);
2365                 connp = next_connp;
2366         }
2367 
2368         /* Last one.  Send it upstream. */
2369         mutex_exit(&connfp->connf_lock);
2370 
2371         ip_fanout_proto_conn(connp, mp, NULL, ip6h, ira);
2372 
2373         CONN_DEC_REF(connp);
2374 }
2375 
2376 /*
2377  * Called when it is conceptually a ULP that would sent the packet
2378  * e.g., port unreachable and nexthdr unknown. Check that the packet
2379  * would have passed the IPsec global policy before sending the error.
2380  *
2381  * Send an ICMP error after patching up the packet appropriately.
2382  * Uses ip_drop_input and bumps the appropriate MIB.
2383  * For ICMP6_PARAMPROB_NEXTHEADER we determine the offset to use.
2384  */
2385 void
2386 ip_fanout_send_icmp_v6(mblk_t *mp, uint_t icmp_type, uint8_t icmp_code,
2387     ip_recv_attr_t *ira)
2388 {
2389         ip6_t           *ip6h;
2390         boolean_t       secure;
2391         ill_t           *ill = ira->ira_ill;
2392         ip_stack_t      *ipst = ill->ill_ipst;
2393         netstack_t      *ns = ipst->ips_netstack;
2394         ipsec_stack_t   *ipss = ns->netstack_ipsec;
2395 
2396         secure = ira->ira_flags & IRAF_IPSEC_SECURE;
2397 
2398         /*
2399          * We are generating an icmp error for some inbound packet.
2400          * Called from all ip_fanout_(udp, tcp, proto) functions.
2401          * Before we generate an error, check with global policy
2402          * to see whether this is allowed to enter the system. As
2403          * there is no "conn", we are checking with global policy.
2404          */
2405         ip6h = (ip6_t *)mp->b_rptr;
2406         if (secure || ipss->ipsec_inbound_v6_policy_present) {
2407                 mp = ipsec_check_global_policy(mp, NULL, NULL, ip6h, ira, ns);
2408                 if (mp == NULL)
2409                         return;
2410         }
2411 
2412         /* We never send errors for protocols that we do implement */
2413         if (ira->ira_protocol == IPPROTO_ICMPV6) {
2414                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2415                 ip_drop_input("ip_fanout_send_icmp_v6", mp, ill);
2416                 freemsg(mp);
2417                 return;
2418         }
2419 
2420         switch (icmp_type) {
2421         case ICMP6_DST_UNREACH:
2422                 ASSERT(icmp_code == ICMP6_DST_UNREACH_NOPORT);
2423 
2424                 BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts);
2425                 ip_drop_input("ipIfStatsNoPorts", mp, ill);
2426 
2427                 icmp_unreachable_v6(mp, icmp_code, B_FALSE, ira);
2428                 break;
2429         case ICMP6_PARAM_PROB:
2430                 ASSERT(icmp_code == ICMP6_PARAMPROB_NEXTHEADER);
2431 
2432                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInUnknownProtos);
2433                 ip_drop_input("ipIfStatsInUnknownProtos", mp, ill);
2434 
2435                 /* Let the system determine the offset for this one */
2436                 icmp_param_problem_nexthdr_v6(mp, B_FALSE, ira);
2437                 break;
2438         default:
2439 #ifdef DEBUG
2440                 panic("ip_fanout_send_icmp_v6: wrong type");
2441                 /*NOTREACHED*/
2442 #else
2443                 freemsg(mp);
2444                 break;
2445 #endif
2446         }
2447 }
2448 
2449 /*
2450  * Fanout for UDP packets that are multicast or ICMP errors.
2451  * (Unicast fanout is handled in ip_input_v6.)
2452  *
2453  * If SO_REUSEADDR is set all multicast packets
2454  * will be delivered to all conns bound to the same port.
2455  *
2456  * Fanout for UDP packets.
2457  * The caller puts <fport, lport> in the ports parameter.
2458  * ire_type must be IRE_BROADCAST for multicast and broadcast packets.
2459  *
2460  * If SO_REUSEADDR is set all multicast and broadcast packets
2461  * will be delivered to all conns bound to the same port.
2462  *
2463  * Zones notes:
2464  * Earlier in ip_input on a system with multiple shared-IP zones we
2465  * duplicate the multicast and broadcast packets and send them up
2466  * with each explicit zoneid that exists on that ill.
2467  * This means that here we can match the zoneid with SO_ALLZONES being special.
2468  */
2469 void
2470 ip_fanout_udp_multi_v6(mblk_t *mp, ip6_t *ip6h, uint16_t lport, uint16_t fport,
2471     ip_recv_attr_t *ira)
2472 {
2473         in6_addr_t      laddr;
2474         conn_t          *connp;
2475         connf_t         *connfp;
2476         in6_addr_t      faddr;
2477         ill_t           *ill = ira->ira_ill;
2478         ip_stack_t      *ipst = ill->ill_ipst;
2479 
2480         ASSERT(ira->ira_flags & (IRAF_MULTIBROADCAST|IRAF_ICMP_ERROR));
2481 
2482         laddr = ip6h->ip6_dst;
2483         faddr = ip6h->ip6_src;
2484 
2485         /* Attempt to find a client stream based on destination port. */
2486         connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
2487         mutex_enter(&connfp->connf_lock);
2488         connp = connfp->connf_head;
2489         while (connp != NULL) {
2490                 if ((IPCL_UDP_MATCH_V6(connp, lport, laddr, fport, faddr)) &&
2491                     conn_wantpacket_v6(connp, ira, ip6h) &&
2492                     (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
2493                     tsol_receive_local(mp, &laddr, IPV6_VERSION, ira, connp)))
2494                         break;
2495                 connp = connp->conn_next;
2496         }
2497 
2498         if (connp == NULL)
2499                 goto notfound;
2500 
2501         CONN_INC_REF(connp);
2502 
2503         if (connp->conn_reuseaddr) {
2504                 conn_t          *first_connp = connp;
2505                 conn_t          *next_connp;
2506                 mblk_t          *mp1;
2507 
2508                 connp = connp->conn_next;
2509                 for (;;) {
2510                         while (connp != NULL) {
2511                                 if (IPCL_UDP_MATCH_V6(connp, lport, laddr,
2512                                     fport, faddr) &&
2513                                     conn_wantpacket_v6(connp, ira, ip6h) &&
2514                                     (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
2515                                     tsol_receive_local(mp, &laddr, IPV6_VERSION,
2516                                     ira, connp)))
2517                                         break;
2518                                 connp = connp->conn_next;
2519                         }
2520                         if (connp == NULL) {
2521                                 /* No more interested clients */
2522                                 connp = first_connp;
2523                                 break;
2524                         }
2525                         if (((mp1 = dupmsg(mp)) == NULL) &&
2526                             ((mp1 = copymsg(mp)) == NULL)) {
2527                                 /* Memory allocation failed */
2528                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2529                                 ip_drop_input("ipIfStatsInDiscards", mp, ill);
2530                                 connp = first_connp;
2531                                 break;
2532                         }
2533 
2534                         CONN_INC_REF(connp);
2535                         mutex_exit(&connfp->connf_lock);
2536 
2537                         IP6_STAT(ipst, ip6_udp_fanmb);
2538                         ip_fanout_udp_conn(connp, mp1, NULL,
2539                             (ip6_t *)mp1->b_rptr, ira);
2540 
2541                         mutex_enter(&connfp->connf_lock);
2542                         /* Follow the next pointer before releasing the conn. */
2543                         next_connp = connp->conn_next;
2544                         IP6_STAT(ipst, ip6_udp_fanmb);
2545                         CONN_DEC_REF(connp);
2546                         connp = next_connp;
2547                 }
2548         }
2549 
2550         /* Last one.  Send it upstream. */
2551         mutex_exit(&connfp->connf_lock);
2552 
2553         IP6_STAT(ipst, ip6_udp_fanmb);
2554         ip_fanout_udp_conn(connp, mp, NULL, ip6h, ira);
2555         CONN_DEC_REF(connp);
2556         return;
2557 
2558 notfound:
2559         mutex_exit(&connfp->connf_lock);
2560         /*
2561          * No one bound to this port.  Is
2562          * there a client that wants all
2563          * unclaimed datagrams?
2564          */
2565         if (ipst->ips_ipcl_proto_fanout_v6[IPPROTO_UDP].connf_head != NULL) {
2566                 ASSERT(ira->ira_protocol == IPPROTO_UDP);
2567                 ip_fanout_proto_v6(mp, ip6h, ira);
2568         } else {
2569                 ip_fanout_send_icmp_v6(mp, ICMP6_DST_UNREACH,
2570                     ICMP6_DST_UNREACH_NOPORT, ira);
2571         }
2572 }
2573 
2574 /*
2575  * int ip_find_hdr_v6()
2576  *
2577  * This routine is used by the upper layer protocols, iptun, and IPsec:
2578  * - Set extension header pointers to appropriate locations
2579  * - Determine IPv6 header length and return it
2580  * - Return a pointer to the last nexthdr value
2581  *
2582  * The caller must initialize ipp_fields.
2583  * The upper layer protocols normally set label_separate which makes the
2584  * routine put the TX label in ipp_label_v6. If this is not set then
2585  * the hop-by-hop options including the label are placed in ipp_hopopts.
2586  *
2587  * NOTE: If multiple extension headers of the same type are present,
2588  * ip_find_hdr_v6() will set the respective extension header pointers
2589  * to the first one that it encounters in the IPv6 header.  It also
2590  * skips fragment headers.  This routine deals with malformed packets
2591  * of various sorts in which case the returned length is up to the
2592  * malformed part.
2593  */
2594 int
2595 ip_find_hdr_v6(mblk_t *mp, ip6_t *ip6h, boolean_t label_separate, ip_pkt_t *ipp,
2596     uint8_t *nexthdrp)
2597 {
2598         uint_t  length, ehdrlen;
2599         uint8_t nexthdr;
2600         uint8_t *whereptr, *endptr;
2601         ip6_dest_t *tmpdstopts;
2602         ip6_rthdr_t *tmprthdr;
2603         ip6_hbh_t *tmphopopts;
2604         ip6_frag_t *tmpfraghdr;
2605 
2606         ipp->ipp_fields |= IPPF_HOPLIMIT | IPPF_TCLASS | IPPF_ADDR;
2607         ipp->ipp_hoplimit = ip6h->ip6_hops;
2608         ipp->ipp_tclass = IPV6_FLOW_TCLASS(ip6h->ip6_flow);
2609         ipp->ipp_addr = ip6h->ip6_dst;
2610 
2611         length = IPV6_HDR_LEN;
2612         whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
2613         endptr = mp->b_wptr;
2614 
2615         nexthdr = ip6h->ip6_nxt;
2616         while (whereptr < endptr) {
2617                 /* Is there enough left for len + nexthdr? */
2618                 if (whereptr + MIN_EHDR_LEN > endptr)
2619                         goto done;
2620 
2621                 switch (nexthdr) {
2622                 case IPPROTO_HOPOPTS: {
2623                         /* We check for any CIPSO */
2624                         uchar_t *secopt;
2625                         boolean_t hbh_needed;
2626                         uchar_t *after_secopt;
2627 
2628                         tmphopopts = (ip6_hbh_t *)whereptr;
2629                         ehdrlen = 8 * (tmphopopts->ip6h_len + 1);
2630                         if ((uchar_t *)tmphopopts +  ehdrlen > endptr)
2631                                 goto done;
2632                         nexthdr = tmphopopts->ip6h_nxt;
2633 
2634                         if (!label_separate) {
2635                                 secopt = NULL;
2636                                 after_secopt = whereptr;
2637                         } else {
2638                                 /*
2639                                  * We have dropped packets with bad options in
2640                                  * ip6_input. No need to check return value
2641                                  * here.
2642                                  */
2643                                 (void) tsol_find_secopt_v6(whereptr, ehdrlen,
2644                                     &secopt, &after_secopt, &hbh_needed);
2645                         }
2646                         if (secopt != NULL && after_secopt - whereptr > 0) {
2647                                 ipp->ipp_fields |= IPPF_LABEL_V6;
2648                                 ipp->ipp_label_v6 = secopt;
2649                                 ipp->ipp_label_len_v6 = after_secopt - whereptr;
2650                         } else {
2651                                 ipp->ipp_label_len_v6 = 0;
2652                                 after_secopt = whereptr;
2653                                 hbh_needed = B_TRUE;
2654                         }
2655                         /* return only 1st hbh */
2656                         if (hbh_needed && !(ipp->ipp_fields & IPPF_HOPOPTS)) {
2657                                 ipp->ipp_fields |= IPPF_HOPOPTS;
2658                                 ipp->ipp_hopopts = (ip6_hbh_t *)after_secopt;
2659                                 ipp->ipp_hopoptslen = ehdrlen -
2660                                     ipp->ipp_label_len_v6;
2661                         }
2662                         break;
2663                 }
2664                 case IPPROTO_DSTOPTS:
2665                         tmpdstopts = (ip6_dest_t *)whereptr;
2666                         ehdrlen = 8 * (tmpdstopts->ip6d_len + 1);
2667                         if ((uchar_t *)tmpdstopts +  ehdrlen > endptr)
2668                                 goto done;
2669                         nexthdr = tmpdstopts->ip6d_nxt;
2670                         /*
2671                          * ipp_dstopts is set to the destination header after a
2672                          * routing header.
2673                          * Assume it is a post-rthdr destination header
2674                          * and adjust when we find an rthdr.
2675                          */
2676                         if (!(ipp->ipp_fields & IPPF_DSTOPTS)) {
2677                                 ipp->ipp_fields |= IPPF_DSTOPTS;
2678                                 ipp->ipp_dstopts = tmpdstopts;
2679                                 ipp->ipp_dstoptslen = ehdrlen;
2680                         }
2681                         break;
2682                 case IPPROTO_ROUTING:
2683                         tmprthdr = (ip6_rthdr_t *)whereptr;
2684                         ehdrlen = 8 * (tmprthdr->ip6r_len + 1);
2685                         if ((uchar_t *)tmprthdr +  ehdrlen > endptr)
2686                                 goto done;
2687                         nexthdr = tmprthdr->ip6r_nxt;
2688                         /* return only 1st rthdr */
2689                         if (!(ipp->ipp_fields & IPPF_RTHDR)) {
2690                                 ipp->ipp_fields |= IPPF_RTHDR;
2691                                 ipp->ipp_rthdr = tmprthdr;
2692                                 ipp->ipp_rthdrlen = ehdrlen;
2693                         }
2694                         /*
2695                          * Make any destination header we've seen be a
2696                          * pre-rthdr destination header.
2697                          */
2698                         if (ipp->ipp_fields & IPPF_DSTOPTS) {
2699                                 ipp->ipp_fields &= ~IPPF_DSTOPTS;
2700                                 ipp->ipp_fields |= IPPF_RTHDRDSTOPTS;
2701                                 ipp->ipp_rthdrdstopts = ipp->ipp_dstopts;
2702                                 ipp->ipp_dstopts = NULL;
2703                                 ipp->ipp_rthdrdstoptslen = ipp->ipp_dstoptslen;
2704                                 ipp->ipp_dstoptslen = 0;
2705                         }
2706                         break;
2707                 case IPPROTO_FRAGMENT:
2708                         tmpfraghdr = (ip6_frag_t *)whereptr;
2709                         ehdrlen = sizeof (ip6_frag_t);
2710                         if ((uchar_t *)tmpfraghdr + ehdrlen > endptr)
2711                                 goto done;
2712                         nexthdr = tmpfraghdr->ip6f_nxt;
2713                         if (!(ipp->ipp_fields & IPPF_FRAGHDR)) {
2714                                 ipp->ipp_fields |= IPPF_FRAGHDR;
2715                                 ipp->ipp_fraghdr = tmpfraghdr;
2716                                 ipp->ipp_fraghdrlen = ehdrlen;
2717                         }
2718                         break;
2719                 case IPPROTO_NONE:
2720                 default:
2721                         goto done;
2722                 }
2723                 length += ehdrlen;
2724                 whereptr += ehdrlen;
2725         }
2726 done:
2727         if (nexthdrp != NULL)
2728                 *nexthdrp = nexthdr;
2729         return (length);
2730 }
2731 
2732 /*
2733  * Try to determine where and what are the IPv6 header length and
2734  * pointer to nexthdr value for the upper layer protocol (or an
2735  * unknown next hdr).
2736  *
2737  * Parameters returns a pointer to the nexthdr value;
2738  * Must handle malformed packets of various sorts.
2739  * Function returns failure for malformed cases.
2740  */
2741 boolean_t
2742 ip_hdr_length_nexthdr_v6(mblk_t *mp, ip6_t *ip6h, uint16_t *hdr_length_ptr,
2743     uint8_t **nexthdrpp)
2744 {
2745         uint16_t length;
2746         uint_t  ehdrlen;
2747         uint8_t *nexthdrp;
2748         uint8_t *whereptr;
2749         uint8_t *endptr;
2750         ip6_dest_t *desthdr;
2751         ip6_rthdr_t *rthdr;
2752         ip6_frag_t *fraghdr;
2753 
2754         ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION);
2755         length = IPV6_HDR_LEN;
2756         whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
2757         endptr = mp->b_wptr;
2758 
2759         nexthdrp = &ip6h->ip6_nxt;
2760         while (whereptr < endptr) {
2761                 /* Is there enough left for len + nexthdr? */
2762                 if (whereptr + MIN_EHDR_LEN > endptr)
2763                         break;
2764 
2765                 switch (*nexthdrp) {
2766                 case IPPROTO_HOPOPTS:
2767                 case IPPROTO_DSTOPTS:
2768                         /* Assumes the headers are identical for hbh and dst */
2769                         desthdr = (ip6_dest_t *)whereptr;
2770                         ehdrlen = 8 * (desthdr->ip6d_len + 1);
2771                         if ((uchar_t *)desthdr +  ehdrlen > endptr)
2772                                 return (B_FALSE);
2773                         nexthdrp = &desthdr->ip6d_nxt;
2774                         break;
2775                 case IPPROTO_ROUTING:
2776                         rthdr = (ip6_rthdr_t *)whereptr;
2777                         ehdrlen =  8 * (rthdr->ip6r_len + 1);
2778                         if ((uchar_t *)rthdr +  ehdrlen > endptr)
2779                                 return (B_FALSE);
2780                         nexthdrp = &rthdr->ip6r_nxt;
2781                         break;
2782                 case IPPROTO_FRAGMENT:
2783                         fraghdr = (ip6_frag_t *)whereptr;
2784                         ehdrlen = sizeof (ip6_frag_t);
2785                         if ((uchar_t *)&fraghdr[1] > endptr)
2786                                 return (B_FALSE);
2787                         nexthdrp = &fraghdr->ip6f_nxt;
2788                         break;
2789                 case IPPROTO_NONE:
2790                         /* No next header means we're finished */
2791                 default:
2792                         *hdr_length_ptr = length;
2793                         *nexthdrpp = nexthdrp;
2794                         return (B_TRUE);
2795                 }
2796                 length += ehdrlen;
2797                 whereptr += ehdrlen;
2798                 *hdr_length_ptr = length;
2799                 *nexthdrpp = nexthdrp;
2800         }
2801         switch (*nexthdrp) {
2802         case IPPROTO_HOPOPTS:
2803         case IPPROTO_DSTOPTS:
2804         case IPPROTO_ROUTING:
2805         case IPPROTO_FRAGMENT:
2806                 /*
2807                  * If any know extension headers are still to be processed,
2808                  * the packet's malformed (or at least all the IP header(s) are
2809                  * not in the same mblk - and that should never happen.
2810                  */
2811                 return (B_FALSE);
2812 
2813         default:
2814                 /*
2815                  * If we get here, we know that all of the IP headers were in
2816                  * the same mblk, even if the ULP header is in the next mblk.
2817                  */
2818                 *hdr_length_ptr = length;
2819                 *nexthdrpp = nexthdrp;
2820                 return (B_TRUE);
2821         }
2822 }
2823 
2824 /*
2825  * Return the length of the IPv6 related headers (including extension headers)
2826  * Returns a length even if the packet is malformed.
2827  */
2828 int
2829 ip_hdr_length_v6(mblk_t *mp, ip6_t *ip6h)
2830 {
2831         uint16_t hdr_len;
2832         uint8_t *nexthdrp;
2833 
2834         (void) ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_len, &nexthdrp);
2835         return (hdr_len);
2836 }
2837 
2838 /*
2839  * Parse and process any hop-by-hop or destination options.
2840  *
2841  * Assumes that q is an ill read queue so that ICMP errors for link-local
2842  * destinations are sent out the correct interface.
2843  *
2844  * Returns -1 if there was an error and mp has been consumed.
2845  * Returns 0 if no special action is needed.
2846  * Returns 1 if the packet contained a router alert option for this node
2847  * which is verified to be "interesting/known" for our implementation.
2848  *
2849  * XXX Note: In future as more hbh or dest options are defined,
2850  * it may be better to have different routines for hbh and dest
2851  * options as opt_type fields other than IP6OPT_PAD1 and IP6OPT_PADN
2852  * may have same value in different namespaces. Or is it same namespace ??
2853  * Current code checks for each opt_type (other than pads) if it is in
2854  * the expected  nexthdr (hbh or dest)
2855  */
2856 int
2857 ip_process_options_v6(mblk_t *mp, ip6_t *ip6h,
2858     uint8_t *optptr, uint_t optlen, uint8_t hdr_type, ip_recv_attr_t *ira)
2859 {
2860         uint8_t opt_type;
2861         uint_t optused;
2862         int ret = 0;
2863         const char *errtype;
2864         ill_t           *ill = ira->ira_ill;
2865         ip_stack_t      *ipst = ill->ill_ipst;
2866 
2867         while (optlen != 0) {
2868                 opt_type = *optptr;
2869                 if (opt_type == IP6OPT_PAD1) {
2870                         optused = 1;
2871                 } else {
2872                         if (optlen < 2)
2873                                 goto bad_opt;
2874                         errtype = "malformed";
2875                         if (opt_type == ip6opt_ls) {
2876                                 optused = 2 + optptr[1];
2877                                 if (optused > optlen)
2878                                         goto bad_opt;
2879                         } else switch (opt_type) {
2880                         case IP6OPT_PADN:
2881                                 /*
2882                                  * Note:We don't verify that (N-2) pad octets
2883                                  * are zero as required by spec. Adhere to
2884                                  * "be liberal in what you accept..." part of
2885                                  * implementation philosophy (RFC791,RFC1122)
2886                                  */
2887                                 optused = 2 + optptr[1];
2888                                 if (optused > optlen)
2889                                         goto bad_opt;
2890                                 break;
2891 
2892                         case IP6OPT_JUMBO:
2893                                 if (hdr_type != IPPROTO_HOPOPTS)
2894                                         goto opt_error;
2895                                 goto opt_error; /* XXX Not implemented! */
2896 
2897                         case IP6OPT_ROUTER_ALERT: {
2898                                 struct ip6_opt_router *or;
2899 
2900                                 if (hdr_type != IPPROTO_HOPOPTS)
2901                                         goto opt_error;
2902                                 optused = 2 + optptr[1];
2903                                 if (optused > optlen)
2904                                         goto bad_opt;
2905                                 or = (struct ip6_opt_router *)optptr;
2906                                 /* Check total length and alignment */
2907                                 if (optused != sizeof (*or) ||
2908                                     ((uintptr_t)or->ip6or_value & 0x1) != 0)
2909                                         goto opt_error;
2910                                 /* Check value */
2911                                 switch (*((uint16_t *)or->ip6or_value)) {
2912                                 case IP6_ALERT_MLD:
2913                                 case IP6_ALERT_RSVP:
2914                                         ret = 1;
2915                                 }
2916                                 break;
2917                         }
2918                         case IP6OPT_HOME_ADDRESS: {
2919                                 /*
2920                                  * Minimal support for the home address option
2921                                  * (which is required by all IPv6 nodes).
2922                                  * Implement by just swapping the home address
2923                                  * and source address.
2924                                  * XXX Note: this has IPsec implications since
2925                                  * AH needs to take this into account.
2926                                  * Also, when IPsec is used we need to ensure
2927                                  * that this is only processed once
2928                                  * in the received packet (to avoid swapping
2929                                  * back and forth).
2930                                  * NOTE:This option processing is considered
2931                                  * to be unsafe and prone to a denial of
2932                                  * service attack.
2933                                  * The current processing is not safe even with
2934                                  * IPsec secured IP packets. Since the home
2935                                  * address option processing requirement still
2936                                  * is in the IETF draft and in the process of
2937                                  * being redefined for its usage, it has been
2938                                  * decided to turn off the option by default.
2939                                  * If this section of code needs to be executed,
2940                                  * ndd variable ip6_ignore_home_address_opt
2941                                  * should be set to 0 at the user's own risk.
2942                                  */
2943                                 struct ip6_opt_home_address *oh;
2944                                 in6_addr_t tmp;
2945 
2946                                 if (ipst->ips_ipv6_ignore_home_address_opt)
2947                                         goto opt_error;
2948 
2949                                 if (hdr_type != IPPROTO_DSTOPTS)
2950                                         goto opt_error;
2951                                 optused = 2 + optptr[1];
2952                                 if (optused > optlen)
2953                                         goto bad_opt;
2954 
2955                                 /*
2956                                  * We did this dest. opt the first time
2957                                  * around (i.e. before AH processing).
2958                                  * If we've done AH... stop now.
2959                                  */
2960                                 if ((ira->ira_flags & IRAF_IPSEC_SECURE) &&
2961                                     ira->ira_ipsec_ah_sa != NULL)
2962                                         break;
2963 
2964                                 oh = (struct ip6_opt_home_address *)optptr;
2965                                 /* Check total length and alignment */
2966                                 if (optused < sizeof (*oh) ||
2967                                     ((uintptr_t)oh->ip6oh_addr & 0x7) != 0)
2968                                         goto opt_error;
2969                                 /* Swap ip6_src and the home address */
2970                                 tmp = ip6h->ip6_src;
2971                                 /* XXX Note: only 8 byte alignment option */
2972                                 ip6h->ip6_src = *(in6_addr_t *)oh->ip6oh_addr;
2973                                 *(in6_addr_t *)oh->ip6oh_addr = tmp;
2974                                 break;
2975                         }
2976 
2977                         case IP6OPT_TUNNEL_LIMIT:
2978                                 if (hdr_type != IPPROTO_DSTOPTS) {
2979                                         goto opt_error;
2980                                 }
2981                                 optused = 2 + optptr[1];
2982                                 if (optused > optlen) {
2983                                         goto bad_opt;
2984                                 }
2985                                 if (optused != 3) {
2986                                         goto opt_error;
2987                                 }
2988                                 break;
2989 
2990                         default:
2991                                 errtype = "unknown";
2992                                 /* FALLTHROUGH */
2993                         opt_error:
2994                                 /* Determine which zone should send error */
2995                                 switch (IP6OPT_TYPE(opt_type)) {
2996                                 case IP6OPT_TYPE_SKIP:
2997                                         optused = 2 + optptr[1];
2998                                         if (optused > optlen)
2999                                                 goto bad_opt;
3000                                         ip1dbg(("ip_process_options_v6: %s "
3001                                             "opt 0x%x skipped\n",
3002                                             errtype, opt_type));
3003                                         break;
3004                                 case IP6OPT_TYPE_DISCARD:
3005                                         ip1dbg(("ip_process_options_v6: %s "
3006                                             "opt 0x%x; packet dropped\n",
3007                                             errtype, opt_type));
3008                                         BUMP_MIB(ill->ill_ip_mib,
3009                                             ipIfStatsInHdrErrors);
3010                                         ip_drop_input("ipIfStatsInHdrErrors",
3011                                             mp, ill);
3012                                         freemsg(mp);
3013                                         return (-1);
3014                                 case IP6OPT_TYPE_ICMP:
3015                                         BUMP_MIB(ill->ill_ip_mib,
3016                                             ipIfStatsInHdrErrors);
3017                                         ip_drop_input("ipIfStatsInHdrErrors",
3018                                             mp, ill);
3019                                         icmp_param_problem_v6(mp,
3020                                             ICMP6_PARAMPROB_OPTION,
3021                                             (uint32_t)(optptr -
3022                                             (uint8_t *)ip6h),
3023                                             B_FALSE, ira);
3024                                         return (-1);
3025                                 case IP6OPT_TYPE_FORCEICMP:
3026                                         BUMP_MIB(ill->ill_ip_mib,
3027                                             ipIfStatsInHdrErrors);
3028                                         ip_drop_input("ipIfStatsInHdrErrors",
3029                                             mp, ill);
3030                                         icmp_param_problem_v6(mp,
3031                                             ICMP6_PARAMPROB_OPTION,
3032                                             (uint32_t)(optptr -
3033                                             (uint8_t *)ip6h),
3034                                             B_TRUE, ira);
3035                                         return (-1);
3036                                 default:
3037                                         ASSERT(0);
3038                                 }
3039                         }
3040                 }
3041                 optlen -= optused;
3042                 optptr += optused;
3043         }
3044         return (ret);
3045 
3046 bad_opt:
3047         /* Determine which zone should send error */
3048         ip_drop_input("ICMP_PARAM_PROBLEM", mp, ill);
3049         icmp_param_problem_v6(mp, ICMP6_PARAMPROB_OPTION,
3050             (uint32_t)(optptr - (uint8_t *)ip6h),
3051             B_FALSE, ira);
3052         return (-1);
3053 }
3054 
3055 /*
3056  * Process a routing header that is not yet empty.
3057  * Because of RFC 5095, we now reject all route headers.
3058  */
3059 void
3060 ip_process_rthdr(mblk_t *mp, ip6_t *ip6h, ip6_rthdr_t *rth,
3061     ip_recv_attr_t *ira)
3062 {
3063         ill_t           *ill = ira->ira_ill;
3064         ip_stack_t      *ipst = ill->ill_ipst;
3065 
3066         ASSERT(rth->ip6r_segleft != 0);
3067 
3068         if (!ipst->ips_ipv6_forward_src_routed) {
3069                 /* XXX Check for source routed out same interface? */
3070                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
3071                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
3072                 ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
3073                 freemsg(mp);
3074                 return;
3075         }
3076 
3077         ip_drop_input("ICMP_PARAM_PROBLEM", mp, ill);
3078         icmp_param_problem_v6(mp, ICMP6_PARAMPROB_HEADER,
3079             (uint32_t)((uchar_t *)&rth->ip6r_type - (uchar_t *)ip6h),
3080             B_FALSE, ira);
3081 }
3082 
3083 /*
3084  * Read side put procedure for IPv6 module.
3085  */
3086 void
3087 ip_rput_v6(queue_t *q, mblk_t *mp)
3088 {
3089         ill_t           *ill;
3090 
3091         ill = (ill_t *)q->q_ptr;
3092         if (ill->ill_state_flags & (ILL_CONDEMNED | ILL_LL_SUBNET_PENDING)) {
3093                 union DL_primitives *dl;
3094 
3095                 dl = (union DL_primitives *)mp->b_rptr;
3096                 /*
3097                  * Things are opening or closing - only accept DLPI
3098                  * ack messages. If the stream is closing and ip_wsrv
3099                  * has completed, ip_close is out of the qwait, but has
3100                  * not yet completed qprocsoff. Don't proceed any further
3101                  * because the ill has been cleaned up and things hanging
3102                  * off the ill have been freed.
3103                  */
3104                 if ((mp->b_datap->db_type != M_PCPROTO) ||
3105                     (dl->dl_primitive == DL_UNITDATA_IND)) {
3106                         inet_freemsg(mp);
3107                         return;
3108                 }
3109         }
3110         if (DB_TYPE(mp) == M_DATA) {
3111                 struct mac_header_info_s mhi;
3112 
3113                 ip_mdata_to_mhi(ill, mp, &mhi);
3114                 ip_input_v6(ill, NULL, mp, &mhi);
3115         } else {
3116                 ip_rput_notdata(ill, mp);
3117         }
3118 }
3119 
3120 /*
3121  * Walk through the IPv6 packet in mp and see if there's an AH header
3122  * in it.  See if the AH header needs to get done before other headers in
3123  * the packet.  (Worker function for ipsec_early_ah_v6().)
3124  */
3125 #define IPSEC_HDR_DONT_PROCESS  0
3126 #define IPSEC_HDR_PROCESS       1
3127 #define IPSEC_MEMORY_ERROR      2 /* or malformed packet */
3128 static int
3129 ipsec_needs_processing_v6(mblk_t *mp, uint8_t *nexthdr)
3130 {
3131         uint_t  length;
3132         uint_t  ehdrlen;
3133         uint8_t *whereptr;
3134         uint8_t *endptr;
3135         uint8_t *nexthdrp;
3136         ip6_dest_t *desthdr;
3137         ip6_rthdr_t *rthdr;
3138         ip6_t   *ip6h;
3139 
3140         /*
3141          * For now just pullup everything.  In general, the less pullups,
3142          * the better, but there's so much squirrelling through anyway,
3143          * it's just easier this way.
3144          */
3145         if (!pullupmsg(mp, -1)) {
3146                 return (IPSEC_MEMORY_ERROR);
3147         }
3148 
3149         ip6h = (ip6_t *)mp->b_rptr;
3150         length = IPV6_HDR_LEN;
3151         whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */
3152         endptr = mp->b_wptr;
3153 
3154         /*
3155          * We can't just use the argument nexthdr in the place
3156          * of nexthdrp becaue we don't dereference nexthdrp
3157          * till we confirm whether it is a valid address.
3158          */
3159         nexthdrp = &ip6h->ip6_nxt;
3160         while (whereptr < endptr) {
3161                 /* Is there enough left for len + nexthdr? */
3162                 if (whereptr + MIN_EHDR_LEN > endptr)
3163                         return (IPSEC_MEMORY_ERROR);
3164 
3165                 switch (*nexthdrp) {
3166                 case IPPROTO_HOPOPTS:
3167                 case IPPROTO_DSTOPTS:
3168                         /* Assumes the headers are identical for hbh and dst */
3169                         desthdr = (ip6_dest_t *)whereptr;
3170                         ehdrlen = 8 * (desthdr->ip6d_len + 1);
3171                         if ((uchar_t *)desthdr +  ehdrlen > endptr)
3172                                 return (IPSEC_MEMORY_ERROR);
3173                         /*
3174                          * Return DONT_PROCESS because the destination
3175                          * options header may be for each hop in a
3176                          * routing-header, and we only want AH if we're
3177                          * finished with routing headers.
3178                          */
3179                         if (*nexthdrp == IPPROTO_DSTOPTS)
3180                                 return (IPSEC_HDR_DONT_PROCESS);
3181                         nexthdrp = &desthdr->ip6d_nxt;
3182                         break;
3183                 case IPPROTO_ROUTING:
3184                         rthdr = (ip6_rthdr_t *)whereptr;
3185 
3186                         /*
3187                          * If there's more hops left on the routing header,
3188                          * return now with DON'T PROCESS.
3189                          */
3190                         if (rthdr->ip6r_segleft > 0)
3191                                 return (IPSEC_HDR_DONT_PROCESS);
3192 
3193                         ehdrlen =  8 * (rthdr->ip6r_len + 1);
3194                         if ((uchar_t *)rthdr +  ehdrlen > endptr)
3195                                 return (IPSEC_MEMORY_ERROR);
3196                         nexthdrp = &rthdr->ip6r_nxt;
3197                         break;
3198                 case IPPROTO_FRAGMENT:
3199                         /* Wait for reassembly */
3200                         return (IPSEC_HDR_DONT_PROCESS);
3201                 case IPPROTO_AH:
3202                         *nexthdr = IPPROTO_AH;
3203                         return (IPSEC_HDR_PROCESS);
3204                 case IPPROTO_NONE:
3205                         /* No next header means we're finished */
3206                 default:
3207                         return (IPSEC_HDR_DONT_PROCESS);
3208                 }
3209                 length += ehdrlen;
3210                 whereptr += ehdrlen;
3211         }
3212         /*
3213          * Malformed/truncated packet.
3214          */
3215         return (IPSEC_MEMORY_ERROR);
3216 }
3217 
3218 /*
3219  * Path for AH if options are present.
3220  * Returns NULL if the mblk was consumed.
3221  *
3222  * Sometimes AH needs to be done before other IPv6 headers for security
3223  * reasons.  This function (and its ipsec_needs_processing_v6() above)
3224  * indicates if that is so, and fans out to the appropriate IPsec protocol
3225  * for the datagram passed in.
3226  */
3227 mblk_t *
3228 ipsec_early_ah_v6(mblk_t *mp, ip_recv_attr_t *ira)
3229 {
3230         uint8_t nexthdr;
3231         ah_t *ah;
3232         ill_t           *ill = ira->ira_ill;
3233         ip_stack_t      *ipst = ill->ill_ipst;
3234         ipsec_stack_t   *ipss = ipst->ips_netstack->netstack_ipsec;
3235 
3236         switch (ipsec_needs_processing_v6(mp, &nexthdr)) {
3237         case IPSEC_MEMORY_ERROR:
3238                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
3239                 ip_drop_input("ipIfStatsInDiscards", mp, ill);
3240                 freemsg(mp);
3241                 return (NULL);
3242         case IPSEC_HDR_DONT_PROCESS:
3243                 return (mp);
3244         }
3245 
3246         /* Default means send it to AH! */
3247         ASSERT(nexthdr == IPPROTO_AH);
3248 
3249         if (!ipsec_loaded(ipss)) {
3250                 ip_proto_not_sup(mp, ira);
3251                 return (NULL);
3252         }
3253 
3254         mp = ipsec_inbound_ah_sa(mp, ira, &ah);
3255         if (mp == NULL)
3256                 return (NULL);
3257         ASSERT(ah != NULL);
3258         ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
3259         ASSERT(ira->ira_ipsec_ah_sa != NULL);
3260         ASSERT(ira->ira_ipsec_ah_sa->ipsa_input_func != NULL);
3261         mp = ira->ira_ipsec_ah_sa->ipsa_input_func(mp, ah, ira);
3262 
3263         if (mp == NULL) {
3264                 /*
3265                  * Either it failed or is pending. In the former case
3266                  * ipIfStatsInDiscards was increased.
3267                  */
3268                 return (NULL);
3269         }
3270 
3271         /* we're done with IPsec processing, send it up */
3272         ip_input_post_ipsec(mp, ira);
3273         return (NULL);
3274 }
3275 
3276 /*
3277  * Reassemble fragment.
3278  * When it returns a completed message the first mblk will only contain
3279  * the headers prior to the fragment header, with the nexthdr value updated
3280  * to be the header after the fragment header.
3281  */
3282 mblk_t *
3283 ip_input_fragment_v6(mblk_t *mp, ip6_t *ip6h,
3284     ip6_frag_t *fraghdr, uint_t remlen, ip_recv_attr_t *ira)
3285 {
3286         uint32_t        ident = ntohl(fraghdr->ip6f_ident);
3287         uint16_t        offset;
3288         boolean_t       more_frags;
3289         uint8_t         nexthdr = fraghdr->ip6f_nxt;
3290         in6_addr_t      *v6dst_ptr;
3291         in6_addr_t      *v6src_ptr;
3292         uint_t          end;
3293         uint_t          hdr_length;
3294         size_t          count;
3295         ipf_t           *ipf;
3296         ipf_t           **ipfp;
3297         ipfb_t          *ipfb;
3298         mblk_t          *mp1;
3299         uint8_t         ecn_info = 0;
3300         size_t          msg_len;
3301         mblk_t          *tail_mp;
3302         mblk_t          *t_mp;
3303         boolean_t       pruned = B_FALSE;
3304         uint32_t        sum_val;
3305         uint16_t        sum_flags;
3306         ill_t           *ill = ira->ira_ill;
3307         ip_stack_t      *ipst = ill->ill_ipst;
3308         uint_t          prev_nexthdr_offset;
3309         uint8_t         prev_nexthdr;
3310         uint8_t         *ptr;
3311         uint32_t        packet_size;
3312 
3313         /*
3314          * We utilize hardware computed checksum info only for UDP since
3315          * IP fragmentation is a normal occurrence for the protocol.  In
3316          * addition, checksum offload support for IP fragments carrying
3317          * UDP payload is commonly implemented across network adapters.
3318          */
3319         ASSERT(ira->ira_rill != NULL);
3320         if (nexthdr == IPPROTO_UDP && dohwcksum &&
3321             ILL_HCKSUM_CAPABLE(ira->ira_rill) &&
3322             (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) {
3323                 mblk_t *mp1 = mp->b_cont;
3324                 int32_t len;
3325 
3326                 /* Record checksum information from the packet */
3327                 sum_val = (uint32_t)DB_CKSUM16(mp);
3328                 sum_flags = DB_CKSUMFLAGS(mp);
3329 
3330                 /* fragmented payload offset from beginning of mblk */
3331                 offset = (uint16_t)((uchar_t *)&fraghdr[1] - mp->b_rptr);
3332 
3333                 if ((sum_flags & HCK_PARTIALCKSUM) &&
3334                     (mp1 == NULL || mp1->b_cont == NULL) &&
3335                     offset >= DB_CKSUMSTART(mp) &&
3336                     ((len = offset - DB_CKSUMSTART(mp)) & 1) == 0) {
3337                         uint32_t adj;
3338                         /*
3339                          * Partial checksum has been calculated by hardware
3340                          * and attached to the packet; in addition, any
3341                          * prepended extraneous data is even byte aligned.
3342                          * If any such data exists, we adjust the checksum;
3343                          * this would also handle any postpended data.
3344                          */
3345                         IP_ADJCKSUM_PARTIAL(mp->b_rptr + DB_CKSUMSTART(mp),
3346                             mp, mp1, len, adj);
3347 
3348                         /* One's complement subtract extraneous checksum */
3349                         if (adj >= sum_val)
3350                                 sum_val = ~(adj - sum_val) & 0xFFFF;
3351                         else
3352                                 sum_val -= adj;
3353                 }
3354         } else {
3355                 sum_val = 0;
3356                 sum_flags = 0;
3357         }
3358 
3359         /* Clear hardware checksumming flag */
3360         DB_CKSUMFLAGS(mp) = 0;
3361 
3362         /*
3363          * Determine the offset (from the beginning of the IP header)
3364          * of the nexthdr value which has IPPROTO_FRAGMENT. We use
3365          * this when removing the fragment header from the packet.
3366          * This packet consists of the IPv6 header, a potential
3367          * hop-by-hop options header, a potential pre-routing-header
3368          * destination options header, and a potential routing header.
3369          */
3370         prev_nexthdr_offset = (uint8_t *)&ip6h->ip6_nxt - (uint8_t *)ip6h;
3371         prev_nexthdr = ip6h->ip6_nxt;
3372         ptr = (uint8_t *)&ip6h[1];
3373 
3374         if (prev_nexthdr == IPPROTO_HOPOPTS) {
3375                 ip6_hbh_t       *hbh_hdr;
3376                 uint_t          hdr_len;
3377 
3378                 hbh_hdr = (ip6_hbh_t *)ptr;
3379                 hdr_len = 8 * (hbh_hdr->ip6h_len + 1);
3380                 prev_nexthdr = hbh_hdr->ip6h_nxt;
3381                 prev_nexthdr_offset = (uint8_t *)&hbh_hdr->ip6h_nxt
3382                     - (uint8_t *)ip6h;
3383                 ptr += hdr_len;
3384         }
3385         if (prev_nexthdr == IPPROTO_DSTOPTS) {
3386                 ip6_dest_t      *dest_hdr;
3387                 uint_t          hdr_len;
3388 
3389                 dest_hdr = (ip6_dest_t *)ptr;
3390                 hdr_len = 8 * (dest_hdr->ip6d_len + 1);
3391                 prev_nexthdr = dest_hdr->ip6d_nxt;
3392                 prev_nexthdr_offset = (uint8_t *)&dest_hdr->ip6d_nxt
3393                     - (uint8_t *)ip6h;
3394                 ptr += hdr_len;
3395         }
3396         if (prev_nexthdr == IPPROTO_ROUTING) {
3397                 ip6_rthdr_t     *rthdr;
3398                 uint_t          hdr_len;
3399 
3400                 rthdr = (ip6_rthdr_t *)ptr;
3401                 prev_nexthdr = rthdr->ip6r_nxt;
3402                 prev_nexthdr_offset = (uint8_t *)&rthdr->ip6r_nxt
3403                     - (uint8_t *)ip6h;
3404                 hdr_len = 8 * (rthdr->ip6r_len + 1);
3405                 ptr += hdr_len;
3406         }
3407         if (prev_nexthdr != IPPROTO_FRAGMENT) {
3408                 /* Can't handle other headers before the fragment header */
3409                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
3410                 ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
3411                 freemsg(mp);
3412                 return (NULL);
3413         }
3414 
3415         /*
3416          * Note: Fragment offset in header is in 8-octet units.
3417          * Clearing least significant 3 bits not only extracts
3418          * it but also gets it in units of octets.
3419          */
3420         offset = ntohs(fraghdr->ip6f_offlg) & ~7;
3421         more_frags = (fraghdr->ip6f_offlg & IP6F_MORE_FRAG);
3422 
3423         /*
3424          * Is the more frags flag on and the payload length not a multiple
3425          * of eight?
3426          */
3427         if (more_frags && (ntohs(ip6h->ip6_plen) & 7)) {
3428                 ip_drop_input("ICMP_PARAM_PROBLEM", mp, ill);
3429                 icmp_param_problem_v6(mp, ICMP6_PARAMPROB_HEADER,
3430                     (uint32_t)((char *)&ip6h->ip6_plen -
3431                     (char *)ip6h), B_FALSE, ira);
3432                 return (NULL);
3433         }
3434 
3435         v6src_ptr = &ip6h->ip6_src;
3436         v6dst_ptr = &ip6h->ip6_dst;
3437         end = remlen;
3438 
3439         hdr_length = (uint_t)((char *)&fraghdr[1] - (char *)ip6h);
3440         end += offset;
3441 
3442         /*
3443          * Would fragment cause reassembled packet to have a payload length
3444          * greater than IP_MAXPACKET - the max payload size?
3445          */
3446         if (end > IP_MAXPACKET) {
3447                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
3448                 ip_drop_input("Reassembled packet too large", mp, ill);
3449                 icmp_param_problem_v6(mp, ICMP6_PARAMPROB_HEADER,
3450                     (uint32_t)((char *)&fraghdr->ip6f_offlg -
3451                     (char *)ip6h), B_FALSE, ira);
3452                 return (NULL);
3453         }
3454 
3455         /*
3456          * This packet just has one fragment. Reassembly not
3457          * needed.
3458          */
3459         if (!more_frags && offset == 0) {
3460                 goto reass_done;
3461         }
3462 
3463         /*
3464          * Drop the fragment as early as possible, if
3465          * we don't have resource(s) to re-assemble.
3466          */
3467         if (ipst->ips_ip_reass_queue_bytes == 0) {
3468                 freemsg(mp);
3469                 return (NULL);
3470         }
3471 
3472         /* Record the ECN field info. */
3473         ecn_info = (uint8_t)(ntohl(ip6h->ip6_vcf & htonl(~0xFFCFFFFF)) >> 20);
3474         /*
3475          * If this is not the first fragment, dump the unfragmentable
3476          * portion of the packet.
3477          */
3478         if (offset)
3479                 mp->b_rptr = (uchar_t *)&fraghdr[1];
3480 
3481         /*
3482          * Fragmentation reassembly.  Each ILL has a hash table for
3483          * queueing packets undergoing reassembly for all IPIFs
3484          * associated with the ILL.  The hash is based on the packet
3485          * IP ident field.  The ILL frag hash table was allocated
3486          * as a timer block at the time the ILL was created.  Whenever
3487          * there is anything on the reassembly queue, the timer will
3488          * be running.
3489          */
3490         /* Handle vnic loopback of fragments */
3491         if (mp->b_datap->db_ref > 2)
3492                 msg_len = 0;
3493         else
3494                 msg_len = MBLKSIZE(mp);
3495 
3496         tail_mp = mp;
3497         while (tail_mp->b_cont != NULL) {
3498                 tail_mp = tail_mp->b_cont;
3499                 if (tail_mp->b_datap->db_ref <= 2)
3500                         msg_len += MBLKSIZE(tail_mp);
3501         }
3502         /*
3503          * If the reassembly list for this ILL will get too big
3504          * prune it.
3505          */
3506 
3507         if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >=
3508             ipst->ips_ip_reass_queue_bytes) {
3509                 DTRACE_PROBE3(ip_reass_queue_bytes, uint_t, msg_len,
3510                     uint_t, ill->ill_frag_count,
3511                     uint_t, ipst->ips_ip_reass_queue_bytes);
3512                 ill_frag_prune(ill,
3513                     (ipst->ips_ip_reass_queue_bytes < msg_len) ? 0 :
3514                     (ipst->ips_ip_reass_queue_bytes - msg_len));
3515                 pruned = B_TRUE;
3516         }
3517 
3518         ipfb = &ill->ill_frag_hash_tbl[ILL_FRAG_HASH_V6(*v6src_ptr, ident)];
3519         mutex_enter(&ipfb->ipfb_lock);
3520 
3521         ipfp = &ipfb->ipfb_ipf;
3522         /* Try to find an existing fragment queue for this packet. */
3523         for (;;) {
3524                 ipf = ipfp[0];
3525                 if (ipf) {
3526                         /*
3527                          * It has to match on ident, source address, and
3528                          * dest address.
3529                          */
3530                         if (ipf->ipf_ident == ident &&
3531                             IN6_ARE_ADDR_EQUAL(&ipf->ipf_v6src, v6src_ptr) &&
3532                             IN6_ARE_ADDR_EQUAL(&ipf->ipf_v6dst, v6dst_ptr)) {
3533 
3534                                 /*
3535                                  * If we have received too many
3536                                  * duplicate fragments for this packet
3537                                  * free it.
3538                                  */
3539                                 if (ipf->ipf_num_dups > ip_max_frag_dups) {
3540                                         ill_frag_free_pkts(ill, ipfb, ipf, 1);
3541                                         freemsg(mp);
3542                                         mutex_exit(&ipfb->ipfb_lock);
3543                                         return (NULL);
3544                                 }
3545 
3546                                 break;
3547                         }
3548                         ipfp = &ipf->ipf_hash_next;
3549                         continue;
3550                 }
3551 
3552 
3553                 /*
3554                  * If we pruned the list, do we want to store this new
3555                  * fragment?. We apply an optimization here based on the
3556                  * fact that most fragments will be received in order.
3557                  * So if the offset of this incoming fragment is zero,
3558                  * it is the first fragment of a new packet. We will
3559                  * keep it.  Otherwise drop the fragment, as we have
3560                  * probably pruned the packet already (since the
3561                  * packet cannot be found).
3562                  */
3563 
3564                 if (pruned && offset != 0) {
3565                         mutex_exit(&ipfb->ipfb_lock);
3566                         freemsg(mp);
3567                         return (NULL);
3568                 }
3569 
3570                 /* New guy.  Allocate a frag message. */
3571                 mp1 = allocb(sizeof (*ipf), BPRI_MED);
3572                 if (!mp1) {
3573                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
3574                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
3575                         freemsg(mp);
3576         partial_reass_done:
3577                         mutex_exit(&ipfb->ipfb_lock);
3578                         return (NULL);
3579                 }
3580 
3581                 if (ipfb->ipfb_frag_pkts >= MAX_FRAG_PKTS(ipst))  {
3582                         /*
3583                          * Too many fragmented packets in this hash bucket.
3584                          * Free the oldest.
3585                          */
3586                         ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 1);
3587                 }
3588 
3589                 mp1->b_cont = mp;
3590 
3591                 /* Initialize the fragment header. */
3592                 ipf = (ipf_t *)mp1->b_rptr;
3593                 ipf->ipf_mp = mp1;
3594                 ipf->ipf_ptphn = ipfp;
3595                 ipfp[0] = ipf;
3596                 ipf->ipf_hash_next = NULL;
3597                 ipf->ipf_ident = ident;
3598                 ipf->ipf_v6src = *v6src_ptr;
3599                 ipf->ipf_v6dst = *v6dst_ptr;
3600                 /* Record reassembly start time. */
3601                 ipf->ipf_timestamp = gethrestime_sec();
3602                 /* Record ipf generation and account for frag header */
3603                 ipf->ipf_gen = ill->ill_ipf_gen++;
3604                 ipf->ipf_count = MBLKSIZE(mp1);
3605                 ipf->ipf_protocol = nexthdr;
3606                 ipf->ipf_nf_hdr_len = 0;
3607                 ipf->ipf_prev_nexthdr_offset = 0;
3608                 ipf->ipf_last_frag_seen = B_FALSE;
3609                 ipf->ipf_ecn = ecn_info;
3610                 ipf->ipf_num_dups = 0;
3611                 ipfb->ipfb_frag_pkts++;
3612                 ipf->ipf_checksum = 0;
3613                 ipf->ipf_checksum_flags = 0;
3614 
3615                 /* Store checksum value in fragment header */
3616                 if (sum_flags != 0) {
3617                         sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
3618                         sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
3619                         ipf->ipf_checksum = sum_val;
3620                         ipf->ipf_checksum_flags = sum_flags;
3621                 }
3622 
3623                 /*
3624                  * We handle reassembly two ways.  In the easy case,
3625                  * where all the fragments show up in order, we do
3626                  * minimal bookkeeping, and just clip new pieces on
3627                  * the end.  If we ever see a hole, then we go off
3628                  * to ip_reassemble which has to mark the pieces and
3629                  * keep track of the number of holes, etc.  Obviously,
3630                  * the point of having both mechanisms is so we can
3631                  * handle the easy case as efficiently as possible.
3632                  */
3633                 if (offset == 0) {
3634                         /* Easy case, in-order reassembly so far. */
3635                         /* Update the byte count */
3636                         ipf->ipf_count += msg_len;
3637                         ipf->ipf_tail_mp = tail_mp;
3638                         /*
3639                          * Keep track of next expected offset in
3640                          * ipf_end.
3641                          */
3642                         ipf->ipf_end = end;
3643                         ipf->ipf_nf_hdr_len = hdr_length;
3644                         ipf->ipf_prev_nexthdr_offset = prev_nexthdr_offset;
3645                 } else {
3646                         /* Hard case, hole at the beginning. */
3647                         ipf->ipf_tail_mp = NULL;
3648                         /*
3649                          * ipf_end == 0 means that we have given up
3650                          * on easy reassembly.
3651                          */
3652                         ipf->ipf_end = 0;
3653 
3654                         /* Forget checksum offload from now on */
3655                         ipf->ipf_checksum_flags = 0;
3656 
3657                         /*
3658                          * ipf_hole_cnt is set by ip_reassemble.
3659                          * ipf_count is updated by ip_reassemble.
3660                          * No need to check for return value here
3661                          * as we don't expect reassembly to complete or
3662                          * fail for the first fragment itself.
3663                          */
3664                         (void) ip_reassemble(mp, ipf, offset, more_frags, ill,
3665                             msg_len);
3666                 }
3667                 /* Update per ipfb and ill byte counts */
3668                 ipfb->ipfb_count += ipf->ipf_count;
3669                 ASSERT(ipfb->ipfb_count > 0);     /* Wraparound */
3670                 atomic_add_32(&ill->ill_frag_count, ipf->ipf_count);
3671                 /* If the frag timer wasn't already going, start it. */
3672                 mutex_enter(&ill->ill_lock);
3673                 ill_frag_timer_start(ill);
3674                 mutex_exit(&ill->ill_lock);
3675                 goto partial_reass_done;
3676         }
3677 
3678         /*
3679          * If the packet's flag has changed (it could be coming up
3680          * from an interface different than the previous, therefore
3681          * possibly different checksum capability), then forget about
3682          * any stored checksum states.  Otherwise add the value to
3683          * the existing one stored in the fragment header.
3684          */
3685         if (sum_flags != 0 && sum_flags == ipf->ipf_checksum_flags) {
3686                 sum_val += ipf->ipf_checksum;
3687                 sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
3688                 sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
3689                 ipf->ipf_checksum = sum_val;
3690         } else if (ipf->ipf_checksum_flags != 0) {
3691                 /* Forget checksum offload from now on */
3692                 ipf->ipf_checksum_flags = 0;
3693         }
3694 
3695         /*
3696          * We have a new piece of a datagram which is already being
3697          * reassembled.  Update the ECN info if all IP fragments
3698          * are ECN capable.  If there is one which is not, clear
3699          * all the info.  If there is at least one which has CE
3700          * code point, IP needs to report that up to transport.
3701          */
3702         if (ecn_info != IPH_ECN_NECT && ipf->ipf_ecn != IPH_ECN_NECT) {
3703                 if (ecn_info == IPH_ECN_CE)
3704                         ipf->ipf_ecn = IPH_ECN_CE;
3705         } else {
3706                 ipf->ipf_ecn = IPH_ECN_NECT;
3707         }
3708 
3709         if (offset && ipf->ipf_end == offset) {
3710                 /* The new fragment fits at the end */
3711                 ipf->ipf_tail_mp->b_cont = mp;
3712                 /* Update the byte count */
3713                 ipf->ipf_count += msg_len;
3714                 /* Update per ipfb and ill byte counts */
3715                 ipfb->ipfb_count += msg_len;
3716                 ASSERT(ipfb->ipfb_count > 0);     /* Wraparound */
3717                 atomic_add_32(&ill->ill_frag_count, msg_len);
3718                 if (more_frags) {
3719                         /* More to come. */
3720                         ipf->ipf_end = end;
3721                         ipf->ipf_tail_mp = tail_mp;
3722                         goto partial_reass_done;
3723                 }
3724         } else {
3725                 /*
3726                  * Go do the hard cases.
3727                  * Call ip_reassemble().
3728                  */
3729                 int ret;
3730 
3731                 if (offset == 0) {
3732                         if (ipf->ipf_prev_nexthdr_offset == 0) {
3733                                 ipf->ipf_nf_hdr_len = hdr_length;
3734                                 ipf->ipf_prev_nexthdr_offset =
3735                                     prev_nexthdr_offset;
3736                         }
3737                 }
3738                 /* Save current byte count */
3739                 count = ipf->ipf_count;
3740                 ret = ip_reassemble(mp, ipf, offset, more_frags, ill, msg_len);
3741 
3742                 /* Count of bytes added and subtracted (freeb()ed) */
3743                 count = ipf->ipf_count - count;
3744                 if (count) {
3745                         /* Update per ipfb and ill byte counts */
3746                         ipfb->ipfb_count += count;
3747                         ASSERT(ipfb->ipfb_count > 0);     /* Wraparound */
3748                         atomic_add_32(&ill->ill_frag_count, count);
3749                 }
3750                 if (ret == IP_REASS_PARTIAL) {
3751                         goto partial_reass_done;
3752                 } else if (ret == IP_REASS_FAILED) {
3753                         /* Reassembly failed. Free up all resources */
3754                         ill_frag_free_pkts(ill, ipfb, ipf, 1);
3755                         for (t_mp = mp; t_mp != NULL; t_mp = t_mp->b_cont) {
3756                                 IP_REASS_SET_START(t_mp, 0);
3757                                 IP_REASS_SET_END(t_mp, 0);
3758                         }
3759                         freemsg(mp);
3760                         goto partial_reass_done;
3761                 }
3762 
3763                 /* We will reach here iff 'ret' is IP_REASS_COMPLETE */
3764         }
3765         /*
3766          * We have completed reassembly.  Unhook the frag header from
3767          * the reassembly list.
3768          *
3769          * Grab the unfragmentable header length next header value out
3770          * of the first fragment
3771          */
3772         ASSERT(ipf->ipf_nf_hdr_len != 0);
3773         hdr_length = ipf->ipf_nf_hdr_len;
3774 
3775         /*
3776          * Before we free the frag header, record the ECN info
3777          * to report back to the transport.
3778          */
3779         ecn_info = ipf->ipf_ecn;
3780 
3781         /*
3782          * Store the nextheader field in the header preceding the fragment
3783          * header
3784          */
3785         nexthdr = ipf->ipf_protocol;
3786         prev_nexthdr_offset = ipf->ipf_prev_nexthdr_offset;
3787         ipfp = ipf->ipf_ptphn;
3788 
3789         /* We need to supply these to caller */
3790         if ((sum_flags = ipf->ipf_checksum_flags) != 0)
3791                 sum_val = ipf->ipf_checksum;
3792         else
3793                 sum_val = 0;
3794 
3795         mp1 = ipf->ipf_mp;
3796         count = ipf->ipf_count;
3797         ipf = ipf->ipf_hash_next;
3798         if (ipf)
3799                 ipf->ipf_ptphn = ipfp;
3800         ipfp[0] = ipf;
3801         atomic_add_32(&ill->ill_frag_count, -count);
3802         ASSERT(ipfb->ipfb_count >= count);
3803         ipfb->ipfb_count -= count;
3804         ipfb->ipfb_frag_pkts--;
3805         mutex_exit(&ipfb->ipfb_lock);
3806         /* Ditch the frag header. */
3807         mp = mp1->b_cont;
3808         freeb(mp1);
3809 
3810         /*
3811          * Make sure the packet is good by doing some sanity
3812          * check. If bad we can silently drop the packet.
3813          */
3814 reass_done:
3815         if (hdr_length < sizeof (ip6_frag_t)) {
3816                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
3817                 ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
3818                 ip1dbg(("ip_input_fragment_v6: bad packet\n"));
3819                 freemsg(mp);
3820                 return (NULL);
3821         }
3822 
3823         /*
3824          * Remove the fragment header from the initial header by
3825          * splitting the mblk into the non-fragmentable header and
3826          * everything after the fragment extension header.  This has the
3827          * side effect of putting all the headers that need destination
3828          * processing into the b_cont block-- on return this fact is
3829          * used in order to avoid having to look at the extensions
3830          * already processed.
3831          *
3832          * Note that this code assumes that the unfragmentable portion
3833          * of the header is in the first mblk and increments
3834          * the read pointer past it.  If this assumption is broken
3835          * this code fails badly.
3836          */
3837         if (mp->b_rptr + hdr_length != mp->b_wptr) {
3838                 mblk_t *nmp;
3839 
3840                 if (!(nmp = dupb(mp))) {
3841                         ip1dbg(("ip_input_fragment_v6: dupb failed\n"));
3842                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
3843                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
3844                         freemsg(mp);
3845                         return (NULL);
3846                 }
3847                 nmp->b_cont = mp->b_cont;
3848                 mp->b_cont = nmp;
3849                 nmp->b_rptr += hdr_length;
3850         }
3851         mp->b_wptr = mp->b_rptr + hdr_length - sizeof (ip6_frag_t);
3852 
3853         ip6h = (ip6_t *)mp->b_rptr;
3854         ((char *)ip6h)[prev_nexthdr_offset] = nexthdr;
3855 
3856         /* Restore original IP length in header. */
3857         packet_size = msgdsize(mp);
3858         ip6h->ip6_plen = htons((uint16_t)(packet_size - IPV6_HDR_LEN));
3859         /* Record the ECN info. */
3860         ip6h->ip6_vcf &= htonl(0xFFCFFFFF);
3861         ip6h->ip6_vcf |= htonl(ecn_info << 20);
3862 
3863         /* Update the receive attributes */
3864         ira->ira_pktlen = packet_size;
3865         ira->ira_ip_hdr_length = hdr_length - sizeof (ip6_frag_t);
3866         ira->ira_protocol = nexthdr;
3867 
3868         /* Reassembly is successful; set checksum information in packet */
3869         DB_CKSUM16(mp) = (uint16_t)sum_val;
3870         DB_CKSUMFLAGS(mp) = sum_flags;
3871         DB_CKSUMSTART(mp) = ira->ira_ip_hdr_length;
3872 
3873         return (mp);
3874 }
3875 
3876 /*
3877  * Given an mblk and a ptr, find the destination address in an IPv6 routing
3878  * header.
3879  */
3880 static in6_addr_t
3881 pluck_out_dst(const mblk_t *mp, uint8_t *whereptr, in6_addr_t oldrv)
3882 {
3883         ip6_rthdr0_t *rt0;
3884         int segleft, numaddr;
3885         in6_addr_t *ap, rv = oldrv;
3886 
3887         rt0 = (ip6_rthdr0_t *)whereptr;
3888         if (rt0->ip6r0_type != 0 && rt0->ip6r0_type != 2) {
3889                 DTRACE_PROBE2(pluck_out_dst_unknown_type, mblk_t *, mp,
3890                     uint8_t *, whereptr);
3891                 return (rv);
3892         }
3893         segleft = rt0->ip6r0_segleft;
3894         numaddr = rt0->ip6r0_len / 2;
3895 
3896         if ((rt0->ip6r0_len & 0x1) ||
3897             (mp != NULL && whereptr + (rt0->ip6r0_len + 1) * 8 > mp->b_wptr) ||
3898             (segleft > rt0->ip6r0_len / 2)) {
3899                 /*
3900                  * Corrupt packet.  Either the routing header length is odd
3901                  * (can't happen) or mismatched compared to the packet, or the
3902                  * number of addresses is.  Return what we can.  This will
3903                  * only be a problem on forwarded packets that get squeezed
3904                  * through an outbound tunnel enforcing IPsec Tunnel Mode.
3905                  */
3906                 DTRACE_PROBE2(pluck_out_dst_badpkt, mblk_t *, mp, uint8_t *,
3907                     whereptr);
3908                 return (rv);
3909         }
3910 
3911         if (segleft != 0) {
3912                 ap = (in6_addr_t *)((char *)rt0 + sizeof (*rt0));
3913                 rv = ap[numaddr - 1];
3914         }
3915 
3916         return (rv);
3917 }
3918 
3919 /*
3920  * Walk through the options to see if there is a routing header.
3921  * If present get the destination which is the last address of
3922  * the option.
3923  * mp needs to be provided in cases when the extension headers might span
3924  * b_cont; mp is never modified by this function.
3925  */
3926 in6_addr_t
3927 ip_get_dst_v6(ip6_t *ip6h, const mblk_t *mp, boolean_t *is_fragment)
3928 {
3929         const mblk_t *current_mp = mp;
3930         uint8_t nexthdr;
3931         uint8_t *whereptr;
3932         int ehdrlen;
3933         in6_addr_t rv;
3934 
3935         whereptr = (uint8_t *)ip6h;
3936         ehdrlen = sizeof (ip6_t);
3937 
3938         /* We assume at least the IPv6 base header is within one mblk. */
3939         ASSERT(mp == NULL ||
3940             (mp->b_rptr <= whereptr && mp->b_wptr >= whereptr + ehdrlen));
3941 
3942         rv = ip6h->ip6_dst;
3943         nexthdr = ip6h->ip6_nxt;
3944         if (is_fragment != NULL)
3945                 *is_fragment = B_FALSE;
3946 
3947         /*
3948          * We also assume (thanks to ipsec_tun_outbound()'s pullup) that
3949          * no extension headers will be split across mblks.
3950          */
3951 
3952         while (nexthdr == IPPROTO_HOPOPTS || nexthdr == IPPROTO_DSTOPTS ||
3953             nexthdr == IPPROTO_ROUTING) {
3954                 if (nexthdr == IPPROTO_ROUTING)
3955                         rv = pluck_out_dst(current_mp, whereptr, rv);
3956 
3957                 /*
3958                  * All IPv6 extension headers have the next-header in byte
3959                  * 0, and the (length - 8) in 8-byte-words.
3960                  */
3961                 while (current_mp != NULL &&
3962                     whereptr + ehdrlen >= current_mp->b_wptr) {
3963                         ehdrlen -= (current_mp->b_wptr - whereptr);
3964                         current_mp = current_mp->b_cont;
3965                         if (current_mp == NULL) {
3966                                 /* Bad packet.  Return what we can. */
3967                                 DTRACE_PROBE3(ip_get_dst_v6_badpkt, mblk_t *,
3968                                     mp, mblk_t *, current_mp, ip6_t *, ip6h);
3969                                 goto done;
3970                         }
3971                         whereptr = current_mp->b_rptr;
3972                 }
3973                 whereptr += ehdrlen;
3974 
3975                 nexthdr = *whereptr;
3976                 ASSERT(current_mp == NULL || whereptr + 1 < current_mp->b_wptr);
3977                 ehdrlen = (*(whereptr + 1) + 1) * 8;
3978         }
3979 
3980 done:
3981         if (nexthdr == IPPROTO_FRAGMENT && is_fragment != NULL)
3982                 *is_fragment = B_TRUE;
3983         return (rv);
3984 }
3985 
3986 /*
3987  * ip_source_routed_v6:
3988  * This function is called by redirect code (called from ip_input_v6) to
3989  * know whether this packet is source routed through this node i.e
3990  * whether this node (router) is part of the journey. This
3991  * function is called under two cases :
3992  *
3993  * case 1 : Routing header was processed by this node and
3994  *          ip_process_rthdr replaced ip6_dst with the next hop
3995  *          and we are forwarding the packet to the next hop.
3996  *
3997  * case 2 : Routing header was not processed by this node and we
3998  *          are just forwarding the packet.
3999  *
4000  * For case (1) we don't want to send redirects. For case(2) we
4001  * want to send redirects.
4002  */
4003 static boolean_t
4004 ip_source_routed_v6(ip6_t *ip6h, mblk_t *mp, ip_stack_t *ipst)
4005 {
4006         uint8_t         nexthdr;
4007         in6_addr_t      *addrptr;
4008         ip6_rthdr0_t    *rthdr;
4009         uint8_t         numaddr;
4010         ip6_hbh_t       *hbhhdr;
4011         uint_t          ehdrlen;
4012         uint8_t         *byteptr;
4013 
4014         ip2dbg(("ip_source_routed_v6\n"));
4015         nexthdr = ip6h->ip6_nxt;
4016         ehdrlen = IPV6_HDR_LEN;
4017 
4018         /* if a routing hdr is preceeded by HOPOPT or DSTOPT */
4019         while (nexthdr == IPPROTO_HOPOPTS ||
4020             nexthdr == IPPROTO_DSTOPTS) {
4021                 byteptr = (uint8_t *)ip6h + ehdrlen;
4022                 /*
4023                  * Check if we have already processed
4024                  * packets or we are just a forwarding
4025                  * router which only pulled up msgs up
4026                  * to IPV6HDR and  one HBH ext header
4027                  */
4028                 if (byteptr + MIN_EHDR_LEN > mp->b_wptr) {
4029                         ip2dbg(("ip_source_routed_v6: Extension"
4030                             " headers not processed\n"));
4031                         return (B_FALSE);
4032                 }
4033                 hbhhdr = (ip6_hbh_t *)byteptr;
4034                 nexthdr = hbhhdr->ip6h_nxt;
4035                 ehdrlen = ehdrlen + 8 * (hbhhdr->ip6h_len + 1);
4036         }
4037         switch (nexthdr) {
4038         case IPPROTO_ROUTING:
4039                 byteptr = (uint8_t *)ip6h + ehdrlen;
4040                 /*
4041                  * If for some reason, we haven't pulled up
4042                  * the routing hdr data mblk, then we must
4043                  * not have processed it at all. So for sure
4044                  * we are not part of the source routed journey.
4045                  */
4046                 if (byteptr + MIN_EHDR_LEN > mp->b_wptr) {
4047                         ip2dbg(("ip_source_routed_v6: Routing"
4048                             " header not processed\n"));
4049                         return (B_FALSE);
4050                 }
4051                 rthdr = (ip6_rthdr0_t *)byteptr;
4052                 /*
4053                  * Either we are an intermediate router or the
4054                  * last hop before destination and we have
4055                  * already processed the routing header.
4056                  * If segment_left is greater than or equal to zero,
4057                  * then we must be the (numaddr - segleft) entry
4058                  * of the routing header. Although ip6r0_segleft
4059                  * is a unit8_t variable, we still check for zero
4060                  * or greater value, if in case the data type
4061                  * is changed someday in future.
4062                  */
4063                 if (rthdr->ip6r0_segleft > 0 ||
4064                     rthdr->ip6r0_segleft == 0) {
4065                         numaddr = rthdr->ip6r0_len / 2;
4066                         addrptr = (in6_addr_t *)((char *)rthdr +
4067                             sizeof (*rthdr));
4068                         addrptr += (numaddr - (rthdr->ip6r0_segleft + 1));
4069                         if (addrptr != NULL) {
4070                                 if (ip_type_v6(addrptr, ipst) == IRE_LOCAL)
4071                                         return (B_TRUE);
4072                                 ip1dbg(("ip_source_routed_v6: Not local\n"));
4073                         }
4074                 }
4075         /* FALLTHRU */
4076         default:
4077                 ip2dbg(("ip_source_routed_v6: Not source routed here\n"));
4078                 return (B_FALSE);
4079         }
4080 }
4081 
4082 /*
4083  * IPv6 fragmentation.  Essentially the same as IPv4 fragmentation.
4084  * We have not optimized this in terms of number of mblks
4085  * allocated. For instance, for each fragment sent we always allocate a
4086  * mblk to hold the IPv6 header and fragment header.
4087  *
4088  * Assumes that all the extension headers are contained in the first mblk
4089  * and that the fragment header has has already been added by calling
4090  * ip_fraghdr_add_v6.
4091  */
4092 int
4093 ip_fragment_v6(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, uint_t pkt_len,
4094     uint32_t max_frag, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid,
4095     pfirepostfrag_t postfragfn, uintptr_t *ixa_cookie)
4096 {
4097         ip6_t           *ip6h = (ip6_t *)mp->b_rptr;
4098         ip6_t           *fip6h;
4099         mblk_t          *hmp;
4100         mblk_t          *hmp0;
4101         mblk_t          *dmp;
4102         ip6_frag_t      *fraghdr;
4103         size_t          unfragmentable_len;
4104         size_t          mlen;
4105         size_t          max_chunk;
4106         uint16_t        off_flags;
4107         uint16_t        offset = 0;
4108         ill_t           *ill = nce->nce_ill;
4109         uint8_t         nexthdr;
4110         uint8_t         *ptr;
4111         ip_stack_t      *ipst = ill->ill_ipst;
4112         uint_t          priority = mp->b_band;
4113         int             error = 0;
4114 
4115         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragReqds);
4116         if (max_frag == 0) {
4117                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
4118                 ip_drop_output("FragFails: zero max_frag", mp, ill);
4119                 freemsg(mp);
4120                 return (EINVAL);
4121         }
4122 
4123         /*
4124          * Caller should have added fraghdr_t to pkt_len, and also
4125          * updated ip6_plen.
4126          */
4127         ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == pkt_len);
4128         ASSERT(msgdsize(mp) == pkt_len);
4129 
4130         /*
4131          * Determine the length of the unfragmentable portion of this
4132          * datagram.  This consists of the IPv6 header, a potential
4133          * hop-by-hop options header, a potential pre-routing-header
4134          * destination options header, and a potential routing header.
4135          */
4136         nexthdr = ip6h->ip6_nxt;
4137         ptr = (uint8_t *)&ip6h[1];
4138 
4139         if (nexthdr == IPPROTO_HOPOPTS) {
4140                 ip6_hbh_t       *hbh_hdr;
4141                 uint_t          hdr_len;
4142 
4143                 hbh_hdr = (ip6_hbh_t *)ptr;
4144                 hdr_len = 8 * (hbh_hdr->ip6h_len + 1);
4145                 nexthdr = hbh_hdr->ip6h_nxt;
4146                 ptr += hdr_len;
4147         }
4148         if (nexthdr == IPPROTO_DSTOPTS) {
4149                 ip6_dest_t      *dest_hdr;
4150                 uint_t          hdr_len;
4151 
4152                 dest_hdr = (ip6_dest_t *)ptr;
4153                 if (dest_hdr->ip6d_nxt == IPPROTO_ROUTING) {
4154                         hdr_len = 8 * (dest_hdr->ip6d_len + 1);
4155                         nexthdr = dest_hdr->ip6d_nxt;
4156                         ptr += hdr_len;
4157                 }
4158         }
4159         if (nexthdr == IPPROTO_ROUTING) {
4160                 ip6_rthdr_t     *rthdr;
4161                 uint_t          hdr_len;
4162 
4163                 rthdr = (ip6_rthdr_t *)ptr;
4164                 nexthdr = rthdr->ip6r_nxt;
4165                 hdr_len = 8 * (rthdr->ip6r_len + 1);
4166                 ptr += hdr_len;
4167         }
4168         if (nexthdr != IPPROTO_FRAGMENT) {
4169                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
4170                 ip_drop_output("FragFails: bad nexthdr", mp, ill);
4171                 freemsg(mp);
4172                 return (EINVAL);
4173         }
4174         unfragmentable_len = (uint_t)(ptr - (uint8_t *)ip6h);
4175         unfragmentable_len += sizeof (ip6_frag_t);
4176 
4177         max_chunk = (max_frag - unfragmentable_len) & ~7;
4178 
4179         /*
4180          * Allocate an mblk with enough room for the link-layer
4181          * header and the unfragmentable part of the datagram, which includes
4182          * the fragment header.  This (or a copy) will be used as the
4183          * first mblk for each fragment we send.
4184          */
4185         hmp = allocb_tmpl(unfragmentable_len + ipst->ips_ip_wroff_extra, mp);
4186         if (hmp == NULL) {
4187                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
4188                 ip_drop_output("FragFails: no hmp", mp, ill);
4189                 freemsg(mp);
4190                 return (ENOBUFS);
4191         }
4192         hmp->b_rptr += ipst->ips_ip_wroff_extra;
4193         hmp->b_wptr = hmp->b_rptr + unfragmentable_len;
4194 
4195         fip6h = (ip6_t *)hmp->b_rptr;
4196         bcopy(ip6h, fip6h, unfragmentable_len);
4197 
4198         /*
4199          * pkt_len is set to the total length of the fragmentable data in this
4200          * datagram.  For each fragment sent, we will decrement pkt_len
4201          * by the amount of fragmentable data sent in that fragment
4202          * until len reaches zero.
4203          */
4204         pkt_len -= unfragmentable_len;
4205 
4206         /*
4207          * Move read ptr past unfragmentable portion, we don't want this part
4208          * of the data in our fragments.
4209          */
4210         mp->b_rptr += unfragmentable_len;
4211         if (mp->b_rptr == mp->b_wptr) {
4212                 mblk_t *mp1 = mp->b_cont;
4213                 freeb(mp);
4214                 mp = mp1;
4215         }
4216 
4217         while (pkt_len != 0) {
4218                 mlen = MIN(pkt_len, max_chunk);
4219                 pkt_len -= mlen;
4220                 if (pkt_len != 0) {
4221                         /* Not last */
4222                         hmp0 = copyb(hmp);
4223                         if (hmp0 == NULL) {
4224                                 BUMP_MIB(ill->ill_ip_mib,
4225                                     ipIfStatsOutFragFails);
4226                                 ip_drop_output("FragFails: copyb failed",
4227                                     mp, ill);
4228                                 freeb(hmp);
4229                                 freemsg(mp);
4230                                 ip1dbg(("ip_fragment_v6: copyb failed\n"));
4231                                 return (ENOBUFS);
4232                         }
4233                         off_flags = IP6F_MORE_FRAG;
4234                 } else {
4235                         /* Last fragment */
4236                         hmp0 = hmp;
4237                         hmp = NULL;
4238                         off_flags = 0;
4239                 }
4240                 fip6h = (ip6_t *)(hmp0->b_rptr);
4241                 fraghdr = (ip6_frag_t *)(hmp0->b_rptr + unfragmentable_len -
4242                     sizeof (ip6_frag_t));
4243 
4244                 fip6h->ip6_plen = htons((uint16_t)(mlen +
4245                     unfragmentable_len - IPV6_HDR_LEN));
4246                 /*
4247                  * Note: Optimization alert.
4248                  * In IPv6 (and IPv4) protocol header, Fragment Offset
4249                  * ("offset") is 13 bits wide and in 8-octet units.
4250                  * In IPv6 protocol header (unlike IPv4) in a 16 bit field,
4251                  * it occupies the most significant 13 bits.
4252                  * (least significant 13 bits in IPv4).
4253                  * We do not do any shifts here. Not shifting is same effect
4254                  * as taking offset value in octet units, dividing by 8 and
4255                  * then shifting 3 bits left to line it up in place in proper
4256                  * place protocol header.
4257                  */
4258                 fraghdr->ip6f_offlg = htons(offset) | off_flags;
4259 
4260                 if (!(dmp = ip_carve_mp(&mp, mlen))) {
4261                         /* mp has already been freed by ip_carve_mp() */
4262                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
4263                         ip_drop_output("FragFails: could not carve mp",
4264                             hmp0, ill);
4265                         if (hmp != NULL)
4266                                 freeb(hmp);
4267                         freeb(hmp0);
4268                         ip1dbg(("ip_carve_mp: failed\n"));
4269                         return (ENOBUFS);
4270                 }
4271                 hmp0->b_cont = dmp;
4272                 /* Get the priority marking, if any */
4273                 hmp0->b_band = priority;
4274 
4275                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates);
4276 
4277                 error = postfragfn(hmp0, nce, ixaflags,
4278                     mlen + unfragmentable_len, xmit_hint, szone, nolzid,
4279                     ixa_cookie);
4280                 if (error != 0 && error != EWOULDBLOCK && hmp != NULL) {
4281                         /* No point in sending the other fragments */
4282                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
4283                         ip_drop_output("FragFails: postfragfn failed",
4284                             hmp, ill);
4285                         freeb(hmp);
4286                         freemsg(mp);
4287                         return (error);
4288                 }
4289                 /* No need to redo state machine in loop */
4290                 ixaflags &= ~IXAF_REACH_CONF;
4291 
4292                 offset += mlen;
4293         }
4294         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragOKs);
4295         return (error);
4296 }
4297 
4298 /*
4299  * Add a fragment header to an IPv6 packet.
4300  * Assumes that all the extension headers are contained in the first mblk.
4301  *
4302  * The fragment header is inserted after an hop-by-hop options header
4303  * and after [an optional destinations header followed by] a routing header.
4304  */
4305 mblk_t *
4306 ip_fraghdr_add_v6(mblk_t *mp, uint32_t ident, ip_xmit_attr_t *ixa)
4307 {
4308         ip6_t           *ip6h = (ip6_t *)mp->b_rptr;
4309         ip6_t           *fip6h;
4310         mblk_t          *hmp;
4311         ip6_frag_t      *fraghdr;
4312         size_t          unfragmentable_len;
4313         uint8_t         nexthdr;
4314         uint_t          prev_nexthdr_offset;
4315         uint8_t         *ptr;
4316         uint_t          priority = mp->b_band;
4317         ip_stack_t      *ipst = ixa->ixa_ipst;
4318 
4319         /*
4320          * Determine the length of the unfragmentable portion of this
4321          * datagram.  This consists of the IPv6 header, a potential
4322          * hop-by-hop options header, a potential pre-routing-header
4323          * destination options header, and a potential routing header.
4324          */
4325         nexthdr = ip6h->ip6_nxt;
4326         prev_nexthdr_offset = (uint8_t *)&ip6h->ip6_nxt - (uint8_t *)ip6h;
4327         ptr = (uint8_t *)&ip6h[1];
4328 
4329         if (nexthdr == IPPROTO_HOPOPTS) {
4330                 ip6_hbh_t       *hbh_hdr;
4331                 uint_t          hdr_len;
4332 
4333                 hbh_hdr = (ip6_hbh_t *)ptr;
4334                 hdr_len = 8 * (hbh_hdr->ip6h_len + 1);
4335                 nexthdr = hbh_hdr->ip6h_nxt;
4336                 prev_nexthdr_offset = (uint8_t *)&hbh_hdr->ip6h_nxt
4337                     - (uint8_t *)ip6h;
4338                 ptr += hdr_len;
4339         }
4340         if (nexthdr == IPPROTO_DSTOPTS) {
4341                 ip6_dest_t      *dest_hdr;
4342                 uint_t          hdr_len;
4343 
4344                 dest_hdr = (ip6_dest_t *)ptr;
4345                 if (dest_hdr->ip6d_nxt == IPPROTO_ROUTING) {
4346                         hdr_len = 8 * (dest_hdr->ip6d_len + 1);
4347                         nexthdr = dest_hdr->ip6d_nxt;
4348                         prev_nexthdr_offset = (uint8_t *)&dest_hdr->ip6d_nxt
4349                             - (uint8_t *)ip6h;
4350                         ptr += hdr_len;
4351                 }
4352         }
4353         if (nexthdr == IPPROTO_ROUTING) {
4354                 ip6_rthdr_t     *rthdr;
4355                 uint_t          hdr_len;
4356 
4357                 rthdr = (ip6_rthdr_t *)ptr;
4358                 nexthdr = rthdr->ip6r_nxt;
4359                 prev_nexthdr_offset = (uint8_t *)&rthdr->ip6r_nxt
4360                     - (uint8_t *)ip6h;
4361                 hdr_len = 8 * (rthdr->ip6r_len + 1);
4362                 ptr += hdr_len;
4363         }
4364         unfragmentable_len = (uint_t)(ptr - (uint8_t *)ip6h);
4365 
4366         /*
4367          * Allocate an mblk with enough room for the link-layer
4368          * header, the unfragmentable part of the datagram, and the
4369          * fragment header.
4370          */
4371         hmp = allocb_tmpl(unfragmentable_len + sizeof (ip6_frag_t) +
4372             ipst->ips_ip_wroff_extra, mp);
4373         if (hmp == NULL) {
4374                 ill_t *ill = ixa->ixa_nce->nce_ill;
4375 
4376                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
4377                 ip_drop_output("ipIfStatsOutDiscards: allocb failure", mp, ill);
4378                 freemsg(mp);
4379                 return (NULL);
4380         }
4381         hmp->b_rptr += ipst->ips_ip_wroff_extra;
4382         hmp->b_wptr = hmp->b_rptr + unfragmentable_len + sizeof (ip6_frag_t);
4383 
4384         fip6h = (ip6_t *)hmp->b_rptr;
4385         fraghdr = (ip6_frag_t *)(hmp->b_rptr + unfragmentable_len);
4386 
4387         bcopy(ip6h, fip6h, unfragmentable_len);
4388         fip6h->ip6_plen = htons(ntohs(fip6h->ip6_plen) + sizeof (ip6_frag_t));
4389         hmp->b_rptr[prev_nexthdr_offset] = IPPROTO_FRAGMENT;
4390 
4391         fraghdr->ip6f_nxt = nexthdr;
4392         fraghdr->ip6f_reserved = 0;
4393         fraghdr->ip6f_offlg = 0;
4394         fraghdr->ip6f_ident = htonl(ident);
4395 
4396         /* Get the priority marking, if any */
4397         hmp->b_band = priority;
4398 
4399         /*
4400          * Move read ptr past unfragmentable portion, we don't want this part
4401          * of the data in our fragments.
4402          */
4403         mp->b_rptr += unfragmentable_len;
4404         hmp->b_cont = mp;
4405         return (hmp);
4406 }
4407 
4408 /*
4409  * Determine if the ill and multicast aspects of that packets
4410  * "matches" the conn.
4411  */
4412 boolean_t
4413 conn_wantpacket_v6(conn_t *connp, ip_recv_attr_t *ira, ip6_t *ip6h)
4414 {
4415         ill_t           *ill = ira->ira_rill;
4416         zoneid_t        zoneid = ira->ira_zoneid;
4417         uint_t          in_ifindex;
4418         in6_addr_t      *v6dst_ptr = &ip6h->ip6_dst;
4419         in6_addr_t      *v6src_ptr = &ip6h->ip6_src;
4420 
4421         /*
4422          * conn_incoming_ifindex is set by IPV6_BOUND_IF and as link-local
4423          * scopeid. This is used to limit
4424          * unicast and multicast reception to conn_incoming_ifindex.
4425          * conn_wantpacket_v6 is called both for unicast and
4426          * multicast packets.
4427          */
4428         in_ifindex = connp->conn_incoming_ifindex;
4429 
4430         /* mpathd can bind to the under IPMP interface, which we allow */
4431         if (in_ifindex != 0 && in_ifindex != ill->ill_phyint->phyint_ifindex) {
4432                 if (!IS_UNDER_IPMP(ill))
4433                         return (B_FALSE);
4434 
4435                 if (in_ifindex != ipmp_ill_get_ipmp_ifindex(ill))
4436                         return (B_FALSE);
4437         }
4438 
4439         if (!IPCL_ZONE_MATCH(connp, zoneid))
4440                 return (B_FALSE);
4441 
4442         if (!(ira->ira_flags & IRAF_MULTICAST))
4443                 return (B_TRUE);
4444 
4445         if (connp->conn_multi_router)
4446                 return (B_TRUE);
4447 
4448         if (ira->ira_protocol == IPPROTO_RSVP)
4449                 return (B_TRUE);
4450 
4451         return (conn_hasmembers_ill_withsrc_v6(connp, v6dst_ptr, v6src_ptr,
4452             ira->ira_ill));
4453 }
4454 
4455 /*
4456  * pr_addr_dbg function provides the needed buffer space to call
4457  * inet_ntop() function's 3rd argument. This function should be
4458  * used by any kernel routine which wants to save INET6_ADDRSTRLEN
4459  * stack buffer space in it's own stack frame. This function uses
4460  * a buffer from it's own stack and prints the information.
4461  * Example: pr_addr_dbg("func: no route for %s\n ", AF_INET, addr)
4462  *
4463  * Note:    This function can call inet_ntop() once.
4464  */
4465 void
4466 pr_addr_dbg(char *fmt1, int af, const void *addr)
4467 {
4468         char    buf[INET6_ADDRSTRLEN];
4469 
4470         if (fmt1 == NULL) {
4471                 ip0dbg(("pr_addr_dbg: Wrong arguments\n"));
4472                 return;
4473         }
4474 
4475         /*
4476          * This does not compare debug level and just prints
4477          * out. Thus it is the responsibility of the caller
4478          * to check the appropriate debug-level before calling
4479          * this function.
4480          */
4481         if (ip_debug > 0) {
4482                 printf(fmt1, inet_ntop(af, addr, buf, sizeof (buf)));
4483         }
4484 
4485 
4486 }
4487 
4488 
4489 /*
4490  * Return the length in bytes of the IPv6 headers (base header
4491  * extension headers) that will be needed based on the
4492  * ip_pkt_t structure passed by the caller.
4493  *
4494  * The returned length does not include the length of the upper level
4495  * protocol (ULP) header.
4496  */
4497 int
4498 ip_total_hdrs_len_v6(const ip_pkt_t *ipp)
4499 {
4500         int len;
4501 
4502         len = IPV6_HDR_LEN;
4503 
4504         /*
4505          * If there's a security label here, then we ignore any hop-by-hop
4506          * options the user may try to set.
4507          */
4508         if (ipp->ipp_fields & IPPF_LABEL_V6) {
4509                 uint_t hopoptslen;
4510                 /*
4511                  * Note that ipp_label_len_v6 is just the option - not
4512                  * the hopopts extension header. It also needs to be padded
4513                  * to a multiple of 8 bytes.
4514                  */
4515                 ASSERT(ipp->ipp_label_len_v6 != 0);
4516                 hopoptslen = ipp->ipp_label_len_v6 + sizeof (ip6_hbh_t);
4517                 hopoptslen = (hopoptslen + 7)/8 * 8;
4518                 len += hopoptslen;
4519         } else if (ipp->ipp_fields & IPPF_HOPOPTS) {
4520                 ASSERT(ipp->ipp_hopoptslen != 0);
4521                 len += ipp->ipp_hopoptslen;
4522         }
4523 
4524         /*
4525          * En-route destination options
4526          * Only do them if there's a routing header as well
4527          */
4528         if ((ipp->ipp_fields & (IPPF_RTHDRDSTOPTS|IPPF_RTHDR)) ==
4529             (IPPF_RTHDRDSTOPTS|IPPF_RTHDR)) {
4530                 ASSERT(ipp->ipp_rthdrdstoptslen != 0);
4531                 len += ipp->ipp_rthdrdstoptslen;
4532         }
4533         if (ipp->ipp_fields & IPPF_RTHDR) {
4534                 ASSERT(ipp->ipp_rthdrlen != 0);
4535                 len += ipp->ipp_rthdrlen;
4536         }
4537         if (ipp->ipp_fields & IPPF_DSTOPTS) {
4538                 ASSERT(ipp->ipp_dstoptslen != 0);
4539                 len += ipp->ipp_dstoptslen;
4540         }
4541         return (len);
4542 }
4543 
4544 /*
4545  * All-purpose routine to build a header chain of an IPv6 header
4546  * followed by any required extension headers and a proto header.
4547  *
4548  * The caller has to set the source and destination address as well as
4549  * ip6_plen. The caller has to massage any routing header and compensate
4550  * for the ULP pseudo-header checksum due to the source route.
4551  *
4552  * The extension headers will all be fully filled in.
4553  */
4554 void
4555 ip_build_hdrs_v6(uchar_t *buf, uint_t buf_len, const ip_pkt_t *ipp,
4556     uint8_t protocol, uint32_t flowinfo)
4557 {
4558         uint8_t *nxthdr_ptr;
4559         uint8_t *cp;
4560         ip6_t   *ip6h = (ip6_t *)buf;
4561 
4562         /* Initialize IPv6 header */
4563         ip6h->ip6_vcf =
4564             (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
4565             (flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
4566 
4567         if (ipp->ipp_fields & IPPF_TCLASS) {
4568                 /* Overrides the class part of flowinfo */
4569                 ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf,
4570                     ipp->ipp_tclass);
4571         }
4572 
4573         if (ipp->ipp_fields & IPPF_HOPLIMIT)
4574                 ip6h->ip6_hops = ipp->ipp_hoplimit;
4575         else
4576                 ip6h->ip6_hops = ipp->ipp_unicast_hops;
4577 
4578         if ((ipp->ipp_fields & IPPF_ADDR) &&
4579             !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
4580                 ip6h->ip6_src = ipp->ipp_addr;
4581 
4582         nxthdr_ptr = (uint8_t *)&ip6h->ip6_nxt;
4583         cp = (uint8_t *)&ip6h[1];
4584         /*
4585          * Here's where we have to start stringing together
4586          * any extension headers in the right order:
4587          * Hop-by-hop, destination, routing, and final destination opts.
4588          */
4589         /*
4590          * If there's a security label here, then we ignore any hop-by-hop
4591          * options the user may try to set.
4592          */
4593         if (ipp->ipp_fields & IPPF_LABEL_V6) {
4594                 /*
4595                  * Hop-by-hop options with the label.
4596                  * Note that ipp_label_v6 is just the option - not
4597                  * the hopopts extension header. It also needs to be padded
4598                  * to a multiple of 8 bytes.
4599                  */
4600                 ip6_hbh_t *hbh = (ip6_hbh_t *)cp;
4601                 uint_t hopoptslen;
4602                 uint_t padlen;
4603 
4604                 padlen = ipp->ipp_label_len_v6 + sizeof (ip6_hbh_t);
4605                 hopoptslen = (padlen + 7)/8 * 8;
4606                 padlen = hopoptslen - padlen;
4607 
4608                 *nxthdr_ptr = IPPROTO_HOPOPTS;
4609                 nxthdr_ptr = &hbh->ip6h_nxt;
4610                 hbh->ip6h_len = hopoptslen/8 - 1;
4611                 cp += sizeof (ip6_hbh_t);
4612                 bcopy(ipp->ipp_label_v6, cp, ipp->ipp_label_len_v6);
4613                 cp += ipp->ipp_label_len_v6;
4614 
4615                 ASSERT(padlen <= 7);
4616                 switch (padlen) {
4617                 case 0:
4618                         break;
4619                 case 1:
4620                         cp[0] = IP6OPT_PAD1;
4621                         break;
4622                 default:
4623                         cp[0] = IP6OPT_PADN;
4624                         cp[1] = padlen - 2;
4625                         bzero(&cp[2], padlen - 2);
4626                         break;
4627                 }
4628                 cp += padlen;
4629         } else if (ipp->ipp_fields & IPPF_HOPOPTS) {
4630                 /* Hop-by-hop options */
4631                 ip6_hbh_t *hbh = (ip6_hbh_t *)cp;
4632 
4633                 *nxthdr_ptr = IPPROTO_HOPOPTS;
4634                 nxthdr_ptr = &hbh->ip6h_nxt;
4635 
4636                 bcopy(ipp->ipp_hopopts, cp, ipp->ipp_hopoptslen);
4637                 cp += ipp->ipp_hopoptslen;
4638         }
4639         /*
4640          * En-route destination options
4641          * Only do them if there's a routing header as well
4642          */
4643         if ((ipp->ipp_fields & (IPPF_RTHDRDSTOPTS|IPPF_RTHDR)) ==
4644             (IPPF_RTHDRDSTOPTS|IPPF_RTHDR)) {
4645                 ip6_dest_t *dst = (ip6_dest_t *)cp;
4646 
4647                 *nxthdr_ptr = IPPROTO_DSTOPTS;
4648                 nxthdr_ptr = &dst->ip6d_nxt;
4649 
4650                 bcopy(ipp->ipp_rthdrdstopts, cp, ipp->ipp_rthdrdstoptslen);
4651                 cp += ipp->ipp_rthdrdstoptslen;
4652         }
4653         /*
4654          * Routing header next
4655          */
4656         if (ipp->ipp_fields & IPPF_RTHDR) {
4657                 ip6_rthdr_t *rt = (ip6_rthdr_t *)cp;
4658 
4659                 *nxthdr_ptr = IPPROTO_ROUTING;
4660                 nxthdr_ptr = &rt->ip6r_nxt;
4661 
4662                 bcopy(ipp->ipp_rthdr, cp, ipp->ipp_rthdrlen);
4663                 cp += ipp->ipp_rthdrlen;
4664         }
4665         /*
4666          * Do ultimate destination options
4667          */
4668         if (ipp->ipp_fields & IPPF_DSTOPTS) {
4669                 ip6_dest_t *dest = (ip6_dest_t *)cp;
4670 
4671                 *nxthdr_ptr = IPPROTO_DSTOPTS;
4672                 nxthdr_ptr = &dest->ip6d_nxt;
4673 
4674                 bcopy(ipp->ipp_dstopts, cp, ipp->ipp_dstoptslen);
4675                 cp += ipp->ipp_dstoptslen;
4676         }
4677         /*
4678          * Now set the last header pointer to the proto passed in
4679          */
4680         *nxthdr_ptr = protocol;
4681         ASSERT((int)(cp - buf) == buf_len);
4682 }
4683 
4684 /*
4685  * Return a pointer to the routing header extension header
4686  * in the IPv6 header(s) chain passed in.
4687  * If none found, return NULL
4688  * Assumes that all extension headers are in same mblk as the v6 header
4689  */
4690 ip6_rthdr_t *
4691 ip_find_rthdr_v6(ip6_t *ip6h, uint8_t *endptr)
4692 {
4693         ip6_dest_t      *desthdr;
4694         ip6_frag_t      *fraghdr;
4695         uint_t          hdrlen;
4696         uint8_t         nexthdr;
4697         uint8_t         *ptr = (uint8_t *)&ip6h[1];
4698 
4699         if (ip6h->ip6_nxt == IPPROTO_ROUTING)
4700                 return ((ip6_rthdr_t *)ptr);
4701 
4702         /*
4703          * The routing header will precede all extension headers
4704          * other than the hop-by-hop and destination options
4705          * extension headers, so if we see anything other than those,
4706          * we're done and didn't find it.
4707          * We could see a destination options header alone but no
4708          * routing header, in which case we'll return NULL as soon as
4709          * we see anything after that.
4710          * Hop-by-hop and destination option headers are identical,
4711          * so we can use either one we want as a template.
4712          */
4713         nexthdr = ip6h->ip6_nxt;
4714         while (ptr < endptr) {
4715                 /* Is there enough left for len + nexthdr? */
4716                 if (ptr + MIN_EHDR_LEN > endptr)
4717                         return (NULL);
4718 
4719                 switch (nexthdr) {
4720                 case IPPROTO_HOPOPTS:
4721                 case IPPROTO_DSTOPTS:
4722                         /* Assumes the headers are identical for hbh and dst */
4723                         desthdr = (ip6_dest_t *)ptr;
4724                         hdrlen = 8 * (desthdr->ip6d_len + 1);
4725                         nexthdr = desthdr->ip6d_nxt;
4726                         break;
4727 
4728                 case IPPROTO_ROUTING:
4729                         return ((ip6_rthdr_t *)ptr);
4730 
4731                 case IPPROTO_FRAGMENT:
4732                         fraghdr = (ip6_frag_t *)ptr;
4733                         hdrlen = sizeof (ip6_frag_t);
4734                         nexthdr = fraghdr->ip6f_nxt;
4735                         break;
4736 
4737                 default:
4738                         return (NULL);
4739                 }
4740                 ptr += hdrlen;
4741         }
4742         return (NULL);
4743 }
4744 
4745 /*
4746  * Called for source-routed packets originating on this node.
4747  * Manipulates the original routing header by moving every entry up
4748  * one slot, placing the first entry in the v6 header's v6_dst field,
4749  * and placing the ultimate destination in the routing header's last
4750  * slot.
4751  *
4752  * Returns the checksum diference between the ultimate destination
4753  * (last hop in the routing header when the packet is sent) and
4754  * the first hop (ip6_dst when the packet is sent)
4755  */
4756 /* ARGSUSED2 */
4757 uint32_t
4758 ip_massage_options_v6(ip6_t *ip6h, ip6_rthdr_t *rth, netstack_t *ns)
4759 {
4760         uint_t          numaddr;
4761         uint_t          i;
4762         in6_addr_t      *addrptr;
4763         in6_addr_t      tmp;
4764         ip6_rthdr0_t    *rthdr = (ip6_rthdr0_t *)rth;
4765         uint32_t        cksm;
4766         uint32_t        addrsum = 0;
4767         uint16_t        *ptr;
4768 
4769         /*
4770          * Perform any processing needed for source routing.
4771          * We know that all extension headers will be in the same mblk
4772          * as the IPv6 header.
4773          */
4774 
4775         /*
4776          * If no segments left in header, or the header length field is zero,
4777          * don't move hop addresses around;
4778          * Checksum difference is zero.
4779          */
4780         if ((rthdr->ip6r0_segleft == 0) || (rthdr->ip6r0_len == 0))
4781                 return (0);
4782 
4783         ptr = (uint16_t *)&ip6h->ip6_dst;
4784         cksm = 0;
4785         for (i = 0; i < (sizeof (in6_addr_t) / sizeof (uint16_t)); i++) {
4786                 cksm += ptr[i];
4787         }
4788         cksm = (cksm & 0xFFFF) + (cksm >> 16);
4789 
4790         /*
4791          * Here's where the fun begins - we have to
4792          * move all addresses up one spot, take the
4793          * first hop and make it our first ip6_dst,
4794          * and place the ultimate destination in the
4795          * newly-opened last slot.
4796          */
4797         addrptr = (in6_addr_t *)((char *)rthdr + sizeof (*rthdr));
4798         numaddr = rthdr->ip6r0_len / 2;
4799         tmp = *addrptr;
4800         for (i = 0; i < (numaddr - 1); addrptr++, i++) {
4801                 *addrptr = addrptr[1];
4802         }
4803         *addrptr = ip6h->ip6_dst;
4804         ip6h->ip6_dst = tmp;
4805 
4806         /*
4807          * From the checksummed ultimate destination subtract the checksummed
4808          * current ip6_dst (the first hop address). Return that number.
4809          * (In the v4 case, the second part of this is done in each routine
4810          *  that calls ip_massage_options(). We do it all in this one place
4811          *  for v6).
4812          */
4813         ptr = (uint16_t *)&ip6h->ip6_dst;
4814         for (i = 0; i < (sizeof (in6_addr_t) / sizeof (uint16_t)); i++) {
4815                 addrsum += ptr[i];
4816         }
4817         cksm -= ((addrsum >> 16) + (addrsum & 0xFFFF));
4818         if ((int)cksm < 0)
4819                 cksm--;
4820         cksm = (cksm & 0xFFFF) + (cksm >> 16);
4821 
4822         return (cksm);
4823 }
4824 
4825 void
4826 *ip6_kstat_init(netstackid_t stackid, ip6_stat_t *ip6_statisticsp)
4827 {
4828         kstat_t *ksp;
4829 
4830         ip6_stat_t template = {
4831                 { "ip6_udp_fannorm",    KSTAT_DATA_UINT64 },
4832                 { "ip6_udp_fanmb",      KSTAT_DATA_UINT64 },
4833                 { "ip6_recv_pullup",            KSTAT_DATA_UINT64 },
4834                 { "ip6_db_ref",                 KSTAT_DATA_UINT64 },
4835                 { "ip6_notaligned",             KSTAT_DATA_UINT64 },
4836                 { "ip6_multimblk",              KSTAT_DATA_UINT64 },
4837                 { "ipsec_proto_ahesp",          KSTAT_DATA_UINT64 },
4838                 { "ip6_out_sw_cksum",                   KSTAT_DATA_UINT64 },
4839                 { "ip6_out_sw_cksum_bytes",             KSTAT_DATA_UINT64 },
4840                 { "ip6_in_sw_cksum",                    KSTAT_DATA_UINT64 },
4841                 { "ip6_tcp_in_full_hw_cksum_err",       KSTAT_DATA_UINT64 },
4842                 { "ip6_tcp_in_part_hw_cksum_err",       KSTAT_DATA_UINT64 },
4843                 { "ip6_tcp_in_sw_cksum_err",            KSTAT_DATA_UINT64 },
4844                 { "ip6_udp_in_full_hw_cksum_err",       KSTAT_DATA_UINT64 },
4845                 { "ip6_udp_in_part_hw_cksum_err",       KSTAT_DATA_UINT64 },
4846                 { "ip6_udp_in_sw_cksum_err",            KSTAT_DATA_UINT64 },
4847         };
4848         ksp = kstat_create_netstack("ip", 0, "ip6stat", "net",
4849             KSTAT_TYPE_NAMED, sizeof (template) / sizeof (kstat_named_t),
4850             KSTAT_FLAG_VIRTUAL, stackid);
4851 
4852         if (ksp == NULL)
4853                 return (NULL);
4854 
4855         bcopy(&template, ip6_statisticsp, sizeof (template));
4856         ksp->ks_data = (void *)ip6_statisticsp;
4857         ksp->ks_private = (void *)(uintptr_t)stackid;
4858 
4859         kstat_install(ksp);
4860         return (ksp);
4861 }
4862 
4863 void
4864 ip6_kstat_fini(netstackid_t stackid, kstat_t *ksp)
4865 {
4866         if (ksp != NULL) {
4867                 ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
4868                 kstat_delete_netstack(ksp, stackid);
4869         }
4870 }
4871 
4872 /*
4873  * The following two functions set and get the value for the
4874  * IPV6_SRC_PREFERENCES socket option.
4875  */
4876 int
4877 ip6_set_src_preferences(ip_xmit_attr_t *ixa, uint32_t prefs)
4878 {
4879         /*
4880          * We only support preferences that are covered by
4881          * IPV6_PREFER_SRC_MASK.
4882          */
4883         if (prefs & ~IPV6_PREFER_SRC_MASK)
4884                 return (EINVAL);
4885 
4886         /*
4887          * Look for conflicting preferences or default preferences.  If
4888          * both bits of a related pair are clear, the application wants the
4889          * system's default value for that pair.  Both bits in a pair can't
4890          * be set.
4891          */
4892         if ((prefs & IPV6_PREFER_SRC_MIPMASK) == 0) {
4893                 prefs |= IPV6_PREFER_SRC_MIPDEFAULT;
4894         } else if ((prefs & IPV6_PREFER_SRC_MIPMASK) ==
4895             IPV6_PREFER_SRC_MIPMASK) {
4896                 return (EINVAL);
4897         }
4898         if ((prefs & IPV6_PREFER_SRC_TMPMASK) == 0) {
4899                 prefs |= IPV6_PREFER_SRC_TMPDEFAULT;
4900         } else if ((prefs & IPV6_PREFER_SRC_TMPMASK) ==
4901             IPV6_PREFER_SRC_TMPMASK) {
4902                 return (EINVAL);
4903         }
4904         if ((prefs & IPV6_PREFER_SRC_CGAMASK) == 0) {
4905                 prefs |= IPV6_PREFER_SRC_CGADEFAULT;
4906         } else if ((prefs & IPV6_PREFER_SRC_CGAMASK) ==
4907             IPV6_PREFER_SRC_CGAMASK) {
4908                 return (EINVAL);
4909         }
4910 
4911         ixa->ixa_src_preferences = prefs;
4912         return (0);
4913 }
4914 
4915 size_t
4916 ip6_get_src_preferences(ip_xmit_attr_t *ixa, uint32_t *val)
4917 {
4918         *val = ixa->ixa_src_preferences;
4919         return (sizeof (ixa->ixa_src_preferences));
4920 }
4921 
4922 /*
4923  * Get the size of the IP options (including the IP headers size)
4924  * without including the AH header's size. If till_ah is B_FALSE,
4925  * and if AH header is present, dest options beyond AH header will
4926  * also be included in the returned size.
4927  */
4928 int
4929 ipsec_ah_get_hdr_size_v6(mblk_t *mp, boolean_t till_ah)
4930 {
4931         ip6_t *ip6h;
4932         uint8_t nexthdr;
4933         uint8_t *whereptr;
4934         ip6_hbh_t *hbhhdr;
4935         ip6_dest_t *dsthdr;
4936         ip6_rthdr_t *rthdr;
4937         int ehdrlen;
4938         int size;
4939         ah_t *ah;
4940 
4941         ip6h = (ip6_t *)mp->b_rptr;
4942         size = IPV6_HDR_LEN;
4943         nexthdr = ip6h->ip6_nxt;
4944         whereptr = (uint8_t *)&ip6h[1];
4945         for (;;) {
4946                 /* Assume IP has already stripped it */
4947                 ASSERT(nexthdr != IPPROTO_FRAGMENT);
4948                 switch (nexthdr) {
4949                 case IPPROTO_HOPOPTS:
4950                         hbhhdr = (ip6_hbh_t *)whereptr;
4951                         nexthdr = hbhhdr->ip6h_nxt;
4952                         ehdrlen = 8 * (hbhhdr->ip6h_len + 1);
4953                         break;
4954                 case IPPROTO_DSTOPTS:
4955                         dsthdr = (ip6_dest_t *)whereptr;
4956                         nexthdr = dsthdr->ip6d_nxt;
4957                         ehdrlen = 8 * (dsthdr->ip6d_len + 1);
4958                         break;
4959                 case IPPROTO_ROUTING:
4960                         rthdr = (ip6_rthdr_t *)whereptr;
4961                         nexthdr = rthdr->ip6r_nxt;
4962                         ehdrlen = 8 * (rthdr->ip6r_len + 1);
4963                         break;
4964                 default :
4965                         if (till_ah) {
4966                                 ASSERT(nexthdr == IPPROTO_AH);
4967                                 return (size);
4968                         }
4969                         /*
4970                          * If we don't have a AH header to traverse,
4971                          * return now. This happens normally for
4972                          * outbound datagrams where we have not inserted
4973                          * the AH header.
4974                          */
4975                         if (nexthdr != IPPROTO_AH) {
4976                                 return (size);
4977                         }
4978 
4979                         /*
4980                          * We don't include the AH header's size
4981                          * to be symmetrical with other cases where
4982                          * we either don't have a AH header (outbound)
4983                          * or peek into the AH header yet (inbound and
4984                          * not pulled up yet).
4985                          */
4986                         ah = (ah_t *)whereptr;
4987                         nexthdr = ah->ah_nexthdr;
4988                         ehdrlen = (ah->ah_length << 2) + 8;
4989 
4990                         if (nexthdr == IPPROTO_DSTOPTS) {
4991                                 if (whereptr + ehdrlen >= mp->b_wptr) {
4992                                         /*
4993                                          * The destination options header
4994                                          * is not part of the first mblk.
4995                                          */
4996                                         whereptr = mp->b_cont->b_rptr;
4997                                 } else {
4998                                         whereptr += ehdrlen;
4999                                 }
5000 
5001                                 dsthdr = (ip6_dest_t *)whereptr;
5002                                 ehdrlen = 8 * (dsthdr->ip6d_len + 1);
5003                                 size += ehdrlen;
5004                         }
5005                         return (size);
5006                 }
5007                 whereptr += ehdrlen;
5008                 size += ehdrlen;
5009         }
5010 }
5011 
5012 /*
5013  * Utility routine that checks if `v6srcp' is a valid address on underlying
5014  * interface `ill'.  If `ipifp' is non-NULL, it's set to a held ipif
5015  * associated with `v6srcp' on success.  NOTE: if this is not called from
5016  * inside the IPSQ (ill_g_lock is not held), `ill' may be removed from the
5017  * group during or after this lookup.
5018  */
5019 boolean_t
5020 ipif_lookup_testaddr_v6(ill_t *ill, const in6_addr_t *v6srcp, ipif_t **ipifp)
5021 {
5022         ipif_t *ipif;
5023 
5024 
5025         ipif = ipif_lookup_addr_exact_v6(v6srcp, ill, ill->ill_ipst);
5026         if (ipif != NULL) {
5027                 if (ipifp != NULL)
5028                         *ipifp = ipif;
5029                 else
5030                         ipif_refrele(ipif);
5031                 return (B_TRUE);
5032         }
5033 
5034         if (ip_debug > 2) {
5035                 pr_addr_dbg("ipif_lookup_testaddr_v6: cannot find ipif for "
5036                     "src %s\n", AF_INET6, v6srcp);
5037         }
5038         return (B_FALSE);
5039 }