1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25 /* Copyright (c) 1990 Mentat Inc. */
  26 
  27 #include <sys/types.h>
  28 #include <sys/stream.h>
  29 #include <sys/dlpi.h>
  30 #include <sys/stropts.h>
  31 #include <sys/sysmacros.h>
  32 #include <sys/strsubr.h>
  33 #include <sys/strlog.h>
  34 #include <sys/strsun.h>
  35 #include <sys/zone.h>
  36 #define _SUN_TPI_VERSION 2
  37 #include <sys/tihdr.h>
  38 #include <sys/xti_inet.h>
  39 #include <sys/ddi.h>
  40 #include <sys/sunddi.h>
  41 #include <sys/cmn_err.h>
  42 #include <sys/debug.h>
  43 #include <sys/kobj.h>
  44 #include <sys/modctl.h>
  45 #include <sys/atomic.h>
  46 #include <sys/policy.h>
  47 #include <sys/priv.h>
  48 
  49 #include <sys/systm.h>
  50 #include <sys/param.h>
  51 #include <sys/kmem.h>
  52 #include <sys/sdt.h>
  53 #include <sys/socket.h>
  54 #include <sys/vtrace.h>
  55 #include <sys/isa_defs.h>
  56 #include <sys/mac.h>
  57 #include <net/if.h>
  58 #include <net/if_arp.h>
  59 #include <net/route.h>
  60 #include <sys/sockio.h>
  61 #include <netinet/in.h>
  62 #include <net/if_dl.h>
  63 
  64 #include <inet/common.h>
  65 #include <inet/mi.h>
  66 #include <inet/mib2.h>
  67 #include <inet/nd.h>
  68 #include <inet/arp.h>
  69 #include <inet/snmpcom.h>
  70 #include <inet/kstatcom.h>
  71 
  72 #include <netinet/igmp_var.h>
  73 #include <netinet/ip6.h>
  74 #include <netinet/icmp6.h>
  75 #include <netinet/sctp.h>
  76 
  77 #include <inet/ip.h>
  78 #include <inet/ip_impl.h>
  79 #include <inet/ip6.h>
  80 #include <inet/ip6_asp.h>
  81 #include <inet/optcom.h>
  82 #include <inet/tcp.h>
  83 #include <inet/tcp_impl.h>
  84 #include <inet/ip_multi.h>
  85 #include <inet/ip_if.h>
  86 #include <inet/ip_ire.h>
  87 #include <inet/ip_ftable.h>
  88 #include <inet/ip_rts.h>
  89 #include <inet/ip_ndp.h>
  90 #include <inet/ip_listutils.h>
  91 #include <netinet/igmp.h>
  92 #include <netinet/ip_mroute.h>
  93 #include <inet/ipp_common.h>
  94 
  95 #include <net/pfkeyv2.h>
  96 #include <inet/sadb.h>
  97 #include <inet/ipsec_impl.h>
  98 #include <inet/ipdrop.h>
  99 #include <inet/ip_netinfo.h>
 100 #include <inet/ilb_ip.h>
 101 #include <sys/squeue_impl.h>
 102 #include <sys/squeue.h>
 103 
 104 #include <sys/ethernet.h>
 105 #include <net/if_types.h>
 106 #include <sys/cpuvar.h>
 107 
 108 #include <ipp/ipp.h>
 109 #include <ipp/ipp_impl.h>
 110 #include <ipp/ipgpc/ipgpc.h>
 111 
 112 #include <sys/pattr.h>
 113 #include <inet/ipclassifier.h>
 114 #include <inet/sctp_ip.h>
 115 #include <inet/sctp/sctp_impl.h>
 116 #include <inet/udp_impl.h>
 117 #include <sys/sunddi.h>
 118 
 119 #include <sys/tsol/label.h>
 120 #include <sys/tsol/tnet.h>
 121 
 122 #include <sys/clock_impl.h>       /* For LBOLT_FASTPATH{,64} */
 123 
 124 #ifdef  DEBUG
 125 extern boolean_t skip_sctp_cksum;
 126 #endif
 127 
 128 static void     ip_input_local_v4(ire_t *, mblk_t *, ipha_t *,
 129     ip_recv_attr_t *);
 130 
 131 static void     ip_input_broadcast_v4(ire_t *, mblk_t *, ipha_t *,
 132     ip_recv_attr_t *);
 133 static void     ip_input_multicast_v4(ire_t *, mblk_t *, ipha_t *,
 134     ip_recv_attr_t *);
 135 
 136 #pragma inline(ip_input_common_v4, ip_input_local_v4, ip_forward_xmit_v4)
 137 
 138 /*
 139  * Direct read side procedure capable of dealing with chains. GLDv3 based
 140  * drivers call this function directly with mblk chains while STREAMS
 141  * read side procedure ip_rput() calls this for single packet with ip_ring
 142  * set to NULL to process one packet at a time.
 143  *
 144  * The ill will always be valid if this function is called directly from
 145  * the driver.
 146  *
 147  * If ip_input() is called from GLDv3:
 148  *
 149  *   - This must be a non-VLAN IP stream.
 150  *   - 'mp' is either an untagged or a special priority-tagged packet.
 151  *   - Any VLAN tag that was in the MAC header has been stripped.
 152  *
 153  * If the IP header in packet is not 32-bit aligned, every message in the
 154  * chain will be aligned before further operations. This is required on SPARC
 155  * platform.
 156  */
 157 void
 158 ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain,
 159     struct mac_header_info_s *mhip)
 160 {
 161         (void) ip_input_common_v4(ill, ip_ring, mp_chain, mhip, NULL, NULL,
 162             NULL);
 163 }
 164 
 165 /*
 166  * ip_accept_tcp() - This function is called by the squeue when it retrieves
 167  * a chain of packets in the poll mode. The packets have gone through the
 168  * data link processing but not IP processing. For performance and latency
 169  * reasons, the squeue wants to process the chain in line instead of feeding
 170  * it back via ip_input path.
 171  *
 172  * We set up the ip_recv_attr_t with IRAF_TARGET_SQP to that ip_fanout_v4
 173  * will pass back any TCP packets matching the target sqp to
 174  * ip_input_common_v4 using ira_target_sqp_mp. Other packets are handled by
 175  * ip_input_v4 and ip_fanout_v4 as normal.
 176  * The TCP packets that match the target squeue are returned to the caller
 177  * as a b_next chain after each packet has been prepend with an mblk
 178  * from ip_recv_attr_to_mblk.
 179  */
 180 mblk_t *
 181 ip_accept_tcp(ill_t *ill, ill_rx_ring_t *ip_ring, squeue_t *target_sqp,
 182     mblk_t *mp_chain, mblk_t **last, uint_t *cnt)
 183 {
 184         return (ip_input_common_v4(ill, ip_ring, mp_chain, NULL, target_sqp,
 185             last, cnt));
 186 }
 187 
 188 /*
 189  * Used by ip_input and ip_accept_tcp
 190  * The last three arguments are only used by ip_accept_tcp, and mhip is
 191  * only used by ip_input.
 192  */
 193 mblk_t *
 194 ip_input_common_v4(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain,
 195     struct mac_header_info_s *mhip, squeue_t *target_sqp,
 196     mblk_t **last, uint_t *cnt)
 197 {
 198         mblk_t          *mp;
 199         ipha_t          *ipha;
 200         ip_recv_attr_t  iras;   /* Receive attributes */
 201         rtc_t           rtc;
 202         iaflags_t       chain_flags = 0;        /* Fixed for chain */
 203         mblk_t          *ahead = NULL;  /* Accepted head */
 204         mblk_t          *atail = NULL;  /* Accepted tail */
 205         uint_t          acnt = 0;       /* Accepted count */
 206 
 207         ASSERT(mp_chain != NULL);
 208         ASSERT(ill != NULL);
 209 
 210         /* These ones do not change as we loop over packets */
 211         iras.ira_ill = iras.ira_rill = ill;
 212         iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
 213         iras.ira_rifindex = iras.ira_ruifindex;
 214         iras.ira_sqp = NULL;
 215         iras.ira_ring = ip_ring;
 216         /* For ECMP and outbound transmit ring selection */
 217         iras.ira_xmit_hint = ILL_RING_TO_XMIT_HINT(ip_ring);
 218 
 219         iras.ira_target_sqp = target_sqp;
 220         iras.ira_target_sqp_mp = NULL;
 221         if (target_sqp != NULL)
 222                 chain_flags |= IRAF_TARGET_SQP;
 223 
 224         /*
 225          * We try to have a mhip pointer when possible, but
 226          * it might be NULL in some cases. In those cases we
 227          * have to assume unicast.
 228          */
 229         iras.ira_mhip = mhip;
 230         iras.ira_flags = 0;
 231         if (mhip != NULL) {
 232                 switch (mhip->mhi_dsttype) {
 233                 case MAC_ADDRTYPE_MULTICAST :
 234                         chain_flags |= IRAF_L2DST_MULTICAST;
 235                         break;
 236                 case MAC_ADDRTYPE_BROADCAST :
 237                         chain_flags |= IRAF_L2DST_BROADCAST;
 238                         break;
 239                 }
 240         }
 241 
 242         /*
 243          * Initialize the one-element route cache.
 244          *
 245          * We do ire caching from one iteration to
 246          * another. In the event the packet chain contains
 247          * all packets from the same dst, this caching saves
 248          * an ire_route_recursive for each of the succeeding
 249          * packets in a packet chain.
 250          */
 251         rtc.rtc_ire = NULL;
 252         rtc.rtc_ipaddr = INADDR_ANY;
 253 
 254         /* Loop over b_next */
 255         for (mp = mp_chain; mp != NULL; mp = mp_chain) {
 256                 mp_chain = mp->b_next;
 257                 mp->b_next = NULL;
 258 
 259                 ASSERT(DB_TYPE(mp) == M_DATA);
 260 
 261 
 262                 /*
 263                  * if db_ref > 1 then copymsg and free original. Packet
 264                  * may be changed and we do not want the other entity
 265                  * who has a reference to this message to trip over the
 266                  * changes. This is a blind change because trying to
 267                  * catch all places that might change the packet is too
 268                  * difficult.
 269                  *
 270                  * This corresponds to the fast path case, where we have
 271                  * a chain of M_DATA mblks.  We check the db_ref count
 272                  * of only the 1st data block in the mblk chain. There
 273                  * doesn't seem to be a reason why a device driver would
 274                  * send up data with varying db_ref counts in the mblk
 275                  * chain. In any case the Fast path is a private
 276                  * interface, and our drivers don't do such a thing.
 277                  * Given the above assumption, there is no need to walk
 278                  * down the entire mblk chain (which could have a
 279                  * potential performance problem)
 280                  *
 281                  * The "(DB_REF(mp) > 1)" check was moved from ip_rput()
 282                  * to here because of exclusive ip stacks and vnics.
 283                  * Packets transmitted from exclusive stack over vnic
 284                  * can have db_ref > 1 and when it gets looped back to
 285                  * another vnic in a different zone, you have ip_input()
 286                  * getting dblks with db_ref > 1. So if someone
 287                  * complains of TCP performance under this scenario,
 288                  * take a serious look here on the impact of copymsg().
 289                  */
 290                 if (DB_REF(mp) > 1) {
 291                         if ((mp = ip_fix_dbref(mp, &iras)) == NULL) {
 292                                 /* mhip might point into 1st packet in chain */
 293                                 iras.ira_mhip = NULL;
 294                                 continue;
 295                         }
 296                 }
 297 
 298                 /*
 299                  * IP header ptr not aligned?
 300                  * OR IP header not complete in first mblk
 301                  */
 302                 ipha = (ipha_t *)mp->b_rptr;
 303                 if (!OK_32PTR(ipha) || MBLKL(mp) < IP_SIMPLE_HDR_LENGTH) {
 304                         mp = ip_check_and_align_header(mp, IP_SIMPLE_HDR_LENGTH,
 305                             &iras);
 306                         if (mp == NULL) {
 307                                 /* mhip might point into 1st packet in chain */
 308                                 iras.ira_mhip = NULL;
 309                                 continue;
 310                         }
 311                         ipha = (ipha_t *)mp->b_rptr;
 312                 }
 313 
 314                 /* Protect against a mix of Ethertypes and IP versions */
 315                 if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
 316                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
 317                         ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
 318                         freemsg(mp);
 319                         /* mhip might point into 1st packet in the chain. */
 320                         iras.ira_mhip = NULL;
 321                         continue;
 322                 }
 323 
 324                 /*
 325                  * Check for Martian addrs; we have to explicitly
 326                  * test for for zero dst since this is also used as
 327                  * an indication that the rtc is not used.
 328                  */
 329                 if (ipha->ipha_dst == INADDR_ANY) {
 330                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
 331                         ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
 332                         freemsg(mp);
 333                         /* mhip might point into 1st packet in the chain. */
 334                         iras.ira_mhip = NULL;
 335                         continue;
 336                 }
 337 
 338                 /*
 339                  * Keep L2SRC from a previous packet in chain since mhip
 340                  * might point into an earlier packet in the chain.
 341                  * Keep IRAF_VERIFIED_SRC to avoid redoing broadcast
 342                  * source check in forwarding path.
 343                  */
 344                 chain_flags |= (iras.ira_flags &
 345                     (IRAF_L2SRC_SET|IRAF_VERIFIED_SRC));
 346 
 347                 iras.ira_flags = IRAF_IS_IPV4 | IRAF_VERIFY_IP_CKSUM |
 348                     IRAF_VERIFY_ULP_CKSUM | chain_flags;
 349                 iras.ira_free_flags = 0;
 350                 iras.ira_cred = NULL;
 351                 iras.ira_cpid = NOPID;
 352                 iras.ira_tsl = NULL;
 353                 iras.ira_zoneid = ALL_ZONES;    /* Default for forwarding */
 354 
 355                 /*
 356                  * We must count all incoming packets, even if they end
 357                  * up being dropped later on. Defer counting bytes until
 358                  * we have the whole IP header in first mblk.
 359                  */
 360                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives);
 361 
 362                 iras.ira_pktlen = ntohs(ipha->ipha_length);
 363                 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets,
 364                     iras.ira_pktlen);
 365 
 366                 /*
 367                  * Call one of:
 368                  *      ill_input_full_v4
 369                  *      ill_input_short_v4
 370                  * The former is used in unusual cases. See ill_set_inputfn().
 371                  */
 372                 (*ill->ill_inputfn)(mp, ipha, &ipha->ipha_dst, &iras, &rtc);
 373 
 374                 /* Any references to clean up? No hold on ira_ill */
 375                 if (iras.ira_flags & (IRAF_IPSEC_SECURE|IRAF_SYSTEM_LABELED))
 376                         ira_cleanup(&iras, B_FALSE);
 377 
 378                 if (iras.ira_target_sqp_mp != NULL) {
 379                         /* Better be called from ip_accept_tcp */
 380                         ASSERT(target_sqp != NULL);
 381 
 382                         /* Found one packet to accept */
 383                         mp = iras.ira_target_sqp_mp;
 384                         iras.ira_target_sqp_mp = NULL;
 385                         ASSERT(ip_recv_attr_is_mblk(mp));
 386 
 387                         if (atail != NULL)
 388                                 atail->b_next = mp;
 389                         else
 390                                 ahead = mp;
 391                         atail = mp;
 392                         acnt++;
 393                         mp = NULL;
 394                 }
 395                 /* mhip might point into 1st packet in the chain. */
 396                 iras.ira_mhip = NULL;
 397         }
 398         /* Any remaining references to the route cache? */
 399         if (rtc.rtc_ire != NULL) {
 400                 ASSERT(rtc.rtc_ipaddr != INADDR_ANY);
 401                 ire_refrele(rtc.rtc_ire);
 402         }
 403 
 404         if (ahead != NULL) {
 405                 /* Better be called from ip_accept_tcp */
 406                 ASSERT(target_sqp != NULL);
 407                 *last = atail;
 408                 *cnt = acnt;
 409                 return (ahead);
 410         }
 411 
 412         return (NULL);
 413 }
 414 
 415 /*
 416  * This input function is used when
 417  *  - is_system_labeled()
 418  *  - CGTP filtering
 419  *  - DHCP unicast before we have an IP address configured
 420  *  - there is an listener for IPPROTO_RSVP
 421  */
 422 void
 423 ill_input_full_v4(mblk_t *mp, void *iph_arg, void *nexthop_arg,
 424     ip_recv_attr_t *ira, rtc_t *rtc)
 425 {
 426         ipha_t          *ipha = (ipha_t *)iph_arg;
 427         ipaddr_t        nexthop = *(ipaddr_t *)nexthop_arg;
 428         ill_t           *ill = ira->ira_ill;
 429         ip_stack_t      *ipst = ill->ill_ipst;
 430         int             cgtp_flt_pkt;
 431 
 432         ASSERT(ira->ira_tsl == NULL);
 433 
 434         /*
 435          * Attach any necessary label information to
 436          * this packet
 437          */
 438         if (is_system_labeled()) {
 439                 ira->ira_flags |= IRAF_SYSTEM_LABELED;
 440 
 441                 /*
 442                  * This updates ira_cred, ira_tsl and ira_free_flags based
 443                  * on the label.
 444                  */
 445                 if (!tsol_get_pkt_label(mp, IPV4_VERSION, ira)) {
 446                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 447                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
 448                         freemsg(mp);
 449                         return;
 450                 }
 451                 /* Note that ira_tsl can be NULL here. */
 452 
 453                 /* tsol_get_pkt_label sometimes does pullupmsg */
 454                 ipha = (ipha_t *)mp->b_rptr;
 455         }
 456 
 457         /*
 458          * Invoke the CGTP (multirouting) filtering module to process
 459          * the incoming packet. Packets identified as duplicates
 460          * must be discarded. Filtering is active only if the
 461          * the ip_cgtp_filter ndd variable is non-zero.
 462          */
 463         cgtp_flt_pkt = CGTP_IP_PKT_NOT_CGTP;
 464         if (ipst->ips_ip_cgtp_filter &&
 465             ipst->ips_ip_cgtp_filter_ops != NULL) {
 466                 netstackid_t stackid;
 467 
 468                 stackid = ipst->ips_netstack->netstack_stackid;
 469                 /*
 470                  * CGTP and IPMP are mutually exclusive so
 471                  * phyint_ifindex is fine here.
 472                  */
 473                 cgtp_flt_pkt =
 474                     ipst->ips_ip_cgtp_filter_ops->cfo_filter(stackid,
 475                     ill->ill_phyint->phyint_ifindex, mp);
 476                 if (cgtp_flt_pkt == CGTP_IP_PKT_DUPLICATE) {
 477                         ip_drop_input("CGTP_IP_PKT_DUPLICATE", mp, ill);
 478                         freemsg(mp);
 479                         return;
 480                 }
 481         }
 482 
 483         /*
 484          * Brutal hack for DHCPv4 unicast: RFC2131 allows a DHCP
 485          * server to unicast DHCP packets to a DHCP client using the
 486          * IP address it is offering to the client.  This can be
 487          * disabled through the "broadcast bit", but not all DHCP
 488          * servers honor that bit.  Therefore, to interoperate with as
 489          * many DHCP servers as possible, the DHCP client allows the
 490          * server to unicast, but we treat those packets as broadcast
 491          * here.  Note that we don't rewrite the packet itself since
 492          * (a) that would mess up the checksums and (b) the DHCP
 493          * client conn is bound to INADDR_ANY so ip_fanout_udp() will
 494          * hand it the packet regardless.
 495          */
 496         if (ill->ill_dhcpinit != 0 &&
 497             ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION &&
 498             ipha->ipha_protocol == IPPROTO_UDP) {
 499                 udpha_t *udpha;
 500 
 501                 ipha = ip_pullup(mp, sizeof (ipha_t) + sizeof (udpha_t), ira);
 502                 if (ipha == NULL) {
 503                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 504                         ip_drop_input("ipIfStatsInDiscards - dhcp", mp, ill);
 505                         freemsg(mp);
 506                         return;
 507                 }
 508                 /* Reload since pullupmsg() can change b_rptr. */
 509                 udpha = (udpha_t *)&ipha[1];
 510 
 511                 if (ntohs(udpha->uha_dst_port) == IPPORT_BOOTPC) {
 512                         DTRACE_PROBE2(ip4__dhcpinit__pkt, ill_t *, ill,
 513                             mblk_t *, mp);
 514                         /*
 515                          * This assumes that we deliver to all conns for
 516                          * multicast and broadcast packets.
 517                          */
 518                         nexthop = INADDR_BROADCAST;
 519                         ira->ira_flags |= IRAF_DHCP_UNICAST;
 520                 }
 521         }
 522 
 523         /*
 524          * If rsvpd is running, let RSVP daemon handle its processing
 525          * and forwarding of RSVP multicast/unicast packets.
 526          * If rsvpd is not running but mrouted is running, RSVP
 527          * multicast packets are forwarded as multicast traffic
 528          * and RSVP unicast packets are forwarded by unicast router.
 529          * If neither rsvpd nor mrouted is running, RSVP multicast
 530          * packets are not forwarded, but the unicast packets are
 531          * forwarded like unicast traffic.
 532          */
 533         if (ipha->ipha_protocol == IPPROTO_RSVP &&
 534             ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head != NULL) {
 535                 /* RSVP packet and rsvpd running. Treat as ours */
 536                 ip2dbg(("ip_input: RSVP for us: 0x%x\n", ntohl(nexthop)));
 537                 /*
 538                  * We use a multicast address to get the packet to
 539                  * ire_recv_multicast_v4. There will not be a membership
 540                  * check since we set IRAF_RSVP
 541                  */
 542                 nexthop = htonl(INADDR_UNSPEC_GROUP);
 543                 ira->ira_flags |= IRAF_RSVP;
 544         }
 545 
 546         ill_input_short_v4(mp, ipha, &nexthop, ira, rtc);
 547 }
 548 
 549 /*
 550  * This is the tail-end of the full receive side packet handling.
 551  * It can be used directly when the configuration is simple.
 552  */
 553 void
 554 ill_input_short_v4(mblk_t *mp, void *iph_arg, void *nexthop_arg,
 555     ip_recv_attr_t *ira, rtc_t *rtc)
 556 {
 557         ire_t           *ire;
 558         uint_t          opt_len;
 559         ill_t           *ill = ira->ira_ill;
 560         ip_stack_t      *ipst = ill->ill_ipst;
 561         uint_t          pkt_len;
 562         ssize_t         len;
 563         ipha_t          *ipha = (ipha_t *)iph_arg;
 564         ipaddr_t        nexthop = *(ipaddr_t *)nexthop_arg;
 565         ilb_stack_t     *ilbs = ipst->ips_netstack->netstack_ilb;
 566         uint_t          irr_flags;
 567 #define rptr    ((uchar_t *)ipha)
 568 
 569         ASSERT(DB_TYPE(mp) == M_DATA);
 570 
 571         /*
 572          * The following test for loopback is faster than
 573          * IP_LOOPBACK_ADDR(), because it avoids any bitwise
 574          * operations.
 575          * Note that these addresses are always in network byte order
 576          */
 577         if (((*(uchar_t *)&ipha->ipha_dst) == IN_LOOPBACKNET) ||
 578             ((*(uchar_t *)&ipha->ipha_src) == IN_LOOPBACKNET)) {
 579                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
 580                 ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
 581                 freemsg(mp);
 582                 return;
 583         }
 584 
 585         len = mp->b_wptr - rptr;
 586         pkt_len = ira->ira_pktlen;
 587 
 588         /* multiple mblk or too short */
 589         len -= pkt_len;
 590         if (len != 0) {
 591                 mp = ip_check_length(mp, rptr, len, pkt_len,
 592                     IP_SIMPLE_HDR_LENGTH, ira);
 593                 if (mp == NULL)
 594                         return;
 595                 ipha = (ipha_t *)mp->b_rptr;
 596         }
 597 
 598         DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *,
 599             ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL,
 600             int, 0);
 601 
 602         /*
 603          * The event for packets being received from a 'physical'
 604          * interface is placed after validation of the source and/or
 605          * destination address as being local so that packets can be
 606          * redirected to loopback addresses using ipnat.
 607          */
 608         DTRACE_PROBE4(ip4__physical__in__start,
 609             ill_t *, ill, ill_t *, NULL,
 610             ipha_t *, ipha, mblk_t *, mp);
 611 
 612         if (HOOKS4_INTERESTED_PHYSICAL_IN(ipst)) {
 613                 int     ll_multicast = 0;
 614                 int     error;
 615                 ipaddr_t orig_dst = ipha->ipha_dst;
 616 
 617                 if (ira->ira_flags & IRAF_L2DST_MULTICAST)
 618                         ll_multicast = HPE_MULTICAST;
 619                 else if (ira->ira_flags & IRAF_L2DST_BROADCAST)
 620                         ll_multicast = HPE_BROADCAST;
 621 
 622                 FW_HOOKS(ipst->ips_ip4_physical_in_event,
 623                     ipst->ips_ipv4firewall_physical_in,
 624                     ill, NULL, ipha, mp, mp, ll_multicast, ipst, error);
 625 
 626                 DTRACE_PROBE1(ip4__physical__in__end, mblk_t *, mp);
 627 
 628                 if (mp == NULL)
 629                         return;
 630                 /* The length could have changed */
 631                 ipha = (ipha_t *)mp->b_rptr;
 632                 ira->ira_pktlen = ntohs(ipha->ipha_length);
 633                 pkt_len = ira->ira_pktlen;
 634 
 635                 /*
 636                  * In case the destination changed we override any previous
 637                  * change to nexthop.
 638                  */
 639                 if (orig_dst != ipha->ipha_dst)
 640                         nexthop = ipha->ipha_dst;
 641                 if (nexthop == INADDR_ANY) {
 642                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
 643                         ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
 644                         freemsg(mp);
 645                         return;
 646                 }
 647         }
 648 
 649         if (ipst->ips_ip4_observe.he_interested) {
 650                 zoneid_t dzone;
 651 
 652                 /*
 653                  * On the inbound path the src zone will be unknown as
 654                  * this packet has come from the wire.
 655                  */
 656                 dzone = ip_get_zoneid_v4(nexthop, mp, ira, ALL_ZONES);
 657                 ipobs_hook(mp, IPOBS_HOOK_INBOUND, ALL_ZONES, dzone, ill, ipst);
 658         }
 659 
 660         /*
 661          * If there is a good HW IP header checksum we clear the need
 662          * look at the IP header checksum.
 663          */
 664         if ((DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) &&
 665             ILL_HCKSUM_CAPABLE(ill) && dohwcksum) {
 666                 /* Header checksum was ok. Clear the flag */
 667                 DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
 668                 ira->ira_flags &= ~IRAF_VERIFY_IP_CKSUM;
 669         }
 670 
 671         /*
 672          * Here we check to see if we machine is setup as
 673          * L3 loadbalancer and if the incoming packet is for a VIP
 674          *
 675          * Check the following:
 676          * - there is at least a rule
 677          * - protocol of the packet is supported
 678          */
 679         if (ilb_has_rules(ilbs) && ILB_SUPP_L4(ipha->ipha_protocol)) {
 680                 ipaddr_t        lb_dst;
 681                 int             lb_ret;
 682 
 683                 /* For convenience, we pull up the mblk. */
 684                 if (mp->b_cont != NULL) {
 685                         if (pullupmsg(mp, -1) == 0) {
 686                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 687                                 ip_drop_input("ipIfStatsInDiscards - pullupmsg",
 688                                     mp, ill);
 689                                 freemsg(mp);
 690                                 return;
 691                         }
 692                         ipha = (ipha_t *)mp->b_rptr;
 693                 }
 694 
 695                 /*
 696                  * We just drop all fragments going to any VIP, at
 697                  * least for now....
 698                  */
 699                 if (ntohs(ipha->ipha_fragment_offset_and_flags) &
 700                     (IPH_MF | IPH_OFFSET)) {
 701                         if (!ilb_rule_match_vip_v4(ilbs, nexthop, NULL)) {
 702                                 goto after_ilb;
 703                         }
 704 
 705                         ILB_KSTAT_UPDATE(ilbs, ip_frag_in, 1);
 706                         ILB_KSTAT_UPDATE(ilbs, ip_frag_dropped, 1);
 707                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 708                         ip_drop_input("ILB fragment", mp, ill);
 709                         freemsg(mp);
 710                         return;
 711                 }
 712                 lb_ret = ilb_check_v4(ilbs, ill, mp, ipha, ipha->ipha_protocol,
 713                     (uint8_t *)ipha + IPH_HDR_LENGTH(ipha), &lb_dst);
 714 
 715                 if (lb_ret == ILB_DROPPED) {
 716                         /* Is this the right counter to increase? */
 717                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 718                         ip_drop_input("ILB_DROPPED", mp, ill);
 719                         freemsg(mp);
 720                         return;
 721                 }
 722                 if (lb_ret == ILB_BALANCED) {
 723                         /* Set the dst to that of the chosen server */
 724                         nexthop = lb_dst;
 725                         DB_CKSUMFLAGS(mp) = 0;
 726                 }
 727         }
 728 
 729 after_ilb:
 730         opt_len = ipha->ipha_version_and_hdr_length - IP_SIMPLE_HDR_VERSION;
 731         ira->ira_ip_hdr_length = IP_SIMPLE_HDR_LENGTH;
 732         if (opt_len != 0) {
 733                 int error = 0;
 734 
 735                 ira->ira_ip_hdr_length += (opt_len << 2);
 736                 ira->ira_flags |= IRAF_IPV4_OPTIONS;
 737 
 738                 /* IP Options present!  Validate the length. */
 739                 mp = ip_check_optlen(mp, ipha, opt_len, pkt_len, ira);
 740                 if (mp == NULL)
 741                         return;
 742 
 743                 /* Might have changed */
 744                 ipha = (ipha_t *)mp->b_rptr;
 745 
 746                 /* Verify IP header checksum before parsing the options */
 747                 if ((ira->ira_flags & IRAF_VERIFY_IP_CKSUM) &&
 748                     ip_csum_hdr(ipha)) {
 749                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
 750                         ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
 751                         freemsg(mp);
 752                         return;
 753                 }
 754                 ira->ira_flags &= ~IRAF_VERIFY_IP_CKSUM;
 755 
 756                 /*
 757                  * Go off to ip_input_options which returns the next hop
 758                  * destination address, which may have been affected
 759                  * by source routing.
 760                  */
 761                 IP_STAT(ipst, ip_opt);
 762 
 763                 nexthop = ip_input_options(ipha, nexthop, mp, ira, &error);
 764                 if (error != 0) {
 765                         /*
 766                          * An ICMP error has been sent and the packet has
 767                          * been dropped.
 768                          */
 769                         return;
 770                 }
 771         }
 772 
 773         if (ill->ill_flags & ILLF_ROUTER)
 774                 irr_flags = IRR_ALLOCATE;
 775         else
 776                 irr_flags = IRR_NONE;
 777 
 778         /* Can not use route cache with TX since the labels can differ */
 779         if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
 780                 if (CLASSD(nexthop)) {
 781                         ire = ire_multicast(ill);
 782                 } else {
 783                         /* Match destination and label */
 784                         ire = ire_route_recursive_v4(nexthop, 0, NULL,
 785                             ALL_ZONES, ira->ira_tsl, MATCH_IRE_SECATTR,
 786                             irr_flags, ira->ira_xmit_hint, ipst, NULL, NULL,
 787                             NULL);
 788                 }
 789                 /* Update the route cache so we do the ire_refrele */
 790                 ASSERT(ire != NULL);
 791                 if (rtc->rtc_ire != NULL)
 792                         ire_refrele(rtc->rtc_ire);
 793                 rtc->rtc_ire = ire;
 794                 rtc->rtc_ipaddr = nexthop;
 795         } else if (nexthop == rtc->rtc_ipaddr) {
 796                 /* Use the route cache */
 797                 ASSERT(rtc->rtc_ire != NULL);
 798                 ire = rtc->rtc_ire;
 799         } else {
 800                 /* Update the route cache */
 801                 if (CLASSD(nexthop)) {
 802                         ire = ire_multicast(ill);
 803                 } else {
 804                         /* Just match the destination */
 805                         ire = ire_route_recursive_dstonly_v4(nexthop, irr_flags,
 806                             ira->ira_xmit_hint, ipst);
 807                 }
 808                 ASSERT(ire != NULL);
 809                 if (rtc->rtc_ire != NULL)
 810                         ire_refrele(rtc->rtc_ire);
 811                 rtc->rtc_ire = ire;
 812                 rtc->rtc_ipaddr = nexthop;
 813         }
 814 
 815         ire->ire_ib_pkt_count++;
 816 
 817         /*
 818          * Based on ire_type and ire_flags call one of:
 819          *      ire_recv_local_v4 - for IRE_LOCAL
 820          *      ire_recv_loopback_v4 - for IRE_LOOPBACK
 821          *      ire_recv_multirt_v4 - if RTF_MULTIRT
 822          *      ire_recv_noroute_v4 - if RTF_REJECT or RTF_BLACKHOLE
 823          *      ire_recv_multicast_v4 - for IRE_MULTICAST
 824          *      ire_recv_broadcast_v4 - for IRE_BROADCAST
 825          *      ire_recv_noaccept_v4 - for ire_noaccept ones
 826          *      ire_recv_forward_v4 - for the rest.
 827          */
 828         (*ire->ire_recvfn)(ire, mp, ipha, ira);
 829 }
 830 #undef rptr
 831 
 832 /*
 833  * ire_recvfn for IREs that need forwarding
      *
      * Arguments:
      *      ire     - the forwarding route selected for the packet
      *      mp      - the packet; it is not handed back to the caller
      *      iph_arg - pointer to the packet's IPv4 header (an ipha_t)
      *      ira     - receive attributes; ira_ill is the arrival interface
      *
      * Outline:
      *  1. Refuse packets that arrived as link-layer multicast/broadcast,
      *     and packets arriving on an ill without ILLF_ROUTER unless they
      *     are source routed.
      *  2. Take a hold on the ire's cached nce (revalidating the ire when
      *     none is cached, and replacing a condemned nce).  The nce's ill
      *     is the outbound interface.  Every return after this point
      *     releases the hold.
      *  3. Refuse to send out an ill without ILLF_ROUTER, except for
      *     source routed packets leaving on the arrival ill.
      *  4. If the ire belongs to a specific non-global zone, look the
      *     route up again as seen from the global zone and dispatch to
      *     that ire's ire_recvfn instead.
      *  5. Run the IPPF and firewall forwarding hooks, the source address
      *     checks selected by ips_src_check, the redirect check and, on
      *     labeled systems, tsol_ip_forward().
      *  6. Pass the packet to ip_forward_xmit_v4() with the smaller of the
      *     outbound ill's MTU and the ire's MTU metric (when non-zero).
 834  */
 835 void
 836 ire_recv_forward_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
 837 {
 838         ipha_t          *ipha = (ipha_t *)iph_arg;
 839         ill_t           *ill = ira->ira_ill;
 840         ip_stack_t      *ipst = ill->ill_ipst;
 841         ill_t           *dst_ill;
 842         nce_t           *nce;
 843         ipaddr_t        src = ipha->ipha_src;
 844         uint32_t        added_tx_len;
 845         uint32_t        mtu, iremtu;
 846
 847         if (ira->ira_flags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) {
 848                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
 849                 ip_drop_input("l2 multicast not forwarded", mp, ill);
 850                 freemsg(mp);
 851                 return;
 852         }
 853
 854         if (!(ill->ill_flags & ILLF_ROUTER) && !ip_source_routed(ipha, ipst)) {
 855                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
 856                 ip_drop_input("ipIfStatsForwProhibits", mp, ill);
 857                 freemsg(mp);
 858                 return;
 859         }
 860
 861         /*
 862          * Either ire_nce_capable or ire_dep_parent would be set for the IRE
 863          * when it is found by ire_route_recursive, but some other thread
 864          * could have changed the routes with the effect of clearing
 865          * ire_dep_parent. In that case we'd end up dropping the packet, or
 866          * finding a new nce below.
 867          * Get, allocate, or update the nce.
 868          * We get a refhold on ire_nce_cache as a result of this to avoid races
 869          * where ire_nce_cache is deleted.
 870          *
 871          * This ensures that we don't forward if the interface is down since
 872          * ipif_down removes all the nces.
 873          */
 874         mutex_enter(&ire->ire_lock);
 875         nce = ire->ire_nce_cache;
 876         if (nce == NULL) {
 877                 /* Not yet set up - try to set one up */
                     /* ire_lock is dropped around the revalidation call */
 878                 mutex_exit(&ire->ire_lock);
 879                 (void) ire_revalidate_nce(ire);
 880                 mutex_enter(&ire->ire_lock);
 881                 nce = ire->ire_nce_cache;
 882                 if (nce == NULL) {
 883                         mutex_exit(&ire->ire_lock);
 884                         /* The ire_dep_parent chain went bad, or no memory */
 885                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 886                         ip_drop_input("No ire_dep_parent", mp, ill);
 887                         freemsg(mp);
 888                         return;
 889                 }
 890         }
             /* Take our own hold while ire_lock still protects the cache */
 891         nce_refhold(nce);
 892         mutex_exit(&ire->ire_lock);
 893
 894         if (nce->nce_is_condemned) {
 895                 nce_t *nce1;
 896
                     /*
                      * Ask for a replacement and give up our hold on the
                      * condemned nce. NOTE(review): the replacement is
                      * released with nce_refrele() on the paths below, so it
                      * is presumably returned held - confirm against
                      * ire_handle_condemned_nce().
                      */
 897                 nce1 = ire_handle_condemned_nce(nce, ire, ipha, NULL, B_FALSE);
 898                 nce_refrele(nce);
 899                 if (nce1 == NULL) {
 900                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
 901                         ip_drop_input("No nce", mp, ill);
 902                         freemsg(mp);
 903                         return;
 904                 }
 905                 nce = nce1;
 906         }
 907         dst_ill = nce->nce_ill;
 908
 909         /*
 910          * Unless we are forwarding, drop the packet.
 911          * We have to let source routed packets through if they go out
 912          * the same interface i.e., they are 'ping -l' packets.
 913          */
 914         if (!(dst_ill->ill_flags & ILLF_ROUTER) &&
 915             !(ip_source_routed(ipha, ipst) && dst_ill == ill)) {
 916                 if (ip_source_routed(ipha, ipst)) {
 917                         ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ill);
                             /* mp is passed on to icmp_unreachable, not freed here */
 918                         icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira);
 919                         nce_refrele(nce);
 920                         return;
 921                 }
 922                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
 923                 ip_drop_input("ipIfStatsForwProhibits", mp, ill);
 924                 freemsg(mp);
 925                 nce_refrele(nce);
 926                 return;
 927         }
 928
 929         if (ire->ire_zoneid != GLOBAL_ZONEID && ire->ire_zoneid != ALL_ZONES) {
 930                 ipaddr_t        dst = ipha->ipha_dst;
 931
                     /*
                      * This ire will not be used after all; take back one
                      * inbound packet count from it. The replacement ire
                      * looked up below is counted instead.
                      */
 932                 ire->ire_ib_pkt_count--;
 933                 /*
 934                  * Should only use IREs that are visible from the
 935                  * global zone for forwarding.
 936                  * Take a source route into account the same way as ip_input
 937                  * did.
 938                  */
 939                 if (ira->ira_flags & IRAF_IPV4_OPTIONS) {
 940                         int             error = 0;
 941
 942                         dst = ip_input_options(ipha, dst, mp, ira, &error);
 943                         ASSERT(error == 0);     /* ip_input checked */
 944                 }
 945                 ire = ire_route_recursive_v4(dst, 0, NULL, GLOBAL_ZONEID,
 946                     ira->ira_tsl, MATCH_IRE_SECATTR,
 947                     (ill->ill_flags & ILLF_ROUTER) ? IRR_ALLOCATE : IRR_NONE,
 948                     ira->ira_xmit_hint, ipst, NULL, NULL, NULL);
 949                 ire->ire_ib_pkt_count++;
                     /* Hand the packet to whichever handler the new ire uses */
 950                 (*ire->ire_recvfn)(ire, mp, ipha, ira);
 951                 ire_refrele(ire);
 952                 nce_refrele(nce);
 953                 return;
 954         }
 955
 956         /*
 957          * ipIfStatsHCInForwDatagrams should only be incremented if there
 958          * will be an attempt to forward the packet, which is why we
 959          * increment after the above condition has been checked.
 960          */
 961         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams);
 962
 963         /* Initiate Read side IPPF processing */
 964         if (IPP_ENABLED(IPP_FWD_IN, ipst)) {
 965                 /* ip_process translates an IS_UNDER_IPMP */
 966                 mp = ip_process(IPP_FWD_IN, mp, ill, ill);
 967                 if (mp == NULL) {
 968                         /* ip_drop_packet and MIB done */
 969                         ip2dbg(("ire_recv_forward_v4: pkt dropped/deferred "
 970                             "during IPPF processing\n"));
 971                         nce_refrele(nce);
 972                         return;
 973                 }
 974         }
 975
 976         DTRACE_PROBE4(ip4__forwarding__start,
 977             ill_t *, ill, ill_t *, dst_ill, ipha_t *, ipha, mblk_t *, mp);
 978
 979         if (HOOKS4_INTERESTED_FORWARDING(ipst)) {
 980                 int error;
 981
 982                 FW_HOOKS(ipst->ips_ip4_forwarding_event,
 983                     ipst->ips_ipv4firewall_forwarding,
 984                     ill, dst_ill, ipha, mp, mp, 0, ipst, error);
 985
 986                 DTRACE_PROBE1(ip4__forwarding__end, mblk_t *, mp);
 987
                     /* A NULL mp means the hooks did not give the packet back */
 988                 if (mp == NULL) {
 989                         nce_refrele(nce);
 990                         return;
 991                 }
 992                 /*
 993                  * Even if the destination was changed by the filter we use the
 994                  * forwarding decision that was made based on the address
 995                  * in ip_input.
 996                  */
 997
 998                 /* Might have changed */
 999                 ipha = (ipha_t *)mp->b_rptr;
1000                 ira->ira_pktlen = ntohs(ipha->ipha_length);
1001         }
1002
1003         /* Packet is being forwarded. Turning off hwcksum flag. */
1004         DB_CKSUMFLAGS(mp) = 0;
1005
1006         /*
1007          * Martian Address Filtering [RFC 1812, Section 5.3.7]
1008          * The loopback address check for both src and dst has already
1009          * been checked in ip_input
1010          * In the future one can envision adding RPF checks using number 3.
1011          * If we already checked the same source address we can skip this.
1012          */
1013         if (!(ira->ira_flags & IRAF_VERIFIED_SRC) ||
1014             src != ira->ira_verified_src) {
                     /*
                      * ips_src_check selects the checks applied to src:
                      *      0 - none
                      *      1 - reject class D (multicast) sources
                      *      2 - reject broadcast sources, then also apply 1
                      */
1015                 switch (ipst->ips_src_check) {
1016                 case 0:
1017                         break;
1018                 case 2:
1019                         if (ip_type_v4(src, ipst) == IRE_BROADCAST) {
1020                                 BUMP_MIB(ill->ill_ip_mib,
1021                                     ipIfStatsForwProhibits);
1022                                 BUMP_MIB(ill->ill_ip_mib,
1023                                     ipIfStatsInAddrErrors);
1024                                 ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
1025                                 freemsg(mp);
1026                                 nce_refrele(nce);
1027                                 return;
1028                         }
1029                         /* FALLTHRU */
1030
1031                 case 1:
1032                         if (CLASSD(src)) {
1033                                 BUMP_MIB(ill->ill_ip_mib,
1034                                     ipIfStatsForwProhibits);
1035                                 BUMP_MIB(ill->ill_ip_mib,
1036                                     ipIfStatsInAddrErrors);
1037                                 ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
1038                                 freemsg(mp);
1039                                 nce_refrele(nce);
1040                                 return;
1041                         }
1042                         break;
1043                 }
1044                 /* Remember for next packet */
1045                 ira->ira_flags |= IRAF_VERIFIED_SRC;
1046                 ira->ira_verified_src = src;
1047         }
1048
1049         /*
1050          * Check if packet is going out the same link on which it arrived.
1051          * Means we might need to send a redirect.
1052          */
1053         if (IS_ON_SAME_LAN(dst_ill, ill) && ipst->ips_ip_g_send_redirects) {
1054                 ip_send_potential_redirect_v4(mp, ipha, ire, ira);
1055         }
1056
1057         added_tx_len = 0;
1058         if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
1059                 mblk_t          *mp1;
1060                 uint32_t        old_pkt_len = ira->ira_pktlen;
1061
1062                 /* Verify IP header checksum before adding/removing options */
1063                 if ((ira->ira_flags & IRAF_VERIFY_IP_CKSUM) &&
1064                     ip_csum_hdr(ipha)) {
1065                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
1066                         ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
1067                         freemsg(mp);
1068                         nce_refrele(nce);
1069                         return;
1070                 }
1071                 ira->ira_flags &= ~IRAF_VERIFY_IP_CKSUM;
1072
1073                 /*
1074                  * Check if it can be forwarded and add/remove
1075                  * CIPSO options as needed.
1076                  */
1077                 if ((mp1 = tsol_ip_forward(ire, mp, ira)) == NULL) {
1078                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
1079                         ip_drop_input("tsol_ip_forward", mp, ill);
1080                         freemsg(mp);
1081                         nce_refrele(nce);
1082                         return;
1083                 }
1084                 /*
1085                  * Size may have changed. Remember amount added in case
1086                  * IP needs to send an ICMP too big.
1087                  */
1088                 mp = mp1;
1089                 ipha = (ipha_t *)mp->b_rptr;
1090                 ira->ira_pktlen = ntohs(ipha->ipha_length);
1091                 ira->ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
1092                 if (ira->ira_pktlen > old_pkt_len)
1093                         added_tx_len = ira->ira_pktlen - old_pkt_len;
1094
1095                 /* Options can have been added or removed */
1096                 if (ira->ira_ip_hdr_length != IP_SIMPLE_HDR_LENGTH)
1097                         ira->ira_flags |= IRAF_IPV4_OPTIONS;
1098                 else
1099                         ira->ira_flags &= ~IRAF_IPV4_OPTIONS;
1100         }
1101
             /* Use the ire's MTU metric only if it is set and smaller */
1102         mtu = dst_ill->ill_mtu;
1103         if ((iremtu = ire->ire_metrics.iulp_mtu) != 0 && iremtu < mtu)
1104                 mtu = iremtu;
1105         ip_forward_xmit_v4(nce, ill, mp, ipha, ira, mtu, added_tx_len);
1106         nce_refrele(nce);
1107 }
1108 
1109 /*
1110  * Used for sending out unicast and multicast packets that are
1111  * forwarded.
1112  */
1113 void
1114 ip_forward_xmit_v4(nce_t *nce, ill_t *ill, mblk_t *mp, ipha_t *ipha,
1115     ip_recv_attr_t *ira, uint32_t mtu, uint32_t added_tx_len)
1116 {
1117         ill_t           *dst_ill = nce->nce_ill;
1118         uint32_t        pkt_len;
1119         uint32_t        sum;
1120         iaflags_t       iraflags = ira->ira_flags;
1121         ip_stack_t      *ipst = ill->ill_ipst;
1122         iaflags_t       ixaflags;
1123 
1124         if (ipha->ipha_ttl <= 1) {
1125                 /* Perhaps the checksum was bad */
1126                 if ((iraflags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) {
1127                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
1128                         ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
1129                         freemsg(mp);
1130                         return;
1131                 }
1132                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1133                 ip_drop_input("ICMP_TTL_EXCEEDED", mp, ill);
1134                 icmp_time_exceeded(mp, ICMP_TTL_EXCEEDED, ira);
1135                 return;
1136         }
1137         ipha->ipha_ttl--;
1138         /* Adjust the checksum to reflect the ttl decrement. */
1139         sum = (int)ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST;
1140         ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16));
1141 
1142         /* Check if there are options to update */
1143         if (iraflags & IRAF_IPV4_OPTIONS) {
1144                 ASSERT(ipha->ipha_version_and_hdr_length !=
1145                     IP_SIMPLE_HDR_VERSION);
1146                 ASSERT(!(iraflags & IRAF_VERIFY_IP_CKSUM));
1147 
1148                 if (!ip_forward_options(mp, ipha, dst_ill, ira)) {
1149                         /* ipIfStatsForwProhibits and ip_drop_input done */
1150                         return;
1151                 }
1152 
1153                 ipha->ipha_hdr_checksum = 0;
1154                 ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1155         }
1156 
1157         /* Initiate Write side IPPF processing before any fragmentation */
1158         if (IPP_ENABLED(IPP_FWD_OUT, ipst)) {
1159                 /* ip_process translates an IS_UNDER_IPMP */
1160                 mp = ip_process(IPP_FWD_OUT, mp, dst_ill, dst_ill);
1161                 if (mp == NULL) {
1162                         /* ip_drop_packet and MIB done */
1163                         ip2dbg(("ire_recv_forward_v4: pkt dropped/deferred" \
1164                             " during IPPF processing\n"));
1165                         return;
1166                 }
1167         }
1168 
1169         pkt_len = ira->ira_pktlen;
1170 
1171         BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsHCOutForwDatagrams);
1172 
1173         ixaflags = IXAF_IS_IPV4 | IXAF_NO_DEV_FLOW_CTL;
1174 
1175         if (pkt_len > mtu) {
1176                 /*
1177                  * It needs fragging on its way out.  If we haven't
1178                  * verified the header checksum yet we do it now since
1179                  * are going to put a surely good checksum in the
1180                  * outgoing header, we have to make sure that it
1181                  * was good coming in.
1182                  */
1183                 if ((iraflags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) {
1184                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
1185                         ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
1186                         freemsg(mp);
1187                         return;
1188                 }
1189                 if (ipha->ipha_fragment_offset_and_flags & IPH_DF_HTONS) {
1190                         BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsOutFragFails);
1191                         ip_drop_output("ipIfStatsOutFragFails", mp, dst_ill);
1192                         if (iraflags & IRAF_SYSTEM_LABELED) {
1193                                 /*
1194                                  * Remove any CIPSO option added by
1195                                  * tsol_ip_forward, and make sure we report
1196                                  * a path MTU so that there
1197                                  * is room to add such a CIPSO option for future
1198                                  * packets.
1199                                  */
1200                                 mtu = tsol_pmtu_adjust(mp, mtu, added_tx_len,
1201                                     AF_INET);
1202                         }
1203 
1204                         icmp_frag_needed(mp, mtu, ira);
1205                         return;
1206                 }
1207 
1208                 (void) ip_fragment_v4(mp, nce, ixaflags, pkt_len, mtu,
1209                     ira->ira_xmit_hint, GLOBAL_ZONEID, 0, ip_xmit, NULL);
1210                 return;
1211         }
1212 
1213         ASSERT(pkt_len == ntohs(((ipha_t *)mp->b_rptr)->ipha_length));
1214         if (iraflags & IRAF_LOOPBACK_COPY) {
1215                 /*
1216                  * IXAF_NO_LOOP_ZONEID is not set hence 7th arg
1217                  * is don't care
1218                  */
1219                 (void) ip_postfrag_loopcheck(mp, nce,
1220                     ixaflags | IXAF_LOOPBACK_COPY,
1221                     pkt_len, ira->ira_xmit_hint, GLOBAL_ZONEID, 0, NULL);
1222         } else {
1223                 (void) ip_xmit(mp, nce, ixaflags, pkt_len, ira->ira_xmit_hint,
1224                     GLOBAL_ZONEID, 0, NULL);
1225         }
1226 }
1227 
1228 /*
1229  * ire_recvfn for RTF_REJECT and RTF_BLACKHOLE routes, including IRE_NOROUTE,
1230  * which is what ire_route_recursive returns when there is no matching ire.
1231  * Send ICMP unreachable unless blackhole.
1232  */
1233 void
1234 ire_recv_noroute_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
1235 {
1236         ipha_t          *ipha = (ipha_t *)iph_arg;
1237         ill_t           *ill = ira->ira_ill;
1238         ip_stack_t      *ipst = ill->ill_ipst;
1239 
1240         /* Would we have forwarded this packet if we had a route? */
1241         if (ira->ira_flags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) {
1242                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
1243                 ip_drop_input("l2 multicast not forwarded", mp, ill);
1244                 freemsg(mp);
1245                 return;
1246         }
1247 
1248         if (!(ill->ill_flags & ILLF_ROUTER)) {
1249                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
1250                 ip_drop_input("ipIfStatsForwProhibits", mp, ill);
1251                 freemsg(mp);
1252                 return;
1253         }
1254         /*
1255          * If we had a route this could have been forwarded. Count as such.
1256          *
1257          * ipIfStatsHCInForwDatagrams should only be increment if there
1258          * will be an attempt to forward the packet, which is why we
1259          * increment after the above condition has been checked.
1260          */
1261         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams);
1262 
1263         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInNoRoutes);
1264 
1265         ip_rts_change(RTM_MISS, ipha->ipha_dst, 0, 0, 0, 0, 0, 0, RTA_DST,
1266             ipst);
1267 
1268         if (ire->ire_flags & RTF_BLACKHOLE) {
1269                 ip_drop_input("ipIfStatsInNoRoutes RTF_BLACKHOLE", mp, ill);
1270                 freemsg(mp);
1271         } else {
1272                 ip_drop_input("ipIfStatsInNoRoutes RTF_REJECT", mp, ill);
1273 
1274                 if (ip_source_routed(ipha, ipst)) {
1275                         icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira);
1276                 } else {
1277                         icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, ira);
1278                 }
1279         }
1280 }
1281 
1282 /*
1283  * ire_recvfn for IRE_LOCALs marked with ire_noaccept. Such IREs are used for
1284  * VRRP when in noaccept mode.
1285  * We silently drop the packet. ARP handles packets even if noaccept is set.
1286  */
1287 /* ARGSUSED */
1288 void
1289 ire_recv_noaccept_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1290     ip_recv_attr_t *ira)
1291 {
1292         ill_t           *ill = ira->ira_ill;
1293 
1294         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1295         ip_drop_input("ipIfStatsInDiscards - noaccept", mp, ill);
1296         freemsg(mp);
1297 }
1298 
1299 /*
1300  * ire_recvfn for IRE_BROADCAST.
      *
      * Marks the packet IRAF_BROADCAST, drops it if it is TCP, and then takes
      * one of three paths:
      *  - local delivery through ip_input_local_v4() when the broadcast ire
      *    is on the arrival ill (and is not RTF_MULTIRT), or when the arrival
      *    ill has an IRE_BROADCAST of its own for the destination;
      *  - for a multirouted broadcast, local delivery with ira_ill
      *    temporarily switched to the ill of a non-multirt IRE_BROADCAST
      *    found in the same bucket as ire;
      *  - otherwise the packet is a directed broadcast. It is dropped unless
      *    ips_ip_g_forward_directed_bcast is set and the packet did not
      *    arrive as a link-layer multicast/broadcast; if allowed it is sent
      *    out ire->ire_ill via ip_forward_xmit_v4() with IRAF_LOOPBACK_COPY
      *    set.
      *
      * For an ill under IPMP, ira_ruifindex is set to the upper ifindex while
      * the packet is processed. On the way out of every path that got that
      * far, ira_ruifindex is set back to the arrival ill's phyint ifindex.
      * ira_zoneid is set to ALL_ZONES and is not restored here.
1301  */
1302 void
1303 ire_recv_broadcast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1304     ip_recv_attr_t *ira)
1305 {
1306         ipha_t          *ipha = (ipha_t *)iph_arg;
1307         ill_t           *ill = ira->ira_ill;
1308         ill_t           *dst_ill = ire->ire_ill;
1309         ip_stack_t      *ipst = ill->ill_ipst;
1310         ire_t           *alt_ire;
1311         nce_t           *nce;
1312         ipaddr_t        ipha_dst;
1313
1314         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInBcastPkts);
1315
1316         /* Tag for higher-level protocols */
1317         ira->ira_flags |= IRAF_BROADCAST;
1318
1319         /*
1320          * Whether local or directed broadcast forwarding: don't allow
1321          * for TCP.
1322          */
1323         if (ipha->ipha_protocol == IPPROTO_TCP) {
1324                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1325                 ip_drop_input("ipIfStatsInDiscards", mp, ill);
1326                 freemsg(mp);
1327                 return;
1328         }
1329
1330         /*
1331          * So that we don't end up with dups, only one ill in an IPMP group is
1332          * nominated to receive broadcast traffic.
1333          * If we have no cast_ill we are liberal and accept everything.
1334          */
1335         if (IS_UNDER_IPMP(ill)) {
1336                 /* For an under ill_grp can change under lock */
1337                 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1338                 if (!ill->ill_nom_cast && ill->ill_grp != NULL &&
1339                     ill->ill_grp->ig_cast_ill != NULL) {
1340                         rw_exit(&ipst->ips_ill_g_lock);
1341                         /* No MIB since this is normal operation */
1342                         ip_drop_input("not nom_cast", mp, ill);
1343                         freemsg(mp);
                             /* ira_ruifindex not yet changed; nothing to restore */
1344                         return;
1345                 }
1346                 rw_exit(&ipst->ips_ill_g_lock);
1347
1348                 ira->ira_ruifindex = ill_get_upper_ifindex(ill);
1349         }
1350
1351         /*
1352          * After reassembly and IPsec we will need to duplicate the
1353          * broadcast packet for all matching zones on the ill.
1354          */
1355         ira->ira_zoneid = ALL_ZONES;
1356
1357         /*
1358          * Check for directed broadcast i.e. ire->ire_ill is different than
1359          * the incoming ill.
1360          * The same broadcast address can be assigned to multiple interfaces
1361          * so have to check explicitly for that case by looking up the alt_ire
1362          */
1363         if (dst_ill == ill && !(ire->ire_flags & RTF_MULTIRT)) {
1364                 /* Reassemble on the ill on which the packet arrived */
1365                 ip_input_local_v4(ire, mp, ipha, ira);
1366                 /* Restore */
1367                 ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;
1368                 return;
1369         }
1370
1371         /* Is there an IRE_BROADCAST on the incoming ill? */
             /* With IRAF_DHCP_UNICAST set, look up INADDR_BROADCAST instead */
1372         ipha_dst = ((ira->ira_flags & IRAF_DHCP_UNICAST) ? INADDR_BROADCAST :
1373             ipha->ipha_dst);
1374         alt_ire = ire_ftable_lookup_v4(ipha_dst, 0, 0, IRE_BROADCAST, ill,
1375             ALL_ZONES, ira->ira_tsl,
1376             MATCH_IRE_TYPE|MATCH_IRE_ILL|MATCH_IRE_SECATTR, 0, ipst, NULL);
1377         if (alt_ire != NULL) {
1378                 /* Not a directed broadcast */
1379                 /*
1380                  * In the special case of multirouted broadcast
1381                  * packets, we unconditionally need to "gateway"
1382                  * them to the appropriate interface here so that reassembly
1383                  * works. We know that the IRE_BROADCAST on cgtp0 doesn't
1384                  * have RTF_MULTIRT set so we look for such an IRE in the
1385                  * bucket.
1386                  */
1387                 if (alt_ire->ire_flags & RTF_MULTIRT) {
1388                         irb_t           *irb;
1389                         ire_t           *ire1;
1390
1391                         irb = ire->ire_bucket;
1392                         irb_refhold(irb);
1393                         for (ire1 = irb->irb_ire; ire1 != NULL;
1394                             ire1 = ire1->ire_next) {
1395                                 if (IRE_IS_CONDEMNED(ire1))
1396                                         continue;
1397                                 if (!(ire1->ire_type & IRE_BROADCAST) ||
1398                                     (ire1->ire_flags & RTF_MULTIRT))
1399                                         continue;
                                     /*
                                      * Note: this overwrites the local 'ill'.
                                      * The arrival ill is recovered from
                                      * ira->ira_ill below. 'ill' is only
                                      * changed when a match is found, so the
                                      * fall-through path still sees the
                                      * arrival ill.
                                      */
1400                                 ill = ire1->ire_ill;
1401                                 ill_refhold(ill);
1402                                 break;
1403                         }
1404                         irb_refrele(irb);
1405                         if (ire1 != NULL) {
1406                                 ill_t *orig_ill = ira->ira_ill;
1407
1408                                 ire_refrele(alt_ire);
1409                                 /* Reassemble on the new ill */
1410                                 ira->ira_ill = ill;
1411                                 ip_input_local_v4(ire, mp, ipha, ira);
1412                                 ill_refrele(ill);
1413                                 /* Restore */
1414                                 ira->ira_ill = orig_ill;
1415                                 ira->ira_ruifindex =
1416                                     orig_ill->ill_phyint->phyint_ifindex;
1417                                 return;
1418                         }
1419                 }
1420                 ire_refrele(alt_ire);
1421                 /* Reassemble on the ill on which the packet arrived */
1422                 ip_input_local_v4(ire, mp, ipha, ira);
1423                 goto done;
1424         }
1425
1426         /*
1427          * This is a directed broadcast
1428          *
1429          * If directed broadcast is allowed, then forward the packet out
1430          * the destination interface with IXAF_LOOPBACK_COPY set. That will
1431          * result in ip_input() receiving a copy of the packet on the
1432          * appropriate ill. (We could optimize this to avoid the extra trip
1433          * via ip_input(), but since directed broadcasts are normally disabled
1434          * it doesn't make sense to optimize it.)
1435          */
1436         if (!ipst->ips_ip_g_forward_directed_bcast ||
1437             (ira->ira_flags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST))) {
1438                 ip_drop_input("directed broadcast not allowed", mp, ill);
1439                 freemsg(mp);
1440                 goto done;
1441         }
             /* The header is rewritten below, so it must be valid coming in */
1442         if ((ira->ira_flags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) {
1443                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
1444                 ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
1445                 freemsg(mp);
1446                 goto done;
1447         }
1448
1449         /*
1450          * Clear the indication that this may have hardware
1451          * checksum as we are not using it for forwarding.
1452          */
1453         DB_CKSUMFLAGS(mp) = 0;
1454
1455         /*
1456          * Set the ttl to ips_ip_broadcast_ttl + 1; the forward engine will
              * decrement it by one. The header checksum is then recomputed.
1457          */
1458         ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl + 1;
1459         ipha->ipha_hdr_checksum = 0;
1460         ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
1461
1462         /*
1463          * We use ip_forward_xmit_v4 to do any fragmentation
1464          * and loopback copy on the outbound interface.
1465          *
1466          * Set IRAF_LOOPBACK_COPY so that IXAF_LOOPBACK_COPY is set on
              * the transmit side.
1467          */
1468         ira->ira_flags |= IRAF_LOOPBACK_COPY;
1469
1470         nce = arp_nce_init(dst_ill, ipha->ipha_dst, IRE_BROADCAST);
1471         if (nce == NULL) {
1472                 BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsOutDiscards);
1473                 ip_drop_output("No nce", mp, dst_ill);
1474                 freemsg(mp);
1475                 goto done;
1476         }
1477
1478         ip_forward_xmit_v4(nce, ill, mp, ipha, ira, dst_ill->ill_mc_mtu, 0);
1479         nce_refrele(nce);
1480 done:
1481         /* Restore */
1482         ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;
1483 }
1484 
1485 /*
1486  * ire_recvfn for IRE_MULTICAST.
1487  */
1488 void
1489 ire_recv_multicast_v4(ire_t *ire, mblk_t *mp, void *iph_arg,
1490     ip_recv_attr_t *ira)
1491 {
1492         ipha_t          *ipha = (ipha_t *)iph_arg;
1493         ill_t           *ill = ira->ira_ill;
1494         ip_stack_t      *ipst = ill->ill_ipst;
1495 
1496         ASSERT(ire->ire_ill == ira->ira_ill);
1497 
1498         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastPkts);
1499         UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastOctets, ira->ira_pktlen);
1500 
1501         /* RSVP hook */
1502         if (ira->ira_flags & IRAF_RSVP)
1503                 goto forus;
1504 
1505         /* Tag for higher-level protocols */
1506         ira->ira_flags |= IRAF_MULTICAST;
1507 
1508         /*
1509          * So that we don't end up with dups, only one ill an IPMP group is
1510          * nominated to receive multicast traffic.
1511          * If we have no cast_ill we are liberal and accept everything.
1512          */
1513         if (IS_UNDER_IPMP(ill)) {
1514                 ip_stack_t      *ipst = ill->ill_ipst;
1515 
1516                 /* For an under ill_grp can change under lock */
1517                 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1518                 if (!ill->ill_nom_cast && ill->ill_grp != NULL &&
1519                     ill->ill_grp->ig_cast_ill != NULL) {
1520                         rw_exit(&ipst->ips_ill_g_lock);
1521                         ip_drop_input("not on cast ill", mp, ill);
1522                         freemsg(mp);
1523                         return;
1524                 }
1525                 rw_exit(&ipst->ips_ill_g_lock);
1526                 /*
1527                  * We switch to the upper ill so that mrouter and hasmembers
1528                  * can operate on upper here and in ip_input_multicast.
1529                  */
1530                 ill = ipmp_ill_hold_ipmp_ill(ill);
1531                 if (ill != NULL) {
1532                         ASSERT(ill != ira->ira_ill);
1533                         ASSERT(ire->ire_ill == ira->ira_ill);
1534                         ira->ira_ill = ill;
1535                         ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;
1536                 } else {
1537                         ill = ira->ira_ill;
1538                 }
1539         }
1540 
1541         /*
1542          * Check if we are a multicast router - send ip_mforward a copy of
1543          * the packet.
1544          * Due to mroute_decap tunnels we consider forwarding packets even if
1545          * mrouted has not joined the allmulti group on this interface.
1546          */
1547         if (ipst->ips_ip_g_mrouter) {
1548                 int retval;
1549 
1550                 /*
1551                  * Clear the indication that this may have hardware
1552                  * checksum as we are not using it for forwarding.
1553                  */
1554                 DB_CKSUMFLAGS(mp) = 0;
1555 
1556                 /*
1557                  * ip_mforward helps us make these distinctions: If received
1558                  * on tunnel and not IGMP, then drop.
1559                  * If IGMP packet, then don't check membership
1560                  * If received on a phyint and IGMP or PIM, then
1561                  * don't check membership
1562                  */
1563                 retval = ip_mforward(mp, ira);
1564                 /* ip_mforward updates mib variables if needed */
1565 
1566                 switch (retval) {
1567                 case 0:
1568                         /*
1569                          * pkt is okay and arrived on phyint.
1570                          *
1571                          * If we are running as a multicast router
1572                          * we need to see all IGMP and/or PIM packets.
1573                          */
1574                         if ((ipha->ipha_protocol == IPPROTO_IGMP) ||
1575                             (ipha->ipha_protocol == IPPROTO_PIM)) {
1576                                 goto forus;
1577                         }
1578                         break;
1579                 case -1:
1580                         /* pkt is mal-formed, toss it */
1581                         freemsg(mp);
1582                         goto done;
1583                 case 1:
1584                         /*
1585                          * pkt is okay and arrived on a tunnel
1586                          *
1587                          * If we are running a multicast router
1588                          * we need to see all igmp packets.
1589                          */
1590                         if (ipha->ipha_protocol == IPPROTO_IGMP) {
1591                                 goto forus;
1592                         }
1593                         ip_drop_input("Multicast on tunnel ignored", mp, ill);
1594                         freemsg(mp);
1595                         goto done;
1596                 }
1597         }
1598 
1599         /*
1600          * Check if we have members on this ill. This is not necessary for
1601          * correctness because even if the NIC/GLD had a leaky filter, we
1602          * filter before passing to each conn_t.
1603          */
1604         if (!ill_hasmembers_v4(ill, ipha->ipha_dst)) {
1605                 /*
1606                  * Nobody interested
1607                  *
1608                  * This might just be caused by the fact that
1609                  * multiple IP Multicast addresses map to the same
1610                  * link layer multicast - no need to increment counter!
1611                  */
1612                 ip_drop_input("Multicast with no members", mp, ill);
1613                 freemsg(mp);
1614                 goto done;
1615         }
1616 forus:
1617         ip2dbg(("ire_recv_multicast_v4: multicast for us: 0x%x\n",
1618             ntohl(ipha->ipha_dst)));
1619 
1620         /*
1621          * After reassembly and IPsec we will need to duplicate the
1622          * multicast packet for all matching zones on the ill.
1623          */
1624         ira->ira_zoneid = ALL_ZONES;
1625 
1626         /* Reassemble on the ill on which the packet arrived */
1627         ip_input_local_v4(ire, mp, ipha, ira);
1628 done:
1629         if (ill != ire->ire_ill) {
1630                 ill_refrele(ill);
1631                 ira->ira_ill = ire->ire_ill;
1632                 ira->ira_ruifindex = ira->ira_ill->ill_phyint->phyint_ifindex;
1633         }
1634 }
1635 
1636 /*
1637  * ire_recvfn for IRE_OFFLINK with RTF_MULTIRT.
1638  * Drop packets since we don't forward out multirt routes.
1639  */
1640 /* ARGSUSED */
1641 void
1642 ire_recv_multirt_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
1643 {
1644         ill_t           *ill = ira->ira_ill;
1645 
1646         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInNoRoutes);
1647         ip_drop_input("Not forwarding out MULTIRT", mp, ill);
1648         freemsg(mp);
1649 }
1650 
1651 /*
1652  * ire_recvfn for IRE_LOOPBACK. This is only used when a FW_HOOK
1653  * has rewritten the packet to have a loopback destination address (We
1654  * filter out packet with a loopback destination from arriving over the wire).
1655  * We don't know what zone to use, thus we always use the GLOBAL_ZONEID.
1656  */
1657 void
1658 ire_recv_loopback_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
1659 {
1660         ipha_t          *ipha = (ipha_t *)iph_arg;
1661         ill_t           *ill = ira->ira_ill;
1662         ill_t           *ire_ill = ire->ire_ill;
1663 
1664         ira->ira_zoneid = GLOBAL_ZONEID;
1665 
1666         /* Switch to the lo0 ill for further processing  */
1667         if (ire_ill != ill) {
1668                 /*
1669                  * Update ira_ill to be the ILL on which the IP address
1670                  * is hosted.
1671                  * No need to hold the ill since we have a hold on the ire
1672                  */
1673                 ASSERT(ira->ira_ill == ira->ira_rill);
1674                 ira->ira_ill = ire_ill;
1675 
1676                 ip_input_local_v4(ire, mp, ipha, ira);
1677 
1678                 /* Restore */
1679                 ASSERT(ira->ira_ill == ire_ill);
1680                 ira->ira_ill = ill;
1681                 return;
1682 
1683         }
1684         ip_input_local_v4(ire, mp, ipha, ira);
1685 }
1686 
1687 /*
1688  * ire_recvfn for IRE_LOCAL.
1689  */
1690 void
1691 ire_recv_local_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
1692 {
1693         ipha_t          *ipha = (ipha_t *)iph_arg;
1694         ill_t           *ill = ira->ira_ill;
1695         ill_t           *ire_ill = ire->ire_ill;
1696 
1697         /* Make a note for DAD that this address is in use */
1698         ire->ire_last_used_time = LBOLT_FASTPATH;
1699 
1700         /* Only target the IRE_LOCAL with the right zoneid. */
1701         ira->ira_zoneid = ire->ire_zoneid;
1702 
1703         /*
1704          * If the packet arrived on the wrong ill, we check that
1705          * this is ok.
1706          * If it is, then we ensure that we do the reassembly on
1707          * the ill on which the address is hosted. We keep ira_rill as
1708          * the one on which the packet arrived, so that IP_PKTINFO and
1709          * friends can report this.
1710          */
1711         if (ire_ill != ill) {
1712                 ire_t *new_ire;
1713 
1714                 new_ire = ip_check_multihome(&ipha->ipha_dst, ire, ill);
1715                 if (new_ire == NULL) {
1716                         /* Drop packet */
1717                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
1718                         ip_drop_input("ipIfStatsInForwProhibits", mp, ill);
1719                         freemsg(mp);
1720                         return;
1721                 }
1722                 /*
1723                  * Update ira_ill to be the ILL on which the IP address
1724                  * is hosted. No need to hold the ill since we have a
1725                  * hold on the ire. Note that we do the switch even if
1726                  * new_ire == ire (for IPMP, ire would be the one corresponding
1727                  * to the IPMP ill).
1728                  */
1729                 ASSERT(ira->ira_ill == ira->ira_rill);
1730                 ira->ira_ill = new_ire->ire_ill;
1731 
1732                 /* ira_ruifindex tracks the upper for ira_rill */
1733                 if (IS_UNDER_IPMP(ill))
1734                         ira->ira_ruifindex = ill_get_upper_ifindex(ill);
1735 
1736                 ip_input_local_v4(new_ire, mp, ipha, ira);
1737 
1738                 /* Restore */
1739                 ASSERT(ira->ira_ill == new_ire->ire_ill);
1740                 ira->ira_ill = ill;
1741                 ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;
1742 
1743                 if (new_ire != ire)
1744                         ire_refrele(new_ire);
1745                 return;
1746         }
1747 
1748         ip_input_local_v4(ire, mp, ipha, ira);
1749 }
1750 
1751 /*
1752  * Common function for packets arriving for the host. Handles
1753  * checksum verification, reassembly checks, etc.
1754  */
1755 static void
1756 ip_input_local_v4(ire_t *ire, mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
1757 {
1758         ill_t           *ill = ira->ira_ill;
1759         iaflags_t       iraflags = ira->ira_flags;
1760 
1761         /*
1762          * Verify IP header checksum. If the packet was AH or ESP then
1763          * this flag has already been cleared. Likewise if the packet
1764          * had a hardware checksum.
1765          */
1766         if ((iraflags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) {
1767                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs);
1768                 ip_drop_input("ipIfStatsInCksumErrs", mp, ill);
1769                 freemsg(mp);
1770                 return;
1771         }
1772 
1773         if (iraflags & IRAF_IPV4_OPTIONS) {
1774                 if (!ip_input_local_options(mp, ipha, ira)) {
1775                         /* Error has been sent and mp consumed */
1776                         return;
1777                 }
1778                 /*
1779                  * Some old hardware does partial checksum by including the
1780                  * whole IP header, so the partial checksum value might have
1781                  * become invalid if any option in the packet have been
1782                  * updated. Always clear partial checksum flag here.
1783                  */
1784                 DB_CKSUMFLAGS(mp) &= ~HCK_PARTIALCKSUM;
1785         }
1786 
1787         /*
1788          * Is packet part of fragmented IP packet?
1789          * We compare against defined values in network byte order
1790          */
1791         if (ipha->ipha_fragment_offset_and_flags &
1792             (IPH_MF_HTONS | IPH_OFFSET_HTONS)) {
1793                 /*
1794                  * Make sure we have ira_l2src before we loose the original
1795                  * mblk
1796                  */
1797                 if (!(ira->ira_flags & IRAF_L2SRC_SET))
1798                         ip_setl2src(mp, ira, ira->ira_rill);
1799 
1800                 mp = ip_input_fragment(mp, ipha, ira);
1801                 if (mp == NULL)
1802                         return;
1803                 /* Completed reassembly */
1804                 ipha = (ipha_t *)mp->b_rptr;
1805         }
1806 
1807         /*
1808          * For broadcast and multicast we need some extra work before
1809          * we call ip_fanout_v4(), since in the case of shared-IP zones
1810          * we need to pretend that a packet arrived for each zoneid.
1811          */
1812         if (iraflags & IRAF_MULTIBROADCAST) {
1813                 if (iraflags & IRAF_BROADCAST)
1814                         ip_input_broadcast_v4(ire, mp, ipha, ira);
1815                 else
1816                         ip_input_multicast_v4(ire, mp, ipha, ira);
1817                 return;
1818         }
1819         ip_fanout_v4(mp, ipha, ira);
1820 }
1821 
1822 
1823 /*
1824  * Handle multiple zones which match the same broadcast address
1825  * and ill by delivering a packet to each of them.
1826  * Walk the bucket and look for different ire_zoneid but otherwise
1827  * the same IRE (same ill/addr/mask/type).
1828  * Note that ire_add() tracks IREs that are identical in all
1829  * fields (addr/mask/type/gw/ill/zoneid) within a single IRE by
1830  * increasing ire_identical_cnt. Thus we don't need to be concerned
1831  * about those.
1832  */
1833 static void
1834 ip_input_broadcast_v4(ire_t *ire, mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
1835 {
1836         ill_t           *ill = ira->ira_ill;
1837         ip_stack_t      *ipst = ill->ill_ipst;
1838         netstack_t      *ns = ipst->ips_netstack;
1839         irb_t           *irb;
1840         ire_t           *ire1;
1841         mblk_t          *mp1;
1842         ipha_t          *ipha1;
1843         uint_t          ira_pktlen = ira->ira_pktlen;
1844         uint16_t        ira_ip_hdr_length = ira->ira_ip_hdr_length;
1845 
1846         irb = ire->ire_bucket;
1847 
1848         /*
1849          * If we don't have more than one shared-IP zone, or if
1850          * there can't be more than one IRE_BROADCAST for this
1851          * IP address, then just set the zoneid and proceed.
1852          */
1853         if (ns->netstack_numzones == 1 || irb->irb_ire_cnt == 1) {
1854                 ira->ira_zoneid = ire->ire_zoneid;
1855 
1856                 ip_fanout_v4(mp, ipha, ira);
1857                 return;
1858         }
1859         irb_refhold(irb);
1860         for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
1861                 /* We do the main IRE after the end of the loop */
1862                 if (ire1 == ire)
1863                         continue;
1864 
1865                 /*
1866                  * Only IREs for the same IP address should be in the same
1867                  * bucket.
1868                  * But could have IRE_HOSTs in the case of CGTP.
1869                  */
1870                 ASSERT(ire1->ire_addr == ire->ire_addr);
1871                 if (!(ire1->ire_type & IRE_BROADCAST))
1872                         continue;
1873 
1874                 if (IRE_IS_CONDEMNED(ire1))
1875                         continue;
1876 
1877                 mp1 = copymsg(mp);
1878                 if (mp1 == NULL) {
1879                         /* Failed to deliver to one zone */
1880                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1881                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
1882                         continue;
1883                 }
1884                 ira->ira_zoneid = ire1->ire_zoneid;
1885                 ipha1 = (ipha_t *)mp1->b_rptr;
1886                 ip_fanout_v4(mp1, ipha1, ira);
1887                 /*
1888                  * IPsec might have modified ira_pktlen and ira_ip_hdr_length
1889                  * so we restore them for a potential next iteration
1890                  */
1891                 ira->ira_pktlen = ira_pktlen;
1892                 ira->ira_ip_hdr_length = ira_ip_hdr_length;
1893         }
1894         irb_refrele(irb);
1895         /* Do the main ire */
1896         ira->ira_zoneid = ire->ire_zoneid;
1897         ip_fanout_v4(mp, ipha, ira);
1898 }
1899 
1900 /*
1901  * Handle multiple zones which want to receive the same multicast packets
1902  * on this ill by delivering a packet to each of them.
1903  *
1904  * Note that for packets delivered to transports we could instead do this
1905  * as part of the fanout code, but since we need to handle icmp_inbound
1906  * it is simpler to have multicast work the same as broadcast.
1907  *
1908  * The ip_fanout matching for multicast matches based on ilm independent of
1909  * zoneid since the zoneid restriction is applied when joining a multicast
1910  * group.
1911  */
1912 /* ARGSUSED */
1913 static void
1914 ip_input_multicast_v4(ire_t *ire, mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
1915 {
1916         ill_t           *ill = ira->ira_ill;
1917         iaflags_t       iraflags = ira->ira_flags;
1918         ip_stack_t      *ipst = ill->ill_ipst;
1919         netstack_t      *ns = ipst->ips_netstack;
1920         zoneid_t        zoneid;
1921         mblk_t          *mp1;
1922         ipha_t          *ipha1;
1923         uint_t          ira_pktlen = ira->ira_pktlen;
1924         uint16_t        ira_ip_hdr_length = ira->ira_ip_hdr_length;
1925 
1926         /* ire_recv_multicast has switched to the upper ill for IPMP */
1927         ASSERT(!IS_UNDER_IPMP(ill));
1928 
1929         /*
1930          * If we don't have more than one shared-IP zone, or if
1931          * there are no members in anything but the global zone,
1932          * then just set the zoneid and proceed.
1933          */
1934         if (ns->netstack_numzones == 1 ||
1935             !ill_hasmembers_otherzones_v4(ill, ipha->ipha_dst,
1936             GLOBAL_ZONEID)) {
1937                 ira->ira_zoneid = GLOBAL_ZONEID;
1938 
1939                 /* If sender didn't want this zone to receive it, drop */
1940                 if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) &&
1941                     ira->ira_no_loop_zoneid == ira->ira_zoneid) {
1942                         ip_drop_input("Multicast but wrong zoneid", mp, ill);
1943                         freemsg(mp);
1944                         return;
1945                 }
1946                 ip_fanout_v4(mp, ipha, ira);
1947                 return;
1948         }
1949 
1950         /*
1951          * Here we loop over all zoneids that have members in the group
1952          * and deliver a packet to ip_fanout for each zoneid.
1953          *
1954          * First find any members in the lowest numeric zoneid by looking for
1955          * first zoneid larger than -1 (ALL_ZONES).
1956          * We terminate the loop when we receive -1 (ALL_ZONES).
1957          */
1958         zoneid = ill_hasmembers_nextzone_v4(ill, ipha->ipha_dst, ALL_ZONES);
1959         for (; zoneid != ALL_ZONES;
1960             zoneid = ill_hasmembers_nextzone_v4(ill, ipha->ipha_dst, zoneid)) {
1961                 /*
1962                  * Avoid an extra copymsg/freemsg by skipping global zone here
1963                  * and doing that at the end.
1964                  */
1965                 if (zoneid == GLOBAL_ZONEID)
1966                         continue;
1967 
1968                 ira->ira_zoneid = zoneid;
1969 
1970                 /* If sender didn't want this zone to receive it, skip */
1971                 if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) &&
1972                     ira->ira_no_loop_zoneid == ira->ira_zoneid)
1973                         continue;
1974 
1975                 mp1 = copymsg(mp);
1976                 if (mp1 == NULL) {
1977                         /* Failed to deliver to one zone */
1978                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1979                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
1980                         continue;
1981                 }
1982                 ipha1 = (ipha_t *)mp1->b_rptr;
1983                 ip_fanout_v4(mp1, ipha1, ira);
1984                 /*
1985                  * IPsec might have modified ira_pktlen and ira_ip_hdr_length
1986                  * so we restore them for a potential next iteration
1987                  */
1988                 ira->ira_pktlen = ira_pktlen;
1989                 ira->ira_ip_hdr_length = ira_ip_hdr_length;
1990         }
1991 
1992         /* Do the main ire */
1993         ira->ira_zoneid = GLOBAL_ZONEID;
1994         /* If sender didn't want this zone to receive it, drop */
1995         if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) &&
1996             ira->ira_no_loop_zoneid == ira->ira_zoneid) {
1997                 ip_drop_input("Multicast but wrong zoneid", mp, ill);
1998                 freemsg(mp);
1999         } else {
2000                 ip_fanout_v4(mp, ipha, ira);
2001         }
2002 }
2003 
2004 
2005 /*
2006  * Determine the zoneid and IRAF_TX_* flags if trusted extensions
2007  * is in use. Updates ira_zoneid and ira_flags as a result.
2008  */
2009 static void
2010 ip_fanout_tx_v4(mblk_t *mp, ipha_t *ipha, uint8_t protocol,
2011     uint_t ip_hdr_length, ip_recv_attr_t *ira)
2012 {
2013         uint16_t        *up;
2014         uint16_t        lport;
2015         zoneid_t        zoneid;
2016 
2017         ASSERT(ira->ira_flags & IRAF_SYSTEM_LABELED);
2018 
2019         /*
2020          * If the packet is unlabeled we might allow read-down
2021          * for MAC_EXEMPT. Below we clear this if it is a multi-level
2022          * port (MLP).
2023          * Note that ira_tsl can be NULL here.
2024          */
2025         if (ira->ira_tsl != NULL && ira->ira_tsl->tsl_flags & TSLF_UNLABELED)
2026                 ira->ira_flags |= IRAF_TX_MAC_EXEMPTABLE;
2027 
2028         if (ira->ira_zoneid != ALL_ZONES)
2029                 return;
2030 
2031         ira->ira_flags |= IRAF_TX_SHARED_ADDR;
2032 
2033         up = (uint16_t *)((uchar_t *)ipha + ip_hdr_length);
2034         switch (protocol) {
2035         case IPPROTO_TCP:
2036         case IPPROTO_SCTP:
2037         case IPPROTO_UDP:
2038                 /* Caller ensures this */
2039                 ASSERT(((uchar_t *)ipha) + ip_hdr_length +4 <= mp->b_wptr);
2040 
2041                 /*
2042                  * Only these transports support MLP.
2043                  * We know their destination port numbers is in
2044                  * the same place in the header.
2045                  */
2046                 lport = up[1];
2047 
2048                 /*
2049                  * No need to handle exclusive-stack zones
2050                  * since ALL_ZONES only applies to the shared IP instance.
2051                  */
2052                 zoneid = tsol_mlp_findzone(protocol, lport);
2053                 /*
2054                  * If no shared MLP is found, tsol_mlp_findzone returns
2055                  * ALL_ZONES.  In that case, we assume it's SLP, and
2056                  * search for the zone based on the packet label.
2057                  *
2058                  * If there is such a zone, we prefer to find a
2059                  * connection in it.  Otherwise, we look for a
2060                  * MAC-exempt connection in any zone whose label
2061                  * dominates the default label on the packet.
2062                  */
2063                 if (zoneid == ALL_ZONES)
2064                         zoneid = tsol_attr_to_zoneid(ira);
2065                 else
2066                         ira->ira_flags &= ~IRAF_TX_MAC_EXEMPTABLE;
2067                 break;
2068         default:
2069                 /* Handle shared address for other protocols */
2070                 zoneid = tsol_attr_to_zoneid(ira);
2071                 break;
2072         }
2073         ira->ira_zoneid = zoneid;
2074 }
2075 
2076 /*
2077  * Increment checksum failure statistics
2078  */
2079 static void
2080 ip_input_cksum_err_v4(uint8_t protocol, uint16_t hck_flags, ill_t *ill)
2081 {
2082         ip_stack_t      *ipst = ill->ill_ipst;
2083 
2084         switch (protocol) {
2085         case IPPROTO_TCP:
2086                 BUMP_MIB(ill->ill_ip_mib, tcpIfStatsInErrs);
2087 
2088                 if (hck_flags & HCK_FULLCKSUM)
2089                         IP_STAT(ipst, ip_tcp_in_full_hw_cksum_err);
2090                 else if (hck_flags & HCK_PARTIALCKSUM)
2091                         IP_STAT(ipst, ip_tcp_in_part_hw_cksum_err);
2092                 else
2093                         IP_STAT(ipst, ip_tcp_in_sw_cksum_err);
2094                 break;
2095         case IPPROTO_UDP:
2096                 BUMP_MIB(ill->ill_ip_mib, udpIfStatsInCksumErrs);
2097                 if (hck_flags & HCK_FULLCKSUM)
2098                         IP_STAT(ipst, ip_udp_in_full_hw_cksum_err);
2099                 else if (hck_flags & HCK_PARTIALCKSUM)
2100                         IP_STAT(ipst, ip_udp_in_part_hw_cksum_err);
2101                 else
2102                         IP_STAT(ipst, ip_udp_in_sw_cksum_err);
2103                 break;
2104         case IPPROTO_ICMP:
2105                 BUMP_MIB(&ipst->ips_icmp_mib, icmpInCksumErrs);
2106                 break;
2107         default:
2108                 ASSERT(0);
2109                 break;
2110         }
2111 }
2112 
2113 /* Calculate the IPv4 pseudo-header checksum */
2114 uint32_t
2115 ip_input_cksum_pseudo_v4(ipha_t *ipha, ip_recv_attr_t *ira)
2116 {
2117         uint_t          ulp_len;
2118         uint32_t        cksum;
2119         uint8_t         protocol = ira->ira_protocol;
2120         uint16_t        ip_hdr_length = ira->ira_ip_hdr_length;
2121 
2122 #define iphs    ((uint16_t *)ipha)
2123 
2124         switch (protocol) {
2125         case IPPROTO_TCP:
2126                 ulp_len = ira->ira_pktlen - ip_hdr_length;
2127 
2128                 /* Protocol and length */
2129                 cksum = htons(ulp_len) + IP_TCP_CSUM_COMP;
2130                 /* IP addresses */
2131                 cksum += iphs[6] + iphs[7] + iphs[8] + iphs[9];
2132                 break;
2133 
2134         case IPPROTO_UDP: {
2135                 udpha_t         *udpha;
2136 
2137                 udpha = (udpha_t  *)((uchar_t *)ipha + ip_hdr_length);
2138 
2139                 /* Protocol and length */
2140                 cksum = udpha->uha_length + IP_UDP_CSUM_COMP;
2141                 /* IP addresses */
2142                 cksum += iphs[6] + iphs[7] + iphs[8] + iphs[9];
2143                 break;
2144         }
2145 
2146         default:
2147                 cksum = 0;
2148                 break;
2149         }
2150 #undef  iphs
2151         return (cksum);
2152 }
2153 
2154 
2155 /*
2156  * Software verification of the ULP checksums.
2157  * Returns B_TRUE if ok.
2158  * Increments statistics of failed.
2159  */
2160 static boolean_t
2161 ip_input_sw_cksum_v4(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
2162 {
2163         ip_stack_t      *ipst = ira->ira_ill->ill_ipst;
2164         uint32_t        cksum;
2165         uint8_t         protocol = ira->ira_protocol;
2166         uint16_t        ip_hdr_length = ira->ira_ip_hdr_length;
2167 
2168         IP_STAT(ipst, ip_in_sw_cksum);
2169 
2170         ASSERT(protocol == IPPROTO_TCP || protocol == IPPROTO_UDP);
2171 
2172         cksum = ip_input_cksum_pseudo_v4(ipha, ira);
2173         cksum = IP_CSUM(mp, ip_hdr_length, cksum);
2174         if (cksum == 0)
2175                 return (B_TRUE);
2176 
2177         ip_input_cksum_err_v4(protocol, 0, ira->ira_ill);
2178         return (B_FALSE);
2179 }
2180 
2181 /*
2182  * Verify the ULP checksums.
2183  * Returns B_TRUE if ok, or if the ULP doesn't have a well-defined checksum
2184  * algorithm.
2185  * Increments statistics if failed.
2186  */
2187 static boolean_t
2188 ip_input_cksum_v4(iaflags_t iraflags, mblk_t *mp, ipha_t *ipha,
2189     ip_recv_attr_t *ira)
2190 {
2191         ill_t           *ill = ira->ira_rill;
2192         uint16_t        hck_flags;
2193         uint32_t        cksum;
2194         mblk_t          *mp1;
2195         int32_t         len;
2196         uint8_t         protocol = ira->ira_protocol;
2197         uint16_t        ip_hdr_length = ira->ira_ip_hdr_length;
2198 
2199 
2200         switch (protocol) {
2201         case IPPROTO_TCP:
2202                 break;
2203 
2204         case IPPROTO_UDP: {
2205                 udpha_t         *udpha;
2206 
2207                 udpha = (udpha_t  *)((uchar_t *)ipha + ip_hdr_length);
2208                 if (udpha->uha_checksum == 0) {
2209                         /* Packet doesn't have a UDP checksum */
2210                         return (B_TRUE);
2211                 }
2212                 break;
2213         }
2214         case IPPROTO_SCTP: {
2215                 sctp_hdr_t      *sctph;
2216                 uint32_t        pktsum;
2217 
2218                 sctph = (sctp_hdr_t *)((uchar_t *)ipha + ip_hdr_length);
2219 #ifdef  DEBUG
2220                 if (skip_sctp_cksum)
2221                         return (B_TRUE);
2222 #endif
2223                 pktsum = sctph->sh_chksum;
2224                 sctph->sh_chksum = 0;
2225                 cksum = sctp_cksum(mp, ip_hdr_length);
2226                 sctph->sh_chksum = pktsum;
2227                 if (cksum == pktsum)
2228                         return (B_TRUE);
2229 
2230                 /*
2231                  * Defer until later whether a bad checksum is ok
2232                  * in order to allow RAW sockets to use Adler checksum
2233                  * with SCTP.
2234                  */
2235                 ira->ira_flags |= IRAF_SCTP_CSUM_ERR;
2236                 return (B_TRUE);
2237         }
2238 
2239         default:
2240                 /* No ULP checksum to verify. */
2241                 return (B_TRUE);
2242         }
2243         /*
2244          * Revert to software checksum calculation if the interface
2245          * isn't capable of checksum offload.
2246          * We clear DB_CKSUMFLAGS when going through IPsec in ip_fanout.
2247          * Note: IRAF_NO_HW_CKSUM is not currently used.
2248          */
2249         ASSERT(!IS_IPMP(ill));
2250         if ((iraflags & IRAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) ||
2251             !dohwcksum) {
2252                 return (ip_input_sw_cksum_v4(mp, ipha, ira));
2253         }
2254 
2255         /*
2256          * We apply this for all ULP protocols. Does the HW know to
2257          * not set the flags for SCTP and other protocols.
2258          */
2259 
2260         hck_flags = DB_CKSUMFLAGS(mp);
2261 
2262         if (hck_flags & HCK_FULLCKSUM_OK) {
2263                 /*
2264                  * Hardware has already verified the checksum.
2265                  */
2266                 return (B_TRUE);
2267         }
2268 
2269         if (hck_flags & HCK_FULLCKSUM) {
2270                 /*
2271                  * Full checksum has been computed by the hardware
2272                  * and has been attached.  If the driver wants us to
2273                  * verify the correctness of the attached value, in
2274                  * order to protect against faulty hardware, compare
2275                  * it against -0 (0xFFFF) to see if it's valid.
2276                  */
2277                 cksum = DB_CKSUM16(mp);
2278                 if (cksum == 0xFFFF)
2279                         return (B_TRUE);
2280                 ip_input_cksum_err_v4(protocol, hck_flags, ira->ira_ill);
2281                 return (B_FALSE);
2282         }
2283 
2284         mp1 = mp->b_cont;
2285         if ((hck_flags & HCK_PARTIALCKSUM) &&
2286             (mp1 == NULL || mp1->b_cont == NULL) &&
2287             ip_hdr_length >= DB_CKSUMSTART(mp) &&
2288             ((len = ip_hdr_length - DB_CKSUMSTART(mp)) & 1) == 0) {
2289                 uint32_t        adj;
2290                 uchar_t         *cksum_start;
2291 
2292                 cksum = ip_input_cksum_pseudo_v4(ipha, ira);
2293 
2294                 cksum_start = ((uchar_t *)ipha + DB_CKSUMSTART(mp));
2295 
2296                 /*
2297                  * Partial checksum has been calculated by hardware
2298                  * and attached to the packet; in addition, any
2299                  * prepended extraneous data is even byte aligned,
2300                  * and there are at most two mblks associated with
2301                  * the packet.  If any such data exists, we adjust
2302                  * the checksum; also take care any postpended data.
2303                  */
2304                 IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, len, adj);
2305                 /*
2306                  * One's complement subtract extraneous checksum
2307                  */
2308                 cksum += DB_CKSUM16(mp);
2309                 if (adj >= cksum)
2310                         cksum = ~(adj - cksum) & 0xFFFF;
2311                 else
2312                         cksum -= adj;
2313                 cksum = (cksum & 0xFFFF) + ((int)cksum >> 16);
2314                 cksum = (cksum & 0xFFFF) + ((int)cksum >> 16);
2315                 if (!(~cksum & 0xFFFF))
2316                         return (B_TRUE);
2317 
2318                 ip_input_cksum_err_v4(protocol, hck_flags, ira->ira_ill);
2319                 return (B_FALSE);
2320         }
2321         return (ip_input_sw_cksum_v4(mp, ipha, ira));
2322 }
2323 
2324 
2325 /*
2326  * Handle fanout of received packets.
2327  * Unicast packets that are looped back (from ire_send_local_v4) and packets
2328  * from the wire are differentiated by checking IRAF_VERIFY_ULP_CKSUM.
2329  *
2330  * IPQoS Notes
2331  * Before sending it to the client, invoke IPPF processing. Policy processing
2332  * takes place only if the callout_position, IPP_LOCAL_IN, is enabled.
2333  */
2334 void
2335 ip_fanout_v4(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
2336 {
2337         ill_t           *ill = ira->ira_ill;
2338         iaflags_t       iraflags = ira->ira_flags;
2339         ip_stack_t      *ipst = ill->ill_ipst;
2340         uint8_t         protocol = ipha->ipha_protocol;
2341         conn_t          *connp;
2342 #define rptr    ((uchar_t *)ipha)
2343         uint_t          ip_hdr_length;
2344         uint_t          min_ulp_header_length;
2345         int             offset;
2346         ssize_t         len;
2347         netstack_t      *ns = ipst->ips_netstack;
2348         ipsec_stack_t   *ipss = ns->netstack_ipsec;
2349         ill_t           *rill = ira->ira_rill;
2350 
2351         ASSERT(ira->ira_pktlen == ntohs(ipha->ipha_length));
2352 
2353         ip_hdr_length = ira->ira_ip_hdr_length;
2354         ira->ira_protocol = protocol;
2355 
2356         /*
2357          * Time for IPP once we've done reassembly and IPsec.
2358          * We skip this for loopback packets since we don't do IPQoS
2359          * on loopback.
2360          */
2361         if (IPP_ENABLED(IPP_LOCAL_IN, ipst) &&
2362             !(iraflags & IRAF_LOOPBACK) &&
2363             (protocol != IPPROTO_ESP || protocol != IPPROTO_AH)) {
2364                 /*
2365                  * Use the interface on which the packet arrived - not where
2366                  * the IP address is hosted.
2367                  */
2368                 /* ip_process translates an IS_UNDER_IPMP */
2369                 mp = ip_process(IPP_LOCAL_IN, mp, rill, ill);
2370                 if (mp == NULL) {
2371                         /* ip_drop_packet and MIB done */
2372                         return;
2373                 }
2374         }
2375 
2376         /* Determine the minimum required size of the upper-layer header */
2377         /* Need to do this for at least the set of ULPs that TX handles. */
2378         switch (protocol) {
2379         case IPPROTO_TCP:
2380                 min_ulp_header_length = TCP_MIN_HEADER_LENGTH;
2381                 break;
2382         case IPPROTO_SCTP:
2383                 min_ulp_header_length = SCTP_COMMON_HDR_LENGTH;
2384                 break;
2385         case IPPROTO_UDP:
2386                 min_ulp_header_length = UDPH_SIZE;
2387                 break;
2388         case IPPROTO_ICMP:
2389                 min_ulp_header_length = ICMPH_SIZE;
2390                 break;
2391         default:
2392                 min_ulp_header_length = 0;
2393                 break;
2394         }
2395         /* Make sure we have the min ULP header length */
2396         len = mp->b_wptr - rptr;
2397         if (len < ip_hdr_length + min_ulp_header_length) {
2398                 if (ira->ira_pktlen < ip_hdr_length + min_ulp_header_length) {
2399                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
2400                         ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
2401                         freemsg(mp);
2402                         return;
2403                 }
2404                 IP_STAT(ipst, ip_recv_pullup);
2405                 ipha = ip_pullup(mp, ip_hdr_length + min_ulp_header_length,
2406                     ira);
2407                 if (ipha == NULL)
2408                         goto discard;
2409                 len = mp->b_wptr - rptr;
2410         }
2411 
2412         /*
2413          * If trusted extensions then determine the zoneid and TX specific
2414          * ira_flags.
2415          */
2416         if (iraflags & IRAF_SYSTEM_LABELED) {
2417                 /* This can update ira->ira_flags and ira->ira_zoneid */
2418                 ip_fanout_tx_v4(mp, ipha, protocol, ip_hdr_length, ira);
2419                 iraflags = ira->ira_flags;
2420         }
2421 
2422 
2423         /* Verify ULP checksum. Handles TCP, UDP, and SCTP */
2424         if (iraflags & IRAF_VERIFY_ULP_CKSUM) {
2425                 if (!ip_input_cksum_v4(iraflags, mp, ipha, ira)) {
2426                         /* Bad checksum. Stats are already incremented */
2427                         ip_drop_input("Bad ULP checksum", mp, ill);
2428                         freemsg(mp);
2429                         return;
2430                 }
2431                 /* IRAF_SCTP_CSUM_ERR could have been set */
2432                 iraflags = ira->ira_flags;
2433         }
2434         switch (protocol) {
2435         case IPPROTO_TCP:
2436                 /* For TCP, discard broadcast and multicast packets. */
2437                 if (iraflags & IRAF_MULTIBROADCAST)
2438                         goto discard;
2439 
2440                 /* First mblk contains IP+TCP headers per above check */
2441                 ASSERT(len >= ip_hdr_length + TCP_MIN_HEADER_LENGTH);
2442 
2443                 /* TCP options present? */
2444                 offset = ((uchar_t *)ipha)[ip_hdr_length + 12] >> 4;
2445                 if (offset != 5) {
2446                         if (offset < 5)
2447                                 goto discard;
2448 
2449                         /*
2450                          * There must be TCP options.
2451                          * Make sure we can grab them.
2452                          */
2453                         offset <<= 2;
2454                         offset += ip_hdr_length;
2455                         if (len < offset) {
2456                                 if (ira->ira_pktlen < offset) {
2457                                         BUMP_MIB(ill->ill_ip_mib,
2458                                             ipIfStatsInTruncatedPkts);
2459                                         ip_drop_input(
2460                                             "ipIfStatsInTruncatedPkts",
2461                                             mp, ill);
2462                                         freemsg(mp);
2463                                         return;
2464                                 }
2465                                 IP_STAT(ipst, ip_recv_pullup);
2466                                 ipha = ip_pullup(mp, offset, ira);
2467                                 if (ipha == NULL)
2468                                         goto discard;
2469                                 len = mp->b_wptr - rptr;
2470                         }
2471                 }
2472 
2473                 /*
2474                  * Pass up a squeue hint to tcp.
2475                  * If ira_sqp is already set (this is loopback) we leave it
2476                  * alone.
2477                  */
2478                 if (ira->ira_sqp == NULL) {
2479                         ira->ira_sqp = ip_squeue_get(ira->ira_ring);
2480                 }
2481 
2482                 /* Look for AF_INET or AF_INET6 that matches */
2483                 connp = ipcl_classify_v4(mp, IPPROTO_TCP, ip_hdr_length,
2484                     ira, ipst);
2485                 if (connp == NULL) {
2486                         /* Send the TH_RST */
2487                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2488                         tcp_xmit_listeners_reset(mp, ira, ipst, NULL);
2489                         return;
2490                 }
2491                 if (connp->conn_incoming_ifindex != 0 &&
2492                     connp->conn_incoming_ifindex != ira->ira_ruifindex) {
2493                         CONN_DEC_REF(connp);
2494 
2495                         /* Send the TH_RST */
2496                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2497                         tcp_xmit_listeners_reset(mp, ira, ipst, NULL);
2498                         return;
2499                 }
2500                 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) ||
2501                     (iraflags & IRAF_IPSEC_SECURE)) {
2502                         mp = ipsec_check_inbound_policy(mp, connp,
2503                             ipha, NULL, ira);
2504                         if (mp == NULL) {
2505                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2506                                 /* Note that mp is NULL */
2507                                 ip_drop_input("ipIfStatsInDiscards", mp, ill);
2508                                 CONN_DEC_REF(connp);
2509                                 return;
2510                         }
2511                 }
2512                 /* Found a client; up it goes */
2513                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2514                 ira->ira_ill = ira->ira_rill = NULL;
2515                 if (!IPCL_IS_TCP(connp)) {
2516                         /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */
2517                         (connp->conn_recv)(connp, mp, NULL, ira);
2518                         CONN_DEC_REF(connp);
2519                         ira->ira_ill = ill;
2520                         ira->ira_rill = rill;
2521                         return;
2522                 }
2523 
2524                 /*
2525                  * We do different processing whether called from
2526                  * ip_accept_tcp and we match the target, don't match
2527                  * the target, and when we are called by ip_input.
2528                  */
2529                 if (iraflags & IRAF_TARGET_SQP) {
2530                         if (ira->ira_target_sqp == connp->conn_sqp) {
2531                                 mblk_t  *attrmp;
2532 
2533                                 attrmp = ip_recv_attr_to_mblk(ira);
2534                                 if (attrmp == NULL) {
2535                                         BUMP_MIB(ill->ill_ip_mib,
2536                                             ipIfStatsInDiscards);
2537                                         ip_drop_input("ipIfStatsInDiscards",
2538                                             mp, ill);
2539                                         freemsg(mp);
2540                                         CONN_DEC_REF(connp);
2541                                 } else {
2542                                         SET_SQUEUE(attrmp, connp->conn_recv,
2543                                             connp);
2544                                         attrmp->b_cont = mp;
2545                                         ASSERT(ira->ira_target_sqp_mp == NULL);
2546                                         ira->ira_target_sqp_mp = attrmp;
2547                                         /*
2548                                          * Conn ref release when drained from
2549                                          * the squeue.
2550                                          */
2551                                 }
2552                         } else {
2553                                 SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
2554                                     connp->conn_recv, connp, ira, SQ_FILL,
2555                                     SQTAG_IP_TCP_INPUT);
2556                         }
2557                 } else {
2558                         SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv,
2559                             connp, ira, ip_squeue_flag, SQTAG_IP_TCP_INPUT);
2560                 }
2561                 ira->ira_ill = ill;
2562                 ira->ira_rill = rill;
2563                 return;
2564 
2565         case IPPROTO_SCTP: {
2566                 sctp_hdr_t      *sctph;
2567                 in6_addr_t      map_src, map_dst;
2568                 uint32_t        ports;  /* Source and destination ports */
2569                 sctp_stack_t    *sctps = ipst->ips_netstack->netstack_sctp;
2570 
2571                 /* For SCTP, discard broadcast and multicast packets. */
2572                 if (iraflags & IRAF_MULTIBROADCAST)
2573                         goto discard;
2574 
2575                 /*
2576                  * Since there is no SCTP h/w cksum support yet, just
2577                  * clear the flag.
2578                  */
2579                 DB_CKSUMFLAGS(mp) = 0;
2580 
2581                 /* Length ensured above */
2582                 ASSERT(MBLKL(mp) >= ip_hdr_length + SCTP_COMMON_HDR_LENGTH);
2583                 sctph = (sctp_hdr_t *)(rptr + ip_hdr_length);
2584 
2585                 /* get the ports */
2586                 ports = *(uint32_t *)&sctph->sh_sport;
2587 
2588                 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &map_dst);
2589                 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_src);
2590                 if (iraflags & IRAF_SCTP_CSUM_ERR) {
2591                         /*
2592                          * No potential sctp checksum errors go to the Sun
2593                          * sctp stack however they might be Adler-32 summed
2594                          * packets a userland stack bound to a raw IP socket
2595                          * could reasonably use. Note though that Adler-32 is
2596                          * a long deprecated algorithm and customer sctp
2597                          * networks should eventually migrate to CRC-32 at
2598                          * which time this facility should be removed.
2599                          */
2600                         ip_fanout_sctp_raw(mp, ipha, NULL, ports, ira);
2601                         return;
2602                 }
2603                 connp = sctp_fanout(&map_src, &map_dst, ports, ira, mp,
2604                     sctps, sctph);
2605                 if (connp == NULL) {
2606                         /* Check for raw socket or OOTB handling */
2607                         ip_fanout_sctp_raw(mp, ipha, NULL, ports, ira);
2608                         return;
2609                 }
2610                 if (connp->conn_incoming_ifindex != 0 &&
2611                     connp->conn_incoming_ifindex != ira->ira_ruifindex) {
2612                         CONN_DEC_REF(connp);
2613                         /* Check for raw socket or OOTB handling */
2614                         ip_fanout_sctp_raw(mp, ipha, NULL, ports, ira);
2615                         return;
2616                 }
2617 
2618                 /* Found a client; up it goes */
2619                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2620                 sctp_input(connp, ipha, NULL, mp, ira);
2621                 /* sctp_input does a rele of the sctp_t */
2622                 return;
2623         }
2624 
2625         case IPPROTO_UDP:
2626                 /* First mblk contains IP+UDP headers as checked above */
2627                 ASSERT(MBLKL(mp) >= ip_hdr_length + UDPH_SIZE);
2628 
2629                 if (iraflags & IRAF_MULTIBROADCAST) {
2630                         uint16_t *up;   /* Pointer to ports in ULP header */
2631 
2632                         up = (uint16_t *)((uchar_t *)ipha + ip_hdr_length);
2633                         ip_fanout_udp_multi_v4(mp, ipha, up[1], up[0], ira);
2634                         return;
2635                 }
2636 
2637                 /* Look for AF_INET or AF_INET6 that matches */
2638                 connp = ipcl_classify_v4(mp, IPPROTO_UDP, ip_hdr_length,
2639                     ira, ipst);
2640                 if (connp == NULL) {
2641         no_udp_match:
2642                         if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_UDP].
2643                             connf_head != NULL) {
2644                                 ASSERT(ira->ira_protocol == IPPROTO_UDP);
2645                                 ip_fanout_proto_v4(mp, ipha, ira);
2646                         } else {
2647                                 ip_fanout_send_icmp_v4(mp,
2648                                     ICMP_DEST_UNREACHABLE,
2649                                     ICMP_PORT_UNREACHABLE, ira);
2650                         }
2651                         return;
2652 
2653                 }
2654                 if (connp->conn_incoming_ifindex != 0 &&
2655                     connp->conn_incoming_ifindex != ira->ira_ruifindex) {
2656                         CONN_DEC_REF(connp);
2657                         goto no_udp_match;
2658                 }
2659                 if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld :
2660                     !canputnext(connp->conn_rq)) {
2661                         CONN_DEC_REF(connp);
2662                         BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows);
2663                         ip_drop_input("udpIfStatsInOverflows", mp, ill);
2664                         freemsg(mp);
2665                         return;
2666                 }
2667                 if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) ||
2668                     (iraflags & IRAF_IPSEC_SECURE)) {
2669                         mp = ipsec_check_inbound_policy(mp, connp,
2670                             ipha, NULL, ira);
2671                         if (mp == NULL) {
2672                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2673                                 /* Note that mp is NULL */
2674                                 ip_drop_input("ipIfStatsInDiscards", mp, ill);
2675                                 CONN_DEC_REF(connp);
2676                                 return;
2677                         }
2678                 }
2679                 /*
2680                  * Remove 0-spi if it's 0, or move everything behind
2681                  * the UDP header over it and forward to ESP via
2682                  * ip_fanout_v4().
2683                  */
2684                 if (connp->conn_udp->udp_nat_t_endpoint) {
2685                         if (iraflags & IRAF_IPSEC_SECURE) {
2686                                 ip_drop_packet(mp, B_TRUE, ira->ira_ill,
2687                                     DROPPER(ipss, ipds_esp_nat_t_ipsec),
2688                                     &ipss->ipsec_dropper);
2689                                 CONN_DEC_REF(connp);
2690                                 return;
2691                         }
2692 
2693                         mp = zero_spi_check(mp, ira);
2694                         if (mp == NULL) {
2695                                 /*
2696                                  * Packet was consumed - probably sent to
2697                                  * ip_fanout_v4.
2698                                  */
2699                                 CONN_DEC_REF(connp);
2700                                 return;
2701                         }
2702                         /* Else continue like a normal UDP packet. */
2703                         ipha = (ipha_t *)mp->b_rptr;
2704                         protocol = ipha->ipha_protocol;
2705                         ira->ira_protocol = protocol;
2706                 }
2707                 /* Found a client; up it goes */
2708                 IP_STAT(ipst, ip_udp_fannorm);
2709                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2710                 ira->ira_ill = ira->ira_rill = NULL;
2711                 (connp->conn_recv)(connp, mp, NULL, ira);
2712                 CONN_DEC_REF(connp);
2713                 ira->ira_ill = ill;
2714                 ira->ira_rill = rill;
2715                 return;
2716         default:
2717                 break;
2718         }
2719 
2720         /*
2721          * Clear hardware checksumming flag as it is currently only
2722          * used by TCP and UDP.
2723          */
2724         DB_CKSUMFLAGS(mp) = 0;
2725 
2726         switch (protocol) {
2727         case IPPROTO_ICMP:
2728                 /*
2729                  * We need to accomodate icmp messages coming in clear
2730                  * until we get everything secure from the wire. If
2731                  * icmp_accept_clear_messages is zero we check with
2732                  * the global policy and act accordingly. If it is
2733                  * non-zero, we accept the message without any checks.
2734                  * But *this does not mean* that this will be delivered
2735                  * to RAW socket clients. By accepting we might send
2736                  * replies back, change our MTU value etc.,
2737                  * but delivery to the ULP/clients depends on their
2738                  * policy dispositions.
2739                  */
2740                 if (ipst->ips_icmp_accept_clear_messages == 0) {
2741                         mp = ipsec_check_global_policy(mp, NULL,
2742                             ipha, NULL, ira, ns);
2743                         if (mp == NULL)
2744                                 return;
2745                 }
2746 
2747                 /*
2748                  * On a labeled system, we have to check whether the zone
2749                  * itself is permitted to receive raw traffic.
2750                  */
2751                 if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
2752                         if (!tsol_can_accept_raw(mp, ira, B_FALSE)) {
2753                                 BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
2754                                 ip_drop_input("tsol_can_accept_raw", mp, ill);
2755                                 freemsg(mp);
2756                                 return;
2757                         }
2758                 }
2759 
2760                 /*
2761                  * ICMP header checksum, including checksum field,
2762                  * should be zero.
2763                  */
2764                 if (IP_CSUM(mp, ip_hdr_length, 0)) {
2765                         BUMP_MIB(&ipst->ips_icmp_mib, icmpInCksumErrs);
2766                         ip_drop_input("icmpInCksumErrs", mp, ill);
2767                         freemsg(mp);
2768                         return;
2769                 }
2770                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2771                 mp = icmp_inbound_v4(mp, ira);
2772                 if (mp == NULL) {
2773                         /* No need to pass to RAW sockets */
2774                         return;
2775                 }
2776                 break;
2777 
2778         case IPPROTO_IGMP:
2779                 /*
2780                  * If we are not willing to accept IGMP packets in clear,
2781                  * then check with global policy.
2782                  */
2783                 if (ipst->ips_igmp_accept_clear_messages == 0) {
2784                         mp = ipsec_check_global_policy(mp, NULL,
2785                             ipha, NULL, ira, ns);
2786                         if (mp == NULL)
2787                                 return;
2788                 }
2789                 if ((ira->ira_flags & IRAF_SYSTEM_LABELED) &&
2790                     !tsol_can_accept_raw(mp, ira, B_TRUE)) {
2791                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2792                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
2793                         freemsg(mp);
2794                         return;
2795                 }
2796                 /*
2797                  * Validate checksum
2798                  */
2799                 if (IP_CSUM(mp, ip_hdr_length, 0)) {
2800                         ++ipst->ips_igmpstat.igps_rcv_badsum;
2801                         ip_drop_input("igps_rcv_badsum", mp, ill);
2802                         freemsg(mp);
2803                         return;
2804                 }
2805 
2806                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2807                 mp = igmp_input(mp, ira);
2808                 if (mp == NULL) {
2809                         /* Bad packet - discarded by igmp_input */
2810                         return;
2811                 }
2812                 break;
2813         case IPPROTO_PIM:
2814                 /*
2815                  * If we are not willing to accept PIM packets in clear,
2816                  * then check with global policy.
2817                  */
2818                 if (ipst->ips_pim_accept_clear_messages == 0) {
2819                         mp = ipsec_check_global_policy(mp, NULL,
2820                             ipha, NULL, ira, ns);
2821                         if (mp == NULL)
2822                                 return;
2823                 }
2824                 if ((ira->ira_flags & IRAF_SYSTEM_LABELED) &&
2825                     !tsol_can_accept_raw(mp, ira, B_TRUE)) {
2826                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2827                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
2828                         freemsg(mp);
2829                         return;
2830                 }
2831                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2832 
2833                 /* Checksum is verified in pim_input */
2834                 mp = pim_input(mp, ira);
2835                 if (mp == NULL) {
2836                         /* Bad packet - discarded by pim_input */
2837                         return;
2838                 }
2839                 break;
2840         case IPPROTO_AH:
2841         case IPPROTO_ESP: {
2842                 /*
2843                  * Fast path for AH/ESP.
2844                  */
2845                 netstack_t *ns = ipst->ips_netstack;
2846                 ipsec_stack_t *ipss = ns->netstack_ipsec;
2847 
2848                 IP_STAT(ipst, ipsec_proto_ahesp);
2849 
2850                 if (!ipsec_loaded(ipss)) {
2851                         ip_proto_not_sup(mp, ira);
2852                         return;
2853                 }
2854 
2855                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
2856                 /* select inbound SA and have IPsec process the pkt */
2857                 if (protocol == IPPROTO_ESP) {
2858                         esph_t *esph;
2859                         boolean_t esp_in_udp_sa;
2860                         boolean_t esp_in_udp_packet;
2861 
2862                         mp = ipsec_inbound_esp_sa(mp, ira, &esph);
2863                         if (mp == NULL)
2864                                 return;
2865 
2866                         ASSERT(esph != NULL);
2867                         ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
2868                         ASSERT(ira->ira_ipsec_esp_sa != NULL);
2869                         ASSERT(ira->ira_ipsec_esp_sa->ipsa_input_func != NULL);
2870 
2871                         esp_in_udp_sa = ((ira->ira_ipsec_esp_sa->ipsa_flags &
2872                             IPSA_F_NATT) != 0);
2873                         esp_in_udp_packet =
2874                             (ira->ira_flags & IRAF_ESP_UDP_PORTS) != 0;
2875 
2876                         /*
2877                          * The following is a fancy, but quick, way of saying:
2878                          * ESP-in-UDP SA and Raw ESP packet --> drop
2879                          *    OR
2880                          * ESP SA and ESP-in-UDP packet --> drop
2881                          */
2882                         if (esp_in_udp_sa != esp_in_udp_packet) {
2883                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2884                                 ip_drop_packet(mp, B_TRUE, ira->ira_ill,
2885                                     DROPPER(ipss, ipds_esp_no_sa),
2886                                     &ipss->ipsec_dropper);
2887                                 return;
2888                         }
2889                         mp = ira->ira_ipsec_esp_sa->ipsa_input_func(mp, esph,
2890                             ira);
2891                 } else {
2892                         ah_t *ah;
2893 
2894                         mp = ipsec_inbound_ah_sa(mp, ira, &ah);
2895                         if (mp == NULL)
2896                                 return;
2897 
2898                         ASSERT(ah != NULL);
2899                         ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
2900                         ASSERT(ira->ira_ipsec_ah_sa != NULL);
2901                         ASSERT(ira->ira_ipsec_ah_sa->ipsa_input_func != NULL);
2902                         mp = ira->ira_ipsec_ah_sa->ipsa_input_func(mp, ah,
2903                             ira);
2904                 }
2905 
2906                 if (mp == NULL) {
2907                         /*
2908                          * Either it failed or is pending. In the former case
2909                          * ipIfStatsInDiscards was increased.
2910                          */
2911                         return;
2912                 }
2913                 /* we're done with IPsec processing, send it up */
2914                 ip_input_post_ipsec(mp, ira);
2915                 return;
2916         }
2917         case IPPROTO_ENCAP: {
2918                 ipha_t          *inner_ipha;
2919 
2920                 /*
2921                  * Handle self-encapsulated packets (IP-in-IP where
2922                  * the inner addresses == the outer addresses).
2923                  */
2924                 if ((uchar_t *)ipha + ip_hdr_length + sizeof (ipha_t) >
2925                     mp->b_wptr) {
2926                         if (ira->ira_pktlen <
2927                             ip_hdr_length + sizeof (ipha_t)) {
2928                                 BUMP_MIB(ill->ill_ip_mib,
2929                                     ipIfStatsInTruncatedPkts);
2930                                 ip_drop_input("ipIfStatsInTruncatedPkts",
2931                                     mp, ill);
2932                                 freemsg(mp);
2933                                 return;
2934                         }
2935                         ipha = ip_pullup(mp, (uchar_t *)ipha + ip_hdr_length +
2936                             sizeof (ipha_t) - mp->b_rptr, ira);
2937                         if (ipha == NULL) {
2938                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2939                                 ip_drop_input("ipIfStatsInDiscards", mp, ill);
2940                                 freemsg(mp);
2941                                 return;
2942                         }
2943                 }
2944                 inner_ipha = (ipha_t *)((uchar_t *)ipha + ip_hdr_length);
2945                 /*
2946                  * Check the sanity of the inner IP header.
2947                  */
2948                 if ((IPH_HDR_VERSION(inner_ipha) != IPV4_VERSION)) {
2949                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2950                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
2951                         freemsg(mp);
2952                         return;
2953                 }
2954                 if (IPH_HDR_LENGTH(inner_ipha) < sizeof (ipha_t)) {
2955                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2956                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
2957                         freemsg(mp);
2958                         return;
2959                 }
2960                 if (inner_ipha->ipha_src != ipha->ipha_src ||
2961                     inner_ipha->ipha_dst != ipha->ipha_dst) {
2962                         /* We fallthru to iptun fanout below */
2963                         goto iptun;
2964                 }
2965 
2966                 /*
2967                  * Self-encapsulated tunnel packet. Remove
2968                  * the outer IP header and fanout again.
2969                  * We also need to make sure that the inner
2970                  * header is pulled up until options.
2971                  */
2972                 mp->b_rptr = (uchar_t *)inner_ipha;
2973                 ipha = inner_ipha;
2974                 ip_hdr_length = IPH_HDR_LENGTH(ipha);
2975                 if ((uchar_t *)ipha + ip_hdr_length > mp->b_wptr) {
2976                         if (ira->ira_pktlen <
2977                             (uchar_t *)ipha + ip_hdr_length - mp->b_rptr) {
2978                                 BUMP_MIB(ill->ill_ip_mib,
2979                                     ipIfStatsInTruncatedPkts);
2980                                 ip_drop_input("ipIfStatsInTruncatedPkts",
2981                                     mp, ill);
2982                                 freemsg(mp);
2983                                 return;
2984                         }
2985                         ipha = ip_pullup(mp,
2986                             (uchar_t *)ipha + ip_hdr_length - mp->b_rptr, ira);
2987                         if (ipha == NULL) {
2988                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2989                                 ip_drop_input("ipIfStatsInDiscards", mp, ill);
2990                                 freemsg(mp);
2991                                 return;
2992                         }
2993                 }
2994                 if (ip_hdr_length > sizeof (ipha_t)) {
2995                         /* We got options on the inner packet. */
2996                         ipaddr_t        dst = ipha->ipha_dst;
2997                         int             error = 0;
2998 
2999                         dst = ip_input_options(ipha, dst, mp, ira, &error);
3000                         if (error != 0) {
3001                                 /*
3002                                  * An ICMP error has been sent and the packet
3003                                  * has been dropped.
3004                                  */
3005                                 return;
3006                         }
3007                         if (dst != ipha->ipha_dst) {
3008                                 /*
3009                                  * Someone put a source-route in
3010                                  * the inside header of a self-
3011                                  * encapsulated packet.  Drop it
3012                                  * with extreme prejudice and let
3013                                  * the sender know.
3014                                  */
3015                                 ip_drop_input("ICMP_SOURCE_ROUTE_FAILED",
3016                                     mp, ill);
3017                                 icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED,
3018                                     ira);
3019                                 return;
3020                         }
3021                 }
3022                 if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
3023                         /*
3024                          * This means that somebody is sending
3025                          * Self-encapsulated packets without AH/ESP.
3026                          *
3027                          * Send this packet to find a tunnel endpoint.
3028                          * if I can't find one, an ICMP
3029                          * PROTOCOL_UNREACHABLE will get sent.
3030                          */
3031                         protocol = ipha->ipha_protocol;
3032                         ira->ira_protocol = protocol;
3033                         goto iptun;
3034                 }
3035 
3036                 /* Update based on removed IP header */
3037                 ira->ira_ip_hdr_length = ip_hdr_length;
3038                 ira->ira_pktlen = ntohs(ipha->ipha_length);
3039 
3040                 if (ira->ira_flags & IRAF_IPSEC_DECAPS) {
3041                         /*
3042                          * This packet is self-encapsulated multiple
3043                          * times. We don't want to recurse infinitely.
3044                          * To keep it simple, drop the packet.
3045                          */
3046                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
3047                         ip_drop_input("ipIfStatsInDiscards", mp, ill);
3048                         freemsg(mp);
3049                         return;
3050                 }
3051                 ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
3052                 ira->ira_flags |= IRAF_IPSEC_DECAPS;
3053 
3054                 ip_input_post_ipsec(mp, ira);
3055                 return;
3056         }
3057 
3058         iptun:  /* IPPROTO_ENCAPS that is not self-encapsulated */
3059         case IPPROTO_IPV6:
3060                 /* iptun will verify trusted label */
3061                 connp = ipcl_classify_v4(mp, protocol, ip_hdr_length,
3062                     ira, ipst);
3063                 if (connp != NULL) {
3064                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
3065                         ira->ira_ill = ira->ira_rill = NULL;
3066                         (connp->conn_recv)(connp, mp, NULL, ira);
3067                         CONN_DEC_REF(connp);
3068                         ira->ira_ill = ill;
3069                         ira->ira_rill = rill;
3070                         return;
3071                 }
3072                 /* FALLTHRU */
3073         default:
3074                 /*
3075                  * On a labeled system, we have to check whether the zone
3076                  * itself is permitted to receive raw traffic.
3077                  */
3078                 if (ira->ira_flags & IRAF_SYSTEM_LABELED) {
3079                         if (!tsol_can_accept_raw(mp, ira, B_FALSE)) {
3080                                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
3081                                 ip_drop_input("ipIfStatsInDiscards", mp, ill);
3082                                 freemsg(mp);
3083                                 return;
3084                         }
3085                 }
3086                 break;
3087         }
3088 
3089         /*
3090          * The above input functions may have returned the pulled up message.
3091          * So ipha needs to be reinitialized.
3092          */
3093         ipha = (ipha_t *)mp->b_rptr;
3094         ira->ira_protocol = protocol = ipha->ipha_protocol;
3095         if (ipst->ips_ipcl_proto_fanout_v4[protocol].connf_head == NULL) {
3096                 /*
3097                  * No user-level listener for these packets.
3098                  * Check for IPPROTO_ENCAP...
3099                  */
3100                 if (protocol == IPPROTO_ENCAP && ipst->ips_ip_g_mrouter) {
3101                         /*
3102                          * Check policy here,
3103                          * THEN ship off to ip_mroute_decap().
3104                          *
3105                          * BTW,  If I match a configured IP-in-IP
3106                          * tunnel above, this path will not be reached, and
3107                          * ip_mroute_decap will never be called.
3108                          */
3109                         mp = ipsec_check_global_policy(mp, connp,
3110                             ipha, NULL, ira, ns);
3111                         if (mp != NULL) {
3112                                 ip_mroute_decap(mp, ira);
3113                         } /* Else we already freed everything! */
3114                 } else {
3115                         ip_proto_not_sup(mp, ira);
3116                 }
3117                 return;
3118         }
3119 
3120         /*
3121          * Handle fanout to raw sockets.  There
3122          * can be more than one stream bound to a particular
3123          * protocol.  When this is the case, each one gets a copy
3124          * of any incoming packets.
3125          */
3126         ASSERT(ira->ira_protocol == ipha->ipha_protocol);
3127         ip_fanout_proto_v4(mp, ipha, ira);
3128         return;
3129 
3130 discard:
3131         BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
3132         ip_drop_input("ipIfStatsInDiscards", mp, ill);
3133         freemsg(mp);
3134 #undef rptr
3135 }