ij-tcp-connstat Wdiff usr/src/uts/common/inet/ip/ip.c

Print this page

DLPX-25998 TCP congestion control is inadequate
Reviewed at: http://reviews.delphix.com/r/34808/
DLPX-37540 TCP per-connection kernel statistics DLPX-37544 connstat command to display per-connection TCP statistics

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/inet/ip/ip.c
          +++ new/usr/src/uts/common/inet/ip/ip.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright (c) 1990 Mentat Inc.
  25   25   * Copyright (c) 2017 OmniTI Computer Consulting, Inc. All rights reserved.
  26   26   * Copyright (c) 2016 by Delphix. All rights reserved.
  27   27   * Copyright (c) 2018 Joyent, Inc. All rights reserved.
  28   28   */
  29   29  
  30   30  #include <sys/types.h>
  31   31  #include <sys/stream.h>
  32   32  #include <sys/dlpi.h>
  33   33  #include <sys/stropts.h>
  34   34  #include <sys/sysmacros.h>
  35   35  #include <sys/strsubr.h>
  36   36  #include <sys/strlog.h>
  37   37  #include <sys/strsun.h>
  38   38  #include <sys/zone.h>
  39   39  #define _SUN_TPI_VERSION 2
  40   40  #include <sys/tihdr.h>
  41   41  #include <sys/xti_inet.h>
  42   42  #include <sys/ddi.h>
  43   43  #include <sys/suntpi.h>
  44   44  #include <sys/cmn_err.h>
  45   45  #include <sys/debug.h>
  46   46  #include <sys/kobj.h>
  47   47  #include <sys/modctl.h>
  48   48  #include <sys/atomic.h>
  49   49  #include <sys/policy.h>
  50   50  #include <sys/priv.h>
  51   51  #include <sys/taskq.h>
  52   52  
  53   53  #include <sys/systm.h>
  54   54  #include <sys/param.h>
  55   55  #include <sys/kmem.h>
  56   56  #include <sys/sdt.h>
  57   57  #include <sys/socket.h>
  58   58  #include <sys/vtrace.h>
  59   59  #include <sys/isa_defs.h>
  60   60  #include <sys/mac.h>
  61   61  #include <net/if.h>
  62   62  #include <net/if_arp.h>
  63   63  #include <net/route.h>
  64   64  #include <sys/sockio.h>
  65   65  #include <netinet/in.h>
  66   66  #include <net/if_dl.h>
  67   67  
  68   68  #include <inet/common.h>
  69   69  #include <inet/mi.h>
  70   70  #include <inet/mib2.h>
  71   71  #include <inet/nd.h>
  72   72  #include <inet/arp.h>
  73   73  #include <inet/snmpcom.h>
  74   74  #include <inet/optcom.h>
  75   75  #include <inet/kstatcom.h>
  76   76  
  77   77  #include <netinet/igmp_var.h>
  78   78  #include <netinet/ip6.h>
  79   79  #include <netinet/icmp6.h>
  80   80  #include <netinet/sctp.h>
  81   81  
  82   82  #include <inet/ip.h>
  83   83  #include <inet/ip_impl.h>
  84   84  #include <inet/ip6.h>
  85   85  #include <inet/ip6_asp.h>
  86   86  #include <inet/tcp.h>
  87   87  #include <inet/tcp_impl.h>

↓ open down ↓

87 lines elided

↑ open up ↑

  88   88  #include <inet/ip_multi.h>
  89   89  #include <inet/ip_if.h>
  90   90  #include <inet/ip_ire.h>
  91   91  #include <inet/ip_ftable.h>
  92   92  #include <inet/ip_rts.h>
  93   93  #include <inet/ip_ndp.h>
  94   94  #include <inet/ip_listutils.h>
  95   95  #include <netinet/igmp.h>
  96   96  #include <netinet/ip_mroute.h>
  97   97  #include <inet/ipp_common.h>
       98 +#include <inet/cc.h>
  98   99  
  99  100  #include <net/pfkeyv2.h>
 100  101  #include <inet/sadb.h>
 101  102  #include <inet/ipsec_impl.h>
 102  103  #include <inet/iptun/iptun_impl.h>
 103  104  #include <inet/ipdrop.h>
 104  105  #include <inet/ip_netinfo.h>
 105  106  #include <inet/ilb_ip.h>
 106  107  
 107  108  #include <sys/ethernet.h>

 108  109  #include <net/if_types.h>
 109  110  #include <sys/cpuvar.h>
 110  111  
 111  112  #include <ipp/ipp.h>
 112  113  #include <ipp/ipp_impl.h>
 113  114  #include <ipp/ipgpc/ipgpc.h>
 114  115  
 115  116  #include <sys/pattr.h>
 116  117  #include <inet/ipclassifier.h>
 117  118  #include <inet/sctp_ip.h>
 118  119  #include <inet/sctp/sctp_impl.h>
 119  120  #include <inet/udp_impl.h>
 120  121  #include <inet/rawip_impl.h>
 121  122  #include <inet/rts_impl.h>
 122  123  
 123  124  #include <sys/tsol/label.h>
 124  125  #include <sys/tsol/tnet.h>
 125  126  
 126  127  #include <sys/squeue_impl.h>
 127  128  #include <inet/ip_arp.h>
 128  129  
 129  130  #include <sys/clock_impl.h>     /* For LBOLT_FASTPATH{,64} */
 130  131  
 131  132  /*
 132  133   * Values for squeue switch:
 133  134   * IP_SQUEUE_ENTER_NODRAIN: SQ_NODRAIN
 134  135   * IP_SQUEUE_ENTER: SQ_PROCESS
 135  136   * IP_SQUEUE_FILL: SQ_FILL
 136  137   */
 137  138  int ip_squeue_enter = IP_SQUEUE_ENTER;  /* Settable in /etc/system */
 138  139  
 139  140  int ip_squeue_flag;
 140  141  
 141  142  /*
 142  143   * Settable in /etc/system
 143  144   */
 144  145  int ip_poll_normal_ms = 100;
 145  146  int ip_poll_normal_ticks = 0;
 146  147  int ip_modclose_ackwait_ms = 3000;
 147  148  
 148  149  /*
 149  150   * It would be nice to have these present only in DEBUG systems, but the
 150  151   * current design of the global symbol checking logic requires them to be
 151  152   * unconditionally present.
 152  153   */
 153  154  uint_t ip_thread_data;                  /* TSD key for debug support */
 154  155  krwlock_t ip_thread_rwlock;
 155  156  list_t  ip_thread_list;
 156  157  
 157  158  /*
 158  159   * Head and tail of a linked list of mblk_t's. Used by ip_snmp_ functions.
 159  160   */
 160  161  
 161  162  struct listptr_s {
 162  163          mblk_t  *lp_head;       /* pointer to the head of the list */
 163  164          mblk_t  *lp_tail;       /* pointer to the tail of the list */
 164  165  };
 165  166  
 166  167  typedef struct listptr_s listptr_t;
 167  168  
 168  169  /*
 169  170   * This is used by ip_snmp_get_mib2_ip_route_media and
 170  171   * ip_snmp_get_mib2_ip6_route_media to carry the lists of return data.
 171  172   */
 172  173  typedef struct iproutedata_s {
 173  174          uint_t          ird_idx;
 174  175          uint_t          ird_flags;      /* IRD_* flags; see below */
 175  176          listptr_t       ird_route;      /* ipRouteEntryTable */
 176  177          listptr_t       ird_netmedia;   /* ipNetToMediaEntryTable */
 177  178          listptr_t       ird_attrs;      /* ipRouteAttributeTable */
 178  179  } iproutedata_t;
 179  180  
 180  181  /* Include ire_testhidden and IRE_IF_CLONE routes */
 181  182  #define IRD_REPORT_ALL  0x01
 182  183  
 183  184  /*
 184  185   * Cluster specific hooks. These should be NULL when booted non-clustered.
 185  186   */
 186  187  
 187  188  /*
 188  189   * Hook functions to enable cluster networking
 189  190   * On non-clustered systems these vectors must always be NULL.
 190  191   *
 191  192   * Hook function to check whether the specified IP address is a shared IP
 192  193   * address in the cluster
 193  194   *
 194  195   */
 195  196  int (*cl_inet_isclusterwide)(netstackid_t stack_id, uint8_t protocol,
 196  197      sa_family_t addr_family, uint8_t *laddrp, void *args) = NULL;
 197  198  
 198  199  /*
 199  200   * Hook function to generate cluster wide ip fragment identifier
 200  201   */
 201  202  uint32_t (*cl_inet_ipident)(netstackid_t stack_id, uint8_t protocol,
 202  203      sa_family_t addr_family, uint8_t *laddrp, uint8_t *faddrp,
 203  204      void *args) = NULL;
 204  205  
 205  206  /*
 206  207   * Hook function to generate cluster wide SPI.
 207  208   */
 208  209  void (*cl_inet_getspi)(netstackid_t, uint8_t, uint8_t *, size_t,
 209  210      void *) = NULL;
 210  211  
 211  212  /*
 212  213   * Hook function to verify if the SPI is already utilized.
 213  214   */
 214  215  
 215  216  int (*cl_inet_checkspi)(netstackid_t, uint8_t, uint32_t, void *) = NULL;
 216  217  
 217  218  /*
 218  219   * Hook function to delete the SPI from the cluster wide repository.
 219  220   */
 220  221  
 221  222  void (*cl_inet_deletespi)(netstackid_t, uint8_t, uint32_t, void *) = NULL;
 222  223  
 223  224  /*
 224  225   * Hook function to inform the cluster when packet received on an IDLE SA
 225  226   */
 226  227  
 227  228  void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
 228  229      in6_addr_t, in6_addr_t, void *) = NULL;
 229  230  
 230  231  /*
 231  232   * Synchronization notes:
 232  233   *
 233  234   * IP is a fully D_MP STREAMS module/driver. Thus it does not depend on any
 234  235   * MT level protection given by STREAMS. IP uses a combination of its own
 235  236   * internal serialization mechanism and standard Solaris locking techniques.
 236  237   * The internal serialization is per phyint.  This is used to serialize
 237  238   * plumbing operations, IPMP operations, most set ioctls, etc.
 238  239   *
 239  240   * Plumbing is a long sequence of operations involving message
 240  241   * exchanges between IP, ARP and device drivers. Many set ioctls are typically
 241  242   * involved in plumbing operations. A natural model is to serialize these
 242  243   * ioctls one per ill. For example plumbing of hme0 and qfe0 can go on in
 243  244   * parallel without any interference. But various set ioctls on hme0 are best
 244  245   * serialized, along with IPMP operations and processing of DLPI control
 245  246   * messages received from drivers on a per phyint basis. This serialization is
 246  247   * provided by the ipsq_t and primitives operating on this. Details can
 247  248   * be found in ip_if.c above the core primitives operating on ipsq_t.
 248  249   *
 249  250   * Lookups of an ipif or ill by a thread return a refheld ipif / ill.
 250  251   * Similarly, lookup of an ire by a thread also returns a refheld ire.
 251  252   * In addition ipif's and ill's referenced by the ire are also indirectly
 252  253   * refheld. Thus no ipif or ill can vanish as long as an ipif is refheld
 253  254   * directly or indirectly. For example an SIOCSLIFADDR ioctl that changes the
 254  255   * address of an ipif has to go through the ipsq_t. This ensures that only
 255  256   * one such exclusive operation proceeds at any time on the ipif. It then
 256  257   * waits for all refcnts
 257  258   * associated with this ipif to come down to zero. The address is changed
 258  259   * only after the ipif has been quiesced. Then the ipif is brought up again.
 259  260   * More details are described above the comment in ip_sioctl_flags.
 260  261   *
 261  262   * Packet processing is based mostly on IREs and are fully multi-threaded
 262  263   * using standard Solaris MT techniques.
 263  264   *
 264  265   * There are explicit locks in IP to handle:
 265  266   * - The ip_g_head list maintained by mi_open_link() and friends.
 266  267   *
 267  268   * - The reassembly data structures (one lock per hash bucket)
 268  269   *
 269  270   * - conn_lock is meant to protect conn_t fields. The fields actually
 270  271   *   protected by conn_lock are documented in the conn_t definition.
 271  272   *
 272  273   * - ire_lock to protect some of the fields of the ire, IRE tables
 273  274   *   (one lock per hash bucket). Refer to ip_ire.c for details.
 274  275   *
 275  276   * - ndp_g_lock and ncec_lock for protecting NCEs.
 276  277   *
 277  278   * - ill_lock protects fields of the ill and ipif. Details in ip.h
 278  279   *
 279  280   * - ill_g_lock: This is a global reader/writer lock. Protects the following
 280  281   *      * The AVL tree based global multi list of all ills.
 281  282   *      * The linked list of all ipifs of an ill
 282  283   *      * The <ipsq-xop> mapping
 283  284   *      * <ill-phyint> association
 284  285   *   Insertion/deletion of an ill in the system, insertion/deletion of an ipif
 285  286   *   into an ill, changing the <ipsq-xop> mapping of an ill, changing the
 286  287   *   <ill-phyint> assoc of an ill will all have to hold the ill_g_lock as
 287  288   *   writer for the actual duration of the insertion/deletion/change.
 288  289   *
 289  290   * - ill_lock:  This is a per ill mutex.
 290  291   *   It protects some members of the ill_t struct; see ip.h for details.
 291  292   *   It also protects the <ill-phyint> assoc.
 292  293   *   It also protects the list of ipifs hanging off the ill.
 293  294   *
 294  295   * - ipsq_lock: This is a per ipsq_t mutex lock.
 295  296   *   This protects some members of the ipsq_t struct; see ip.h for details.
 296  297   *   It also protects the <ipsq-ipxop> mapping
 297  298   *
 298  299   * - ipx_lock: This is a per ipxop_t mutex lock.
 299  300   *   This protects some members of the ipxop_t struct; see ip.h for details.
 300  301   *
 301  302   * - phyint_lock: This is a per phyint mutex lock. Protects just the
 302  303   *   phyint_flags
 303  304   *
 304  305   * - ip_addr_avail_lock: This is used to ensure the uniqueness of IP addresses.
 305  306   *   This lock is held in ipif_up_done and the ipif is marked IPIF_UP and the
 306  307   *   uniqueness check also done atomically.
 307  308   *
 308  309   * - ill_g_usesrc_lock: This readers/writer lock protects the usesrc
 309  310   *   group list linked by ill_usesrc_grp_next. It also protects the
 310  311   *   ill_usesrc_ifindex field. It is taken as a writer when a member of the
 311  312   *   group is being added or deleted.  This lock is taken as a reader when
 312  313   *   walking the list/group(eg: to get the number of members in a usesrc group).
 313  314   *   Note, it is only necessary to take this lock if the ill_usesrc_grp_next
 314  315   *   field is changing state i.e from NULL to non-NULL or vice-versa. For
 315  316   *   example, it is not necessary to take this lock in the initial portion
 316  317   *   of ip_sioctl_slifusesrc or at all in ip_sioctl_flags since these
 317  318   *   operations are executed exclusively and that ensures that the "usesrc
 318  319   *   group state" cannot change. The "usesrc group state" change can happen
 319  320   *   only in the latter part of ip_sioctl_slifusesrc and in ill_delete.
 320  321   *
 321  322   * Changing <ill-phyint>, <ipsq-xop> associations:
 322  323   *
 323  324   * To change the <ill-phyint> association, the ill_g_lock must be held
 324  325   * as writer, and the ill_locks of both the v4 and v6 instance of the ill
 325  326   * must be held.
 326  327   *
 327  328   * To change the <ipsq-xop> association, the ill_g_lock must be held as
 328  329   * writer, the ipsq_lock must be held, and one must be writer on the ipsq.
 329  330   * This is only done when ills are added or removed from IPMP groups.
 330  331   *
 331  332   * To add or delete an ipif from the list of ipifs hanging off the ill,
 332  333   * ill_g_lock (writer) and ill_lock must be held and the thread must be
 333  334   * a writer on the associated ipsq.
 334  335   *
 335  336   * To add or delete an ill to the system, the ill_g_lock must be held as
 336  337   * writer and the thread must be a writer on the associated ipsq.
 337  338   *
 338  339   * To add or delete an ilm to an ill, the ill_lock must be held and the thread
 339  340   * must be a writer on the associated ipsq.
 340  341   *
 341  342   * Lock hierarchy
 342  343   *
 343  344   * Some lock hierarchy scenarios are listed below.
 344  345   *
 345  346   * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock
 346  347   * ill_g_lock -> ill_lock(s) -> phyint_lock
 347  348   * ill_g_lock -> ndp_g_lock -> ill_lock -> ncec_lock
 348  349   * ill_g_lock -> ip_addr_avail_lock
 349  350   * conn_lock -> irb_lock -> ill_lock -> ire_lock
 350  351   * ill_g_lock -> ip_g_nd_lock
 351  352   * ill_g_lock -> ips_ipmp_lock -> ill_lock -> nce_lock
 352  353   * ill_g_lock -> ndp_g_lock -> ill_lock -> ncec_lock -> nce_lock
 353  354   * arl_lock -> ill_lock
 354  355   * ips_ire_dep_lock -> irb_lock
 355  356   *
 356  357   * When more than 1 ill lock is needed to be held, all ill lock addresses
 357  358   * are sorted on address and locked starting from highest addressed lock
 358  359   * downward.
 359  360   *
 360  361   * Multicast scenarios
 361  362   * ips_ill_g_lock -> ill_mcast_lock
 362  363   * conn_ilg_lock -> ips_ill_g_lock -> ill_lock
 363  364   * ill_mcast_serializer -> ill_mcast_lock -> ips_ipmp_lock -> ill_lock
 364  365   * ill_mcast_serializer -> ill_mcast_lock -> connf_lock -> conn_lock
 365  366   * ill_mcast_serializer -> ill_mcast_lock -> conn_ilg_lock
 366  367   * ill_mcast_serializer -> ill_mcast_lock -> ips_igmp_timer_lock
 367  368   *
 368  369   * IPsec scenarios
 369  370   *
 370  371   * ipsa_lock -> ill_g_lock -> ill_lock
 371  372   * ill_g_usesrc_lock -> ill_g_lock -> ill_lock
 372  373   *
 373  374   * Trusted Solaris scenarios
 374  375   *
 375  376   * igsa_lock -> gcgrp_rwlock -> gcgrp_lock
 376  377   * igsa_lock -> gcdb_lock
 377  378   * gcgrp_rwlock -> ire_lock
 378  379   * gcgrp_rwlock -> gcdb_lock
 379  380   *
 380  381   * squeue(sq_lock), flow related (ft_lock, fe_lock) locking
 381  382   *
 382  383   * cpu_lock --> ill_lock --> sqset_lock --> sq_lock
 383  384   * sq_lock -> conn_lock -> QLOCK(q)
 384  385   * ill_lock -> ft_lock -> fe_lock
 385  386   *
 386  387   * Routing/forwarding table locking notes:
 387  388   *
 388  389   * Lock acquisition order: Radix tree lock, irb_lock.
 389  390   * Requirements:
 390  391   * i.  Walker must not hold any locks during the walker callback.
 391  392   * ii  Walker must not see a truncated tree during the walk because of any node
 392  393   *     deletion.
 393  394   * iii Existing code assumes ire_bucket is valid if it is non-null and is used
 394  395   *     in many places in the code to walk the irb list. Thus even if all the
 395  396   *     ires in a bucket have been deleted, we still can't free the radix node
 396  397   *     until the ires have actually been inactive'd (freed).
 397  398   *
 398  399   * Tree traversal - Need to hold the global tree lock in read mode.
 399  400   * Before dropping the global tree lock, need to either increment the ire_refcnt
 400  401   * to ensure that the radix node can't be deleted.
 401  402   *
 402  403   * Tree add - Need to hold the global tree lock in write mode to add a
 403  404   * radix node. To prevent the node from being deleted, increment the
 404  405   * irb_refcnt, after the node is added to the tree. The ire itself is
 405  406   * added later while holding the irb_lock, but not the tree lock.
 406  407   *
 407  408   * Tree delete - Need to hold the global tree lock and irb_lock in write mode.
 408  409   * All associated ires must be inactive (i.e. freed), and irb_refcnt
 409  410   * must be zero.
 410  411   *
 411  412   * Walker - Increment irb_refcnt before calling the walker callback. Hold the
 412  413   * global tree lock (read mode) for traversal.
 413  414   *
 414  415   * IRE dependencies - In some cases we hold ips_ire_dep_lock across ire_refrele
 415  416   * hence we will acquire irb_lock while holding ips_ire_dep_lock.
 416  417   *
 417  418   * IPsec notes :
 418  419   *
 419  420   * IP interacts with the IPsec code (AH/ESP) by storing IPsec attributes
 420  421   * in the ip_xmit_attr_t and ip_recv_attr_t. For outbound datagrams, the
 421  422   * ip_xmit_attr_t has the
 422  423   * information used by the IPsec code for applying the right level of
 423  424   * protection. The information initialized by IP in the ip_xmit_attr_t
 424  425   * is determined by the per-socket policy or global policy in the system.
 425  426   * For inbound datagrams, the ip_recv_attr_t
 426  427   * starts out with nothing in it. It gets filled
 427  428   * with the right information if it goes through the AH/ESP code, which
 428  429   * happens if the incoming packet is secure. The information initialized
 429  430   * by AH/ESP, is later used by IP (during fanouts to ULP) to see whether
 430  431   * the policy requirements needed by per-socket policy or global policy
 431  432   * is met or not.
 432  433   *
 433  434   * For fully connected sockets i.e dst, src [addr, port] is known,
 434  435   * conn_policy_cached is set indicating that policy has been cached.
 435  436   * conn_in_enforce_policy may or may not be set depending on whether
 436  437   * there is a global policy match or per-socket policy match.
 437  438   * Policy inheriting happens in ip_policy_set once the destination is known.
 438  439   * Once the right policy is set on the conn_t, policy cannot change for
 439  440   * this socket. This makes life simpler for TCP (UDP ?) where
 440  441   * re-transmissions go out with the same policy. For symmetry, policy
 441  442   * is cached for fully connected UDP sockets also. Thus if policy is cached,
 442  443   * it also implies that policy is latched i.e policy cannot change
 443  444   * on these sockets. As we have the right policy on the conn, we don't
 444  445   * have to lookup global policy for every outbound and inbound datagram
 445  446   * and thus serving as an optimization. Note that a global policy change
 446  447   * does not affect fully connected sockets if they have policy. If fully
 447  448   * connected sockets did not have any policy associated with it, global
 448  449   * policy change may affect them.
 449  450   *
 450  451   * IP Flow control notes:
 451  452   * ---------------------
 452  453   * Non-TCP streams are flow controlled by IP. The way this is accomplished
 453  454   * differs when ILL_CAPAB_DLD_DIRECT is enabled for that IP instance. When
 454  455   * ILL_DIRECT_CAPABLE(ill) is TRUE, IP can do direct function calls into
 455  456   * GLDv3. Otherwise packets are sent down to lower layers using STREAMS
 456  457   * functions.
 457  458   *
 458  459   * Per Tx ring udp flow control:
 459  460   * This is applicable only when ILL_CAPAB_DLD_DIRECT capability is set in
 460  461   * the ill (i.e. ILL_DIRECT_CAPABLE(ill) is true).
 461  462   *
 462  463   * The underlying link can expose multiple Tx rings to the GLDv3 mac layer.
 463  464   * To achieve best performance, outgoing traffic needs to be fanned out among
 464  465   * these Tx rings. mac_tx() is called (via str_mdata_fastpath_put()) to send
 465  466   * traffic out of the NIC and it takes a fanout hint. UDP connections pass
 466  467   * the address of connp as fanout hint to mac_tx(). Under flow controlled
 467  468   * condition, mac_tx() returns a non-NULL cookie (ip_mac_tx_cookie_t). This
 468  469   * cookie points to a specific Tx ring that is blocked. The cookie is used to
 469  470   * hash into an idl_tx_list[] entry in idl_tx_list[] array. Each idl_tx_list_t
 470  471   * points to drain_lists (idl_t's). These drain lists will store the blocked UDP
 471  472   * connp's. The drain list is not a single list but a configurable number of
 472  473   * lists.
 473  474   *
 474  475   * The diagram below shows idl_tx_list_t's and their drain_lists. ip_stack_t
 475  476   * has an array of idl_tx_list_t. The size of the array is TX_FANOUT_SIZE
 476  477   * which is equal to 128. This array in turn contains a pointer to idl_t[],
 477  478   * the ip drain list. The idl_t[] array size is MIN(max_ncpus, 8). The drain
 478  479   * list will point to the list of connp's that are flow controlled.
 479  480   *
 480  481   *                      ---------------   -------   -------   -------
 481  482   *                   |->|drain_list[0]|-->|connp|-->|connp|-->|connp|-->
 482  483   *                   |  ---------------   -------   -------   -------
 483  484   *                   |  ---------------   -------   -------   -------
 484  485   *                   |->|drain_list[1]|-->|connp|-->|connp|-->|connp|-->
 485  486   * ----------------  |  ---------------   -------   -------   -------
 486  487   * |idl_tx_list[0]|->|  ---------------   -------   -------   -------
 487  488   * ----------------  |->|drain_list[2]|-->|connp|-->|connp|-->|connp|-->
 488  489   *                   |  ---------------   -------   -------   -------
 489  490   *                   .        .              .         .         .
 490  491   *                   |  ---------------   -------   -------   -------
 491  492   *                   |->|drain_list[n]|-->|connp|-->|connp|-->|connp|-->
 492  493   *                      ---------------   -------   -------   -------
 493  494   *                      ---------------   -------   -------   -------
 494  495   *                   |->|drain_list[0]|-->|connp|-->|connp|-->|connp|-->
 495  496   *                   |  ---------------   -------   -------   -------
 496  497   *                   |  ---------------   -------   -------   -------
 497  498   * ----------------  |->|drain_list[1]|-->|connp|-->|connp|-->|connp|-->
 498  499   * |idl_tx_list[1]|->|  ---------------   -------   -------   -------
 499  500   * ----------------  |        .              .         .         .
 500  501   *                   |  ---------------   -------   -------   -------
 501  502   *                   |->|drain_list[n]|-->|connp|-->|connp|-->|connp|-->
 502  503   *                      ---------------   -------   -------   -------
 503  504   *     .....
 504  505   * ----------------
 505  506   * |idl_tx_list[n]|-> ...
 506  507   * ----------------
 507  508   *
 508  509   * When mac_tx() returns a cookie, the cookie is hashed into an index into
 509  510   * ips_idl_tx_list[], and conn_drain_insert() is called with the idl_tx_list
 510  511   * to insert the conn onto.  conn_drain_insert() asserts flow control for the
 511  512   * sockets via su_txq_full() (non-STREAMS) or QFULL on conn_wq (STREAMS).
 512  513   * Further, conn_blocked is set to indicate that the conn is blocked.
 513  514   *
 514  515   * GLDv3 calls ill_flow_enable() when flow control is relieved.  The cookie
 515  516   * passed in the call to ill_flow_enable() identifies the blocked Tx ring and
 516  517   * is again hashed to locate the appropriate idl_tx_list, which is then
 517  518   * drained via conn_walk_drain().  conn_walk_drain() goes through each conn in
 518  519   * the drain list and calls conn_drain_remove() to clear flow control (via
 519  520   * calling su_txq_full() or clearing QFULL), and remove the conn from the
 520  521   * drain list.
 521  522   *
 522  523   * Note that the drain list is not a single list but a (configurable) array of
 523  524   * lists (8 elements by default).  Synchronization between drain insertion and
 524  525   * flow control wakeup is handled by using idl_txl->txl_lock, and only
 525  526   * conn_drain_insert() and conn_drain_remove() manipulate the drain list.
 526  527   *
 527  528   * Flow control via STREAMS is used when ILL_DIRECT_CAPABLE() returns FALSE.
 528  529   * On the send side, if the packet cannot be sent down to the driver by IP
 529  530   * (canput() fails), ip_xmit() drops the packet and returns EWOULDBLOCK to the
 530  531   * caller, who may then invoke ixa_check_drain_insert() to insert the conn on
 531  532   * the 0'th drain list.  When ip_wsrv() runs on the ill_wq because flow
 532  533   * control has been relieved, the blocked conns in the 0'th drain list are
 533  534   * drained as in the non-STREAMS case.
 534  535   *
 535  536   * In both the STREAMS and non-STREAMS cases, the sockfs upcall to set QFULL
 536  537   * is done when the conn is inserted into the drain list (conn_drain_insert())
 537  538   * and cleared when the conn is removed from it (conn_drain_remove()).
 538  539   *
 539  540   * IPQOS notes:
 540  541   *
 541  542   * IPQoS Policies are applied to packets using IPPF (IP Policy framework)
 542  543   * and IPQoS modules. IPPF includes hooks in IP at different control points
 543  544   * (callout positions) which direct packets to IPQoS modules for policy
 544  545   * processing. Policies, if present, are global.
 545  546   *
 546  547   * The callout positions are located in the following paths:
 547  548   *              o local_in (packets destined for this host)
 548  549   *              o local_out (packets originating from this host)
 549  550   *              o fwd_in  (packets forwarded by this m/c - inbound)
 550  551   *              o fwd_out (packets forwarded by this m/c - outbound)
 551  552   * Hooks at these callout points can be enabled/disabled using the ndd variable
 552  553   * ip_policy_mask (a bit mask with the 4 LSB indicating the callout positions).
 553  554   * By default all the callout positions are enabled.
 554  555   *
 555  556   * Outbound (local_out)
 556  557   * Hooks are placed in ire_send_wire_v4 and ire_send_wire_v6.
 557  558   *
 558  559   * Inbound (local_in)
 559  560   * Hooks are placed in ip_fanout_v4 and ip_fanout_v6.
 560  561   *
 561  562   * Forwarding (in and out)
 562  563   * Hooks are placed in ire_recv_forward_v4/v6.
 563  564   *
 564  565   * IP Policy Framework processing (IPPF processing)
 565  566   * Policy processing for a packet is initiated by ip_process, which ascertains
 566  567   * that the classifier (ipgpc) is loaded and configured, failing which the
 567  568   * packet resumes normal processing in IP. If the classifier is present, the
 568  569   * packet is acted upon by one or more IPQoS modules (action instances), per
 569  570   * filters configured in ipgpc and resumes normal IP processing thereafter.
 570  571   * An action instance can drop a packet in course of its processing.
 571  572   *
 572  573   * Zones notes:
 573  574   *
 574  575   * The partitioning rules for networking are as follows:
 575  576   * 1) Packets coming from a zone must have a source address belonging to that
 576  577   * zone.
 577  578   * 2) Packets coming from a zone can only be sent on a physical interface on
 578  579   * which the zone has an IP address.
 579  580   * 3) Between two zones on the same machine, packet delivery is only allowed if
 580  581   * there's a matching route for the destination and zone in the forwarding
 581  582   * table.
 582  583   * 4) The TCP and UDP port spaces are per-zone; that is, two processes in
 583  584   * different zones can bind to the same port with the wildcard address
 584  585   * (INADDR_ANY).
 585  586   *
 586  587   * The granularity of interface partitioning is at the logical interface level.
 587  588   * Therefore, every zone has its own IP addresses, and incoming packets can be
 588  589   * attributed to a zone unambiguously. A logical interface is placed into a zone
 589  590   * using the SIOCSLIFZONE ioctl; this sets the ipif_zoneid field in the ipif_t
 590  591   * structure. Rule (1) is implemented by modifying the source address selection
 591  592   * algorithm so that the list of eligible addresses is filtered based on the
 592  593   * sending process zone.
 593  594   *
 594  595   * The Internet Routing Entries (IREs) are either exclusive to a zone or shared
 595  596   * across all zones, depending on their type. Here is the break-up:
 596  597   *
 597  598   * IRE type                             Shared/exclusive
 598  599   * --------                             ----------------
 599  600   * IRE_BROADCAST                        Exclusive
 600  601   * IRE_DEFAULT (default routes)         Shared (*)
 601  602   * IRE_LOCAL                            Exclusive (x)
 602  603   * IRE_LOOPBACK                         Exclusive
 603  604   * IRE_PREFIX (net routes)              Shared (*)
 604  605   * IRE_IF_NORESOLVER (interface routes) Exclusive
 605  606   * IRE_IF_RESOLVER (interface routes)   Exclusive
 606  607   * IRE_IF_CLONE (interface routes)      Exclusive
 607  608   * IRE_HOST (host routes)               Shared (*)
 608  609   *
 609  610   * (*) A zone can only use a default or off-subnet route if the gateway is
 610  611   * directly reachable from the zone, that is, if the gateway's address matches
 611  612   * one of the zone's logical interfaces.
 612  613   *
 613  614   * (x) IRE_LOCAL are handled a bit differently.
 614  615   * When ip_restrict_interzone_loopback is set (the default),
 615  616   * ire_route_recursive restricts loopback using an IRE_LOCAL
 616  617   * between zones to the case when L2 would have conceptually looped the packet
 617  618   * back, i.e. the loopback which is required since neither Ethernet drivers
 618  619   * nor Ethernet hardware loops them back. This is the case when the normal
 619  620   * routes (ignoring IREs with different zoneids) would send out the packet on
 620  621   * the same ill as the ill with which the IRE_LOCAL is associated.
 621  622   *
 622  623   * Multiple zones can share a common broadcast address; typically all zones
 623  624   * share the 255.255.255.255 address. Incoming as well as locally originated
 624  625   * broadcast packets must be dispatched to all the zones on the broadcast
 625  626   * network. For directed broadcasts (e.g. 10.16.72.255) this is not trivial
 626  627   * since some zones may not be on the 10.16.72/24 network. To handle this, each
 627  628   * zone has its own set of IRE_BROADCAST entries; then, broadcast packets are
 628  629   * sent to every zone that has an IRE_BROADCAST entry for the destination
 629  630   * address on the input ill, see ip_input_broadcast().
 630  631   *
 631  632   * Applications in different zones can join the same multicast group address.
 632  633   * The same logic applies for multicast as for broadcast. ip_input_multicast
 633  634   * dispatches packets to all zones that have members on the physical interface.
 634  635   */
 635  636  
 636  637  /*
 637  638   * Squeue Fanout flags:
 638  639   *      0: No fanout.
 639  640   *      1: Fanout across all squeues
            *
            * The variable is initialized to 0, so fanout is off unless the value is
            * changed.
 640  641   */
 641  642  boolean_t       ip_squeue_fanout = 0;
 642  643  
 643  644  /*
 644  645   * Maximum dups allowed per packet. Defaults to 10.
            *
            * NOTE(review): judging by the name, "dups" are duplicate fragments; the
            * code that enforces this limit is outside this section -- confirm there.
 645  646   */
 646  647  uint_t ip_max_frag_dups = 10;
 647  648  
 648  649  static int      ip_open(queue_t *q, dev_t *devp, int flag, int sflag,
 649  650                      cred_t *credp, boolean_t isv6);
 650  651  static mblk_t   *ip_xmit_attach_llhdr(mblk_t *, nce_t *);
 651  652  
           /* ICMP helper routines; all of them are file-local (static). */
 652  653  static boolean_t icmp_inbound_verify_v4(mblk_t *, icmph_t *, ip_recv_attr_t *);
 653  654  static void     icmp_inbound_too_big_v4(icmph_t *, ip_recv_attr_t *);
 654  655  static void     icmp_inbound_error_fanout_v4(mblk_t *, icmph_t *,
 655  656      ip_recv_attr_t *);
 656  657  static void     icmp_options_update(ipha_t *);
 657  658  static void     icmp_param_problem(mblk_t *, uint8_t,  ip_recv_attr_t *);
 658  659  static void     icmp_pkt(mblk_t *, void *, size_t, ip_recv_attr_t *);
 659  660  static mblk_t   *icmp_pkt_err_ok(mblk_t *, ip_recv_attr_t *);
 660  661  static void     icmp_redirect_v4(mblk_t *mp, ipha_t *, icmph_t *,
 661  662      ip_recv_attr_t *);
 662  663  static void     icmp_send_redirect(mblk_t *, ipaddr_t, ip_recv_attr_t *);
 663  664  static void     icmp_send_reply_v4(mblk_t *, ipha_t *, icmph_t *,
 664  665      ip_recv_attr_t *);
 665  666  
           /*
            * General IP routines. Those declared without "static" have external
            * linkage; the rest are private to this file.
            */
 666  667  mblk_t          *ip_dlpi_alloc(size_t, t_uscalar_t);
 667  668  char            *ip_dot_addr(ipaddr_t, char *);
 668  669  mblk_t          *ip_carve_mp(mblk_t **, ssize_t);
 669  670  int             ip_close(queue_t *, int);
 670  671  static char     *ip_dot_saddr(uchar_t *, char *);
 671  672  static void     ip_lrput(queue_t *, mblk_t *);
 672  673  ipaddr_t        ip_net_mask(ipaddr_t);
 673  674  char            *ip_nv_lookup(nv_t *, int);
 674  675  void    ip_rput(queue_t *, mblk_t *);
 675  676  static void     ip_rput_dlpi_writer(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp,
 676  677                      void *dummy_arg);
 677  678  int             ip_snmp_get(queue_t *, mblk_t *, int, boolean_t);
           /*
            * File-local ip_snmp_get_mib2_*() and ip_snmp_get2_*() helpers.
            * NOTE(review): presumably one helper per MIB2 table, driven from
            * ip_snmp_get() -- inferred from the names; confirm at the call sites.
            */
 678  679  static mblk_t   *ip_snmp_get_mib2_ip(queue_t *, mblk_t *,
 679  680                      mib2_ipIfStatsEntry_t *, ip_stack_t *, boolean_t);
 680  681  static mblk_t   *ip_snmp_get_mib2_ip_traffic_stats(queue_t *, mblk_t *,
 681  682                      ip_stack_t *, boolean_t);
 682  683  static mblk_t   *ip_snmp_get_mib2_ip6(queue_t *, mblk_t *, ip_stack_t *,
 683  684                      boolean_t);
 684  685  static mblk_t   *ip_snmp_get_mib2_icmp(queue_t *, mblk_t *, ip_stack_t *ipst);
 685  686  static mblk_t   *ip_snmp_get_mib2_icmp6(queue_t *, mblk_t *, ip_stack_t *ipst);
 686  687  static mblk_t   *ip_snmp_get_mib2_igmp(queue_t *, mblk_t *, ip_stack_t *ipst);
 687  688  static mblk_t   *ip_snmp_get_mib2_multi(queue_t *, mblk_t *, ip_stack_t *ipst);
 688  689  static mblk_t   *ip_snmp_get_mib2_ip_addr(queue_t *, mblk_t *,
 689  690                      ip_stack_t *ipst, boolean_t);
 690  691  static mblk_t   *ip_snmp_get_mib2_ip6_addr(queue_t *, mblk_t *,
 691  692                      ip_stack_t *ipst, boolean_t);
 692  693  static mblk_t   *ip_snmp_get_mib2_ip_group_src(queue_t *, mblk_t *,
 693  694                      ip_stack_t *ipst);
 694  695  static mblk_t   *ip_snmp_get_mib2_ip6_group_src(queue_t *, mblk_t *,
 695  696                      ip_stack_t *ipst);
 696  697  static mblk_t   *ip_snmp_get_mib2_ip_group_mem(queue_t *, mblk_t *,
 697  698                      ip_stack_t *ipst);
 698  699  static mblk_t   *ip_snmp_get_mib2_ip6_group_mem(queue_t *, mblk_t *,
 699  700                      ip_stack_t *ipst);
 700  701  static mblk_t   *ip_snmp_get_mib2_virt_multi(queue_t *, mblk_t *,
 701  702                      ip_stack_t *ipst);
 702  703  static mblk_t   *ip_snmp_get_mib2_multi_rtable(queue_t *, mblk_t *,
 703  704                      ip_stack_t *ipst);
 704  705  static mblk_t   *ip_snmp_get_mib2_ip_route_media(queue_t *, mblk_t *, int,
 705  706                      ip_stack_t *ipst);
 706  707  static mblk_t   *ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *, int,
 707  708                      ip_stack_t *ipst);
 708  709  static void     ip_snmp_get2_v4(ire_t *, iproutedata_t *);
 709  710  static void     ip_snmp_get2_v6_route(ire_t *, iproutedata_t *);
 710  711  static int      ip_snmp_get2_v4_media(ncec_t *, iproutedata_t *);
 711  712  static int      ip_snmp_get2_v6_media(ncec_t *, iproutedata_t *);
 712  713  int             ip_snmp_set(queue_t *, int, int, uchar_t *, int);
 713  714  
 714  715  static mblk_t   *ip_fragment_copyhdr(uchar_t *, int, int, ip_stack_t *,
 715  716                      mblk_t *);
 716  717  
           /* conn_t drain and connection-walker helpers (file-local). */
 717  718  static void     conn_drain_init(ip_stack_t *);
 718  719  static void     conn_drain_fini(ip_stack_t *);
 719  720  static void     conn_drain(conn_t *connp, boolean_t closing);
 720  721  
 721  722  static void     conn_walk_drain(ip_stack_t *, idl_tx_list_t *);
 722  723  static void     conn_walk_sctp(pfv_t, void *, zoneid_t, netstack_t *);
 723  724  
           /* Per-netstack init/shutdown/fini routines, keyed by netstackid_t. */
 724  725  static void     *ip_stack_init(netstackid_t stackid, netstack_t *ns);
 725  726  static void     ip_stack_shutdown(netstackid_t stackid, void *arg);
 726  727  static void     ip_stack_fini(netstackid_t stackid, void *arg);
 727  728  
 728  729  static int      ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t,
 729  730      const in6_addr_t *, ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *),
 730  731      ire_t *, conn_t *, boolean_t, const in6_addr_t *,  mcast_record_t,
 731  732      const in6_addr_t *);
 732  733  
 733  734  static int      ip_squeue_switch(int);
 734  735  
           /* kstat init, fini and update routines for the ip and icmp kstats. */
 735  736  static void     *ip_kstat_init(netstackid_t, ip_stack_t *);
 736  737  static void     ip_kstat_fini(netstackid_t, kstat_t *);
 737  738  static int      ip_kstat_update(kstat_t *kp, int rw);
 738  739  static void     *icmp_kstat_init(netstackid_t);
 739  740  static void     icmp_kstat_fini(netstackid_t, kstat_t *);
 740  741  static int      icmp_kstat_update(kstat_t *kp, int rw);
 741  742  static void     *ip_kstat2_init(netstackid_t, ip_stat_t *);
 742  743  static void     ip_kstat2_fini(netstackid_t, kstat_t *);
 743  744  
 744  745  static void     ipobs_init(ip_stack_t *);
 745  746  static void     ipobs_fini(ip_stack_t *);
 746  747  
 747  748  static int      ip_tp_cpu_update(cpu_setup_t, int, void *);
 748  749  
 749  750  ipaddr_t        ip_g_all_ones = IP_HOST_MASK;   /* initialized to IP_HOST_MASK */
 750  751  
           /*
            * NOTE(review): presumably a count of pullups performed on the ip_rput
            * path (inferred from the name); confirm where it is incremented.
            */
 751  752  static long ip_rput_pullups;
 752  753  int     dohwcksum = 1;  /* use h/w cksum if supported by the hardware */
 753  754  
 754  755  vmem_t *ip_minor_arena_sa; /* for minor nos. from INET_MIN_DEV+2 thru 2^^18-1 */
 755  756  vmem_t *ip_minor_arena_la; /* for minor nos. from 2^^18 thru 2^^32-1 */
 756  757  
           /*
            * NOTE(review): presumably the IP debug level; it has no explicit
            * initializer, so it starts at zero. Confirm its consumers.
            */
 757  758  int     ip_debug;
 758  759  
 759  760  /*
 760  761   * Multirouting/CGTP stuff
 761  762   */
 762  763  int     ip_cgtp_filter_rev = CGTP_FILTER_REV;   /* CGTP hooks version */
 763  764  
 764  765  /*
 765  766   * IP tunables related declarations. Definitions are in ip_tunables.c
 766  767   */
 767  768  extern mod_prop_info_t ip_propinfo_tbl[];
 768  769  extern int ip_propinfo_count;
 769  770  
 770  771  /*
 771  772   * Table of IP ioctls encoding the various properties of the ioctl and
 772  773   * indexed based on the last byte of the ioctl command. Occasionally there
 773  774   * is a clash, and there is more than 1 ioctl with the same last byte.
 774  775   * In such a case 1 ioctl is encoded in the ndx table and the remaining
 775  776   * ioctls are encoded in the misc table. An entry in the ndx table is
 776  777   * retrieved by indexing on the last byte of the ioctl command and comparing
 777  778   * the ioctl command with the value in the ndx table. In the event of a
 778  779   * mismatch the misc table is then searched sequentially for the desired
 779  780   * ioctl command.
 780  781   *
 781  782   * Entry: <command> <copyin_size> <flags> <cmd_type> <function> <restart_func>
            *
            * Slots whose command is IPI_DONTCARE are placeholders for last-byte
            * values that have no ioctl handled through this table. Entries with no
            * restart function carry NULL in the last field.
 782  783   */
 783  784  ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
 784  785          /* 000 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 785  786          /* 001 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 786  787          /* 002 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 787  788          /* 003 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 788  789          /* 004 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 789  790          /* 005 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 790  791          /* 006 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 791  792          /* 007 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 792  793          /* 008 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 793  794          /* 009 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 794  795  
 795  796          /* 010 */ { SIOCADDRT,  sizeof (struct rtentry), IPI_PRIV,
 796  797                          MISC_CMD, ip_siocaddrt, NULL },
 797  798          /* 011 */ { SIOCDELRT,  sizeof (struct rtentry), IPI_PRIV,
 798  799                          MISC_CMD, ip_siocdelrt, NULL },
 799  800  
 800  801          /* 012 */ { SIOCSIFADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
 801  802                          IF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
 802  803          /* 013 */ { SIOCGIFADDR, sizeof (struct ifreq), IPI_GET_CMD,
 803  804                          IF_CMD, ip_sioctl_get_addr, NULL },
 804  805  
 805  806          /* 014 */ { SIOCSIFDSTADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
 806  807                          IF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
 807  808          /* 015 */ { SIOCGIFDSTADDR, sizeof (struct ifreq),
 808  809                          IPI_GET_CMD, IF_CMD, ip_sioctl_get_dstaddr, NULL },
 809  810  
 810  811          /* 016 */ { SIOCSIFFLAGS, sizeof (struct ifreq),
 811  812                          IPI_PRIV | IPI_WR,
 812  813                          IF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
 813  814          /* 017 */ { SIOCGIFFLAGS, sizeof (struct ifreq),
 814  815                          IPI_MODOK | IPI_GET_CMD,
 815  816                          IF_CMD, ip_sioctl_get_flags, NULL },
 816  817  
 817  818          /* 018 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 818  819          /* 019 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 819  820  
 820  821          /* copyin size cannot be coded for SIOCGIFCONF */
 821  822          /* 020 */ { O_SIOCGIFCONF, 0, IPI_GET_CMD,
 822  823                          MISC_CMD, ip_sioctl_get_ifconf, NULL },
 823  824  
 824  825          /* 021 */ { SIOCSIFMTU, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
 825  826                          IF_CMD, ip_sioctl_mtu, NULL },
 826  827          /* 022 */ { SIOCGIFMTU, sizeof (struct ifreq), IPI_GET_CMD,
 827  828                          IF_CMD, ip_sioctl_get_mtu, NULL },
 828  829          /* 023 */ { SIOCGIFBRDADDR, sizeof (struct ifreq),
 829  830                          IPI_GET_CMD, IF_CMD, ip_sioctl_get_brdaddr, NULL },
 830  831          /* 024 */ { SIOCSIFBRDADDR, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
 831  832                          IF_CMD, ip_sioctl_brdaddr, NULL },
 832  833          /* 025 */ { SIOCGIFNETMASK, sizeof (struct ifreq),
 833  834                          IPI_GET_CMD, IF_CMD, ip_sioctl_get_netmask, NULL },
 834  835          /* 026 */ { SIOCSIFNETMASK, sizeof (struct ifreq), IPI_PRIV | IPI_WR,
 835  836                          IF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
 836  837          /* 027 */ { SIOCGIFMETRIC, sizeof (struct ifreq),
 837  838                          IPI_GET_CMD, IF_CMD, ip_sioctl_get_metric, NULL },
 838  839          /* 028 */ { SIOCSIFMETRIC, sizeof (struct ifreq), IPI_PRIV,
 839  840                          IF_CMD, ip_sioctl_metric, NULL },
 840  841          /* 029 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 841  842  
 842  843          /* See 166-168 below for extended SIOC*XARP ioctls */
 843  844          /* 030 */ { SIOCSARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR,
 844  845                          ARP_CMD, ip_sioctl_arp, NULL },
 845  846          /* 031 */ { SIOCGARP, sizeof (struct arpreq), IPI_GET_CMD,
 846  847                          ARP_CMD, ip_sioctl_arp, NULL },
 847  848          /* 032 */ { SIOCDARP, sizeof (struct arpreq), IPI_PRIV | IPI_WR,
 848  849                          ARP_CMD, ip_sioctl_arp, NULL },
 849  850  
 850  851          /* 033 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 851  852          /* 034 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 852  853          /* 035 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 853  854          /* 036 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 854  855          /* 037 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 855  856          /* 038 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 856  857          /* 039 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 857  858          /* 040 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 858  859          /* 041 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 859  860          /* 042 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 860  861          /* 043 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 861  862          /* 044 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 862  863          /* 045 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 863  864          /* 046 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 864  865          /* 047 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 865  866          /* 048 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 866  867          /* 049 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 867  868          /* 050 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 868  869          /* 051 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 869  870          /* 052 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 870  871          /* 053 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 871  872  
 872  873          /* 054 */ { IF_UNITSEL, sizeof (int), IPI_PRIV | IPI_WR | IPI_MODOK,
 873  874                          MISC_CMD, if_unitsel, if_unitsel_restart },
 874  875  
 875  876          /* 055 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 876  877          /* 056 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 877  878          /* 057 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 878  879          /* 058 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 879  880          /* 059 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 880  881          /* 060 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 881  882          /* 061 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 882  883          /* 062 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 883  884          /* 063 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 884  885          /* 064 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 885  886          /* 065 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 886  887          /* 066 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 887  888          /* 067 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 888  889          /* 068 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 889  890          /* 069 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 890  891          /* 070 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 891  892          /* 071 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 892  893          /* 072 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 893  894  
 894  895          /* 073 */ { SIOCSIFNAME, sizeof (struct ifreq),
 895  896                          IPI_PRIV | IPI_WR | IPI_MODOK,
 896  897                          IF_CMD, ip_sioctl_sifname, NULL },
 897  898  
 898  899          /* 074 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 899  900          /* 075 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 900  901          /* 076 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 901  902          /* 077 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 902  903          /* 078 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 903  904          /* 079 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 904  905          /* 080 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 905  906          /* 081 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 906  907          /* 082 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 907  908          /* 083 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 908  909          /* 084 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 909  910          /* 085 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 910  911          /* 086 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 911  912  
 912  913          /* 087 */ { SIOCGIFNUM, sizeof (int), IPI_GET_CMD,
 913  914                          MISC_CMD, ip_sioctl_get_ifnum, NULL },
 914  915          /* 088 */ { SIOCGIFMUXID, sizeof (struct ifreq), IPI_GET_CMD,
 915  916                          IF_CMD, ip_sioctl_get_muxid, NULL },
 916  917          /* 089 */ { SIOCSIFMUXID, sizeof (struct ifreq),
 917  918                          IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_muxid, NULL },
 918  919  
 919  920          /* Both if and lif variants share same func */
 920  921          /* 090 */ { SIOCGIFINDEX, sizeof (struct ifreq), IPI_GET_CMD,
 921  922                          IF_CMD, ip_sioctl_get_lifindex, NULL },
 922  923          /* Both if and lif variants share same func */
 923  924          /* 091 */ { SIOCSIFINDEX, sizeof (struct ifreq),
 924  925                          IPI_PRIV | IPI_WR, IF_CMD, ip_sioctl_slifindex, NULL },
 925  926  
 926  927          /* copyin size cannot be coded for SIOCGIFCONF */
 927  928          /* 092 */ { SIOCGIFCONF, 0, IPI_GET_CMD,
 928  929                          MISC_CMD, ip_sioctl_get_ifconf, NULL },
 929  930          /* 093 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 930  931          /* 094 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 931  932          /* 095 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 932  933          /* 096 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 933  934          /* 097 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 934  935          /* 098 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 935  936          /* 099 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 936  937          /* 100 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 937  938          /* 101 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 938  939          /* 102 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 939  940          /* 103 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 940  941          /* 104 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 941  942          /* 105 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 942  943          /* 106 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 943  944          /* 107 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 944  945          /* 108 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 945  946          /* 109 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 946  947  
 947  948          /* 110 */ { SIOCLIFREMOVEIF, sizeof (struct lifreq),
 948  949                          IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_removeif,
 949  950                          ip_sioctl_removeif_restart },
 950  951          /* 111 */ { SIOCLIFADDIF, sizeof (struct lifreq),
 951  952                          IPI_GET_CMD | IPI_PRIV | IPI_WR,
 952  953                          LIF_CMD, ip_sioctl_addif, NULL },
                   /*
                    * Slot number of the SIOCSLIFADDR entry that follows; keep the
                    * value in sync with that entry's position in this table.
                    */
 953  954  #define SIOCLIFADDR_NDX 112
 954  955          /* 112 */ { SIOCSLIFADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
 955  956                          LIF_CMD, ip_sioctl_addr, ip_sioctl_addr_restart },
 956  957          /* 113 */ { SIOCGLIFADDR, sizeof (struct lifreq),
 957  958                          IPI_GET_CMD, LIF_CMD, ip_sioctl_get_addr, NULL },
 958  959          /* 114 */ { SIOCSLIFDSTADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
 959  960                          LIF_CMD, ip_sioctl_dstaddr, ip_sioctl_dstaddr_restart },
 960  961          /* 115 */ { SIOCGLIFDSTADDR, sizeof (struct lifreq),
 961  962                          IPI_GET_CMD, LIF_CMD, ip_sioctl_get_dstaddr, NULL },
 962  963          /* 116 */ { SIOCSLIFFLAGS, sizeof (struct lifreq),
 963  964                          IPI_PRIV | IPI_WR,
 964  965                          LIF_CMD, ip_sioctl_flags, ip_sioctl_flags_restart },
 965  966          /* 117 */ { SIOCGLIFFLAGS, sizeof (struct lifreq),
 966  967                          IPI_GET_CMD | IPI_MODOK,
 967  968                          LIF_CMD, ip_sioctl_get_flags, NULL },
 968  969  
 969  970          /* 118 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 970  971          /* 119 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
 971  972  
 972  973          /* 120 */ { O_SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD,
 973  974                          ip_sioctl_get_lifconf, NULL },
 974  975          /* 121 */ { SIOCSLIFMTU, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
 975  976                          LIF_CMD, ip_sioctl_mtu, NULL },
 976  977          /* 122 */ { SIOCGLIFMTU, sizeof (struct lifreq), IPI_GET_CMD,
 977  978                          LIF_CMD, ip_sioctl_get_mtu, NULL },
 978  979          /* 123 */ { SIOCGLIFBRDADDR, sizeof (struct lifreq),
 979  980                          IPI_GET_CMD, LIF_CMD, ip_sioctl_get_brdaddr, NULL },
 980  981          /* 124 */ { SIOCSLIFBRDADDR, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
 981  982                          LIF_CMD, ip_sioctl_brdaddr, NULL },
 982  983          /* 125 */ { SIOCGLIFNETMASK, sizeof (struct lifreq),
 983  984                          IPI_GET_CMD, LIF_CMD, ip_sioctl_get_netmask, NULL },
 984  985          /* 126 */ { SIOCSLIFNETMASK, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
 985  986                          LIF_CMD, ip_sioctl_netmask, ip_sioctl_netmask_restart },
 986  987          /* 127 */ { SIOCGLIFMETRIC, sizeof (struct lifreq),
 987  988                          IPI_GET_CMD, LIF_CMD, ip_sioctl_get_metric, NULL },
 988  989          /* 128 */ { SIOCSLIFMETRIC, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
 989  990                          LIF_CMD, ip_sioctl_metric, NULL },
 990  991          /* 129 */ { SIOCSLIFNAME, sizeof (struct lifreq),
 991  992                          IPI_PRIV | IPI_WR | IPI_MODOK,
 992  993                          LIF_CMD, ip_sioctl_slifname,
 993  994                          ip_sioctl_slifname_restart },
 994  995  
 995  996          /* 130 */ { SIOCGLIFNUM, sizeof (struct lifnum), IPI_GET_CMD,
 996  997                          MISC_CMD, ip_sioctl_get_lifnum, NULL },
 997  998          /* 131 */ { SIOCGLIFMUXID, sizeof (struct lifreq),
 998  999                          IPI_GET_CMD, LIF_CMD, ip_sioctl_get_muxid, NULL },
 999 1000          /* 132 */ { SIOCSLIFMUXID, sizeof (struct lifreq),
1000 1001                          IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_muxid, NULL },
1001 1002          /* 133 */ { SIOCGLIFINDEX, sizeof (struct lifreq),
1002 1003                          IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifindex, NULL },
1003 1004          /* 134 */ { SIOCSLIFINDEX, sizeof (struct lifreq),
1004 1005                          IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifindex, NULL },
1005 1006          /* 135 */ { SIOCSLIFTOKEN, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
1006 1007                          LIF_CMD, ip_sioctl_token, NULL },
1007 1008          /* 136 */ { SIOCGLIFTOKEN, sizeof (struct lifreq),
1008 1009                          IPI_GET_CMD, LIF_CMD, ip_sioctl_get_token, NULL },
1009 1010          /* 137 */ { SIOCSLIFSUBNET, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
1010 1011                          LIF_CMD, ip_sioctl_subnet, ip_sioctl_subnet_restart },
1011 1012          /* 138 */ { SIOCGLIFSUBNET, sizeof (struct lifreq),
1012 1013                          IPI_GET_CMD, LIF_CMD, ip_sioctl_get_subnet, NULL },
1013 1014          /* 139 */ { SIOCSLIFLNKINFO, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
1014 1015                          LIF_CMD, ip_sioctl_lnkinfo, NULL },
1015 1016  
1016 1017          /* 140 */ { SIOCGLIFLNKINFO, sizeof (struct lifreq),
1017 1018                          IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lnkinfo, NULL },
1018 1019          /* 141 */ { SIOCLIFDELND, sizeof (struct lifreq), IPI_PRIV,
1019 1020                          LIF_CMD, ip_siocdelndp_v6, NULL },
1020 1021          /* 142 */ { SIOCLIFGETND, sizeof (struct lifreq), IPI_GET_CMD,
1021 1022                          LIF_CMD, ip_siocqueryndp_v6, NULL },
1022 1023          /* 143 */ { SIOCLIFSETND, sizeof (struct lifreq), IPI_PRIV,
1023 1024                          LIF_CMD, ip_siocsetndp_v6, NULL },
1024 1025          /* 144 */ { SIOCTMYADDR, sizeof (struct sioc_addrreq), IPI_GET_CMD,
1025 1026                          MISC_CMD, ip_sioctl_tmyaddr, NULL },
1026 1027          /* 145 */ { SIOCTONLINK, sizeof (struct sioc_addrreq), IPI_GET_CMD,
1027 1028                          MISC_CMD, ip_sioctl_tonlink, NULL },
1028 1029          /* 146 */ { SIOCTMYSITE, sizeof (struct sioc_addrreq), 0,
1029 1030                          MISC_CMD, ip_sioctl_tmysite, NULL },
1030 1031          /* 147 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1031 1032          /* 148 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1032 1033  
1033 1034          /* Old *IPSECONFIG ioctls are now deprecated, now see spdsock.c */
1034 1035          /* 149 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1035 1036          /* 150 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1036 1037          /* 151 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1037 1038          /* 152 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1038 1039  
1039 1040          /* 153 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1040 1041  
1041 1042          /* 154 */ { SIOCGLIFBINDING, sizeof (struct lifreq), IPI_GET_CMD,
1042 1043                          LIF_CMD, ip_sioctl_get_binding, NULL },
1043 1044          /* 155 */ { SIOCSLIFGROUPNAME, sizeof (struct lifreq),
1044 1045                          IPI_PRIV | IPI_WR,
1045 1046                          LIF_CMD, ip_sioctl_groupname, ip_sioctl_groupname },
1046 1047          /* 156 */ { SIOCGLIFGROUPNAME, sizeof (struct lifreq),
1047 1048                          IPI_GET_CMD, LIF_CMD, ip_sioctl_get_groupname, NULL },
1048 1049          /* 157 */ { SIOCGLIFGROUPINFO, sizeof (lifgroupinfo_t),
1049 1050                          IPI_GET_CMD, MISC_CMD, ip_sioctl_groupinfo, NULL },
1050 1051  
1051 1052          /* Leave 158-160 unused; used to be SIOC*IFARP ioctls */
1052 1053          /* 158 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1053 1054          /* 159 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1054 1055          /* 160 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1055 1056  
1056 1057          /* 161 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1057 1058  
1058 1059          /* These are handled in ip_sioctl_copyin_setup itself */
1059 1060          /* 162 */ { SIOCGIP6ADDRPOLICY, 0, IPI_NULL_BCONT,
1060 1061                          MISC_CMD, NULL, NULL },
1061 1062          /* 163 */ { SIOCSIP6ADDRPOLICY, 0, IPI_PRIV | IPI_NULL_BCONT,
1062 1063                          MISC_CMD, NULL, NULL },
1063 1064          /* 164 */ { SIOCGDSTINFO, 0, IPI_GET_CMD, MISC_CMD, NULL, NULL },
1064 1065  
1065 1066          /* 165 */ { SIOCGLIFCONF, 0, IPI_GET_CMD, MISC_CMD,
1066 1067                          ip_sioctl_get_lifconf, NULL },
1067 1068  
1068 1069          /* 166 */ { SIOCSXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR,
1069 1070                          XARP_CMD, ip_sioctl_arp, NULL },
1070 1071          /* 167 */ { SIOCGXARP, sizeof (struct xarpreq), IPI_GET_CMD,
1071 1072                          XARP_CMD, ip_sioctl_arp, NULL },
1072 1073          /* 168 */ { SIOCDXARP, sizeof (struct xarpreq), IPI_PRIV | IPI_WR,
1073 1074                          XARP_CMD, ip_sioctl_arp, NULL },
1074 1075  
1075 1076          /* SIOCPOPSOCKFS is not handled by IP */
1076 1077          /* 169 */ { IPI_DONTCARE /* SIOCPOPSOCKFS */, 0, 0, 0, NULL, NULL },
1077 1078  
1078 1079          /* 170 */ { SIOCGLIFZONE, sizeof (struct lifreq),
1079 1080                          IPI_GET_CMD, LIF_CMD, ip_sioctl_get_lifzone, NULL },
1080 1081          /* 171 */ { SIOCSLIFZONE, sizeof (struct lifreq),
1081 1082                          IPI_PRIV | IPI_WR, LIF_CMD, ip_sioctl_slifzone,
1082 1083                          ip_sioctl_slifzone_restart },
1083 1084          /* 172-174 are SCTP ioctls and not handled by IP */
1084 1085          /* 172 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1085 1086          /* 173 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1086 1087          /* 174 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1087 1088          /* 175 */ { SIOCGLIFUSESRC, sizeof (struct lifreq),
1088 1089                          IPI_GET_CMD, LIF_CMD,
1089 1090                          ip_sioctl_get_lifusesrc, NULL },
1090 1091          /* 176 */ { SIOCSLIFUSESRC, sizeof (struct lifreq),
1091 1092                          IPI_PRIV | IPI_WR,
1092 1093                          LIF_CMD, ip_sioctl_slifusesrc,
1093 1094                          NULL },
1094 1095          /* 177 */ { SIOCGLIFSRCOF, 0, IPI_GET_CMD, MISC_CMD,
1095 1096                          ip_sioctl_get_lifsrcof, NULL },
1096 1097          /* 178 */ { SIOCGMSFILTER, sizeof (struct group_filter), IPI_GET_CMD,
1097 1098                          MSFILT_CMD, ip_sioctl_msfilter, NULL },
1098 1099          /* 179 */ { SIOCSMSFILTER, sizeof (struct group_filter), 0,
1099 1100                          MSFILT_CMD, ip_sioctl_msfilter, NULL },
1100 1101          /* 180 */ { SIOCGIPMSFILTER, sizeof (struct ip_msfilter), IPI_GET_CMD,
1101 1102                          MSFILT_CMD, ip_sioctl_msfilter, NULL },
1102 1103          /* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), 0,
1103 1104                          MSFILT_CMD, ip_sioctl_msfilter, NULL },
1104 1105          /* 182 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL },
1105 1106          /* SIOCSENABLESDP is handled by SDP */
1106 1107          /* 183 */ { IPI_DONTCARE /* SIOCSENABLESDP */, 0, 0, 0, NULL, NULL },
1107 1108          /* 184 */ { IPI_DONTCARE /* SIOCSQPTR */, 0, 0, 0, NULL, NULL },
1108 1109          /* 185 */ { SIOCGIFHWADDR, sizeof (struct ifreq), IPI_GET_CMD,
1109 1110                          IF_CMD, ip_sioctl_get_ifhwaddr, NULL },
1110 1111          /* 186 */ { IPI_DONTCARE /* SIOCGSTAMP */, 0, 0, 0, NULL, NULL },
1111 1112          /* 187 */ { SIOCILB, 0, IPI_PRIV | IPI_GET_CMD, MISC_CMD,
1112 1113                          ip_sioctl_ilb_cmd, NULL },
1113 1114          /* 188 */ { SIOCGETPROP, 0, IPI_GET_CMD, 0, NULL, NULL },
1114 1115          /* 189 */ { SIOCSETPROP, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
1115 1116          /* 190 */ { SIOCGLIFDADSTATE, sizeof (struct lifreq),
1116 1117                          IPI_GET_CMD, LIF_CMD, ip_sioctl_get_dadstate, NULL },
1117 1118          /* 191 */ { SIOCSLIFPREFIX, sizeof (struct lifreq), IPI_PRIV | IPI_WR,
1118 1119                          LIF_CMD, ip_sioctl_prefix, ip_sioctl_prefix_restart },
1119 1120          /* 192 */ { SIOCGLIFHWADDR, sizeof (struct lifreq), IPI_GET_CMD,
1120 1121                          LIF_CMD, ip_sioctl_get_lifhwaddr, NULL }
1121 1122  };
1122 1123  
           /*
            * Number of entries in ip_ndx_ioctl_table. It is derived from the size of
            * the table, so it follows the initializer automatically.
            */
1123 1124  int ip_ndx_ioctl_count = sizeof (ip_ndx_ioctl_table) / sizeof (ip_ioctl_cmd_t);
1124 1125  
           /*
            * Second ioctl table, holding the STREAMS link/unlink requests, ND_GET,
            * ND_SET, IP_IOCTL and the three SIOCGET*CNT requests that are dispatched
            * to mrt_ioctl.
            *
            * The initializers are positional. NOTE(review): judging by the entries
            * visible in this file the order is command, copyin size, IPI_* flags,
            * command type, handler, restart handler - confirm against the definition
            * of ip_ioctl_cmd_t. The mrt_ioctl entries give only five values, so their
            * last member is implicitly zero-initialized.
            */
1125 1126  ip_ioctl_cmd_t ip_misc_ioctl_table[] = {
1126 1127          { I_LINK,       0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
1127 1128          { I_UNLINK,     0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
1128 1129          { I_PLINK,      0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
1129 1130          { I_PUNLINK,    0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
1130 1131          { ND_GET,       0, 0, 0, NULL, NULL },
1131 1132          { ND_SET,       0, IPI_PRIV | IPI_WR, 0, NULL, NULL },
1132 1133          { IP_IOCTL,     0, 0, 0, NULL, NULL },
1133 1134          { SIOCGETVIFCNT, sizeof (struct sioc_vif_req), IPI_GET_CMD,
1134 1135                  MISC_CMD, mrt_ioctl},
1135 1136          { SIOCGETSGCNT, sizeof (struct sioc_sg_req), IPI_GET_CMD,
1136 1137                  MISC_CMD, mrt_ioctl},
1137 1138          { SIOCGETLSGCNT, sizeof (struct sioc_lsg_req), IPI_GET_CMD,
1138 1139                  MISC_CMD, mrt_ioctl}
1139 1140  };
1140 1141  
           /* Number of entries in ip_misc_ioctl_table, derived from its size. */
1141 1142  int ip_misc_ioctl_count =
1142 1143      sizeof (ip_misc_ioctl_table) / sizeof (ip_ioctl_cmd_t);
1143 1144  
1144 1145  int     conn_drain_nthreads;            /* Number of drainers required. */
1145 1146                                          /* Settable in /etc/system */
1146 1147  /* IRE bucket count limits and ratios; defined in ip_ire.c */
1147 1148  extern uint32_t ip_ire_max_bucket_cnt, ip6_ire_max_bucket_cnt;
1148 1149  extern uint32_t ip_ire_min_bucket_cnt, ip6_ire_min_bucket_cnt;
1149 1150  extern uint32_t ip_ire_mem_ratio, ip_ire_cpu_ratio;
1150 1151  
           /*
            * Name/value table pairing each IRE_* type flag with a short string name.
            * The final all-zero entry presumably serves as the list terminator;
            * confirm against the code that walks ire_nv_tbl.
            */
1151 1152  static nv_t     ire_nv_arr[] = {
1152 1153          { IRE_BROADCAST, "BROADCAST" },
1153 1154          { IRE_LOCAL, "LOCAL" },
1154 1155          { IRE_LOOPBACK, "LOOPBACK" },
1155 1156          { IRE_DEFAULT, "DEFAULT" },
1156 1157          { IRE_PREFIX, "PREFIX" },
1157 1158          { IRE_IF_NORESOLVER, "IF_NORESOL" },
1158 1159          { IRE_IF_RESOLVER, "IF_RESOLV" },
1159 1160          { IRE_IF_CLONE, "IF_CLONE" },
1160 1161          { IRE_HOST, "HOST" },
1161 1162          { IRE_MULTICAST, "MULTICAST" },
1162 1163          { IRE_NOROUTE, "NOROUTE" },
1163 1164          { 0 }
1164 1165  };
1165 1166  
           /* Non-static pointer through which the file-local table is reachable. */
1166 1167  nv_t    *ire_nv_tbl = ire_nv_arr;
1167 1168  
1168 1169  /*
            * Simple ICMP IP Header Template.
            *
            * The initializer is positional: IP_SIMPLE_HDR_VERSION comes first, five
            * zeroed members follow, and IPPROTO_ICMP is the seventh value. Members
            * not listed are zero-initialized.
            * NOTE(review): the seventh member of ipha_t is presumably the protocol
            * field - confirm against the ipha_t definition.
            */
1169 1170  static ipha_t icmp_ipha = {
1170 1171          IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
1171 1172  };
1172 1173  
           /*
            * STREAMS module_info for IP; the qinit structures defined in this file
            * point at it. Per module_info(9S) the members are, in order: module id,
            * module name, minimum packet size, maximum packet size, high water mark
            * and low water mark.
            */
1173 1174  struct module_info ip_mod_info = {
1174 1175          IP_MOD_ID, IP_MOD_NAME, IP_MOD_MINPSZ, IP_MOD_MAXPSZ, IP_MOD_HIWAT,
1175 1176          IP_MOD_LOWAT
1176 1177  };
1177 1178  
1178 1179  /*
1179 1180   * Duplicate static symbols within a module confuse mdb, so we avoid the
1180 1181   * problem by making the symbols here distinct from those in udp.c.
1181 1182   */
1182 1183  
1183 1184  /*
1184 1185   * Entry points for IP as a device and as a module.
1185 1186   * We have separate open functions for the /dev/ip and /dev/ip6 devices.
            *
            * The qinit initializers below are positional. Per qinit(9S) the order is:
            * put procedure, service procedure, open, close, admin (unused, NULL) and
            * the module_info pointer. Trailing members are left zero.
1186 1187   */
           /* Read side for /dev/ip: put is ip_rput, no service procedure. */
1187 1188  static struct qinit iprinitv4 = {
1188 1189          (pfi_t)ip_rput, NULL, ip_openv4, ip_close, NULL,
1189 1190          &ip_mod_info
1190 1191  };
1191 1192  
           /* Read side for /dev/ip6: put is ip_rput_v6, opened through ip_openv6. */
1192 1193  struct qinit iprinitv6 = {
1193 1194          (pfi_t)ip_rput_v6, NULL, ip_openv6, ip_close, NULL,
1194 1195          &ip_mod_info
1195 1196  };
1196 1197  
           /*
            * Write side used by both streamtabs below: put is ip_wput_nondata and
            * the service procedure is ip_wsrv. No open/close entries are given here.
            */
1197 1198  static struct qinit ipwinit = {
1198 1199          (pfi_t)ip_wput_nondata, (pfi_t)ip_wsrv, NULL, NULL, NULL,
1199 1200          &ip_mod_info
1200 1201  };
1201 1202  
           /* Lower (multiplexor) read side: put is ip_lrput. */
1202 1203  static struct qinit iplrinit = {
1203 1204          (pfi_t)ip_lrput, NULL, ip_openv4, ip_close, NULL,
1204 1205          &ip_mod_info
1205 1206  };
1206 1207  
           /* Lower (multiplexor) write side: put is ip_lwput. */
1207 1208  static struct qinit iplwinit = {
1208 1209          (pfi_t)ip_lwput, NULL, NULL, NULL, NULL,
1209 1210          &ip_mod_info
1210 1211  };
1211 1212  
           /*
            * Per streamtab(9S) the members are, in order: read qinit, write qinit,
            * lower read qinit and lower write qinit. The two streamtabs differ only
            * in the read-side qinit.
            */
1212 1213  /* For AF_INET aka /dev/ip */
1213 1214  struct streamtab ipinfov4 = {
1214 1215          &iprinitv4, &ipwinit, &iplrinit, &iplwinit
1215 1216  };
1216 1217  
1217 1218  /* For AF_INET6 aka /dev/ip6 */
1218 1219  struct streamtab ipinfov6 = {
1219 1220          &iprinitv6, &ipwinit, &iplrinit, &iplwinit
1220 1221  };
1221 1222  
           /*
            * Flag that exists only in DEBUG builds and starts out as B_FALSE.
            * NOTE(review): by its name it presumably lets SCTP checksum handling be
            * skipped for testing - confirm at its point of use.
            */
1222 1223  #ifdef  DEBUG
1223 1224  boolean_t skip_sctp_cksum = B_FALSE;
1224 1225  #endif
1225 1226  
1226 1227  /*
1227 1228   * Generate an ICMP fragmentation needed message.
1228 1229   * When called from ip_output side a minimal ip_recv_attr_t needs to be
1229 1230   * constructed by the caller.
            *
            * mp:  the packet the error is about. It is first passed through
            *      icmp_pkt_err_ok(); if that returns NULL nothing is sent.
            * mtu: the MTU to report. It is truncated to 16 bits and stored in
            *      network byte order in the ICMP header.
            * ira: receive attributes. ira->ira_ill is dereferenced to find the IP
            *      stack instance, so it must be set.
1230 1231   */
1231 1232  void
1232 1233  icmp_frag_needed(mblk_t *mp, int mtu, ip_recv_attr_t *ira)
1233 1234  {
1234 1235          icmph_t icmph;
1235 1236          ip_stack_t      *ipst = ira->ira_ill->ill_ipst;
1236 1237  
                   /* Continue with whatever message icmp_pkt_err_ok() hands back. */
1237 1238          mp = icmp_pkt_err_ok(mp, ira);
1238 1239          if (mp == NULL)
1239 1240                  return;
1240 1241  
                   /* Build the ICMP header on the stack; unset fields stay zero. */
1241 1242          bzero(&icmph, sizeof (icmph_t));
1242 1243          icmph.icmph_type = ICMP_DEST_UNREACHABLE;
1243 1244          icmph.icmph_code = ICMP_FRAGMENTATION_NEEDED;
1244 1245          icmph.icmph_du_mtu = htons((uint16_t)mtu);
1245 1246          BUMP_MIB(&ipst->ips_icmp_mib, icmpOutFragNeeded);
1246 1247          BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs);
1247 1248  
1248 1249          icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
1249 1250  }
1250 1251  
1251 1252  /*
1252 1253   * icmp_inbound_v4 deals with ICMP messages that are handled by IP.
1253 1254   * If the ICMP message is consumed by IP, i.e., it should not be delivered
1254 1255   * to any IPPROTO_ICMP raw sockets, then it returns NULL.
1255 1256   * Likewise, if the ICMP error is malformed (too short, etc), then it
1256 1257   * returns NULL. The caller uses this to determine whether or not to send
1257 1258   * to raw sockets.
1258 1259   *
1259 1260   * All error messages are passed to the matching transport stream.
1260 1261   *
1261 1262   * The following cases are handled by icmp_inbound:
1262 1263   * 1) It needs to send a reply back and possibly delivering it
1263 1264   *    to the "interested" upper clients.
1264 1265   * 2) Return the mblk so that the caller can pass it to the RAW socket clients.
1265 1266   * 3) It needs to change some values in IP only.
1266 1267   * 4) It needs to change some values in IP and upper layers e.g TCP
1267 1268   *    by delivering an error to the upper layers.
1268 1269   *
1269 1270   * We handle the above four cases in the context of IPsec in the
1270 1271   * following way :
1271 1272   *
1272 1273   * 1) Send the reply back in the same way as the request came in.
1273 1274   *    If it came in encrypted, it goes out encrypted. If it came in
1274 1275   *    clear, it goes out in clear. Thus, this will prevent chosen
1275 1276   *    plain text attack.
1276 1277   * 2) The client may or may not expect things to come in secure.
1277 1278   *    If it comes in secure, the policy constraints are checked
1278 1279   *    before delivering it to the upper layers. If it comes in
1279 1280   *    clear, ipsec_inbound_accept_clear will decide whether to
1280 1281   *    accept this in clear or not. In both the cases, if the returned
1281 1282   *    message (IP header + 8 bytes) that caused the icmp message has
1282 1283   *    AH/ESP headers, it is sent up to AH/ESP for validation before
1283 1284   *    sending up. If there are only 8 bytes of returned message, then
1284 1285   *    upper client will not be notified.
1285 1286   * 3) Check with global policy to see whether it matches the constraints.
1286 1287   *    But this will be done only if icmp_accept_messages_in_clear is
1287 1288   *    zero.
1288 1289   * 4) If we need to change both in IP and ULP, then the decision taken
1289 1290   *    while affecting the values in IP and while delivering up to TCP
1290 1291   *    should be the same.
1291 1292   *
1292 1293   *      There are two cases.
1293 1294   *
1294 1295   *      a) If we reject data at the IP layer (ipsec_check_global_policy()
1295 1296   *         failed), we will not deliver it to the ULP, even though they
1296 1297   *         are *willing* to accept in *clear*. This is fine as our global
1297 1298   *         disposition to icmp messages asks us reject the datagram.
1298 1299   *
1299 1300   *      b) If we accept data at the IP layer (ipsec_check_global_policy()
1300 1301   *         succeeded or icmp_accept_messages_in_clear is 1), and not able
1301 1302   *         to deliver it to ULP (policy failed), it can lead to
1302 1303   *         consistency problems. The cases known at this time are
1303 1304   *         ICMP_DESTINATION_UNREACHABLE  messages with following code
1304 1305   *         values :
1305 1306   *
1306 1307   *         - ICMP_FRAGMENTATION_NEEDED : IP adapts to the new value
1307 1308   *           and Upper layer rejects. Then the communication will
1308 1309   *           come to a stop. This is solved by making similar decisions
1309 1310   *           at both levels. Currently, when we are unable to deliver
1310 1311   *           to the Upper Layer (due to policy failures) while IP has
1311 1312   *           adjusted dce_pmtu, the next outbound datagram would
1312 1313   *           generate a local ICMP_FRAGMENTATION_NEEDED message - which
1313 1314   *           will be with the right level of protection. Thus the right
1314 1315   *           value will be communicated even if we are not able to
1315 1316   *           communicate when we get from the wire initially. But this
1316 1317   *           assumes there would be at least one outbound datagram after
1317 1318   *           IP has adjusted its dce_pmtu value. To make things
1318 1319   *           simpler, we accept in clear after the validation of
1319 1320   *           AH/ESP headers.
1320 1321   *
1321 1322   *         - Other ICMP ERRORS : We may not be able to deliver it to the
1322 1323   *           upper layer depending on the level of protection the upper
1323 1324   *           layer expects and the disposition in ipsec_inbound_accept_clear().
1324 1325   *           ipsec_inbound_accept_clear() decides whether a given ICMP error
1325 1326   *           should be accepted in clear when the Upper layer expects secure.
1326 1327   *           Thus the communication may get aborted by some bad ICMP
1327 1328   *           packets.
1328 1329   */
1329 1330  mblk_t *
1330 1331  icmp_inbound_v4(mblk_t *mp, ip_recv_attr_t *ira)
1331 1332  {
1332 1333          icmph_t         *icmph;
1333 1334          ipha_t          *ipha;          /* Outer header */
1334 1335          int             ip_hdr_length;  /* Outer header length */
1335 1336          boolean_t       interested;
1336 1337          ipif_t          *ipif;
1337 1338          uint32_t        ts;
1338 1339          uint32_t        *tsp;
1339 1340          timestruc_t     now;
1340 1341          ill_t           *ill = ira->ira_ill;
1341 1342          ip_stack_t      *ipst = ill->ill_ipst;
1342 1343          zoneid_t        zoneid = ira->ira_zoneid;
1343 1344          int             len_needed;
1344 1345          mblk_t          *mp_ret = NULL; /* Copy handed back for RAW sockets */
1345 1346  
1346 1347          ipha = (ipha_t *)mp->b_rptr;
1347 1348  
1348 1349          BUMP_MIB(&ipst->ips_icmp_mib, icmpInMsgs);
1349 1350  
                   /*
                    * Make sure the first mblk holds the IP header plus the fixed
                    * ICMP header. If the whole packet (ira_pktlen) is too short it
                    * is dropped; otherwise ip_pullup() is asked for that many bytes.
                    */
1350 1351          ip_hdr_length = ira->ira_ip_hdr_length;
1351 1352          if ((mp->b_wptr - mp->b_rptr) < (ip_hdr_length + ICMPH_SIZE)) {
1352 1353                  if (ira->ira_pktlen < (ip_hdr_length + ICMPH_SIZE)) {
1353 1354                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
1354 1355                          ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
1355 1356                          freemsg(mp);
1356 1357                          return (NULL);
1357 1358                  }
1358 1359                  /* Last chance to get real. */
1359 1360                  ipha = ip_pullup(mp, ip_hdr_length + ICMPH_SIZE, ira);
1360 1361                  if (ipha == NULL) {
1361 1362                          BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
1362 1363                          freemsg(mp);
1363 1364                          return (NULL);
1364 1365                  }
1365 1366          }
1366 1367  
1367 1368          /* The IP header will always be a multiple of four bytes */
1368 1369          icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1369 1370          ip2dbg(("icmp_inbound_v4: type %d code %d\n", icmph->icmph_type,
1370 1371              icmph->icmph_code));
1371 1372  
1372 1373          /*
1373 1374           * We will set "interested" to "true" if we should pass a copy to
1374 1375           * the transport or if we handle the packet locally.
1375 1376           */
1376 1377          interested = B_FALSE;
1377 1378          switch (icmph->icmph_type) {
1378 1379          case ICMP_ECHO_REPLY:
1379 1380                  BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchoReps);
1380 1381                  break;
1381 1382          case ICMP_DEST_UNREACHABLE:
1382 1383                  if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED)
1383 1384                          BUMP_MIB(&ipst->ips_icmp_mib, icmpInFragNeeded);
1384 1385                  interested = B_TRUE;    /* Pass up to transport */
1385 1386                  BUMP_MIB(&ipst->ips_icmp_mib, icmpInDestUnreachs);
1386 1387                  break;
1387 1388          case ICMP_SOURCE_QUENCH:
1388 1389                  interested = B_TRUE;    /* Pass up to transport */
1389 1390                  BUMP_MIB(&ipst->ips_icmp_mib, icmpInSrcQuenchs);
1390 1391                  break;
1391 1392          case ICMP_REDIRECT:
                           /* Redirects are only acted upon when not administratively ignored */
1392 1393                  if (!ipst->ips_ip_ignore_redirect)
1393 1394                          interested = B_TRUE;
1394 1395                  BUMP_MIB(&ipst->ips_icmp_mib, icmpInRedirects);
1395 1396                  break;
1396 1397          case ICMP_ECHO_REQUEST:
1397 1398                  /*
1398 1399                   * Whether to respond to echo requests that come in as IP
1399 1400                   * broadcasts or as IP multicast is subject to debate
1400 1401                   * (what isn't?).  We aim to please, you pick it.
1401 1402                   * Default is do it.
1402 1403                   */
1403 1404                  if (ira->ira_flags & IRAF_MULTICAST) {
1404 1405                          /* multicast: respond based on tunable */
1405 1406                          interested = ipst->ips_ip_g_resp_to_echo_mcast;
1406 1407                  } else if (ira->ira_flags & IRAF_BROADCAST) {
1407 1408                          /* broadcast: respond based on tunable */
1408 1409                          interested = ipst->ips_ip_g_resp_to_echo_bcast;
1409 1410                  } else {
1410 1411                          /* unicast: always respond */
1411 1412                          interested = B_TRUE;
1412 1413                  }
1413 1414                  BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchos);
1414 1415                  if (!interested) {
1415 1416                          /* We never pass these to RAW sockets */
1416 1417                          freemsg(mp);
1417 1418                          return (NULL);
1418 1419                  }
1419 1420  
1420 1421                  /* Check db_ref to make sure we can modify the packet. */
1421 1422                  if (mp->b_datap->db_ref > 1) {
1422 1423                          mblk_t  *mp1;
1423 1424  
1424 1425                          mp1 = copymsg(mp);
1425 1426                          freemsg(mp);
1426 1427                          if (!mp1) {
1427 1428                                  BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
1428 1429                                  return (NULL);
1429 1430                          }
1430 1431                          mp = mp1;
                                   /* Re-derive the header pointers from the private copy. */
1431 1432                          ipha = (ipha_t *)mp->b_rptr;
1432 1433                          icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1433 1434                  }
                           /* Turn the request into a reply in place and send it back. */
1434 1435                  icmph->icmph_type = ICMP_ECHO_REPLY;
1435 1436                  BUMP_MIB(&ipst->ips_icmp_mib, icmpOutEchoReps);
1436 1437                  icmp_send_reply_v4(mp, ipha, icmph, ira);
1437 1438                  return (NULL);
1438 1439  
1439 1440          case ICMP_ROUTER_ADVERTISEMENT:
1440 1441          case ICMP_ROUTER_SOLICITATION:
                           /* Not acted upon here; "interested" stays B_FALSE. */
1441 1442                  break;
1442 1443          case ICMP_TIME_EXCEEDED:
1443 1444                  interested = B_TRUE;    /* Pass up to transport */
1444 1445                  BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimeExcds);
1445 1446                  break;
1446 1447          case ICMP_PARAM_PROBLEM:
1447 1448                  interested = B_TRUE;    /* Pass up to transport */
1448 1449                  BUMP_MIB(&ipst->ips_icmp_mib, icmpInParmProbs);
1449 1450                  break;
1450 1451          case ICMP_TIME_STAMP_REQUEST:
1451 1452                  /* Response to Time Stamp Requests is local policy. */
1452 1453                  if (ipst->ips_ip_g_resp_to_timestamp) {
1453 1454                          if (ira->ira_flags & IRAF_MULTIBROADCAST)
1454 1455                                  interested =
1455 1456                                      ipst->ips_ip_g_resp_to_timestamp_bcast;
1456 1457                          else
1457 1458                                  interested = B_TRUE;
1458 1459                  }
1459 1460                  if (!interested) {
1460 1461                          /* We never pass these to RAW sockets */
1461 1462                          freemsg(mp);
1462 1463                          return (NULL);
1463 1464                  }
1464 1465  
1465 1466                  /* Make sure we have enough of the packet */
1466 1467                  len_needed = ip_hdr_length + ICMPH_SIZE +
1467 1468                      3 * sizeof (uint32_t);
1468 1469  
1469 1470                  if (mp->b_wptr - mp->b_rptr < len_needed) {
1470 1471                          ipha = ip_pullup(mp, len_needed, ira);
1471 1472                          if (ipha == NULL) {
1472 1473                                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1473 1474                                  ip_drop_input("ipIfStatsInDiscards - ip_pullup",
1474 1475                                      mp, ill);
1475 1476                                  freemsg(mp);
1476 1477                                  return (NULL);
1477 1478                          }
1478 1479                          /* Refresh following the pullup. */
1479 1480                          icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1480 1481                  }
1481 1482                  BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestamps);
1482 1483                  /* Check db_ref to make sure we can modify the packet. */
1483 1484                  if (mp->b_datap->db_ref > 1) {
1484 1485                          mblk_t  *mp1;
1485 1486  
1486 1487                          mp1 = copymsg(mp);
1487 1488                          freemsg(mp);
1488 1489                          if (!mp1) {
1489 1490                                  BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
1490 1491                                  return (NULL);
1491 1492                          }
1492 1493                          mp = mp1;
1493 1494                          ipha = (ipha_t *)mp->b_rptr;
1494 1495                          icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1495 1496                  }
1496 1497                  icmph->icmph_type = ICMP_TIME_STAMP_REPLY;
                           /* The three 32-bit timestamps follow the ICMP header. */
1497 1498                  tsp = (uint32_t *)&icmph[1];
1498 1499                  tsp++;          /* Skip past 'originate time' */
1499 1500                  /* Compute # of milliseconds since midnight */
1500 1501                  gethrestime(&now);
1501 1502                  ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
1502 1503                      NSEC2MSEC(now.tv_nsec);
                           /* The same value is used for both the receive and send time. */
1503 1504                  *tsp++ = htonl(ts);     /* Lay in 'receive time' */
1504 1505                  *tsp++ = htonl(ts);     /* Lay in 'send time' */
1505 1506                  BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimestampReps);
1506 1507                  icmp_send_reply_v4(mp, ipha, icmph, ira);
1507 1508                  return (NULL);
1508 1509  
1509 1510          case ICMP_TIME_STAMP_REPLY:
1510 1511                  BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestampReps);
1511 1512                  break;
1512 1513          case ICMP_INFO_REQUEST:
1513 1514                  /* Per RFC 1122 3.2.2.7, ignore this. */
1514 1515          case ICMP_INFO_REPLY:
1515 1516                  break;
1516 1517          case ICMP_ADDRESS_MASK_REQUEST:
1517 1518                  if (ira->ira_flags & IRAF_MULTIBROADCAST) {
1518 1519                          interested =
1519 1520                              ipst->ips_ip_respond_to_address_mask_broadcast;
1520 1521                  } else {
1521 1522                          interested = B_TRUE;
1522 1523                  }
1523 1524                  if (!interested) {
1524 1525                          /* We never pass these to RAW sockets */
1525 1526                          freemsg(mp);
1526 1527                          return (NULL);
1527 1528                  }
                           /* The request must carry room for one IPv4 address (the mask). */
1528 1529                  len_needed = ip_hdr_length + ICMPH_SIZE + IP_ADDR_LEN;
1529 1530                  if (mp->b_wptr - mp->b_rptr < len_needed) {
1530 1531                          ipha = ip_pullup(mp, len_needed, ira);
1531 1532                          if (ipha == NULL) {
1532 1533                                  BUMP_MIB(ill->ill_ip_mib,
1533 1534                                      ipIfStatsInTruncatedPkts);
1534 1535                                  ip_drop_input("ipIfStatsInTruncatedPkts", mp,
1535 1536                                      ill);
1536 1537                                  freemsg(mp);
1537 1538                                  return (NULL);
1538 1539                          }
1539 1540                          /* Refresh following the pullup. */
1540 1541                          icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1541 1542                  }
1542 1543                  BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMasks);
1543 1544                  /* Check db_ref to make sure we can modify the packet. */
1544 1545                  if (mp->b_datap->db_ref > 1) {
1545 1546                          mblk_t  *mp1;
1546 1547  
1547 1548                          mp1 = copymsg(mp);
1548 1549                          freemsg(mp);
1549 1550                          if (!mp1) {
1550 1551                                  BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
1551 1552                                  return (NULL);
1552 1553                          }
1553 1554                          mp = mp1;
1554 1555                          ipha = (ipha_t *)mp->b_rptr;
1555 1556                          icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1556 1557                  }
1557 1558                  /*
1558 1559                   * Need the ipif with the mask be the same as the source
1559 1560                   * address of the mask reply. For unicast we have a specific
1560 1561                   * ipif. For multicast/broadcast we only handle onlink
1561 1562                   * senders, and use the source address to pick an ipif.
1562 1563                   */
1563 1564                  ipif = ipif_lookup_addr(ipha->ipha_dst, ill, zoneid, ipst);
1564 1565                  if (ipif == NULL) {
1565 1566                          /* Broadcast or multicast */
1566 1567                          ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid);
1567 1568                          if (ipif == NULL) {
1568 1569                                  freemsg(mp);
1569 1570                                  return (NULL);
1570 1571                          }
1571 1572                  }
1572 1573                  icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
                           /* The mask is written right after the ICMP header. */
1573 1574                  bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN);
                           /* Drop the ipif reference now that the mask has been copied. */
1574 1575                  ipif_refrele(ipif);
1575 1576                  BUMP_MIB(&ipst->ips_icmp_mib, icmpOutAddrMaskReps);
1576 1577                  icmp_send_reply_v4(mp, ipha, icmph, ira);
1577 1578                  return (NULL);
1578 1579  
1579 1580          case ICMP_ADDRESS_MASK_REPLY:
1580 1581                  BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMaskReps);
1581 1582                  break;
1582 1583          default:
1583 1584                  interested = B_TRUE;    /* Pass up to transport */
1584 1585                  BUMP_MIB(&ipst->ips_icmp_mib, icmpInUnknowns);
1585 1586                  break;
1586 1587          }
1587 1588          /*
1588 1589           * See if there is an ICMP client to avoid an extra copymsg/freemsg
1589 1590           * if there isn't one.
1590 1591           */
1591 1592          if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_ICMP].connf_head != NULL) {
1592 1593                  /* If there is an ICMP client and we want one too, copy it. */
1593 1594  
1594 1595                  if (!interested) {
1595 1596                          /* Caller will deliver to RAW sockets */
1596 1597                          return (mp);
1597 1598                  }
1598 1599                  mp_ret = copymsg(mp);
1599 1600                  if (mp_ret == NULL) {
                                   /*
                                    * mp is not freed here. mp_ret stays NULL, so the
                                    * caller gets nothing for RAW sockets, while mp
                                    * itself is still processed below.
                                    */
1600 1601                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1601 1602                          ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
1602 1603                  }
1603 1604          } else if (!interested) {
1604 1605                  /* Neither we nor raw sockets are interested. Drop packet now */
1605 1606                  freemsg(mp);
1606 1607                  return (NULL);
1607 1608          }
1608 1609  
1609 1610          /*
1610 1611           * ICMP error or redirect packet. Make sure we have enough of
1611 1612           * the header and that db_ref == 1 since we might end up modifying
1612 1613           * the packet.
1613 1614           */
1614 1615          if (mp->b_cont != NULL) {
                           /*
                            * NOTE(review): a length of -1 presumably asks ip_pullup()
                            * for the whole message - confirm against ip_pullup().
                            */
1615 1616                  if (ip_pullup(mp, -1, ira) == NULL) {
1616 1617                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1617 1618                          ip_drop_input("ipIfStatsInDiscards - ip_pullup",
1618 1619                              mp, ill);
1619 1620                          freemsg(mp);
1620 1621                          return (mp_ret);
1621 1622                  }
1622 1623          }
1623 1624  
1624 1625          if (mp->b_datap->db_ref > 1) {
1625 1626                  mblk_t  *mp1;
1626 1627  
1627 1628                  mp1 = copymsg(mp);
1628 1629                  if (mp1 == NULL) {
1629 1630                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1630 1631                          ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill);
1631 1632                          freemsg(mp);
1632 1633                          return (mp_ret);
1633 1634                  }
1634 1635                  freemsg(mp);
1635 1636                  mp = mp1;
1636 1637          }
1637 1638  
1638 1639          /*
1639 1640           * In case mp has changed, verify the message before any further
1640 1641           * processes.
1641 1642           */
1642 1643          ipha = (ipha_t *)mp->b_rptr;
1643 1644          icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length];
1644 1645          if (!icmp_inbound_verify_v4(mp, icmph, ira)) {
1645 1646                  freemsg(mp);
1646 1647                  return (mp_ret);
1647 1648          }
1648 1649  
                   /*
                    * Redirects go to icmp_redirect_v4(); every other type that gets
                    * this far is passed to icmp_inbound_error_fanout_v4(). mp is not
                    * touched again here after either call, and the value returned to
                    * the caller is mp_ret (the RAW socket copy, possibly NULL).
                    */
1649 1650          switch (icmph->icmph_type) {
1650 1651          case ICMP_REDIRECT:
1651 1652                  icmp_redirect_v4(mp, ipha, icmph, ira);
1652 1653                  break;
1653 1654          case ICMP_DEST_UNREACHABLE:
1654 1655                  if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) {
1655 1656                          /* Update DCE and adjust MTU in the icmp header if needed */
1656 1657                          icmp_inbound_too_big_v4(icmph, ira);
1657 1658                  }
1658 1659                  /* FALLTHRU */
1659 1660          default:
1660 1661                  icmp_inbound_error_fanout_v4(mp, icmph, ira);
1661 1662                  break;
1662 1663          }
1663 1664          return (mp_ret);
1664 1665  }
1665 1666  
1666 1667  /*
1667 1668   * Send an ICMP echo, timestamp or address mask reply.
1668 1669   * The caller has already updated the payload part of the packet.
1669 1670   * We handle the ICMP checksum, IP source address selection and feed
1670 1671   * the packet into ip_output_simple.
            *
            * mp:    the message holding the reply; it is passed on to
            *        ip_output_simple() or, on IPsec failure, to ipsec_in_to_out(),
            *        so the caller must not use it afterwards.
            * ipha:  the IP header inside mp; it is modified in place (TTL,
            *        addresses, ident, and possibly the DF flag).
            * icmph: the ICMP header inside mp; its checksum is recomputed.
            * ira:   receive attributes of the request being answered.
1671 1672   */
1672 1673  static void
1673 1674  icmp_send_reply_v4(mblk_t *mp, ipha_t *ipha, icmph_t *icmph,
1674 1675      ip_recv_attr_t *ira)
1675 1676  {
1676 1677          uint_t          ip_hdr_length = ira->ira_ip_hdr_length;
1677 1678          ill_t           *ill = ira->ira_ill;
1678 1679          ip_stack_t      *ipst = ill->ill_ipst;
1679 1680          ip_xmit_attr_t  ixas;
1680 1681  
1681 1682          /* Send out an ICMP packet */
                   /* The checksum field is cleared before the checksum is computed. */
1682 1683          icmph->icmph_checksum = 0;
1683 1684          icmph->icmph_checksum = IP_CSUM(mp, ip_hdr_length, 0);
1684 1685          /* Reset time to live. */
1685 1686          ipha->ipha_ttl = ipst->ips_ip_def_ttl;
1686 1687          {
1687 1688                  /* Swap source and destination addresses */
1688 1689                  ipaddr_t tmp;
1689 1690  
1690 1691                  tmp = ipha->ipha_src;
1691 1692                  ipha->ipha_src = ipha->ipha_dst;
1692 1693                  ipha->ipha_dst = tmp;
1693 1694          }
1694 1695          ipha->ipha_ident = 0;
                   /* A header that carries IP options needs them updated for the reply. */
1695 1696          if (!IS_SIMPLE_IPH(ipha))
1696 1697                  icmp_options_update(ipha);
1697 1698  
                   /* Build on-stack transmit attributes, starting from all zeroes. */
1698 1699          bzero(&ixas, sizeof (ixas));
1699 1700          ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
1700 1701          ixas.ixa_zoneid = ira->ira_zoneid;
1701 1702          ixas.ixa_cred = kcred;
1702 1703          ixas.ixa_cpid = NOPID;
1703 1704          ixas.ixa_tsl = ira->ira_tsl;    /* Behave as a multi-level responder */
1704 1705          ixas.ixa_ifindex = 0;
1705 1706          ixas.ixa_ipst = ipst;
1706 1707          ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1707 1708  
1708 1709          if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) {
1709 1710                  /*
1710 1711                   * This packet should go out the same way as it
1711 1712                   * came in i.e in clear, independent of the IPsec policy
1712 1713                   * for transmitting packets.
1713 1714                   */
1714 1715                  ixas.ixa_flags |= IXAF_NO_IPSEC;
1715 1716          } else {
                           /* Came in protected: derive the outbound IPsec state from ira. */
1716 1717                  if (!ipsec_in_to_out(ira, &ixas, mp, ipha, NULL)) {
1717 1718                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1718 1719                          /* Note: mp already consumed and ip_drop_packet done */
1719 1720                          return;
1720 1721                  }
1721 1722          }
1722 1723          if (ira->ira_flags & IRAF_MULTIBROADCAST) {
1723 1724                  /*
1724 1725                   * Not one of our addresses (IRE_LOCALs), thus we let
1725 1726                   * ip_output_simple pick the source.
1726 1727                   */
1727 1728                  ipha->ipha_src = INADDR_ANY;
1728 1729                  ixas.ixa_flags |= IXAF_SET_SOURCE;
1729 1730          }
1730 1731          /* Should we send with DF and use dce_pmtu? */
1731 1732          if (ipst->ips_ipv4_icmp_return_pmtu) {
1732 1733                  ixas.ixa_flags |= IXAF_PMTU_DISCOVERY;
1733 1734                  ipha->ipha_fragment_offset_and_flags |= IPH_DF_HTONS;
1734 1735          }
1735 1736  
1736 1737          BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs);
1737 1738  
                   /* The return value of ip_output_simple() is deliberately ignored. */
1738 1739          (void) ip_output_simple(mp, &ixas);
1739 1740          ixa_cleanup(&ixas);
1740 1741  }
1741 1742  
1742 1743  /*
1743 1744   * Verify an ICMP message that is either an ICMP error or a redirect packet.
1744 1745   * The caller should have fully pulled up the message. If it's a redirect
1745 1746   * packet, only basic checks on IP header will be done; otherwise, verify
1746 1747   * the packet by looking at the included ULP header.
1747 1748   *
1748 1749   * Called before icmp_inbound_error_fanout_v4 is called.
1749 1750   */
1750 1751  static boolean_t
1751 1752  icmp_inbound_verify_v4(mblk_t *mp, icmph_t *icmph, ip_recv_attr_t *ira)
1752 1753  {
1753 1754          ill_t           *ill = ira->ira_ill;
1754 1755          int             hdr_length;
1755 1756          ip_stack_t      *ipst = ira->ira_ill->ill_ipst;
1756 1757          conn_t          *connp;
1757 1758          ipha_t          *ipha;  /* Inner IP header */
1758 1759  
1759 1760          ipha = (ipha_t *)&icmph[1];
1760 1761          if ((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH > mp->b_wptr)
1761 1762                  goto truncated;
1762 1763  
1763 1764          hdr_length = IPH_HDR_LENGTH(ipha);
1764 1765  
1765 1766          if ((IPH_HDR_VERSION(ipha) != IPV4_VERSION))
1766 1767                  goto discard_pkt;
1767 1768  
1768 1769          if (hdr_length < sizeof (ipha_t))
1769 1770                  goto truncated;
1770 1771  
1771 1772          if ((uchar_t *)ipha + hdr_length > mp->b_wptr)
1772 1773                  goto truncated;
1773 1774  
1774 1775          /*
1775 1776           * Stop here for ICMP_REDIRECT; only the inner IP header is checked.
1776 1777           */
1777 1778          if (icmph->icmph_type == ICMP_REDIRECT)
1778 1779                  return (B_TRUE);
1779 1780  
1780 1781          /*
1781 1782           * ICMP errors only: check the transport header of the inner packet.
1782 1783           */
1783 1784          switch (ipha->ipha_protocol) {
1784 1785          case IPPROTO_UDP:
1785 1786                  /*
1786 1787                   * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
1787 1788                   * transport header.
1788 1789                   */
1789 1790                  if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
1790 1791                      mp->b_wptr)
1791 1792                          goto truncated;
1792 1793                  break;
1793 1794          case IPPROTO_TCP: {
1794 1795                  tcpha_t         *tcpha;
1795 1796  
1796 1797                  /*
1797 1798                   * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
1798 1799                   * transport header.
1799 1800                   */
1800 1801                  if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
1801 1802                      mp->b_wptr)
1802 1803                          goto truncated;
1803 1804  
1804 1805                  tcpha = (tcpha_t *)((uchar_t *)ipha + hdr_length);
1805 1806                  connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcpha, TCPS_LISTEN,
1806 1807                      ipst);
1807 1808                  if (connp == NULL)
1808 1809                          goto discard_pkt;
1809 1810  
1810 1811                  if ((connp->conn_verifyicmp != NULL) &&
1811 1812                      !connp->conn_verifyicmp(connp, tcpha, icmph, NULL, ira)) {
1812 1813                          CONN_DEC_REF(connp);
1813 1814                          goto discard_pkt;
1814 1815                  }
1815 1816                  CONN_DEC_REF(connp);
1816 1817                  break;
1817 1818          }
1818 1819          case IPPROTO_SCTP:
1819 1820                  /*
1820 1821                   * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of
1821 1822                   * transport header.
1822 1823                   */
1823 1824                  if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN >
1824 1825                      mp->b_wptr)
1825 1826                          goto truncated;
1826 1827                  break;
1827 1828          case IPPROTO_ESP:
1828 1829          case IPPROTO_AH:
1829 1830                  break;  /* no transport-header length check is done here */
1830 1831          case IPPROTO_ENCAP:
1831 1832                  if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) >
1832 1833                      mp->b_wptr)
1833 1834                          goto truncated;
1834 1835                  break;
1835 1836          default:
1836 1837                  break;
1837 1838          }
1838 1839  
1839 1840          return (B_TRUE);
1840 1841  
1841 1842  discard_pkt:
1842 1843          /* Bogus ICMP error; only the MIB counter is bumped on this path. */
1843 1844          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1844 1845          return (B_FALSE);
1845 1846  
1846 1847  truncated:
1847 1848          /* We pulled up everything already. Must be truncated */
1848 1849          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
1849 1850          ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
1850 1851          return (B_FALSE);
1851 1852  }
1852 1853  
1853 1854  /* MTU plateau table from RFC 1191, ordered from largest to smallest */
1854 1855  static int icmp_frag_size_table[] =
1855 1856  { 32000, 17914, 8166, 4352, 2002, 1496, 1006, 508, 296, 68 };
1856 1857  
1857 1858  /*
1858 1859   * Process received ICMP Packet too big.
1859 1860   * Just handles the DCE create/update, including using the above table of
1860 1861   * PMTU guesses. The caller is responsible for validating the packet before
1861 1862   * passing it in and also to fanout the ICMP error to any matching transport
1862 1863   * conns. Assumes the message has been fully pulled up and verified.
1863 1864   *
1864 1865   * Before getting here, the caller has called icmp_inbound_verify_v4()
1865 1866   * that should have verified with ULP to prevent undoing the changes we're
1866 1867   * going to make to DCE. For example, TCP might have verified that the packet
1867 1868   * which generated error is in the send window.
1868 1869   *
1869 1870   * This may modify the MTU in the ICMP header of the packet; the caller
1870 1871   * should pass the packet to the matching ULP after this returns.
1871 1872   */
1872 1873  static void
1873 1874  icmp_inbound_too_big_v4(icmph_t *icmph, ip_recv_attr_t *ira)
1874 1875  {
1875 1876          dce_t           *dce;
1876 1877          int             old_mtu;
1877 1878          int             mtu, orig_mtu;
1878 1879          ipaddr_t        dst;
1879 1880          boolean_t       disable_pmtud;
1880 1881          ill_t           *ill = ira->ira_ill;
1881 1882          ip_stack_t      *ipst = ill->ill_ipst;
1882 1883          uint_t          hdr_length;
1883 1884          ipha_t          *ipha;
1884 1885  
1885 1886          /* Caller already pulled up everything; inner header follows icmph. */
1886 1887          ipha = (ipha_t *)&icmph[1];
1887 1888          ASSERT(icmph->icmph_type == ICMP_DEST_UNREACHABLE &&
1888 1889              icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED);
1889 1890          ASSERT(ill != NULL);
1890 1891  
1891 1892          hdr_length = IPH_HDR_LENGTH(ipha);
1892 1893  
1893 1894          /*
1894 1895           * We handle path MTU for source routed packets since the DCE
1895 1896           * is looked up using the final destination.
1896 1897           */
1897 1898          dst = ip_get_dst(ipha);
1898 1899  
1899 1900          dce = dce_lookup_and_add_v4(dst, ipst);
1900 1901          if (dce == NULL) {
1901 1902                  /* Couldn't add a unique one - ENOMEM */
1902 1903                  ip1dbg(("icmp_inbound_too_big_v4: no dce for 0x%x\n",
1903 1904                      ntohl(dst)));
1904 1905                  return;
1905 1906          }
1906 1907  
1907 1908          /* Check for MTU discovery advice as described in RFC 1191 */
1908 1909          mtu = ntohs(icmph->icmph_du_mtu);
1909 1910          orig_mtu = mtu;
1910 1911          disable_pmtud = B_FALSE;
1911 1912  
1912 1913          mutex_enter(&dce->dce_lock);
1913 1914          if (dce->dce_flags & DCEF_PMTU)
1914 1915                  old_mtu = dce->dce_pmtu;
1915 1916          else
1916 1917                  old_mtu = ill->ill_mtu;
1917 1918  
1918 1919          if (icmph->icmph_du_zero != 0 || mtu < ipst->ips_ip_pmtu_min) {
1919 1920                  uint32_t length;
1920 1921                  int     i;
1921 1922  
1922 1923                  /*
1923 1924                   * The advertised MTU is unusable. Use the table from
1924 1925                   * RFC 1191 to figure out the next "plateau" based on the
1925 1926                   * length in the original IP packet.
1926 1927                   */
1927 1928                  length = ntohs(ipha->ipha_length);
1928 1929                  DTRACE_PROBE2(ip4__pmtu__guess, dce_t *, dce,
1929 1930                      uint32_t, length);
1930 1931                  if (old_mtu <= length &&
1931 1932                      old_mtu >= length - hdr_length) {
1932 1933                          /*
1933 1934                           * Handle broken BSD 4.2 systems that
1934 1935                           * return the wrong ipha_length in ICMP
1935 1936                           * errors.
1936 1937                           */
1937 1938                          ip1dbg(("Wrong mtu: sent %d, dce %d\n",
1938 1939                              length, old_mtu));
1939 1940                          length -= hdr_length;
1940 1941                  }
1941 1942                  for (i = 0; i < A_CNT(icmp_frag_size_table); i++) {
1942 1943                          if (length > icmp_frag_size_table[i])
1943 1944                                  break;
1944 1945                  }
1945 1946                  if (i == A_CNT(icmp_frag_size_table)) {
1946 1947                          /* No table entry below length: smaller than IP_MIN_MTU! */
1947 1948                          ip1dbg(("Too big for packet size %d\n",
1948 1949                              length));
1949 1950                          disable_pmtud = B_TRUE;
1950 1951                          mtu = ipst->ips_ip_pmtu_min;
1951 1952                  } else {
1952 1953                          mtu = icmp_frag_size_table[i];
1953 1954                          ip1dbg(("Calculated mtu %d, packet size %d, "
1954 1955                              "before %d\n", mtu, length, old_mtu));
1955 1956                          if (mtu < ipst->ips_ip_pmtu_min) {
1956 1957                                  mtu = ipst->ips_ip_pmtu_min;
1957 1958                                  disable_pmtud = B_TRUE;
1958 1959                          }
1959 1960                  }
1960 1961          }
1961 1962          if (disable_pmtud)
1962 1963                  dce->dce_flags |= DCEF_TOO_SMALL_PMTU;
1963 1964          else
1964 1965                  dce->dce_flags &= ~DCEF_TOO_SMALL_PMTU;
1965 1966  
1966 1967          dce->dce_pmtu = MIN(old_mtu, mtu);      /* never above old_mtu */
1967 1968          /* Prepare to send the new max frag size for the ULP. */
1968 1969          icmph->icmph_du_zero = 0;
1969 1970          icmph->icmph_du_mtu =  htons((uint16_t)dce->dce_pmtu);
1970 1971          DTRACE_PROBE4(ip4__pmtu__change, icmph_t *, icmph, dce_t *,
1971 1972              dce, int, orig_mtu, int, mtu);
1972 1973  
1973 1974          /* We now have a PMTU for sure */
1974 1975          dce->dce_flags |= DCEF_PMTU;
1975 1976          dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
1976 1977          mutex_exit(&dce->dce_lock);
1977 1978          /*
1978 1979           * After dropping the lock the new value is visible to everyone.
1979 1980           * Then we bump the generation number so any cached values reinspect
1980 1981           * the dce_t.
1981 1982           */
1982 1983          dce_increment_generation(dce);
1983 1984          dce_refrele(dce);
1984 1985  }
1985 1986  
1986 1987  /*
1987 1988   * If the packet in error is Self-Encapsulated, icmp_inbound_error_fanout_v4
1988 1989   * calls this function.
1989 1990   */
1990 1991  static mblk_t *
1991 1992  icmp_inbound_self_encap_error_v4(mblk_t *mp, ipha_t *ipha, ipha_t *in_ipha)
1992 1993  {
1993 1994          int length;
1994 1995  
1995 1996          ASSERT(mp->b_datap->db_type == M_DATA);
1996 1997  
1997 1998          /* icmp_inbound_v4 has already pulled up the whole error packet */
1998 1999          ASSERT(mp->b_cont == NULL);
1999 2000  
2000 2001          /*
2001 2002           * The length that we want to overlay is the inner header
2002 2003           * and what follows it, i.e. everything from in_ipha onwards.
2003 2004           */
2004 2005          length = msgdsize(mp) - ((uchar_t *)in_ipha - mp->b_rptr);
2005 2006  
2006 2007          /*
2007 2008           * Overlay the inner header and whatever follows it over the
2008 2009           * outer header. NOTE(review): regions may overlap; confirm bcopy.
2009 2010           */
2010 2011          bcopy((uchar_t *)in_ipha, (uchar_t *)ipha, length);
2011 2012  
2012 2013          /* Shrink the message by the size of the outer header we removed */
2013 2014          mp->b_wptr -= (uchar_t *)in_ipha - (uchar_t *)ipha;
2014 2015          return (mp);
2015 2016  }
2016 2017  
2017 2018  /*
2018 2019   * Try to pass the ICMP message upstream in case the ULP cares.
2019 2020   *
2020 2021   * If the packet that caused the ICMP error is secure, we send
2021 2022   * it to AH/ESP to make sure that the attached packet has a
2022 2023   * valid association. ipha in the code below points to the
2023 2024   * IP header of the packet that caused the error.
2024 2025   *
2025 2026   * For IPsec cases, we let the next-layer-up (which has access to
2026 2027   * cached policy on the conn_t, or can query the SPD directly)
2027 2028   * subtract out any IPsec overhead if they must.  We therefore make no
2028 2029   * adjustments here for IPsec overhead.
2029 2030   *
2030 2031   * IFN could have been generated locally or by some router.
2031 2032   *
2032 2033   * LOCAL : ire_send_wire (before calling ipsec_out_process) can call
2033 2034   * icmp_frag_needed/icmp_pkt2big_v6 to generate a local IFN.
2034 2035   *          This happens because IP adjusted its value of MTU on an
2035 2036   *          earlier IFN message and could not tell the upper layer,
2036 2037   *          the new adjusted value of MTU e.g. Packet was encrypted
2037 2038   *          or there was not enough information to fanout to upper
2038 2039   *          layers. Thus on the next outbound datagram, ire_send_wire
2039 2040   *          generates the IFN, where IPsec processing has *not* been
2040 2041   *          done.
2041 2042   *
2042 2043   *          Note that we retain ixa_fragsize across IPsec; thus once
2043 2044   *          we have picked ixa_fragsize and entered ipsec_out_process we do
2044 2045   *          not change the fragsize even if the path MTU changes before
2045 2046   *          we reach ip_output_post_ipsec.
2046 2047   *
2047 2048   *          In the local case, IRAF_LOOPBACK will be set indicating
2048 2049   *          that IFN was generated locally.
2049 2050   *
2050 2051   * ROUTER : IFN could be secure or non-secure.
2051 2052   *
2052 2053   *          * SECURE : We use the IPSEC_IN to fanout to AH/ESP if the
2053 2054   *            packet in error has AH/ESP headers to validate the AH/ESP
2054 2055   *            headers. AH/ESP will verify whether there is a valid SA or
2055 2056   *            not and send it back. We will fanout again if we have more
2056 2057   *            data in the packet.
2057 2058   *
2058 2059   *            If the packet in error does not have AH/ESP, we handle it
2059 2060   *            like any other case.
2060 2061   *
2061 2062   *          * NON_SECURE : If the packet in error has AH/ESP headers, we send it
2062 2063   *            up to AH/ESP for validation. AH/ESP will verify whether there is a
2063 2064   *            valid SA or not and send it back. We will fanout again if
2064 2065   *            we have more data in the packet.
2065 2066   *
2066 2067   *            If the packet in error does not have AH/ESP, we handle it
2067 2068   *            like any other case.
2068 2069   *
2069 2070   * The caller must have called icmp_inbound_verify_v4.
2070 2071   */
2071 2072  static void
2072 2073  icmp_inbound_error_fanout_v4(mblk_t *mp, icmph_t *icmph, ip_recv_attr_t *ira)
2073 2074  {
2074 2075          uint16_t        *up;    /* Pointer to ports in ULP header */
2075 2076          uint32_t        ports;  /* reversed ports for fanout */
2076 2077          ipha_t          ripha;  /* With reversed addresses */
2077 2078          ipha_t          *ipha;  /* Inner IP header */
2078 2079          uint_t          hdr_length; /* Inner IP header length */
2079 2080          tcpha_t         *tcpha;
2080 2081          conn_t          *connp;
2081 2082          ill_t           *ill = ira->ira_ill;
2082 2083          ip_stack_t      *ipst = ill->ill_ipst;
2083 2084          ipsec_stack_t   *ipss = ipst->ips_netstack->netstack_ipsec;
2084 2085          ill_t           *rill = ira->ira_rill;
2085 2086  
2086 2087          /* Caller already pulled up everything. */
2087 2088          ipha = (ipha_t *)&icmph[1];
2088 2089          ASSERT((uchar_t *)&ipha[1] <= mp->b_wptr);
2089 2090          ASSERT(mp->b_cont == NULL);
2090 2091  
2091 2092          hdr_length = IPH_HDR_LENGTH(ipha);
2092 2093          ira->ira_protocol = ipha->ipha_protocol;
2093 2094  
2094 2095          /*
2095 2096           * We need a separate IP header with the source and destination
2096 2097           * addresses reversed to do fanout/classification because the ipha in
2097 2098           * the ICMP error is in the form we sent it out.
2098 2099           */
2099 2100          ripha.ipha_src = ipha->ipha_dst;
2100 2101          ripha.ipha_dst = ipha->ipha_src;
2101 2102          ripha.ipha_protocol = ipha->ipha_protocol;
2102 2103          ripha.ipha_version_and_hdr_length = ipha->ipha_version_and_hdr_length;
2103 2104  
2104 2105          ip2dbg(("icmp_inbound_error_v4: proto %d %x to %x: %d/%d\n",
2105 2106              ripha.ipha_protocol, ntohl(ipha->ipha_src),
2106 2107              ntohl(ipha->ipha_dst),
2107 2108              icmph->icmph_type, icmph->icmph_code));
2108 2109  
2109 2110          switch (ipha->ipha_protocol) {
2110 2111          case IPPROTO_UDP:
2111 2112                  up = (uint16_t *)((uchar_t *)ipha + hdr_length);
2112 2113  
2113 2114                  /* Attempt to find a client stream based on port. */
2114 2115                  ip2dbg(("icmp_inbound_error_v4: UDP ports %d to %d\n",
2115 2116                      ntohs(up[0]), ntohs(up[1])));
2116 2117  
2117 2118                  /* Note that we send error to all matches. */
2118 2119                  ira->ira_flags |= IRAF_ICMP_ERROR;
2119 2120                  ip_fanout_udp_multi_v4(mp, &ripha, up[0], up[1], ira);
2120 2121                  ira->ira_flags &= ~IRAF_ICMP_ERROR;
2121 2122                  return;
2122 2123  
2123 2124          case IPPROTO_TCP:
2124 2125                  /*
2125 2126                   * Find a TCP client stream for this packet.
2126 2127                   * Note that we do a reverse lookup since the header is
2127 2128                   * in the form we sent it out.
2128 2129                   */
2129 2130                  tcpha = (tcpha_t *)((uchar_t *)ipha + hdr_length);
2130 2131                  connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcpha, TCPS_LISTEN,
2131 2132                      ipst);
2132 2133                  if (connp == NULL)
2133 2134                          goto discard_pkt;
2134 2135  
2135 2136                  if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) ||
2136 2137                      (ira->ira_flags & IRAF_IPSEC_SECURE)) {
2137 2138                          mp = ipsec_check_inbound_policy(mp, connp,
2138 2139                              ipha, NULL, ira);
2139 2140                          if (mp == NULL) {
2140 2141                                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2141 2142                                  /* Note that mp is NULL */
2142 2143                                  ip_drop_input("ipIfStatsInDiscards", mp, ill);
2143 2144                                  CONN_DEC_REF(connp);
2144 2145                                  return;
2145 2146                          }
2146 2147                  }
2147 2148  
2148 2149                  ira->ira_flags |= IRAF_ICMP_ERROR;
2149 2150                  ira->ira_ill = ira->ira_rill = NULL;    /* restored below */
2150 2151                  if (IPCL_IS_TCP(connp)) {
2151 2152                          SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
2152 2153                              connp->conn_recvicmp, connp, ira, SQ_FILL,
2153 2154                              SQTAG_TCP_INPUT_ICMP_ERR);
2154 2155                  } else {
2155 2156                          /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */
2156 2157                          (connp->conn_recv)(connp, mp, NULL, ira);
2157 2158                          CONN_DEC_REF(connp);
2158 2159                  }
2159 2160                  ira->ira_ill = ill;
2160 2161                  ira->ira_rill = rill;
2161 2162                  ira->ira_flags &= ~IRAF_ICMP_ERROR;
2162 2163                  return;
2163 2164  
2164 2165          case IPPROTO_SCTP:
2165 2166                  up = (uint16_t *)((uchar_t *)ipha + hdr_length);
2166 2167                  /* Find a SCTP client stream for this packet. */
2167 2168                  ((uint16_t *)&ports)[0] = up[1];
2168 2169                  ((uint16_t *)&ports)[1] = up[0];
2169 2170  
2170 2171                  ira->ira_flags |= IRAF_ICMP_ERROR;
2171 2172                  ip_fanout_sctp(mp, &ripha, NULL, ports, ira);
2172 2173                  ira->ira_flags &= ~IRAF_ICMP_ERROR;
2173 2174                  return;
2174 2175  
2175 2176          case IPPROTO_ESP:
2176 2177          case IPPROTO_AH:
2177 2178                  if (!ipsec_loaded(ipss)) {
2178 2179                          ip_proto_not_sup(mp, ira);
2179 2180                          return;
2180 2181                  }
2181 2182  
2182 2183                  if (ipha->ipha_protocol == IPPROTO_ESP)
2183 2184                          mp = ipsecesp_icmp_error(mp, ira);
2184 2185                  else
2185 2186                          mp = ipsecah_icmp_error(mp, ira);
2186 2187                  if (mp == NULL)
2187 2188                          return;
2188 2189  
2189 2190                  /* Just in case ipsec didn't preserve the NULL b_cont */
2190 2191                  if (mp->b_cont != NULL) {
2191 2192                          if (!pullupmsg(mp, -1))
2192 2193                                  goto discard_pkt;
2193 2194                  }
2194 2195  
2195 2196                  /*
2196 2197                   * Note that ira_pktlen and ira_ip_hdr_length are no longer
2197 2198                   * correct, but we don't use them any more here.
2198 2199                   *
2199 2200                   * If successful, the mp has been modified to not include
2200 2201                   * the ESP/AH header so we can fanout to the ULP's icmp
2201 2202                   * error handler.
2202 2203                   */
2203 2204                  if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH)
2204 2205                          goto truncated;
2205 2206  
2206 2207                  /* Verify the modified message before any further processes. */
2207 2208                  ipha = (ipha_t *)mp->b_rptr;
2208 2209                  hdr_length = IPH_HDR_LENGTH(ipha);
2209 2210                  icmph = (icmph_t *)&mp->b_rptr[hdr_length];
2210 2211                  if (!icmp_inbound_verify_v4(mp, icmph, ira)) {
2211 2212                          freemsg(mp);
2212 2213                          return;
2213 2214                  }
2214 2215  
2215 2216                  icmp_inbound_error_fanout_v4(mp, icmph, ira);
2216 2217                  return;
2217 2218  
2218 2219          case IPPROTO_ENCAP: {
2219 2220                  /* Look for self-encapsulated packets that caused an error */
2220 2221                  ipha_t *in_ipha;
2221 2222  
2222 2223                  /*
2223 2224                   * Caller has verified that length has to be
2224 2225                   * at least the size of IP header.
2225 2226                   */
2226 2227                  ASSERT(hdr_length >= sizeof (ipha_t));
2227 2228                  /*
2228 2229                   * Check the sanity of the inner IP header like
2229 2230                   * we did for the outer header.
2230 2231                   */
2231 2232                  in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length);
2232 2233                  if ((IPH_HDR_VERSION(in_ipha) != IPV4_VERSION)) {
2233 2234                          goto discard_pkt;
2234 2235                  }
2235 2236                  if (IPH_HDR_LENGTH(in_ipha) < sizeof (ipha_t)) {
2236 2237                          goto discard_pkt;
2237 2238                  }
2238 2239                  /* Check for Self-encapsulated tunnels */
2239 2240                  if (in_ipha->ipha_src == ipha->ipha_src &&
2240 2241                      in_ipha->ipha_dst == ipha->ipha_dst) {
2241 2242  
2242 2243                          mp = icmp_inbound_self_encap_error_v4(mp, ipha,
2243 2244                              in_ipha);
2244 2245                          if (mp == NULL)
2245 2246                                  goto discard_pkt;
2246 2247  
2247 2248                          /*
2248 2249                           * Just in case self_encap didn't preserve the NULL
2249 2250                           * b_cont
2250 2251                           */
2251 2252                          if (mp->b_cont != NULL) {
2252 2253                                  if (!pullupmsg(mp, -1))
2253 2254                                          goto discard_pkt;
2254 2255                          }
2255 2256                          /*
2256 2257                           * Note that ira_pktlen and ira_ip_hdr_length are no
2257 2258                           * longer correct, but we don't use them any more here.
2258 2259                           */
2259 2260                          if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH)
2260 2261                                  goto truncated;
2261 2262  
2262 2263                          /*
2263 2264                           * Verify the modified message before any further
2264 2265                           * processes.
2265 2266                           */
2266 2267                          ipha = (ipha_t *)mp->b_rptr;
2267 2268                          hdr_length = IPH_HDR_LENGTH(ipha);
2268 2269                          icmph = (icmph_t *)&mp->b_rptr[hdr_length];
2269 2270                          if (!icmp_inbound_verify_v4(mp, icmph, ira)) {
2270 2271                                  freemsg(mp);
2271 2272                                  return;
2272 2273                          }
2273 2274  
2274 2275                          /*
2275 2276                           * The packet in error is self-encapsulated.
2276 2277                           * And we are finding it further encapsulated
2277 2278                           * which we could not have possibly generated.
2278 2279                           */
2279 2280                          if (ipha->ipha_protocol == IPPROTO_ENCAP) {
2280 2281                                  goto discard_pkt;
2281 2282                          }
2282 2283                          icmp_inbound_error_fanout_v4(mp, icmph, ira);
2283 2284                          return;
2284 2285                  }
2285 2286                  /* Not self-encapsulated */
2286 2287                  /* FALLTHRU */
2287 2288          }
2288 2289          case IPPROTO_IPV6:
2289 2290                  if ((connp = ipcl_iptun_classify_v4(&ripha.ipha_src,
2290 2291                      &ripha.ipha_dst, ipst)) != NULL) {
2291 2292                          ira->ira_flags |= IRAF_ICMP_ERROR;
2292 2293                          connp->conn_recvicmp(connp, mp, NULL, ira);
2293 2294                          CONN_DEC_REF(connp);
2294 2295                          ira->ira_flags &= ~IRAF_ICMP_ERROR;
2295 2296                          return;
2296 2297                  }
2297 2298                  /*
2298 2299                   * No IP tunnel is interested, fallthrough and see
2299 2300                   * if a raw socket will want it.
2300 2301                   */
2301 2302                  /* FALLTHRU */
2302 2303          default:
2303 2304                  ira->ira_flags |= IRAF_ICMP_ERROR;
2304 2305                  ip_fanout_proto_v4(mp, &ripha, ira);
2305 2306                  ira->ira_flags &= ~IRAF_ICMP_ERROR;
2306 2307                  return;
2307 2308          }
2308 2309          /* NOTREACHED */
2309 2310  discard_pkt:
2310 2311          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2311 2312          ip1dbg(("icmp_inbound_error_fanout_v4: drop pkt\n"));
2312 2313          ip_drop_input("ipIfStatsInDiscards", mp, ill);
2313 2314          freemsg(mp);
2314 2315          return;
2315 2316  
2316 2317  truncated:
2317 2318          /* We pulled up everything already. Must be truncated */
2318 2319          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
2319 2320          ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
2320 2321          freemsg(mp);
2321 2322  }
2322 2323  
2323 2324  /*
2324 2325   * Common IP options parser.
2325 2326   *
2326 2327   * Setup routine: fill in *optp with options-parsing state, then
2327 2328   * tail-call ipoptp_next to return the first option.
2328 2329   */
2329 2330  uint8_t
2330 2331  ipoptp_first(ipoptp_t *optp, ipha_t *ipha)
2331 2332  {
2332 2333          uint32_t totallen; /* total length of all options, in bytes */
2333 2334  
2334 2335          totallen = ipha->ipha_version_and_hdr_length -
2335 2336              (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS);
2336 2337          totallen <<= 2;         /* 32-bit words to bytes */
2337 2338          optp->ipoptp_next = (uint8_t *)(&ipha[1]);      /* just past ipha_t */
2338 2339          optp->ipoptp_end = optp->ipoptp_next + totallen;
2339 2340          optp->ipoptp_flags = 0;
2340 2341          return (ipoptp_next(optp));
2341 2342  }
2342 2343  
2343 2344  /* Like ipoptp_first() but takes the options pointer and length directly */
2344 2345  uint8_t
2345 2346  ipoptp_first2(ipoptp_t *optp, uint32_t totallen, uint8_t *opt)
2346 2347  {
2347 2348          optp->ipoptp_next = opt;
2348 2349          optp->ipoptp_end = optp->ipoptp_next + totallen;
2349 2350          optp->ipoptp_flags = 0;
2350 2351          return (ipoptp_next(optp));
2351 2352  }
2352 2353  
2353 2354  /*
2354 2355   * Common IP options parser: extract next option.
2355 2356   */
2356 2357  uint8_t
2357 2358  ipoptp_next(ipoptp_t *optp)
2358 2359  {
2359 2360          uint8_t *end = optp->ipoptp_end;
2360 2361          uint8_t *cur = optp->ipoptp_next;
2361 2362          uint8_t opt, len, pointer;
2362 2363  
2363 2364          /*
2364 2365           * If cur > end already, then the ipoptp_end or ipoptp_next pointer
2365 2366           * has been corrupted.
2366 2367           */
2367 2368          ASSERT(cur <= end);
2368 2369  
2369 2370          if (cur == end)
2370 2371                  return (IPOPT_EOL);
2371 2372  
2372 2373          opt = cur[IPOPT_OPTVAL];
2373 2374  
2374 2375          /*
2375 2376           * Skip any NOP options.
2376 2377           */
2377 2378          while (opt == IPOPT_NOP) {
2378 2379                  cur++;
2379 2380                  if (cur == end)
2380 2381                          return (IPOPT_EOL);
2381 2382                  opt = cur[IPOPT_OPTVAL];
2382 2383          }
2383 2384  
2384 2385          if (opt == IPOPT_EOL)
2385 2386                  return (IPOPT_EOL);
2386 2387  
2387 2388          /*
2388 2389           * Option requiring a length.
2389 2390           */
2390 2391          if ((cur + 1) >= end) {
2391 2392                  optp->ipoptp_flags |= IPOPTP_ERROR;
2392 2393                  return (IPOPT_EOL);
2393 2394          }
2394 2395          len = cur[IPOPT_OLEN];
2395 2396          if (len < 2) {
2396 2397                  optp->ipoptp_flags |= IPOPTP_ERROR;
2397 2398                  return (IPOPT_EOL);
2398 2399          }
2399 2400          optp->ipoptp_cur = cur;
2400 2401          optp->ipoptp_len = len;
2401 2402          optp->ipoptp_next = cur + len;
2402 2403          if (cur + len > end) {
2403 2404                  optp->ipoptp_flags |= IPOPTP_ERROR;
2404 2405                  return (IPOPT_EOL);
2405 2406          }
2406 2407  
2407 2408          /*
2408 2409           * For the options which require a pointer field, make sure
2409 2410           * its there, and make sure it points to either something
2410 2411           * inside this option, or the end of the option.
2411 2412           */
2412 2413          switch (opt) {
2413 2414          case IPOPT_RR:
2414 2415          case IPOPT_TS:
2415 2416          case IPOPT_LSRR:
2416 2417          case IPOPT_SSRR:
2417 2418                  if (len <= IPOPT_OFFSET) {
2418 2419                          optp->ipoptp_flags |= IPOPTP_ERROR;
2419 2420                          return (opt);
2420 2421                  }
2421 2422                  pointer = cur[IPOPT_OFFSET];
2422 2423                  if (pointer - 1 > len) {
2423 2424                          optp->ipoptp_flags |= IPOPTP_ERROR;
2424 2425                          return (opt);
2425 2426                  }
2426 2427                  break;
2427 2428          }
2428 2429  
2429 2430          /*
2430 2431           * Sanity check the pointer field based on the type of the
2431 2432           * option.
2432 2433           */
2433 2434          switch (opt) {
2434 2435          case IPOPT_RR:
2435 2436          case IPOPT_SSRR:
2436 2437          case IPOPT_LSRR:
2437 2438                  if (pointer < IPOPT_MINOFF_SR)
2438 2439                          optp->ipoptp_flags |= IPOPTP_ERROR;
2439 2440                  break;
2440 2441          case IPOPT_TS:
2441 2442                  if (pointer < IPOPT_MINOFF_IT)
2442 2443                          optp->ipoptp_flags |= IPOPTP_ERROR;
2443 2444                  /*
2444 2445                   * Note that the Internet Timestamp option also
2445 2446                   * contains two four bit fields (the Overflow field,
2446 2447                   * and the Flag field), which follow the pointer
2447 2448                   * field.  We don't need to check that these fields
2448 2449                   * fall within the length of the option because this
2449 2450                   * was implicitely done above.  We've checked that the
2450 2451                   * pointer value is at least IPOPT_MINOFF_IT, and that
2451 2452                   * it falls within the option.  Since IPOPT_MINOFF_IT >
2452 2453                   * IPOPT_POS_OV_FLG, we don't need the explicit check.
2453 2454                   */
2454 2455                  ASSERT(len > IPOPT_POS_OV_FLG);
2455 2456                  break;
2456 2457          }
2457 2458  
2458 2459          return (opt);
2459 2460  }
2460 2461  
2461 2462  /*
2462 2463   * Use the outgoing IP header to create an IP_OPTIONS option the way
2463 2464   * it was passed down from the application.
2464 2465   *
2465 2466   * This is compatible with BSD in that it returns
2466 2467   * the reverse source route with the final destination
2467 2468   * as the last entry. The first 4 bytes of the option
2468 2469   * will contain the final destination.
2469 2470   */
2470 2471  int
2471 2472  ip_opt_get_user(conn_t *connp, uchar_t *buf)
2472 2473  {
2473 2474          ipoptp_t        opts;
2474 2475          uchar_t         *opt;
2475 2476          uint8_t         optval;
2476 2477          uint8_t         optlen;
2477 2478          uint32_t        len = 0;
2478 2479          uchar_t         *buf1 = buf;
2479 2480          uint32_t        totallen;
2480 2481          ipaddr_t        dst;
2481 2482          ip_pkt_t        *ipp = &connp->conn_xmit_ipp;
2482 2483  
2483 2484          if (!(ipp->ipp_fields & IPPF_IPV4_OPTIONS))
2484 2485                  return (0);
2485 2486  
2486 2487          totallen = ipp->ipp_ipv4_options_len;
2487 2488          if (totallen & 0x3)
2488 2489                  return (0);
2489 2490  
2490 2491          buf += IP_ADDR_LEN;     /* Leave room for final destination */
2491 2492          len += IP_ADDR_LEN;
2492 2493          bzero(buf1, IP_ADDR_LEN);
2493 2494  
2494 2495          dst = connp->conn_faddr_v4;
2495 2496  
2496 2497          for (optval = ipoptp_first2(&opts, totallen, ipp->ipp_ipv4_options);
2497 2498              optval != IPOPT_EOL;
2498 2499              optval = ipoptp_next(&opts)) {
2499 2500                  int     off;
2500 2501  
2501 2502                  opt = opts.ipoptp_cur;
2502 2503                  if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
2503 2504                          break;
2504 2505                  }
2505 2506                  optlen = opts.ipoptp_len;
2506 2507  
2507 2508                  switch (optval) {
2508 2509                  case IPOPT_SSRR:
2509 2510                  case IPOPT_LSRR:
2510 2511  
2511 2512                          /*
2512 2513                           * Insert destination as the first entry in the source
2513 2514                           * route and move down the entries on step.
2514 2515                           * The last entry gets placed at buf1.
2515 2516                           */
2516 2517                          buf[IPOPT_OPTVAL] = optval;
2517 2518                          buf[IPOPT_OLEN] = optlen;
2518 2519                          buf[IPOPT_OFFSET] = optlen;
2519 2520  
2520 2521                          off = optlen - IP_ADDR_LEN;
2521 2522                          if (off < 0) {
2522 2523                                  /* No entries in source route */
2523 2524                                  break;
2524 2525                          }
2525 2526                          /* Last entry in source route if not already set */
2526 2527                          if (dst == INADDR_ANY)
2527 2528                                  bcopy(opt + off, buf1, IP_ADDR_LEN);
2528 2529                          off -= IP_ADDR_LEN;
2529 2530  
2530 2531                          while (off > 0) {
2531 2532                                  bcopy(opt + off,
2532 2533                                      buf + off + IP_ADDR_LEN,
2533 2534                                      IP_ADDR_LEN);
2534 2535                                  off -= IP_ADDR_LEN;
2535 2536                          }
2536 2537                          /* ipha_dst into first slot */
2537 2538                          bcopy(&dst, buf + off + IP_ADDR_LEN,
2538 2539                              IP_ADDR_LEN);
2539 2540                          buf += optlen;
2540 2541                          len += optlen;
2541 2542                          break;
2542 2543  
2543 2544                  default:
2544 2545                          bcopy(opt, buf, optlen);
2545 2546                          buf += optlen;
2546 2547                          len += optlen;
2547 2548                          break;
2548 2549                  }
2549 2550          }
2550 2551  done:
2551 2552          /* Pad the resulting options */
2552 2553          while (len & 0x3) {
2553 2554                  *buf++ = IPOPT_EOL;
2554 2555                  len++;
2555 2556          }
2556 2557          return (len);
2557 2558  }
2558 2559  
2559 2560  /*
2560 2561   * Update any record route or timestamp options to include this host.
2561 2562   * Reverse any source route option.
2562 2563   * This routine assumes that the options are well formed i.e. that they
2563 2564   * have already been checked.
2564 2565   */
2565 2566  static void
2566 2567  icmp_options_update(ipha_t *ipha)
2567 2568  {
2568 2569          ipoptp_t        opts;
2569 2570          uchar_t         *opt;
2570 2571          uint8_t         optval;
2571 2572          ipaddr_t        src;            /* Our local address */
2572 2573          ipaddr_t        dst;
2573 2574  
2574 2575          ip2dbg(("icmp_options_update\n"));
2575 2576          src = ipha->ipha_src;
2576 2577          dst = ipha->ipha_dst;
2577 2578  
2578 2579          for (optval = ipoptp_first(&opts, ipha);
2579 2580              optval != IPOPT_EOL;
2580 2581              optval = ipoptp_next(&opts)) {
2581 2582                  ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
2582 2583                  opt = opts.ipoptp_cur;
2583 2584                  ip2dbg(("icmp_options_update: opt %d, len %d\n",
2584 2585                      optval, opts.ipoptp_len));
2585 2586                  switch (optval) {
2586 2587                          int off1, off2;
2587 2588                  case IPOPT_SSRR:
2588 2589                  case IPOPT_LSRR:
2589 2590                          /*
2590 2591                           * Reverse the source route.  The first entry
2591 2592                           * should be the next to last one in the current
2592 2593                           * source route (the last entry is our address).
2593 2594                           * The last entry should be the final destination.
2594 2595                           */
2595 2596                          off1 = IPOPT_MINOFF_SR - 1;
2596 2597                          off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1;
2597 2598                          if (off2 < 0) {
2598 2599                                  /* No entries in source route */
2599 2600                                  ip1dbg((
2600 2601                                      "icmp_options_update: bad src route\n"));
2601 2602                                  break;
2602 2603                          }
2603 2604                          bcopy((char *)opt + off2, &dst, IP_ADDR_LEN);
2604 2605                          bcopy(&ipha->ipha_dst, (char *)opt + off2, IP_ADDR_LEN);
2605 2606                          bcopy(&dst, &ipha->ipha_dst, IP_ADDR_LEN);
2606 2607                          off2 -= IP_ADDR_LEN;
2607 2608  
2608 2609                          while (off1 < off2) {
2609 2610                                  bcopy((char *)opt + off1, &src, IP_ADDR_LEN);
2610 2611                                  bcopy((char *)opt + off2, (char *)opt + off1,
2611 2612                                      IP_ADDR_LEN);
2612 2613                                  bcopy(&src, (char *)opt + off2, IP_ADDR_LEN);
2613 2614                                  off1 += IP_ADDR_LEN;
2614 2615                                  off2 -= IP_ADDR_LEN;
2615 2616                          }
2616 2617                          opt[IPOPT_OFFSET] = IPOPT_MINOFF_SR;
2617 2618                          break;
2618 2619                  }
2619 2620          }
2620 2621  }
2621 2622  
2622 2623  /*
2623 2624   * Process received ICMP Redirect messages.
2624 2625   * Assumes the caller has verified that the headers are in the pulled up mblk.
2625 2626   * Consumes mp.
2626 2627   */
2627 2628  static void
2628 2629  icmp_redirect_v4(mblk_t *mp, ipha_t *ipha, icmph_t *icmph, ip_recv_attr_t *ira)
2629 2630  {
2630 2631          ire_t           *ire, *nire;
2631 2632          ire_t           *prev_ire;
2632 2633          ipaddr_t        src, dst, gateway;
2633 2634          ip_stack_t      *ipst = ira->ira_ill->ill_ipst;
2634 2635          ipha_t          *inner_ipha;    /* Inner IP header */
2635 2636  
2636 2637          /* Caller already pulled up everything. */
2637 2638          inner_ipha = (ipha_t *)&icmph[1];
2638 2639          src = ipha->ipha_src;
2639 2640          dst = inner_ipha->ipha_dst;
2640 2641          gateway = icmph->icmph_rd_gateway;
2641 2642          /* Make sure the new gateway is reachable somehow. */
2642 2643          ire = ire_ftable_lookup_v4(gateway, 0, 0, IRE_ONLINK, NULL,
2643 2644              ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, NULL);
2644 2645          /*
2645 2646           * Make sure we had a route for the dest in question and that
2646 2647           * that route was pointing to the old gateway (the source of the
2647 2648           * redirect packet.)
2648 2649           * We do longest match and then compare ire_gateway_addr below.
2649 2650           */
2650 2651          prev_ire = ire_ftable_lookup_v4(dst, 0, 0, 0, NULL, ALL_ZONES,
2651 2652              NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
2652 2653          /*
2653 2654           * Check that
2654 2655           *      the redirect was not from ourselves
2655 2656           *      the new gateway and the old gateway are directly reachable
2656 2657           */
2657 2658          if (prev_ire == NULL || ire == NULL ||
2658 2659              (prev_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) ||
2659 2660              (prev_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
2660 2661              !(ire->ire_type & IRE_IF_ALL) ||
2661 2662              prev_ire->ire_gateway_addr != src) {
2662 2663                  BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects);
2663 2664                  ip_drop_input("icmpInBadRedirects - ire", mp, ira->ira_ill);
2664 2665                  freemsg(mp);
2665 2666                  if (ire != NULL)
2666 2667                          ire_refrele(ire);
2667 2668                  if (prev_ire != NULL)
2668 2669                          ire_refrele(prev_ire);
2669 2670                  return;
2670 2671          }
2671 2672  
2672 2673          ire_refrele(prev_ire);
2673 2674          ire_refrele(ire);
2674 2675  
2675 2676          /*
2676 2677           * TODO: more precise handling for cases 0, 2, 3, the latter two
2677 2678           * require TOS routing
2678 2679           */
2679 2680          switch (icmph->icmph_code) {
2680 2681          case 0:
2681 2682          case 1:
2682 2683                  /* TODO: TOS specificity for cases 2 and 3 */
2683 2684          case 2:
2684 2685          case 3:
2685 2686                  break;
2686 2687          default:
2687 2688                  BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects);
2688 2689                  ip_drop_input("icmpInBadRedirects - code", mp, ira->ira_ill);
2689 2690                  freemsg(mp);
2690 2691                  return;
2691 2692          }
2692 2693          /*
2693 2694           * Create a Route Association.  This will allow us to remember that
2694 2695           * someone we believe told us to use the particular gateway.
2695 2696           */
2696 2697          ire = ire_create(
2697 2698              (uchar_t *)&dst,                    /* dest addr */
2698 2699              (uchar_t *)&ip_g_all_ones,          /* mask */
2699 2700              (uchar_t *)&gateway,                /* gateway addr */
2700 2701              IRE_HOST,
2701 2702              NULL,                               /* ill */
2702 2703              ALL_ZONES,
2703 2704              (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST),
2704 2705              NULL,                               /* tsol_gc_t */
2705 2706              ipst);
2706 2707  
2707 2708          if (ire == NULL) {
2708 2709                  freemsg(mp);
2709 2710                  return;
2710 2711          }
2711 2712          nire = ire_add(ire);
2712 2713          /* Check if it was a duplicate entry */
2713 2714          if (nire != NULL && nire != ire) {
2714 2715                  ASSERT(nire->ire_identical_ref > 1);
2715 2716                  ire_delete(nire);
2716 2717                  ire_refrele(nire);
2717 2718                  nire = NULL;
2718 2719          }
2719 2720          ire = nire;
2720 2721          if (ire != NULL) {
2721 2722                  ire_refrele(ire);               /* Held in ire_add */
2722 2723  
2723 2724                  /* tell routing sockets that we received a redirect */
2724 2725                  ip_rts_change(RTM_REDIRECT, dst, gateway, IP_HOST_MASK, 0, src,
2725 2726                      (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), 0,
2726 2727                      (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_AUTHOR), ipst);
2727 2728          }
2728 2729  
2729 2730          /*
2730 2731           * Delete any existing IRE_HOST type redirect ires for this destination.
2731 2732           * This together with the added IRE has the effect of
2732 2733           * modifying an existing redirect.
2733 2734           */
2734 2735          prev_ire = ire_ftable_lookup_v4(dst, 0, src, IRE_HOST, NULL,
2735 2736              ALL_ZONES, NULL, (MATCH_IRE_GW | MATCH_IRE_TYPE), 0, ipst, NULL);
2736 2737          if (prev_ire != NULL) {
2737 2738                  if (prev_ire ->ire_flags & RTF_DYNAMIC)
2738 2739                          ire_delete(prev_ire);
2739 2740                  ire_refrele(prev_ire);
2740 2741          }
2741 2742  
2742 2743          freemsg(mp);
2743 2744  }
2744 2745  
2745 2746  /*
2746 2747   * Generate an ICMP parameter problem message.
2747 2748   * When called from ip_output side a minimal ip_recv_attr_t needs to be
2748 2749   * constructed by the caller.
2749 2750   */
2750 2751  static void
2751 2752  icmp_param_problem(mblk_t *mp, uint8_t ptr, ip_recv_attr_t *ira)
2752 2753  {
2753 2754          icmph_t icmph;
2754 2755          ip_stack_t      *ipst = ira->ira_ill->ill_ipst;
2755 2756  
2756 2757          mp = icmp_pkt_err_ok(mp, ira);
2757 2758          if (mp == NULL)
2758 2759                  return;
2759 2760  
2760 2761          bzero(&icmph, sizeof (icmph_t));
2761 2762          icmph.icmph_type = ICMP_PARAM_PROBLEM;
2762 2763          icmph.icmph_pp_ptr = ptr;
2763 2764          BUMP_MIB(&ipst->ips_icmp_mib, icmpOutParmProbs);
2764 2765          icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
2765 2766  }
2766 2767  
2767 2768  /*
2768 2769   * Build and ship an IPv4 ICMP message using the packet data in mp, and
2769 2770   * the ICMP header pointed to by "stuff".  (May be called as writer.)
2770 2771   * Note: assumes that icmp_pkt_err_ok has been called to verify that
2771 2772   * an icmp error packet can be sent.
2772 2773   * Assigns an appropriate source address to the packet. If ipha_dst is
2773 2774   * one of our addresses use it for source. Otherwise let ip_output_simple
2774 2775   * pick the source address.
2775 2776   */
2776 2777  static void
2777 2778  icmp_pkt(mblk_t *mp, void *stuff, size_t len, ip_recv_attr_t *ira)
2778 2779  {
2779 2780          ipaddr_t dst;
2780 2781          icmph_t *icmph;
2781 2782          ipha_t  *ipha;
2782 2783          uint_t  len_needed;
2783 2784          size_t  msg_len;
2784 2785          mblk_t  *mp1;
2785 2786          ipaddr_t src;
2786 2787          ire_t   *ire;
2787 2788          ip_xmit_attr_t ixas;
2788 2789          ip_stack_t *ipst = ira->ira_ill->ill_ipst;
2789 2790  
2790 2791          ipha = (ipha_t *)mp->b_rptr;
2791 2792  
2792 2793          bzero(&ixas, sizeof (ixas));
2793 2794          ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
2794 2795          ixas.ixa_zoneid = ira->ira_zoneid;
2795 2796          ixas.ixa_ifindex = 0;
2796 2797          ixas.ixa_ipst = ipst;
2797 2798          ixas.ixa_cred = kcred;
2798 2799          ixas.ixa_cpid = NOPID;
2799 2800          ixas.ixa_tsl = ira->ira_tsl;    /* Behave as a multi-level responder */
2800 2801          ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
2801 2802  
2802 2803          if (ira->ira_flags & IRAF_IPSEC_SECURE) {
2803 2804                  /*
2804 2805                   * Apply IPsec based on how IPsec was applied to
2805 2806                   * the packet that had the error.
2806 2807                   *
2807 2808                   * If it was an outbound packet that caused the ICMP
2808 2809                   * error, then the caller will have setup the IRA
2809 2810                   * appropriately.
2810 2811                   */
2811 2812                  if (!ipsec_in_to_out(ira, &ixas, mp, ipha, NULL)) {
2812 2813                          BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
2813 2814                          /* Note: mp already consumed and ip_drop_packet done */
2814 2815                          return;
2815 2816                  }
2816 2817          } else {
2817 2818                  /*
2818 2819                   * This is in clear. The icmp message we are building
2819 2820                   * here should go out in clear, independent of our policy.
2820 2821                   */
2821 2822                  ixas.ixa_flags |= IXAF_NO_IPSEC;
2822 2823          }
2823 2824  
2824 2825          /* Remember our eventual destination */
2825 2826          dst = ipha->ipha_src;
2826 2827  
2827 2828          /*
2828 2829           * If the packet was for one of our unicast addresses, make
2829 2830           * sure we respond with that as the source. Otherwise
2830 2831           * have ip_output_simple pick the source address.
2831 2832           */
2832 2833          ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, 0,
2833 2834              (IRE_LOCAL|IRE_LOOPBACK), NULL, ira->ira_zoneid, NULL,
2834 2835              MATCH_IRE_TYPE|MATCH_IRE_ZONEONLY, 0, ipst, NULL);
2835 2836          if (ire != NULL) {
2836 2837                  ire_refrele(ire);
2837 2838                  src = ipha->ipha_dst;
2838 2839          } else {
2839 2840                  src = INADDR_ANY;
2840 2841                  ixas.ixa_flags |= IXAF_SET_SOURCE;
2841 2842          }
2842 2843  
2843 2844          /*
2844 2845           * Check if we can send back more then 8 bytes in addition to
2845 2846           * the IP header.  We try to send 64 bytes of data and the internal
2846 2847           * header in the special cases of ipv4 encapsulated ipv4 or ipv6.
2847 2848           */
2848 2849          len_needed = IPH_HDR_LENGTH(ipha);
2849 2850          if (ipha->ipha_protocol == IPPROTO_ENCAP ||
2850 2851              ipha->ipha_protocol == IPPROTO_IPV6) {
2851 2852                  if (!pullupmsg(mp, -1)) {
2852 2853                          BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
2853 2854                          ip_drop_output("ipIfStatsOutDiscards", mp, NULL);
2854 2855                          freemsg(mp);
2855 2856                          return;
2856 2857                  }
2857 2858                  ipha = (ipha_t *)mp->b_rptr;
2858 2859  
2859 2860                  if (ipha->ipha_protocol == IPPROTO_ENCAP) {
2860 2861                          len_needed += IPH_HDR_LENGTH(((uchar_t *)ipha +
2861 2862                              len_needed));
2862 2863                  } else {
2863 2864                          ip6_t *ip6h = (ip6_t *)((uchar_t *)ipha + len_needed);
2864 2865  
2865 2866                          ASSERT(ipha->ipha_protocol == IPPROTO_IPV6);
2866 2867                          len_needed += ip_hdr_length_v6(mp, ip6h);
2867 2868                  }
2868 2869          }
2869 2870          len_needed += ipst->ips_ip_icmp_return;
2870 2871          msg_len = msgdsize(mp);
2871 2872          if (msg_len > len_needed) {
2872 2873                  (void) adjmsg(mp, len_needed - msg_len);
2873 2874                  msg_len = len_needed;
2874 2875          }
2875 2876          mp1 = allocb(sizeof (icmp_ipha) + len, BPRI_MED);
2876 2877          if (mp1 == NULL) {
2877 2878                  BUMP_MIB(&ipst->ips_icmp_mib, icmpOutErrors);
2878 2879                  freemsg(mp);
2879 2880                  return;
2880 2881          }
2881 2882          mp1->b_cont = mp;
2882 2883          mp = mp1;
2883 2884  
2884 2885          /*
2885 2886           * Set IXAF_TRUSTED_ICMP so we can let the ICMP messages this
2886 2887           * node generates be accepted in peace by all on-host destinations.
2887 2888           * If we do NOT assume that all on-host destinations trust
2888 2889           * self-generated ICMP messages, then rework here, ip6.c, and spd.c.
2889 2890           * (Look for IXAF_TRUSTED_ICMP).
2890 2891           */
2891 2892          ixas.ixa_flags |= IXAF_TRUSTED_ICMP;
2892 2893  
2893 2894          ipha = (ipha_t *)mp->b_rptr;
2894 2895          mp1->b_wptr = (uchar_t *)ipha + (sizeof (icmp_ipha) + len);
2895 2896          *ipha = icmp_ipha;
2896 2897          ipha->ipha_src = src;
2897 2898          ipha->ipha_dst = dst;
2898 2899          ipha->ipha_ttl = ipst->ips_ip_def_ttl;
2899 2900          msg_len += sizeof (icmp_ipha) + len;
2900 2901          if (msg_len > IP_MAXPACKET) {
2901 2902                  (void) adjmsg(mp, IP_MAXPACKET - msg_len);
2902 2903                  msg_len = IP_MAXPACKET;
2903 2904          }
2904 2905          ipha->ipha_length = htons((uint16_t)msg_len);
2905 2906          icmph = (icmph_t *)&ipha[1];
2906 2907          bcopy(stuff, icmph, len);
2907 2908          icmph->icmph_checksum = 0;
2908 2909          icmph->icmph_checksum = IP_CSUM(mp, (int32_t)sizeof (ipha_t), 0);
2909 2910          BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs);
2910 2911  
2911 2912          (void) ip_output_simple(mp, &ixas);
2912 2913          ixa_cleanup(&ixas);
2913 2914  }
2914 2915  
2915 2916  /*
2916 2917   * Determine if an ICMP error packet can be sent given the rate limit.
2917 2918   * The limit consists of an average frequency (icmp_pkt_err_interval measured
2918 2919   * in milliseconds) and a burst size. Burst size number of packets can
2919 2920   * be sent arbitrarely closely spaced.
2920 2921   * The state is tracked using two variables to implement an approximate
2921 2922   * token bucket filter:
2922 2923   *      icmp_pkt_err_last - lbolt value when the last burst started
2923 2924   *      icmp_pkt_err_sent - number of packets sent in current burst
2924 2925   */
2925 2926  boolean_t
2926 2927  icmp_err_rate_limit(ip_stack_t *ipst)
2927 2928  {
2928 2929          clock_t now = TICK_TO_MSEC(ddi_get_lbolt());
2929 2930          uint_t refilled; /* Number of packets refilled in tbf since last */
2930 2931          /* Guard against changes by loading into local variable */
2931 2932          uint_t err_interval = ipst->ips_ip_icmp_err_interval;
2932 2933  
2933 2934          if (err_interval == 0)
2934 2935                  return (B_FALSE);
2935 2936  
2936 2937          if (ipst->ips_icmp_pkt_err_last > now) {
2937 2938                  /* 100HZ lbolt in ms for 32bit arch wraps every 49.7 days */
2938 2939                  ipst->ips_icmp_pkt_err_last = 0;
2939 2940                  ipst->ips_icmp_pkt_err_sent = 0;
2940 2941          }
2941 2942          /*
2942 2943           * If we are in a burst update the token bucket filter.
2943 2944           * Update the "last" time to be close to "now" but make sure
2944 2945           * we don't loose precision.
2945 2946           */
2946 2947          if (ipst->ips_icmp_pkt_err_sent != 0) {
2947 2948                  refilled = (now - ipst->ips_icmp_pkt_err_last)/err_interval;
2948 2949                  if (refilled > ipst->ips_icmp_pkt_err_sent) {
2949 2950                          ipst->ips_icmp_pkt_err_sent = 0;
2950 2951                  } else {
2951 2952                          ipst->ips_icmp_pkt_err_sent -= refilled;
2952 2953                          ipst->ips_icmp_pkt_err_last += refilled * err_interval;
2953 2954                  }
2954 2955          }
2955 2956          if (ipst->ips_icmp_pkt_err_sent == 0) {
2956 2957                  /* Start of new burst */
2957 2958                  ipst->ips_icmp_pkt_err_last = now;
2958 2959          }
2959 2960          if (ipst->ips_icmp_pkt_err_sent < ipst->ips_ip_icmp_err_burst) {
2960 2961                  ipst->ips_icmp_pkt_err_sent++;
2961 2962                  ip1dbg(("icmp_err_rate_limit: %d sent in burst\n",
2962 2963                      ipst->ips_icmp_pkt_err_sent));
2963 2964                  return (B_FALSE);
2964 2965          }
2965 2966          ip1dbg(("icmp_err_rate_limit: dropped\n"));
2966 2967          return (B_TRUE);
2967 2968  }
2968 2969  
2969 2970  /*
2970 2971   * Check if it is ok to send an IPv4 ICMP error packet in
2971 2972   * response to the IPv4 packet in mp.
2972 2973   * Free the message and return null if no
2973 2974   * ICMP error packet should be sent.
2974 2975   */
2975 2976  static mblk_t *
2976 2977  icmp_pkt_err_ok(mblk_t *mp, ip_recv_attr_t *ira)
2977 2978  {
2978 2979          ip_stack_t      *ipst = ira->ira_ill->ill_ipst;
2979 2980          icmph_t *icmph;
2980 2981          ipha_t  *ipha;
2981 2982          uint_t  len_needed;
2982 2983  
2983 2984          if (!mp)
2984 2985                  return (NULL);
2985 2986          ipha = (ipha_t *)mp->b_rptr;
2986 2987          if (ip_csum_hdr(ipha)) {
2987 2988                  BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInCksumErrs);
2988 2989                  ip_drop_input("ipIfStatsInCksumErrs", mp, NULL);
2989 2990                  freemsg(mp);
2990 2991                  return (NULL);
2991 2992          }
2992 2993          if (ip_type_v4(ipha->ipha_dst, ipst) == IRE_BROADCAST ||
2993 2994              ip_type_v4(ipha->ipha_src, ipst) == IRE_BROADCAST ||
2994 2995              CLASSD(ipha->ipha_dst) ||
2995 2996              CLASSD(ipha->ipha_src) ||
2996 2997              (ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_OFFSET)) {
2997 2998                  /* Note: only errors to the fragment with offset 0 */
2998 2999                  BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
2999 3000                  freemsg(mp);
3000 3001                  return (NULL);
3001 3002          }
3002 3003          if (ipha->ipha_protocol == IPPROTO_ICMP) {
3003 3004                  /*
3004 3005                   * Check the ICMP type.  RFC 1122 sez:  don't send ICMP
3005 3006                   * errors in response to any ICMP errors.
3006 3007                   */
3007 3008                  len_needed = IPH_HDR_LENGTH(ipha) + ICMPH_SIZE;
3008 3009                  if (mp->b_wptr - mp->b_rptr < len_needed) {
3009 3010                          if (!pullupmsg(mp, len_needed)) {
3010 3011                                  BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors);
3011 3012                                  freemsg(mp);
3012 3013                                  return (NULL);
3013 3014                          }
3014 3015                          ipha = (ipha_t *)mp->b_rptr;
3015 3016                  }
3016 3017                  icmph = (icmph_t *)
3017 3018                      (&((char *)ipha)[IPH_HDR_LENGTH(ipha)]);
3018 3019                  switch (icmph->icmph_type) {
3019 3020                  case ICMP_DEST_UNREACHABLE:
3020 3021                  case ICMP_SOURCE_QUENCH:
3021 3022                  case ICMP_TIME_EXCEEDED:
3022 3023                  case ICMP_PARAM_PROBLEM:
3023 3024                  case ICMP_REDIRECT:
3024 3025                          BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
3025 3026                          freemsg(mp);
3026 3027                          return (NULL);
3027 3028                  default:
3028 3029                          break;
3029 3030                  }
3030 3031          }
3031 3032          /*
3032 3033           * If this is a labeled system, then check to see if we're allowed to
3033 3034           * send a response to this particular sender.  If not, then just drop.
3034 3035           */
3035 3036          if (is_system_labeled() && !tsol_can_reply_error(mp, ira)) {
3036 3037                  ip2dbg(("icmp_pkt_err_ok: can't respond to packet\n"));
3037 3038                  BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops);
3038 3039                  freemsg(mp);
3039 3040                  return (NULL);
3040 3041          }
3041 3042          if (icmp_err_rate_limit(ipst)) {
3042 3043                  /*
3043 3044                   * Only send ICMP error packets every so often.
3044 3045                   * This should be done on a per port/source basis,
3045 3046                   * but for now this will suffice.
3046 3047                   */
3047 3048                  freemsg(mp);
3048 3049                  return (NULL);
3049 3050          }
3050 3051          return (mp);
3051 3052  }
3052 3053  
3053 3054  /*
3054 3055   * Called when a packet was sent out the same link that it arrived on.
3055 3056   * Check if it is ok to send a redirect and then send it.
            *
            * mp is only copied (copymsg) for the redirect; it is not freed or
            * otherwise consumed here.  ipha supplies the source address that is
            * compared against the nexthop's subnet, ire is the route the packet
            * matched, and ira provides the receiving ill/stack and is handed on
            * to icmp_send_redirect().
            *
            * No redirect is sent when the matched route is on-link, the packet
            * was source routed, no nexthop IRE can be found, the source is not in
            * the nexthop's subnet, or the copy fails.
3056 3057   */
3057 3058  void
3058 3059  ip_send_potential_redirect_v4(mblk_t *mp, ipha_t *ipha, ire_t *ire,
3059 3060      ip_recv_attr_t *ira)
3060 3061  {
3061 3062          ip_stack_t      *ipst = ira->ira_ill->ill_ipst;
3062 3063          ipaddr_t        src, nhop;
3063 3064          mblk_t          *mp1;
3064 3065          ire_t           *nhop_ire;
3065 3066  
3066 3067          /*
3067 3068           * Check the source address to see if it originated
3068 3069           * on the same logical subnet it is going back out on.
3069 3070           * If so, we should be able to send it a redirect.
3070 3071           * Avoid sending a redirect if the destination
3071 3072           * is directly connected (i.e., we matched an IRE_ONLINK),
3072 3073           * or if the packet was source routed out this interface.
3073 3074           *
3074 3075           * We avoid sending a redirect if the
3075 3076           * destination is directly connected
3076 3077           * because it is possible that multiple
3077 3078           * IP subnets may have been configured on
3078 3079           * the link, and the source may not
3079 3080           * be on the same subnet as ip destination,
3080 3081           * even though they are on the same
3081 3082           * physical link.
3082 3083           */
3083 3084          if ((ire->ire_type & IRE_ONLINK) ||
3084 3085              ip_source_routed(ipha, ipst))
3085 3086                  return;
3086 3087  
                   /*
                    * NOTE(review): the result is released with ire_refrele() below,
                    * so ire_nexthop() presumably returns a held IRE - confirm.
                    */
3087 3088          nhop_ire = ire_nexthop(ire);
3088 3089          if (nhop_ire == NULL)
3089 3090                  return;
3090 3091  
                   /* The gateway advertised in the redirect is the nexthop's address */
3091 3092          nhop = nhop_ire->ire_addr;
3092 3093  
3093 3094          if (nhop_ire->ire_type & IRE_IF_CLONE) {
3094 3095                  ire_t   *ire2;
3095 3096  
3096 3097                  /* Follow ire_dep_parent to find non-clone IRE_INTERFACE */
3097 3098                  mutex_enter(&nhop_ire->ire_lock);
3098 3099                  ire2 = nhop_ire->ire_dep_parent;
3099 3100                  if (ire2 != NULL)
3100 3101                          ire_refhold(ire2);
3101 3102                  mutex_exit(&nhop_ire->ire_lock);
                           /* Trade our hold on the clone for the one taken on its parent */
3102 3103                  ire_refrele(nhop_ire);
3103 3104                  nhop_ire = ire2;
3104 3105          }
                   /* Only possible here if the clone had no ire_dep_parent */
3105 3106          if (nhop_ire == NULL)
3106 3107                  return;
3107 3108  
3108 3109          ASSERT(!(nhop_ire->ire_type & IRE_IF_CLONE));
3109 3110  
3110 3111          src = ipha->ipha_src;
3111 3112  
3112 3113          /*
3113 3114           * We look at the interface ire for the nexthop,
3114 3115           * to see if ipha_src is in the same subnet
3115 3116           * as the nexthop.
3116 3117           */
3117 3118          if ((src & nhop_ire->ire_mask) == (nhop & nhop_ire->ire_mask)) {
3118 3119                  /*
3119 3120                   * The source is directly connected.
                            * The redirect is built from a copy; the caller keeps mp.
3120 3121                   */
3121 3122                  mp1 = copymsg(mp);
3122 3123                  if (mp1 != NULL) {
3123 3124                          icmp_send_redirect(mp1, nhop, ira);
3124 3125                  }
3125 3126          }
                   /* Drop the hold on the nexthop IRE (or on its parent, if substituted) */
3126 3127          ire_refrele(nhop_ire);
3127 3128  }
3128 3129  
3129 3130  /*
3130 3131   * Generate an ICMP redirect message.
            *
            * mp is the packet that triggered the redirect and gateway is the
            * address placed in the icmph_rd_gateway field.  The packet is first
            * vetted by icmp_pkt_err_ok(); when that returns NULL nothing is sent
            * and mp must not be used again (the NULL-return paths of
            * icmp_pkt_err_ok() free it).  Otherwise mp is handed to icmp_pkt()
            * together with the prepared ICMP header.
3131 3132   */
3132 3133  static void
3133 3134  icmp_send_redirect(mblk_t *mp, ipaddr_t gateway, ip_recv_attr_t *ira)
3134 3135  {
3135 3136          icmph_t icmph;
3136 3137          ip_stack_t *ipst = ira->ira_ill->ill_ipst;
3137 3138  
3138 3139          mp = icmp_pkt_err_ok(mp, ira);
3139 3140          if (mp == NULL)
3140 3141                  return;
3141 3142  
3142 3143          bzero(&icmph, sizeof (icmph_t));
3143 3144          icmph.icmph_type = ICMP_REDIRECT;
                   /* Code 1 is "redirect datagrams for the host" in RFC 792 */
3144 3145          icmph.icmph_code = 1;
3145 3146          icmph.icmph_rd_gateway = gateway;
3146 3147          BUMP_MIB(&ipst->ips_icmp_mib, icmpOutRedirects);
3147 3148          icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
3148 3149  }
3149 3150  
3150 3151  /*
3151 3152   * Generate an ICMP time exceeded message.
            *
            * mp is the offending packet and code is stored verbatim in the ICMP
            * code field.  The packet is first vetted by icmp_pkt_err_ok(); when
            * that returns NULL nothing is sent and mp must not be used again.
            * Otherwise icmpOutTimeExcds is bumped and mp is handed to icmp_pkt()
            * together with the prepared ICMP header.
3152 3153   */
3153 3154  void
3154 3155  icmp_time_exceeded(mblk_t *mp, uint8_t code, ip_recv_attr_t *ira)
3155 3156  {
3156 3157          icmph_t icmph;
3157 3158          ip_stack_t *ipst = ira->ira_ill->ill_ipst;
3158 3159  
3159 3160          mp = icmp_pkt_err_ok(mp, ira);
3160 3161          if (mp == NULL)
3161 3162                  return;
3162 3163  
                   /* Zero the header so every field not set below is 0 */
3163 3164          bzero(&icmph, sizeof (icmph_t));
3164 3165          icmph.icmph_type = ICMP_TIME_EXCEEDED;
3165 3166          icmph.icmph_code = code;
3166 3167          BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimeExcds);
3167 3168          icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
3168 3169  }
3169 3170  
3170 3171  /*
3171 3172   * Generate an ICMP unreachable message.
3172 3173   * When called from ip_output side a minimal ip_recv_attr_t needs to be
3173 3174   * constructed by the caller.
            *
            * mp is the offending packet and code is stored verbatim in the ICMP
            * code field.  The packet is first vetted by icmp_pkt_err_ok(); when
            * that returns NULL nothing is sent and mp must not be used again.
            * Otherwise icmpOutDestUnreachs is bumped and mp is handed to
            * icmp_pkt() together with the prepared ICMP header.
3174 3175   */
3175 3176  void
3176 3177  icmp_unreachable(mblk_t *mp, uint8_t code, ip_recv_attr_t *ira)
3177 3178  {
3178 3179          icmph_t icmph;
                   /* ira->ira_ill is dereferenced, so even a minimal ira must set it */
3179 3180          ip_stack_t *ipst = ira->ira_ill->ill_ipst;
3180 3181  
3181 3182          mp = icmp_pkt_err_ok(mp, ira);
3182 3183          if (mp == NULL)
3183 3184                  return;
3184 3185  
3185 3186          bzero(&icmph, sizeof (icmph_t));
3186 3187          icmph.icmph_type = ICMP_DEST_UNREACHABLE;
3187 3188          icmph.icmph_code = code;
3188 3189          BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs);
3189 3190          icmp_pkt(mp, &icmph, sizeof (icmph_t), ira);
3190 3191  }
3191 3192  
3192 3193  /*
3193 3194   * Latch in the IPsec state for a stream based on the policy in the listener
3194 3195   * and the actions in the ip_recv_attr_t.
3195 3196   * Called directly from TCP and SCTP.
            *
            * connp must not have a policy yet and lconnp (the listener) must have
            * one; both are ASSERTed.  connp takes its own hold on the listener's
            * policy.  If the received packet carried an IPsec action, a latch is
            * created for connp when it has none and the inbound state is latched.
            *
            * Returns B_TRUE on success and B_FALSE only when iplatch_create()
            * fails.  On that failure connp->conn_policy has already been set
            * (with its hold) and is not undone here.
3196 3197   */
3197 3198  boolean_t
3198 3199  ip_ipsec_policy_inherit(conn_t *connp, conn_t *lconnp, ip_recv_attr_t *ira)
3199 3200  {
3200 3201          ASSERT(lconnp->conn_policy != NULL);
3201 3202          ASSERT(connp->conn_policy == NULL);
3202 3203  
                   /* Share the listener's policy; the hold taken here belongs to connp */
3203 3204          IPPH_REFHOLD(lconnp->conn_policy);
3204 3205          connp->conn_policy = lconnp->conn_policy;
3205 3206  
3206 3207          if (ira->ira_ipsec_action != NULL) {
3207 3208                  if (connp->conn_latch == NULL) {
3208 3209                          connp->conn_latch = iplatch_create();
3209 3210                          if (connp->conn_latch == NULL)
3210 3211                                  return (B_FALSE);
3211 3212                  }
3212 3213                  ipsec_latch_inbound(connp, ira);
3213 3214          }
3214 3215          return (B_TRUE);
3215 3216  }
3216 3217  
3217 3218  /*
3218 3219   * Verify whether or not the IP address is a valid local address.
3219 3220   * Could be a unicast, including one for a down interface.
3220 3221   * If allow_mcbc then a multicast or broadcast address is also
3221 3222   * acceptable.
3222 3223   *
3223 3224   * In the case of a broadcast/multicast address, however, the
3224 3225   * upper protocol is expected to reset the src address
3225 3226   * to zero when we return IPVL_MCAST/IPVL_BCAST so that
3226 3227   * no packets are emitted with broadcast/multicast address as
3227 3228   * source address (that violates hosts requirements RFC 1122)
3228 3229   * The addresses valid for bind are:
3229 3230   *      (1) - INADDR_ANY (0)
3230 3231   *      (2) - IP address of an UP interface
3231 3232   *      (3) - IP address of a DOWN interface
3232 3233   *      (4) - valid local IP broadcast addresses. In this case
3233 3234   *      the conn will only receive packets destined to
3234 3235   *      the specified broadcast address.
3235 3236   *      (5) - a multicast address. In this case
3236 3237   *      the conn will only receive packets destined to
3237 3238   *      the specified multicast address. Note: the
3238 3239   *      application still has to issue an
3239 3240   *      IP_ADD_MEMBERSHIP socket option.
3240 3241   *
3241 3242   * In all the above cases, the bound address must be valid in the current zone.
3242 3243   * When the address is loopback, multicast or broadcast, there might be many
3243 3244   * matching IREs so bind has to look up based on the zone.
            *
            * Case (1) is never handled here: callers must not pass INADDR_ANY
            * (see the ASSERT below).
            *
            * Returns IPVL_UNICAST_UP for (2), IPVL_UNICAST_DOWN for (3), and
            * IPVL_BCAST for (4) or IPVL_MCAST for (5) when allow_mcbc is set.
            * IPVL_BAD is returned for (4)/(5) without allow_mcbc, when no ipif
            * owns the address, or when the owning ipif is flagged IPIF_NOLOCAL
            * or IPIF_ANYCAST.
3244 3245   */
3245 3246  ip_laddr_t
3246 3247  ip_laddr_verify_v4(ipaddr_t src_addr, zoneid_t zoneid,
3247 3248      ip_stack_t *ipst, boolean_t allow_mcbc)
3248 3249  {
3249 3250          ire_t *src_ire;
3250 3251  
3251 3252          ASSERT(src_addr != INADDR_ANY);
3252 3253  
3253 3254          src_ire = ire_ftable_lookup_v4(src_addr, 0, 0, 0,
3254 3255              NULL, zoneid, NULL, MATCH_IRE_ZONEONLY, 0, ipst, NULL);
3255 3256  
3256 3257          /*
3257 3258           * If an address other than INADDR_ANY is requested,
3258 3259           * we verify that it is a valid address for bind.
3259 3260           * Note: Following code is in if-else-if form for
3260 3261           * readability compared to a condition check.
                    * Every branch releases src_ire (when non-NULL) before returning.
3261 3262           */
3262 3263          if (src_ire != NULL && (src_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK))) {
3263 3264                  /*
3264 3265                   * (2) Bind to address of local UP interface
3265 3266                   */
3266 3267                  ire_refrele(src_ire);
3267 3268                  return (IPVL_UNICAST_UP);
3268 3269          } else if (src_ire != NULL && src_ire->ire_type & IRE_BROADCAST) {
3269 3270                  /*
3270 3271                   * (4) Bind to broadcast address
3271 3272                   */
3272 3273                  ire_refrele(src_ire);
3273 3274                  if (allow_mcbc)
3274 3275                          return (IPVL_BCAST);
3275 3276                  else
3276 3277                          return (IPVL_BAD);
3277 3278          } else if (CLASSD(src_addr)) {
3278 3279                  /* (5) bind to multicast address. */
3279 3280                  if (src_ire != NULL)
3280 3281                          ire_refrele(src_ire);
3281 3282  
3282 3283                  if (allow_mcbc)
3283 3284                          return (IPVL_MCAST);
3284 3285                  else
3285 3286                          return (IPVL_BAD);
3286 3287          } else {
3287 3288                  ipif_t *ipif;
3288 3289  
3289 3290                  /*
3290 3291                   * (3) Bind to address of local DOWN interface?
3291 3292                   * (ipif_lookup_addr() looks up all interfaces
3292 3293                   * but we do not get here for UP interfaces
3293 3294                   * - case (2) above)
3294 3295                   */
3295 3296                  if (src_ire != NULL)
3296 3297                          ire_refrele(src_ire);
3297 3298  
3298 3299                  ipif = ipif_lookup_addr(src_addr, NULL, zoneid, ipst);
3299 3300                  if (ipif == NULL)
3300 3301                          return (IPVL_BAD);
3301 3302  
3302 3303                  /* Not a useful source? */
3303 3304                  if (ipif->ipif_flags & (IPIF_NOLOCAL | IPIF_ANYCAST)) {
3304 3305                          ipif_refrele(ipif);
3305 3306                          return (IPVL_BAD);
3306 3307                  }
3307 3308                  ipif_refrele(ipif);
3308 3309                  return (IPVL_UNICAST_DOWN);
3309 3310          }
3310 3311  }
3311 3312  
3312 3313  /*
3313 3314   * Insert in the bind fanout for IPv4 and IPv6.
3314 3315   * The caller should already have used ip_laddr_verify_v*() before calling
3315 3316   * this.
            *
            * Returns the result of ipcl_bind_insert(): 0 on success, non-zero on
            * failure.  On failure conn_mlp_type is reset to mlptSingle and, for
            * connections with conn_anon_port set, tsol_mlp_anon() is called with
            * B_FALSE for the connection's zone, protocol and local port.
3316 3317   */
3317 3318  int
3318 3319  ip_laddr_fanout_insert(conn_t *connp)
3319 3320  {
3320 3321          int             error;
3321 3322  
3322 3323          /*
3323 3324           * Allow setting new policies. For example, disconnects result
3324 3325           * in us being called. As we would have set conn_policy_cached
3325 3326           * to B_TRUE before, we should set it to B_FALSE, so that policy
3326 3327           * can change after the disconnect.
3327 3328           */
3328 3329          connp->conn_policy_cached = B_FALSE;
3329 3330  
3330 3331          error = ipcl_bind_insert(connp);
3331 3332          if (error != 0) {
                           /*
                            * NOTE(review): presumably this undoes an earlier anonymous
                            * MLP port registration made for this conn - confirm against
                            * tsol_mlp_anon().  Its return value is deliberately ignored.
                            */
3332 3333                  if (connp->conn_anon_port) {
3333 3334                          (void) tsol_mlp_anon(crgetzone(connp->conn_cred),
3334 3335                              connp->conn_mlp_type, connp->conn_proto,
3335 3336                              ntohs(connp->conn_lport), B_FALSE);
3336 3337                  }
3337 3338                  connp->conn_mlp_type = mlptSingle;
3338 3339          }
3339 3340          return (error);
3340 3341  }
3341 3342  
3342 3343  /*
3343 3344   * Verify that both the source and destination addresses are valid. If
3344 3345   * IPDF_VERIFY_DST is not set, then the destination address may be unreachable,
3345 3346   * i.e. have no route to it.  Protocols like TCP want to verify destination
3346 3347   * reachability, while tunnels do not.
3347 3348   *
3348 3349   * Determine the route, the interface, and (optionally) the source address
3349 3350   * to use to reach a given destination.
3350 3351   * Note that we allow connect to broadcast and multicast addresses when
3351 3352   * IPDF_ALLOW_MCBC is set.
3352 3353   * firsthop and dst_addr are normally the same, but if source routing
3353 3354   * they will differ; in that case the firsthop is what we'll use for the
3354 3355   * routing lookup but the dce and label checks will be done on dst_addr.
3355 3356   *
3356 3357   * If uinfo is set, then we fill in the best available information
3357 3358   * we have for the destination. This is based on (in priority order) any
3358 3359   * metrics and path MTU stored in a dce_t, route metrics, and finally the
3359 3360   * ill_mtu/ill_mc_mtu.
3360 3361   *
3361 3362   * Tsol note: If we have a source route then dst_addr != firsthop. But we
3362 3363   * always do the label check on dst_addr.
            *
            * *src_addrp is read as the requested source address and, when
            * IPDF_SELECT_SRC is set, overwritten with the selected one.
            *
            * Returns 0 on success, otherwise an errno value:
            *  - whatever tsol_check_dest(), ip_select_route_v4() or
            *    ip_select_source_v4() report;
            *  - ENETUNREACH/EHOSTUNREACH for a reject/blackhole route when
            *    IPDF_VERIFY_DST is set;
            *  - ENETUNREACH for a broadcast/multicast route without
            *    IPDF_ALLOW_MCBC;
            *  - EADDRNOTAVAIL for unusable loopback-source combinations.
            * The two *UNREACH cases do not stop processing: ixa_ire, ixa_dce and
            * the fragmentation size are still set up before the error is returned.
            * The other failures leave through bad_addr.
3363 3364   */
3364 3365  int
3365 3366  ip_set_destination_v4(ipaddr_t *src_addrp, ipaddr_t dst_addr, ipaddr_t firsthop,
3366 3367      ip_xmit_attr_t *ixa, iulp_t *uinfo, uint32_t flags, uint_t mac_mode)
3367 3368  {
3368 3369          ire_t           *ire = NULL;
3369 3370          int             error = 0;
3370 3371          ipaddr_t        setsrc;                         /* RTF_SETSRC */
3371 3372          zoneid_t        zoneid = ixa->ixa_zoneid;       /* Honors SO_ALLZONES */
3372 3373          ip_stack_t      *ipst = ixa->ixa_ipst;
3373 3374          dce_t           *dce;
3374 3375          uint_t          pmtu;
3375 3376          uint_t          generation;
3376 3377          nce_t           *nce;
3377 3378          ill_t           *ill = NULL;
3378 3379          boolean_t       multirt = B_FALSE;
3379 3380  
3380 3381          ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);
3381 3382  
3382 3383          /*
3383 3384           * We never send to zero; the ULPs map it to the loopback address.
3384 3385           * We can't allow it since we use zero to mean uninitialized in some
3385 3386           * places.
3386 3387           */
3387 3388          ASSERT(dst_addr != INADDR_ANY);
3388 3389  
3389 3390          if (is_system_labeled()) {
3390 3391                  ts_label_t *tsl = NULL;
3391 3392  
3392 3393                  error = tsol_check_dest(ixa->ixa_tsl, &dst_addr, IPV4_VERSION,
3393 3394                      mac_mode, (flags & IPDF_ZONE_IS_GLOBAL) != 0, &tsl);
3394 3395                  if (error != 0)
3395 3396                          return (error);
3396 3397                  if (tsl != NULL) {
3397 3398                          /* Update the label */
3398 3399                          ip_xmit_attr_replace_tsl(ixa, tsl);
3399 3400                  }
3400 3401          }
3401 3402  
3402 3403          setsrc = INADDR_ANY;
3403 3404          /*
3404 3405           * Select a route; For IPMP interfaces, we would only select
3405 3406           * a "hidden" route (i.e., going through a specific under_ill)
3406 3407           * if ixa_ifindex has been specified.
3407 3408           */
3408 3409          ire = ip_select_route_v4(firsthop, *src_addrp, ixa,
3409 3410              &generation, &setsrc, &error, &multirt);
3410 3411          ASSERT(ire != NULL);    /* IRE_NOROUTE if none found */
3411 3412          if (error != 0)
3412 3413                  goto bad_addr;
3413 3414  
3414 3415          /*
3415 3416           * ire can't be a broadcast or multicast unless IPDF_ALLOW_MCBC is set.
3416 3417           * If IPDF_VERIFY_DST is set, the destination must be reachable;
3417 3418           * Otherwise the destination needn't be reachable.
3418 3419           *
3419 3420           * If we match on a reject or black hole, then we've got a
3420 3421           * local failure.  May as well fail out the connect() attempt,
3421 3422           * since it's never going to succeed.
3422 3423           */
3423 3424          if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
3424 3425                  /*
3425 3426                   * If we're verifying destination reachability, we always want
3426 3427                   * to complain here.
3427 3428                   *
3428 3429                   * If we're not verifying destination reachability but the
3429 3430                   * destination has a route, we still want to fail on the
3430 3431                   * temporary address and broadcast address tests.
3431 3432                   *
3432 3433                   * In both cases we let the code continue so some reasonable
3433 3434                   * information is returned to the caller. That enables the
3434 3435                   * caller to use (and even cache) the IRE. conn_ip_output will
3435 3436                   * use the generation mismatch path to check for the unreachable
3436 3437                   * case thereby avoiding any specific check in the main path.
3437 3438                   */
3438 3439                  ASSERT(generation == IRE_GENERATION_VERIFY);
3439 3440                  if (flags & IPDF_VERIFY_DST) {
3440 3441                          /*
3441 3442                           * Set errno but continue to set up ixa_ire to be
3442 3443                           * the RTF_REJECT|RTF_BLACKHOLE IRE.
3443 3444                           * That allows callers to use ip_output to get an
3444 3445                           * ICMP error back.
3445 3446                           */
3446 3447                          if (!(ire->ire_type & IRE_HOST))
3447 3448                                  error = ENETUNREACH;
3448 3449                          else
3449 3450                                  error = EHOSTUNREACH;
3450 3451                  }
3451 3452          }
3452 3453  
                   /*
                    * Broadcast/multicast not permitted by the caller: swap the route
                    * for the reject IRE and keep going so that ixa is still populated.
                    */
3453 3454          if ((ire->ire_type & (IRE_BROADCAST|IRE_MULTICAST)) &&
3454 3455              !(flags & IPDF_ALLOW_MCBC)) {
3455 3456                  ire_refrele(ire);
3456 3457                  ire = ire_reject(ipst, B_FALSE);
3457 3458                  generation = IRE_GENERATION_VERIFY;
3458 3459                  error = ENETUNREACH;
3459 3460          }
3460 3461  
3461 3462          /* Cache things */
3462 3463          if (ixa->ixa_ire != NULL)
3463 3464                  ire_refrele_notr(ixa->ixa_ire);
                   /*
                    * NOTE(review): under DEBUG the hold is swapped to the _notr
                    * variant, presumably so it matches the ire_refrele_notr() used
                    * when the cached IRE is later released - confirm.
                    */
3464 3465  #ifdef DEBUG
3465 3466          ire_refhold_notr(ire);
3466 3467          ire_refrele(ire);
3467 3468  #endif
                   /* From here on the reference to ire is owned by ixa->ixa_ire */
3468 3469          ixa->ixa_ire = ire;
3469 3470          ixa->ixa_ire_generation = generation;
3470 3471  
3471 3472          /*
3472 3473           * Ensure that ixa_dce is always set any time that ixa_ire is set,
3473 3474           * since some callers will send a packet to conn_ip_output() even if
3474 3475           * there's an error.
3475 3476           */
3476 3477          if (flags & IPDF_UNIQUE_DCE) {
3477 3478                  /* Fallback to the default dce if allocation fails */
3478 3479                  dce = dce_lookup_and_add_v4(dst_addr, ipst);
3479 3480                  if (dce != NULL)
3480 3481                          generation = dce->dce_generation;
3481 3482                  else
3482 3483                          dce = dce_lookup_v4(dst_addr, ipst, &generation);
3483 3484          } else {
3484 3485                  dce = dce_lookup_v4(dst_addr, ipst, &generation);
3485 3486          }
3486 3487          ASSERT(dce != NULL);
3487 3488          if (ixa->ixa_dce != NULL)
3488 3489                  dce_refrele_notr(ixa->ixa_dce);
3489 3490  #ifdef DEBUG
3490 3491          dce_refhold_notr(dce);
3491 3492          dce_refrele(dce);
3492 3493  #endif
3493 3494          ixa->ixa_dce = dce;
3494 3495          ixa->ixa_dce_generation = generation;
3495 3496  
3496 3497          /*
3497 3498           * For multicast with multirt we have a flag passed back from
3498 3499           * ire_lookup_multi_ill_v4 since we don't have an IRE for each
3499 3500           * possible multicast address.
3500 3501           * We also need a flag for multicast since we can't check
3501 3502           * whether RTF_MULTIRT is set in ixa_ire for multicast.
3502 3503           */
3503 3504          if (multirt) {
3504 3505                  ixa->ixa_postfragfn = ip_postfrag_multirt_v4;
3505 3506                  ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST;
3506 3507          } else {
3507 3508                  ixa->ixa_postfragfn = ire->ire_postfragfn;
3508 3509                  ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST;
3509 3510          }
3510 3511          if (!(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
3511 3512                  /* Get an nce to cache. */
3512 3513                  nce = ire_to_nce(ire, firsthop, NULL);
3513 3514                  if (nce == NULL) {
3514 3515                          /* Allocation failure? */
3515 3516                          ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
3516 3517                  } else {
3517 3518                          if (ixa->ixa_nce != NULL)
3518 3519                                  nce_refrele(ixa->ixa_nce);
3519 3520                          ixa->ixa_nce = nce;
3520 3521                  }
3521 3522          }
3522 3523  
3523 3524          /*
3524 3525           * If the source address is a loopback address, the
3525 3526           * destination had best be local or multicast.
3526 3527           * If we are sending to an IRE_LOCAL using a loopback source then
3527 3528           * it had better be the same zoneid.
                    *
                    * ire is set to NULL before jumping to bad_addr so that the
                    * reference now owned by ixa->ixa_ire is not released there.
3528 3529           */
3529 3530          if (*src_addrp == htonl(INADDR_LOOPBACK)) {
3530 3531                  if ((ire->ire_type & IRE_LOCAL) && ire->ire_zoneid != zoneid) {
3531 3532                          ire = NULL;     /* Stored in ixa_ire */
3532 3533                          error = EADDRNOTAVAIL;
3533 3534                          goto bad_addr;
3534 3535                  }
3535 3536                  if (!(ire->ire_type & (IRE_LOOPBACK|IRE_LOCAL|IRE_MULTICAST))) {
3536 3537                          ire = NULL;     /* Stored in ixa_ire */
3537 3538                          error = EADDRNOTAVAIL;
3538 3539                          goto bad_addr;
3539 3540                  }
3540 3541          }
3541 3542          if (ire->ire_type & IRE_BROADCAST) {
3542 3543                  /*
3543 3544                   * If the ULP didn't have a specified source, then we
3544 3545                   * make sure we reselect the source when sending
3545 3546                   * broadcasts out different interfaces.
3546 3547                   */
3547 3548                  if (flags & IPDF_SELECT_SRC)
3548 3549                          ixa->ixa_flags |= IXAF_SET_SOURCE;
3549 3550                  else
3550 3551                          ixa->ixa_flags &= ~IXAF_SET_SOURCE;
3551 3552          }
3552 3553  
3553 3554          /*
3554 3555           * Does the caller want us to pick a source address?
3555 3556           */
3556 3557          if (flags & IPDF_SELECT_SRC) {
3557 3558                  ipaddr_t        src_addr;
3558 3559  
3559 3560                  /*
3560 3561                   * We use ire_nexthop_ill to avoid the under ipmp
3561 3562                   * interface for source address selection. Note that for ipmp
3562 3563                   * probe packets, ixa_ifindex would have been specified, and
3563 3564                   * the ip_select_route() invocation would have picked an ire
3564 3565                   * with ire_ill pointing at an under interface.
                            *
                            * A non-NULL ill is released with ill_refrele() on both
                            * exit paths of this function.
3565 3566                   */
3566 3567                  ill = ire_nexthop_ill(ire);
3567 3568  
3568 3569                  /* If unreachable we have no ill but need some source */
3569 3570                  if (ill == NULL) {
3570 3571                          src_addr = htonl(INADDR_LOOPBACK);
3571 3572                          /* Make sure we look for a better source address */
3572 3573                          generation = SRC_GENERATION_VERIFY;
3573 3574                  } else {
3574 3575                          error = ip_select_source_v4(ill, setsrc, dst_addr,
3575 3576                              ixa->ixa_multicast_ifaddr, zoneid,
3576 3577                              ipst, &src_addr, &generation, NULL);
3577 3578                          if (error != 0) {
3578 3579                                  ire = NULL;     /* Stored in ixa_ire */
3579 3580                                  goto bad_addr;
3580 3581                          }
3581 3582                  }
3582 3583  
3583 3584                  /*
3584 3585                   * We allow the source address to be down.
3585 3586                   * However, we check that we don't use the loopback address
3586 3587                   * as a source when sending out on the wire.
3587 3588                   */
3588 3589                  if ((src_addr == htonl(INADDR_LOOPBACK)) &&
3589 3590                      !(ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK|IRE_MULTICAST)) &&
3590 3591                      !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
3591 3592                          ire = NULL;     /* Stored in ixa_ire */
3592 3593                          error = EADDRNOTAVAIL;
3593 3594                          goto bad_addr;
3594 3595                  }
3595 3596  
3596 3597                  *src_addrp = src_addr;
3597 3598                  ixa->ixa_src_generation = generation;
3598 3599          }
3599 3600  
3600 3601          /*
3601 3602           * Make sure we don't leave an unreachable ixa_nce in place
3602 3603           * since ip_select_route is used when we unplumb i.e., remove
3603 3604           * references on ixa_ire, ixa_nce, and ixa_dce.
3604 3605           */
3605 3606          nce = ixa->ixa_nce;
3606 3607          if (nce != NULL && nce->nce_is_condemned) {
3607 3608                  nce_refrele(nce);
3608 3609                  ixa->ixa_nce = NULL;
3609 3610                  ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
3610 3611          }
3611 3612  
3612 3613          /*
3613 3614           * The caller has set IXAF_PMTU_DISCOVERY if path MTU is desired.
3614 3615           * However, we can't do it for IPv4 multicast or broadcast.
3615 3616           */
3616 3617          if (ire->ire_type & (IRE_BROADCAST|IRE_MULTICAST))
3617 3618                  ixa->ixa_flags &= ~IXAF_PMTU_DISCOVERY;
3618 3619  
3619 3620          /*
3620 3621           * Set initial value for fragmentation limit. Either conn_ip_output
3621 3622           * or ULP might update it when there are routing changes.
3622 3623           * Handles a NULL ixa_ire->ire_ill or a NULL ixa_nce for RTF_REJECT.
3623 3624           */
3624 3625          pmtu = ip_get_pmtu(ixa);
3625 3626          ixa->ixa_fragsize = pmtu;
3626 3627          /* Make sure ixa_fragsize and ixa_pmtu remain identical */
3627 3628          if (ixa->ixa_flags & IXAF_VERIFY_PMTU)
3628 3629                  ixa->ixa_pmtu = pmtu;
3629 3630  
3630 3631          /*
3631 3632           * Extract information useful for some transports.
3632 3633           * First we look for DCE metrics. Then we take what we have in
3633 3634           * the metrics in the route, where the offlink is used if we have
3634 3635           * one.
3635 3636           */
3636 3637          if (uinfo != NULL) {
3637 3638                  bzero(uinfo, sizeof (*uinfo));
3638 3639  
3639 3640                  if (dce->dce_flags & DCEF_UINFO)
3640 3641                          *uinfo = dce->dce_uinfo;
3641 3642  
3642 3643                  rts_merge_metrics(uinfo, &ire->ire_metrics);
3643 3644  
3644 3645                  /* Allow ire_metrics to decrease the path MTU from above */
3645 3646                  if (uinfo->iulp_mtu == 0 || uinfo->iulp_mtu > pmtu)
3646 3647                          uinfo->iulp_mtu = pmtu;
3647 3648  
3648 3649                  uinfo->iulp_localnet = (ire->ire_type & IRE_ONLINK) != 0;
3649 3650                  uinfo->iulp_loopback = (ire->ire_type & IRE_LOOPBACK) != 0;
3650 3651                  uinfo->iulp_local = (ire->ire_type & IRE_LOCAL) != 0;
3651 3652          }
3652 3653  
3653 3654          if (ill != NULL)
3654 3655                  ill_refrele(ill);
3655 3656  
                   /* May be non-zero here: see the *UNREACH cases set earlier */
3656 3657          return (error);
3657 3658  
3658 3659  bad_addr:
                   /* ire is non-NULL only if its reference was not handed to ixa_ire */
3659 3660          if (ire != NULL)
3660 3661                  ire_refrele(ire);
3661 3662  
3662 3663          if (ill != NULL)
3663 3664                  ill_refrele(ill);
3664 3665  
3665 3666          /*
3666 3667           * Make sure we don't leave an unreachable ixa_nce in place
3667 3668           * since ip_select_route is used when we unplumb i.e., remove
3668 3669           * references on ixa_ire, ixa_nce, and ixa_dce.
3669 3670           */
3670 3671          nce = ixa->ixa_nce;
3671 3672          if (nce != NULL && nce->nce_is_condemned) {
3672 3673                  nce_refrele(nce);
3673 3674                  ixa->ixa_nce = NULL;
3674 3675                  ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
3675 3676          }
3676 3677  
3677 3678          return (error);
3678 3679  }
3679 3680  
3680 3681  
3681 3682  /*
3682 3683   * Get the base MTU for the case when path MTU discovery is not used.
3683 3684   * Takes the MTU of the IRE into account.
            *
            * The starting point is ill->ill_mc_mtu for multicast/broadcast IREs
            * and ill->ill_mtu otherwise.  A non-zero ire_metrics.iulp_mtu can
            * only lower that value, never raise it.
            *
            * Both ill and ire are dereferenced unconditionally and must be
            * non-NULL.
3684 3685   */
3685 3686  uint_t
3686 3687  ip_get_base_mtu(ill_t *ill, ire_t *ire)
3687 3688  {
3688 3689          uint_t mtu;
3689 3690          uint_t iremtu = ire->ire_metrics.iulp_mtu;
3690 3691  
3691 3692          if (ire->ire_type & (IRE_MULTICAST|IRE_BROADCAST))
3692 3693                  mtu = ill->ill_mc_mtu;
3693 3694          else
3694 3695                  mtu = ill->ill_mtu;
3695 3696  
                   /* An iremtu of 0 means the route carries no MTU metric */
3696 3697          if (iremtu != 0 && iremtu < mtu)
3697 3698                  mtu = iremtu;
3698 3699  
3699 3700          return (mtu);
3700 3701  }
3701 3702  
3702 3703  /*
3703 3704   * Get the PMTU for the attributes. Handles both IPv4 and IPv6.
3704 3705   * Assumes that ixa_ire, dce, and nce have already been set up.
            * (ixa_ire and ixa_dce are dereferenced unconditionally; a NULL ixa_nce
            * is tolerated and simply skips the interface MTU clamp.)
3705 3706   *
3706 3707   * The caller has set IXAF_PMTU_DISCOVERY if path MTU discovery is desired.
3707 3708   * We avoid path MTU discovery if it is disabled with ndd.
3708 3709   * Furthermore, if the path MTU is too small, then we don't set DF for IPv4.
3709 3710   *
3710 3711   * NOTE: We also used to turn it off for source routed packets. That
3711 3712   * is no longer required since the dce is per final destination.
            *
            * Besides returning the MTU, this updates ixa_flags (IXAF_PMTU_DISCOVERY,
            * IXAF_PMTU_IPV4_DF, IXAF_PMTU_TOO_SMALL, IXAF_IPV6_ADD_FRAGHDR) and
            * expires stale path MTU state held in the dce.
3712 3713   */
3713 3714  uint_t
3714 3715  ip_get_pmtu(ip_xmit_attr_t *ixa)
3715 3716  {
3716 3717          ip_stack_t      *ipst = ixa->ixa_ipst;
3717 3718          dce_t           *dce;
3718 3719          nce_t           *nce;
3719 3720          ire_t           *ire;
3720 3721          uint_t          pmtu;
3721 3722  
3722 3723          ire = ixa->ixa_ire;
3723 3724          dce = ixa->ixa_dce;
3724 3725          nce = ixa->ixa_nce;
3725 3726  
3726 3727          /*
3727 3728           * If path MTU discovery has been turned off by ndd, then we ignore
3728 3729           * any dce_pmtu and for IPv4 we will not set DF.
3729 3730           */
3730 3731          if (!ipst->ips_ip_path_mtu_discovery)
3731 3732                  ixa->ixa_flags &= ~IXAF_PMTU_DISCOVERY;
3732 3733  
3733 3734          pmtu = IP_MAXPACKET;
3734 3735          /*
3735 3736           * Decide whether IPv4 sets DF.
3736 3737           * For IPv6 "no DF" means to use the 1280 mtu
3737 3738           */
3738 3739          if (ixa->ixa_flags & IXAF_PMTU_DISCOVERY) {
3739 3740                  ixa->ixa_flags |= IXAF_PMTU_IPV4_DF;
3740 3741          } else {
3741 3742                  ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF;
3742 3743                  if (!(ixa->ixa_flags & IXAF_IS_IPV4))
3743 3744                          pmtu = IPV6_MIN_MTU;
3744 3745          }
3745 3746  
3746 3747          /* Check if the PMTU is too old before we use it */
3747 3748          if ((dce->dce_flags & DCEF_PMTU) &&
3748 3749              TICK_TO_SEC(ddi_get_lbolt64()) - dce->dce_last_change_time >
3749 3750              ipst->ips_ip_pathmtu_interval) {
3750 3751                  /*
3751 3752                   * Older than 20 minutes. Drop the path MTU information.
                            * (The actual threshold is ips_ip_pathmtu_interval.)
3752 3753                   */
3753 3754                  mutex_enter(&dce->dce_lock);
3754 3755                  dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU);
3755 3756                  dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
3756 3757                  mutex_exit(&dce->dce_lock);
3757 3758                  dce_increment_generation(dce);
3758 3759          }
3759 3760  
3760 3761          /* The metrics on the route can lower the path MTU */
3761 3762          if (ire->ire_metrics.iulp_mtu != 0 &&
3762 3763              ire->ire_metrics.iulp_mtu < pmtu)
3763 3764                  pmtu = ire->ire_metrics.iulp_mtu;
3764 3765  
3765 3766          /*
3766 3767           * If the path MTU is smaller than some minimum, we still use dce_pmtu
3767 3768           * above (would be 576 for IPv4 and 1280 for IPv6), but we clear
3768 3769           * IXAF_PMTU_IPV4_DF so that we avoid setting DF for IPv4.
3769 3770           */
3770 3771          if (ixa->ixa_flags & IXAF_PMTU_DISCOVERY) {
3771 3772                  if (dce->dce_flags & DCEF_PMTU) {
3772 3773                          if (dce->dce_pmtu < pmtu)
3773 3774                                  pmtu = dce->dce_pmtu;
3774 3775  
3775 3776                          if (dce->dce_flags & DCEF_TOO_SMALL_PMTU) {
3776 3777                                  ixa->ixa_flags |= IXAF_PMTU_TOO_SMALL;
3777 3778                                  ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF;
3778 3779                          } else {
3779 3780                                  ixa->ixa_flags &= ~IXAF_PMTU_TOO_SMALL;
3780 3781                                  ixa->ixa_flags |= IXAF_PMTU_IPV4_DF;
3781 3782                          }
3782 3783                  } else {
3783 3784                          ixa->ixa_flags &= ~IXAF_PMTU_TOO_SMALL;
3784 3785                          ixa->ixa_flags |= IXAF_PMTU_IPV4_DF;
3785 3786                  }
3786 3787          }
3787 3788  
3788 3789          /*
3789 3790           * If we have an IRE_LOCAL we use the loopback mtu instead of
3790 3791           * the ill for going out the wire i.e., IRE_LOCAL gets the same
3791 3792           * mtu as IRE_LOOPBACK.
3792 3793           */
3793 3794          if (ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) {
3794 3795                  uint_t loopback_mtu;
3795 3796  
3796 3797                  loopback_mtu = (ire->ire_ipversion == IPV6_VERSION) ?
3797 3798                      ip_loopback_mtu_v6plus : ip_loopback_mtuplus;
3798 3799  
3799 3800                  if (loopback_mtu < pmtu)
3800 3801                          pmtu = loopback_mtu;
3801 3802          } else if (nce != NULL) {
3802 3803                  /*
3803 3804                   * Make sure we don't exceed the interface MTU.
3804 3805                   * In the case of RTF_REJECT or RTF_BLACKHOLE we might not have
3805 3806                   * an ill. We'd use the above IP_MAXPACKET in that case just
3806 3807                   * to tell the transport something larger than zero.
3807 3808                   */
3808 3809                  if (ire->ire_type & (IRE_MULTICAST|IRE_BROADCAST)) {
3809 3810                          if (nce->nce_common->ncec_ill->ill_mc_mtu < pmtu)
3810 3811                                  pmtu = nce->nce_common->ncec_ill->ill_mc_mtu;
3811 3812                          if (nce->nce_common->ncec_ill != nce->nce_ill &&
3812 3813                              nce->nce_ill->ill_mc_mtu < pmtu) {
3813 3814                                  /*
3814 3815                                   * for interfaces in an IPMP group, the mtu of
3815 3816                                   * the nce_ill (under_ill) could be different
3816 3817                                   * from the mtu of the ncec_ill, so we take the
3817 3818                                   * min of the two.
3818 3819                                   */
3819 3820                                  pmtu = nce->nce_ill->ill_mc_mtu;
3820 3821                          }
3821 3822                  } else {
3822 3823                          if (nce->nce_common->ncec_ill->ill_mtu < pmtu)
3823 3824                                  pmtu = nce->nce_common->ncec_ill->ill_mtu;
3824 3825                          if (nce->nce_common->ncec_ill != nce->nce_ill &&
3825 3826                              nce->nce_ill->ill_mtu < pmtu) {
3826 3827                                  /*
3827 3828                                   * for interfaces in an IPMP group, the mtu of
3828 3829                                   * the nce_ill (under_ill) could be different
3829 3830                                   * from the mtu of the ncec_ill, so we take the
3830 3831                                   * min of the two.
3831 3832                                   */
3832 3833                                  pmtu = nce->nce_ill->ill_mtu;
3833 3834                          }
3834 3835                  }
3835 3836          }
3836 3837  
3837 3838          /*
3838 3839           * Handle the IPV6_USE_MIN_MTU socket option or ancillary data.
3839 3840           * Only applies to IPv6.
3840 3841           */
3841 3842          if (!(ixa->ixa_flags & IXAF_IS_IPV4)) {
3842 3843                  if (ixa->ixa_flags & IXAF_USE_MIN_MTU) {
3843 3844                          switch (ixa->ixa_use_min_mtu) {
3844 3845                          case IPV6_USE_MIN_MTU_MULTICAST:
3845 3846                                  if (ire->ire_type & IRE_MULTICAST)
3846 3847                                          pmtu = IPV6_MIN_MTU;
3847 3848                                  break;
3848 3849                          case IPV6_USE_MIN_MTU_ALWAYS:
3849 3850                                  pmtu = IPV6_MIN_MTU;
3850 3851                                  break;
3851 3852                          case IPV6_USE_MIN_MTU_NEVER:
3852 3853                                  break;
3853 3854                          }
3854 3855                  } else {
3855 3856                          /* Default is IPV6_USE_MIN_MTU_MULTICAST */
3856 3857                          if (ire->ire_type & IRE_MULTICAST)
3857 3858                                  pmtu = IPV6_MIN_MTU;
3858 3859                  }
3859 3860          }
3860 3861  
3861 3862          /*
3862 3863           * For multirouted IPv6 packets, the IP layer will insert a 8-byte
3863 3864           * fragment header in every packet. We compensate for those cases by
3864 3865           * returning a smaller path MTU to the ULP.
3865 3866           *
3866 3867           * In the case of CGTP then ip_output will add a fragment header.
3867 3868           * Make sure there is room for it by telling a smaller number
3868 3869           * to the transport.
3869 3870           *
3870 3871           * When IXAF_IPV6_ADD_FRAGHDR we subtract the frag hdr here
3871 3872           * so the ULPs consistently see a iulp_pmtu and ip_get_pmtu()
3872 3873           * which is the size of the packets it can send.
3873 3874           */
3874 3875          if (!(ixa->ixa_flags & IXAF_IS_IPV4)) {
3875 3876                  if ((ire->ire_flags & RTF_MULTIRT) ||
3876 3877                      (ixa->ixa_flags & IXAF_MULTIRT_MULTICAST)) {
3877 3878                          pmtu -= sizeof (ip6_frag_t);
3878 3879                          ixa->ixa_flags |= IXAF_IPV6_ADD_FRAGHDR;
3879 3880                  }
3880 3881          }
3881 3882  
3882 3883          return (pmtu);
3883 3884  }
3884 3885  
3885 3886  /*
3886 3887   * Carve "len" bytes out of an mblk chain, consuming any we empty, and duping
3887 3888   * the final piece where we don't.  Return a pointer to the first mblk in the
3888 3889   * result, and update the pointer to the next mblk to chew on.  If anything
3889 3890   * goes wrong (i.e., dupb fails), we waste everything in sight and return a
3890 3891   * NULL pointer.
            *
            * NULL is also returned, with nothing touched, when len is zero, mpp is
            * NULL, or *mpp is NULL.
            *
            * NOTE(review): when the first mblk alone holds more than "len" bytes and
            * dupb fails, NULL is returned but nothing is freed and *mpp is left
            * unchanged, unlike the other failure paths, which free the chain and
            * set *mpp to NULL.  Callers should be checked against this.
3891 3892   */
3892 3893  mblk_t *
3893 3894  ip_carve_mp(mblk_t **mpp, ssize_t len)
3894 3895  {
3895 3896          mblk_t  *mp0;
3896 3897          mblk_t  *mp1;
3897 3898          mblk_t  *mp2;
3898 3899  
3899 3900          if (!len || !mpp || !(mp0 = *mpp))
3900 3901                  return (NULL);
3901 3902          /* If we aren't going to consume the first mblk, we need a dup. */
3902 3903          if (mp0->b_wptr - mp0->b_rptr > len) {
3903 3904                  mp1 = dupb(mp0);
3904 3905                  if (mp1) {
3905 3906                          /* Partition the data between the two mblks. */
3906 3907                          mp1->b_wptr = mp1->b_rptr + len;
3907 3908                          mp0->b_rptr = mp1->b_wptr;
3908 3909                          /*
3909 3910                           * After the adjustment, if the unconsumed mblk is now
3910 3911                           * unaligned, try to align it. If this fails, free
3911 3912                           * all messages and let the upper layer recover.
3912 3913                           */
3913 3914                          if (!OK_32PTR(mp0->b_rptr)) {
3914 3915                                  if (!pullupmsg(mp0, -1)) {
3915 3916                                          freemsg(mp0);
3916 3917                                          freemsg(mp1);
3917 3918                                          *mpp = NULL;
3918 3919                                          return (NULL);
3919 3920                                  }
3920 3921                          }
3921 3922                  }
                           /* *mpp still points at mp0; only its b_rptr has advanced. */
3922 3923                  return (mp1);
3923 3924          }
3924 3925          /* Eat through as many mblks as we need to get len bytes. */
3925 3926          len -= mp0->b_wptr - mp0->b_rptr;
3926 3927          for (mp2 = mp1 = mp0; (mp2 = mp2->b_cont) != 0 && len; mp1 = mp2) {
3927 3928                  if (mp2->b_wptr - mp2->b_rptr > len) {
3928 3929                          /*
3929 3930                           * We won't consume the entire last mblk.  Like
3930 3931                           * above, dup and partition it.
3931 3932                           */
3932 3933                          mp1->b_cont = dupb(mp2);
3933 3934                          mp1 = mp1->b_cont;
3934 3935                          if (!mp1) {
3935 3936                                  /*
3936 3937                                   * Trouble.  Rather than go to a lot of
3937 3938                                   * trouble to clean up, we free the messages.
3938 3939                                   * This won't be any worse than losing it on
3939 3940                                   * the wire.
3940 3941                                   */
3941 3942                                  freemsg(mp0);
3942 3943                                  freemsg(mp2);
3943 3944                                  *mpp = NULL;
3944 3945                                  return (NULL);
3945 3946                          }
3946 3947                          mp1->b_wptr = mp1->b_rptr + len;
3947 3948                          mp2->b_rptr = mp1->b_wptr;
3948 3949                          /*
3949 3950                           * After the adjustment, if the unconsumed mblk is now
3950 3951                           * unaligned, try to align it. If this fails, free
3951 3952                           * all messages and let the upper layer recover.
3952 3953                           */
3953 3954                          if (!OK_32PTR(mp2->b_rptr)) {
3954 3955                                  if (!pullupmsg(mp2, -1)) {
3955 3956                                          freemsg(mp0);
3956 3957                                          freemsg(mp2);
3957 3958                                          *mpp = NULL;
3958 3959                                          return (NULL);
3959 3960                                  }
3960 3961                          }
3961 3962                          *mpp = mp2;
3962 3963                          return (mp0);
3963 3964                  }
3964 3965                  /* Decrement len by the amount we just got. */
3965 3966                  len -= mp2->b_wptr - mp2->b_rptr;
3966 3967          }
3967 3968          /*
3968 3969           * len should be reduced to zero now.  If not our caller has
3969 3970           * screwed up.
3970 3971           */
3971 3972          if (len) {
3972 3973                  /* Shouldn't happen! */
3973 3974                  freemsg(mp0);
3974 3975                  *mpp = NULL;
3975 3976                  return (NULL);
3976 3977          }
3977 3978          /*
3978 3979           * We consumed up to exactly the end of an mblk.  Detach the part
3979 3980           * we are returning from the rest of the chain.
3980 3981           */
3981 3982          mp1->b_cont = NULL;
3982 3983          *mpp = mp2;
3983 3984          return (mp0);
3984 3985  }
3985 3986  
3986 3987  /* The ill stream is being unplumbed. Called from ip_close; returns 0. */
3987 3988  int
3988 3989  ip_modclose(ill_t *ill)
3989 3990  {
3990 3991          boolean_t success;
3991 3992          ipsq_t  *ipsq;
3992 3993          ipif_t  *ipif;
3993 3994          queue_t *q = ill->ill_rq;
3994 3995          ip_stack_t      *ipst = ill->ill_ipst;
3995 3996          int     i;
3996 3997          arl_ill_common_t *ai = ill->ill_common;
3997 3998  
3998 3999          /*
3999 4000           * The punlink prior to this may have initiated a capability
4000 4001           * negotiation. But ipsq_enter will block until that finishes or
4001 4002           * times out.
4002 4003           */
4003 4004          success = ipsq_enter(ill, B_FALSE, NEW_OP);
4004 4005  
4005 4006          /*
4006 4007           * Open/close/push/pop is guaranteed to be single threaded
4007 4008           * per stream by STREAMS. FS guarantees that all references
4008 4009           * from top are gone before close is called. So there can't
4009 4010           * be another close thread that has set CONDEMNED on this ill
4010 4011           * and caused ipsq_enter to return failure.
4011 4012           */
4012 4013          ASSERT(success);
4013 4014          ipsq = ill->ill_phyint->phyint_ipsq;
4014 4015  
4015 4016          /*
4016 4017           * Mark it condemned. No new reference will be made to this ill.
4017 4018           * Lookup functions will return an error. Threads that try to
4018 4019           * increment the refcnt must check for ILL_CAN_LOOKUP. This ensures
4019 4020           * that the refcnt will drop down to zero.
4020 4021           */
4021 4022          mutex_enter(&ill->ill_lock);
4022 4023          ill->ill_state_flags |= ILL_CONDEMNED;
4023 4024          for (ipif = ill->ill_ipif; ipif != NULL;
4024 4025              ipif = ipif->ipif_next) {
4025 4026                  ipif->ipif_state_flags |= IPIF_CONDEMNED;
4026 4027          }
4027 4028          /*
4028 4029           * Wake up anybody waiting to enter the ipsq. ipsq_enter
4029 4030           * returns an error if ILL_CONDEMNED is set.
4030 4031           */
4031 4032          cv_broadcast(&ill->ill_cv);
4032 4033          mutex_exit(&ill->ill_lock);
4033 4034  
4034 4035          /*
4035 4036           * Send all the deferred DLPI messages downstream which came in
4036 4037           * during the small window right before ipsq_enter(). We do this
4037 4038           * without waiting for the ACKs because all the ACKs for M_PROTO
4038 4039           * messages are ignored in ip_rput() when ILL_CONDEMNED is set.
4039 4040           */
4040 4041          ill_dlpi_send_deferred(ill);
4041 4042  
4042 4043          /*
4043 4044           * Shut down fragmentation reassembly.
4044 4045           * ill_frag_timer won't start a timer again.
4045 4046           * Now cancel any existing timer
4046 4047           */
4047 4048          (void) untimeout(ill->ill_frag_timer_id);
4048 4049          (void) ill_frag_timeout(ill, 0);
4049 4050  
4050 4051          /*
4051 4052           * Call ill_delete to bring down the ipifs, ilms and ill on
4052 4053           * this ill. Then wait for the refcnts to drop to zero.
4053 4054           * ill_is_freeable checks whether the ill is really quiescent.
4054 4055           * Then make sure that threads that are waiting to enter the
4055 4056           * ipsq have seen the error returned by ipsq_enter and have
4056 4057           * gone away. Then we call ill_delete_tail which does the
4057 4058           * DL_UNBIND_REQ with the driver and then qprocsoff.
4058 4059           */
4059 4060          ill_delete(ill);
4060 4061          mutex_enter(&ill->ill_lock);
4061 4062          while (!ill_is_freeable(ill))
4062 4063                  cv_wait(&ill->ill_cv, &ill->ill_lock);
4063 4064  
4064 4065          while (ill->ill_waiters)
4065 4066                  cv_wait(&ill->ill_cv, &ill->ill_lock);
4066 4067  
4067 4068          mutex_exit(&ill->ill_lock);
4068 4069  
4069 4070          /*
4070 4071           * ill_delete_tail drops reference on ill_ipst, but we need to keep
4071 4072           * it held until the end of the function since the cleanup
4072 4073           * below needs to be able to use the ip_stack_t.
4073 4074           */
4074 4075          netstack_hold(ipst->ips_netstack);
4075 4076  
4076 4077          /* qprocsoff is done via ill_delete_tail */
4077 4078          ill_delete_tail(ill);
4078 4079          /*
4079 4080           * synchronously wait for arp stream to unbind. After this, we
4080 4081           * cannot get any data packets up from the driver.
4081 4082           */
4082 4083          arp_unbind_complete(ill);
4083 4084          ASSERT(ill->ill_ipst == NULL);
4084 4085  
4085 4086          /*
4086 4087           * Walk through all conns and qenable those that have queued data.
4087 4088           * Close synchronization needs this to
4088 4089           * be done to ensure that all upper layers blocked
4089 4090           * due to flow control to the closing device
4090 4091           * get unblocked.
4091 4092           */
4092 4093          ip1dbg(("ip_wsrv: walking\n"));
4093 4094          for (i = 0; i < TX_FANOUT_SIZE; i++) {
4094 4095                  conn_walk_drain(ipst, &ipst->ips_idl_tx_list[i]);
4095 4096          }
4096 4097  
4097 4098          /*
4098 4099           * ai can be null if this is an IPv6 ill, or if the IPv4
4099 4100           * stream is being torn down before ARP was plumbed (e.g.,
4100 4101           * /sbin/ifconfig plumbing a stream twice, and encountering
4101 4102           * an error).
                    *
                    * If the ARP side (ai_arl) is already gone we are the last user
                    * and free ai; otherwise we signal ai_ill_unplumb_done and leave
                    * ai in place.
4102 4103           */
4103 4104          if (ai != NULL) {
4104 4105                  ASSERT(!ill->ill_isv6);
4105 4106                  mutex_enter(&ai->ai_lock);
4106 4107                  ai->ai_ill = NULL;
4107 4108                  if (ai->ai_arl == NULL) {
4108 4109                          mutex_destroy(&ai->ai_lock);
4109 4110                          kmem_free(ai, sizeof (*ai));
4110 4111                  } else {
4111 4112                          cv_signal(&ai->ai_ill_unplumb_done);
4112 4113                          mutex_exit(&ai->ai_lock);
4113 4114                  }
4114 4115          }
4115 4116  
4116 4117          mutex_enter(&ipst->ips_ip_mi_lock);
4117 4118          mi_close_unlink(&ipst->ips_ip_g_head, (IDP)ill);
4118 4119          mutex_exit(&ipst->ips_ip_mi_lock);
4119 4120  
4120 4121          /*
4121 4122           * credp could be null if the open didn't succeed and ip_modopen
4122 4123           * itself calls ip_close.
4123 4124           */
4124 4125          if (ill->ill_credp != NULL)
4125 4126                  crfree(ill->ill_credp);
4126 4127  
4127 4128          mutex_destroy(&ill->ill_saved_ire_lock);
4128 4129          mutex_destroy(&ill->ill_lock);
4129 4130          rw_destroy(&ill->ill_mcast_lock);
4130 4131          mutex_destroy(&ill->ill_mcast_serializer);
4131 4132          list_destroy(&ill->ill_nce);
4132 4133  
4133 4134          /*
4134 4135           * Now we are done with the module close pieces that
4135 4136           * need the netstack_t.
4136 4137           */
4137 4138          netstack_rele(ipst->ips_netstack);
4138 4139  
4139 4140          mi_close_free((IDP)ill);
4140 4141          q->q_ptr = WR(q)->q_ptr = NULL;
4141 4142  
                   /* ipsq was saved before teardown; the ill must not be used here. */
4142 4143          ipsq_exit(ipsq);
4143 4144  
4144 4145          return (0);
4145 4146  }
4146 4147  
4147 4148  /*
4148 4149   * This is called as part of close() for IP, UDP, ICMP, and RTS
4149 4150   * in order to quiesce the conn.
            *
            * Must not be called on a TCP conn (asserted).  On return the conn has
            * CONN_CLOSING, CONN_CONDEMNED and CONN_QUIESCED set, has been removed
            * via ipcl_hash_remove(), and conn_ref has dropped to 1.
4150 4151   */
4151 4152  void
4152 4153  ip_quiesce_conn(conn_t *connp)
4153 4154  {
4154 4155          boolean_t       drain_cleanup_reqd = B_FALSE;
4155 4156          boolean_t       conn_ioctl_cleanup_reqd = B_FALSE;
4156 4157          boolean_t       ilg_cleanup_reqd = B_FALSE;
4157 4158          ip_stack_t      *ipst;
4158 4159  
4159 4160          ASSERT(!IPCL_IS_TCP(connp));
4160 4161          ipst = connp->conn_netstack->netstack_ip;
4161 4162  
4162 4163          /*
4163 4164           * Mark the conn as closing, and this conn must not be
4164 4165           * inserted in future into any list. E.g., conn_drain_insert()
4165 4166           * won't insert this conn into the conn_drain_list.
4166 4167           *
4167 4168           * conn_idl, and conn_ilg cannot get set henceforth.
                    *
                    * The cleanup needed is only recorded while conn_lock is held;
                    * the cleanup calls themselves are made after it is dropped.
4168 4169           */
4169 4170          mutex_enter(&connp->conn_lock);
4170 4171          ASSERT(!(connp->conn_state_flags & CONN_QUIESCED));
4171 4172          connp->conn_state_flags |= CONN_CLOSING;
4172 4173          if (connp->conn_idl != NULL)
4173 4174                  drain_cleanup_reqd = B_TRUE;
4174 4175          if (connp->conn_oper_pending_ill != NULL)
4175 4176                  conn_ioctl_cleanup_reqd = B_TRUE;
4176 4177          if (connp->conn_dhcpinit_ill != NULL) {
4177 4178                  ASSERT(connp->conn_dhcpinit_ill->ill_dhcpinit != 0);
4178 4179                  atomic_dec_32(&connp->conn_dhcpinit_ill->ill_dhcpinit);
4179 4180                  ill_set_inputfn(connp->conn_dhcpinit_ill);
4180 4181                  connp->conn_dhcpinit_ill = NULL;
4181 4182          }
4182 4183          if (connp->conn_ilg != NULL)
4183 4184                  ilg_cleanup_reqd = B_TRUE;
4184 4185          mutex_exit(&connp->conn_lock);
4185 4186  
4186 4187          if (conn_ioctl_cleanup_reqd)
4187 4188                  conn_ioctl_cleanup(connp);
4188 4189  
4189 4190          if (is_system_labeled() && connp->conn_anon_port) {
4190 4191                  (void) tsol_mlp_anon(crgetzone(connp->conn_cred),
4191 4192                      connp->conn_mlp_type, connp->conn_proto,
4192 4193                      ntohs(connp->conn_lport), B_FALSE);
4193 4194                  connp->conn_anon_port = 0;
4194 4195          }
4195 4196          connp->conn_mlp_type = mlptSingle;
4196 4197  
4197 4198          /*
4198 4199           * Remove this conn from any fanout list it is on,
4199 4200           * and then wait for any threads currently operating
4200 4201           * on this endpoint to finish.
4201 4202           */
4202 4203          ipcl_hash_remove(connp);
4203 4204  
4204 4205          /*
4205 4206           * Remove this conn from the drain list, and do any other cleanup that
4206 4207           * may be required.  (TCP conns are never flow controlled, and
4207 4208           * conn_idl will be NULL.)
4208 4209           */
4209 4210          if (drain_cleanup_reqd && connp->conn_idl != NULL) {
4210 4211                  idl_t *idl = connp->conn_idl;
4211 4212  
4212 4213                  mutex_enter(&idl->idl_lock);
4213 4214                  conn_drain(connp, B_TRUE);
4214 4215                  mutex_exit(&idl->idl_lock);
4215 4216          }
4216 4217  
4217 4218          if (connp == ipst->ips_ip_g_mrouter)
4218 4219                  (void) ip_mrouter_done(ipst);
4219 4220  
4220 4221          if (ilg_cleanup_reqd)
4221 4222                  ilg_delete_all(connp);
4222 4223  
4223 4224          /*
4224 4225           * Now conn refcnt can increase only thru CONN_INC_REF_LOCKED.
4225 4226           * callers from write side can't be there now because close
4226 4227           * is in progress. The only other caller is ipcl_walk
4227 4228           * which checks for the condemned flag.
4228 4229           */
4229 4230          mutex_enter(&connp->conn_lock);
4230 4231          connp->conn_state_flags |= CONN_CONDEMNED;
4231 4232          while (connp->conn_ref != 1)
4232 4233                  cv_wait(&connp->conn_cv, &connp->conn_lock);
4233 4234          connp->conn_state_flags |= CONN_QUIESCED;
4234 4235          mutex_exit(&connp->conn_lock);
4235 4236  }
4236 4237  
4237 4238  /* ARGSUSED */
4238 4239  int
4239 4240  ip_close(queue_t *q, int flags)
4240 4241  {
4241 4242          conn_t          *connp;
4242 4243  
4243 4244          /*
4244 4245           * Call the appropriate delete routine depending on whether this is
4245 4246           * a module or device.
                    *
                    * A non-NULL q_next on the write queue is treated as a module
                    * close, in which case q_ptr is the ill_t and ip_modclose does
                    * all the work.  Otherwise q_ptr is a conn_t.
4246 4247           */
4247 4248          if (WR(q)->q_next != NULL) {
4248 4249                  /* This is a module close */
4249 4250                  return (ip_modclose((ill_t *)q->q_ptr));
4250 4251          }
4251 4252  
4252 4253          connp = q->q_ptr;
4253 4254          ip_quiesce_conn(connp);
4254 4255  
4255 4256          qprocsoff(q);
4256 4257  
4257 4258          /*
4258 4259           * Now we are truly single threaded on this stream, and can
4259 4260           * delete the things hanging off the connp, and finally the connp.
4260 4261           * We removed this connp from the fanout list, it cannot be
4261 4262           * accessed thru the fanouts, and we already waited for the
4262 4263           * conn_ref to drop to 1 (our own reference). We are already in
4263 4264           * close, so there cannot be any other thread from the top.
4264 4265           * qprocsoff has completed, and service has completed or won't
4265 4266           * run in future.
4266 4267           */
4267 4268          ASSERT(connp->conn_ref == 1);
4268 4269  
4269 4270          inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
4270 4271  
                   /* Drop the last reference by hand before destroying the conn. */
4271 4272          connp->conn_ref--;
4272 4273          ipcl_conn_destroy(connp);
4273 4274  
4274 4275          q->q_ptr = WR(q)->q_ptr = NULL;
4275 4276          return (0);
4276 4277  }
4277 4278  
4278 4279  /*
4279 4280   * Wrapper around putnext() so that ip_rts_request can merely use
4280 4281   * conn_recv.  Passes mp up the conn's read queue (conn_rq); arg2 and
            * ira are unused.
4281 4282   */
4282 4283  /*ARGSUSED2*/
4283 4284  static void
4284 4285  ip_conn_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
4285 4286  {
4286 4287          conn_t *connp = (conn_t *)arg1;
4287 4288  
4288 4289          putnext(connp->conn_rq, mp);
4289 4290  }
4290 4291  
4291 4292  /* Dummy in case ICMP error delivery is attempted to a /dev/ip instance */
4292 4293  /* ARGSUSED */
4293 4294  static void
4294 4295  ip_conn_input_icmp(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
4295 4296  {
                   /* Nothing is delivered; the message is simply freed. */
4296 4297          freemsg(mp);
4297 4298  }
4298 4299  
4299 4300  /*
4300 4301   * Called when the module is about to be unloaded
            *
            * Unregisters the CPU setup callback, runs the global destroy routines
            * of the transports and IP subsystems, destroys the minor-number
            * arenas, and finally unregisters NS_IP from the netstack framework.
4301 4302   */
4302 4303  void
4303 4304  ip_ddi_destroy(void)
4304 4305  {
4305 4306          /* This needs to be called before destroying any transports. */
4306 4307          mutex_enter(&cpu_lock);
4307 4308          unregister_cpu_setup_func(ip_tp_cpu_update, NULL);
4308 4309          mutex_exit(&cpu_lock);
4309 4310  
4310 4311          tnet_fini();
4311 4312  
4312 4313          icmp_ddi_g_destroy();
4313 4314          rts_ddi_g_destroy();
4314 4315          udp_ddi_g_destroy();
4315 4316          sctp_ddi_g_destroy();
4316 4317          tcp_ddi_g_destroy();
4317 4318          ilb_ddi_g_destroy();
4318 4319          dce_g_destroy();
4319 4320          ipsec_policy_g_destroy();
4320 4321          ipcl_g_destroy();
4321 4322          ip_net_g_destroy();
4322 4323          ip_ire_g_fini();
4323 4324          inet_minor_destroy(ip_minor_arena_sa);
                   /* ip_minor_arena_la is only torn down on _LP64 builds. */
4324 4325  #if defined(_LP64)
4325 4326          inet_minor_destroy(ip_minor_arena_la);
4326 4327  #endif
4327 4328  
                   /* The IP thread-tracking list, lock and TSD key are DEBUG-only. */
4328 4329  #ifdef DEBUG
4329 4330          list_destroy(&ip_thread_list);
4330 4331          rw_destroy(&ip_thread_rwlock);
4331 4332          tsd_destroy(&ip_thread_data);
4332 4333  #endif
4333 4334  
4334 4335          netstack_unregister(NS_IP);
4335 4336  }
4336 4337  
4337 4338  /*
4338 4339   * First step in cleanup.
            *
            * arg is the ip_stack_t being shut down.  Cleans up the special
            * interfaces, starts hook shutdown notification, and stops the
            * capability taskq thread, waiting for it to exit before returning.
4339 4340   */
4340 4341  /* ARGSUSED */
4341 4342  static void
4342 4343  ip_stack_shutdown(netstackid_t stackid, void *arg)
4343 4344  {
4344 4345          ip_stack_t *ipst = (ip_stack_t *)arg;
4345 4346          kt_did_t ktid;
4346 4347  
4347 4348  #ifdef NS_DEBUG
4348 4349          printf("ip_stack_shutdown(%p, stack %d)\n", (void *)ipst, stackid);
4349 4350  #endif
4350 4351  
4351 4352          /*
4352 4353           * Perform cleanup for special interfaces (loopback and IPMP).
4353 4354           */
4354 4355          ip_interface_cleanup(ipst);
4355 4356  
4356 4357          /*
4357 4358           * The *_hook_shutdown()s start the process of notifying any
4358 4359           * consumers that things are going away.... nothing is destroyed.
4359 4360           */
4360 4361          ipv4_hook_shutdown(ipst);
4361 4362          ipv6_hook_shutdown(ipst);
4362 4363          arp_hook_shutdown(ipst);
4363 4364  
                   /*
                    * Ask the capability taskq thread to quit.  Its thread id is
                    * captured under the lock so that it can be joined below.
                    */
4364 4365          mutex_enter(&ipst->ips_capab_taskq_lock);
4365 4366          ktid = ipst->ips_capab_taskq_thread->t_did;
4366 4367          ipst->ips_capab_taskq_quit = B_TRUE;
4367 4368          cv_signal(&ipst->ips_capab_taskq_cv);
4368 4369          mutex_exit(&ipst->ips_capab_taskq_lock);
4369 4370  
4370 4371          /*
4371 4372           * In rare occurrences, particularly on virtual hardware where CPUs can
4372 4373           * be de-scheduled, the thread that we just signaled will not run until
4373 4374           * after we have gotten through parts of ip_stack_fini. If that happens
4374 4375           * then it will try to grab the ips_capab_taskq_lock as part of
4375 4376           * returning from cv_wait, and that lock no longer exists by then.
                    * Joining the thread here makes sure it has exited first.
4376 4377           */
4377 4378          thread_join(ktid);
4378 4379  }
4379 4380  
4380 4381  /*
4381 4382   * Free the IP stack instance.
4382 4383   */
4383 4384  static void
4384 4385  ip_stack_fini(netstackid_t stackid, void *arg)
4385 4386  {
4386 4387          ip_stack_t *ipst = (ip_stack_t *)arg;
4387 4388          int ret;
4388 4389  
4389 4390  #ifdef NS_DEBUG
4390 4391          printf("ip_stack_fini(%p, stack %d)\n", (void *)ipst, stackid);
4391 4392  #endif
4392 4393          /*
4393 4394           * At this point, all of the notifications that the events and
4394 4395           * protocols are going away have been run, meaning that we can
4395 4396           * now set about starting to clean things up.
4396 4397           */
4397 4398          ipobs_fini(ipst);
4398 4399          ipv4_hook_destroy(ipst);
4399 4400          ipv6_hook_destroy(ipst);
4400 4401          arp_hook_destroy(ipst);
4401 4402          ip_net_destroy(ipst);
4402 4403  
4403 4404          ipmp_destroy(ipst);
4404 4405  
4405 4406          ip_kstat_fini(stackid, ipst->ips_ip_mibkp);
4406 4407          ipst->ips_ip_mibkp = NULL;
4407 4408          icmp_kstat_fini(stackid, ipst->ips_icmp_mibkp);
4408 4409          ipst->ips_icmp_mibkp = NULL;
4409 4410          ip_kstat2_fini(stackid, ipst->ips_ip_kstat);
4410 4411          ipst->ips_ip_kstat = NULL;
4411 4412          bzero(&ipst->ips_ip_statistics, sizeof (ipst->ips_ip_statistics));
4412 4413          ip6_kstat_fini(stackid, ipst->ips_ip6_kstat);
4413 4414          ipst->ips_ip6_kstat = NULL;
4414 4415          bzero(&ipst->ips_ip6_statistics, sizeof (ipst->ips_ip6_statistics));
4415 4416  
4416 4417          kmem_free(ipst->ips_propinfo_tbl,
4417 4418              ip_propinfo_count * sizeof (mod_prop_info_t));
4418 4419          ipst->ips_propinfo_tbl = NULL;
4419 4420  
4420 4421          dce_stack_destroy(ipst);
4421 4422          ip_mrouter_stack_destroy(ipst);
4422 4423  
4423 4424          /*
4424 4425           * Quiesce all of our timers. Note we set the quiesce flags before we
4425 4426           * call untimeout. The slowtimers may actually kick off another instance
4426 4427           * of the non-slow timers.
4427 4428           */
4428 4429          mutex_enter(&ipst->ips_igmp_timer_lock);
4429 4430          ipst->ips_igmp_timer_quiesce = B_TRUE;
4430 4431          mutex_exit(&ipst->ips_igmp_timer_lock);
4431 4432  
4432 4433          mutex_enter(&ipst->ips_mld_timer_lock);
4433 4434          ipst->ips_mld_timer_quiesce = B_TRUE;
4434 4435          mutex_exit(&ipst->ips_mld_timer_lock);
4435 4436  
4436 4437          mutex_enter(&ipst->ips_igmp_slowtimeout_lock);
4437 4438          ipst->ips_igmp_slowtimeout_quiesce = B_TRUE;
4438 4439          mutex_exit(&ipst->ips_igmp_slowtimeout_lock);
4439 4440  
4440 4441          mutex_enter(&ipst->ips_mld_slowtimeout_lock);
4441 4442          ipst->ips_mld_slowtimeout_quiesce = B_TRUE;
4442 4443          mutex_exit(&ipst->ips_mld_slowtimeout_lock);
4443 4444  
4444 4445          ret = untimeout(ipst->ips_igmp_timeout_id);
4445 4446          if (ret == -1) {
4446 4447                  ASSERT(ipst->ips_igmp_timeout_id == 0);
4447 4448          } else {
4448 4449                  ASSERT(ipst->ips_igmp_timeout_id != 0);
4449 4450                  ipst->ips_igmp_timeout_id = 0;
4450 4451          }
4451 4452          ret = untimeout(ipst->ips_igmp_slowtimeout_id);
4452 4453          if (ret == -1) {
4453 4454                  ASSERT(ipst->ips_igmp_slowtimeout_id == 0);
4454 4455          } else {
4455 4456                  ASSERT(ipst->ips_igmp_slowtimeout_id != 0);
4456 4457                  ipst->ips_igmp_slowtimeout_id = 0;
4457 4458          }
4458 4459          ret = untimeout(ipst->ips_mld_timeout_id);
4459 4460          if (ret == -1) {
4460 4461                  ASSERT(ipst->ips_mld_timeout_id == 0);
4461 4462          } else {
4462 4463                  ASSERT(ipst->ips_mld_timeout_id != 0);
4463 4464                  ipst->ips_mld_timeout_id = 0;
4464 4465          }
4465 4466          ret = untimeout(ipst->ips_mld_slowtimeout_id);
4466 4467          if (ret == -1) {
4467 4468                  ASSERT(ipst->ips_mld_slowtimeout_id == 0);
4468 4469          } else {
4469 4470                  ASSERT(ipst->ips_mld_slowtimeout_id != 0);
4470 4471                  ipst->ips_mld_slowtimeout_id = 0;
4471 4472          }
4472 4473  
4473 4474          ip_ire_fini(ipst);
4474 4475          ip6_asp_free(ipst);
4475 4476          conn_drain_fini(ipst);
4476 4477          ipcl_destroy(ipst);
4477 4478  
4478 4479          mutex_destroy(&ipst->ips_ndp4->ndp_g_lock);
4479 4480          mutex_destroy(&ipst->ips_ndp6->ndp_g_lock);
4480 4481          kmem_free(ipst->ips_ndp4, sizeof (ndp_g_t));
4481 4482          ipst->ips_ndp4 = NULL;
4482 4483          kmem_free(ipst->ips_ndp6, sizeof (ndp_g_t));
4483 4484          ipst->ips_ndp6 = NULL;
4484 4485  
4485 4486          if (ipst->ips_loopback_ksp != NULL) {
4486 4487                  kstat_delete_netstack(ipst->ips_loopback_ksp, stackid);
4487 4488                  ipst->ips_loopback_ksp = NULL;
4488 4489          }
4489 4490  
4490 4491          mutex_destroy(&ipst->ips_capab_taskq_lock);
4491 4492          cv_destroy(&ipst->ips_capab_taskq_cv);
4492 4493  
4493 4494          rw_destroy(&ipst->ips_srcid_lock);
4494 4495  
4495 4496          mutex_destroy(&ipst->ips_ip_mi_lock);
4496 4497          rw_destroy(&ipst->ips_ill_g_usesrc_lock);
4497 4498  
4498 4499          mutex_destroy(&ipst->ips_igmp_timer_lock);
4499 4500          mutex_destroy(&ipst->ips_mld_timer_lock);
4500 4501          mutex_destroy(&ipst->ips_igmp_slowtimeout_lock);
4501 4502          mutex_destroy(&ipst->ips_mld_slowtimeout_lock);
4502 4503          mutex_destroy(&ipst->ips_ip_addr_avail_lock);
4503 4504          rw_destroy(&ipst->ips_ill_g_lock);
4504 4505  
4505 4506          kmem_free(ipst->ips_phyint_g_list, sizeof (phyint_list_t));
4506 4507          ipst->ips_phyint_g_list = NULL;
4507 4508          kmem_free(ipst->ips_ill_g_heads, sizeof (ill_g_head_t) * MAX_G_HEADS);
4508 4509          ipst->ips_ill_g_heads = NULL;
4509 4510  
4510 4511          ldi_ident_release(ipst->ips_ldi_ident);
4511 4512          kmem_free(ipst, sizeof (*ipst));
4512 4513  }
4513 4514  
4514 4515  /*
4515 4516   * This function is called from the TSD destructor, and is used to debug
4516 4517   * reference count issues in IP. See block comment in <inet/ip_if.h> for
4517 4518   * details.
4518 4519   */
4519 4520  static void
4520 4521  ip_thread_exit(void *phash)
4521 4522  {
4522 4523          th_hash_t *thh = phash;
4523 4524  
4524 4525          rw_enter(&ip_thread_rwlock, RW_WRITER);
4525 4526          list_remove(&ip_thread_list, thh);
4526 4527          rw_exit(&ip_thread_rwlock);
4527 4528          mod_hash_destroy_hash(thh->thh_hash);
4528 4529          kmem_free(thh, sizeof (*thh));
4529 4530  }
4530 4531  
4531 4532  /*
4532 4533   * Called when the IP kernel module is loaded into the kernel
4533 4534   */
4534 4535  void
4535 4536  ip_ddi_init(void)
4536 4537  {
4537 4538          ip_squeue_flag = ip_squeue_switch(ip_squeue_enter);
4538 4539  
4539 4540          /*
4540 4541           * For IP and TCP the minor numbers should start from 2 since we have 4
4541 4542           * initial devices: ip, ip6, tcp, tcp6.
4542 4543           */
4543 4544          /*
4544 4545           * If this is a 64-bit kernel, then create two separate arenas -
4545 4546           * one for TLIs in the range of INET_MIN_DEV+2 through 2^^18-1, and the
4546 4547           * other for socket apps in the range 2^^18 through 2^^32-1.
4547 4548           */
4548 4549          ip_minor_arena_la = NULL;
4549 4550          ip_minor_arena_sa = NULL;
4550 4551  #if defined(_LP64)
4551 4552          if ((ip_minor_arena_sa = inet_minor_create("ip_minor_arena_sa",
4552 4553              INET_MIN_DEV + 2, MAXMIN32, KM_SLEEP)) == NULL) {
4553 4554                  cmn_err(CE_PANIC,
4554 4555                      "ip_ddi_init: ip_minor_arena_sa creation failed\n");
4555 4556          }
4556 4557          if ((ip_minor_arena_la = inet_minor_create("ip_minor_arena_la",
4557 4558              MAXMIN32 + 1, MAXMIN64, KM_SLEEP)) == NULL) {
4558 4559                  cmn_err(CE_PANIC,
4559 4560                      "ip_ddi_init: ip_minor_arena_la creation failed\n");

↓ open down ↓

4452 lines elided

↑ open up ↑

4560 4561          }
4561 4562  #else
4562 4563          if ((ip_minor_arena_sa = inet_minor_create("ip_minor_arena_sa",
4563 4564              INET_MIN_DEV + 2, MAXMIN, KM_SLEEP)) == NULL) {
4564 4565                  cmn_err(CE_PANIC,
4565 4566                      "ip_ddi_init: ip_minor_arena_sa creation failed\n");
4566 4567          }
4567 4568  #endif
4568 4569          ip_poll_normal_ticks = MSEC_TO_TICK_ROUNDUP(ip_poll_normal_ms);
4569 4570  
     4571 +        cc_init();
     4572 +
4570 4573          ipcl_g_init();
4571 4574          ip_ire_g_init();
4572 4575          ip_net_g_init();
4573 4576  
4574 4577  #ifdef DEBUG
4575 4578          tsd_create(&ip_thread_data, ip_thread_exit);
4576 4579          rw_init(&ip_thread_rwlock, NULL, RW_DEFAULT, NULL);
4577 4580          list_create(&ip_thread_list, sizeof (th_hash_t),
4578 4581              offsetof(th_hash_t, thh_link));
4579 4582  #endif

4580 4583          ipsec_policy_g_init();
4581 4584          tcp_ddi_g_init();
4582 4585          sctp_ddi_g_init();
4583 4586          dce_g_init();
4584 4587  
4585 4588          /*
4586 4589           * We want to be informed each time a stack is created or
4587 4590           * destroyed in the kernel, so we can maintain the
4588 4591           * set of udp_stack_t's.
4589 4592           */
4590 4593          netstack_register(NS_IP, ip_stack_init, ip_stack_shutdown,
4591 4594              ip_stack_fini);
4592 4595  
4593 4596          tnet_init();
4594 4597  
4595 4598          udp_ddi_g_init();
4596 4599          rts_ddi_g_init();
4597 4600          icmp_ddi_g_init();
4598 4601          ilb_ddi_g_init();
4599 4602  
4600 4603          /* This needs to be called after all transports are initialized. */
4601 4604          mutex_enter(&cpu_lock);
4602 4605          register_cpu_setup_func(ip_tp_cpu_update, NULL);
4603 4606          mutex_exit(&cpu_lock);
4604 4607  }
4605 4608  
4606 4609  /*
4607 4610   * Initialize the IP stack instance.
4608 4611   */
4609 4612  static void *
4610 4613  ip_stack_init(netstackid_t stackid, netstack_t *ns)
4611 4614  {
4612 4615          ip_stack_t      *ipst;
4613 4616          size_t          arrsz;
4614 4617          major_t         major;
4615 4618  
4616 4619  #ifdef NS_DEBUG
4617 4620          printf("ip_stack_init(stack %d)\n", stackid);
4618 4621  #endif
4619 4622  
4620 4623          ipst = (ip_stack_t *)kmem_zalloc(sizeof (*ipst), KM_SLEEP);
4621 4624          ipst->ips_netstack = ns;
4622 4625  
4623 4626          ipst->ips_ill_g_heads = kmem_zalloc(sizeof (ill_g_head_t) * MAX_G_HEADS,
4624 4627              KM_SLEEP);
4625 4628          ipst->ips_phyint_g_list = kmem_zalloc(sizeof (phyint_list_t),
4626 4629              KM_SLEEP);
4627 4630          ipst->ips_ndp4 = kmem_zalloc(sizeof (ndp_g_t), KM_SLEEP);
4628 4631          ipst->ips_ndp6 = kmem_zalloc(sizeof (ndp_g_t), KM_SLEEP);
4629 4632          mutex_init(&ipst->ips_ndp4->ndp_g_lock, NULL, MUTEX_DEFAULT, NULL);
4630 4633          mutex_init(&ipst->ips_ndp6->ndp_g_lock, NULL, MUTEX_DEFAULT, NULL);
4631 4634  
4632 4635          mutex_init(&ipst->ips_igmp_timer_lock, NULL, MUTEX_DEFAULT, NULL);
4633 4636          ipst->ips_igmp_deferred_next = INFINITY;
4634 4637          mutex_init(&ipst->ips_mld_timer_lock, NULL, MUTEX_DEFAULT, NULL);
4635 4638          ipst->ips_mld_deferred_next = INFINITY;
4636 4639          mutex_init(&ipst->ips_igmp_slowtimeout_lock, NULL, MUTEX_DEFAULT, NULL);
4637 4640          mutex_init(&ipst->ips_mld_slowtimeout_lock, NULL, MUTEX_DEFAULT, NULL);
4638 4641          mutex_init(&ipst->ips_ip_mi_lock, NULL, MUTEX_DEFAULT, NULL);
4639 4642          mutex_init(&ipst->ips_ip_addr_avail_lock, NULL, MUTEX_DEFAULT, NULL);
4640 4643          rw_init(&ipst->ips_ill_g_lock, NULL, RW_DEFAULT, NULL);
4641 4644          rw_init(&ipst->ips_ill_g_usesrc_lock, NULL, RW_DEFAULT, NULL);
4642 4645  
4643 4646          ipcl_init(ipst);
4644 4647          ip_ire_init(ipst);
4645 4648          ip6_asp_init(ipst);
4646 4649          ipif_init(ipst);
4647 4650          conn_drain_init(ipst);
4648 4651          ip_mrouter_stack_init(ipst);
4649 4652          dce_stack_init(ipst);
4650 4653  
4651 4654          ipst->ips_ip_multirt_log_interval = 1000;
4652 4655  
4653 4656          ipst->ips_ill_index = 1;
4654 4657  
4655 4658          ipst->ips_saved_ip_forwarding = -1;
4656 4659          ipst->ips_reg_vif_num = ALL_VIFS;       /* Index to Register vif */
4657 4660  
4658 4661          arrsz = ip_propinfo_count * sizeof (mod_prop_info_t);
4659 4662          ipst->ips_propinfo_tbl = (mod_prop_info_t *)kmem_alloc(arrsz, KM_SLEEP);
4660 4663          bcopy(ip_propinfo_tbl, ipst->ips_propinfo_tbl, arrsz);
4661 4664  
4662 4665          ipst->ips_ip_mibkp = ip_kstat_init(stackid, ipst);
4663 4666          ipst->ips_icmp_mibkp = icmp_kstat_init(stackid);
4664 4667          ipst->ips_ip_kstat = ip_kstat2_init(stackid, &ipst->ips_ip_statistics);
4665 4668          ipst->ips_ip6_kstat =
4666 4669              ip6_kstat_init(stackid, &ipst->ips_ip6_statistics);
4667 4670  
4668 4671          ipst->ips_ip_src_id = 1;
4669 4672          rw_init(&ipst->ips_srcid_lock, NULL, RW_DEFAULT, NULL);
4670 4673  
4671 4674          ipst->ips_src_generation = SRC_GENERATION_INITIAL;
4672 4675  
4673 4676          ip_net_init(ipst, ns);
4674 4677          ipv4_hook_init(ipst);
4675 4678          ipv6_hook_init(ipst);
4676 4679          arp_hook_init(ipst);
4677 4680          ipmp_init(ipst);
4678 4681          ipobs_init(ipst);
4679 4682  
4680 4683          /*
4681 4684           * Create the taskq dispatcher thread and initialize related stuff.
4682 4685           */
4683 4686          mutex_init(&ipst->ips_capab_taskq_lock, NULL, MUTEX_DEFAULT, NULL);
4684 4687          cv_init(&ipst->ips_capab_taskq_cv, NULL, CV_DEFAULT, NULL);
4685 4688          ipst->ips_capab_taskq_thread = thread_create(NULL, 0,
4686 4689              ill_taskq_dispatch, ipst, 0, &p0, TS_RUN, minclsyspri);
4687 4690  
4688 4691          major = mod_name_to_major(INET_NAME);
4689 4692          (void) ldi_ident_from_major(major, &ipst->ips_ldi_ident);
4690 4693          return (ipst);
4691 4694  }
4692 4695  
4693 4696  /*
4694 4697   * Allocate and initialize a DLPI template of the specified length.  (May be
4695 4698   * called as writer.)
4696 4699   */
4697 4700  mblk_t *
4698 4701  ip_dlpi_alloc(size_t len, t_uscalar_t prim)
4699 4702  {
4700 4703          mblk_t  *mp;
4701 4704  
4702 4705          mp = allocb(len, BPRI_MED);
4703 4706          if (!mp)
4704 4707                  return (NULL);
4705 4708  
4706 4709          /*
4707 4710           * DLPIv2 says that DL_INFO_REQ and DL_TOKEN_REQ (the latter
4708 4711           * of which we don't seem to use) are sent with M_PCPROTO, and
4709 4712           * that other DLPI are M_PROTO.
4710 4713           */
4711 4714          if (prim == DL_INFO_REQ) {
4712 4715                  mp->b_datap->db_type = M_PCPROTO;
4713 4716          } else {
4714 4717                  mp->b_datap->db_type = M_PROTO;
4715 4718          }
4716 4719  
4717 4720          mp->b_wptr = mp->b_rptr + len;
4718 4721          bzero(mp->b_rptr, len);
4719 4722          ((dl_unitdata_req_t *)mp->b_rptr)->dl_primitive = prim;
4720 4723          return (mp);
4721 4724  }
4722 4725  
4723 4726  /*
4724 4727   * Allocate and initialize a DLPI notification.  (May be called as writer.)
4725 4728   */
4726 4729  mblk_t *
4727 4730  ip_dlnotify_alloc(uint_t notification, uint_t data)
4728 4731  {
4729 4732          dl_notify_ind_t *notifyp;
4730 4733          mblk_t          *mp;
4731 4734  
4732 4735          if ((mp = ip_dlpi_alloc(DL_NOTIFY_IND_SIZE, DL_NOTIFY_IND)) == NULL)
4733 4736                  return (NULL);
4734 4737  
4735 4738          notifyp = (dl_notify_ind_t *)mp->b_rptr;
4736 4739          notifyp->dl_notification = notification;
4737 4740          notifyp->dl_data = data;
4738 4741          return (mp);
4739 4742  }
4740 4743  
4741 4744  mblk_t *
4742 4745  ip_dlnotify_alloc2(uint_t notification, uint_t data1, uint_t data2)
4743 4746  {
4744 4747          dl_notify_ind_t *notifyp;
4745 4748          mblk_t          *mp;
4746 4749  
4747 4750          if ((mp = ip_dlpi_alloc(DL_NOTIFY_IND_SIZE, DL_NOTIFY_IND)) == NULL)
4748 4751                  return (NULL);
4749 4752  
4750 4753          notifyp = (dl_notify_ind_t *)mp->b_rptr;
4751 4754          notifyp->dl_notification = notification;
4752 4755          notifyp->dl_data1 = data1;
4753 4756          notifyp->dl_data2 = data2;
4754 4757          return (mp);
4755 4758  }
4756 4759  
4757 4760  /*
4758 4761   * Debug formatting routine.  Returns a character string representation of the
4759 4762   * addr in buf, of the form xxx.xxx.xxx.xxx.  This routine takes the address
4760 4763   * in the form of a ipaddr_t and calls ip_dot_saddr with a pointer.
4761 4764   *
4762 4765   * Once the ndd table-printing interfaces are removed, this can be changed to
4763 4766   * standard dotted-decimal form.
4764 4767   */
4765 4768  char *
4766 4769  ip_dot_addr(ipaddr_t addr, char *buf)
4767 4770  {
4768 4771          uint8_t *ap = (uint8_t *)&addr;
4769 4772  
4770 4773          (void) mi_sprintf(buf, "%03d.%03d.%03d.%03d",
4771 4774              ap[0] & 0xFF, ap[1] & 0xFF, ap[2] & 0xFF, ap[3] & 0xFF);
4772 4775          return (buf);
4773 4776  }
4774 4777  
/*
 * Write the given MAC address as a printable string in the usual colon-
 * separated format.
 *
 * 'addr' points to 'alen' address bytes; 'buf' has room for 'buflen' bytes.
 * Returns 'buf', or the constant string "?" when there are no address bytes
 * or the buffer is smaller than four bytes.  When the whole address does
 * not fit, the output is cut short and ends in "...".
 */
const char *
mac_colon_addr(const uint8_t *addr, size_t alen, char *buf, size_t buflen)
{
	char	*out = buf;

	if (alen == 0 || buflen < 4)
		return ("?");

	/*
	 * Keep emitting bytes for as long as there is room.  With exactly
	 * two bytes left we need "xx:xx" plus NUL (6 bytes); with more than
	 * two left we need "xx:" plus at least "..." and NUL (7 bytes).  A
	 * single remaining byte always fits: either the initial check
	 * guaranteed 4 bytes, or the previous pass left at least 3.
	 */
	while (!((alen == 2 && buflen < 6) || (alen > 2 && buflen < 7))) {
		(void) sprintf(out, "%02x", *addr++);
		out += 2;
		alen--;
		if (alen == 0)
			return (buf);
		*out++ = ':';
		buflen -= 3;
	}

	/* More bytes remain than we can print; mark the truncation. */
	(void) strcpy(out, "...");
	return (buf);
}
4814 4817  
4815 4818  /*
4816 4819   * Called when it is conceptually a ULP that would sent the packet
4817 4820   * e.g., port unreachable and protocol unreachable. Check that the packet
4818 4821   * would have passed the IPsec global policy before sending the error.
4819 4822   *
4820 4823   * Send an ICMP error after patching up the packet appropriately.
4821 4824   * Uses ip_drop_input and bumps the appropriate MIB.
4822 4825   */
4823 4826  void
4824 4827  ip_fanout_send_icmp_v4(mblk_t *mp, uint_t icmp_type, uint_t icmp_code,
4825 4828      ip_recv_attr_t *ira)
4826 4829  {
4827 4830          ipha_t          *ipha;
4828 4831          boolean_t       secure;
4829 4832          ill_t           *ill = ira->ira_ill;
4830 4833          ip_stack_t      *ipst = ill->ill_ipst;
4831 4834          netstack_t      *ns = ipst->ips_netstack;
4832 4835          ipsec_stack_t   *ipss = ns->netstack_ipsec;
4833 4836  
4834 4837          secure = ira->ira_flags & IRAF_IPSEC_SECURE;
4835 4838  
4836 4839          /*
4837 4840           * We are generating an icmp error for some inbound packet.
4838 4841           * Called from all ip_fanout_(udp, tcp, proto) functions.
4839 4842           * Before we generate an error, check with global policy
4840 4843           * to see whether this is allowed to enter the system. As
4841 4844           * there is no "conn", we are checking with global policy.
4842 4845           */
4843 4846          ipha = (ipha_t *)mp->b_rptr;
4844 4847          if (secure || ipss->ipsec_inbound_v4_policy_present) {
4845 4848                  mp = ipsec_check_global_policy(mp, NULL, ipha, NULL, ira, ns);
4846 4849                  if (mp == NULL)
4847 4850                          return;
4848 4851          }
4849 4852  
4850 4853          /* We never send errors for protocols that we do implement */
4851 4854          if (ira->ira_protocol == IPPROTO_ICMP ||
4852 4855              ira->ira_protocol == IPPROTO_IGMP) {
4853 4856                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
4854 4857                  ip_drop_input("ip_fanout_send_icmp_v4", mp, ill);
4855 4858                  freemsg(mp);
4856 4859                  return;
4857 4860          }
4858 4861          /*
4859 4862           * Have to correct checksum since
4860 4863           * the packet might have been
4861 4864           * fragmented and the reassembly code in ip_rput
4862 4865           * does not restore the IP checksum.
4863 4866           */
4864 4867          ipha->ipha_hdr_checksum = 0;
4865 4868          ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
4866 4869  
4867 4870          switch (icmp_type) {
4868 4871          case ICMP_DEST_UNREACHABLE:
4869 4872                  switch (icmp_code) {
4870 4873                  case ICMP_PROTOCOL_UNREACHABLE:
4871 4874                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInUnknownProtos);
4872 4875                          ip_drop_input("ipIfStatsInUnknownProtos", mp, ill);
4873 4876                          break;
4874 4877                  case ICMP_PORT_UNREACHABLE:
4875 4878                          BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts);
4876 4879                          ip_drop_input("ipIfStatsNoPorts", mp, ill);
4877 4880                          break;
4878 4881                  }
4879 4882  
4880 4883                  icmp_unreachable(mp, icmp_code, ira);
4881 4884                  break;
4882 4885          default:
4883 4886  #ifdef DEBUG
4884 4887                  panic("ip_fanout_send_icmp_v4: wrong type");
4885 4888                  /*NOTREACHED*/
4886 4889  #else
4887 4890                  freemsg(mp);
4888 4891                  break;
4889 4892  #endif
4890 4893          }
4891 4894  }
4892 4895  
4893 4896  /*
4894 4897   * Used to send an ICMP error message when a packet is received for
4895 4898   * a protocol that is not supported. The mblk passed as argument
4896 4899   * is consumed by this function.
4897 4900   */
4898 4901  void
4899 4902  ip_proto_not_sup(mblk_t *mp, ip_recv_attr_t *ira)
4900 4903  {
4901 4904          ipha_t          *ipha;
4902 4905  
4903 4906          ipha = (ipha_t *)mp->b_rptr;
4904 4907          if (ira->ira_flags & IRAF_IS_IPV4) {
4905 4908                  ASSERT(IPH_HDR_VERSION(ipha) == IP_VERSION);
4906 4909                  ip_fanout_send_icmp_v4(mp, ICMP_DEST_UNREACHABLE,
4907 4910                      ICMP_PROTOCOL_UNREACHABLE, ira);
4908 4911          } else {
4909 4912                  ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
4910 4913                  ip_fanout_send_icmp_v6(mp, ICMP6_PARAM_PROB,
4911 4914                      ICMP6_PARAMPROB_NEXTHEADER, ira);
4912 4915          }
4913 4916  }
4914 4917  
4915 4918  /*
4916 4919   * Deliver a rawip packet to the given conn, possibly applying ipsec policy.
4917 4920   * Handles IPv4 and IPv6.
4918 4921   * We are responsible for disposing of mp, such as by freemsg() or putnext()
4919 4922   * Caller is responsible for dropping references to the conn.
4920 4923   */
4921 4924  void
4922 4925  ip_fanout_proto_conn(conn_t *connp, mblk_t *mp, ipha_t *ipha, ip6_t *ip6h,
4923 4926      ip_recv_attr_t *ira)
4924 4927  {
4925 4928          ill_t           *ill = ira->ira_ill;
4926 4929          ip_stack_t      *ipst = ill->ill_ipst;
4927 4930          ipsec_stack_t   *ipss = ipst->ips_netstack->netstack_ipsec;
4928 4931          boolean_t       secure;
4929 4932          uint_t          protocol = ira->ira_protocol;
4930 4933          iaflags_t       iraflags = ira->ira_flags;
4931 4934          queue_t         *rq;
4932 4935  
4933 4936          secure = iraflags & IRAF_IPSEC_SECURE;
4934 4937  
4935 4938          rq = connp->conn_rq;
4936 4939          if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld : !canputnext(rq)) {
4937 4940                  switch (protocol) {
4938 4941                  case IPPROTO_ICMPV6:
4939 4942                          BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInOverflows);
4940 4943                          break;
4941 4944                  case IPPROTO_ICMP:
4942 4945                          BUMP_MIB(&ipst->ips_icmp_mib, icmpInOverflows);
4943 4946                          break;
4944 4947                  default:
4945 4948                          BUMP_MIB(ill->ill_ip_mib, rawipIfStatsInOverflows);
4946 4949                          break;
4947 4950                  }
4948 4951                  freemsg(mp);
4949 4952                  return;
4950 4953          }
4951 4954  
4952 4955          ASSERT(!(IPCL_IS_IPTUN(connp)));
4953 4956  
4954 4957          if (((iraflags & IRAF_IS_IPV4) ?
4955 4958              CONN_INBOUND_POLICY_PRESENT(connp, ipss) :
4956 4959              CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) ||
4957 4960              secure) {
4958 4961                  mp = ipsec_check_inbound_policy(mp, connp, ipha,
4959 4962                      ip6h, ira);
4960 4963                  if (mp == NULL) {
4961 4964                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
4962 4965                          /* Note that mp is NULL */
4963 4966                          ip_drop_input("ipIfStatsInDiscards", mp, ill);
4964 4967                          return;
4965 4968                  }
4966 4969          }
4967 4970  
4968 4971          if (iraflags & IRAF_ICMP_ERROR) {
4969 4972                  (connp->conn_recvicmp)(connp, mp, NULL, ira);
4970 4973          } else {
4971 4974                  ill_t *rill = ira->ira_rill;
4972 4975  
4973 4976                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
4974 4977                  ira->ira_ill = ira->ira_rill = NULL;
4975 4978                  /* Send it upstream */
4976 4979                  (connp->conn_recv)(connp, mp, NULL, ira);
4977 4980                  ira->ira_ill = ill;
4978 4981                  ira->ira_rill = rill;
4979 4982          }
4980 4983  }
4981 4984  
4982 4985  /*
4983 4986   * Handle protocols with which IP is less intimate.  There
4984 4987   * can be more than one stream bound to a particular
4985 4988   * protocol.  When this is the case, normally each one gets a copy
4986 4989   * of any incoming packets.
4987 4990   *
4988 4991   * IPsec NOTE :
4989 4992   *
4990 4993   * Don't allow a secure packet going up a non-secure connection.
4991 4994   * We don't allow this because
4992 4995   *
4993 4996   * 1) Reply might go out in clear which will be dropped at
4994 4997   *    the sending side.
4995 4998   * 2) If the reply goes out in clear it will give the
4996 4999   *    adversary enough information for getting the key in
4997 5000   *    most of the cases.
4998 5001   *
4999 5002   * Moreover getting a secure packet when we expect clear
5000 5003   * implies that SA's were added without checking for
5001 5004   * policy on both ends. This should not happen once ISAKMP
5002 5005   * is used to negotiate SAs as SAs will be added only after
5003 5006   * verifying the policy.
5004 5007   *
5005 5008   * Zones notes:
5006 5009   * Earlier in ip_input on a system with multiple shared-IP zones we
5007 5010   * duplicate the multicast and broadcast packets and send them up
5008 5011   * with each explicit zoneid that exists on that ill.
5009 5012   * This means that here we can match the zoneid with SO_ALLZONES being special.
5010 5013   */
5011 5014  void
5012 5015  ip_fanout_proto_v4(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
5013 5016  {
5014 5017          mblk_t          *mp1;
5015 5018          ipaddr_t        laddr;
5016 5019          conn_t          *connp, *first_connp, *next_connp;
5017 5020          connf_t         *connfp;
5018 5021          ill_t           *ill = ira->ira_ill;
5019 5022          ip_stack_t      *ipst = ill->ill_ipst;
5020 5023  
5021 5024          laddr = ipha->ipha_dst;
5022 5025  
5023 5026          connfp = &ipst->ips_ipcl_proto_fanout_v4[ira->ira_protocol];
5024 5027          mutex_enter(&connfp->connf_lock);
5025 5028          connp = connfp->connf_head;
5026 5029          for (connp = connfp->connf_head; connp != NULL;
5027 5030              connp = connp->conn_next) {
5028 5031                  /* Note: IPCL_PROTO_MATCH includes conn_wantpacket */
5029 5032                  if (IPCL_PROTO_MATCH(connp, ira, ipha) &&
5030 5033                      (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
5031 5034                      tsol_receive_local(mp, &laddr, IPV4_VERSION, ira, connp))) {
5032 5035                          break;
5033 5036                  }
5034 5037          }
5035 5038  
5036 5039          if (connp == NULL) {
5037 5040                  /*
5038 5041                   * No one bound to these addresses.  Is
5039 5042                   * there a client that wants all
5040 5043                   * unclaimed datagrams?
5041 5044                   */
5042 5045                  mutex_exit(&connfp->connf_lock);
5043 5046                  ip_fanout_send_icmp_v4(mp, ICMP_DEST_UNREACHABLE,
5044 5047                      ICMP_PROTOCOL_UNREACHABLE, ira);
5045 5048                  return;
5046 5049          }
5047 5050  
5048 5051          ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL);
5049 5052  
5050 5053          CONN_INC_REF(connp);
5051 5054          first_connp = connp;
5052 5055          connp = connp->conn_next;
5053 5056  
5054 5057          for (;;) {
5055 5058                  while (connp != NULL) {
5056 5059                          /* Note: IPCL_PROTO_MATCH includes conn_wantpacket */
5057 5060                          if (IPCL_PROTO_MATCH(connp, ira, ipha) &&
5058 5061                              (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
5059 5062                              tsol_receive_local(mp, &laddr, IPV4_VERSION,
5060 5063                              ira, connp)))
5061 5064                                  break;
5062 5065                          connp = connp->conn_next;
5063 5066                  }
5064 5067  
5065 5068                  if (connp == NULL) {
5066 5069                          /* No more interested clients */
5067 5070                          connp = first_connp;
5068 5071                          break;
5069 5072                  }
5070 5073                  if (((mp1 = dupmsg(mp)) == NULL) &&
5071 5074                      ((mp1 = copymsg(mp)) == NULL)) {
5072 5075                          /* Memory allocation failed */
5073 5076                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
5074 5077                          ip_drop_input("ipIfStatsInDiscards", mp, ill);
5075 5078                          connp = first_connp;
5076 5079                          break;
5077 5080                  }
5078 5081  
5079 5082                  CONN_INC_REF(connp);
5080 5083                  mutex_exit(&connfp->connf_lock);
5081 5084  
5082 5085                  ip_fanout_proto_conn(connp, mp1, (ipha_t *)mp1->b_rptr, NULL,
5083 5086                      ira);
5084 5087  
5085 5088                  mutex_enter(&connfp->connf_lock);
5086 5089                  /* Follow the next pointer before releasing the conn. */
5087 5090                  next_connp = connp->conn_next;
5088 5091                  CONN_DEC_REF(connp);
5089 5092                  connp = next_connp;
5090 5093          }
5091 5094  
5092 5095          /* Last one.  Send it upstream. */
5093 5096          mutex_exit(&connfp->connf_lock);
5094 5097  
5095 5098          ip_fanout_proto_conn(connp, mp, ipha, NULL, ira);
5096 5099  
5097 5100          CONN_DEC_REF(connp);
5098 5101  }
5099 5102  
5100 5103  /*
5101 5104   * If we have an IPsec NAT-Traversal packet, strip the zero-SPI or
5102 5105   * pass it along to ESP if the SPI is non-zero.  Returns the mblk if the mblk
5103 5106   * is not consumed.
5104 5107   *
5105 5108   * One of three things can happen, all of which affect the passed-in mblk:
5106 5109   *
5107 5110   * 1.) The packet is stock UDP and gets its zero-SPI stripped.  Return mblk.
5108 5111   *
5109 5112   * 2.) The packet is ESP-in-UDP, gets transformed into an equivalent
5110 5113   *     ESP packet, and is passed along to ESP for consumption.  Return NULL.
5111 5114   *
5112 5115   * 3.) The packet is an ESP-in-UDP Keepalive.  Drop it and return NULL.
            *
            * In addition, if the headers are not contiguous and the pullup fails,
            * the packet is dropped and NULL is returned.
            *
            * In cases 1 and 2 ira->ira_pktlen and ipha_length are reduced by the
            * number of bytes removed and ipha_hdr_checksum is set to zero; the
            * checksum is not recomputed in this function.
5113 5116   */
5114 5117  mblk_t *
5115 5118  zero_spi_check(mblk_t *mp, ip_recv_attr_t *ira)
5116 5119  {
5117 5120          int shift, plen, iph_len;
5118 5121          ipha_t *ipha;
5119 5122          udpha_t *udpha;
5120 5123          uint32_t *spi;
5121 5124          uint32_t esp_ports;
5122 5125          uint8_t *orptr;
5123 5126          ip_stack_t      *ipst = ira->ira_ill->ill_ipst;
5124 5127          ipsec_stack_t   *ipss = ipst->ips_netstack->netstack_ipsec;
5125 5128  
5126 5129          ipha = (ipha_t *)mp->b_rptr;
5127 5130          iph_len = ira->ira_ip_hdr_length;
5128 5131          plen = ira->ira_pktlen;
5129 5132  
                   /*
                    * Is the UDP payload too short to hold a 32-bit SPI?
                    *
                    * NOTE(review): sizeof makes the left-hand side unsigned, so
                    * this test relies on plen >= iph_len + sizeof (udpha_t) already
                    * holding - presumably guaranteed by the caller; confirm.
                    */
5130 5133          if (plen - iph_len - sizeof (udpha_t) < sizeof (uint32_t)) {
5131 5134                  /*
5132 5135                   * Most likely a keepalive for the benefit of an intervening
5133 5136                   * NAT.  These aren't for us, per se, so drop it.
5134 5137                   *
5135 5138                   * RFC 3947/8 doesn't say for sure what to do for 2-3
5136 5139                   * byte packets (keepalives are 1-byte), but we'll drop them
5137 5140                   * also.
5138 5141                   */
5139 5142                  ip_drop_packet(mp, B_TRUE, ira->ira_ill,
5140 5143                      DROPPER(ipss, ipds_esp_nat_t_ka), &ipss->ipsec_dropper);
5141 5144                  return (NULL);
5142 5145          }
5143 5146  
                   /*
                    * The IP header, UDP header and SPI word must all be in the
                    * first mblk before they are read and shifted below.
                    */
5144 5147          if (MBLKL(mp) < iph_len + sizeof (udpha_t) + sizeof (*spi)) {
5145 5148                  /* might as well pull it all up - it might be ESP. */
5146 5149                  if (!pullupmsg(mp, -1)) {
5147 5150                          ip_drop_packet(mp, B_TRUE, ira->ira_ill,
5148 5151                              DROPPER(ipss, ipds_esp_nomem),
5149 5152                              &ipss->ipsec_dropper);
5150 5153                          return (NULL);
5151 5154                  }
5152 5155  
                           /* Re-read the header pointer after the pullup. */
5153 5156                  ipha = (ipha_t *)mp->b_rptr;
5154 5157          }
                   /* The SPI is the first 32-bit word following the UDP header. */
5155 5158          spi = (uint32_t *)(mp->b_rptr + iph_len + sizeof (udpha_t));
5156 5159          if (*spi == 0) {
5157 5160                  /* UDP packet - remove 0-spi. */
5158 5161                  shift = sizeof (uint32_t);
5159 5162          } else {
5160 5163                  /* ESP-in-UDP packet - reduce to ESP. */
5161 5164                  ipha->ipha_protocol = IPPROTO_ESP;
5162 5165                  shift = sizeof (udpha_t);
5163 5166          }
5164 5167  
5165 5168          /* Fix IP header */
5166 5169          ira->ira_pktlen = (plen - shift);
5167 5170          ipha->ipha_length = htons(ira->ira_pktlen);
5168 5171          ipha->ipha_hdr_checksum = 0;
5169 5172  
                   /*
                    * Remember where the headers currently start, then advance the
                    * start of data by the number of bytes being removed.  The
                    * headers are slid forward into place by ovbcopy() below.
                    */
5170 5173          orptr = mp->b_rptr;
5171 5174          mp->b_rptr += shift;
5172 5175  
5173 5176          udpha = (udpha_t *)(orptr + iph_len);
5174 5177          if (*spi == 0) {
5175 5178                  ASSERT((uint8_t *)ipha == orptr);
5176 5179                  udpha->uha_length = htons(plen - shift - iph_len);
5177 5180                  iph_len += sizeof (udpha_t);    /* For the call to ovbcopy(). */
5178 5181                  esp_ports = 0;
5179 5182          } else {
                           /*
                            * Save the first 32 bits of the UDP header (presumably
                            * the source/destination port pair) before the header
                            * is overwritten by the shifted IP header.
                            */
5180 5183                  esp_ports = *((uint32_t *)udpha);
5181 5184                  ASSERT(esp_ports != 0);
5182 5185          }
                   /*
                    * Slide the headers forward by 'shift' bytes.  For a zero-SPI
                    * packet this moves IP + UDP headers over the SPI word; for
                    * ESP-in-UDP it moves only the IP header over the UDP header.
                    */
5183 5186          ovbcopy(orptr, orptr + shift, iph_len);
5184 5187          if (esp_ports != 0) /* Punt up for ESP processing. */ {
5185 5188                  ipha = (ipha_t *)(orptr + shift);
5186 5189  
5187 5190                  ira->ira_flags |= IRAF_ESP_UDP_PORTS;
5188 5191                  ira->ira_esp_udp_ports = esp_ports;
5189 5192                  ip_fanout_v4(mp, ipha, ira);
5190 5193                  return (NULL);
5191 5194          }
5192 5195          return (mp);
5193 5196  }
5194 5197  
5195 5198  /*
5196 5199   * Deliver a udp packet to the given conn, possibly applying ipsec policy.
5197 5200   * Handles IPv4 and IPv6.
5198 5201   * We are responsible for disposing of mp, such as by freemsg() or putnext()
5199 5202   * Caller is responsible for dropping references to the conn.
            *
            * ipha and ip6h are only passed through to ipsec_check_inbound_policy();
            * IRAF_IS_IPV4 in ira->ira_flags selects which policy-present check is
            * used.  Nothing is returned; a packet that cannot be delivered is
            * counted in the ill's MIB and discarded.
5200 5203   */
5201 5204  void
5202 5205  ip_fanout_udp_conn(conn_t *connp, mblk_t *mp, ipha_t *ipha, ip6_t *ip6h,
5203 5206      ip_recv_attr_t *ira)
5204 5207  {
5205 5208          ill_t           *ill = ira->ira_ill;
5206 5209          ip_stack_t      *ipst = ill->ill_ipst;
5207 5210          ipsec_stack_t   *ipss = ipst->ips_netstack->netstack_ipsec;
5208 5211          boolean_t       secure;
5209 5212          iaflags_t       iraflags = ira->ira_flags;
5210 5213  
                   /* Non-zero when the packet carries IRAF_IPSEC_SECURE. */
5211 5214          secure = iraflags & IRAF_IPSEC_SECURE;
5212 5215  
                   /*
                    * Receiver cannot take more data: non-STREAMS conns report this
                    * through conn_flow_cntrld, STREAMS conns through canputnext().
                    * Count the overflow and discard.
                    */
5213 5216          if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld :
5214 5217              !canputnext(connp->conn_rq)) {
5215 5218                  BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows);
5216 5219                  freemsg(mp);
5217 5220                  return;
5218 5221          }
5219 5222  
                   /*
                    * Run the inbound policy check when a policy is present for
                    * this conn (v4 or v6 flavor of the test) or the packet is
                    * marked secure.
                    */
5220 5223          if (((iraflags & IRAF_IS_IPV4) ?
5221 5224              CONN_INBOUND_POLICY_PRESENT(connp, ipss) :
5222 5225              CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) ||
5223 5226              secure) {
5224 5227                  mp = ipsec_check_inbound_policy(mp, connp, ipha,
5225 5228                      ip6h, ira);
5226 5229                  if (mp == NULL) {
5227 5230                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
5228 5231                          /* Note that mp is NULL */
5229 5232                          ip_drop_input("ipIfStatsInDiscards", mp, ill);
5230 5233                          return;
5231 5234                  }
5232 5235          }
5233 5236  
5234 5237          /*
5235 5238           * Since this code is not used for UDP unicast we don't need a NAT_T
5236 5239           * check. Only ip_fanout_v4 has that check.
5237 5240           */
5238 5241          if (ira->ira_flags & IRAF_ICMP_ERROR) {
5239 5242                  (connp->conn_recvicmp)(connp, mp, NULL, ira);
5240 5243          } else {
5241 5244                  ill_t *rill = ira->ira_rill;
5242 5245  
5243 5246                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
                           /*
                            * ira_ill and ira_rill are cleared for the duration of
                            * the upcall and restored afterwards.
                            */
5244 5247                  ira->ira_ill = ira->ira_rill = NULL;
5245 5248                  /* Send it upstream */
5246 5249                  (connp->conn_recv)(connp, mp, NULL, ira);
5247 5250                  ira->ira_ill = ill;
5248 5251                  ira->ira_rill = rill;
5249 5252          }
5250 5253  }
5251 5254  
5252 5255  /*
5253 5256   * Fanout for UDP packets that are multicast or broadcast, and ICMP errors.
5254 5257   * (Unicast fanout is handled in ip_input_v4.)
5255 5258   *
5256 5259   * If SO_REUSEADDR is set all multicast and broadcast packets
5257 5260   * will be delivered to all conns bound to the same port.
5258 5261   *
5259 5262   * If there is at least one matching AF_INET receiver, then we will
5260 5263   * ignore any AF_INET6 receivers.
5261 5264   * In the special case where an AF_INET socket binds to 0.0.0.0/<port> and an
5262 5265   * AF_INET6 socket binds to ::/<port>, only the AF_INET socket receives the IPv4
5263 5266   * packets.
5264 5267   *
5265 5268   * Zones notes:
5266 5269   * Earlier in ip_input on a system with multiple shared-IP zones we
5267 5270   * duplicate the multicast and broadcast packets and send them up
5268 5271   * with each explicit zoneid that exists on that ill.
5269 5272   * This means that here we can match the zoneid with SO_ALLZONES being special.
            *
            * Implementation outline: the UDP fanout bucket for lport is walked
            * twice at most - first with IPCL_UDP_MATCH (IPv4 match), and only if
            * that finds nothing, again with IPCL_UDP_MATCH_V6 against
            * ipv6_all_zeros.  Each additional receiver gets a dupmsg()/copymsg()
            * copy; the first matching conn receives the original mp last.
            * mp is always consumed (delivered, handed to ip_fanout_proto_v4, or
            * freed).
5270 5273   */
5271 5274  void
5272 5275  ip_fanout_udp_multi_v4(mblk_t *mp, ipha_t *ipha, uint16_t lport, uint16_t fport,
5273 5276      ip_recv_attr_t *ira)
5274 5277  {
5275 5278          ipaddr_t        laddr;
5276 5279          in6_addr_t      v6faddr;
5277 5280          conn_t          *connp;
5278 5281          connf_t         *connfp;
5279 5282          ipaddr_t        faddr;
5280 5283          ill_t           *ill = ira->ira_ill;
5281 5284          ip_stack_t      *ipst = ill->ill_ipst;
5282 5285  
5283 5286          ASSERT(ira->ira_flags & (IRAF_MULTIBROADCAST|IRAF_ICMP_ERROR));
5284 5287  
                   /* Local/foreign are from the receiver's point of view. */
5285 5288          laddr = ipha->ipha_dst;
5286 5289          faddr = ipha->ipha_src;
5287 5290  
                   /* The bucket is selected by local port only. */
5288 5291          connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
5289 5292          mutex_enter(&connfp->connf_lock);
5290 5293          connp = connfp->connf_head;
5291 5294  
5292 5295          /*
5293 5296           * If SO_REUSEADDR has been set on the first we send the
5294 5297           * packet to all clients that have joined the group and
5295 5298           * match the port.
5296 5299           */
5297 5300          while (connp != NULL) {
5298 5301                  if ((IPCL_UDP_MATCH(connp, lport, laddr, fport, faddr)) &&
5299 5302                      conn_wantpacket(connp, ira, ipha) &&
5300 5303                      (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
5301 5304                      tsol_receive_local(mp, &laddr, IPV4_VERSION, ira, connp)))
5302 5305                          break;
5303 5306                  connp = connp->conn_next;
5304 5307          }
5305 5308  
5306 5309          if (connp == NULL)
5307 5310                  goto notfound;
5308 5311  
                   /* Hold the first match; it is released after the final delivery. */
5309 5312          CONN_INC_REF(connp);
5310 5313  
5311 5314          if (connp->conn_reuseaddr) {
5312 5315                  conn_t          *first_connp = connp;
5313 5316                  conn_t          *next_connp;
5314 5317                  mblk_t          *mp1;
5315 5318  
5316 5319                  connp = connp->conn_next;
5317 5320                  for (;;) {
                                   /* Find the next matching conn in the bucket. */
5318 5321                          while (connp != NULL) {
5319 5322                                  if (IPCL_UDP_MATCH(connp, lport, laddr,
5320 5323                                      fport, faddr) &&
5321 5324                                      conn_wantpacket(connp, ira, ipha) &&
5322 5325                                      (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
5323 5326                                      tsol_receive_local(mp, &laddr, IPV4_VERSION,
5324 5327                                      ira, connp)))
5325 5328                                          break;
5326 5329                                  connp = connp->conn_next;
5327 5330                          }
5328 5331                          if (connp == NULL) {
5329 5332                                  /* No more interested clients */
5330 5333                                  connp = first_connp;
5331 5334                                  break;
5332 5335                          }
                                   /*
                                    * Try a cheap reference-sharing dup first and
                                    * fall back to a full copy.  If both fail, the
                                    * remaining receivers are skipped and the
                                    * original mp still goes to first_connp below.
                                    */
5333 5336                          if (((mp1 = dupmsg(mp)) == NULL) &&
5334 5337                              ((mp1 = copymsg(mp)) == NULL)) {
5335 5338                                  /* Memory allocation failed */
5336 5339                                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
5337 5340                                  ip_drop_input("ipIfStatsInDiscards", mp, ill);
5338 5341                                  connp = first_connp;
5339 5342                                  break;
5340 5343                          }
                                   /*
                                    * Hold this conn so it stays valid while the
                                    * bucket lock is dropped for the upcall.
                                    */
5341 5344                          CONN_INC_REF(connp);
5342 5345                          mutex_exit(&connfp->connf_lock);
5343 5346  
5344 5347                          IP_STAT(ipst, ip_udp_fanmb);
5345 5348                          ip_fanout_udp_conn(connp, mp1, (ipha_t *)mp1->b_rptr,
5346 5349                              NULL, ira);
5347 5350                          mutex_enter(&connfp->connf_lock);
5348 5351                          /* Follow the next pointer before releasing the conn */
5349 5352                          next_connp = connp->conn_next;
5350 5353                          CONN_DEC_REF(connp);
5351 5354                          connp = next_connp;
5352 5355                  }
5353 5356          }
5354 5357  
5355 5358          /* Last one.  Send it upstream. */
5356 5359          mutex_exit(&connfp->connf_lock);
5357 5360          IP_STAT(ipst, ip_udp_fanmb);
5358 5361          ip_fanout_udp_conn(connp, mp, ipha, NULL, ira);
5359 5362          CONN_DEC_REF(connp);
5360 5363          return;
5361 5364  
5362 5365  notfound:
5363 5366          mutex_exit(&connfp->connf_lock);
5364 5367          /*
5365 5368           * IPv6 endpoints bound to multicast IPv4-mapped addresses
5366 5369           * have already been matched above, since they live in the IPv4
5367 5370           * fanout tables. This implies we only need to
5368 5371           * check for IPv6 in6addr_any endpoints here.
5369 5372           * Thus we compare using ipv6_all_zeros instead of the destination
5370 5373           * address, except for the multicast group membership lookup which
5371 5374           * uses the IPv4 destination.
5372 5375           */
5373 5376          IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6faddr);
                   /* Same bucket expression as the first pass; walk it again. */
5374 5377          connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)];
5375 5378          mutex_enter(&connfp->connf_lock);
5376 5379          connp = connfp->connf_head;
5377 5380          /*
5378 5381           * IPv4 multicast packet being delivered to an AF_INET6
5379 5382           * in6addr_any endpoint.
5380 5383           * Need to check conn_wantpacket(). Note that we use conn_wantpacket()
5381 5384           * and not conn_wantpacket_v6() since any multicast membership is
5382 5385           * for an IPv4-mapped multicast address.
5383 5386           */
5384 5387          while (connp != NULL) {
5385 5388                  if (IPCL_UDP_MATCH_V6(connp, lport, ipv6_all_zeros,
5386 5389                      fport, v6faddr) &&
5387 5390                      conn_wantpacket(connp, ira, ipha) &&
5388 5391                      (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
5389 5392                      tsol_receive_local(mp, &laddr, IPV4_VERSION, ira, connp)))
5390 5393                          break;
5391 5394                  connp = connp->conn_next;
5392 5395          }
5393 5396  
5394 5397          if (connp == NULL) {
5395 5398                  /*
5396 5399                   * No one bound to this port.  Is
5397 5400                   * there a client that wants all
5398 5401                   * unclaimed datagrams?
5399 5402                   */
5400 5403                  mutex_exit(&connfp->connf_lock);
5401 5404  
5402 5405                  if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_UDP].connf_head !=
5403 5406                      NULL) {
5404 5407                          ASSERT(ira->ira_protocol == IPPROTO_UDP);
5405 5408                          ip_fanout_proto_v4(mp, ipha, ira);
5406 5409                  } else {
5407 5410                          /*
5408 5411                           * We used to attempt to send an icmp error here, but
5409 5412                           * since this is known to be a multicast packet
5410 5413                           * and we don't send icmp errors in response to
5411 5414                           * multicast, just drop the packet and give up sooner.
5412 5415                           */
5413 5416                          BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts);
5414 5417                          freemsg(mp);
5415 5418                  }
5416 5419                  return;
5417 5420          }
5418 5421          CONN_INC_REF(connp);
5419 5422          ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL);
5420 5423  
5421 5424          /*
5422 5425           * If SO_REUSEADDR has been set on the first we send the
5423 5426           * packet to all clients that have joined the group and
5424 5427           * match the port.
5425 5428           */
5426 5429          if (connp->conn_reuseaddr) {
5427 5430                  conn_t          *first_connp = connp;
5428 5431                  conn_t          *next_connp;
5429 5432                  mblk_t          *mp1;
5430 5433  
                           /*
                            * Same fan-out loop as in the IPv4 pass above, using the
                            * IPCL_UDP_MATCH_V6 wildcard match instead.
                            */
5431 5434                  connp = connp->conn_next;
5432 5435                  for (;;) {
5433 5436                          while (connp != NULL) {
5434 5437                                  if (IPCL_UDP_MATCH_V6(connp, lport,
5435 5438                                      ipv6_all_zeros, fport, v6faddr) &&
5436 5439                                      conn_wantpacket(connp, ira, ipha) &&
5437 5440                                      (!(ira->ira_flags & IRAF_SYSTEM_LABELED) ||
5438 5441                                      tsol_receive_local(mp, &laddr, IPV4_VERSION,
5439 5442                                      ira, connp)))
5440 5443                                          break;
5441 5444                                  connp = connp->conn_next;
5442 5445                          }
5443 5446                          if (connp == NULL) {
5444 5447                                  /* No more interested clients */
5445 5448                                  connp = first_connp;
5446 5449                                  break;
5447 5450                          }
5448 5451                          if (((mp1 = dupmsg(mp)) == NULL) &&
5449 5452                              ((mp1 = copymsg(mp)) == NULL)) {
5450 5453                                  /* Memory allocation failed */
5451 5454                                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
5452 5455                                  ip_drop_input("ipIfStatsInDiscards", mp, ill);
5453 5456                                  connp = first_connp;
5454 5457                                  break;
5455 5458                          }
5456 5459                          CONN_INC_REF(connp);
5457 5460                          mutex_exit(&connfp->connf_lock);
5458 5461  
5459 5462                          IP_STAT(ipst, ip_udp_fanmb);
5460 5463                          ip_fanout_udp_conn(connp, mp1, (ipha_t *)mp1->b_rptr,
5461 5464                              NULL, ira);
5462 5465                          mutex_enter(&connfp->connf_lock);
5463 5466                          /* Follow the next pointer before releasing the conn */
5464 5467                          next_connp = connp->conn_next;
5465 5468                          CONN_DEC_REF(connp);
5466 5469                          connp = next_connp;
5467 5470                  }
5468 5471          }
5469 5472  
5470 5473          /* Last one.  Send it upstream. */
5471 5474          mutex_exit(&connfp->connf_lock);
5472 5475          IP_STAT(ipst, ip_udp_fanmb);
5473 5476          ip_fanout_udp_conn(connp, mp, ipha, NULL, ira);
5474 5477          CONN_DEC_REF(connp);
5475 5478  }
5476 5479  
5477 5480  /*
5478 5481   * Split an incoming packet's IPv4 options into the label and the other options.
5479 5482   * If 'allocate' is set it does memory allocation for the ip_pkt_t, including
5480 5483   * clearing out any leftover label or options.
5481 5484   * Otherwise it just makes ipp point into the packet.
5482 5485   *
5483 5486   * Returns zero if ok; ENOMEM if the buffer couldn't be allocated.
            * On a labeled system it can also return EINVAL when an option's length
            * byte is missing, is less than 2, or runs past the end of the options.
            *
            * ipp_hoplimit, ipp_type_of_service and ipp_addr (the V4-mapped
            * destination) are always filled in from the header, even on error.
            *
            * Note that on a labeled system an IPOPT_EOL seen before any
            * IPOPT_COMSEC option ends the scan with a return of zero without
            * recording any options in ipp.
5484 5487   */
5485 5488  int
5486 5489  ip_find_hdr_v4(ipha_t *ipha, ip_pkt_t *ipp, boolean_t allocate)
5487 5490  {
5488 5491          uchar_t         *opt;
5489 5492          uint32_t        totallen;
5490 5493          uint32_t        optval;
5491 5494          uint32_t        optlen;
5492 5495  
5493 5496          ipp->ipp_fields |= IPPF_HOPLIMIT | IPPF_TCLASS | IPPF_ADDR;
5494 5497          ipp->ipp_hoplimit = ipha->ipha_ttl;
5495 5498          ipp->ipp_type_of_service = ipha->ipha_type_of_service;
5496 5499          IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &ipp->ipp_addr);
5497 5500  
5498 5501          /*
5499 5502           * Get length (in 4-byte words) of IP header options.
                    * Subtracting the version/length byte of an option-less header
                    * leaves just the number of option words.
5500 5503           */
5501 5504          totallen = ipha->ipha_version_and_hdr_length -
5502 5505              (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS);
5503 5506  
5504 5507          if (totallen == 0) {
5505 5508                  if (!allocate)
5506 5509                          return (0);
5507 5510  
5508 5511                  /* Clear out anything from a previous packet */
5509 5512                  if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) {
5510 5513                          kmem_free(ipp->ipp_ipv4_options,
5511 5514                              ipp->ipp_ipv4_options_len);
5512 5515                          ipp->ipp_ipv4_options = NULL;
5513 5516                          ipp->ipp_ipv4_options_len = 0;
5514 5517                          ipp->ipp_fields &= ~IPPF_IPV4_OPTIONS;
5515 5518                  }
5516 5519                  if (ipp->ipp_fields & IPPF_LABEL_V4) {
5517 5520                          kmem_free(ipp->ipp_label_v4, ipp->ipp_label_len_v4);
5518 5521                          ipp->ipp_label_v4 = NULL;
5519 5522                          ipp->ipp_label_len_v4 = 0;
5520 5523                          ipp->ipp_fields &= ~IPPF_LABEL_V4;
5521 5524                  }
5522 5525                  return (0);
5523 5526          }
5524 5527  
                   /* Convert words to bytes; options start right after the header. */
5525 5528          totallen <<= 2;
5526 5529          opt = (uchar_t *)&ipha[1];
5527 5530          if (!is_system_labeled()) {
5528 5531  
                   /*
                    * Record the 'totallen' bytes at 'opt' as ipp_ipv4_options.
                    * Also the target of the gotos in the labeled path below, which
                    * arrive with opt/totallen describing what is left after the
                    * label (possibly nothing).
                    */
5529 5532          copyall:
5530 5533                  if (!allocate) {
5531 5534                          if (totallen != 0) {
5532 5535                                  ipp->ipp_ipv4_options = opt;
5533 5536                                  ipp->ipp_ipv4_options_len = totallen;
5534 5537                                  ipp->ipp_fields |= IPPF_IPV4_OPTIONS;
5535 5538                          }
5536 5539                          return (0);
5537 5540                  }
5538 5541                  /* Just copy all of options */
5539 5542                  if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) {
                                   /* Reuse the existing buffer when the size matches. */
5540 5543                          if (totallen == ipp->ipp_ipv4_options_len) {
5541 5544                                  bcopy(opt, ipp->ipp_ipv4_options, totallen);
5542 5545                                  return (0);
5543 5546                          }
5544 5547                          kmem_free(ipp->ipp_ipv4_options,
5545 5548                              ipp->ipp_ipv4_options_len);
5546 5549                          ipp->ipp_ipv4_options = NULL;
5547 5550                          ipp->ipp_ipv4_options_len = 0;
5548 5551                          ipp->ipp_fields &= ~IPPF_IPV4_OPTIONS;
5549 5552                  }
5550 5553                  if (totallen == 0)
5551 5554                          return (0);
5552 5555  
                           /*
                            * On allocation failure any previous options have
                            * already been freed and cleared above.
                            */
5553 5556                  ipp->ipp_ipv4_options = kmem_alloc(totallen, KM_NOSLEEP);
5554 5557                  if (ipp->ipp_ipv4_options == NULL)
5555 5558                          return (ENOMEM);
5556 5559                  ipp->ipp_ipv4_options_len = totallen;
5557 5560                  ipp->ipp_fields |= IPPF_IPV4_OPTIONS;
5558 5561                  bcopy(opt, ipp->ipp_ipv4_options, totallen);
5559 5562                  return (0);
5560 5563          }
5561 5564  
                   /* Labeled system: drop any label left over from a previous packet. */
5562 5565          if (allocate && (ipp->ipp_fields & IPPF_LABEL_V4)) {
5563 5566                  kmem_free(ipp->ipp_label_v4, ipp->ipp_label_len_v4);
5564 5567                  ipp->ipp_label_v4 = NULL;
5565 5568                  ipp->ipp_label_len_v4 = 0;
5566 5569                  ipp->ipp_fields &= ~IPPF_LABEL_V4;
5567 5570          }
5568 5571  
5569 5572          /*
5570 5573           * Search for CIPSO option.
5571 5574           * We assume CIPSO is first in options if it is present.
5572 5575           * If it isn't, then ipp_ipv4_options will not include the options
5573 5576           * prior to the CIPSO option.
5574 5577           */
5575 5578          while (totallen != 0) {
                           /* Determine this option's length and validate it. */
5576 5579                  switch (optval = opt[IPOPT_OPTVAL]) {
5577 5580                  case IPOPT_EOL:
5578 5581                          return (0);
5579 5582                  case IPOPT_NOP:
5580 5583                          optlen = 1;
5581 5584                          break;
5582 5585                  default:
5583 5586                          if (totallen <= IPOPT_OLEN)
5584 5587                                  return (EINVAL);
5585 5588                          optlen = opt[IPOPT_OLEN];
5586 5589                          if (optlen < 2)
5587 5590                                  return (EINVAL);
5588 5591                  }
5589 5592                  if (optlen > totallen)
5590 5593                          return (EINVAL);
5591 5594  
5592 5595                  switch (optval) {
5593 5596                  case IPOPT_COMSEC:
5594 5597                          if (!allocate) {
5595 5598                                  ipp->ipp_label_v4 = opt;
5596 5599                                  ipp->ipp_label_len_v4 = optlen;
5597 5600                                  ipp->ipp_fields |= IPPF_LABEL_V4;
5598 5601                          } else {
5599 5602                                  ipp->ipp_label_v4 = kmem_alloc(optlen,
5600 5603                                      KM_NOSLEEP);
5601 5604                                  if (ipp->ipp_label_v4 == NULL)
5602 5605                                          return (ENOMEM);
5603 5606                                  ipp->ipp_label_len_v4 = optlen;
5604 5607                                  ipp->ipp_fields |= IPPF_LABEL_V4;
5605 5608                                  bcopy(opt, ipp->ipp_label_v4, optlen);
5606 5609                          }
5607 5610                          totallen -= optlen;
5608 5611                          opt += optlen;
5609 5612  
5610 5613                          /* Skip padding bytes until we get to a multiple of 4 */
5611 5614                          while ((totallen & 3) != 0 && opt[0] == IPOPT_NOP) {
5612 5615                                  totallen--;
5613 5616                                  opt++;
5614 5617                          }
5615 5618                          /* Remaining as ipp_ipv4_options */
5616 5619                          goto copyall;
5617 5620                  }
                           /* Not the label; step over this option. */
5618 5621                  totallen -= optlen;
5619 5622                  opt += optlen;
5620 5623          }
5621 5624          /* No CIPSO found; return everything as ipp_ipv4_options */
                   /* The scan consumed opt/totallen, so recompute them from the header. */
5622 5625          totallen = ipha->ipha_version_and_hdr_length -
5623 5626              (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS);
5624 5627          totallen <<= 2;
5625 5628          opt = (uchar_t *)&ipha[1];
5626 5629          goto copyall;
5627 5630  }
5628 5631  
5629 5632  /*
5630 5633   * Efficient versions of lookup for an IRE when we only
5631 5634   * match the address.
5632 5635   * For RTF_REJECT or BLACKHOLE we return IRE_NOROUTE.
5633 5636   * Does not handle multicast addresses.
            *
            * Returns the ire_type of the entry that ire_ftable_lookup_simple_v4()
            * finds for addr.  The reference on that entry is released before
            * returning, so only the type value is handed back to the caller.
5634 5637   */
5635 5638  uint_t
5636 5639  ip_type_v4(ipaddr_t addr, ip_stack_t *ipst)
5637 5640  {
5638 5641          ire_t *ire;
5639 5642          uint_t result;
5640 5643  
5641 5644          ire = ire_ftable_lookup_simple_v4(addr, 0, ipst, NULL);
                   /* The lookup is expected to always return an entry. */
5642 5645          ASSERT(ire != NULL);
5643 5646          if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
5644 5647                  result = IRE_NOROUTE;
5645 5648          else
5646 5649                  result = ire->ire_type;
5647 5650          ire_refrele(ire);
5648 5651          return (result);
5649 5652  }
5650 5653  
5651 5654  /*
5652 5655   * Efficient versions of lookup for an IRE when we only
5653 5656   * match the address.
5654 5657   * For RTF_REJECT or BLACKHOLE we return IRE_NOROUTE.
5655 5658   * Does not handle multicast addresses.
            *
            * IPv6 counterpart of ip_type_v4(): returns the ire_type of the entry
            * that ire_ftable_lookup_simple_v6() finds for *addr.  The reference
            * on that entry is released before returning.
5656 5659   */
5657 5660  uint_t
5658 5661  ip_type_v6(const in6_addr_t *addr, ip_stack_t *ipst)
5659 5662  {
5660 5663          ire_t *ire;
5661 5664          uint_t result;
5662 5665  
5663 5666          ire = ire_ftable_lookup_simple_v6(addr, 0, ipst, NULL);
                   /* The lookup is expected to always return an entry. */
5664 5667          ASSERT(ire != NULL);
5665 5668          if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
5666 5669                  result = IRE_NOROUTE;
5667 5670          else
5668 5671                  result = ire->ire_type;
5669 5672          ire_refrele(ire);
5670 5673          return (result);
5671 5674  }
5672 5675  
5673 5676  /*
5674 5677   * Nobody should be sending
5675 5678   * packets up this stream
            *
            * An M_FLUSH that has FLUSHW set gets FLUSHR cleared and is sent back
            * with qreply().  Every other message, including an M_FLUSH without
            * FLUSHW, is freed.
5676 5679   */
5677 5680  static void
5678 5681  ip_lrput(queue_t *q, mblk_t *mp)
5679 5682  {
5680 5683          switch (mp->b_datap->db_type) {
5681 5684          case M_FLUSH:
5682 5685                  /* Turn around */
5683 5686                  if (*mp->b_rptr & FLUSHW) {
5684 5687                          *mp->b_rptr &= ~FLUSHR;
5685 5688                          qreply(q, mp);
5686 5689                          return;
5687 5690                  }
5688 5691                  break;
5689 5692          }
                   /* Not turned around above: discard the message. */
5690 5693          freemsg(mp);
5691 5694  }
5692 5695  
5693 5696  /* Nobody should be sending packets down this stream */
           /* Any message that does arrive is freed; q is not used. */
5694 5697  /* ARGSUSED */
5695 5698  void
5696 5699  ip_lwput(queue_t *q, mblk_t *mp)
5697 5700  {
5698 5701          freemsg(mp);
5699 5702  }
5700 5703  
5701 5704  /*
5702 5705   * Move the first hop in any source route to ipha_dst and remove that part of
5703 5706   * the source route.  Called by other protocols.  Errors in option formatting
5704 5707   * are ignored - will be handled by ip_output_options. Return the final
5705 5708   * destination (either ipha_dst or the last entry in a source route.)
            *
            * ipha is modified in place (both ipha_dst and the option bytes).  ns is
            * used only to reach the ip_stack_t that is handed to ip_type_v4().
5706 5709   */
5707 5710  ipaddr_t
5708 5711  ip_massage_options(ipha_t *ipha, netstack_t *ns)
5709 5712  {
5710 5713          ipoptp_t        opts;
5711 5714          uchar_t         *opt;
5712 5715          uint8_t         optval;
5713 5716          uint8_t         optlen;
5714 5717          ipaddr_t        dst;
5715 5718          int             i;
5716 5719          ip_stack_t      *ipst = ns->netstack_ip;
5717 5720  
5718 5721          ip2dbg(("ip_massage_options\n"));
                   /* Default result when no usable source route option is found. */
5719 5722          dst = ipha->ipha_dst;
5720 5723          for (optval = ipoptp_first(&opts, ipha);
5721 5724              optval != IPOPT_EOL;
5722 5725              optval = ipoptp_next(&opts)) {
5723 5726                  opt = opts.ipoptp_cur;
5724 5727                  switch (optval) {
5725 5728                          uint8_t off;
5726 5729                  case IPOPT_SSRR:
5727 5730                  case IPOPT_LSRR:
5728 5731                          if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
5729 5732                                  ip1dbg(("ip_massage_options: bad src route\n"));
5730 5733                                  break;
5731 5734                          }
5732 5735                          optlen = opts.ipoptp_len;
                                   /*
                                    * Turn the offset value stored in the option into a
                                    * zero-based index into opt[].
                                    */
5733 5736                          off = opt[IPOPT_OFFSET];
5734 5737                          off--;
                           /*
                            * Re-entered via goto after skipping a hop that turned out
                            * to be one of our own addresses.
                            */
5735 5738                  redo_srr:
5736 5739                          if (optlen < IP_ADDR_LEN ||
5737 5740                              off > optlen - IP_ADDR_LEN) {
5738 5741                                  /* End of source route */
5739 5742                                  ip1dbg(("ip_massage_options: end of SR\n"));
5740 5743                                  break;
5741 5744                          }
5742 5745                          bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
5743 5746                          ip1dbg(("ip_massage_options: next hop 0x%x\n",
5744 5747                              ntohl(dst)));
5745 5748                          /*
5746 5749                           * Check if our address is present more than
5747 5750                           * once as consecutive hops in source route.
5748 5751                           * XXX verify per-interface ip_forwarding
5749 5752                           * for source route?
5750 5753                           */
5751 5754                          if (ip_type_v4(dst, ipst) == IRE_LOCAL) {
5752 5755                                  off += IP_ADDR_LEN;
5753 5756                                  goto redo_srr;
5754 5757                          }
                                   /*
                                    * NOTE(review): on this break dst still holds the
                                    * loopback hop copied above, so that value is what
                                    * the function returns unless a later option
                                    * overwrites it - confirm this is intended.
                                    */
5755 5758                          if (dst == htonl(INADDR_LOOPBACK)) {
5756 5759                                  ip1dbg(("ip_massage_options: loopback addr in "
5757 5760                                      "source route!\n"));
5758 5761                                  break;
5759 5762                          }
5760 5763                          /*
5761 5764                           * Update ipha_dst to be the first hop and remove the
5762 5765                           * first hop from the source route (by overwriting
5763 5766                           * part of the option with NOP options).
5764 5767                           */
5765 5768                          ipha->ipha_dst = dst;
                                   /*
                                    * Address slots start at opt[3].  The expression
                                    * below rounds the space after the first slot down
                                    * to a multiple of IP_ADDR_LEN (which relies on
                                    * IP_ADDR_LEN being a power of two) to find the
                                    * index of the last complete slot.
                                    */
5766 5769                          /* Put the last entry in dst */
5767 5770                          off = ((optlen - IP_ADDR_LEN - 3) & ~(IP_ADDR_LEN-1)) +
5768 5771                              3;
5769 5772                          bcopy(&opt[off], &dst, IP_ADDR_LEN);
5770 5773  
5771 5774                          ip1dbg(("ip_massage_options: last hop 0x%x\n",
5772 5775                              ntohl(dst)));
                                   /*
                                    * The option's first three bytes are rewritten
                                    * IP_ADDR_LEN bytes further in: opt[0] and the offset
                                    * byte are copied unchanged, the length byte is
                                    * reduced by IP_ADDR_LEN.  The vacated leading
                                    * IP_ADDR_LEN bytes are then filled with NOPs.
                                    */
5773 5776                          /* Move down and overwrite */
5774 5777                          opt[IP_ADDR_LEN] = opt[0];
5775 5778                          opt[IP_ADDR_LEN+1] = opt[IPOPT_OLEN] - IP_ADDR_LEN;
5776 5779                          opt[IP_ADDR_LEN+2] = opt[IPOPT_OFFSET];
5777 5780                          for (i = 0; i < IP_ADDR_LEN; i++)
5778 5781                                  opt[i] = IPOPT_NOP;
5779 5782                          break;
5780 5783                  }
5781 5784          }
5782 5785          return (dst);
5783 5786  }
5784 5787  
5785 5788  /*
5786 5789   * Return the network mask
5787 5790   * associated with the specified address.
5788 5791   */
5789 5792  ipaddr_t
5790 5793  ip_net_mask(ipaddr_t addr)
5791 5794  {
5792 5795          uchar_t *up = (uchar_t *)&addr;
5793 5796          ipaddr_t mask = 0;
5794 5797          uchar_t *maskp = (uchar_t *)&mask;
5795 5798  
5796 5799  #if defined(__i386) || defined(__amd64)
5797 5800  #define TOTALLY_BRAIN_DAMAGED_C_COMPILER
5798 5801  #endif
5799 5802  #ifdef  TOTALLY_BRAIN_DAMAGED_C_COMPILER
5800 5803          maskp[0] = maskp[1] = maskp[2] = maskp[3] = 0;
5801 5804  #endif
5802 5805          if (CLASSD(addr)) {
5803 5806                  maskp[0] = 0xF0;
5804 5807                  return (mask);
5805 5808          }
5806 5809  
5807 5810          /* We assume Class E default netmask to be 32 */
5808 5811          if (CLASSE(addr))
5809 5812                  return (0xffffffffU);
5810 5813  
5811 5814          if (addr == 0)
5812 5815                  return (0);
5813 5816          maskp[0] = 0xFF;
5814 5817          if ((up[0] & 0x80) == 0)
5815 5818                  return (mask);
5816 5819  
5817 5820          maskp[1] = 0xFF;
5818 5821          if ((up[0] & 0xC0) == 0x80)
5819 5822                  return (mask);
5820 5823  
5821 5824          maskp[2] = 0xFF;
5822 5825          if ((up[0] & 0xE0) == 0xC0)
5823 5826                  return (mask);
5824 5827  
5825 5828          /* Otherwise return no mask */
5826 5829          return ((ipaddr_t)0);
5827 5830  }
5828 5831  
5829 5832  /* Name/Value Table Lookup Routine */
5830 5833  char *
5831 5834  ip_nv_lookup(nv_t *nv, int value)
5832 5835  {
5833 5836          if (!nv)
5834 5837                  return (NULL);
5835 5838          for (; nv->nv_name; nv++) {
5836 5839                  if (nv->nv_value == value)
5837 5840                          return (nv->nv_name);
5838 5841          }
5839 5842          return ("unknown");
5840 5843  }
5841 5844  
5842 5845  static int
5843 5846  ip_wait_for_info_ack(ill_t *ill)
5844 5847  {
5845 5848          int err;
5846 5849  
5847 5850          mutex_enter(&ill->ill_lock);
5848 5851          while (ill->ill_state_flags & ILL_LL_SUBNET_PENDING) {
5849 5852                  /*
5850 5853                   * Return value of 0 indicates a pending signal.
5851 5854                   */
5852 5855                  err = cv_wait_sig(&ill->ill_cv, &ill->ill_lock);
5853 5856                  if (err == 0) {
5854 5857                          mutex_exit(&ill->ill_lock);
5855 5858                          return (EINTR);
5856 5859                  }
5857 5860          }
5858 5861          mutex_exit(&ill->ill_lock);
5859 5862          /*
5860 5863           * ip_rput_other could have set an error  in ill_error on
5861 5864           * receipt of M_ERROR.
5862 5865           */
5863 5866          return (ill->ill_error);
5864 5867  }
5865 5868  
5866 5869  /*
5867 5870   * This is a module open, i.e. this is a control stream for access
5868 5871   * to a DLPI device.  We allocate an ill_t as the instance data in
5869 5872   * this case.
5870 5873   */
5871 5874  static int
5872 5875  ip_modopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
5873 5876  {
5874 5877          ill_t   *ill;
5875 5878          int     err;
5876 5879          zoneid_t zoneid;
5877 5880          netstack_t *ns;
5878 5881          ip_stack_t *ipst;
5879 5882  
5880 5883          /*
5881 5884           * Prevent unprivileged processes from pushing IP so that
5882 5885           * they can't send raw IP.
5883 5886           */
5884 5887          if (secpolicy_net_rawaccess(credp) != 0)
5885 5888                  return (EPERM);
5886 5889  
5887 5890          ns = netstack_find_by_cred(credp);
5888 5891          ASSERT(ns != NULL);
5889 5892          ipst = ns->netstack_ip;
5890 5893          ASSERT(ipst != NULL);
5891 5894  
5892 5895          /*
5893 5896           * For exclusive stacks we set the zoneid to zero
5894 5897           * to make IP operate as if in the global zone.
5895 5898           */
5896 5899          if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
5897 5900                  zoneid = GLOBAL_ZONEID;
5898 5901          else
5899 5902                  zoneid = crgetzoneid(credp);
5900 5903  
5901 5904          ill = (ill_t *)mi_open_alloc_sleep(sizeof (ill_t));
5902 5905          q->q_ptr = WR(q)->q_ptr = ill;
5903 5906          ill->ill_ipst = ipst;
5904 5907          ill->ill_zoneid = zoneid;
5905 5908  
5906 5909          /*
5907 5910           * ill_init initializes the ill fields and then sends down
5908 5911           * down a DL_INFO_REQ after calling qprocson.
5909 5912           */
5910 5913          err = ill_init(q, ill);
5911 5914  
5912 5915          if (err != 0) {
5913 5916                  mi_free(ill);
5914 5917                  netstack_rele(ipst->ips_netstack);
5915 5918                  q->q_ptr = NULL;
5916 5919                  WR(q)->q_ptr = NULL;
5917 5920                  return (err);
5918 5921          }
5919 5922  
5920 5923          /*
5921 5924           * Wait for the DL_INFO_ACK if a DL_INFO_REQ was sent.
5922 5925           *
5923 5926           * ill_init initializes the ipsq marking this thread as
5924 5927           * writer
5925 5928           */
5926 5929          ipsq_exit(ill->ill_phyint->phyint_ipsq);
5927 5930          err = ip_wait_for_info_ack(ill);
5928 5931          if (err == 0)
5929 5932                  ill->ill_credp = credp;
5930 5933          else
5931 5934                  goto fail;
5932 5935  
5933 5936          crhold(credp);
5934 5937  
5935 5938          mutex_enter(&ipst->ips_ip_mi_lock);
5936 5939          err = mi_open_link(&ipst->ips_ip_g_head, (IDP)q->q_ptr, devp, flag,
5937 5940              sflag, credp);
5938 5941          mutex_exit(&ipst->ips_ip_mi_lock);
5939 5942  fail:
5940 5943          if (err) {
5941 5944                  (void) ip_close(q, 0);
5942 5945                  return (err);
5943 5946          }
5944 5947          return (0);
5945 5948  }
5946 5949  
5947 5950  /* For /dev/ip aka AF_INET open */
5948 5951  int
5949 5952  ip_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
5950 5953  {
5951 5954          return (ip_open(q, devp, flag, sflag, credp, B_FALSE));
5952 5955  }
5953 5956  
5954 5957  /* For /dev/ip6 aka AF_INET6 open */
5955 5958  int
5956 5959  ip_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
5957 5960  {
5958 5961          return (ip_open(q, devp, flag, sflag, credp, B_TRUE));
5959 5962  }
5960 5963  
5961 5964  /* IP open routine. */
           /*
            * Common open path behind ip_openv4() and ip_openv6(); isv6 selects the
            * address family recorded on the new conn_t.
            *
            * In order, the routine:
            *  - returns 0 at once if q->q_ptr is already set (reopen);
            *  - hands module opens (MODOPEN set in sflag) to ip_modopen();
            *  - hands opens whose flag, ignoring FKLYR, equals IP_HELPER_STR to
            *    ip_helper_stream_setup();
            *  - otherwise creates a conn_t, allocates a minor number, rewrites
            *    *devp with that minor, takes a hold on credp and calls qprocson().
            *
            * Returns 0 on success, EBUSY when no minor number can be allocated, or
            * whatever ip_modopen() / ip_helper_stream_setup() return.
            */
5962 5965  int
5963 5966  ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
5964 5967      boolean_t isv6)
5965 5968  {
5966 5969          conn_t          *connp;
5967 5970          major_t         maj;
5968 5971          zoneid_t        zoneid;
5969 5972          netstack_t      *ns;
5970 5973          ip_stack_t      *ipst;
5971 5974  
5972 5975          /* Allow reopen. */
5973 5976          if (q->q_ptr != NULL)
5974 5977                  return (0);
5975 5978  
5976 5979          if (sflag & MODOPEN) {
5977 5980                  /* This is a module open */
5978 5981                  return (ip_modopen(q, devp, flag, sflag, credp));
5979 5982          }
5980 5983  
5981 5984          if ((flag & ~(FKLYR)) == IP_HELPER_STR) {
5982 5985                  /*
5983 5986                   * Non streams based socket looking for a stream
5984 5987                   * to access IP
5985 5988                   */
5986 5989                  return (ip_helper_stream_setup(q, devp, flag, sflag,
5987 5990                      credp, isv6));
5988 5991          }
5989 5992  
5990 5993          ns = netstack_find_by_cred(credp);
5991 5994          ASSERT(ns != NULL);
5992 5995          ipst = ns->netstack_ip;
5993 5996          ASSERT(ipst != NULL);
5994 5997  
5995 5998          /*
5996 5999           * For exclusive stacks we set the zoneid to zero
5997 6000           * to make IP operate as if in the global zone.
5998 6001           */
5999 6002          if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
6000 6003                  zoneid = GLOBAL_ZONEID;
6001 6004          else
6002 6005                  zoneid = crgetzoneid(credp);
6003 6006  
6004 6007          /*
6005 6008           * We are opening as a device. This is an IP client stream, and we
6006 6009           * allocate an conn_t as the instance data.
6007 6010           */
6008 6011          connp = ipcl_conn_create(IPCL_IPCCONN, KM_SLEEP, ipst->ips_netstack);
6009 6012  
6010 6013          /*
6011 6014           * ipcl_conn_create did a netstack_hold. Undo the hold that was
6012 6015           * done by netstack_find_by_cred()
6013 6016           */
6014 6017          netstack_rele(ipst->ips_netstack);
6015 6018  
6016 6019          connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM;
6017 6020          /* conn_allzones can not be set this early, hence no IPCL_ZONEID */
6018 6021          connp->conn_ixa->ixa_zoneid = zoneid;
6019 6022          connp->conn_zoneid = zoneid;
6020 6023  
                   /*
                    * NOTE(review): conn_rq is assigned again further down, next to
                    * conn_wq; one of the two assignments looks redundant.
                    */
6021 6024          connp->conn_rq = q;
6022 6025          q->q_ptr = WR(q)->q_ptr = connp;
6023 6026  
6024 6027          /* Minor tells us which /dev entry was opened */
6025 6028          if (isv6) {
6026 6029                  connp->conn_family = AF_INET6;
6027 6030                  connp->conn_ipversion = IPV6_VERSION;
6028 6031                  connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4;
6029 6032                  connp->conn_ixa->ixa_src_preferences = IPV6_PREFER_SRC_DEFAULT;
6030 6033          } else {
6031 6034                  connp->conn_family = AF_INET;
6032 6035                  connp->conn_ipversion = IPV4_VERSION;
6033 6036                  connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4;
6034 6037          }
6035 6038  
                   /*
                    * The large arena is tried only when it exists and the opener
                    * passed SO_SOCKSTR.  A zero return from inet_minor_alloc() is
                    * treated as "no minor available".
                    */
6036 6039          if ((ip_minor_arena_la != NULL) && (flag & SO_SOCKSTR) &&
6037 6040              ((connp->conn_dev = inet_minor_alloc(ip_minor_arena_la)) != 0)) {
6038 6041                  connp->conn_minor_arena = ip_minor_arena_la;
6039 6042          } else {
6040 6043                  /*
6041 6044                   * Either minor numbers in the large arena were exhausted
6042 6045                   * or a non socket application is doing the open.
6043 6046                   * Try to allocate from the small arena.
6044 6047                   */
6045 6048                  if ((connp->conn_dev =
6046 6049                      inet_minor_alloc(ip_minor_arena_sa)) == 0) {
6047 6050                          /* CONN_DEC_REF takes care of netstack_rele() */
6048 6051                          q->q_ptr = WR(q)->q_ptr = NULL;
6049 6052                          CONN_DEC_REF(connp);
6050 6053                          return (EBUSY);
6051 6054                  }
6052 6055                  connp->conn_minor_arena = ip_minor_arena_sa;
6053 6056          }
6054 6057  
                   /* Hand the caller a dev_t that carries the newly allocated minor. */
6055 6058          maj = getemajor(*devp);
6056 6059          *devp = makedevice(maj, (minor_t)connp->conn_dev);
6057 6060  
6058 6061          /*
6059 6062           * connp->conn_cred is crfree()ed in ipcl_conn_destroy()
6060 6063           */
6061 6064          connp->conn_cred = credp;
6062 6065          connp->conn_cpid = curproc->p_pid;
6063 6066          /* Cache things in ixa without an extra refhold */
6064 6067          ASSERT(!(connp->conn_ixa->ixa_free_flags & IXA_FREE_CRED));
6065 6068          connp->conn_ixa->ixa_cred = connp->conn_cred;
6066 6069          connp->conn_ixa->ixa_cpid = connp->conn_cpid;
6067 6070          if (is_system_labeled())
6068 6071                  connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred);
6069 6072  
6070 6073          /*
6071 6074           * Handle IP_IOC_RTS_REQUEST and other ioctls which use conn_recv
6072 6075           */
6073 6076          connp->conn_recv = ip_conn_input;
6074 6077          connp->conn_recvicmp = ip_conn_input_icmp;
6075 6078  
                   /* Hold for the reference stored in conn_cred above. */
6076 6079          crhold(connp->conn_cred);
6077 6080  
6078 6081          /*
6079 6082           * If the caller has the process-wide flag set, then default to MAC
6080 6083           * exempt mode.  This allows read-down to unlabeled hosts.
6081 6084           */
6082 6085          if (getpflags(NET_MAC_AWARE, credp) != 0)
6083 6086                  connp->conn_mac_mode = CONN_MAC_AWARE;
6084 6087  
6085 6088          connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID);
6086 6089  
6087 6090          connp->conn_rq = q;
6088 6091          connp->conn_wq = WR(q);
6089 6092  
                   /*
                    * NOTE(review): IXAF_MULTICAST_LOOP was already set right after
                    * ipcl_conn_create() above, so this OR looks redundant.
                    */
6090 6093          /* Non-zero default values */
6091 6094          connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP;
6092 6095  
6093 6096          /*
6094 6097           * Make the conn globally visible to walkers
6095 6098           */
6096 6099          ASSERT(connp->conn_ref == 1);
6097 6100          mutex_enter(&connp->conn_lock);
6098 6101          connp->conn_state_flags &= ~CONN_INCIPIENT;
6099 6102          mutex_exit(&connp->conn_lock);
6100 6103  
6101 6104          qprocson(q);
6102 6105  
6103 6106          return (0);
6104 6107  }
6105 6108  
6106 6109  /*
6107 6110   * Set IPsec policy from an ipsec_req_t. If the req is not "zero" and valid,
6108 6111   * all of them are copied to the conn_t. If the req is "zero", the policy is
6109 6112   * zeroed out. A "zero" policy has zero ipsr_{ah,req,self_encap}_req
6110 6113   * fields.
6111 6114   * We keep only the latest setting of the policy and thus policy setting
6112 6115   * is not incremental/cumulative.
6113 6116   *
6114 6117   * Requests to set policies with multiple alternative actions will
6115 6118   * go through a different API.
            *
            * cr is consulted only for the privilege check on IPSEC_PREF_NEVER
            * requests.  connp->conn_lock is asserted to be held once the request
            * has passed validation.
            *
            * Returns 0 on success, EINVAL for a NULL or malformed request or when
            * the connection's policy is already cached, EPROTONOSUPPORT when IPsec
            * is not loaded, EPERM when the privilege check fails, and ENOMEM when
            * building the policy fails.
6116 6119   */
6117 6120  int
6118 6121  ipsec_set_req(cred_t *cr, conn_t *connp, ipsec_req_t *req)
6119 6122  {
6120 6123          uint_t ah_req = 0;
6121 6124          uint_t esp_req = 0;
6122 6125          uint_t se_req = 0;
6123 6126          ipsec_act_t *actp = NULL;
6124 6127          uint_t nact;
6125 6128          ipsec_policy_head_t *ph;
6126 6129          boolean_t is_pol_reset, is_pol_inserted = B_FALSE;
6127 6130          int error = 0;
6128 6131          netstack_t      *ns = connp->conn_netstack;
6129 6132          ip_stack_t      *ipst = ns->netstack_ip;
6130 6133          ipsec_stack_t   *ipss = ns->netstack_ipsec;
6131 6134  
6132 6135  #define REQ_MASK (IPSEC_PREF_REQUIRED|IPSEC_PREF_NEVER)
6133 6136  
6134 6137          /*
6135 6138           * The IP_SEC_OPT option does not allow variable length parameters,
6136 6139           * hence a request cannot be NULL.
6137 6140           */
6138 6141          if (req == NULL)
6139 6142                  return (EINVAL);
6140 6143  
6141 6144          ah_req = req->ipsr_ah_req;
6142 6145          esp_req = req->ipsr_esp_req;
6143 6146          se_req = req->ipsr_self_encap_req;
6144 6147  
6145 6148          /* Don't allow setting self-encap without one or more of AH/ESP. */
6146 6149          if (se_req != 0 && esp_req == 0 && ah_req == 0)
6147 6150                  return (EINVAL);
6148 6151  
6149 6152          /*
6150 6153           * Are we dealing with a request to reset the policy (i.e.
6151 6154           * zero requests).
6152 6155           */
6153 6156          is_pol_reset = ((ah_req & REQ_MASK) == 0 &&
6154 6157              (esp_req & REQ_MASK) == 0 &&
6155 6158              (se_req & REQ_MASK) == 0);
6156 6159  
6157 6160          if (!is_pol_reset) {
6158 6161                  /*
6159 6162                   * If we couldn't load IPsec, fail with "protocol
6160 6163                   * not supported".
6161 6164                   * IPsec may not have been loaded for a request with zero
6162 6165                   * policies, so we don't fail in this case.
6163 6166                   */
6164 6167                  mutex_enter(&ipss->ipsec_loader_lock);
6165 6168                  if (ipss->ipsec_loader_state != IPSEC_LOADER_SUCCEEDED) {
6166 6169                          mutex_exit(&ipss->ipsec_loader_lock);
6167 6170                          return (EPROTONOSUPPORT);
6168 6171                  }
6169 6172                  mutex_exit(&ipss->ipsec_loader_lock);
6170 6173  
6171 6174                  /*
6172 6175                   * Test for valid requests. Invalid algorithms
6173 6176                   * need to be tested by IPsec code because new
6174 6177                   * algorithms can be added dynamically.
6175 6178                   */
6176 6179                  if ((ah_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0 ||
6177 6180                      (esp_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0 ||
6178 6181                      (se_req & ~(REQ_MASK|IPSEC_PREF_UNIQUE)) != 0) {
6179 6182                          return (EINVAL);
6180 6183                  }
6181 6184  
6182 6185                  /*
6183 6186                   * Only privileged users can issue these
6184 6187                   * requests.
6185 6188                   */
6186 6189                  if (((ah_req & IPSEC_PREF_NEVER) ||
6187 6190                      (esp_req & IPSEC_PREF_NEVER) ||
6188 6191                      (se_req & IPSEC_PREF_NEVER)) &&
6189 6192                      secpolicy_ip_config(cr, B_FALSE) != 0) {
6190 6193                          return (EPERM);
6191 6194                  }
6192 6195  
6193 6196                  /*
6194 6197                   * The IPSEC_PREF_REQUIRED and IPSEC_PREF_NEVER
6195 6198                   * are mutually exclusive.
6196 6199                   */
6197 6200                  if (((ah_req & REQ_MASK) == REQ_MASK) ||
6198 6201                      ((esp_req & REQ_MASK) == REQ_MASK) ||
6199 6202                      ((se_req & REQ_MASK) == REQ_MASK)) {
6200 6203                          /* Both of them are set */
6201 6204                          return (EINVAL);
6202 6205                  }
6203 6206          }
6204 6207  
                   /*
                    * NOTE(review): this assertion is only reached once the validation
                    * above has passed, so the early error returns are not covered
                    * by it.
                    */
6205 6208          ASSERT(MUTEX_HELD(&connp->conn_lock));
6206 6209  
6207 6210          /*
6208 6211           * If we have already cached policies in conn_connect(), don't
6209 6212           * let them change now. We cache policies for connections
6210 6213           * whose src,dst [addr, port] is known.
6211 6214           */
6212 6215          if (connp->conn_policy_cached) {
6213 6216                  return (EINVAL);
6214 6217          }
6215 6218  
6216 6219          /*
6217 6220           * We have a zero policies, reset the connection policy if already
6218 6221           * set. This will cause the connection to inherit the
6219 6222           * global policy, if any.
6220 6223           */
6221 6224          if (is_pol_reset) {
6222 6225                  if (connp->conn_policy != NULL) {
6223 6226                          IPPH_REFRELE(connp->conn_policy, ipst->ips_netstack);
6224 6227                          connp->conn_policy = NULL;
6225 6228                  }
6226 6229                  connp->conn_in_enforce_policy = B_FALSE;
6227 6230                  connp->conn_out_enforce_policy = B_FALSE;
6228 6231                  return (0);
6229 6232          }
6230 6233  
                   /*
                    * NOTE(review): conn_policy is overwritten with the result before
                    * the result is checked, so a NULL return leaves conn_policy NULL.
                    * Whether the previous head is still referenced at that point (and
                    * is therefore leaked) depends on ipsec_polhead_split(), which is
                    * not visible here - confirm.
                    */
6231 6234          ph = connp->conn_policy = ipsec_polhead_split(connp->conn_policy,
6232 6235              ipst->ips_netstack);
6233 6236          if (ph == NULL)
6234 6237                  goto enomem;
6235 6238  
                   /* A NULL actp after this call is treated as an allocation failure. */
6236 6239          ipsec_actvec_from_req(req, &actp, &nact, ipst->ips_netstack);
6237 6240          if (actp == NULL)
6238 6241                  goto enomem;
6239 6242  
6240 6243          /*
6241 6244           * Always insert IPv4 policy entries, since they can also apply to
6242 6245           * ipv6 sockets being used in ipv4-compat mode.
6243 6246           */
6244 6247          if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V4,
6245 6248              IPSEC_TYPE_INBOUND, ns))
6246 6249                  goto enomem;
                   /* From here on a failure must flush what has been inserted. */
6247 6250          is_pol_inserted = B_TRUE;
6248 6251          if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V4,
6249 6252              IPSEC_TYPE_OUTBOUND, ns))
6250 6253                  goto enomem;
6251 6254  
6252 6255          /*
6253 6256           * We're looking at a v6 socket, also insert the v6-specific
6254 6257           * entries.
6255 6258           */
6256 6259          if (connp->conn_family == AF_INET6) {
6257 6260                  if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V6,
6258 6261                      IPSEC_TYPE_INBOUND, ns))
6259 6262                          goto enomem;
6260 6263                  if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V6,
6261 6264                      IPSEC_TYPE_OUTBOUND, ns))
6262 6265                          goto enomem;
6263 6266          }
6264 6267  
6265 6268          ipsec_actvec_free(actp, nact);
6266 6269  
6267 6270          /*
6268 6271           * If the requests need security, set enforce_policy.
6269 6272           * If the requests are IPSEC_PREF_NEVER, one should
6270 6273           * still set conn_out_enforce_policy so that ip_set_destination
6271 6274           * marks the ip_xmit_attr_t appropriately. This is needed so that
6272 6275           * for connections that we don't cache policy in at connect time,
6273 6276           * if global policy matches in ip_output_attach_policy, we
6274 6277           * don't wrongly inherit global policy. Similarly, we need
6275 6278           * to set conn_in_enforce_policy also so that we don't verify
6276 6279           * policy wrongly.
6277 6280           */
6278 6281          if ((ah_req & REQ_MASK) != 0 ||
6279 6282              (esp_req & REQ_MASK) != 0 ||
6280 6283              (se_req & REQ_MASK) != 0) {
6281 6284                  connp->conn_in_enforce_policy = B_TRUE;
6282 6285                  connp->conn_out_enforce_policy = B_TRUE;
6283 6286          }
6284 6287  
                   /* error is never modified after its initialisation, so this is 0. */
6285 6288          return (error);
           /*
            * Preprocessor directive: REQ_MASK is undefined from this line to the
            * end of the file, regardless of the return above.  The enomem path
            * below does not use it.
            */
6286 6289  #undef REQ_MASK
6287 6290  
6288 6291          /*
6289 6292           * Common memory-allocation-failure exit path.
6290 6293           */
6291 6294  enomem:
6292 6295          if (actp != NULL)
6293 6296                  ipsec_actvec_free(actp, nact);
                   /* The head is flushed only if at least the first insert succeeded. */
6294 6297          if (is_pol_inserted)
6295 6298                  ipsec_polhead_flush(ph, ns);
6296 6299          return (ENOMEM);
6297 6300  }
6298 6301  
6299 6302  /*
6300 6303   * Set socket options for joining and leaving multicast groups.
6301 6304   * Common to IPv4 and IPv6; inet6 indicates the type of socket.
6302 6305   * The caller has already check that the option name is consistent with
6303 6306   * the address family of the socket.
6304 6307   */
6305 6308  int
6306 6309  ip_opt_set_multicast_group(conn_t *connp, t_scalar_t name,
6307 6310      uchar_t *invalp, boolean_t inet6, boolean_t checkonly)
6308 6311  {
6309 6312          int             *i1 = (int *)invalp;
6310 6313          int             error = 0;
6311 6314          ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
6312 6315          struct ip_mreq  *v4_mreqp;
6313 6316          struct ipv6_mreq *v6_mreqp;
6314 6317          struct group_req *greqp;
6315 6318          ire_t *ire;
6316 6319          boolean_t done = B_FALSE;
6317 6320          ipaddr_t ifaddr;
6318 6321          in6_addr_t v6group;
6319 6322          uint_t ifindex;
6320 6323          boolean_t mcast_opt = B_TRUE;
6321 6324          mcast_record_t fmode;
6322 6325          int (*optfn)(conn_t *, boolean_t, const in6_addr_t *,
6323 6326              ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *);
6324 6327  
6325 6328          switch (name) {
6326 6329          case IP_ADD_MEMBERSHIP:
6327 6330          case IPV6_JOIN_GROUP:
6328 6331                  mcast_opt = B_FALSE;
6329 6332                  /* FALLTHRU */
6330 6333          case MCAST_JOIN_GROUP:
6331 6334                  fmode = MODE_IS_EXCLUDE;
6332 6335                  optfn = ip_opt_add_group;
6333 6336                  break;
6334 6337  
6335 6338          case IP_DROP_MEMBERSHIP:
6336 6339          case IPV6_LEAVE_GROUP:
6337 6340                  mcast_opt = B_FALSE;
6338 6341                  /* FALLTHRU */
6339 6342          case MCAST_LEAVE_GROUP:
6340 6343                  fmode = MODE_IS_INCLUDE;
6341 6344                  optfn = ip_opt_delete_group;
6342 6345                  break;
6343 6346          default:
6344 6347                  ASSERT(0);
6345 6348          }
6346 6349  
6347 6350          if (mcast_opt) {
6348 6351                  struct sockaddr_in *sin;
6349 6352                  struct sockaddr_in6 *sin6;
6350 6353  
6351 6354                  greqp = (struct group_req *)i1;
6352 6355                  if (greqp->gr_group.ss_family == AF_INET) {
6353 6356                          sin = (struct sockaddr_in *)&(greqp->gr_group);
6354 6357                          IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &v6group);
6355 6358                  } else {
6356 6359                          if (!inet6)
6357 6360                                  return (EINVAL);        /* Not on INET socket */
6358 6361  
6359 6362                          sin6 = (struct sockaddr_in6 *)&(greqp->gr_group);
6360 6363                          v6group = sin6->sin6_addr;
6361 6364                  }
6362 6365                  ifaddr = INADDR_ANY;
6363 6366                  ifindex = greqp->gr_interface;
6364 6367          } else if (inet6) {
6365 6368                  v6_mreqp = (struct ipv6_mreq *)i1;
6366 6369                  v6group = v6_mreqp->ipv6mr_multiaddr;
6367 6370                  ifaddr = INADDR_ANY;
6368 6371                  ifindex = v6_mreqp->ipv6mr_interface;
6369 6372          } else {
6370 6373                  v4_mreqp = (struct ip_mreq *)i1;
6371 6374                  IN6_INADDR_TO_V4MAPPED(&v4_mreqp->imr_multiaddr, &v6group);
6372 6375                  ifaddr = (ipaddr_t)v4_mreqp->imr_interface.s_addr;
6373 6376                  ifindex = 0;
6374 6377          }
6375 6378  
6376 6379          /*
6377 6380           * In the multirouting case, we need to replicate
6378 6381           * the request on all interfaces that will take part
6379 6382           * in replication.  We do so because multirouting is
6380 6383           * reflective, thus we will probably receive multi-
6381 6384           * casts on those interfaces.
6382 6385           * The ip_multirt_apply_membership() succeeds if
6383 6386           * the operation succeeds on at least one interface.
6384 6387           */
6385 6388          if (IN6_IS_ADDR_V4MAPPED(&v6group)) {
6386 6389                  ipaddr_t group;
6387 6390  
6388 6391                  IN6_V4MAPPED_TO_IPADDR(&v6group, group);
6389 6392  
6390 6393                  ire = ire_ftable_lookup_v4(group, IP_HOST_MASK, 0,
6391 6394                      IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL,
6392 6395                      MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL);
6393 6396          } else {
6394 6397                  ire = ire_ftable_lookup_v6(&v6group, &ipv6_all_ones, 0,
6395 6398                      IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL,
6396 6399                      MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL);
6397 6400          }
6398 6401          if (ire != NULL) {
6399 6402                  if (ire->ire_flags & RTF_MULTIRT) {
6400 6403                          error = ip_multirt_apply_membership(optfn, ire, connp,
6401 6404                              checkonly, &v6group, fmode, &ipv6_all_zeros);
6402 6405                          done = B_TRUE;
6403 6406                  }
6404 6407                  ire_refrele(ire);
6405 6408          }
6406 6409  
6407 6410          if (!done) {
6408 6411                  error = optfn(connp, checkonly, &v6group, ifaddr, ifindex,
6409 6412                      fmode, &ipv6_all_zeros);
6410 6413          }
6411 6414          return (error);
6412 6415  }
6413 6416  
6414 6417  /*
6415 6418   * Set socket options for joining and leaving multicast groups
6416 6419   * for specific sources.
6417 6420   * Common to IPv4 and IPv6; inet6 indicates the type of socket.
6418 6421   * The caller has already check that the option name is consistent with
6419 6422   * the address family of the socket.
6420 6423   */
6421 6424  int
6422 6425  ip_opt_set_multicast_sources(conn_t *connp, t_scalar_t name,
6423 6426      uchar_t *invalp, boolean_t inet6, boolean_t checkonly)
6424 6427  {
6425 6428          int             *i1 = (int *)invalp;
6426 6429          int             error = 0;
6427 6430          ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
6428 6431          struct ip_mreq_source *imreqp;
6429 6432          struct group_source_req *gsreqp;
6430 6433          in6_addr_t v6group, v6src;
6431 6434          uint32_t ifindex;
6432 6435          ipaddr_t ifaddr;
6433 6436          boolean_t mcast_opt = B_TRUE;
6434 6437          mcast_record_t fmode;
6435 6438          ire_t *ire;
6436 6439          boolean_t done = B_FALSE;
6437 6440          int (*optfn)(conn_t *, boolean_t, const in6_addr_t *,
6438 6441              ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *);
6439 6442  
6440 6443          switch (name) {
6441 6444          case IP_BLOCK_SOURCE:
6442 6445                  mcast_opt = B_FALSE;
6443 6446                  /* FALLTHRU */
6444 6447          case MCAST_BLOCK_SOURCE:
6445 6448                  fmode = MODE_IS_EXCLUDE;
6446 6449                  optfn = ip_opt_add_group;
6447 6450                  break;
6448 6451  
6449 6452          case IP_UNBLOCK_SOURCE:
6450 6453                  mcast_opt = B_FALSE;
6451 6454                  /* FALLTHRU */
6452 6455          case MCAST_UNBLOCK_SOURCE:
6453 6456                  fmode = MODE_IS_EXCLUDE;
6454 6457                  optfn = ip_opt_delete_group;
6455 6458                  break;
6456 6459  
6457 6460          case IP_ADD_SOURCE_MEMBERSHIP:
6458 6461                  mcast_opt = B_FALSE;
6459 6462                  /* FALLTHRU */
6460 6463          case MCAST_JOIN_SOURCE_GROUP:
6461 6464                  fmode = MODE_IS_INCLUDE;
6462 6465                  optfn = ip_opt_add_group;
6463 6466                  break;
6464 6467  
6465 6468          case IP_DROP_SOURCE_MEMBERSHIP:
6466 6469                  mcast_opt = B_FALSE;
6467 6470                  /* FALLTHRU */
6468 6471          case MCAST_LEAVE_SOURCE_GROUP:
6469 6472                  fmode = MODE_IS_INCLUDE;
6470 6473                  optfn = ip_opt_delete_group;
6471 6474                  break;
6472 6475          default:
6473 6476                  ASSERT(0);
6474 6477          }
6475 6478  
6476 6479          if (mcast_opt) {
6477 6480                  gsreqp = (struct group_source_req *)i1;
6478 6481                  ifindex = gsreqp->gsr_interface;
6479 6482                  if (gsreqp->gsr_group.ss_family == AF_INET) {
6480 6483                          struct sockaddr_in *s;
6481 6484                          s = (struct sockaddr_in *)&gsreqp->gsr_group;
6482 6485                          IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6group);
6483 6486                          s = (struct sockaddr_in *)&gsreqp->gsr_source;
6484 6487                          IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6src);
6485 6488                  } else {
6486 6489                          struct sockaddr_in6 *s6;
6487 6490  
6488 6491                          if (!inet6)
6489 6492                                  return (EINVAL);        /* Not on INET socket */
6490 6493  
6491 6494                          s6 = (struct sockaddr_in6 *)&gsreqp->gsr_group;
6492 6495                          v6group = s6->sin6_addr;
6493 6496                          s6 = (struct sockaddr_in6 *)&gsreqp->gsr_source;
6494 6497                          v6src = s6->sin6_addr;
6495 6498                  }
6496 6499                  ifaddr = INADDR_ANY;
6497 6500          } else {
6498 6501                  imreqp = (struct ip_mreq_source *)i1;
6499 6502                  IN6_INADDR_TO_V4MAPPED(&imreqp->imr_multiaddr, &v6group);
6500 6503                  IN6_INADDR_TO_V4MAPPED(&imreqp->imr_sourceaddr, &v6src);
6501 6504                  ifaddr = (ipaddr_t)imreqp->imr_interface.s_addr;
6502 6505                  ifindex = 0;
6503 6506          }
6504 6507  
6505 6508          /*
6506 6509           * Handle src being mapped INADDR_ANY by changing it to unspecified.
6507 6510           */
6508 6511          if (IN6_IS_ADDR_V4MAPPED_ANY(&v6src))
6509 6512                  v6src = ipv6_all_zeros;
6510 6513  
6511 6514          /*
6512 6515           * In the multirouting case, we need to replicate
6513 6516           * the request as noted in the mcast cases above.
6514 6517           */
6515 6518          if (IN6_IS_ADDR_V4MAPPED(&v6group)) {
6516 6519                  ipaddr_t group;
6517 6520  
6518 6521                  IN6_V4MAPPED_TO_IPADDR(&v6group, group);
6519 6522  
6520 6523                  ire = ire_ftable_lookup_v4(group, IP_HOST_MASK, 0,
6521 6524                      IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL,
6522 6525                      MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL);
6523 6526          } else {
6524 6527                  ire = ire_ftable_lookup_v6(&v6group, &ipv6_all_ones, 0,
6525 6528                      IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL,
6526 6529                      MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL);
6527 6530          }
6528 6531          if (ire != NULL) {
6529 6532                  if (ire->ire_flags & RTF_MULTIRT) {
6530 6533                          error = ip_multirt_apply_membership(optfn, ire, connp,
6531 6534                              checkonly, &v6group, fmode, &v6src);
6532 6535                          done = B_TRUE;
6533 6536                  }
6534 6537                  ire_refrele(ire);
6535 6538          }
6536 6539          if (!done) {
6537 6540                  error = optfn(connp, checkonly, &v6group, ifaddr, ifindex,
6538 6541                      fmode, &v6src);
6539 6542          }
6540 6543          return (error);
6541 6544  }
6542 6545  
6543 6546  /*
6544 6547   * Given a destination address and a pointer to where to put the information
6545 6548   * this routine fills in the mtuinfo.
6546 6549   * The socket must be connected.
6547 6550   * For sctp conn_faddr is the primary address.
6548 6551   */
6549 6552  int
6550 6553  ip_fill_mtuinfo(conn_t *connp, ip_xmit_attr_t *ixa, struct ip6_mtuinfo *mtuinfo)
6551 6554  {
6552 6555          uint32_t        pmtu = IP_MAXPACKET;
6553 6556          uint_t          scopeid;
6554 6557  
6555 6558          if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6))
6556 6559                  return (-1);
6557 6560  
6558 6561          /* In case we never sent or called ip_set_destination_v4/v6 */
6559 6562          if (ixa->ixa_ire != NULL)
6560 6563                  pmtu = ip_get_pmtu(ixa);
6561 6564  
6562 6565          if (ixa->ixa_flags & IXAF_SCOPEID_SET)
6563 6566                  scopeid = ixa->ixa_scopeid;
6564 6567          else
6565 6568                  scopeid = 0;
6566 6569  
6567 6570          bzero(mtuinfo, sizeof (*mtuinfo));
6568 6571          mtuinfo->ip6m_addr.sin6_family = AF_INET6;
6569 6572          mtuinfo->ip6m_addr.sin6_port = connp->conn_fport;
6570 6573          mtuinfo->ip6m_addr.sin6_addr = connp->conn_faddr_v6;
6571 6574          mtuinfo->ip6m_addr.sin6_scope_id = scopeid;
6572 6575          mtuinfo->ip6m_mtu = pmtu;
6573 6576  
6574 6577          return (sizeof (struct ip6_mtuinfo));
6575 6578  }
6576 6579  
6577 6580  /*
6578 6581   * When the src multihoming is changed from weak to [strong, preferred]
6579 6582   * ip_ire_rebind_walker is called to walk the list of all ire_t entries
6580 6583   * and identify routes that were created by user-applications in the
6581 6584   * unbound state (i.e., without RTA_IFP), and for which an ire_ill is not
6582 6585   * currently defined. These routes are then 'rebound', i.e., their ire_ill
6583 6586   * is selected by finding an interface route for the gateway.
6584 6587   */
6585 6588  /* ARGSUSED */
6586 6589  void
6587 6590  ip_ire_rebind_walker(ire_t *ire, void *notused)
6588 6591  {
6589 6592          if (!ire->ire_unbound || ire->ire_ill != NULL)
6590 6593                  return;
6591 6594          ire_rebind(ire);
6592 6595          ire_delete(ire);
6593 6596  }
6594 6597  
6595 6598  /*
6596 6599   * When the src multihoming is changed from  [strong, preferred] to weak,
6597 6600   * ip_ire_unbind_walker is called to walk the list of all ire_t entries, and
6598 6601   * set any entries that were created by user-applications in the unbound state
6599 6602   * (i.e., without RTA_IFP) back to having a NULL ire_ill.
6600 6603   */
6601 6604  /* ARGSUSED */
6602 6605  void
6603 6606  ip_ire_unbind_walker(ire_t *ire, void *notused)
6604 6607  {
6605 6608          ire_t *new_ire;
6606 6609  
6607 6610          if (!ire->ire_unbound || ire->ire_ill == NULL)
6608 6611                  return;
6609 6612          if (ire->ire_ipversion == IPV6_VERSION) {
6610 6613                  new_ire = ire_create_v6(&ire->ire_addr_v6, &ire->ire_mask_v6,
6611 6614                      &ire->ire_gateway_addr_v6, ire->ire_type, NULL,
6612 6615                      ire->ire_zoneid, ire->ire_flags, NULL, ire->ire_ipst);
6613 6616          } else {
6614 6617                  new_ire = ire_create((uchar_t *)&ire->ire_addr,
6615 6618                      (uchar_t *)&ire->ire_mask,
6616 6619                      (uchar_t *)&ire->ire_gateway_addr, ire->ire_type, NULL,
6617 6620                      ire->ire_zoneid, ire->ire_flags, NULL, ire->ire_ipst);
6618 6621          }
6619 6622          if (new_ire == NULL)
6620 6623                  return;
6621 6624          new_ire->ire_unbound = B_TRUE;
6622 6625          /*
6623 6626           * The bound ire must first be deleted so that we don't return
6624 6627           * the existing one on the attempt to add the unbound new_ire.
6625 6628           */
6626 6629          ire_delete(ire);
6627 6630          new_ire = ire_add(new_ire);
6628 6631          if (new_ire != NULL)
6629 6632                  ire_refrele(new_ire);
6630 6633  }
6631 6634  
6632 6635  /*
6633 6636   * When the settings of ip*_strict_src_multihoming tunables are changed,
6634 6637   * all cached routes need to be recomputed. This recomputation needs to be
6635 6638   * done when going from weaker to stronger modes so that the cached ire
6636 6639   * for the connection does not violate the current ip*_strict_src_multihoming
6637 6640   * setting. It also needs to be done when going from stronger to weaker modes,
6638 6641   * so that we fall back to matching on the longest-matching-route (as opposed
6639 6642   * to a shorter match that may have been selected in the strong mode
6640 6643   * to satisfy src_multihoming settings).
6641 6644   *
6642 6645   * The cached ixa_ire entires for all conn_t entries are marked as
6643 6646   * "verify" so that they will be recomputed for the next packet.
6644 6647   */
6645 6648  void
6646 6649  conn_ire_revalidate(conn_t *connp, void *arg)
6647 6650  {
6648 6651          boolean_t isv6 = (boolean_t)arg;
6649 6652  
6650 6653          if ((isv6 && connp->conn_ipversion != IPV6_VERSION) ||
6651 6654              (!isv6 && connp->conn_ipversion != IPV4_VERSION))
6652 6655                  return;
6653 6656          connp->conn_ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
6654 6657  }
6655 6658  
6656 6659  /*
6657 6660   * Handles both IPv4 and IPv6 reassembly - doing the out-of-order cases,
6658 6661   * When an ipf is passed here for the first time, if
6659 6662   * we already have in-order fragments on the queue, we convert from the fast-
6660 6663   * path reassembly scheme to the hard-case scheme.  From then on, additional
6661 6664   * fragments are reassembled here.  We keep track of the start and end offsets
6662 6665   * of each piece, and the number of holes in the chain.  When the hole count
6663 6666   * goes to zero, we are done!
6664 6667   *
6665 6668   * The ipf_count will be updated to account for any mblk(s) added (pointed to
6666 6669   * by mp) or subtracted (freeb()ed dups), upon return the caller must update
6667 6670   * ipfb_count and ill_frag_count by the difference of ipf_count before and
6668 6671   * after the call to ip_reassemble().
            *
            * mp is the new fragment, possibly several mblks linked through b_cont;
            * start is the offset at which its data begins and 'more' is B_FALSE
            * when the fragment claims to be the last one.  msg_len is added to
            * ipf_count up front, and ill is used only to bump the reassembly MIB
            * counters.
            *
            * Returns IP_REASS_COMPLETE when the hole count has reached zero,
            * IP_REASS_PARTIAL when holes remain, and IP_REASS_FAILED when the
            * fragment is inconsistent with a last fragment seen earlier.
6669 6672   */
6670 6673  int
6671 6674  ip_reassemble(mblk_t *mp, ipf_t *ipf, uint_t start, boolean_t more, ill_t *ill,
6672 6675      size_t msg_len)
6673 6676  {
6674 6677          uint_t  end;
6675 6678          mblk_t  *next_mp;
6676 6679          mblk_t  *mp1;
6677 6680          uint_t  offset;
6678 6681          boolean_t incr_dups = B_TRUE;
6679 6682          boolean_t offset_zero_seen = B_FALSE;
6680 6683          boolean_t pkt_boundary_checked = B_FALSE;
6681 6684  
6682 6685          /* If start == 0 then ipf_nf_hdr_len has to be set. */
6683 6686          ASSERT(start != 0 || ipf->ipf_nf_hdr_len != 0);
6684 6687  
6685 6688          /* Add in byte count */
6686 6689          ipf->ipf_count += msg_len;
6687 6690          if (ipf->ipf_end) {
6688 6691                  /*
6689 6692                   * We were part way through in-order reassembly, but now there
6690 6693                   * is a hole.  We walk through messages already queued, and
6691 6694                   * mark them for hard case reassembly.  We know that up till
6692 6695                   * now they were in order starting from offset zero.
6693 6696                   */
6694 6697                  offset = 0;
6695 6698                  for (mp1 = ipf->ipf_mp->b_cont; mp1; mp1 = mp1->b_cont) {
6696 6699                          IP_REASS_SET_START(mp1, offset);
6697 6700                          if (offset == 0) {
6698 6701                                  ASSERT(ipf->ipf_nf_hdr_len != 0);
                                           /*
                                            * Start from minus the header length so
                                            * that those ipf_nf_hdr_len bytes are not
                                            * counted in this mblk's end offset.
                                            */
6699 6702                                  offset = -ipf->ipf_nf_hdr_len;
6700 6703                          }
6701 6704                          offset += mp1->b_wptr - mp1->b_rptr;
6702 6705                          IP_REASS_SET_END(mp1, offset);
6703 6706                  }
6704 6707                  /* One hole at the end. */
6705 6708                  ipf->ipf_hole_cnt = 1;
6706 6709                  /* Brand it as a hard case, forever. */
6707 6710                  ipf->ipf_end = 0;
6708 6711          }
6709 6712          /* Walk through all the new pieces. */
6710 6713          do {
6711 6714                  end = start + (mp->b_wptr - mp->b_rptr);
6712 6715                  /*
6713 6716                   * If start is 0, decrease 'end' only for the first mblk of
6714 6717                   * the fragment. Otherwise 'end' can get wrong value in the
6715 6718                   * second pass of the loop if first mblk is exactly the
6716 6719                   * size of ipf_nf_hdr_len.
6717 6720                   */
6718 6721                  if (start == 0 && !offset_zero_seen) {
6719 6722                          /* First segment */
6720 6723                          ASSERT(ipf->ipf_nf_hdr_len != 0);
6721 6724                          end -= ipf->ipf_nf_hdr_len;
6722 6725                          offset_zero_seen = B_TRUE;
6723 6726                  }
6724 6727                  next_mp = mp->b_cont;
6725 6728                  /*
6726 6729                   * We are checking to see if there is any interesting data
6727 6730                   * to process.  If there isn't and the mblk isn't the
6728 6731                   * one which carries the unfragmentable header then we
6729 6732                   * drop it.  It's possible to have just the unfragmentable
6730 6733                   * header come through without any data.  That needs to be
6731 6734                   * saved.
6732 6735                   *
6733 6736                   * If the assert at the top of this function holds then the
6734 6737                   * term "ipf->ipf_nf_hdr_len != 0" isn't needed.  This code
6735 6738                   * is infrequently traveled enough that the test is left in
6736 6739                   * to protect against future code changes which break that
6737 6740                   * invariant.
6738 6741                   */
6739 6742                  if (start == end && start != 0 && ipf->ipf_nf_hdr_len != 0) {
6740 6743                          /* Empty.  Blast it. */
6741 6744                          IP_REASS_SET_START(mp, 0);
6742 6745                          IP_REASS_SET_END(mp, 0);
6743 6746                          /*
6744 6747                           * If the ipf points to the mblk we are about to free,
6745 6748                           * update ipf to point to the next mblk (or NULL
6746 6749                           * if none).
6747 6750                           */
6748 6751                          if (ipf->ipf_mp->b_cont == mp)
6749 6752                                  ipf->ipf_mp->b_cont = next_mp;
6750 6753                          freeb(mp);
                                   /*
                                    * 'continue' in this do-while goes to the loop
                                    * condition, which advances start and mp.
                                    */
6751 6754                          continue;
6752 6755                  }
                           /* Detach mp from the incoming chain; next_mp has the rest. */
6753 6756                  mp->b_cont = NULL;
6754 6757                  IP_REASS_SET_START(mp, start);
6755 6758                  IP_REASS_SET_END(mp, end);
6756 6759                  if (!ipf->ipf_tail_mp) {
                                   /* Nothing queued yet: mp becomes head and tail. */
6757 6760                          ipf->ipf_tail_mp = mp;
6758 6761                          ipf->ipf_mp->b_cont = mp;
6759 6762                          if (start == 0 || !more) {
6760 6763                                  ipf->ipf_hole_cnt = 1;
6761 6764                                  /*
6762 6765                                   * if the first fragment comes in more than one
6763 6766                                   * mblk, this loop will be executed for each
6764 6767                                   * mblk. Need to adjust hole count so exiting
6765 6768                                   * this routine will leave hole count at 1.
6766 6769                                   */
6767 6770                                  if (next_mp)
6768 6771                                          ipf->ipf_hole_cnt++;
6769 6772                          } else
6770 6773                                  ipf->ipf_hole_cnt = 2;
6771 6774                          continue;
6772 6775                  } else if (ipf->ipf_last_frag_seen && !more &&
6773 6776                      !pkt_boundary_checked) {
6774 6777                          /*
6775 6778                           * We check datagram boundary only if this fragment
6776 6779                           * claims to be the last fragment and we have seen a
6777 6780                           * last fragment in the past too. We do this only
6778 6781                           * once for a given fragment.
6779 6782                           *
6780 6783                           * start cannot be 0 here as fragments with start=0
6781 6784                           * and MF=0 gets handled as a complete packet. These
6782 6785                           * fragments should not reach here.
6783 6786                           */
6784 6787  
6785 6788                          if (start + msgdsize(mp) !=
6786 6789                              IP_REASS_END(ipf->ipf_tail_mp)) {
6787 6790                                  /*
6788 6791                                   * We have two fragments both of which claim
6789 6792                                   * to be the last fragment but gives conflicting
6790 6793                                   * information about the whole datagram size.
6791 6794                                   * Something fishy is going on. Drop the
6792 6795                                   * fragment and free up the reassembly list.
6793 6796                                   */
6794 6797                                  return (IP_REASS_FAILED);
6795 6798                          }
6796 6799  
6797 6800                          /*
6798 6801                           * We shouldn't come to this code block again for this
6799 6802                           * particular fragment.
6800 6803                           */
6801 6804                          pkt_boundary_checked = B_TRUE;
6802 6805                  }
6803 6806  
6804 6807                  /* New stuff at or beyond tail? */
6805 6808                  offset = IP_REASS_END(ipf->ipf_tail_mp);
6806 6809                  if (start >= offset) {
6807 6810                          if (ipf->ipf_last_frag_seen) {
6808 6811                                  /* current fragment is beyond last fragment */
6809 6812                                  return (IP_REASS_FAILED);
6810 6813                          }
6811 6814                          /* Link it on end. */
6812 6815                          ipf->ipf_tail_mp->b_cont = mp;
6813 6816                          ipf->ipf_tail_mp = mp;
6814 6817                          if (more) {
6815 6818                                  if (start != offset)
6816 6819                                          ipf->ipf_hole_cnt++;
6817 6820                          } else if (start == offset && next_mp == NULL)
6818 6821                                          ipf->ipf_hole_cnt--;
6819 6822                          continue;
6820 6823                  }
6821 6824                  mp1 = ipf->ipf_mp->b_cont;
6822 6825                  offset = IP_REASS_START(mp1);
6823 6826                  /* New stuff at the front? */
6824 6827                  if (start < offset) {
6825 6828                          if (start == 0) {
6826 6829                                  if (end >= offset) {
6827 6830                                          /* Nailed the hole at the beginning. */
6828 6831                                          ipf->ipf_hole_cnt--;
6829 6832                                  }
6830 6833                          } else if (end < offset) {
6831 6834                                  /*
6832 6835                                   * A hole, stuff, and a hole where there used
6833 6836                                   * to be just a hole.
6834 6837                                   */
6835 6838                                  ipf->ipf_hole_cnt++;
6836 6839                          }
6837 6840                          mp->b_cont = mp1;
6838 6841                          /* Check for overlap. */
6839 6842                          while (end > offset) {
6840 6843                                  if (end < IP_REASS_END(mp1)) {
                                                   /*
                                                    * Partial overlap with mp1: trim the
                                                    * tail of mp so it ends where mp1
                                                    * starts.
                                                    */
6841 6844                                          mp->b_wptr -= end - offset;
6842 6845                                          IP_REASS_SET_END(mp, offset);
6843 6846                                          BUMP_MIB(ill->ill_ip_mib,
6844 6847                                              ipIfStatsReasmPartDups);
6845 6848                                          break;
6846 6849                                  }
6847 6850                                  /* Did we cover another hole? */
6848 6851                                  if ((mp1->b_cont &&
6849 6852                                      IP_REASS_END(mp1) !=
6850 6853                                      IP_REASS_START(mp1->b_cont) &&
6851 6854                                      end >= IP_REASS_START(mp1->b_cont)) ||
6852 6855                                      (!ipf->ipf_last_frag_seen && !more)) {
6853 6856                                          ipf->ipf_hole_cnt--;
6854 6857                                  }
6855 6858                                  /* Clip out mp1. */
6856 6859                                  if ((mp->b_cont = mp1->b_cont) == NULL) {
6857 6860                                          /*
6858 6861                                           * After clipping out mp1, this guy
6859 6862                                           * is now hanging off the end.
6860 6863                                           */
6861 6864                                          ipf->ipf_tail_mp = mp;
6862 6865                                  }
6863 6866                                  IP_REASS_SET_START(mp1, 0);
6864 6867                                  IP_REASS_SET_END(mp1, 0);
6865 6868                                  /* Subtract byte count */
6866 6869                                  ipf->ipf_count -= mp1->b_datap->db_lim -
6867 6870                                      mp1->b_datap->db_base;
6868 6871                                  freeb(mp1);
6869 6872                                  BUMP_MIB(ill->ill_ip_mib,
6870 6873                                      ipIfStatsReasmPartDups);
6871 6874                                  mp1 = mp->b_cont;
6872 6875                                  if (!mp1)
6873 6876                                          break;
6874 6877                                  offset = IP_REASS_START(mp1);
6875 6878                          }
6876 6879                          ipf->ipf_mp->b_cont = mp;
6877 6880                          continue;
6878 6881                  }
6879 6882                  /*
6880 6883                   * The new piece starts somewhere between the start of the head
6881 6884                   * and before the end of the tail.
6882 6885                   */
6883 6886                  for (; mp1; mp1 = mp1->b_cont) {
6884 6887                          offset = IP_REASS_END(mp1);
6885 6888                          if (start < offset) {
6886 6889                                  if (end <= offset) {
6887 6890                                          /* Nothing new. */
6888 6891                                          IP_REASS_SET_START(mp, 0);
6889 6892                                          IP_REASS_SET_END(mp, 0);
6890 6893                                          /* Subtract byte count */
6891 6894                                          ipf->ipf_count -= mp->b_datap->db_lim -
6892 6895                                              mp->b_datap->db_base;
                                                   /*
                                                    * ipf_num_dups is bumped at most once
                                                    * per call, however many mblks of
                                                    * this fragment turn out to be dups.
                                                    */
6893 6896                                          if (incr_dups) {
6894 6897                                                  ipf->ipf_num_dups++;
6895 6898                                                  incr_dups = B_FALSE;
6896 6899                                          }
6897 6900                                          freeb(mp);
6898 6901                                          BUMP_MIB(ill->ill_ip_mib,
6899 6902                                              ipIfStatsReasmDuplicates);
6900 6903                                          break;
6901 6904                                  }
6902 6905                                  /*
6903 6906                                   * Trim redundant stuff off beginning of new
6904 6907                                   * piece.
6905 6908                                   */
6906 6909                                  IP_REASS_SET_START(mp, offset);
6907 6910                                  mp->b_rptr += offset - start;
6908 6911                                  BUMP_MIB(ill->ill_ip_mib,
6909 6912                                      ipIfStatsReasmPartDups);
6910 6913                                  start = offset;
6911 6914                                  if (!mp1->b_cont) {
6912 6915                                          /*
6913 6916                                           * After trimming, this guy is now
6914 6917                                           * hanging off the end.
6915 6918                                           */
6916 6919                                          mp1->b_cont = mp;
6917 6920                                          ipf->ipf_tail_mp = mp;
6918 6921                                          if (!more) {
6919 6922                                                  ipf->ipf_hole_cnt--;
6920 6923                                          }
6921 6924                                          break;
6922 6925                                  }
6923 6926                          }
                                   /* Not before the next piece yet; keep walking. */
6924 6927                          if (start >= IP_REASS_START(mp1->b_cont))
6925 6928                                  continue;
6926 6929                          /* Fill a hole */
6927 6930                          if (start > offset)
6928 6931                                  ipf->ipf_hole_cnt++;
6929 6932                          mp->b_cont = mp1->b_cont;
6930 6933                          mp1->b_cont = mp;
6931 6934                          mp1 = mp->b_cont;
6932 6935                          offset = IP_REASS_START(mp1);
6933 6936                          if (end >= offset) {
6934 6937                                  ipf->ipf_hole_cnt--;
6935 6938                                  /* Check for overlap. */
6936 6939                                  while (end > offset) {
6937 6940                                          if (end < IP_REASS_END(mp1)) {
6938 6941                                                  mp->b_wptr -= end - offset;
6939 6942                                                  IP_REASS_SET_END(mp, offset);
6940 6943                                                  /*
6941 6944                                                   * TODO we might bump
6942 6945                                                   * this up twice if there is
6943 6946                                                   * overlap at both ends.
6944 6947                                                   */
6945 6948                                                  BUMP_MIB(ill->ill_ip_mib,
6946 6949                                                      ipIfStatsReasmPartDups);
6947 6950                                                  break;
6948 6951                                          }
6949 6952                                          /* Did we cover another hole? */
6950 6953                                          if ((mp1->b_cont &&
6951 6954                                              IP_REASS_END(mp1)
6952 6955                                              != IP_REASS_START(mp1->b_cont) &&
6953 6956                                              end >=
6954 6957                                              IP_REASS_START(mp1->b_cont)) ||
6955 6958                                              (!ipf->ipf_last_frag_seen &&
6956 6959                                              !more)) {
6957 6960                                                  ipf->ipf_hole_cnt--;
6958 6961                                          }
6959 6962                                          /* Clip out mp1. */
6960 6963                                          if ((mp->b_cont = mp1->b_cont) ==
6961 6964                                              NULL) {
6962 6965                                                  /*
6963 6966                                                   * After clipping out mp1,
6964 6967                                                   * this guy is now hanging
6965 6968                                                   * off the end.
6966 6969                                                   */
6967 6970                                                  ipf->ipf_tail_mp = mp;
6968 6971                                          }
6969 6972                                          IP_REASS_SET_START(mp1, 0);
6970 6973                                          IP_REASS_SET_END(mp1, 0);
6971 6974                                          /* Subtract byte count */
6972 6975                                          ipf->ipf_count -=
6973 6976                                              mp1->b_datap->db_lim -
6974 6977                                              mp1->b_datap->db_base;
6975 6978                                          freeb(mp1);
6976 6979                                          BUMP_MIB(ill->ill_ip_mib,
6977 6980                                              ipIfStatsReasmPartDups);
6978 6981                                          mp1 = mp->b_cont;
6979 6982                                          if (!mp1)
6980 6983                                                  break;
6981 6984                                          offset = IP_REASS_START(mp1);
6982 6985                                  }
6983 6986                          }
6984 6987                          break;
6985 6988                  }
                           /*
                            * Move on to the next mblk of this fragment; its data
                            * begins where the current mblk's ended.
                            */
6986 6989          } while (start = end, mp = next_mp);
6987 6990  
6988 6991          /* Fragment just processed could be the last one. Remember this fact */
6989 6992          if (!more)
6990 6993                  ipf->ipf_last_frag_seen = B_TRUE;
6991 6994  
6992 6995          /* Still got holes? */
6993 6996          if (ipf->ipf_hole_cnt)
6994 6997                  return (IP_REASS_PARTIAL);
6995 6998          /* Clean up overloaded fields to avoid upstream disasters. */
6996 6999          for (mp1 = ipf->ipf_mp->b_cont; mp1; mp1 = mp1->b_cont) {
6997 7000                  IP_REASS_SET_START(mp1, 0);
6998 7001                  IP_REASS_SET_END(mp1, 0);
6999 7002          }
7000 7003          return (IP_REASS_COMPLETE);
7001 7004  }
7002 7005  
7003 7006  /*
7004 7007   * Fragmentation reassembly.  Each ILL has a hash table for
7005 7008   * queuing packets undergoing reassembly for all IPIFs
7006 7009   * associated with the ILL.  The hash is based on the packet
7007 7010   * IP ident field.  The ILL frag hash table was allocated
7008 7011   * as a timer block at the time the ILL was created.  Whenever
7009 7012   * there is anything on the reassembly queue, the timer will
7010 7013   * be running.  Returns the reassembled packet if reassembly completes.
            *
            * The hash bucket is selected with ILL_FRAG_HASH(src, ident); within a
            * bucket a queue matches on ident, source, destination and protocol.
            *
            * Arguments:
            *   mp   - the received message.  It is returned, queued for
            *          reassembly, or freed; the caller must not use it after a
            *          NULL return.
            *   ipha - the IPv4 header of mp.
            *   ira  - receive attributes.  ira_ill supplies the ill; on a
            *          completed reassembly ira_pktlen and ira_ip_hdr_length are
            *          updated to describe the reassembled packet.
            *
            * Return values:
            *   mp unchanged  - the header has neither an offset nor IPH_MF set,
            *                   i.e. the packet is not a fragment.
            *   new packet    - reassembly completed with this fragment.
            *   NULL          - the fragment was queued or dropped.
            *
            * Note that when ips_ip_reass_queue_bytes is zero the message is freed
            * and NULL is returned before the "is this a fragment" test is made.
7011 7014   */
7012 7015  mblk_t *
7013 7016  ip_input_fragment(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
7014 7017  {
7015 7018          uint32_t        frag_offset_flags;
7016 7019          mblk_t          *t_mp;
7017 7020          ipaddr_t        dst;
7018 7021          uint8_t         proto = ipha->ipha_protocol;
7019 7022          uint32_t        sum_val;
7020 7023          uint16_t        sum_flags;
7021 7024          ipf_t           *ipf;
7022 7025          ipf_t           **ipfp;
7023 7026          ipfb_t          *ipfb;
7024 7027          uint16_t        ident;
7025 7028          uint32_t        offset;
7026 7029          ipaddr_t        src;
7027 7030          uint_t          hdr_length;
7028 7031          uint32_t        end;
7029 7032          mblk_t          *mp1;
7030 7033          mblk_t          *tail_mp;
7031 7034          size_t          count;
7032 7035          size_t          msg_len;
7033 7036          uint8_t         ecn_info = 0;
7034 7037          uint32_t        packet_size;
7035 7038          boolean_t       pruned = B_FALSE;
7036 7039          ill_t           *ill = ira->ira_ill;
7037 7040          ip_stack_t      *ipst = ill->ill_ipst;
7038 7041  
7039 7042          /*
7040 7043           * Drop the fragment as early as possible, if
7041 7044           * we don't have resource(s) to re-assemble.
7042 7045           */
7043 7046          if (ipst->ips_ip_reass_queue_bytes == 0) {
7044 7047                  freemsg(mp);
7045 7048                  return (NULL);
7046 7049          }
7047 7050  
7048 7051          /* Check for fragmentation offset; return if there's none */
7049 7052          if ((frag_offset_flags = ntohs(ipha->ipha_fragment_offset_and_flags) &
7050 7053              (IPH_MF | IPH_OFFSET)) == 0)
7051 7054                  return (mp);
7052 7055  
7053 7056          /*
7054 7057           * We utilize hardware computed checksum info only for UDP since
7055 7058           * IP fragmentation is a normal occurrence for the protocol.  In
7056 7059           * addition, checksum offload support for IP fragments carrying
7057 7060           * UDP payload is commonly implemented across network adapters.
7058 7061           */
7059 7062          ASSERT(ira->ira_rill != NULL);
7060 7063          if (proto == IPPROTO_UDP && dohwcksum &&
7061 7064              ILL_HCKSUM_CAPABLE(ira->ira_rill) &&
7062 7065              (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) {
7063 7066                  mblk_t *mp1 = mp->b_cont;
7064 7067                  int32_t len;
7065 7068  
7066 7069                  /* Record checksum information from the packet */
7067 7070                  sum_val = (uint32_t)DB_CKSUM16(mp);
7068 7071                  sum_flags = DB_CKSUMFLAGS(mp);
7069 7072  
7070 7073                  /* IP payload offset from beginning of mblk */
7071 7074                  offset = ((uchar_t *)ipha + IPH_HDR_LENGTH(ipha)) - mp->b_rptr;
7072 7075  
7073 7076                  if ((sum_flags & HCK_PARTIALCKSUM) &&
7074 7077                      (mp1 == NULL || mp1->b_cont == NULL) &&
7075 7078                      offset >= DB_CKSUMSTART(mp) &&
7076 7079                      ((len = offset - DB_CKSUMSTART(mp)) & 1) == 0) {
7077 7080                          uint32_t adj;
7078 7081                          /*
7079 7082                           * Partial checksum has been calculated by hardware
7080 7083                           * and attached to the packet; in addition, any
7081 7084                           * prepended extraneous data is even byte aligned.
7082 7085                           * If any such data exists, we adjust the checksum;
7083 7086                           * this would also handle any postpended data.
7084 7087                           */
7085 7088                          IP_ADJCKSUM_PARTIAL(mp->b_rptr + DB_CKSUMSTART(mp),
7086 7089                              mp, mp1, len, adj);
7087 7090  
7088 7091                          /* One's complement subtract extraneous checksum */
7089 7092                          if (adj >= sum_val)
7090 7093                                  sum_val = ~(adj - sum_val) & 0xFFFF;
7091 7094                          else
7092 7095                                  sum_val -= adj;
7093 7096                  }
7094 7097          } else {
7095 7098                  sum_val = 0;
7096 7099                  sum_flags = 0;
7097 7100          }
7098 7101  
7099 7102          /* Clear hardware checksumming flag */
7100 7103          DB_CKSUMFLAGS(mp) = 0;
7101 7104  
                   /*
                    * From here on 'offset' is reused as the fragment's byte offset
                    * within the datagram (the header's offset field shifted left by
                    * 3).  'end' starts as this fragment's payload length and, for a
                    * non-first fragment, becomes the byte offset just past it below.
                    */
7102 7105          ident = ipha->ipha_ident;
7103 7106          offset = (frag_offset_flags << 3) & 0xFFFF;
7104 7107          src = ipha->ipha_src;
7105 7108          dst = ipha->ipha_dst;
7106 7109          hdr_length = IPH_HDR_LENGTH(ipha);
7107 7110          end = ntohs(ipha->ipha_length) - hdr_length;
7108 7111  
7109 7112          /* If end == 0 then we have a packet with no data, so just free it */
7110 7113          if (end == 0) {
7111 7114                  freemsg(mp);
7112 7115                  return (NULL);
7113 7116          }
7114 7117  
7115 7118          /* Record the ECN field info. */
7116 7119          ecn_info = (ipha->ipha_type_of_service & 0x3);
7117 7120          if (offset != 0) {
7118 7121                  /*
7119 7122                   * If this isn't the first piece, strip the header, and
7120 7123                   * add the offset to the end value.
7121 7124                   */
7122 7125                  mp->b_rptr += hdr_length;
7123 7126                  end += offset;
7124 7127          }
7125 7128  
7126 7129          /* Handle vnic loopback of fragments */
                   /*
                    * msg_len is the number of bytes charged to the reassembly
                    * queue: the MBLKSIZE() of every mblk in the chain whose dblk
                    * has at most two references.  More widely shared mblks are
                    * not counted.
                    */
7127 7130          if (mp->b_datap->db_ref > 2)
7128 7131                  msg_len = 0;
7129 7132          else
7130 7133                  msg_len = MBLKSIZE(mp);
7131 7134  
7132 7135          tail_mp = mp;
7133 7136          while (tail_mp->b_cont != NULL) {
7134 7137                  tail_mp = tail_mp->b_cont;
7135 7138                  if (tail_mp->b_datap->db_ref <= 2)
7136 7139                          msg_len += MBLKSIZE(tail_mp);
7137 7140          }
7138 7141  
7139 7142          /* If the reassembly list for this ILL will get too big, prune it */
7140 7143          if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >=
7141 7144              ipst->ips_ip_reass_queue_bytes) {
7142 7145                  DTRACE_PROBE3(ip_reass_queue_bytes, uint_t, msg_len,
7143 7146                      uint_t, ill->ill_frag_count,
7144 7147                      uint_t, ipst->ips_ip_reass_queue_bytes);
7145 7148                  ill_frag_prune(ill,
7146 7149                      (ipst->ips_ip_reass_queue_bytes < msg_len) ? 0 :
7147 7150                      (ipst->ips_ip_reass_queue_bytes - msg_len));
7148 7151                  pruned = B_TRUE;
7149 7152          }
7150 7153  
7151 7154          ipfb = &ill->ill_frag_hash_tbl[ILL_FRAG_HASH(src, ident)];
7152 7155          mutex_enter(&ipfb->ipfb_lock);
7153 7156  
7154 7157          ipfp = &ipfb->ipfb_ipf;
7155 7158          /* Try to find an existing fragment queue for this packet. */
7156 7159          for (;;) {
7157 7160                  ipf = ipfp[0];
7158 7161                  if (ipf != NULL) {
7159 7162                          /*
7160 7163                           * It has to match on ident and src/dst address.
7161 7164                           */
7162 7165                          if (ipf->ipf_ident == ident &&
7163 7166                              ipf->ipf_src == src &&
7164 7167                              ipf->ipf_dst == dst &&
7165 7168                              ipf->ipf_protocol == proto) {
7166 7169                                  /*
7167 7170                                   * If we have received too many
7168 7171                                   * duplicate fragments for this packet
7169 7172                                   * free it.
7170 7173                                   */
7171 7174                                  if (ipf->ipf_num_dups > ip_max_frag_dups) {
7172 7175                                          ill_frag_free_pkts(ill, ipfb, ipf, 1);
7173 7176                                          freemsg(mp);
7174 7177                                          mutex_exit(&ipfb->ipfb_lock);
7175 7178                                          return (NULL);
7176 7179                                  }
7177 7180                                  /* Found it. */
7178 7181                                  break;
7179 7182                          }
7180 7183                          ipfp = &ipf->ipf_hash_next;
7181 7184                          continue;
7182 7185                  }
7183 7186  
7184 7187                  /*
7185 7188                   * If we pruned the list, do we want to store this new
7186 7189                   * fragment?. We apply an optimization here based on the
7187 7190                   * fact that most fragments will be received in order.
7188 7191                   * So if the offset of this incoming fragment is zero,
7189 7192                   * it is the first fragment of a new packet. We will
7190 7193                   * keep it.  Otherwise drop the fragment, as we have
7191 7194                   * probably pruned the packet already (since the
7192 7195                   * packet cannot be found).
7193 7196                   */
7194 7197                  if (pruned && offset != 0) {
7195 7198                          mutex_exit(&ipfb->ipfb_lock);
7196 7199                          freemsg(mp);
7197 7200                          return (NULL);
7198 7201                  }
7199 7202  
7200 7203                  if (ipfb->ipfb_frag_pkts >= MAX_FRAG_PKTS(ipst))  {
7201 7204                          /*
7202 7205                           * Too many fragmented packets in this hash
7203 7206                           * bucket. Free the oldest.
7204 7207                           */
7205 7208                          ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 1);
7206 7209                  }
7207 7210  
7208 7211                  /* New guy.  Allocate a frag message. */
7209 7212                  mp1 = allocb(sizeof (*ipf), BPRI_MED);
7210 7213                  if (mp1 == NULL) {
7211 7214                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
7212 7215                          ip_drop_input("ipIfStatsInDiscards", mp, ill);
7213 7216                          freemsg(mp);
                                   /*
                                    * Shared exit: drop ipfb_lock and return NULL.
                                    * This label is also the target of the
                                    * "goto reass_done" statements further down.
                                    */
7214 7217  reass_done:
7215 7218                          mutex_exit(&ipfb->ipfb_lock);
7216 7219                          return (NULL);
7217 7220                  }
7218 7221  
7219 7222                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmReqds);
7220 7223                  mp1->b_cont = mp;
7221 7224  
7222 7225                  /* Initialize the fragment header. */
7223 7226                  ipf = (ipf_t *)mp1->b_rptr;
7224 7227                  ipf->ipf_mp = mp1;
7225 7228                  ipf->ipf_ptphn = ipfp;
7226 7229                  ipfp[0] = ipf;
7227 7230                  ipf->ipf_hash_next = NULL;
7228 7231                  ipf->ipf_ident = ident;
7229 7232                  ipf->ipf_protocol = proto;
7230 7233                  ipf->ipf_src = src;
7231 7234                  ipf->ipf_dst = dst;
7232 7235                  ipf->ipf_nf_hdr_len = 0;
7233 7236                  /* Record reassembly start time. */
7234 7237                  ipf->ipf_timestamp = gethrestime_sec();
7235 7238                  /* Record ipf generation and account for frag header */
7236 7239                  ipf->ipf_gen = ill->ill_ipf_gen++;
7237 7240                  ipf->ipf_count = MBLKSIZE(mp1);
7238 7241                  ipf->ipf_last_frag_seen = B_FALSE;
7239 7242                  ipf->ipf_ecn = ecn_info;
7240 7243                  ipf->ipf_num_dups = 0;
7241 7244                  ipfb->ipfb_frag_pkts++;
7242 7245                  ipf->ipf_checksum = 0;
7243 7246                  ipf->ipf_checksum_flags = 0;
7244 7247  
7245 7248                  /* Store checksum value in fragment header */
7246 7249                  if (sum_flags != 0) {
                                   /*
                                    * Fold twice so that any carry out of the low
                                    * 16 bits is added back into them.
                                    */
7247 7250                          sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
7248 7251                          sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
7249 7252                          ipf->ipf_checksum = sum_val;
7250 7253                          ipf->ipf_checksum_flags = sum_flags;
7251 7254                  }
7252 7255  
7253 7256                  /*
7254 7257                   * We handle reassembly two ways.  In the easy case,
7255 7258                   * where all the fragments show up in order, we do
7256 7259                   * minimal bookkeeping, and just clip new pieces on
7257 7260                   * the end.  If we ever see a hole, then we go off
7258 7261                   * to ip_reassemble which has to mark the pieces and
7259 7262                   * keep track of the number of holes, etc.  Obviously,
7260 7263                   * the point of having both mechanisms is so we can
7261 7264                   * handle the easy case as efficiently as possible.
7262 7265                   */
7263 7266                  if (offset == 0) {
7264 7267                          /* Easy case, in-order reassembly so far. */
7265 7268                          ipf->ipf_count += msg_len;
7266 7269                          ipf->ipf_tail_mp = tail_mp;
7267 7270                          /*
7268 7271                           * Keep track of next expected offset in
7269 7272                           * ipf_end.
7270 7273                           */
7271 7274                          ipf->ipf_end = end;
7272 7275                          ipf->ipf_nf_hdr_len = hdr_length;
7273 7276                  } else {
7274 7277                          /* Hard case, hole at the beginning. */
7275 7278                          ipf->ipf_tail_mp = NULL;
7276 7279                          /*
7277 7280                           * ipf_end == 0 means that we have given up
7278 7281                           * on easy reassembly.
7279 7282                           */
7280 7283                          ipf->ipf_end = 0;
7281 7284  
7282 7285                          /* Forget checksum offload from now on */
7283 7286                          ipf->ipf_checksum_flags = 0;
7284 7287  
7285 7288                          /*
7286 7289                           * ipf_hole_cnt is set by ip_reassemble.
7287 7290                           * ipf_count is updated by ip_reassemble.
7288 7291                           * No need to check for return value here
7289 7292                           * as we don't expect reassembly to complete
7290 7293                           * or fail for the first fragment itself.
7291 7294                           */
7292 7295                          (void) ip_reassemble(mp, ipf,
7293 7296                              (frag_offset_flags & IPH_OFFSET) << 3,
7294 7297                              (frag_offset_flags & IPH_MF), ill, msg_len);
7295 7298                  }
7296 7299                  /* Update per ipfb and ill byte counts */
7297 7300                  ipfb->ipfb_count += ipf->ipf_count;
7298 7301                  ASSERT(ipfb->ipfb_count > 0);   /* Wraparound */
7299 7302                  atomic_add_32(&ill->ill_frag_count, ipf->ipf_count);
7300 7303                  /* If the frag timer wasn't already going, start it. */
7301 7304                  mutex_enter(&ill->ill_lock);
7302 7305                  ill_frag_timer_start(ill);
7303 7306                  mutex_exit(&ill->ill_lock);
7304 7307                  goto reass_done;
7305 7308          }
7306 7309  
                   /*
                    * The loop above is left via "break" only when an existing queue
                    * (ipf) matched; ipfb_lock is still held here.
                    */
7307 7310          /*
7308 7311           * If the packet's flag has changed (it could be coming up
7309 7312           * from an interface different than the previous, therefore
7310 7313           * possibly different checksum capability), then forget about
7311 7314           * any stored checksum states.  Otherwise add the value to
7312 7315           * the existing one stored in the fragment header.
7313 7316           */
7314 7317          if (sum_flags != 0 && sum_flags == ipf->ipf_checksum_flags) {
7315 7318                  sum_val += ipf->ipf_checksum;
7316 7319                  sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
7317 7320                  sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
7318 7321                  ipf->ipf_checksum = sum_val;
7319 7322          } else if (ipf->ipf_checksum_flags != 0) {
7320 7323                  /* Forget checksum offload from now on */
7321 7324                  ipf->ipf_checksum_flags = 0;
7322 7325          }
7323 7326  
7324 7327          /*
7325 7328           * We have a new piece of a datagram which is already being
7326 7329           * reassembled.  Update the ECN info if all IP fragments
7327 7330           * are ECN capable.  If there is one which is not, clear
7328 7331           * all the info.  If there is at least one which has CE
7329 7332           * code point, IP needs to report that up to transport.
7330 7333           */
7331 7334          if (ecn_info != IPH_ECN_NECT && ipf->ipf_ecn != IPH_ECN_NECT) {
7332 7335                  if (ecn_info == IPH_ECN_CE)
7333 7336                          ipf->ipf_ecn = IPH_ECN_CE;
7334 7337          } else {
7335 7338                  ipf->ipf_ecn = IPH_ECN_NECT;
7336 7339          }
7337 7340          if (offset && ipf->ipf_end == offset) {
7338 7341                  /* The new fragment fits at the end */
7339 7342                  ipf->ipf_tail_mp->b_cont = mp;
7340 7343                  /* Update the byte count */
7341 7344                  ipf->ipf_count += msg_len;
7342 7345                  /* Update per ipfb and ill byte counts */
7343 7346                  ipfb->ipfb_count += msg_len;
7344 7347                  ASSERT(ipfb->ipfb_count > 0);   /* Wraparound */
7345 7348                  atomic_add_32(&ill->ill_frag_count, msg_len);
7346 7349                  if (frag_offset_flags & IPH_MF) {
7347 7350                          /* More to come. */
7348 7351                          ipf->ipf_end = end;
7349 7352                          ipf->ipf_tail_mp = tail_mp;
7350 7353                          goto reass_done;
7351 7354                  }
                           /* No IPH_MF: this was the final piece; fall through. */
7352 7355          } else {
7353 7356                  /* Go do the hard cases. */
7354 7357                  int ret;
7355 7358  
7356 7359                  if (offset == 0)
7357 7360                          ipf->ipf_nf_hdr_len = hdr_length;
7358 7361  
7359 7362                  /* Save current byte count */
7360 7363                  count = ipf->ipf_count;
7361 7364                  ret = ip_reassemble(mp, ipf,
7362 7365                      (frag_offset_flags & IPH_OFFSET) << 3,
7363 7366                      (frag_offset_flags & IPH_MF), ill, msg_len);
7364 7367                  /* Count of bytes added and subtracted (freeb()ed) */
7365 7368                  count = ipf->ipf_count - count;
7366 7369                  if (count) {
7367 7370                          /* Update per ipfb and ill byte counts */
7368 7371                          ipfb->ipfb_count += count;
7369 7372                          ASSERT(ipfb->ipfb_count > 0); /* Wraparound */
7370 7373                          atomic_add_32(&ill->ill_frag_count, count);
7371 7374                  }
7372 7375                  if (ret == IP_REASS_PARTIAL) {
7373 7376                          goto reass_done;
7374 7377                  } else if (ret == IP_REASS_FAILED) {
7375 7378                          /* Reassembly failed. Free up all resources */
7376 7379                          ill_frag_free_pkts(ill, ipfb, ipf, 1);
7377 7380                          for (t_mp = mp; t_mp != NULL; t_mp = t_mp->b_cont) {
7378 7381                                  IP_REASS_SET_START(t_mp, 0);
7379 7382                                  IP_REASS_SET_END(t_mp, 0);
7380 7383                          }
7381 7384                          freemsg(mp);
7382 7385                          goto reass_done;
7383 7386                  }
7384 7387                  /* We will reach here iff 'ret' is IP_REASS_COMPLETE */
7385 7388          }
7386 7389          /*
7387 7390           * We have completed reassembly.  Unhook the frag header from
7388 7391           * the reassembly list.
7389 7392           *
7390 7393           * Before we free the frag header, record the ECN info
7391 7394           * to report back to the transport.
7392 7395           */
7393 7396          ecn_info = ipf->ipf_ecn;
7394 7397          BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmOKs);
7395 7398          ipfp = ipf->ipf_ptphn;
7396 7399  
7397 7400          /* We need to supply these to caller */
7398 7401          if ((sum_flags = ipf->ipf_checksum_flags) != 0)
7399 7402                  sum_val = ipf->ipf_checksum;
7400 7403          else
7401 7404                  sum_val = 0;
7402 7405  
                   /*
                    * Unlink ipf from the bucket's list: ipf is advanced to its
                    * successor, whose back pointer (if any) is redirected to ipfp.
                    * mp1 and count keep what is needed from the unlinked header.
                    */
7403 7406          mp1 = ipf->ipf_mp;
7404 7407          count = ipf->ipf_count;
7405 7408          ipf = ipf->ipf_hash_next;
7406 7409          if (ipf != NULL)
7407 7410                  ipf->ipf_ptphn = ipfp;
7408 7411          ipfp[0] = ipf;
7409 7412          atomic_add_32(&ill->ill_frag_count, -count);
7410 7413          ASSERT(ipfb->ipfb_count >= count);
7411 7414          ipfb->ipfb_count -= count;
7412 7415          ipfb->ipfb_frag_pkts--;
7413 7416          mutex_exit(&ipfb->ipfb_lock);
7414 7417          /* Ditch the frag header. */
7415 7418          mp = mp1->b_cont;
7416 7419  
7417 7420          freeb(mp1);
7418 7421  
7419 7422          /* Restore original IP length in header. */
7420 7423          packet_size = (uint32_t)msgdsize(mp);
7421 7424          if (packet_size > IP_MAXPACKET) {
7422 7425                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
7423 7426                  ip_drop_input("Reassembled packet too large", mp, ill);
7424 7427                  freemsg(mp);
7425 7428                  return (NULL);
7426 7429          }
7427 7430  
                   /*
                    * The IP header is rewritten below.  If the leading dblk has
                    * more than one reference, continue with a copy of the whole
                    * message (presumably so the other holder's data is left
                    * untouched); the packet is dropped if the copy fails.
                    */
7428 7431          if (DB_REF(mp) > 1) {
7429 7432                  mblk_t *mp2 = copymsg(mp);
7430 7433  
7431 7434                  if (mp2 == NULL) {
7432 7435                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
7433 7436                          ip_drop_input("ipIfStatsInDiscards", mp, ill);
7434 7437                          freemsg(mp);
7435 7438                          return (NULL);
7436 7439                  }
7437 7440                  freemsg(mp);
7438 7441                  mp = mp2;
7439 7442          }
7440 7443          ipha = (ipha_t *)mp->b_rptr;
7441 7444  
7442 7445          ipha->ipha_length = htons((uint16_t)packet_size);
7443 7446          /* We're now complete, zip the frag state */
7444 7447          ipha->ipha_fragment_offset_and_flags = 0;
7445 7448          /* Record the ECN info. */
7446 7449          ipha->ipha_type_of_service &= 0xFC;
7447 7450          ipha->ipha_type_of_service |= ecn_info;
7448 7451  
7449 7452          /* Update the receive attributes */
7450 7453          ira->ira_pktlen = packet_size;
7451 7454          ira->ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
7452 7455  
7453 7456          /* Reassembly is successful; set checksum information in packet */
7454 7457          DB_CKSUM16(mp) = (uint16_t)sum_val;
7455 7458          DB_CKSUMFLAGS(mp) = sum_flags;
7456 7459          DB_CKSUMSTART(mp) = ira->ira_ip_hdr_length;
7457 7460  
7458 7461          return (mp);
7459 7462  }
7460 7463  
7461 7464  /*
7462 7465   * Pullup function that should be used for IP input in order to
7463 7466   * ensure we do not lose the L2 source address; we need the l2 source
7464 7467   * address for IP_RECVSLLA and for ndp_input.
7465 7468   *
7466 7469   * We return either NULL or b_rptr.
            *
            * mp is the message to pull up, len is the byte count handed to
            * pullupmsg(), and ira supplies the ill and the L2 source state.
            * NULL is returned when pullupmsg() fails; mp is not freed here.
7467 7470   */
7468 7471  void *
7469 7472  ip_pullup(mblk_t *mp, ssize_t len, ip_recv_attr_t *ira)
7470 7473  {
7471 7474          ill_t           *ill = ira->ira_ill;
7472 7475  
                   /* Log only when the pullup counter was still zero. */
7473 7476          if (ip_rput_pullups++ == 0) {
7474 7477                  (void) mi_strlog(ill->ill_rq, 1, SL_ERROR|SL_TRACE,
7475 7478                      "ip_pullup: %s forced us to "
7476 7479                      " pullup pkt, hdr len %ld, hdr addr %p",
7477 7480                      ill->ill_name, len, (void *)mp->b_rptr);
7478 7481          }
                   /* Record the L2 source address before the message is pulled up. */
7479 7482          if (!(ira->ira_flags & IRAF_L2SRC_SET))
7480 7483                  ip_setl2src(mp, ira, ira->ira_rill);
7481 7484          ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
7482 7485          if (!pullupmsg(mp, len))
7483 7486                  return (NULL);
7484 7487          else
7485 7488                  return (mp->b_rptr);
7486 7489  }
7487 7490  
7488 7491  /*
7489 7492   * Make sure ira_l2src has an address. If we don't have one fill with zeros.
7490 7493   * When called from the ULP ira_rill will be NULL hence the caller has to
7491 7494   * pass in the ill.
            *
            * Does nothing if IRAF_L2SRC_SET is already set in ira_flags; otherwise
            * fills ill_phys_addr_length bytes of ira_l2src and sets that flag.
            * mp is not referenced by this function.
7492 7495   */
7493 7496  /* ARGSUSED */
7494 7497  void
7495 7498  ip_setl2src(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill)
7496 7499  {
7497 7500          const uchar_t *addr;
7498 7501          int alen;
7499 7502  
7500 7503          if (ira->ira_flags & IRAF_L2SRC_SET)
7501 7504                  return;
7502 7505  
7503 7506          ASSERT(ill != NULL);
7504 7507          alen = ill->ill_phys_addr_length;
7505 7508          ASSERT(alen <= sizeof (ira->ira_l2src));
                   /*
                    * Address source, in order of preference: the source address in
                    * ira_mhip; the ill's own physical address when
                    * IRAF_L2SRC_LOOPBACK is set; otherwise all zeros.
                    */
7506 7509          if (ira->ira_mhip != NULL &&
7507 7510              (addr = ira->ira_mhip->mhi_saddr) != NULL) {
7508 7511                  bcopy(addr, ira->ira_l2src, alen);
7509 7512          } else if ((ira->ira_flags & IRAF_L2SRC_LOOPBACK) &&
7510 7513              (addr = ill->ill_phys_addr) != NULL) {
7511 7514                  bcopy(addr, ira->ira_l2src, alen);
7512 7515          } else {
7513 7516                  bzero(ira->ira_l2src, alen);
7514 7517          }
7515 7518          ira->ira_flags |= IRAF_L2SRC_SET;
7516 7519  }
7517 7520  
7518 7521  /*
7519 7522   * check ip header length and align it.
            *
            * Ensures that at least min_size bytes are available through
            * ip_pullup(), unless an empty leading mblk was discarded and the next
            * mblk already passes OK_32PTR() and holds min_size bytes.
            *
            * Returns the (possibly different) message on success.  On failure the
            * message is freed and NULL is returned.
7520 7523   */
7521 7524  mblk_t *
7522 7525  ip_check_and_align_header(mblk_t *mp, uint_t min_size, ip_recv_attr_t *ira)
7523 7526  {
7524 7527          ill_t   *ill = ira->ira_ill;
7525 7528          ssize_t len;
7526 7529  
7527 7530          len = MBLKL(mp);
7528 7531  
                   /* Count which condition applies: failed OK_32PTR() or not. */
7529 7532          if (!OK_32PTR(mp->b_rptr))
7530 7533                  IP_STAT(ill->ill_ipst, ip_notaligned);
7531 7534          else
7532 7535                  IP_STAT(ill->ill_ipst, ip_recv_pullup);
7533 7536  
7534 7537          /* Guard against bogus device drivers */
7535 7538          if (len < 0) {
7536 7539                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
7537 7540                  ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
7538 7541                  freemsg(mp);
7539 7542                  return (NULL);
7540 7543          }
7541 7544  
7542 7545          if (len == 0) {
7543 7546                  /* GLD sometimes sends up mblk with b_rptr == b_wptr! */
7544 7547                  mblk_t *mp1 = mp->b_cont;
7545 7548  
                           /* Save the L2 source before the leading mblk is freed. */
7546 7549                  if (!(ira->ira_flags & IRAF_L2SRC_SET))
7547 7550                          ip_setl2src(mp, ira, ira->ira_rill);
7548 7551                  ASSERT(ira->ira_flags & IRAF_L2SRC_SET);
7549 7552  
7550 7553                  freeb(mp);
7551 7554                  mp = mp1;
                           /* Nothing followed the empty mblk; no MIB counter is bumped. */
7552 7555                  if (mp == NULL)
7553 7556                          return (NULL);
7554 7557  
7555 7558                  if (OK_32PTR(mp->b_rptr) && MBLKL(mp) >= min_size)
7556 7559                          return (mp);
7557 7560          }
7558 7561          if (ip_pullup(mp, min_size, ira) == NULL) {
                           /*
                            * Tell a message that is simply too short (header error)
                            * apart from any other pullup failure (discard).
                            */
7559 7562                  if (msgdsize(mp) < min_size) {
7560 7563                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
7561 7564                          ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
7562 7565                  } else {
7563 7566                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
7564 7567                          ip_drop_input("ipIfStatsInDiscards", mp, ill);
7565 7568                  }
7566 7569                  freemsg(mp);
7567 7570                  return (NULL);
7568 7571          }
7569 7572          return (mp);
7570 7573  }
7571 7574  
7572 7575  /*
7573 7576   * Common code for IPv4 and IPv6 to check and pullup multi-mblks
            *
            * Arguments:
            *   mp       - the message being checked.
            *   rptr     - base used to place b_wptr at rptr + pkt_len in the
            *              single-mblk case (presumably the start of the IP header;
            *              confirm against callers).
            *   len      - NOTE(review): appears to be the first mblk's data length
            *              minus pkt_len as computed by the caller; a negative
            *              total means the packet is truncated and a positive one
            *              is trailing pad.  Confirm against callers.
            *   pkt_len  - packet length taken from the IP header.
            *   min_size - smallest acceptable pkt_len.
            *   ira      - receive attributes; ira_ill is used for statistics.
            *
            * Returns mp with any pad trimmed, or NULL after counting the drop and
            * freeing the message.
7574 7577   */
7575 7578  mblk_t *
7576 7579  ip_check_length(mblk_t *mp, uchar_t *rptr, ssize_t len, uint_t pkt_len,
7577 7580      uint_t min_size, ip_recv_attr_t *ira)
7578 7581  {
7579 7582          ill_t   *ill = ira->ira_ill;
7580 7583  
7581 7584          /*
7582 7585           * Make sure we have data length consistent
7583 7586           * with the IP header.
7584 7587           */
7585 7588          if (mp->b_cont == NULL) {
7586 7589                  /* pkt_len is based on ipha_len, not the mblk length */
7587 7590                  if (pkt_len < min_size) {
7588 7591                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
7589 7592                          ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
7590 7593                          freemsg(mp);
7591 7594                          return (NULL);
7592 7595                  }
7593 7596                  if (len < 0) {
7594 7597                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
7595 7598                          ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
7596 7599                          freemsg(mp);
7597 7600                          return (NULL);
7598 7601                  }
7599 7602                  /* Drop any pad */
7600 7603                  mp->b_wptr = rptr + pkt_len;
                   /*
                    * Multi-mblk message: add the size of the rest of the chain to
                    * len.  A total of exactly zero needs no trimming and skips the
                    * checks below.
                    */
7601 7604          } else if ((len += msgdsize(mp->b_cont)) != 0) {
7602 7605                  ASSERT(pkt_len >= min_size);
7603 7606                  if (pkt_len < min_size) {
7604 7607                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
7605 7608                          ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
7606 7609                          freemsg(mp);
7607 7610                          return (NULL);
7608 7611                  }
7609 7612                  if (len < 0) {
7610 7613                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
7611 7614                          ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
7612 7615                          freemsg(mp);
7613 7616                          return (NULL);
7614 7617                  }
7615 7618                  /* Drop any pad */
7616 7619                  (void) adjmsg(mp, -len);
7617 7620                  /*
7618 7621                   * adjmsg may have freed an mblk from the chain, hence
7619 7622                   * invalidate any hw checksum here. This will force IP to
7620 7623                   * calculate the checksum in sw, but only for this packet.
7621 7624                   */
7622 7625                  DB_CKSUMFLAGS(mp) = 0;
7623 7626                  IP_STAT(ill->ill_ipst, ip_multimblk);
7624 7627          }
7625 7628          return (mp);
7626 7629  }
7627 7630  
7628 7631  /*
7629 7632   * Check that the IPv4 opt_len is consistent with the packet and pullup
7630 7633   * the options.
7631 7634   */
7632 7635  mblk_t *
7633 7636  ip_check_optlen(mblk_t *mp, ipha_t *ipha, uint_t opt_len, uint_t pkt_len,
7634 7637      ip_recv_attr_t *ira)
7635 7638  {
7636 7639          ill_t   *ill = ira->ira_ill;
7637 7640          ssize_t len;
7638 7641  
7639 7642          /* Assume no IPv6 packets arrive over the IPv4 queue */
7640 7643          if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
7641 7644                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
7642 7645                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsInWrongIPVersion);
7643 7646                  ip_drop_input("IPvN packet on IPv4 ill", mp, ill);
7644 7647                  freemsg(mp);
7645 7648                  return (NULL);
7646 7649          }
7647 7650  
7648 7651          if (opt_len > (15 - IP_SIMPLE_HDR_LENGTH_IN_WORDS)) {
7649 7652                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
7650 7653                  ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
7651 7654                  freemsg(mp);
7652 7655                  return (NULL);
7653 7656          }
7654 7657          /*
7655 7658           * Recompute complete header length and make sure we
7656 7659           * have access to all of it.
7657 7660           */
7658 7661          len = ((size_t)opt_len + IP_SIMPLE_HDR_LENGTH_IN_WORDS) << 2;
7659 7662          if (len > (mp->b_wptr - mp->b_rptr)) {
7660 7663                  if (len > pkt_len) {
7661 7664                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors);
7662 7665                          ip_drop_input("ipIfStatsInHdrErrors", mp, ill);
7663 7666                          freemsg(mp);
7664 7667                          return (NULL);
7665 7668                  }
7666 7669                  if (ip_pullup(mp, len, ira) == NULL) {
7667 7670                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
7668 7671                          ip_drop_input("ipIfStatsInDiscards", mp, ill);
7669 7672                          freemsg(mp);
7670 7673                          return (NULL);
7671 7674                  }
7672 7675          }
7673 7676          return (mp);
7674 7677  }
7675 7678  
7676 7679  /*
7677 7680   * Returns a new ire, or the same ire, or NULL.
7678 7681   * If a different IRE is returned, then it is held; the caller
7679 7682   * needs to release it.
7680 7683   * In no case is there any hold/release on the ire argument.
7681 7684   */
7682 7685  ire_t *
7683 7686  ip_check_multihome(void *addr, ire_t *ire, ill_t *ill)
7684 7687  {
7685 7688          ire_t           *new_ire;
7686 7689          ill_t           *ire_ill;
7687 7690          uint_t          ifindex;
7688 7691          ip_stack_t      *ipst = ill->ill_ipst;
7689 7692          boolean_t       strict_check = B_FALSE;
7690 7693  
7691 7694          /*
7692 7695           * IPMP common case: if IRE and ILL are in the same group, there's no
7693 7696           * issue (e.g. packet received on an underlying interface matched an
7694 7697           * IRE_LOCAL on its associated group interface).
7695 7698           */
7696 7699          ASSERT(ire->ire_ill != NULL);
7697 7700          if (IS_IN_SAME_ILLGRP(ill, ire->ire_ill))
7698 7701                  return (ire);
7699 7702  
7700 7703          /*
7701 7704           * Do another ire lookup here, using the ingress ill, to see if the
7702 7705           * interface is in a usesrc group.
7703 7706           * As long as the ills belong to the same group, we don't consider
7704 7707           * them to be arriving on the wrong interface. Thus, if the switch
7705 7708           * is doing inbound load spreading, we won't drop packets when the
7706 7709           * ip*_strict_dst_multihoming switch is on.
7707 7710           * We also need to check for IPIF_UNNUMBERED point2point interfaces
7708 7711           * where the local address may not be unique. In this case we were
7709 7712           * at the mercy of the initial ire lookup and the IRE_LOCAL it
7710 7713           * actually returned. The new lookup, which is more specific, should
7711 7714           * only find the IRE_LOCAL associated with the ingress ill if one
7712 7715           * exists.
7713 7716           */
7714 7717          if (ire->ire_ipversion == IPV4_VERSION) {
7715 7718                  if (ipst->ips_ip_strict_dst_multihoming)
7716 7719                          strict_check = B_TRUE;
7717 7720                  new_ire = ire_ftable_lookup_v4(*((ipaddr_t *)addr), 0, 0,
7718 7721                      IRE_LOCAL, ill, ALL_ZONES, NULL,
7719 7722                      (MATCH_IRE_TYPE|MATCH_IRE_ILL), 0, ipst, NULL);
7720 7723          } else {
7721 7724                  ASSERT(!IN6_IS_ADDR_MULTICAST((in6_addr_t *)addr));
7722 7725                  if (ipst->ips_ipv6_strict_dst_multihoming)
7723 7726                          strict_check = B_TRUE;
7724 7727                  new_ire = ire_ftable_lookup_v6((in6_addr_t *)addr, NULL, NULL,
7725 7728                      IRE_LOCAL, ill, ALL_ZONES, NULL,
7726 7729                      (MATCH_IRE_TYPE|MATCH_IRE_ILL), 0, ipst, NULL);
7727 7730          }
7728 7731          /*
7729 7732           * If the same ire that was returned in ip_input() is found then this
7730 7733           * is an indication that usesrc groups are in use. The packet
7731 7734           * arrived on a different ill in the group than the one associated with
7732 7735           * the destination address.  If a different ire was found then the same
7733 7736           * IP address must be hosted on multiple ills. This is possible with
7734 7737           * unnumbered point2point interfaces. We switch to use this new ire in
7735 7738           * order to have accurate interface statistics.
7736 7739           */
7737 7740          if (new_ire != NULL) {
7738 7741                  /* Note: held in one case but not the other? Caller handles */
7739 7742                  if (new_ire != ire)
7740 7743                          return (new_ire);
7741 7744                  /* Unchanged */
7742 7745                  ire_refrele(new_ire);
7743 7746                  return (ire);
7744 7747          }
7745 7748  
7746 7749          /*
7747 7750           * Chase pointers once and store locally.
7748 7751           */
7749 7752          ASSERT(ire->ire_ill != NULL);
7750 7753          ire_ill = ire->ire_ill;
7751 7754          ifindex = ill->ill_usesrc_ifindex;
7752 7755  
7753 7756          /*
7754 7757           * Check if it's a legal address on the 'usesrc' interface.
7755 7758           * For IPMP data addresses the IRE_LOCAL is the upper, hence we
7756 7759           * can just check phyint_ifindex.
7757 7760           */
7758 7761          if (ifindex != 0 && ifindex == ire_ill->ill_phyint->phyint_ifindex) {
7759 7762                  return (ire);
7760 7763          }
7761 7764  
7762 7765          /*
7763 7766           * If the ip*_strict_dst_multihoming switch is on then we can
7764 7767           * only accept this packet if the interface is marked as routing.
7765 7768           */
7766 7769          if (!(strict_check))
7767 7770                  return (ire);
7768 7771  
7769 7772          if ((ill->ill_flags & ire->ire_ill->ill_flags & ILLF_ROUTER) != 0) {
7770 7773                  return (ire);
7771 7774          }
7772 7775          return (NULL);
7773 7776  }
7774 7777  
7775 7778  /*
7776 7779   * This function is used to construct a mac_header_info_s from a
7777 7780   * DL_UNITDATA_IND message.
7778 7781   * The address fields in the mhi structure points into the message,
7779 7782   * thus the caller can't use those fields after freeing the message.
7780 7783   *
7781 7784   * We determine whether the packet received is a non-unicast packet
7782 7785   * and in doing so, determine whether or not it is broadcast vs multicast.
7783 7786   * For it to be a broadcast packet, we must have the appropriate mblk_t
7784 7787   * hanging off the ill_t.  If this is either not present or doesn't match
7785 7788   * the destination mac address in the DL_UNITDATA_IND, the packet is deemed
7786 7789   * to be multicast.  Thus NICs that have no broadcast address (or no
7787 7790   * capability for one, such as point to point links) cannot return as
7788 7791   * the packet being broadcast.
7789 7792   */
7790 7793  void
7791 7794  ip_dlur_to_mhi(ill_t *ill, mblk_t *mb, struct mac_header_info_s *mhip)
7792 7795  {
7793 7796          dl_unitdata_ind_t *ind = (dl_unitdata_ind_t *)mb->b_rptr;
7794 7797          mblk_t *bmp;
7795 7798          uint_t extra_offset;
7796 7799  
7797 7800          bzero(mhip, sizeof (struct mac_header_info_s));
7798 7801  
7799 7802          mhip->mhi_dsttype = MAC_ADDRTYPE_UNICAST;
7800 7803  
7801 7804          if (ill->ill_sap_length < 0)
7802 7805                  extra_offset = 0;
7803 7806          else
7804 7807                  extra_offset = ill->ill_sap_length;
7805 7808  
7806 7809          mhip->mhi_daddr = (uchar_t *)ind + ind->dl_dest_addr_offset +
7807 7810              extra_offset;
7808 7811          mhip->mhi_saddr = (uchar_t *)ind + ind->dl_src_addr_offset +
7809 7812              extra_offset;
7810 7813  
7811 7814          if (!ind->dl_group_address)
7812 7815                  return;
7813 7816  
7814 7817          /* Multicast or broadcast */
7815 7818          mhip->mhi_dsttype = MAC_ADDRTYPE_MULTICAST;
7816 7819  
7817 7820          if (ind->dl_dest_addr_offset > sizeof (*ind) &&
7818 7821              ind->dl_dest_addr_offset + ind->dl_dest_addr_length < MBLKL(mb) &&
7819 7822              (bmp = ill->ill_bcast_mp) != NULL) {
7820 7823                  dl_unitdata_req_t *dlur;
7821 7824                  uint8_t *bphys_addr;
7822 7825  
7823 7826                  dlur = (dl_unitdata_req_t *)bmp->b_rptr;
7824 7827                  bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset +
7825 7828                      extra_offset;
7826 7829  
7827 7830                  if (bcmp(mhip->mhi_daddr, bphys_addr,
7828 7831                      ind->dl_dest_addr_length) == 0)
7829 7832                          mhip->mhi_dsttype = MAC_ADDRTYPE_BROADCAST;
7830 7833          }
7831 7834  }
7832 7835  
7833 7836  /*
7834 7837   * This function is used to construct a mac_header_info_s from a
7835 7838   * M_DATA fastpath message from a DLPI driver.
7836 7839   * The address fields in the mhi structure points into the message,
7837 7840   * thus the caller can't use those fields after freeing the message.
7838 7841   *
7839 7842   * We determine whether the packet received is a non-unicast packet
7840 7843   * and in doing so, determine whether or not it is broadcast vs multicast.
7841 7844   * For it to be a broadcast packet, we must have the appropriate mblk_t
7842 7845   * hanging off the ill_t.  If this is either not present or doesn't match
7843 7846   * the destination mac address in the DL_UNITDATA_IND, the packet is deemed
7844 7847   * to be multicast.  Thus NICs that have no broadcast address (or no
7845 7848   * capability for one, such as point to point links) cannot return as
7846 7849   * the packet being broadcast.
7847 7850   */
7848 7851  void
7849 7852  ip_mdata_to_mhi(ill_t *ill, mblk_t *mp, struct mac_header_info_s *mhip)
7850 7853  {
7851 7854          mblk_t *bmp;
7852 7855          struct ether_header *pether;
7853 7856  
7854 7857          bzero(mhip, sizeof (struct mac_header_info_s));
7855 7858  
7856 7859          mhip->mhi_dsttype = MAC_ADDRTYPE_UNICAST;
7857 7860  
7858 7861          pether = (struct ether_header *)((char *)mp->b_rptr
7859 7862              - sizeof (struct ether_header));
7860 7863  
7861 7864          /*
7862 7865           * Make sure the interface is an ethernet type, since we don't
7863 7866           * know the header format for anything but Ethernet. Also make
7864 7867           * sure we are pointing correctly above db_base.
7865 7868           */
7866 7869          if (ill->ill_type != IFT_ETHER)
7867 7870                  return;
7868 7871  
7869 7872  retry:
7870 7873          if ((uchar_t *)pether < mp->b_datap->db_base)
7871 7874                  return;
7872 7875  
7873 7876          /* Is there a VLAN tag? */
7874 7877          if (ill->ill_isv6) {
7875 7878                  if (pether->ether_type != htons(ETHERTYPE_IPV6)) {
7876 7879                          pether = (struct ether_header *)((char *)pether - 4);
7877 7880                          goto retry;
7878 7881                  }
7879 7882          } else {
7880 7883                  if (pether->ether_type != htons(ETHERTYPE_IP)) {
7881 7884                          pether = (struct ether_header *)((char *)pether - 4);
7882 7885                          goto retry;
7883 7886                  }
7884 7887          }
7885 7888          mhip->mhi_daddr = (uchar_t *)&pether->ether_dhost;
7886 7889          mhip->mhi_saddr = (uchar_t *)&pether->ether_shost;
7887 7890  
7888 7891          if (!(mhip->mhi_daddr[0] & 0x01))
7889 7892                  return;
7890 7893  
7891 7894          /* Multicast or broadcast */
7892 7895          mhip->mhi_dsttype = MAC_ADDRTYPE_MULTICAST;
7893 7896  
7894 7897          if ((bmp = ill->ill_bcast_mp) != NULL) {
7895 7898                  dl_unitdata_req_t *dlur;
7896 7899                  uint8_t *bphys_addr;
7897 7900                  uint_t  addrlen;
7898 7901  
7899 7902                  dlur = (dl_unitdata_req_t *)bmp->b_rptr;
7900 7903                  addrlen = dlur->dl_dest_addr_length;
7901 7904                  if (ill->ill_sap_length < 0) {
7902 7905                          bphys_addr = (uchar_t *)dlur +
7903 7906                              dlur->dl_dest_addr_offset;
7904 7907                          addrlen += ill->ill_sap_length;
7905 7908                  } else {
7906 7909                          bphys_addr = (uchar_t *)dlur +
7907 7910                              dlur->dl_dest_addr_offset +
7908 7911                              ill->ill_sap_length;
7909 7912                          addrlen -= ill->ill_sap_length;
7910 7913                  }
7911 7914                  if (bcmp(mhip->mhi_daddr, bphys_addr, addrlen) == 0)
7912 7915                          mhip->mhi_dsttype = MAC_ADDRTYPE_BROADCAST;
7913 7916          }
7914 7917  }
7915 7918  
7916 7919  /*
7917 7920   * Handle anything but M_DATA messages
7918 7921   * We see the DL_UNITDATA_IND which are part
7919 7922   * of the data path, and also the other messages from the driver.
7920 7923   */
7921 7924  void
7922 7925  ip_rput_notdata(ill_t *ill, mblk_t *mp)
7923 7926  {
7924 7927          mblk_t          *first_mp;
7925 7928          struct iocblk   *iocp;
7926 7929          struct mac_header_info_s mhi;
7927 7930  
7928 7931          switch (DB_TYPE(mp)) {
7929 7932          case M_PROTO:
7930 7933          case M_PCPROTO: {
7931 7934                  if (((dl_unitdata_ind_t *)mp->b_rptr)->dl_primitive !=
7932 7935                      DL_UNITDATA_IND) {
7933 7936                          /* Go handle anything other than data elsewhere. */
7934 7937                          ip_rput_dlpi(ill, mp);
7935 7938                          return;
7936 7939                  }
7937 7940  
7938 7941                  first_mp = mp;
7939 7942                  mp = first_mp->b_cont;
7940 7943                  first_mp->b_cont = NULL;
7941 7944  
7942 7945                  if (mp == NULL) {
7943 7946                          freeb(first_mp);
7944 7947                          return;
7945 7948                  }
7946 7949                  ip_dlur_to_mhi(ill, first_mp, &mhi);
7947 7950                  if (ill->ill_isv6)
7948 7951                          ip_input_v6(ill, NULL, mp, &mhi);
7949 7952                  else
7950 7953                          ip_input(ill, NULL, mp, &mhi);
7951 7954  
7952 7955                  /* Ditch the DLPI header. */
7953 7956                  freeb(first_mp);
7954 7957                  return;
7955 7958          }
7956 7959          case M_IOCACK:
7957 7960                  iocp = (struct iocblk *)mp->b_rptr;
7958 7961                  switch (iocp->ioc_cmd) {
7959 7962                  case DL_IOC_HDR_INFO:
7960 7963                          ill_fastpath_ack(ill, mp);
7961 7964                          return;
7962 7965                  default:
7963 7966                          putnext(ill->ill_rq, mp);
7964 7967                          return;
7965 7968                  }
7966 7969                  /* FALLTHRU */
7967 7970          case M_ERROR:
7968 7971          case M_HANGUP:
7969 7972                  mutex_enter(&ill->ill_lock);
7970 7973                  if (ill->ill_state_flags & ILL_CONDEMNED) {
7971 7974                          mutex_exit(&ill->ill_lock);
7972 7975                          freemsg(mp);
7973 7976                          return;
7974 7977                  }
7975 7978                  ill_refhold_locked(ill);
7976 7979                  mutex_exit(&ill->ill_lock);
7977 7980                  qwriter_ip(ill, ill->ill_rq, mp, ip_rput_other, CUR_OP,
7978 7981                      B_FALSE);
7979 7982                  return;
7980 7983          case M_CTL:
7981 7984                  putnext(ill->ill_rq, mp);
7982 7985                  return;
7983 7986          case M_IOCNAK:
7984 7987                  ip1dbg(("got iocnak "));
7985 7988                  iocp = (struct iocblk *)mp->b_rptr;
7986 7989                  switch (iocp->ioc_cmd) {
7987 7990                  case DL_IOC_HDR_INFO:
7988 7991                          ip_rput_other(NULL, ill->ill_rq, mp, NULL);
7989 7992                          return;
7990 7993                  default:
7991 7994                          break;
7992 7995                  }
7993 7996                  /* FALLTHRU */
7994 7997          default:
7995 7998                  putnext(ill->ill_rq, mp);
7996 7999                  return;
7997 8000          }
7998 8001  }
7999 8002  
8000 8003  /* Read side put procedure.  Packets coming from the wire arrive here. */
8001 8004  void
8002 8005  ip_rput(queue_t *q, mblk_t *mp)
8003 8006  {
8004 8007          ill_t   *ill;
8005 8008          union DL_primitives *dl;
8006 8009  
8007 8010          ill = (ill_t *)q->q_ptr;
8008 8011  
8009 8012          if (ill->ill_state_flags & (ILL_CONDEMNED | ILL_LL_SUBNET_PENDING)) {
8010 8013                  /*
8011 8014                   * If things are opening or closing, only accept high-priority
8012 8015                   * DLPI messages.  (On open ill->ill_ipif has not yet been
8013 8016                   * created; on close, things hanging off the ill may have been
8014 8017                   * freed already.)
8015 8018                   */
8016 8019                  dl = (union DL_primitives *)mp->b_rptr;
8017 8020                  if (DB_TYPE(mp) != M_PCPROTO ||
8018 8021                      dl->dl_primitive == DL_UNITDATA_IND) {
8019 8022                          inet_freemsg(mp);
8020 8023                          return;
8021 8024                  }
8022 8025          }
8023 8026          if (DB_TYPE(mp) == M_DATA) {
8024 8027                  struct mac_header_info_s mhi;
8025 8028  
8026 8029                  ip_mdata_to_mhi(ill, mp, &mhi);
8027 8030                  ip_input(ill, NULL, mp, &mhi);
8028 8031          } else {
8029 8032                  ip_rput_notdata(ill, mp);
8030 8033          }
8031 8034  }
8032 8035  
8033 8036  /*
8034 8037   * Move the information to a copy.
8035 8038   */
8036 8039  mblk_t *
8037 8040  ip_fix_dbref(mblk_t *mp, ip_recv_attr_t *ira)
8038 8041  {
8039 8042          mblk_t          *mp1;
8040 8043          ill_t           *ill = ira->ira_ill;
8041 8044          ip_stack_t      *ipst = ill->ill_ipst;
8042 8045  
8043 8046          IP_STAT(ipst, ip_db_ref);
8044 8047  
8045 8048          /* Make sure we have ira_l2src before we loose the original mblk */
8046 8049          if (!(ira->ira_flags & IRAF_L2SRC_SET))
8047 8050                  ip_setl2src(mp, ira, ira->ira_rill);
8048 8051  
8049 8052          mp1 = copymsg(mp);
8050 8053          if (mp1 == NULL) {
8051 8054                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
8052 8055                  ip_drop_input("ipIfStatsInDiscards", mp, ill);
8053 8056                  freemsg(mp);
8054 8057                  return (NULL);
8055 8058          }
8056 8059          /* preserve the hardware checksum flags and data, if present */
8057 8060          if (DB_CKSUMFLAGS(mp) != 0) {
8058 8061                  DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp);
8059 8062                  DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp);
8060 8063                  DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp);
8061 8064                  DB_CKSUMEND(mp1) = DB_CKSUMEND(mp);
8062 8065                  DB_CKSUM16(mp1) = DB_CKSUM16(mp);
8063 8066          }
8064 8067          freemsg(mp);
8065 8068          return (mp1);
8066 8069  }
8067 8070  
8068 8071  static void
8069 8072  ip_dlpi_error(ill_t *ill, t_uscalar_t prim, t_uscalar_t dl_err,
8070 8073      t_uscalar_t err)
8071 8074  {
8072 8075          if (dl_err == DL_SYSERR) {
8073 8076                  (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE,
8074 8077                      "%s: %s failed: DL_SYSERR (errno %u)\n",
8075 8078                      ill->ill_name, dl_primstr(prim), err);
8076 8079                  return;
8077 8080          }
8078 8081  
8079 8082          (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE,
8080 8083              "%s: %s failed: %s\n", ill->ill_name, dl_primstr(prim),
8081 8084              dl_errstr(dl_err));
8082 8085  }
8083 8086  
8084 8087  /*
8085 8088   * ip_rput_dlpi is called by ip_rput to handle all DLPI messages other
8086 8089   * than DL_UNITDATA_IND messages. If we need to process this message
8087 8090   * exclusively, we call qwriter_ip, in which case we also need to call
8088 8091   * ill_refhold before that, since qwriter_ip does an ill_refrele.
8089 8092   */
8090 8093  void
8091 8094  ip_rput_dlpi(ill_t *ill, mblk_t *mp)
8092 8095  {
8093 8096          dl_ok_ack_t     *dloa = (dl_ok_ack_t *)mp->b_rptr;
8094 8097          dl_error_ack_t  *dlea = (dl_error_ack_t *)dloa;
8095 8098          queue_t         *q = ill->ill_rq;
8096 8099          t_uscalar_t     prim = dloa->dl_primitive;
8097 8100          t_uscalar_t     reqprim = DL_PRIM_INVAL;
8098 8101  
8099 8102          DTRACE_PROBE3(ill__dlpi, char *, "ip_rput_dlpi",
8100 8103              char *, dl_primstr(prim), ill_t *, ill);
8101 8104          ip1dbg(("ip_rput_dlpi"));
8102 8105  
8103 8106          /*
8104 8107           * If we received an ACK but didn't send a request for it, then it
8105 8108           * can't be part of any pending operation; discard up-front.
8106 8109           */
8107 8110          switch (prim) {
8108 8111          case DL_ERROR_ACK:
8109 8112                  reqprim = dlea->dl_error_primitive;
8110 8113                  ip2dbg(("ip_rput_dlpi(%s): DL_ERROR_ACK for %s (0x%x): %s "
8111 8114                      "(0x%x), unix %u\n", ill->ill_name, dl_primstr(reqprim),
8112 8115                      reqprim, dl_errstr(dlea->dl_errno), dlea->dl_errno,
8113 8116                      dlea->dl_unix_errno));
8114 8117                  break;
8115 8118          case DL_OK_ACK:
8116 8119                  reqprim = dloa->dl_correct_primitive;
8117 8120                  break;
8118 8121          case DL_INFO_ACK:
8119 8122                  reqprim = DL_INFO_REQ;
8120 8123                  break;
8121 8124          case DL_BIND_ACK:
8122 8125                  reqprim = DL_BIND_REQ;
8123 8126                  break;
8124 8127          case DL_PHYS_ADDR_ACK:
8125 8128                  reqprim = DL_PHYS_ADDR_REQ;
8126 8129                  break;
8127 8130          case DL_NOTIFY_ACK:
8128 8131                  reqprim = DL_NOTIFY_REQ;
8129 8132                  break;
8130 8133          case DL_CAPABILITY_ACK:
8131 8134                  reqprim = DL_CAPABILITY_REQ;
8132 8135                  break;
8133 8136          }
8134 8137  
8135 8138          if (prim != DL_NOTIFY_IND) {
8136 8139                  if (reqprim == DL_PRIM_INVAL ||
8137 8140                      !ill_dlpi_pending(ill, reqprim)) {
8138 8141                          /* Not a DLPI message we support or expected */
8139 8142                          freemsg(mp);
8140 8143                          return;
8141 8144                  }
8142 8145                  ip1dbg(("ip_rput: received %s for %s\n", dl_primstr(prim),
8143 8146                      dl_primstr(reqprim)));
8144 8147          }
8145 8148  
8146 8149          switch (reqprim) {
8147 8150          case DL_UNBIND_REQ:
8148 8151                  /*
8149 8152                   * NOTE: we mark the unbind as complete even if we got a
8150 8153                   * DL_ERROR_ACK, since there's not much else we can do.
8151 8154                   */
8152 8155                  mutex_enter(&ill->ill_lock);
8153 8156                  ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS;
8154 8157                  cv_signal(&ill->ill_cv);
8155 8158                  mutex_exit(&ill->ill_lock);
8156 8159                  break;
8157 8160  
8158 8161          case DL_ENABMULTI_REQ:
8159 8162                  if (prim == DL_OK_ACK) {
8160 8163                          if (ill->ill_dlpi_multicast_state == IDS_INPROGRESS)
8161 8164                                  ill->ill_dlpi_multicast_state = IDS_OK;
8162 8165                  }
8163 8166                  break;
8164 8167          }
8165 8168  
8166 8169          /*
8167 8170           * The message is one we're waiting for (or DL_NOTIFY_IND), but we
8168 8171           * need to become writer to continue to process it.  Because an
8169 8172           * exclusive operation doesn't complete until replies to all queued
8170 8173           * DLPI messages have been received, we know we're in the middle of an
8171 8174           * exclusive operation and pass CUR_OP (except for DL_NOTIFY_IND).
8172 8175           *
8173 8176           * As required by qwriter_ip(), we refhold the ill; it will refrele.
8174 8177           * Since this is on the ill stream we unconditionally bump up the
8175 8178           * refcount without doing ILL_CAN_LOOKUP().
8176 8179           */
8177 8180          ill_refhold(ill);
8178 8181          if (prim == DL_NOTIFY_IND)
8179 8182                  qwriter_ip(ill, q, mp, ip_rput_dlpi_writer, NEW_OP, B_FALSE);
8180 8183          else
8181 8184                  qwriter_ip(ill, q, mp, ip_rput_dlpi_writer, CUR_OP, B_FALSE);
8182 8185  }
8183 8186  
8184 8187  /*
8185 8188   * Handling of DLPI messages that require exclusive access to the ipsq.
8186 8189   *
8187 8190   * Need to do ipsq_pending_mp_get on ioctl completion, which could
8188 8191   * happen here. (along with mi_copy_done)
8189 8192   */
8190 8193  /* ARGSUSED */
8191 8194  static void
8192 8195  ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
8193 8196  {
8194 8197          dl_ok_ack_t     *dloa = (dl_ok_ack_t *)mp->b_rptr;
8195 8198          dl_error_ack_t  *dlea = (dl_error_ack_t *)dloa;
8196 8199          int             err = 0;
8197 8200          ill_t           *ill = (ill_t *)q->q_ptr;
8198 8201          ipif_t          *ipif = NULL;
8199 8202          mblk_t          *mp1 = NULL;
8200 8203          conn_t          *connp = NULL;
8201 8204          t_uscalar_t     paddrreq;
8202 8205          mblk_t          *mp_hw;
8203 8206          boolean_t       success;
8204 8207          boolean_t       ioctl_aborted = B_FALSE;
8205 8208          boolean_t       log = B_TRUE;
8206 8209  
8207 8210          DTRACE_PROBE3(ill__dlpi, char *, "ip_rput_dlpi_writer",
8208 8211              char *, dl_primstr(dloa->dl_primitive), ill_t *, ill);
8209 8212  
8210 8213          ip1dbg(("ip_rput_dlpi_writer .."));
8211 8214          ASSERT(ipsq->ipsq_xop == ill->ill_phyint->phyint_ipsq->ipsq_xop);
8212 8215          ASSERT(IAM_WRITER_ILL(ill));
8213 8216  
8214 8217          ipif = ipsq->ipsq_xop->ipx_pending_ipif;
8215 8218          /*
8216 8219           * The current ioctl could have been aborted by the user and a new
8217 8220           * ioctl to bring up another ill could have started. We could still
8218 8221           * get a response from the driver later.
8219 8222           */
8220 8223          if (ipif != NULL && ipif->ipif_ill != ill)
8221 8224                  ioctl_aborted = B_TRUE;
8222 8225  
8223 8226          switch (dloa->dl_primitive) {
8224 8227          case DL_ERROR_ACK:
8225 8228                  ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for %s\n",
8226 8229                      dl_primstr(dlea->dl_error_primitive)));
8227 8230  
8228 8231                  DTRACE_PROBE3(ill__dlpi, char *, "ip_rput_dlpi_writer error",
8229 8232                      char *, dl_primstr(dlea->dl_error_primitive),
8230 8233                      ill_t *, ill);
8231 8234  
8232 8235                  switch (dlea->dl_error_primitive) {
8233 8236                  case DL_DISABMULTI_REQ:
8234 8237                          ill_dlpi_done(ill, dlea->dl_error_primitive);
8235 8238                          break;
8236 8239                  case DL_PROMISCON_REQ:
8237 8240                  case DL_PROMISCOFF_REQ:
8238 8241                  case DL_UNBIND_REQ:
8239 8242                  case DL_ATTACH_REQ:
8240 8243                  case DL_INFO_REQ:
8241 8244                          ill_dlpi_done(ill, dlea->dl_error_primitive);
8242 8245                          break;
8243 8246                  case DL_NOTIFY_REQ:
8244 8247                          ill_dlpi_done(ill, DL_NOTIFY_REQ);
8245 8248                          log = B_FALSE;
8246 8249                          break;
8247 8250                  case DL_PHYS_ADDR_REQ:
8248 8251                          /*
8249 8252                           * For IPv6 only, there are two additional
8250 8253                           * phys_addr_req's sent to the driver to get the
8251 8254                           * IPv6 token and lla. This allows IP to acquire
8252 8255                           * the hardware address format for a given interface
8253 8256                           * without having built in knowledge of the hardware
8254 8257                           * address. ill_phys_addr_pend keeps track of the last
8255 8258                           * DL_PAR sent so we know which response we are
8256 8259                           * dealing with. ill_dlpi_done will update
8257 8260                           * ill_phys_addr_pend when it sends the next req.
8258 8261                           * We don't complete the IOCTL until all three DL_PARs
8259 8262                           * have been attempted, so set *_len to 0 and break.
8260 8263                           */
8261 8264                          paddrreq = ill->ill_phys_addr_pend;
8262 8265                          ill_dlpi_done(ill, DL_PHYS_ADDR_REQ);
8263 8266                          if (paddrreq == DL_IPV6_TOKEN) {
8264 8267                                  ill->ill_token_length = 0;
8265 8268                                  log = B_FALSE;
8266 8269                                  break;
8267 8270                          } else if (paddrreq == DL_IPV6_LINK_LAYER_ADDR) {
8268 8271                                  ill->ill_nd_lla_len = 0;
8269 8272                                  log = B_FALSE;
8270 8273                                  break;
8271 8274                          }
8272 8275                          /*
8273 8276                           * Something went wrong with the DL_PHYS_ADDR_REQ.
8274 8277                           * We presumably have an IOCTL hanging out waiting
8275 8278                           * for completion. Find it and complete the IOCTL
8276 8279                           * with the error noted.
8277 8280                           * However, ill_dl_phys was called on an ill queue
8278 8281                           * (from SIOCSLIFNAME), thus conn_pending_ill is not
8279 8282                           * set. But the ioctl is known to be pending on ill_wq.
8280 8283                           */
8281 8284                          if (!ill->ill_ifname_pending)
8282 8285                                  break;
8283 8286                          ill->ill_ifname_pending = 0;
8284 8287                          if (!ioctl_aborted)
8285 8288                                  mp1 = ipsq_pending_mp_get(ipsq, &connp);
8286 8289                          if (mp1 != NULL) {
8287 8290                                  /*
8288 8291                                   * This operation (SIOCSLIFNAME) must have
8289 8292                                   * happened on the ill. Assert there is no conn
8290 8293                                   */
8291 8294                                  ASSERT(connp == NULL);
8292 8295                                  q = ill->ill_wq;
8293 8296                          }
8294 8297                          break;
8295 8298                  case DL_BIND_REQ:
8296 8299                          ill_dlpi_done(ill, DL_BIND_REQ);
8297 8300                          if (ill->ill_ifname_pending)
8298 8301                                  break;
8299 8302                          mutex_enter(&ill->ill_lock);
8300 8303                          ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS;
8301 8304                          mutex_exit(&ill->ill_lock);
8302 8305                          /*
8303 8306                           * Something went wrong with the bind.  We presumably
8304 8307                           * have an IOCTL hanging out waiting for completion.
8305 8308                           * Find it, take down the interface that was coming
8306 8309                           * up, and complete the IOCTL with the error noted.
8307 8310                           */
8308 8311                          if (!ioctl_aborted)
8309 8312                                  mp1 = ipsq_pending_mp_get(ipsq, &connp);
8310 8313                          if (mp1 != NULL) {
8311 8314                                  /*
8312 8315                                   * This might be a result of a DL_NOTE_REPLUMB
8313 8316                                   * notification. In that case, connp is NULL.
8314 8317                                   */
8315 8318                                  if (connp != NULL)
8316 8319                                          q = CONNP_TO_WQ(connp);
8317 8320  
8318 8321                                  (void) ipif_down(ipif, NULL, NULL);
8319 8322                                  /* error is set below the switch */
8320 8323                          }
8321 8324                          break;
8322 8325                  case DL_ENABMULTI_REQ:
8323 8326                          ill_dlpi_done(ill, DL_ENABMULTI_REQ);
8324 8327  
8325 8328                          if (ill->ill_dlpi_multicast_state == IDS_INPROGRESS)
8326 8329                                  ill->ill_dlpi_multicast_state = IDS_FAILED;
8327 8330                          if (ill->ill_dlpi_multicast_state == IDS_FAILED) {
8328 8331  
8329 8332                                  printf("ip: joining multicasts failed (%d)"
8330 8333                                      " on %s - will use link layer "
8331 8334                                      "broadcasts for multicast\n",
8332 8335                                      dlea->dl_errno, ill->ill_name);
8333 8336  
8334 8337                                  /*
8335 8338                                   * Set up for multi_bcast; We are the
8336 8339                                   * writer, so ok to access ill->ill_ipif
8337 8340                                   * without any lock.
8338 8341                                   */
8339 8342                                  mutex_enter(&ill->ill_phyint->phyint_lock);
8340 8343                                  ill->ill_phyint->phyint_flags |=
8341 8344                                      PHYI_MULTI_BCAST;
8342 8345                                  mutex_exit(&ill->ill_phyint->phyint_lock);
8343 8346  
8344 8347                          }
8345 8348                          freemsg(mp);    /* Don't want to pass this up */
8346 8349                          return;
8347 8350                  case DL_CAPABILITY_REQ:
8348 8351                          ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for "
8349 8352                              "DL_CAPABILITY REQ\n"));
8350 8353                          if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT)
8351 8354                                  ill->ill_dlpi_capab_state = IDCS_FAILED;
8352 8355                          ill_capability_done(ill);
8353 8356                          freemsg(mp);
8354 8357                          return;
8355 8358                  }
8356 8359                  /*
8357 8360                   * Note the error for IOCTL completion (mp1 is set when
8358 8361                   * ready to complete ioctl). If ill_ifname_pending_err is
8359 8362                   * set, an error occurred during plumbing (ill_ifname_pending),
8360 8363                   * so we want to report that error.
8361 8364                   *
8362 8365                   * NOTE: there are two additional DL_PHYS_ADDR_REQ's
8363 8366                   * (DL_IPV6_TOKEN and DL_IPV6_LINK_LAYER_ADDR) that are
8364 8367                   * expected to get errack'd if the driver doesn't support
8365 8368                   * these flags (e.g. ethernet). log will be set to B_FALSE
8366 8369                   * if these error conditions are encountered.
8367 8370                   */
8368 8371                  if (mp1 != NULL) {
8369 8372                          if (ill->ill_ifname_pending_err != 0)  {
8370 8373                                  err = ill->ill_ifname_pending_err;
8371 8374                                  ill->ill_ifname_pending_err = 0;
8372 8375                          } else {
8373 8376                                  err = dlea->dl_unix_errno ?
8374 8377                                      dlea->dl_unix_errno : ENXIO;
8375 8378                          }
8376 8379                  /*
8377 8380                   * If we're plumbing an interface and an error hasn't already
8378 8381                   * been saved, set ill_ifname_pending_err to the error passed
8379 8382                   * up. Ignore the error if log is B_FALSE (see comment above).
8380 8383                   */
8381 8384                  } else if (log && ill->ill_ifname_pending &&
8382 8385                      ill->ill_ifname_pending_err == 0) {
8383 8386                          ill->ill_ifname_pending_err = dlea->dl_unix_errno ?
8384 8387                              dlea->dl_unix_errno : ENXIO;
8385 8388                  }
8386 8389  
8387 8390                  if (log)
8388 8391                          ip_dlpi_error(ill, dlea->dl_error_primitive,
8389 8392                              dlea->dl_errno, dlea->dl_unix_errno);
8390 8393                  break;
8391 8394          case DL_CAPABILITY_ACK:
8392 8395                  ill_capability_ack(ill, mp);
8393 8396                  /*
8394 8397                   * The message has been handed off to ill_capability_ack
8395 8398                   * and must not be freed below
8396 8399                   */
8397 8400                  mp = NULL;
8398 8401                  break;
8399 8402  
8400 8403          case DL_INFO_ACK:
8401 8404                  /* Call a routine to handle this one. */
8402 8405                  ill_dlpi_done(ill, DL_INFO_REQ);
8403 8406                  ip_ll_subnet_defaults(ill, mp);
8404 8407                  ASSERT(!MUTEX_HELD(&ill->ill_phyint->phyint_ipsq->ipsq_lock));
8405 8408                  return;
8406 8409          case DL_BIND_ACK:
8407 8410                  /*
8408 8411                   * We should have an IOCTL waiting on this unless
8409 8412                   * sent by ill_dl_phys, in which case just return
8410 8413                   */
8411 8414                  ill_dlpi_done(ill, DL_BIND_REQ);
8412 8415  
8413 8416                  if (ill->ill_ifname_pending) {
8414 8417                          DTRACE_PROBE2(ip__rput__dlpi__ifname__pending,
8415 8418                              ill_t *, ill, mblk_t *, mp);
8416 8419                          break;
8417 8420                  }
8418 8421                  mutex_enter(&ill->ill_lock);
8419 8422                  ill->ill_dl_up = 1;
8420 8423                  ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS;
8421 8424                  mutex_exit(&ill->ill_lock);
8422 8425  
8423 8426                  if (!ioctl_aborted)
8424 8427                          mp1 = ipsq_pending_mp_get(ipsq, &connp);
8425 8428                  if (mp1 == NULL) {
8426 8429                          DTRACE_PROBE1(ip__rput__dlpi__no__mblk, ill_t *, ill);
8427 8430                          break;
8428 8431                  }
8429 8432                  /*
8430 8433                   * mp1 was added by ill_dl_up(). If that is a result of
8431 8434                   * a DL_NOTE_REPLUMB notification, connp could be NULL.
8432 8435                   */
8433 8436                  if (connp != NULL)
8434 8437                          q = CONNP_TO_WQ(connp);
8435 8438                  /*
8436 8439                   * We are exclusive. So nothing can change even after
8437 8440                   * we get the pending mp.
8438 8441                   */
8439 8442                  ip1dbg(("ip_rput_dlpi: bind_ack %s\n", ill->ill_name));
8440 8443                  DTRACE_PROBE1(ip__rput__dlpi__bind__ack, ill_t *, ill);
8441 8444                  ill_nic_event_dispatch(ill, 0, NE_UP, NULL, 0);
8442 8445  
8443 8446                  /*
8444 8447                   * Now bring up the resolver; when that is complete, we'll
8445 8448                   * create IREs.  Note that we intentionally mirror what
8446 8449                   * ipif_up() would have done, because we got here by way of
8447 8450                   * ill_dl_up(), which stopped ipif_up()'s processing.
8448 8451                   */
8449 8452                  if (ill->ill_isv6) {
8450 8453                          /*
8451 8454                           * v6 interfaces.
8452 8455                           * Unlike ARP which has to do another bind
8453 8456                           * and attach, once we get here we are
8454 8457                           * done with NDP
8455 8458                           */
8456 8459                          (void) ipif_resolver_up(ipif, Res_act_initial);
8457 8460                          if ((err = ipif_ndp_up(ipif, B_TRUE)) == 0)
8458 8461                                  err = ipif_up_done_v6(ipif);
8459 8462                  } else if (ill->ill_net_type == IRE_IF_RESOLVER) {
8460 8463                          /*
8461 8464                           * ARP and other v4 external resolvers.
8462 8465                           * Leave the pending mblk intact so that
8463 8466                           * the ioctl completes in ip_rput().
8464 8467                           */
8465 8468                          if (connp != NULL)
8466 8469                                  mutex_enter(&connp->conn_lock);
8467 8470                          mutex_enter(&ill->ill_lock);
8468 8471                          success = ipsq_pending_mp_add(connp, ipif, q, mp1, 0);
8469 8472                          mutex_exit(&ill->ill_lock);
8470 8473                          if (connp != NULL)
8471 8474                                  mutex_exit(&connp->conn_lock);
8472 8475                          if (success) {
8473 8476                                  err = ipif_resolver_up(ipif, Res_act_initial);
8474 8477                                  if (err == EINPROGRESS) {
8475 8478                                          freemsg(mp);
8476 8479                                          return;
8477 8480                                  }
8478 8481                                  mp1 = ipsq_pending_mp_get(ipsq, &connp);
8479 8482                          } else {
8480 8483                                  /* The conn has started closing */
8481 8484                                  err = EINTR;
8482 8485                          }
8483 8486                  } else {
8484 8487                          /*
8485 8488                           * This one is complete. Reply to pending ioctl.
8486 8489                           */
8487 8490                          (void) ipif_resolver_up(ipif, Res_act_initial);
8488 8491                          err = ipif_up_done(ipif);
8489 8492                  }
8490 8493  
8491 8494                  if ((err == 0) && (ill->ill_up_ipifs)) {
8492 8495                          err = ill_up_ipifs(ill, q, mp1);
8493 8496                          if (err == EINPROGRESS) {
8494 8497                                  freemsg(mp);
8495 8498                                  return;
8496 8499                          }
8497 8500                  }
8498 8501  
8499 8502                  /*
8500 8503                   * If we have a moved ipif to bring up, and everything has
8501 8504                   * succeeded to this point, bring it up on the IPMP ill.
8502 8505                   * Otherwise, leave it down -- the admin can try to bring it
8503 8506                   * up by hand if need be.
8504 8507                   */
8505 8508                  if (ill->ill_move_ipif != NULL) {
8506 8509                          if (err != 0) {
8507 8510                                  ill->ill_move_ipif = NULL;
8508 8511                          } else {
8509 8512                                  ipif = ill->ill_move_ipif;
8510 8513                                  ill->ill_move_ipif = NULL;
8511 8514                                  err = ipif_up(ipif, q, mp1);
8512 8515                                  if (err == EINPROGRESS) {
8513 8516                                          freemsg(mp);
8514 8517                                          return;
8515 8518                                  }
8516 8519                          }
8517 8520                  }
8518 8521                  break;
8519 8522  
8520 8523          case DL_NOTIFY_IND: {
8521 8524                  dl_notify_ind_t *notify = (dl_notify_ind_t *)mp->b_rptr;
8522 8525                  uint_t orig_mtu, orig_mc_mtu;
8523 8526  
8524 8527                  switch (notify->dl_notification) {
8525 8528                  case DL_NOTE_PHYS_ADDR:
8526 8529                          err = ill_set_phys_addr(ill, mp);
8527 8530                          break;
8528 8531  
8529 8532                  case DL_NOTE_REPLUMB:
8530 8533                          /*
8531 8534                           * Directly return after calling ill_replumb().
8532 8535                           * Note that we should not free mp as it is reused
8533 8536                           * in the ill_replumb() function.
8534 8537                           */
8535 8538                          err = ill_replumb(ill, mp);
8536 8539                          return;
8537 8540  
8538 8541                  case DL_NOTE_FASTPATH_FLUSH:
8539 8542                          nce_flush(ill, B_FALSE);
8540 8543                          break;
8541 8544  
8542 8545                  case DL_NOTE_SDU_SIZE:
8543 8546                  case DL_NOTE_SDU_SIZE2:
8544 8547                          /*
8545 8548                           * The dce and fragmentation code can cope with
8546 8549                           * this changing while packets are being sent.
8547 8550                           * When packets are sent ip_output will discover
8548 8551                           * a change.
8549 8552                           *
8550 8553                           * Change the MTU size of the interface.
8551 8554                           */
8552 8555                          mutex_enter(&ill->ill_lock);
8553 8556                          orig_mtu = ill->ill_mtu;
8554 8557                          orig_mc_mtu = ill->ill_mc_mtu;
8555 8558                          switch (notify->dl_notification) {
8556 8559                          case DL_NOTE_SDU_SIZE:
8557 8560                                  ill->ill_current_frag =
8558 8561                                      (uint_t)notify->dl_data;
8559 8562                                  ill->ill_mc_mtu = (uint_t)notify->dl_data;
8560 8563                                  break;
8561 8564                          case DL_NOTE_SDU_SIZE2:
8562 8565                                  ill->ill_current_frag =
8563 8566                                      (uint_t)notify->dl_data1;
8564 8567                                  ill->ill_mc_mtu = (uint_t)notify->dl_data2;
8565 8568                                  break;
8566 8569                          }
8567 8570                          if (ill->ill_current_frag > ill->ill_max_frag)
8568 8571                                  ill->ill_max_frag = ill->ill_current_frag;
8569 8572  
8570 8573                          if (!(ill->ill_flags & ILLF_FIXEDMTU)) {
8571 8574                                  ill->ill_mtu = ill->ill_current_frag;
8572 8575  
8573 8576                                  /*
8574 8577                                   * If ill_user_mtu was set (via
8575 8578                                   * SIOCSLIFLNKINFO), clamp ill_mtu at it.
8576 8579                                   */
8577 8580                                  if (ill->ill_user_mtu != 0 &&
8578 8581                                      ill->ill_user_mtu < ill->ill_mtu)
8579 8582                                          ill->ill_mtu = ill->ill_user_mtu;
8580 8583  
8581 8584                                  if (ill->ill_user_mtu != 0 &&
8582 8585                                      ill->ill_user_mtu < ill->ill_mc_mtu)
8583 8586                                          ill->ill_mc_mtu = ill->ill_user_mtu;
8584 8587  
8585 8588                                  if (ill->ill_isv6) {
8586 8589                                          if (ill->ill_mtu < IPV6_MIN_MTU)
8587 8590                                                  ill->ill_mtu = IPV6_MIN_MTU;
8588 8591                                          if (ill->ill_mc_mtu < IPV6_MIN_MTU)
8589 8592                                                  ill->ill_mc_mtu = IPV6_MIN_MTU;
8590 8593                                  } else {
8591 8594                                          if (ill->ill_mtu < IP_MIN_MTU)
8592 8595                                                  ill->ill_mtu = IP_MIN_MTU;
8593 8596                                          if (ill->ill_mc_mtu < IP_MIN_MTU)
8594 8597                                                  ill->ill_mc_mtu = IP_MIN_MTU;
8595 8598                                  }
8596 8599                          } else if (ill->ill_mc_mtu > ill->ill_mtu) {
8597 8600                                  ill->ill_mc_mtu = ill->ill_mtu;
8598 8601                          }
8599 8602  
8600 8603                          mutex_exit(&ill->ill_lock);
8601 8604                          /*
8602 8605                           * Make sure all dce_generation checks find out
8603 8606                           * that ill_mtu/ill_mc_mtu has changed.
8604 8607                           */
8605 8608                          if (orig_mtu != ill->ill_mtu ||
8606 8609                              orig_mc_mtu != ill->ill_mc_mtu) {
8607 8610                                  dce_increment_all_generations(ill->ill_isv6,
8608 8611                                      ill->ill_ipst);
8609 8612                          }
8610 8613  
8611 8614                          /*
8612 8615                           * Refresh IPMP meta-interface MTU if necessary.
8613 8616                           */
8614 8617                          if (IS_UNDER_IPMP(ill))
8615 8618                                  ipmp_illgrp_refresh_mtu(ill->ill_grp);
8616 8619                          break;
8617 8620  
8618 8621                  case DL_NOTE_LINK_UP:
8619 8622                  case DL_NOTE_LINK_DOWN: {
8620 8623                          /*
8621 8624                           * We are writer. ill / phyint / ipsq assocs stable.
8622 8625                           * The RUNNING flag reflects the state of the link.
8623 8626                           */
8624 8627                          phyint_t *phyint = ill->ill_phyint;
8625 8628                          uint64_t new_phyint_flags;
8626 8629                          boolean_t changed = B_FALSE;
8627 8630                          boolean_t went_up;
8628 8631  
8629 8632                          went_up = notify->dl_notification == DL_NOTE_LINK_UP;
8630 8633                          mutex_enter(&phyint->phyint_lock);
8631 8634  
8632 8635                          new_phyint_flags = went_up ?
8633 8636                              phyint->phyint_flags | PHYI_RUNNING :
8634 8637                              phyint->phyint_flags & ~PHYI_RUNNING;
8635 8638  
8636 8639                          if (IS_IPMP(ill)) {
8637 8640                                  new_phyint_flags = went_up ?
8638 8641                                      new_phyint_flags & ~PHYI_FAILED :
8639 8642                                      new_phyint_flags | PHYI_FAILED;
8640 8643                          }
8641 8644  
8642 8645                          if (new_phyint_flags != phyint->phyint_flags) {
8643 8646                                  phyint->phyint_flags = new_phyint_flags;
8644 8647                                  changed = B_TRUE;
8645 8648                          }
8646 8649                          mutex_exit(&phyint->phyint_lock);
8647 8650                          /*
8648 8651                           * ill_restart_dad handles the DAD restart and routing
8649 8652                           * socket notification logic.
8650 8653                           */
8651 8654                          if (changed) {
8652 8655                                  ill_restart_dad(phyint->phyint_illv4, went_up);
8653 8656                                  ill_restart_dad(phyint->phyint_illv6, went_up);
8654 8657                          }
8655 8658                          break;
8656 8659                  }
8657 8660                  case DL_NOTE_PROMISC_ON_PHYS: {
8658 8661                          phyint_t *phyint = ill->ill_phyint;
8659 8662  
8660 8663                          mutex_enter(&phyint->phyint_lock);
8661 8664                          phyint->phyint_flags |= PHYI_PROMISC;
8662 8665                          mutex_exit(&phyint->phyint_lock);
8663 8666                          break;
8664 8667                  }
8665 8668                  case DL_NOTE_PROMISC_OFF_PHYS: {
8666 8669                          phyint_t *phyint = ill->ill_phyint;
8667 8670  
8668 8671                          mutex_enter(&phyint->phyint_lock);
8669 8672                          phyint->phyint_flags &= ~PHYI_PROMISC;
8670 8673                          mutex_exit(&phyint->phyint_lock);
8671 8674                          break;
8672 8675                  }
8673 8676                  case DL_NOTE_CAPAB_RENEG:
8674 8677                          /*
8675 8678                           * Something changed on the driver side.
8676 8679                           * It wants us to renegotiate the capabilities
8677 8680                           * on this ill. One possible cause is the aggregation
8678 8681                           * interface under us where a port got added or
8679 8682                           * went away.
8680 8683                           *
8681 8684                           * If the capability negotiation is already done
8682 8685                           * or is in progress, reset the capabilities and
8683 8686                           * mark the ill's ill_capab_reneg to be B_TRUE,
8684 8687                           * so that when the ack comes back, we can start
8685 8688                           * the renegotiation process.
8686 8689                           *
8687 8690                           * Note that if ill_capab_reneg is already B_TRUE
8688 8691                           * (ill_dlpi_capab_state is IDCS_UNKNOWN in this case),
8689 8692                           * the capability resetting request has been sent
8690 8693                           * and the renegotiation has not been started yet;
8691 8694                           * nothing needs to be done in this case.
8692 8695                           */
8693 8696                          ipsq_current_start(ipsq, ill->ill_ipif, 0);
8694 8697                          ill_capability_reset(ill, B_TRUE);
8695 8698                          ipsq_current_finish(ipsq);
8696 8699                          break;
8697 8700  
8698 8701                  case DL_NOTE_ALLOWED_IPS:
8699 8702                          ill_set_allowed_ips(ill, mp);
8700 8703                          break;
8701 8704                  default:
8702 8705                          ip0dbg(("ip_rput_dlpi_writer: unknown notification "
8703 8706                              "type 0x%x for DL_NOTIFY_IND\n",
8704 8707                              notify->dl_notification));
8705 8708                          break;
8706 8709                  }
8707 8710  
8708 8711                  /*
8709 8712                   * As this is an asynchronous operation, we
8710 8713                   * should not call ill_dlpi_done
8711 8714                   */
8712 8715                  break;
8713 8716          }
8714 8717          case DL_NOTIFY_ACK: {
8715 8718                  dl_notify_ack_t *noteack = (dl_notify_ack_t *)mp->b_rptr;
8716 8719  
8717 8720                  if (noteack->dl_notifications & DL_NOTE_LINK_UP)
8718 8721                          ill->ill_note_link = 1;
8719 8722                  ill_dlpi_done(ill, DL_NOTIFY_REQ);
8720 8723                  break;
8721 8724          }
8722 8725          case DL_PHYS_ADDR_ACK: {
8723 8726                  /*
8724 8727                   * As part of plumbing the interface via SIOCSLIFNAME,
8725 8728                   * ill_dl_phys() will queue a series of DL_PHYS_ADDR_REQs,
8726 8729                   * whose answers we receive here.  As each answer is received,
8727 8730                   * we call ill_dlpi_done() to dispatch the next request as
8728 8731                   * we're processing the current one.  Once all answers have
8729 8732                   * been received, we use ipsq_pending_mp_get() to dequeue the
8730 8733                   * outstanding IOCTL and reply to it.  (Because ill_dl_phys()
8731 8734                   * is invoked from an ill queue, conn_oper_pending_ill is not
8732 8735                   * available, but we know the ioctl is pending on ill_wq.)
8733 8736                   */
8734 8737                  uint_t  paddrlen, paddroff;
8735 8738                  uint8_t *addr;
8736 8739  
8737 8740                  paddrreq = ill->ill_phys_addr_pend;
8738 8741                  paddrlen = ((dl_phys_addr_ack_t *)mp->b_rptr)->dl_addr_length;
8739 8742                  paddroff = ((dl_phys_addr_ack_t *)mp->b_rptr)->dl_addr_offset;
8740 8743                  addr = mp->b_rptr + paddroff;
8741 8744  
8742 8745                  ill_dlpi_done(ill, DL_PHYS_ADDR_REQ);
8743 8746                  if (paddrreq == DL_IPV6_TOKEN) {
8744 8747                          /*
8745 8748                           * bcopy to low-order bits of ill_token
8746 8749                           *
8747 8750                           * XXX Temporary hack - currently, all known tokens
8748 8751                           * are 64 bits, so I'll cheat for the moment.
8749 8752                           */
8750 8753                          bcopy(addr, &ill->ill_token.s6_addr32[2], paddrlen);
8751 8754                          ill->ill_token_length = paddrlen;
8752 8755                          break;
8753 8756                  } else if (paddrreq == DL_IPV6_LINK_LAYER_ADDR) {
8754 8757                          ASSERT(ill->ill_nd_lla_mp == NULL);
8755 8758                          ill_set_ndmp(ill, mp, paddroff, paddrlen);
8756 8759                          mp = NULL;
8757 8760                          break;
8758 8761                  } else if (paddrreq == DL_CURR_DEST_ADDR) {
8759 8762                          ASSERT(ill->ill_dest_addr_mp == NULL);
8760 8763                          ill->ill_dest_addr_mp = mp;
8761 8764                          ill->ill_dest_addr = addr;
8762 8765                          mp = NULL;
8763 8766                          if (ill->ill_isv6) {
8764 8767                                  ill_setdesttoken(ill);
8765 8768                                  ipif_setdestlinklocal(ill->ill_ipif);
8766 8769                          }
8767 8770                          break;
8768 8771                  }
8769 8772  
8770 8773                  ASSERT(paddrreq == DL_CURR_PHYS_ADDR);
8771 8774                  ASSERT(ill->ill_phys_addr_mp == NULL);
8772 8775                  if (!ill->ill_ifname_pending)
8773 8776                          break;
8774 8777                  ill->ill_ifname_pending = 0;
8775 8778                  if (!ioctl_aborted)
8776 8779                          mp1 = ipsq_pending_mp_get(ipsq, &connp);
8777 8780                  if (mp1 != NULL) {
8778 8781                          ASSERT(connp == NULL);
8779 8782                          q = ill->ill_wq;
8780 8783                  }
8781 8784                  /*
8782 8785                   * If any error acks received during the plumbing sequence,
8783 8786                   * ill_ifname_pending_err will be set. Break out and send up
8784 8787                   * the error to the pending ioctl.
8785 8788                   */
8786 8789                  if (ill->ill_ifname_pending_err != 0) {
8787 8790                          err = ill->ill_ifname_pending_err;
8788 8791                          ill->ill_ifname_pending_err = 0;
8789 8792                          break;
8790 8793                  }
8791 8794  
8792 8795                  ill->ill_phys_addr_mp = mp;
8793 8796                  ill->ill_phys_addr = (paddrlen == 0 ? NULL : addr);
8794 8797                  mp = NULL;
8795 8798  
8796 8799                  /*
8797 8800                   * If paddrlen or ill_phys_addr_length is zero, the DLPI
8798 8801                   * provider doesn't support physical addresses.  We check both
8799 8802                   * paddrlen and ill_phys_addr_length because sppp (PPP) does
8800 8803                   * not have physical addresses, but historically advertises a
8801 8804                   * physical address length of 0 in its DL_INFO_ACK, but 6 in
8802 8805                   * its DL_PHYS_ADDR_ACK.
8803 8806                   */
8804 8807                  if (paddrlen == 0 || ill->ill_phys_addr_length == 0) {
8805 8808                          ill->ill_phys_addr = NULL;
8806 8809                  } else if (paddrlen != ill->ill_phys_addr_length) {
8807 8810                          ip0dbg(("DL_PHYS_ADDR_ACK: got addrlen %d, expected %d",
8808 8811                              paddrlen, ill->ill_phys_addr_length));
8809 8812                          err = EINVAL;
8810 8813                          break;
8811 8814                  }
8812 8815  
8813 8816                  if (ill->ill_nd_lla_mp == NULL) {
8814 8817                          if ((mp_hw = copyb(ill->ill_phys_addr_mp)) == NULL) {
8815 8818                                  err = ENOMEM;
8816 8819                                  break;
8817 8820                          }
8818 8821                          ill_set_ndmp(ill, mp_hw, paddroff, paddrlen);
8819 8822                  }
8820 8823  
8821 8824                  if (ill->ill_isv6) {
8822 8825                          ill_setdefaulttoken(ill);
8823 8826                          ipif_setlinklocal(ill->ill_ipif);
8824 8827                  }
8825 8828                  break;
8826 8829          }
8827 8830          case DL_OK_ACK:
8828 8831                  ip2dbg(("DL_OK_ACK %s (0x%x)\n",
8829 8832                      dl_primstr((int)dloa->dl_correct_primitive),
8830 8833                      dloa->dl_correct_primitive));
8831 8834                  DTRACE_PROBE3(ill__dlpi, char *, "ip_rput_dlpi_writer ok",
8832 8835                      char *, dl_primstr(dloa->dl_correct_primitive),
8833 8836                      ill_t *, ill);
8834 8837  
8835 8838                  switch (dloa->dl_correct_primitive) {
8836 8839                  case DL_ENABMULTI_REQ:
8837 8840                  case DL_DISABMULTI_REQ:
8838 8841                          ill_dlpi_done(ill, dloa->dl_correct_primitive);
8839 8842                          break;
8840 8843                  case DL_PROMISCON_REQ:
8841 8844                  case DL_PROMISCOFF_REQ:
8842 8845                  case DL_UNBIND_REQ:
8843 8846                  case DL_ATTACH_REQ:
8844 8847                          ill_dlpi_done(ill, dloa->dl_correct_primitive);
8845 8848                          break;
8846 8849                  }
8847 8850                  break;
8848 8851          default:
8849 8852                  break;
8850 8853          }
8851 8854  
8852 8855          freemsg(mp);
8853 8856          if (mp1 == NULL)
8854 8857                  return;
8855 8858  
8856 8859          /*
8857 8860           * The operation must complete without EINPROGRESS since
8858 8861           * ipsq_pending_mp_get() has removed the mblk (mp1).  Otherwise,
8859 8862           * the operation will be stuck forever inside the IPSQ.
8860 8863           */
8861 8864          ASSERT(err != EINPROGRESS);
8862 8865  
8863 8866          DTRACE_PROBE4(ipif__ioctl, char *, "ip_rput_dlpi_writer finish",
8864 8867              int, ipsq->ipsq_xop->ipx_current_ioctl, ill_t *, ill,
8865 8868              ipif_t *, NULL);
8866 8869  
8867 8870          switch (ipsq->ipsq_xop->ipx_current_ioctl) {
8868 8871          case 0:
8869 8872                  ipsq_current_finish(ipsq);
8870 8873                  break;
8871 8874  
8872 8875          case SIOCSLIFNAME:
8873 8876          case IF_UNITSEL: {
8874 8877                  ill_t *ill_other = ILL_OTHER(ill);
8875 8878  
8876 8879                  /*
8877 8880                   * If SIOCSLIFNAME or IF_UNITSEL is about to succeed, and the
8878 8881                   * ill has a peer which is in an IPMP group, then place ill
8879 8882                   * into the same group.  One catch: although ifconfig plumbs
8880 8883                   * the appropriate IPMP meta-interface prior to plumbing this
8881 8884                   * ill, it is possible for multiple ifconfig applications to
8882 8885                   * race (or for another application to adjust plumbing), in
8883 8886                   * which case the IPMP meta-interface we need will be missing.
8884 8887                   * If so, kick the phyint out of the group.
8885 8888                   */
8886 8889                  if (err == 0 && ill_other != NULL && IS_UNDER_IPMP(ill_other)) {
8887 8890                          ipmp_grp_t      *grp = ill->ill_phyint->phyint_grp;
8888 8891                          ipmp_illgrp_t   *illg;
8889 8892  
8890 8893                          illg = ill->ill_isv6 ? grp->gr_v6 : grp->gr_v4;
8891 8894                          if (illg == NULL)
8892 8895                                  ipmp_phyint_leave_grp(ill->ill_phyint);
8893 8896                          else
8894 8897                                  ipmp_ill_join_illgrp(ill, illg);
8895 8898                  }
8896 8899  
8897 8900                  if (ipsq->ipsq_xop->ipx_current_ioctl == IF_UNITSEL)
8898 8901                          ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
8899 8902                  else
8900 8903                          ip_ioctl_finish(q, mp1, err, COPYOUT, ipsq);
8901 8904                  break;
8902 8905          }
8903 8906          case SIOCLIFADDIF:
8904 8907                  ip_ioctl_finish(q, mp1, err, COPYOUT, ipsq);
8905 8908                  break;
8906 8909  
8907 8910          default:
8908 8911                  ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
8909 8912                  break;
8910 8913          }
8911 8914  }
8912 8915  
8913 8916  /*
8914 8917   * ip_rput_other is called by ip_rput to handle messages modifying the global
8915 8918   * state in IP.  If 'ipsq' is non-NULL, caller is writer on it.
            *
            * Message types handled (any other type trips ASSERT(0)):
            *   M_ERROR, M_HANGUP - record an error code in ill_error and start
            *       bringing the ill down; 'ipsq' must be non-NULL for these.
            *   M_IOCNAK - negative reply to a DL_IOC_HDR_INFO ioctl; marks the
            *       fastpath probe as failed if it was in progress and frees mp.
            *
            * 'dummy_arg' is unused.
8916 8919   */
8917 8920  /* ARGSUSED */
8918 8921  void
8919 8922  ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
8920 8923  {
8921 8924          ill_t           *ill = q->q_ptr;
8922 8925          struct iocblk   *iocp;
8923 8926  
8924 8927          ip1dbg(("ip_rput_other "));
8925 8928          if (ipsq != NULL) {
                           /* The ipsq handed in must be the one serializing this ill */
8926 8929                  ASSERT(IAM_WRITER_IPSQ(ipsq));
8927 8930                  ASSERT(ipsq->ipsq_xop ==
8928 8931                      ill->ill_phyint->phyint_ipsq->ipsq_xop);
8929 8932          }
8930 8933  
8931 8934          switch (mp->b_datap->db_type) {
8932 8935          case M_ERROR:
8933 8936          case M_HANGUP:
8934 8937                  /*
8935 8938                   * The device has a problem.  We force the ILL down.  It can
8936 8939                   * be brought up again manually using SIOCSIFFLAGS (via
8937 8940                   * ifconfig or equivalent).
8938 8941                   */
8939 8942                  ASSERT(ipsq != NULL);
                           /* Use the first data byte, if any, as the error code */
8940 8943                  if (mp->b_rptr < mp->b_wptr)
8941 8944                          ill->ill_error = (int)(*mp->b_rptr & 0xFF);
                           /* No (or zero) code supplied: fall back to ENXIO */
8942 8945                  if (ill->ill_error == 0)
8943 8946                          ill->ill_error = ENXIO;
                           /*
                            * When ill_down_start() returns false we stop here and do
                            * not touch mp again.
                            * NOTE(review): presumably the down operation is then
                            * still pending and mp has been handed off -- confirm
                            * against ill_down_start().
                            */
8944 8947                  if (!ill_down_start(q, mp))
8945 8948                          return;
8946 8949                  ipif_all_down_tail(ipsq, q, mp, NULL);
8947 8950                  break;
8948 8951          case M_IOCNAK: {
8949 8952                  iocp = (struct iocblk *)mp->b_rptr;
8950 8953  
                           /* The only NAK expected here is for the fastpath probe */
8951 8954                  ASSERT(iocp->ioc_cmd == DL_IOC_HDR_INFO);
8952 8955                  /*
8953 8956                   * If this was the first attempt, turn off the fastpath
8954 8957                   * probing.
8955 8958                   */
8956 8959                  mutex_enter(&ill->ill_lock);
8957 8960                  if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS) {
8958 8961                          ill->ill_dlpi_fastpath_state = IDS_FAILED;
8959 8962                          mutex_exit(&ill->ill_lock);
8960 8963                          /*
8961 8964                           * don't flush the nce_t entries: we use them
8962 8965                           * as an index to the ncec itself.
8963 8966                           */
8964 8967                          ip1dbg(("ip_rput: DLPI fastpath off on interface %s\n",
8965 8968                              ill->ill_name));
8966 8969                  } else {
8967 8970                          mutex_exit(&ill->ill_lock);
8968 8971                  }
8969 8972                  freemsg(mp);
8970 8973                  break;
8971 8974          }
8972 8975          default:
                           /* Callers are expected to pass only the types above */
8973 8976                  ASSERT(0);
8974 8977                  break;
8975 8978          }
8976 8979  }
8977 8980  
8978 8981  /*
8979 8982   * Update any source route, record route or timestamp options
8980 8983   * When it fails it has consumed the message and BUMPed the MIB.
            *
            * Walks every IPv4 option in 'ipha' for a packet that is about to be
            * forwarded out 'dst_ill':
            *   - source route (SSRR/LSRR): if the current destination is local,
            *     swap in the next hop from the route and record our address;
            *   - record route: store our address at the pointer if there is room;
            *   - timestamp: store a timestamp (and, depending on the flag, our
            *     address) if there is room, otherwise bump the overflow counter.
            *     A prespecified-address slot that names another node is left
            *     untouched.
            *
            * Returns B_TRUE when the packet may be forwarded.  Returns B_FALSE
            * when source-routed forwarding is disabled; in that case an ICMP
            * source-route-failed error has been issued for 'mp'.
8981 8984   */
8982 8985  boolean_t
8983 8986  ip_forward_options(mblk_t *mp, ipha_t *ipha, ill_t *dst_ill,
8984 8987      ip_recv_attr_t *ira)
8985 8988  {
8986 8989          ipoptp_t        opts;
8987 8990          uchar_t         *opt;
8988 8991          uint8_t         optval;
8989 8992          uint8_t         optlen;
8990 8993          ipaddr_t        dst;
8991 8994          ipaddr_t        ifaddr;
8992 8995          uint32_t        ts;
8993 8996          timestruc_t     now;
8994 8997          ip_stack_t      *ipst = ira->ira_ill->ill_ipst;
8995 8998  
8996 8999          ip2dbg(("ip_forward_options\n"));
8997 9000          dst = ipha->ipha_dst;
8998 9001          for (optval = ipoptp_first(&opts, ipha);
8999 9002              optval != IPOPT_EOL;
9000 9003              optval = ipoptp_next(&opts)) {
9001 9004                  ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
9002 9005                  opt = opts.ipoptp_cur;
9003 9006                  optlen = opts.ipoptp_len;
9004 9007                  ip2dbg(("ip_forward_options: opt %d, len %d\n",
9005 9008                      optval, opts.ipoptp_len));
9006 9009                  switch (optval) {
9007 9010                          uint32_t off;
9008 9011                  case IPOPT_SSRR:
9009 9012                  case IPOPT_LSRR:
9010 9013                          /* Check if administratively disabled */
9011 9014                          if (!ipst->ips_ip_forward_src_routed) {
9012 9015                                  BUMP_MIB(dst_ill->ill_ip_mib,
9013 9016                                      ipIfStatsForwProhibits);
9014 9017                                  ip_drop_input("ICMP_SOURCE_ROUTE_FAILED",
9015 9018                                      mp, dst_ill);
9016 9019                                  icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED,
9017 9020                                      ira);
9018 9021                                  return (B_FALSE);
9019 9022                          }
9020 9023                          if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
9021 9024                                  /*
9022 9025                                   * Must be partial since ip_input_options
9023 9026                                   * checked for strict.
9024 9027                                   */
9025 9028                                  break;
9026 9029                          }
                                   /* The option pointer is 1-based; make it 0-based */
9027 9030                          off = opt[IPOPT_OFFSET];
9028 9031                          off--;
9029 9032                  redo_srr:
9030 9033                          if (optlen < IP_ADDR_LEN ||
9031 9034                              off > optlen - IP_ADDR_LEN) {
9032 9035                                  /* End of source route */
9033 9036                                  ip1dbg((
9034 9037                                      "ip_forward_options: end of SR\n"));
9035 9038                                  break;
9036 9039                          }
9037 9040                          /* Pick a reasonable address on the outbound if */
9038 9041                          ASSERT(dst_ill != NULL);
9039 9042                          if (ip_select_source_v4(dst_ill, INADDR_ANY, dst,
9040 9043                              INADDR_ANY, ALL_ZONES, ipst, &ifaddr, NULL,
9041 9044                              NULL) != 0) {
9042 9045                                  /* No source! Shouldn't happen */
9043 9046                                  ifaddr = INADDR_ANY;
9044 9047                          }
                                   /* Take the next hop and leave our address behind */
9045 9048                          bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
9046 9049                          bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN);
9047 9050                          ip1dbg(("ip_forward_options: next hop 0x%x\n",
9048 9051                              ntohl(dst)));
9049 9052  
9050 9053                          /*
9051 9054                           * Check if our address is present more than
9052 9055                           * once as consecutive hops in source route.
9053 9056                           */
9054 9057                          if (ip_type_v4(dst, ipst) == IRE_LOCAL) {
9055 9058                                  off += IP_ADDR_LEN;
9056 9059                                  opt[IPOPT_OFFSET] += IP_ADDR_LEN;
9057 9060                                  goto redo_srr;
9058 9061                          }
9059 9062                          ipha->ipha_dst = dst;
9060 9063                          opt[IPOPT_OFFSET] += IP_ADDR_LEN;
9061 9064                          break;
9062 9065                  case IPOPT_RR:
9063 9066                          off = opt[IPOPT_OFFSET];
9064 9067                          off--;
9065 9068                          if (optlen < IP_ADDR_LEN ||
9066 9069                              off > optlen - IP_ADDR_LEN) {
9067 9070                                  /* No more room - ignore */
9068 9071                                  ip1dbg((
9069 9072                                      "ip_forward_options: end of RR\n"));
9070 9073                                  break;
9071 9074                          }
9072 9075                          /* Pick a reasonable address on the outbound if */
9073 9076                          ASSERT(dst_ill != NULL);
9074 9077                          if (ip_select_source_v4(dst_ill, INADDR_ANY, dst,
9075 9078                              INADDR_ANY, ALL_ZONES, ipst, &ifaddr, NULL,
9076 9079                              NULL) != 0) {
9077 9080                                  /* No source! Shouldn't happen */
9078 9081                                  ifaddr = INADDR_ANY;
9079 9082                          }
9080 9083                          bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN);
9081 9084                          opt[IPOPT_OFFSET] += IP_ADDR_LEN;
9082 9085                          break;
9083 9086                  case IPOPT_TS:
9084 9087                          /* Insert timestamp if there is room */
                                   /*
                                    * First pass over the flag: set 'off' to the number
                                    * of bytes one entry needs.
                                    */
9085 9088                          switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
9086 9089                          case IPOPT_TS_TSONLY:
9087 9090                                  off = IPOPT_TS_TIMELEN;
9088 9091                                  break;
9089 9092                          case IPOPT_TS_PRESPEC:
9090 9093                          case IPOPT_TS_PRESPEC_RFC791:
9091 9094                                  /* Verify that the address matched */
9092 9095                                  off = opt[IPOPT_OFFSET] - 1;
                                           if (off + IP_ADDR_LEN > optlen) {
                                                   /*
                                                    * No address slot left inside the
                                                    * option, so there is nothing to
                                                    * compare against.  Let the room
                                                    * check below count the overflow.
                                                    */
                                                   off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
                                                   break;
                                           }
9093 9096                                  bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
9094 9097                                  if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
                                                   /*
                                                    * Not for us: the slot names another
                                                    * node, so leave the whole option
                                                    * untouched and move to the next one.
                                                    * (A plain break would only leave this
                                                    * inner switch and fall into the
                                                    * insertion code below.)
                                                    */
9095 9098                                          continue;
9096 9099                                  }
9097 9100                                  /* FALLTHRU */
9098 9101                          case IPOPT_TS_TSANDADDR:
9099 9102                                  off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
9100 9103                                  break;
9101 9104                          default:
9102 9105                                  /*
9103 9106                                   * ip_*put_options should have already
9104 9107                                   * dropped this packet.
9105 9108                                   */
9106 9109                                  cmn_err(CE_PANIC, "ip_forward_options: "
9107 9110                                      "unknown IT - bug in ip_input_options?\n");
9108 9111                                  return (B_TRUE);        /* Keep "lint" happy */
9109 9112                          }
9110 9113                          if (opt[IPOPT_OFFSET] - 1 + off > optlen) {
9111 9114                                  /* Increase overflow counter */
                                           /* The counter lives in the high nibble */
9112 9115                                  off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1;
9113 9116                                  opt[IPOPT_POS_OV_FLG] =
9114 9117                                      (uint8_t)((opt[IPOPT_POS_OV_FLG] & 0x0F) |
9115 9118                                      (off << 4));
9116 9119                                  break;
9117 9120                          }
                                   /* Second pass over the flag: write the entry */
9118 9121                          off = opt[IPOPT_OFFSET] - 1;
9119 9122                          switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
9120 9123                          case IPOPT_TS_PRESPEC:
9121 9124                          case IPOPT_TS_PRESPEC_RFC791:
9122 9125                          case IPOPT_TS_TSANDADDR:
9123 9126                                  /* Pick a reasonable addr on the outbound if */
9124 9127                                  ASSERT(dst_ill != NULL);
9125 9128                                  if (ip_select_source_v4(dst_ill, INADDR_ANY,
9126 9129                                      dst, INADDR_ANY, ALL_ZONES, ipst, &ifaddr,
9127 9130                                      NULL, NULL) != 0) {
9128 9131                                          /* No source! Shouldn't happen */
9129 9132                                          ifaddr = INADDR_ANY;
9130 9133                                  }
9131 9134                                  bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN);
9132 9135                                  opt[IPOPT_OFFSET] += IP_ADDR_LEN;
9133 9136                                  /* FALLTHRU */
9134 9137                          case IPOPT_TS_TSONLY:
9135 9138                                  off = opt[IPOPT_OFFSET] - 1;
9136 9139                                  /* Compute # of milliseconds since midnight */
9137 9140                                  gethrestime(&now);
9138 9141                                  ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
9139 9142                                      NSEC2MSEC(now.tv_nsec);
                                           /*
                                            * NOTE(review): ts is copied out in host byte
                                            * order; confirm whether htonl() is wanted
                                            * here on little-endian hosts.
                                            */
9140 9143                                  bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN);
9141 9144                                  opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN;
9142 9145                                  break;
9143 9146                          }
9144 9147                          break;
9145 9148                  }
9146 9149          }
9147 9150          return (B_TRUE);
9148 9151  }
9150 9153  
9151 9154  /*
9152 9155   * Call ill_frag_timeout to do garbage collection. ill_frag_timeout
9153 9156   * returns 'true' if there are still fragments left on the queue, in
9154 9157   * which case we restart the timer.
            *
            * 'arg' is the ill_t the timer was posted for.  ill_lock is dropped
            * while ill_frag_timeout runs; ill_fragtimer_executing is set for that
            * window so that ill_frag_timer_start only records a restart request
            * (ill_fragtimer_needrestart) instead of posting a second timeout.
9155 9158   */
9156 9159  void
9157 9160  ill_frag_timer(void *arg)
9158 9161  {
9159 9162          ill_t   *ill = (ill_t *)arg;
9160 9163          boolean_t frag_pending;
9161 9164          ip_stack_t *ipst = ill->ill_ipst;
9162 9165          time_t  timeout;
9163 9166  
9164 9167          mutex_enter(&ill->ill_lock);
9165 9168          ASSERT(!ill->ill_fragtimer_executing);
                   /* A condemned ill gets no further timer work; just clear the id */
9166 9169          if (ill->ill_state_flags & ILL_CONDEMNED) {
9167 9170                  ill->ill_frag_timer_id = 0;
9168 9171                  mutex_exit(&ill->ill_lock);
9169 9172                  return;
9170 9173          }
9171 9174          ill->ill_fragtimer_executing = 1;
9172 9175          mutex_exit(&ill->ill_lock);
9173 9176  
                   /* Use the reassembly timeout matching the ill's IP version */
9174 9177          timeout = (ill->ill_isv6 ? ipst->ips_ipv6_reassembly_timeout :
9175 9178              ipst->ips_ip_reassembly_timeout);
9176 9179  
9177 9180          frag_pending = ill_frag_timeout(ill, timeout);
9178 9181  
9179 9182          /*
9180 9183           * Restart the timer, if we have fragments pending or if someone
9181 9184           * wanted us to be scheduled again.
9182 9185           */
9183 9186          mutex_enter(&ill->ill_lock);
                   /*
                    * Clear both markers first: ill_frag_timer_start posts a timeout
                    * only when the handler is not executing and no timer id is set.
                    */
9184 9187          ill->ill_fragtimer_executing = 0;
9185 9188          ill->ill_frag_timer_id = 0;
9186 9189          if (frag_pending || ill->ill_fragtimer_needrestart)
9187 9190                  ill_frag_timer_start(ill);
9188 9191          mutex_exit(&ill->ill_lock);
9189 9192  }
9190 9193  
           /*
            * Arrange for ill_frag_timer to run for 'ill'.  The caller must hold
            * ill_lock.  Nothing is done for a condemned ill.  If the handler is
            * currently executing, only a restart request is recorded; if a timeout
            * is already posted (ill_frag_timer_id != 0) it is left alone.  Otherwise
            * a timeout is posted for half of the reassembly timeout that applies to
            * the ill's IP version.
            */
9191 9194  void
9192 9195  ill_frag_timer_start(ill_t *ill)
9193 9196  {
9194 9197          ip_stack_t *ipst = ill->ill_ipst;
9195 9198          clock_t timeo_ms;
9196 9199  
9197 9200          ASSERT(MUTEX_HELD(&ill->ill_lock));
9198 9201  
9199 9202          /* If the ill is condemned don't proceed */
9200 9203          if (ill->ill_state_flags & ILL_CONDEMNED)
9201 9204                  return;
9202 9205  
9203 9206          if (ill->ill_fragtimer_executing) {
9204 9207                  /*
9205 9208                   * ill_frag_timer is currently executing. Just record the
9206 9209                   * fact that we want the timer to be restarted.
9207 9210                   * ill_frag_timer will post a timeout before it returns,
9208 9211                   * ensuring it will be called again.
9209 9212                   */
9210 9213                  ill->ill_fragtimer_needrestart = 1;
9211 9214                  return;
9212 9215          }
9213 9216  
9214 9217          if (ill->ill_frag_timer_id == 0) {
                           /* Reassembly timeout scaled by SECONDS */
9215 9218                  timeo_ms = (ill->ill_isv6 ? ipst->ips_ipv6_reassembly_timeout :
9216 9219                      ipst->ips_ip_reassembly_timeout) * SECONDS;
9217 9220  
9218 9221                  /*
9219 9222                   * The timer is neither running nor is the timeout handler
9220 9223                   * executing. Post a timeout so that ill_frag_timer will be
9221 9224                   * called
9222 9225                   */
                           /* Fire after half the reassembly timeout (timeo_ms >> 1) */
9223 9226                  ill->ill_frag_timer_id = timeout(ill_frag_timer, ill,
9224 9227                      MSEC_TO_TICK(timeo_ms >> 1));
9225 9228                  ill->ill_fragtimer_needrestart = 0;
9226 9229          }
9227 9230  }
9228 9231  
9229 9232  /*
9230 9233   * Update any source route, record route or timestamp options.
9231 9234   * Check that we are at end of strict source route.
9232 9235   * The options have already been checked for sanity in ip_input_options().
            *
            * Used for a packet being delivered locally; the receiving ill is taken
            * from 'ira'.
            *   - source route: nothing to do if the route is exhausted.  An
            *     unfinished strict route is an error; an unfinished loose route is
            *     truncated in place by padding the unused part with IPOPT_NOP.
            *   - record route: store our address at the pointer if there is room.
            *   - timestamp: store a timestamp (and, depending on the flag, our
            *     address) if there is room, otherwise bump the overflow counter.
            *     A prespecified-address slot that names another node is left
            *     untouched.
            *
            * Returns B_TRUE on success.  Returns B_FALSE for an unfinished strict
            * source route, after an ICMP source-route-failed error has been issued
            * for 'mp'.
9233 9236   */
9234 9237  boolean_t
9235 9238  ip_input_local_options(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira)
9236 9239  {
9237 9240          ipoptp_t        opts;
9238 9241          uchar_t         *opt;
9239 9242          uint8_t         optval;
9240 9243          uint8_t         optlen;
9241 9244          ipaddr_t        dst;
9242 9245          ipaddr_t        ifaddr;
9243 9246          uint32_t        ts;
9244 9247          timestruc_t     now;
9245 9248          ill_t           *ill = ira->ira_ill;
9246 9249          ip_stack_t      *ipst = ill->ill_ipst;
9247 9250  
9248 9251          ip2dbg(("ip_input_local_options\n"));
9249 9252  
9250 9253          for (optval = ipoptp_first(&opts, ipha);
9251 9254              optval != IPOPT_EOL;
9252 9255              optval = ipoptp_next(&opts)) {
9253 9256                  ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
9254 9257                  opt = opts.ipoptp_cur;
9255 9258                  optlen = opts.ipoptp_len;
9256 9259                  ip2dbg(("ip_input_local_options: opt %d, len %d\n",
9257 9260                      optval, optlen));
9258 9261                  switch (optval) {
9259 9262                          uint32_t off;
9260 9263                  case IPOPT_SSRR:
9261 9264                  case IPOPT_LSRR:
                                   /* The option pointer is 1-based; make it 0-based */
9262 9265                          off = opt[IPOPT_OFFSET];
9263 9266                          off--;
9264 9267                          if (optlen < IP_ADDR_LEN ||
9265 9268                              off > optlen - IP_ADDR_LEN) {
9266 9269                                  /* End of source route */
9267 9270                                  ip1dbg(("ip_input_local_options: end of SR\n"));
9268 9271                                  break;
9269 9272                          }
9270 9273                          /*
9271 9274                           * This will only happen if two consecutive entries
9272 9275                           * in the source route contains our address or if
9273 9276                           * it is a packet with a loose source route which
9274 9277                           * reaches us before consuming the whole source route
9275 9278                           */
9276 9279                          ip1dbg(("ip_input_local_options: not end of SR\n"));
9277 9280                          if (optval == IPOPT_SSRR) {
9278 9281                                  goto bad_src_route;
9279 9282                          }
9280 9283                          /*
9281 9284                           * Hack: instead of dropping the packet truncate the
9282 9285                           * source route to what has been used by filling the
9283 9286                           * rest with IPOPT_NOP.
9284 9287                           */
9285 9288                          opt[IPOPT_OLEN] = (uint8_t)off;
9286 9289                          while (off < optlen) {
9287 9290                                  opt[off++] = IPOPT_NOP;
9288 9291                          }
9289 9292                          break;
9290 9293                  case IPOPT_RR:
9291 9294                          off = opt[IPOPT_OFFSET];
9292 9295                          off--;
9293 9296                          if (optlen < IP_ADDR_LEN ||
9294 9297                              off > optlen - IP_ADDR_LEN) {
9295 9298                                  /* No more room - ignore */
9296 9299                                  ip1dbg((
9297 9300                                      "ip_input_local_options: end of RR\n"));
9298 9301                                  break;
9299 9302                          }
9300 9303                          /* Pick a reasonable address on the outbound if */
9301 9304                          if (ip_select_source_v4(ill, INADDR_ANY, ipha->ipha_dst,
9302 9305                              INADDR_ANY, ALL_ZONES, ipst, &ifaddr, NULL,
9303 9306                              NULL) != 0) {
9304 9307                                  /* No source! Shouldn't happen */
9305 9308                                  ifaddr = INADDR_ANY;
9306 9309                          }
9307 9310                          bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN);
9308 9311                          opt[IPOPT_OFFSET] += IP_ADDR_LEN;
9309 9312                          break;
9310 9313                  case IPOPT_TS:
9311 9314                          /* Insert timestamp if there is room */
                                   /*
                                    * First pass over the flag: set 'off' to the number
                                    * of bytes one entry needs.
                                    */
9312 9315                          switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
9313 9316                          case IPOPT_TS_TSONLY:
9314 9317                                  off = IPOPT_TS_TIMELEN;
9315 9318                                  break;
9316 9319                          case IPOPT_TS_PRESPEC:
9317 9320                          case IPOPT_TS_PRESPEC_RFC791:
9318 9321                                  /* Verify that the address matched */
9319 9322                                  off = opt[IPOPT_OFFSET] - 1;
                                           if (off + IP_ADDR_LEN > optlen) {
                                                   /*
                                                    * No address slot left inside the
                                                    * option, so there is nothing to
                                                    * compare against.  Let the room
                                                    * check below count the overflow.
                                                    */
                                                   off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
                                                   break;
                                           }
9320 9323                                  bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
9321 9324                                  if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
                                                   /*
                                                    * Not for us: the slot names another
                                                    * node, so leave the whole option
                                                    * untouched and move to the next one.
                                                    * (A plain break would only leave this
                                                    * inner switch and fall into the
                                                    * insertion code below.)
                                                    */
9322 9325                                          continue;
9323 9326                                  }
9324 9327                                  /* FALLTHRU */
9325 9328                          case IPOPT_TS_TSANDADDR:
9326 9329                                  off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
9327 9330                                  break;
9328 9331                          default:
9329 9332                                  /*
9330 9333                                   * ip_*put_options should have already
9331 9334                                   * dropped this packet.
9332 9335                                   */
9333 9336                                  cmn_err(CE_PANIC, "ip_input_local_options: "
9334 9337                                      "unknown IT - bug in ip_input_options?\n");
9335 9338                                  return (B_TRUE);        /* Keep "lint" happy */
9336 9339                          }
9337 9340                          if (opt[IPOPT_OFFSET] - 1 + off > optlen) {
9338 9341                                  /* Increase overflow counter */
                                           /* The counter lives in the high nibble */
9339 9342                                  off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1;
9340 9343                                  opt[IPOPT_POS_OV_FLG] =
9341 9344                                      (uint8_t)((opt[IPOPT_POS_OV_FLG] & 0x0F) |
9342 9345                                      (off << 4));
9343 9346                                  break;
9344 9347                          }
                                   /* Second pass over the flag: write the entry */
9345 9348                          off = opt[IPOPT_OFFSET] - 1;
9346 9349                          switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
9347 9350                          case IPOPT_TS_PRESPEC:
9348 9351                          case IPOPT_TS_PRESPEC_RFC791:
9349 9352                          case IPOPT_TS_TSANDADDR:
9350 9353                                  /* Pick a reasonable addr on the outbound if */
9351 9354                                  if (ip_select_source_v4(ill, INADDR_ANY,
9352 9355                                      ipha->ipha_dst, INADDR_ANY, ALL_ZONES, ipst,
9353 9356                                      &ifaddr, NULL, NULL) != 0) {
9354 9357                                          /* No source! Shouldn't happen */
9355 9358                                          ifaddr = INADDR_ANY;
9356 9359                                  }
9357 9360                                  bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN);
9358 9361                                  opt[IPOPT_OFFSET] += IP_ADDR_LEN;
9359 9362                                  /* FALLTHRU */
9360 9363                          case IPOPT_TS_TSONLY:
9361 9364                                  off = opt[IPOPT_OFFSET] - 1;
9362 9365                                  /* Compute # of milliseconds since midnight */
9363 9366                                  gethrestime(&now);
9364 9367                                  ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
9365 9368                                      NSEC2MSEC(now.tv_nsec);
                                           /*
                                            * NOTE(review): ts is copied out in host byte
                                            * order; confirm whether htonl() is wanted
                                            * here on little-endian hosts.
                                            */
9366 9369                                  bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN);
9367 9370                                  opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN;
9368 9371                                  break;
9369 9372                          }
9370 9373                          break;
9371 9374                  }
9372 9375          }
9373 9376          return (B_TRUE);
9374 9377  
9375 9378  bad_src_route:
9376 9379          /* make sure we clear any indication of a hardware checksum */
9377 9380          DB_CKSUMFLAGS(mp) = 0;
9378 9381          ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ill);
9379 9382          icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira);
9380 9383          return (B_FALSE);
9381 9384  
9382 9385  }
9384 9387  
9385 9388  /*
9386 9389   * Process IP options in an inbound packet.  Always returns the nexthop.
9387 9390   * Normally this is the passed in nexthop, but if there is an option
9388 9391   * that affects the nexthop (such as a source route) that will be returned.
9389 9392   * Sets *errorp if there is an error, in which case an ICMP error has been sent
9390 9393   * and mp freed.
            *
            * *errorp is cleared to 0 on entry and set to -1 on both failure paths
            * (param_prob and bad_src_route).  The value returned on failure is
            * whatever dst held when the error was detected.
9391 9394   */
9392 9395  ipaddr_t
9393 9396  ip_input_options(ipha_t *ipha, ipaddr_t dst, mblk_t *mp,
9394 9397      ip_recv_attr_t *ira, int *errorp)
9395 9398  {
9396 9399          ip_stack_t      *ipst = ira->ira_ill->ill_ipst;
9397 9400          ipoptp_t        opts;
9398 9401          uchar_t         *opt;
9399 9402          uint8_t         optval;
9400 9403          uint8_t         optlen;
                   /*
                    * code is the byte distance from the start of the IP header to
                    * the offending octet; it is handed to icmp_param_problem().
                    */
9401 9404          intptr_t        code = 0;
9402 9405          ire_t           *ire;
9403 9406  
9404 9407          ip2dbg(("ip_input_options\n"));
9405 9408          *errorp = 0;
9406 9409          for (optval = ipoptp_first(&opts, ipha);
9407 9410              optval != IPOPT_EOL;
9408 9411              optval = ipoptp_next(&opts)) {
9409 9412                  opt = opts.ipoptp_cur;
9410 9413                  optlen = opts.ipoptp_len;
9411 9414                  ip2dbg(("ip_input_options: opt %d, len %d\n",
9412 9415                      optval, optlen));
9413 9416                  /*
9414 9417                   * Note: we need to verify the checksum before we
9415 9418                   * modify anything thus this routine only extracts the next
9416 9419                   * hop dst from any source route.
9417 9420                   */
9418 9421                  switch (optval) {
9419 9422                          uint32_t off;
9420 9423                  case IPOPT_SSRR:
9421 9424                  case IPOPT_LSRR:
                                   /*
                                    * A source route hop is only consumed when the
                                    * current dst is classified IRE_LOCAL.  Otherwise
                                    * a strict route is a parameter problem and a
                                    * loose route is left untouched.
                                    */
9422 9425                          if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
9423 9426                                  if (optval == IPOPT_SSRR) {
9424 9427                                          ip1dbg(("ip_input_options: not next"
9425 9428                                              " strict source route 0x%x\n",
9426 9429                                              ntohl(dst)));
9427 9430                                          code = (char *)&ipha->ipha_dst -
9428 9431                                              (char *)ipha;
9429 9432                                          goto param_prob; /* RouterReq's */
9430 9433                                  }
9431 9434                                  ip2dbg(("ip_input_options: "
9432 9435                                      "not next source route 0x%x\n",
9433 9436                                      ntohl(dst)));
9434 9437                                  break;
9435 9438                          }
9436 9439  
9437 9440                          if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
9438 9441                                  ip1dbg((
9439 9442                                      "ip_input_options: bad option offset\n"));
9440 9443                                  code = (char *)&opt[IPOPT_OLEN] -
9441 9444                                      (char *)ipha;
9442 9445                                  goto param_prob;
9443 9446                          }
                                   /*
                                    * The option's pointer octet counts from 1
                                    * (RFC 791); turn it into a zero-based byte index
                                    * into opt.
                                    */
9444 9447                          off = opt[IPOPT_OFFSET];
9445 9448                          off--;
9446 9449                  redo_srr:
                                   /*
                                    * Stop once a complete IP_ADDR_LEN-byte address no
                                    * longer fits between off and the end of the
                                    * option.
                                    */
9447 9450                          if (optlen < IP_ADDR_LEN ||
9448 9451                              off > optlen - IP_ADDR_LEN) {
9449 9452                                  /* End of source route */
9450 9453                                  ip1dbg(("ip_input_options: end of SR\n"));
9451 9454                                  break;
9452 9455                          }
9453 9456                          bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
9454 9457                          ip1dbg(("ip_input_options: next hop 0x%x\n",
9455 9458                              ntohl(dst)));
9456 9459  
9457 9460                          /*
9458 9461                           * Check if our address is present more than
9459 9462                           * once as consecutive hops in source route.
9460 9463                           * XXX verify per-interface ip_forwarding
9461 9464                           * for source route?
9462 9465                           */
9463 9466                          if (ip_type_v4(dst, ipst) == IRE_LOCAL) {
9464 9467                                  off += IP_ADDR_LEN;
9465 9468                                  goto redo_srr;
9466 9469                          }
9467 9470  
9468 9471                          if (dst == htonl(INADDR_LOOPBACK)) {
9469 9472                                  ip1dbg(("ip_input_options: loopback addr in "
9470 9473                                      "source route!\n"));
9471 9474                                  goto bad_src_route;
9472 9475                          }
9473 9476                          /*
9474 9477                           * For strict: verify that dst is directly
9475 9478                           * reachable.
9476 9479                           */
9477 9480                          if (optval == IPOPT_SSRR) {
9478 9481                                  ire = ire_ftable_lookup_v4(dst, 0, 0,
9479 9482                                      IRE_INTERFACE, NULL, ALL_ZONES,
9480 9483                                      ira->ira_tsl,
9481 9484                                      MATCH_IRE_TYPE | MATCH_IRE_SECATTR, 0, ipst,
9482 9485                                      NULL);
9483 9486                                  if (ire == NULL) {
9484 9487                                          ip1dbg(("ip_input_options: SSRR not "
9485 9488                                              "directly reachable: 0x%x\n",
9486 9489                                              ntohl(dst)));
9487 9490                                          goto bad_src_route;
9488 9491                                  }
                                           /* The lookup was only an existence test. */
9489 9492                                  ire_refrele(ire);
9490 9493                          }
9491 9494                          /*
9492 9495                           * Defer update of the offset and the record route
9493 9496                           * until the packet is forwarded.
9494 9497                           */
9495 9498                          break;
9496 9499                  case IPOPT_RR:
9497 9500                          if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
9498 9501                                  ip1dbg((
9499 9502                                      "ip_input_options: bad option offset\n"));
9500 9503                                  code = (char *)&opt[IPOPT_OLEN] -
9501 9504                                      (char *)ipha;
9502 9505                                  goto param_prob;
9503 9506                          }
9504 9507                          break;
9505 9508                  case IPOPT_TS:
9506 9509                          /*
9507 9510                           * Verify that length >= 5 and that there is either
9508 9511                           * room for another timestamp or that the overflow
9509 9512                           * counter is not maxed out.
9510 9513                           */
9511 9514                          code = (char *)&opt[IPOPT_OLEN] - (char *)ipha;
9512 9515                          if (optlen < IPOPT_MINLEN_IT) {
9513 9516                                  goto param_prob;
9514 9517                          }
9515 9518                          if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
9516 9519                                  ip1dbg((
9517 9520                                      "ip_input_options: bad option offset\n"));
9518 9521                                  code = (char *)&opt[IPOPT_OFFSET] -
9519 9522                                      (char *)ipha;
9520 9523                                  goto param_prob;
9521 9524                          }
                                   /*
                                    * The low nibble selects the timestamp format; off
                                    * is set to the size of one entry in that format.
                                    */
9522 9525                          switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
9523 9526                          case IPOPT_TS_TSONLY:
9524 9527                                  off = IPOPT_TS_TIMELEN;
9525 9528                                  break;
9526 9529                          case IPOPT_TS_TSANDADDR:
9527 9530                          case IPOPT_TS_PRESPEC:
9528 9531                          case IPOPT_TS_PRESPEC_RFC791:
9529 9532                                  off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
9530 9533                                  break;
9531 9534                          default:
9532 9535                                  code = (char *)&opt[IPOPT_POS_OV_FLG] -
9533 9536                                      (char *)ipha;
9534 9537                                  goto param_prob;
9535 9538                          }
                                   /*
                                    * code still refers to the length octet here, so
                                    * that is what gets reported on this failure.
                                    */
9536 9539                          if (opt[IPOPT_OFFSET] - 1 + off > optlen &&
9537 9540                              (opt[IPOPT_POS_OV_FLG] & 0xF0) == 0xF0) {
9538 9541                                  /*
9539 9542                                   * No room and the overflow counter is 15
9540 9543                                   * already.
9541 9544                                   */
9542 9545                                  goto param_prob;
9543 9546                          }
9544 9547                          break;
9545 9548                  }
9546 9549          }
9547 9550  
                   /* The walk ended; succeed unless the walker flagged an error. */
9548 9551          if ((opts.ipoptp_flags & IPOPTP_ERROR) == 0) {
9549 9552                  return (dst);
9550 9553          }
9551 9554  
9552 9555          ip1dbg(("ip_input_options: error processing IP options."));
                   /*
                    * NOTE(review): opt is only assigned inside the loop body above.
                    * If the walker reports an error before the first iteration, opt
                    * is used here without having been set -- confirm against
                    * ipoptp_first().
                    */
9553 9556          code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha;
9554 9557  
9555 9558  param_prob:
9556 9559          /* make sure we clear any indication of a hardware checksum */
9557 9560          DB_CKSUMFLAGS(mp) = 0;
9558 9561          ip_drop_input("ICMP_PARAM_PROBLEM", mp, ira->ira_ill);
                   /* code is narrowed to a single octet for the ICMP error. */
9559 9562          icmp_param_problem(mp, (uint8_t)code, ira);
9560 9563          *errorp = -1;
9561 9564          return (dst);
9562 9565  
9563 9566  bad_src_route:
9564 9567          /* make sure we clear any indication of a hardware checksum */
9565 9568          DB_CKSUMFLAGS(mp) = 0;
9566 9569          ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ira->ira_ill);
9567 9570          icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira);
9568 9571          *errorp = -1;
9569 9572          return (dst);
9570 9573  }
9571 9574  
9572 9575  /*
9573 9576   * IP & ICMP info in >=14 msg's ...
9574 9577   *  - ip fixed part (mib2_ip_t)
9575 9578   *  - icmp fixed part (mib2_icmp_t)
9576 9579   *  - ipAddrEntryTable (ip 20)          all IPv4 ipifs
9577 9580   *  - ipRouteEntryTable (ip 21)         all IPv4 IREs
9578 9581   *  - ipNetToMediaEntryTable (ip 22)    all IPv4 Neighbor Cache entries
9579 9582   *  - ipRouteAttributeTable (ip 102)    labeled routes
9580 9583   *  - ip multicast membership (ip_member_t)
9581 9584   *  - ip multicast source filtering (ip_grpsrc_t)
9582 9585   *  - igmp fixed part (struct igmpstat)
9583 9586   *  - multicast routing stats (struct mrtstat)
9584 9587   *  - multicast routing vifs (array of struct vifctl)
9585 9588   *  - multicast routing routes (array of struct mfcctl)
9586 9589   *  - ip6 fixed part (mib2_ipv6IfStatsEntry_t)
9587 9590   *                                      One per ill plus one generic
9588 9591   *  - icmp6 fixed part (mib2_ipv6IfIcmpEntry_t)
9589 9592   *                                      One per ill plus one generic
9590 9593   *  - ipv6RouteEntry                    all IPv6 IREs
9591 9594   *  - ipv6RouteAttributeTable (ip6 102) labeled routes
9592 9595   *  - ipv6NetToMediaEntry               all IPv6 Neighbor Cache entries
9593 9596   *  - ipv6AddrEntry                     all IPv6 ipifs
9594 9597   *  - ipv6 multicast membership (ipv6_member_t)
9595 9598   *  - ipv6 multicast source filtering (ipv6_grpsrc_t)
9596 9599   *
9597 9600   * NOTE: original mpctl is copied for msg's 2..N, since its ctl part is
9598 9601   * already filled in by the caller.
9599 9602   * If legacy_req is true then MIB structures need to be truncated to their
9600 9603   * legacy sizes before being returned.
            * If level is MIB2_TCP or MIB2_UDP, only that protocol's data is sent and
            * the remaining tables are skipped.
9601 9604   * Return value of 0 indicates that no messages were sent and caller
9602 9605   * should free mpctl.
            * Every other path returns 1, including the ones where a helper returned
            * NULL part way through the sequence.
9603 9606   */
9604 9607  int
9605 9608  ip_snmp_get(queue_t *q, mblk_t *mpctl, int level, boolean_t legacy_req)
9606 9609  {
9607 9610          ip_stack_t *ipst;
9608 9611          sctp_stack_t *sctps;
9609 9612  
                   /*
                    * Presumably a non-NULL q_next means q belongs to an ill rather
                    * than a conn, hence the two lookup macros -- confirm.
                    */
9610 9613          if (q->q_next != NULL) {
9611 9614                  ipst = ILLQ_TO_IPST(q);
9612 9615          } else {
9613 9616                  ipst = CONNQ_TO_IPST(q);
9614 9617          }
9615 9618          ASSERT(ipst != NULL);
9616 9619          sctps = ipst->ips_netstack->netstack_sctp;
9617 9620  
                   /* Nothing to fill in: tell the caller it still owns mpctl. */
9618 9621          if (mpctl == NULL || mpctl->b_cont == NULL) {
9619 9622                  return (0);
9620 9623          }
9621 9624  
9622 9625          /*
9623 9626           * For the purposes of the (broken) packet shell use
9624 9627           * of the level we make sure MIB2_TCP/MIB2_UDP can be used
9625 9628           * to make TCP and UDP appear first in the list of mib items.
9626 9629           * TBD: We could expand this and use it in netstat so that
9627 9630           * the kernel doesn't have to produce large tables (connections,
9628 9631           * routes, etc) when netstat only wants the statistics or a particular
9629 9632           * table.
9630 9633           */

↓ open down ↓

5051 lines elided

↑ open up ↑

                   /*
                    * From here on each helper returns the control message to use for
                    * the next table, or NULL to stop.  The helpers visible in this
                    * file qreply() the message they were given, so on NULL there is
                    * nothing left to free here.
                    */
9631 9634          if (!(level == MIB2_TCP || level == MIB2_UDP)) {
9632 9635                  if ((mpctl = icmp_snmp_get(q, mpctl)) == NULL) {
9633 9636                          return (1);
9634 9637                  }
9635 9638          }
9636 9639  
9637 9640          if (level != MIB2_TCP) {
9638 9641                  if ((mpctl = udp_snmp_get(q, mpctl, legacy_req)) == NULL) {
9639 9642                          return (1);
9640 9643                  }
                           /* A UDP-only request is complete; skip the other tables. */
     9644 +                if (level == MIB2_UDP) {
     9645 +                        goto done;
     9646 +                }
9641 9647          }
9642 9648  
9643 9649          if (level != MIB2_UDP) {
9644 9650                  if ((mpctl = tcp_snmp_get(q, mpctl, legacy_req)) == NULL) {
9645 9651                          return (1);
9646 9652                  }
                           /* A TCP-only request is complete; skip the other tables. */
     9653 +                if (level == MIB2_TCP) {
     9654 +                        goto done;
     9655 +                }
9647 9656          }
9648 9657  
9649 9658          if ((mpctl = ip_snmp_get_mib2_ip_traffic_stats(q, mpctl,
9650 9659              ipst, legacy_req)) == NULL) {
9651 9660                  return (1);
9652 9661          }
9653 9662  
9654 9663          if ((mpctl = ip_snmp_get_mib2_ip6(q, mpctl, ipst,
9655 9664              legacy_req)) == NULL) {
9656 9665                  return (1);

9657 9666          }
9658 9667  
9659 9668          if ((mpctl = ip_snmp_get_mib2_icmp(q, mpctl, ipst)) == NULL) {
9660 9669                  return (1);
9661 9670          }
9662 9671  
9663 9672          if ((mpctl = ip_snmp_get_mib2_icmp6(q, mpctl, ipst)) == NULL) {
9664 9673                  return (1);
9665 9674          }
9666 9675  
9667 9676          if ((mpctl = ip_snmp_get_mib2_igmp(q, mpctl, ipst)) == NULL) {
9668 9677                  return (1);
9669 9678          }
9670 9679  
9671 9680          if ((mpctl = ip_snmp_get_mib2_multi(q, mpctl, ipst)) == NULL) {
9672 9681                  return (1);
9673 9682          }
9674 9683  
9675 9684          if ((mpctl = ip_snmp_get_mib2_ip_addr(q, mpctl, ipst,
9676 9685              legacy_req)) == NULL) {
9677 9686                  return (1);
9678 9687          }
9679 9688  
9680 9689          if ((mpctl = ip_snmp_get_mib2_ip6_addr(q, mpctl, ipst,
9681 9690              legacy_req)) == NULL) {
9682 9691                  return (1);
9683 9692          }
9684 9693  
9685 9694          if ((mpctl = ip_snmp_get_mib2_ip_group_mem(q, mpctl, ipst)) == NULL) {
9686 9695                  return (1);
9687 9696          }
9688 9697  
9689 9698          if ((mpctl = ip_snmp_get_mib2_ip6_group_mem(q, mpctl, ipst)) == NULL) {
9690 9699                  return (1);
9691 9700          }
9692 9701  
9693 9702          if ((mpctl = ip_snmp_get_mib2_ip_group_src(q, mpctl, ipst)) == NULL) {
9694 9703                  return (1);
9695 9704          }
9696 9705  
9697 9706          if ((mpctl = ip_snmp_get_mib2_ip6_group_src(q, mpctl, ipst)) == NULL) {
9698 9707                  return (1);
9699 9708          }
9700 9709  
9701 9710          if ((mpctl = ip_snmp_get_mib2_virt_multi(q, mpctl, ipst)) == NULL) {
9702 9711                  return (1);
9703 9712          }
9704 9713  
9705 9714          if ((mpctl = ip_snmp_get_mib2_multi_rtable(q, mpctl, ipst)) == NULL) {
9706 9715                  return (1);
9707 9716          }
9708 9717  
9709 9718          mpctl = ip_snmp_get_mib2_ip_route_media(q, mpctl, level, ipst);
9710 9719          if (mpctl == NULL)
9711 9720                  return (1);
9712 9721

↓ open down ↓

56 lines elided

↑ open up ↑

9713 9722          mpctl = ip_snmp_get_mib2_ip6_route_media(q, mpctl, level, ipst);
9714 9723          if (mpctl == NULL)
9715 9724                  return (1);
9716 9725  
9717 9726          if ((mpctl = sctp_snmp_get_mib2(q, mpctl, sctps)) == NULL) {
9718 9727                  return (1);
9719 9728          }
9720 9729          if ((mpctl = ip_snmp_get_mib2_ip_dce(q, mpctl, ipst)) == NULL) {
9721 9730                  return (1);
9722 9731          }
                   /*
                    * The message still held here is the spare copy left over from
                    * the last helper that ran; it was never sent, so release it.
                    */
     9732 +done:
9723 9733          freemsg(mpctl);
9724 9734          return (1);
9725 9735  }
9726 9736  
9727 9737  /* Get global (legacy) IPv4 statistics */
           /*
            * Fills the original mpctl with a mib2_ip_t (level MIB2_IP, name 0) built
            * from *ipmib and this stack's settings, sends it with qreply(), and
            * returns the copy of mpctl that was taken before it was modified.  The
            * return value is NULL if that copymsg() failed.  legacy_req only affects
            * the reported ipAddrEntrySize.
            */
9728 9738  static mblk_t *
9729 9739  ip_snmp_get_mib2_ip(queue_t *q, mblk_t *mpctl, mib2_ipIfStatsEntry_t *ipmib,
9730 9740      ip_stack_t *ipst, boolean_t legacy_req)
9731 9741  {
                   /*
                    * NOTE(review): old_ip_mib is not zeroed before the SET_MIB()
                    * calls below, yet the whole structure is appended to the reply.
                    * Confirm that every member is assigned here.
                    */
9732 9742          mib2_ip_t               old_ip_mib;

9733 9743          struct opthdr           *optp;
9734 9744          mblk_t                  *mp2ctl;
                   /* mae is never filled in; it is only an operand of LEGACY_MIB_SIZE(). */
9735 9745          mib2_ipAddrEntry_t      mae;
9736 9746  
9737 9747          /*
9738 9748           * make a copy of the original message
9739 9749           */
9740 9750          mp2ctl = copymsg(mpctl);
9741 9751  
9742 9752          /* fixed length IP structure... */
9743 9753          optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
9744 9754          optp->level = MIB2_IP;
9745 9755          optp->name = 0;
                   /* MIB-II ipForwarding: 1 is forwarding, 2 is not-forwarding. */
9746 9756          SET_MIB(old_ip_mib.ipForwarding,
9747 9757              (WE_ARE_FORWARDING(ipst) ? 1 : 2));
9748 9758          SET_MIB(old_ip_mib.ipDefaultTTL,
9749 9759              (uint32_t)ipst->ips_ip_def_ttl);
9750 9760          SET_MIB(old_ip_mib.ipReasmTimeout,
9751 9761              ipst->ips_ip_reassembly_timeout);
9752 9762          SET_MIB(old_ip_mib.ipAddrEntrySize,
9753 9763              (legacy_req) ? LEGACY_MIB_SIZE(&mae, mib2_ipAddrEntry_t) :
9754 9764              sizeof (mib2_ipAddrEntry_t));
9755 9765          SET_MIB(old_ip_mib.ipRouteEntrySize,
9756 9766              sizeof (mib2_ipRouteEntry_t));
9757 9767          SET_MIB(old_ip_mib.ipNetToMediaEntrySize,
9758 9768              sizeof (mib2_ipNetToMediaEntry_t));
9759 9769          SET_MIB(old_ip_mib.ipMemberEntrySize, sizeof (ip_member_t));
9760 9770          SET_MIB(old_ip_mib.ipGroupSourceEntrySize, sizeof (ip_grpsrc_t));
9761 9771          SET_MIB(old_ip_mib.ipRouteAttributeSize,
9762 9772              sizeof (mib2_ipAttributeEntry_t));
9763 9773          SET_MIB(old_ip_mib.transportMLPSize, sizeof (mib2_transportMLPEntry_t));
9764 9774          SET_MIB(old_ip_mib.ipDestEntrySize, sizeof (dest_cache_entry_t));
9765 9775  
9766 9776          /*
9767 9777           * Grab the statistics from the new IP MIB
9768 9778           */
                   /*
                    * The (uint32_t) casts narrow the ipIfStatsHC* counters for the
                    * legacy structure -- presumably these are 64-bit; confirm
                    * against the mib2_ipIfStatsEntry_t definition.
                    */
9769 9779          SET_MIB(old_ip_mib.ipInReceives,
9770 9780              (uint32_t)ipmib->ipIfStatsHCInReceives);
9771 9781          SET_MIB(old_ip_mib.ipInHdrErrors, ipmib->ipIfStatsInHdrErrors);
9772 9782          SET_MIB(old_ip_mib.ipInAddrErrors, ipmib->ipIfStatsInAddrErrors);
9773 9783          SET_MIB(old_ip_mib.ipForwDatagrams,
9774 9784              (uint32_t)ipmib->ipIfStatsHCOutForwDatagrams);
9775 9785          SET_MIB(old_ip_mib.ipInUnknownProtos,
9776 9786              ipmib->ipIfStatsInUnknownProtos);
9777 9787          SET_MIB(old_ip_mib.ipInDiscards, ipmib->ipIfStatsInDiscards);
9778 9788          SET_MIB(old_ip_mib.ipInDelivers,
9779 9789              (uint32_t)ipmib->ipIfStatsHCInDelivers);
9780 9790          SET_MIB(old_ip_mib.ipOutRequests,
9781 9791              (uint32_t)ipmib->ipIfStatsHCOutRequests);
9782 9792          SET_MIB(old_ip_mib.ipOutDiscards, ipmib->ipIfStatsOutDiscards);
9783 9793          SET_MIB(old_ip_mib.ipOutNoRoutes, ipmib->ipIfStatsOutNoRoutes);
9784 9794          SET_MIB(old_ip_mib.ipReasmReqds, ipmib->ipIfStatsReasmReqds);
9785 9795          SET_MIB(old_ip_mib.ipReasmOKs, ipmib->ipIfStatsReasmOKs);
9786 9796          SET_MIB(old_ip_mib.ipReasmFails, ipmib->ipIfStatsReasmFails);
9787 9797          SET_MIB(old_ip_mib.ipFragOKs, ipmib->ipIfStatsOutFragOKs);
9788 9798          SET_MIB(old_ip_mib.ipFragFails, ipmib->ipIfStatsOutFragFails);
9789 9799          SET_MIB(old_ip_mib.ipFragCreates, ipmib->ipIfStatsOutFragCreates);
9790 9800  
9791 9801          /* ipRoutingDiscards is not being used */
9792 9802          SET_MIB(old_ip_mib.ipRoutingDiscards, 0);
9793 9803          SET_MIB(old_ip_mib.tcpInErrs, ipmib->tcpIfStatsInErrs);
9794 9804          SET_MIB(old_ip_mib.udpNoPorts, ipmib->udpIfStatsNoPorts);
9795 9805          SET_MIB(old_ip_mib.ipInCksumErrs, ipmib->ipIfStatsInCksumErrs);
9796 9806          SET_MIB(old_ip_mib.ipReasmDuplicates,
9797 9807              ipmib->ipIfStatsReasmDuplicates);
9798 9808          SET_MIB(old_ip_mib.ipReasmPartDups, ipmib->ipIfStatsReasmPartDups);
9799 9809          SET_MIB(old_ip_mib.ipForwProhibits, ipmib->ipIfStatsForwProhibits);
9800 9810          SET_MIB(old_ip_mib.udpInCksumErrs, ipmib->udpIfStatsInCksumErrs);
9801 9811          SET_MIB(old_ip_mib.udpInOverflows, ipmib->udpIfStatsInOverflows);
9802 9812          SET_MIB(old_ip_mib.rawipInOverflows,
9803 9813              ipmib->rawipIfStatsInOverflows);
9804 9814  
9805 9815          SET_MIB(old_ip_mib.ipsecInSucceeded, ipmib->ipsecIfStatsInSucceeded);
9806 9816          SET_MIB(old_ip_mib.ipsecInFailed, ipmib->ipsecIfStatsInFailed);
9807 9817          SET_MIB(old_ip_mib.ipInIPv6, ipmib->ipIfStatsInWrongIPVersion);
9808 9818          SET_MIB(old_ip_mib.ipOutIPv6, ipmib->ipIfStatsOutWrongIPVersion);
9809 9819          SET_MIB(old_ip_mib.ipOutSwitchIPv6,
9810 9820              ipmib->ipIfStatsOutSwitchIPVersion);
9811 9821  
                   /*
                    * An append failure is only logged; the reply is still sent, with
                    * whatever length msgdsize() reports below.
                    */
9812 9822          if (!snmp_append_data(mpctl->b_cont, (char *)&old_ip_mib,
9813 9823              (int)sizeof (old_ip_mib))) {
9814 9824                  ip1dbg(("ip_snmp_get_mib2_ip: failed to allocate %u bytes\n",
9815 9825                      (uint_t)sizeof (old_ip_mib)));
9816 9826          }
9817 9827  
9818 9828          optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
9819 9829          ip3dbg(("ip_snmp_get_mib2_ip: level %d, name %d, len %d\n",
9820 9830              (int)optp->level, (int)optp->name, (int)optp->len));
                   /* mpctl is handed off here; only the copy goes back to the caller. */
9821 9831          qreply(q, mpctl);
9822 9832          return (mp2ctl);
9823 9833  }
9824 9834  
9825 9835  /* Per interface IPv4 statistics */
           /*
            * Sends the MIB2_IP / MIB2_IP_TRAFFIC_STATS table.  The first entry is a
            * snapshot of the stack-wide ("unknown interface") ips_ip_mib, appended
            * before any per-ill totals are added to it; one entry per ill in the
            * IPv4 walk follows.  The per-ill entries are also summed into the local
            * snapshot, and that total is passed to ip_snmp_get_mib2_ip() to build
            * the legacy mib2_ip_t reply.
            *
            * Returns what ip_snmp_get_mib2_ip() returns, or NULL if mpctl could not
            * be copied.
            */
9826 9836  static mblk_t *
9827 9837  ip_snmp_get_mib2_ip_traffic_stats(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst,
9828 9838      boolean_t legacy_req)
9829 9839  {
9830 9840          struct opthdr           *optp;
9831 9841          mblk_t                  *mp2ctl;
9832 9842          ill_t                   *ill;
9833 9843          ill_walk_context_t      ctx;
9834 9844          mblk_t                  *mp_tail = NULL;
9835 9845          mib2_ipIfStatsEntry_t   global_ip_mib;
                   /* mae is never filled in; it is only an operand of LEGACY_MIB_SIZE(). */
9836 9846          mib2_ipAddrEntry_t      mae;
9837 9847  
9838 9848          /*
9839 9849           * Make a copy of the original message
9840 9850           */
9841 9851          mp2ctl = copymsg(mpctl);
9842 9852  
9843 9853          optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
9844 9854          optp->level = MIB2_IP;
9845 9855          optp->name = MIB2_IP_TRAFFIC_STATS;
                   /*
                    * The fields below are written into the shared ipst->ips_ip_mib
                    * itself, not into a private copy.
                    */
9846 9856          /* Include "unknown interface" ip_mib */
9847 9857          ipst->ips_ip_mib.ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4;
9848 9858          ipst->ips_ip_mib.ipIfStatsIfIndex =
9849 9859              MIB2_UNKNOWN_INTERFACE; /* Flag to netstat */
9850 9860          SET_MIB(ipst->ips_ip_mib.ipIfStatsForwarding,
9851 9861              (ipst->ips_ip_forwarding ? 1 : 2));
9852 9862          SET_MIB(ipst->ips_ip_mib.ipIfStatsDefaultTTL,
9853 9863              (uint32_t)ipst->ips_ip_def_ttl);
9854 9864          SET_MIB(ipst->ips_ip_mib.ipIfStatsEntrySize,
9855 9865              sizeof (mib2_ipIfStatsEntry_t));
9856 9866          SET_MIB(ipst->ips_ip_mib.ipIfStatsAddrEntrySize,
9857 9867              sizeof (mib2_ipAddrEntry_t));
9858 9868          SET_MIB(ipst->ips_ip_mib.ipIfStatsRouteEntrySize,
9859 9869              sizeof (mib2_ipRouteEntry_t));
9860 9870          SET_MIB(ipst->ips_ip_mib.ipIfStatsNetToMediaEntrySize,
9861 9871              sizeof (mib2_ipNetToMediaEntry_t));
9862 9872          SET_MIB(ipst->ips_ip_mib.ipIfStatsMemberEntrySize,
9863 9873              sizeof (ip_member_t));
9864 9874          SET_MIB(ipst->ips_ip_mib.ipIfStatsGroupSourceEntrySize,
9865 9875              sizeof (ip_grpsrc_t));
9866 9876  
9867 9877          bcopy(&ipst->ips_ip_mib, &global_ip_mib, sizeof (global_ip_mib));
9868 9878  
                   /* Only the local copy gets the legacy address-entry size. */
9869 9879          if (legacy_req) {
9870 9880                  SET_MIB(global_ip_mib.ipIfStatsAddrEntrySize,
9871 9881                      LEGACY_MIB_SIZE(&mae, mib2_ipAddrEntry_t));
9872 9882          }
9873 9883  
                   /* Append failures here and in the loop are logged and otherwise ignored. */
9874 9884          if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
9875 9885              (char *)&global_ip_mib, (int)sizeof (global_ip_mib))) {
9876 9886                  ip1dbg(("ip_snmp_get_mib2_ip_traffic_stats: "
9877 9887                      "failed to allocate %u bytes\n",
9878 9888                      (uint_t)sizeof (global_ip_mib)));
9879 9889          }
9880 9890  
9881 9891          rw_enter(&ipst->ips_ill_g_lock, RW_READER);
9882 9892          ill = ILL_START_WALK_V4(&ctx, ipst);
9883 9893          for (; ill != NULL; ill = ill_next(&ctx, ill)) {
                           /*
                            * Refresh the identifying fields in the ill's own MIB in
                            * place before it is summed and reported.
                            */
9884 9894                  ill->ill_ip_mib->ipIfStatsIfIndex =
9885 9895                      ill->ill_phyint->phyint_ifindex;
9886 9896                  SET_MIB(ill->ill_ip_mib->ipIfStatsForwarding,
9887 9897                      (ipst->ips_ip_forwarding ? 1 : 2));
9888 9898                  SET_MIB(ill->ill_ip_mib->ipIfStatsDefaultTTL,
9889 9899                      (uint32_t)ipst->ips_ip_def_ttl);
9890 9900  
9891 9901                  ip_mib2_add_ip_stats(&global_ip_mib, ill->ill_ip_mib);
9892 9902                  if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
9893 9903                      (char *)ill->ill_ip_mib,
9894 9904                      (int)sizeof (*ill->ill_ip_mib))) {
9895 9905                          ip1dbg(("ip_snmp_get_mib2_ip_traffic_stats: "
9896 9906                              "failed to allocate %u bytes\n",
9897 9907                              (uint_t)sizeof (*ill->ill_ip_mib)));
9898 9908                  }
9899 9909          }
9900 9910          rw_exit(&ipst->ips_ill_g_lock);
9901 9911  
9902 9912          optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
9903 9913          ip3dbg(("ip_snmp_get_mib2_ip_traffic_stats: "
9904 9914              "level %d, name %d, len %d\n",
9905 9915              (int)optp->level, (int)optp->name, (int)optp->len));
9906 9916          qreply(q, mpctl);
9907 9917  
                   /* Without a copy there is nothing to build the legacy reply in. */
9908 9918          if (mp2ctl == NULL)
9909 9919                  return (NULL);
9910 9920  
9911 9921          return (ip_snmp_get_mib2_ip(q, mp2ctl, &global_ip_mib, ipst,
9912 9922              legacy_req));
9913 9923  }
9914 9924  
9915 9925  /* Global IPv4 ICMP statistics */
           /*
            * Appends ipst->ips_icmp_mib to mpctl as level MIB2_ICMP, name 0, sends it
            * with qreply(), and returns the copy of mpctl taken beforehand (NULL if
            * copymsg() failed).
            */
9916 9926  static mblk_t *
9917 9927  ip_snmp_get_mib2_icmp(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
9918 9928  {
9919 9929          struct opthdr           *optp;
9920 9930          mblk_t                  *mp2ctl;
9921 9931  
9922 9932          /*
9923 9933           * Make a copy of the original message
9924 9934           */
9925 9935          mp2ctl = copymsg(mpctl);
9926 9936  
9927 9937          optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
9928 9938          optp->level = MIB2_ICMP;
9929 9939          optp->name = 0;
                   /* An append failure is only logged; the reply is sent regardless. */
9930 9940          if (!snmp_append_data(mpctl->b_cont, (char *)&ipst->ips_icmp_mib,
9931 9941              (int)sizeof (ipst->ips_icmp_mib))) {
9932 9942                  ip1dbg(("ip_snmp_get_mib2_icmp: failed to allocate %u bytes\n",
9933 9943                      (uint_t)sizeof (ipst->ips_icmp_mib)));
9934 9944          }
9935 9945          optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
9936 9946          ip3dbg(("ip_snmp_get_mib2_icmp: level %d, name %d, len %d\n",
9937 9947              (int)optp->level, (int)optp->name, (int)optp->len));
9938 9948          qreply(q, mpctl);
9939 9949          return (mp2ctl);
9940 9950  }
9941 9951  
9942 9952  /* Global IPv4 IGMP statistics */
           /*
            * Appends ipst->ips_igmpstat to mpctl as level EXPER_IGMP, name 0, sends it
            * with qreply(), and returns the copy of mpctl taken beforehand (NULL if
            * copymsg() failed).
            */
9943 9953  static mblk_t *
9944 9954  ip_snmp_get_mib2_igmp(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
9945 9955  {
9946 9956          struct opthdr           *optp;
9947 9957          mblk_t                  *mp2ctl;
9948 9958  
9949 9959          /*
9950 9960           * make a copy of the original message
9951 9961           */
9952 9962          mp2ctl = copymsg(mpctl);
9953 9963  
9954 9964          optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
9955 9965          optp->level = EXPER_IGMP;
9956 9966          optp->name = 0;
                   /* An append failure is only logged; the reply is sent regardless. */
9957 9967          if (!snmp_append_data(mpctl->b_cont, (char *)&ipst->ips_igmpstat,
9958 9968              (int)sizeof (ipst->ips_igmpstat))) {
9959 9969                  ip1dbg(("ip_snmp_get_mib2_igmp: failed to allocate %u bytes\n",
9960 9970                      (uint_t)sizeof (ipst->ips_igmpstat)));
9961 9971          }
9962 9972          optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
9963 9973          ip3dbg(("ip_snmp_get_mib2_igmp: level %d, name %d, len %d\n",
9964 9974              (int)optp->level, (int)optp->name, (int)optp->len));
9965 9975          qreply(q, mpctl);
9966 9976          return (mp2ctl);
9967 9977  }
9968 9978  
9969 9979  /* Global IPv4 Multicast Routing statistics */
           /*
            * Has ip_mroute_stats() fill mpctl's data block, labels the reply as level
            * EXPER_DVMRP, name 0, sends it with qreply(), and returns the copy of
            * mpctl taken beforehand (NULL if copymsg() failed).
            */
9970 9980  static mblk_t *
9971 9981  ip_snmp_get_mib2_multi(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
9972 9982  {
9973 9983          struct opthdr           *optp;
9974 9984          mblk_t                  *mp2ctl;
9975 9985  
9976 9986          /*
9977 9987           * make a copy of the original message
9978 9988           */
9979 9989          mp2ctl = copymsg(mpctl);
9980 9990  
9981 9991          optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
9982 9992          optp->level = EXPER_DVMRP;
9983 9993          optp->name = 0;
                   /* A failure is only logged; the reply is sent regardless. */
9984 9994          if (!ip_mroute_stats(mpctl->b_cont, ipst)) {
9985 9995                  ip0dbg(("ip_mroute_stats: failed\n"));
9986 9996          }
9987 9997          optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
9988 9998          ip3dbg(("ip_snmp_get_mib2_multi: level %d, name %d, len %d\n",
9989 9999              (int)optp->level, (int)optp->name, (int)optp->len));
9990 10000          qreply(q, mpctl);
9991 10001          return (mp2ctl);
9992 10002  }
9993 10003  
9994 10004  /* IPv4 address information */
9995 10005  static mblk_t *
9996 10006  ip_snmp_get_mib2_ip_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst,
9997 10007      boolean_t legacy_req)
9998 10008  {
9999 10009          struct opthdr           *optp;
10000 10010          mblk_t                  *mp2ctl;
10001 10011          mblk_t                  *mp_tail = NULL;
10002 10012          ill_t                   *ill;
10003 10013          ipif_t                  *ipif;
10004 10014          uint_t                  bitval;
10005 10015          mib2_ipAddrEntry_t      mae;
10006 10016          size_t                  mae_size;
10007 10017          zoneid_t                zoneid;
10008 10018          ill_walk_context_t      ctx;
10009 10019  
10010 10020          /*
10011 10021           * make a copy of the original message
10012 10022           */
10013 10023          mp2ctl = copymsg(mpctl);
10014 10024  
10015 10025          mae_size = (legacy_req) ? LEGACY_MIB_SIZE(&mae, mib2_ipAddrEntry_t) :
10016 10026              sizeof (mib2_ipAddrEntry_t);
10017 10027  
10018 10028          /* ipAddrEntryTable */
10019 10029  
10020 10030          optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10021 10031          optp->level = MIB2_IP;
10022 10032          optp->name = MIB2_IP_ADDR;
10023 10033          zoneid = Q_TO_CONN(q)->conn_zoneid;
10024 10034  
10025 10035          rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10026 10036          ill = ILL_START_WALK_V4(&ctx, ipst);
10027 10037          for (; ill != NULL; ill = ill_next(&ctx, ill)) {
10028 10038                  for (ipif = ill->ill_ipif; ipif != NULL;
10029 10039                      ipif = ipif->ipif_next) {
10030 10040                          if (ipif->ipif_zoneid != zoneid &&
10031 10041                              ipif->ipif_zoneid != ALL_ZONES)
10032 10042                                  continue;
10033 10043                          /* Sum of count from dead IRE_LO* and our current */
10034 10044                          mae.ipAdEntInfo.ae_ibcnt = ipif->ipif_ib_pkt_count;
10035 10045                          if (ipif->ipif_ire_local != NULL) {
10036 10046                                  mae.ipAdEntInfo.ae_ibcnt +=
10037 10047                                      ipif->ipif_ire_local->ire_ib_pkt_count;
10038 10048                          }
10039 10049                          mae.ipAdEntInfo.ae_obcnt = 0;
10040 10050                          mae.ipAdEntInfo.ae_focnt = 0;
10041 10051  
10042 10052                          ipif_get_name(ipif, mae.ipAdEntIfIndex.o_bytes,
10043 10053                              OCTET_LENGTH);
10044 10054                          mae.ipAdEntIfIndex.o_length =
10045 10055                              mi_strlen(mae.ipAdEntIfIndex.o_bytes);
10046 10056                          mae.ipAdEntAddr = ipif->ipif_lcl_addr;
10047 10057                          mae.ipAdEntNetMask = ipif->ipif_net_mask;
10048 10058                          mae.ipAdEntInfo.ae_subnet = ipif->ipif_subnet;
10049 10059                          mae.ipAdEntInfo.ae_subnet_len =
10050 10060                              ip_mask_to_plen(ipif->ipif_net_mask);
10051 10061                          mae.ipAdEntInfo.ae_src_addr = ipif->ipif_lcl_addr;
10052 10062                          for (bitval = 1;
10053 10063                              bitval &&
10054 10064                              !(bitval & ipif->ipif_brd_addr);
10055 10065                              bitval <<= 1)
10056 10066                                  noop;
10057 10067                          mae.ipAdEntBcastAddr = bitval;
10058 10068                          mae.ipAdEntReasmMaxSize = IP_MAXPACKET;
10059 10069                          mae.ipAdEntInfo.ae_mtu = ipif->ipif_ill->ill_mtu;
10060 10070                          mae.ipAdEntInfo.ae_metric  = ipif->ipif_ill->ill_metric;
10061 10071                          mae.ipAdEntInfo.ae_broadcast_addr =
10062 10072                              ipif->ipif_brd_addr;
10063 10073                          mae.ipAdEntInfo.ae_pp_dst_addr =
10064 10074                              ipif->ipif_pp_dst_addr;
10065 10075                          mae.ipAdEntInfo.ae_flags = ipif->ipif_flags |
10066 10076                              ill->ill_flags | ill->ill_phyint->phyint_flags;
10067 10077                          mae.ipAdEntRetransmitTime =
10068 10078                              ill->ill_reachable_retrans_time;
10069 10079  
10070 10080                          if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
10071 10081                              (char *)&mae, (int)mae_size)) {
10072 10082                                  ip1dbg(("ip_snmp_get_mib2_ip_addr: failed to "
10073 10083                                      "allocate %u bytes\n", (uint_t)mae_size));
10074 10084                          }
10075 10085                  }
10076 10086          }
10077 10087          rw_exit(&ipst->ips_ill_g_lock);
10078 10088  
10079 10089          optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10080 10090          ip3dbg(("ip_snmp_get_mib2_ip_addr: level %d, name %d, len %d\n",
10081 10091              (int)optp->level, (int)optp->name, (int)optp->len));
10082 10092          qreply(q, mpctl);
10083 10093          return (mp2ctl);
10084 10094  }
10085 10095  
10086 10096  /* IPv6 address information */
10087 10097  static mblk_t *
10088 10098  ip_snmp_get_mib2_ip6_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst,
10089 10099      boolean_t legacy_req)
10090 10100  {
10091 10101          struct opthdr           *optp;
10092 10102          mblk_t                  *mp2ctl;
10093 10103          mblk_t                  *mp_tail = NULL;
10094 10104          ill_t                   *ill;
10095 10105          ipif_t                  *ipif;
10096 10106          mib2_ipv6AddrEntry_t    mae6;
10097 10107          size_t                  mae6_size;
10098 10108          zoneid_t                zoneid;
10099 10109          ill_walk_context_t      ctx;
10100 10110  
10101 10111          /*
10102 10112           * make a copy of the original message
10103 10113           */
10104 10114          mp2ctl = copymsg(mpctl);
10105 10115  
10106 10116          mae6_size = (legacy_req) ?
10107 10117              LEGACY_MIB_SIZE(&mae6, mib2_ipv6AddrEntry_t) :
10108 10118              sizeof (mib2_ipv6AddrEntry_t);
10109 10119  
10110 10120          /* ipv6AddrEntryTable */
10111 10121  
10112 10122          optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10113 10123          optp->level = MIB2_IP6;
10114 10124          optp->name = MIB2_IP6_ADDR;
10115 10125          zoneid = Q_TO_CONN(q)->conn_zoneid;
10116 10126  
10117 10127          rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10118 10128          ill = ILL_START_WALK_V6(&ctx, ipst);
10119 10129          for (; ill != NULL; ill = ill_next(&ctx, ill)) {
10120 10130                  for (ipif = ill->ill_ipif; ipif != NULL;
10121 10131                      ipif = ipif->ipif_next) {
10122 10132                          if (ipif->ipif_zoneid != zoneid &&
10123 10133                              ipif->ipif_zoneid != ALL_ZONES)
10124 10134                                  continue;
10125 10135                          /* Sum of count from dead IRE_LO* and our current */
10126 10136                          mae6.ipv6AddrInfo.ae_ibcnt = ipif->ipif_ib_pkt_count;
10127 10137                          if (ipif->ipif_ire_local != NULL) {
10128 10138                                  mae6.ipv6AddrInfo.ae_ibcnt +=
10129 10139                                      ipif->ipif_ire_local->ire_ib_pkt_count;
10130 10140                          }
10131 10141                          mae6.ipv6AddrInfo.ae_obcnt = 0;
10132 10142                          mae6.ipv6AddrInfo.ae_focnt = 0;
10133 10143  
10134 10144                          ipif_get_name(ipif, mae6.ipv6AddrIfIndex.o_bytes,
10135 10145                              OCTET_LENGTH);
10136 10146                          mae6.ipv6AddrIfIndex.o_length =
10137 10147                              mi_strlen(mae6.ipv6AddrIfIndex.o_bytes);
10138 10148                          mae6.ipv6AddrAddress = ipif->ipif_v6lcl_addr;
10139 10149                          mae6.ipv6AddrPfxLength =
10140 10150                              ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
10141 10151                          mae6.ipv6AddrInfo.ae_subnet = ipif->ipif_v6subnet;
10142 10152                          mae6.ipv6AddrInfo.ae_subnet_len =
10143 10153                              mae6.ipv6AddrPfxLength;
10144 10154                          mae6.ipv6AddrInfo.ae_src_addr = ipif->ipif_v6lcl_addr;
10145 10155  
10146 10156                          /* Type: stateless(1), stateful(2), unknown(3) */
10147 10157                          if (ipif->ipif_flags & IPIF_ADDRCONF)
10148 10158                                  mae6.ipv6AddrType = 1;
10149 10159                          else
10150 10160                                  mae6.ipv6AddrType = 2;
10151 10161                          /* Anycast: true(1), false(2) */
10152 10162                          if (ipif->ipif_flags & IPIF_ANYCAST)
10153 10163                                  mae6.ipv6AddrAnycastFlag = 1;
10154 10164                          else
10155 10165                                  mae6.ipv6AddrAnycastFlag = 2;
10156 10166  
10157 10167                          /*
10158 10168                           * Address status: preferred(1), deprecated(2),
10159 10169                           * invalid(3), inaccessible(4), unknown(5)
10160 10170                           */
10161 10171                          if (ipif->ipif_flags & IPIF_NOLOCAL)
10162 10172                                  mae6.ipv6AddrStatus = 3;
10163 10173                          else if (ipif->ipif_flags & IPIF_DEPRECATED)
10164 10174                                  mae6.ipv6AddrStatus = 2;
10165 10175                          else
10166 10176                                  mae6.ipv6AddrStatus = 1;
10167 10177                          mae6.ipv6AddrInfo.ae_mtu = ipif->ipif_ill->ill_mtu;
10168 10178                          mae6.ipv6AddrInfo.ae_metric  =
10169 10179                              ipif->ipif_ill->ill_metric;
10170 10180                          mae6.ipv6AddrInfo.ae_pp_dst_addr =
10171 10181                              ipif->ipif_v6pp_dst_addr;
10172 10182                          mae6.ipv6AddrInfo.ae_flags = ipif->ipif_flags |
10173 10183                              ill->ill_flags | ill->ill_phyint->phyint_flags;
10174 10184                          mae6.ipv6AddrReasmMaxSize = IP_MAXPACKET;
10175 10185                          mae6.ipv6AddrIdentifier = ill->ill_token;
10176 10186                          mae6.ipv6AddrIdentifierLen = ill->ill_token_length;
10177 10187                          mae6.ipv6AddrReachableTime = ill->ill_reachable_time;
10178 10188                          mae6.ipv6AddrRetransmitTime =
10179 10189                              ill->ill_reachable_retrans_time;
10180 10190                          if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
10181 10191                              (char *)&mae6, (int)mae6_size)) {
10182 10192                                  ip1dbg(("ip_snmp_get_mib2_ip6_addr: failed to "
10183 10193                                      "allocate %u bytes\n",
10184 10194                                      (uint_t)mae6_size));
10185 10195                          }
10186 10196                  }
10187 10197          }
10188 10198          rw_exit(&ipst->ips_ill_g_lock);
10189 10199  
10190 10200          optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10191 10201          ip3dbg(("ip_snmp_get_mib2_ip6_addr: level %d, name %d, len %d\n",
10192 10202              (int)optp->level, (int)optp->name, (int)optp->len));
10193 10203          qreply(q, mpctl);
10194 10204          return (mp2ctl);
10195 10205  }
10196 10206  
10197 10207  /* IPv4 multicast group membership. */
10198 10208  static mblk_t *
10199 10209  ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
10200 10210  {
10201 10211          struct opthdr           *optp;
10202 10212          mblk_t                  *mp2ctl;
10203 10213          ill_t                   *ill;
10204 10214          ipif_t                  *ipif;
10205 10215          ilm_t                   *ilm;
10206 10216          ip_member_t             ipm;
10207 10217          mblk_t                  *mp_tail = NULL;
10208 10218          ill_walk_context_t      ctx;
10209 10219          zoneid_t                zoneid;
10210 10220  
10211 10221          /*
10212 10222           * make a copy of the original message
10213 10223           */
10214 10224          mp2ctl = copymsg(mpctl);
10215 10225          zoneid = Q_TO_CONN(q)->conn_zoneid;
10216 10226  
10217 10227          /* ipGroupMember table */
10218 10228          optp = (struct opthdr *)&mpctl->b_rptr[
10219 10229              sizeof (struct T_optmgmt_ack)];
10220 10230          optp->level = MIB2_IP;
10221 10231          optp->name = EXPER_IP_GROUP_MEMBERSHIP;
10222 10232  
10223 10233          rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10224 10234          ill = ILL_START_WALK_V4(&ctx, ipst);
10225 10235          for (; ill != NULL; ill = ill_next(&ctx, ill)) {
10226 10236                  /* Make sure the ill isn't going away. */
10227 10237                  if (!ill_check_and_refhold(ill))
10228 10238                          continue;
10229 10239                  rw_exit(&ipst->ips_ill_g_lock);
10230 10240                  rw_enter(&ill->ill_mcast_lock, RW_READER);
10231 10241                  for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
10232 10242                          if (ilm->ilm_zoneid != zoneid &&
10233 10243                              ilm->ilm_zoneid != ALL_ZONES)
10234 10244                                  continue;
10235 10245  
10236 10246                          /* Is there an ipif for ilm_ifaddr? */
10237 10247                          for (ipif = ill->ill_ipif; ipif != NULL;
10238 10248                              ipif = ipif->ipif_next) {
10239 10249                                  if (!IPIF_IS_CONDEMNED(ipif) &&
10240 10250                                      ipif->ipif_lcl_addr == ilm->ilm_ifaddr &&
10241 10251                                      ilm->ilm_ifaddr != INADDR_ANY)
10242 10252                                          break;
10243 10253                          }
10244 10254                          if (ipif != NULL) {
10245 10255                                  ipif_get_name(ipif,
10246 10256                                      ipm.ipGroupMemberIfIndex.o_bytes,
10247 10257                                      OCTET_LENGTH);
10248 10258                          } else {
10249 10259                                  ill_get_name(ill,
10250 10260                                      ipm.ipGroupMemberIfIndex.o_bytes,
10251 10261                                      OCTET_LENGTH);
10252 10262                          }
10253 10263                          ipm.ipGroupMemberIfIndex.o_length =
10254 10264                              mi_strlen(ipm.ipGroupMemberIfIndex.o_bytes);
10255 10265  
10256 10266                          ipm.ipGroupMemberAddress = ilm->ilm_addr;
10257 10267                          ipm.ipGroupMemberRefCnt = ilm->ilm_refcnt;
10258 10268                          ipm.ipGroupMemberFilterMode = ilm->ilm_fmode;
10259 10269                          if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
10260 10270                              (char *)&ipm, (int)sizeof (ipm))) {
10261 10271                                  ip1dbg(("ip_snmp_get_mib2_ip_group: "
10262 10272                                      "failed to allocate %u bytes\n",
10263 10273                                      (uint_t)sizeof (ipm)));
10264 10274                          }
10265 10275                  }
10266 10276                  rw_exit(&ill->ill_mcast_lock);
10267 10277                  ill_refrele(ill);
10268 10278                  rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10269 10279          }
10270 10280          rw_exit(&ipst->ips_ill_g_lock);
10271 10281          optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10272 10282          ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
10273 10283              (int)optp->level, (int)optp->name, (int)optp->len));
10274 10284          qreply(q, mpctl);
10275 10285          return (mp2ctl);
10276 10286  }
10277 10287  
10278 10288  /* IPv6 multicast group membership. */
10279 10289  static mblk_t *
10280 10290  ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
10281 10291  {
10282 10292          struct opthdr           *optp;
10283 10293          mblk_t                  *mp2ctl;
10284 10294          ill_t                   *ill;
10285 10295          ilm_t                   *ilm;
10286 10296          ipv6_member_t           ipm6;
10287 10297          mblk_t                  *mp_tail = NULL;
10288 10298          ill_walk_context_t      ctx;
10289 10299          zoneid_t                zoneid;
10290 10300  
10291 10301          /*
10292 10302           * make a copy of the original message
10293 10303           */
10294 10304          mp2ctl = copymsg(mpctl);
10295 10305          zoneid = Q_TO_CONN(q)->conn_zoneid;
10296 10306  
10297 10307          /* ip6GroupMember table */
10298 10308          optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10299 10309          optp->level = MIB2_IP6;
10300 10310          optp->name = EXPER_IP6_GROUP_MEMBERSHIP;
10301 10311  
10302 10312          rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10303 10313          ill = ILL_START_WALK_V6(&ctx, ipst);
10304 10314          for (; ill != NULL; ill = ill_next(&ctx, ill)) {
10305 10315                  /* Make sure the ill isn't going away. */
10306 10316                  if (!ill_check_and_refhold(ill))
10307 10317                          continue;
10308 10318                  rw_exit(&ipst->ips_ill_g_lock);
10309 10319                  /*
10310 10320                   * Normally we don't have any members on under IPMP interfaces.
10311 10321                   * We report them as a debugging aid.
10312 10322                   */
10313 10323                  rw_enter(&ill->ill_mcast_lock, RW_READER);
10314 10324                  ipm6.ipv6GroupMemberIfIndex = ill->ill_phyint->phyint_ifindex;
10315 10325                  for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
10316 10326                          if (ilm->ilm_zoneid != zoneid &&
10317 10327                              ilm->ilm_zoneid != ALL_ZONES)
10318 10328                                  continue;       /* not this zone */
10319 10329                          ipm6.ipv6GroupMemberAddress = ilm->ilm_v6addr;
10320 10330                          ipm6.ipv6GroupMemberRefCnt = ilm->ilm_refcnt;
10321 10331                          ipm6.ipv6GroupMemberFilterMode = ilm->ilm_fmode;
10322 10332                          if (!snmp_append_data2(mpctl->b_cont,
10323 10333                              &mp_tail,
10324 10334                              (char *)&ipm6, (int)sizeof (ipm6))) {
10325 10335                                  ip1dbg(("ip_snmp_get_mib2_ip6_group: "
10326 10336                                      "failed to allocate %u bytes\n",
10327 10337                                      (uint_t)sizeof (ipm6)));
10328 10338                          }
10329 10339                  }
10330 10340                  rw_exit(&ill->ill_mcast_lock);
10331 10341                  ill_refrele(ill);
10332 10342                  rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10333 10343          }
10334 10344          rw_exit(&ipst->ips_ill_g_lock);
10335 10345  
10336 10346          optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10337 10347          ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
10338 10348              (int)optp->level, (int)optp->name, (int)optp->len));
10339 10349          qreply(q, mpctl);
10340 10350          return (mp2ctl);
10341 10351  }
10342 10352  
10343 10353  /* IP multicast filtered sources */
10344 10354  static mblk_t *
10345 10355  ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
10346 10356  {
10347 10357          struct opthdr           *optp;
10348 10358          mblk_t                  *mp2ctl;
10349 10359          ill_t                   *ill;
10350 10360          ipif_t                  *ipif;
10351 10361          ilm_t                   *ilm;
10352 10362          ip_grpsrc_t             ips;
10353 10363          mblk_t                  *mp_tail = NULL;
10354 10364          ill_walk_context_t      ctx;
10355 10365          zoneid_t                zoneid;
10356 10366          int                     i;
10357 10367          slist_t                 *sl;
10358 10368  
10359 10369          /*
10360 10370           * make a copy of the original message
10361 10371           */
10362 10372          mp2ctl = copymsg(mpctl);
10363 10373          zoneid = Q_TO_CONN(q)->conn_zoneid;
10364 10374  
10365 10375          /* ipGroupSource table */
10366 10376          optp = (struct opthdr *)&mpctl->b_rptr[
10367 10377              sizeof (struct T_optmgmt_ack)];
10368 10378          optp->level = MIB2_IP;
10369 10379          optp->name = EXPER_IP_GROUP_SOURCES;
10370 10380  
10371 10381          rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10372 10382          ill = ILL_START_WALK_V4(&ctx, ipst);
10373 10383          for (; ill != NULL; ill = ill_next(&ctx, ill)) {
10374 10384                  /* Make sure the ill isn't going away. */
10375 10385                  if (!ill_check_and_refhold(ill))
10376 10386                          continue;
10377 10387                  rw_exit(&ipst->ips_ill_g_lock);
10378 10388                  rw_enter(&ill->ill_mcast_lock, RW_READER);
10379 10389                  for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
10380 10390                          sl = ilm->ilm_filter;
10381 10391                          if (ilm->ilm_zoneid != zoneid &&
10382 10392                              ilm->ilm_zoneid != ALL_ZONES)
10383 10393                                  continue;
10384 10394                          if (SLIST_IS_EMPTY(sl))
10385 10395                                  continue;
10386 10396  
10387 10397                          /* Is there an ipif for ilm_ifaddr? */
10388 10398                          for (ipif = ill->ill_ipif; ipif != NULL;
10389 10399                              ipif = ipif->ipif_next) {
10390 10400                                  if (!IPIF_IS_CONDEMNED(ipif) &&
10391 10401                                      ipif->ipif_lcl_addr == ilm->ilm_ifaddr &&
10392 10402                                      ilm->ilm_ifaddr != INADDR_ANY)
10393 10403                                          break;
10394 10404                          }
10395 10405                          if (ipif != NULL) {
10396 10406                                  ipif_get_name(ipif,
10397 10407                                      ips.ipGroupSourceIfIndex.o_bytes,
10398 10408                                      OCTET_LENGTH);
10399 10409                          } else {
10400 10410                                  ill_get_name(ill,
10401 10411                                      ips.ipGroupSourceIfIndex.o_bytes,
10402 10412                                      OCTET_LENGTH);
10403 10413                          }
10404 10414                          ips.ipGroupSourceIfIndex.o_length =
10405 10415                              mi_strlen(ips.ipGroupSourceIfIndex.o_bytes);
10406 10416  
10407 10417                          ips.ipGroupSourceGroup = ilm->ilm_addr;
10408 10418                          for (i = 0; i < sl->sl_numsrc; i++) {
10409 10419                                  if (!IN6_IS_ADDR_V4MAPPED(&sl->sl_addr[i]))
10410 10420                                          continue;
10411 10421                                  IN6_V4MAPPED_TO_IPADDR(&sl->sl_addr[i],
10412 10422                                      ips.ipGroupSourceAddress);
10413 10423                                  if (snmp_append_data2(mpctl->b_cont, &mp_tail,
10414 10424                                      (char *)&ips, (int)sizeof (ips)) == 0) {
10415 10425                                          ip1dbg(("ip_snmp_get_mib2_ip_group_src:"
10416 10426                                              " failed to allocate %u bytes\n",
10417 10427                                              (uint_t)sizeof (ips)));
10418 10428                                  }
10419 10429                          }
10420 10430                  }
10421 10431                  rw_exit(&ill->ill_mcast_lock);
10422 10432                  ill_refrele(ill);
10423 10433                  rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10424 10434          }
10425 10435          rw_exit(&ipst->ips_ill_g_lock);
10426 10436          optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10427 10437          ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
10428 10438              (int)optp->level, (int)optp->name, (int)optp->len));
10429 10439          qreply(q, mpctl);
10430 10440          return (mp2ctl);
10431 10441  }
10432 10442  
10433 10443  /* IPv6 multicast filtered sources. */
10434 10444  static mblk_t *
10435 10445  ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
10436 10446  {
10437 10447          struct opthdr           *optp;
10438 10448          mblk_t                  *mp2ctl;
10439 10449          ill_t                   *ill;
10440 10450          ilm_t                   *ilm;
10441 10451          ipv6_grpsrc_t           ips6;
10442 10452          mblk_t                  *mp_tail = NULL;
10443 10453          ill_walk_context_t      ctx;
10444 10454          zoneid_t                zoneid;
10445 10455          int                     i;
10446 10456          slist_t                 *sl;
10447 10457  
10448 10458          /*
10449 10459           * make a copy of the original message
10450 10460           */
10451 10461          mp2ctl = copymsg(mpctl);
10452 10462          zoneid = Q_TO_CONN(q)->conn_zoneid;
10453 10463  
10454 10464          /* ip6GroupMember table */
10455 10465          optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10456 10466          optp->level = MIB2_IP6;
10457 10467          optp->name = EXPER_IP6_GROUP_SOURCES;
10458 10468  
10459 10469          rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10460 10470          ill = ILL_START_WALK_V6(&ctx, ipst);
10461 10471          for (; ill != NULL; ill = ill_next(&ctx, ill)) {
10462 10472                  /* Make sure the ill isn't going away. */
10463 10473                  if (!ill_check_and_refhold(ill))
10464 10474                          continue;
10465 10475                  rw_exit(&ipst->ips_ill_g_lock);
10466 10476                  /*
10467 10477                   * Normally we don't have any members on under IPMP interfaces.
10468 10478                   * We report them as a debugging aid.
10469 10479                   */
10470 10480                  rw_enter(&ill->ill_mcast_lock, RW_READER);
10471 10481                  ips6.ipv6GroupSourceIfIndex = ill->ill_phyint->phyint_ifindex;
10472 10482                  for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
10473 10483                          sl = ilm->ilm_filter;
10474 10484                          if (ilm->ilm_zoneid != zoneid &&
10475 10485                              ilm->ilm_zoneid != ALL_ZONES)
10476 10486                                  continue;
10477 10487                          if (SLIST_IS_EMPTY(sl))
10478 10488                                  continue;
10479 10489                          ips6.ipv6GroupSourceGroup = ilm->ilm_v6addr;
10480 10490                          for (i = 0; i < sl->sl_numsrc; i++) {
10481 10491                                  ips6.ipv6GroupSourceAddress = sl->sl_addr[i];
10482 10492                                  if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
10483 10493                                      (char *)&ips6, (int)sizeof (ips6))) {
10484 10494                                          ip1dbg(("ip_snmp_get_mib2_ip6_"
10485 10495                                              "group_src: failed to allocate "
10486 10496                                              "%u bytes\n",
10487 10497                                              (uint_t)sizeof (ips6)));
10488 10498                                  }
10489 10499                          }
10490 10500                  }
10491 10501                  rw_exit(&ill->ill_mcast_lock);
10492 10502                  ill_refrele(ill);
10493 10503                  rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10494 10504          }
10495 10505          rw_exit(&ipst->ips_ill_g_lock);
10496 10506  
10497 10507          optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10498 10508          ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
10499 10509              (int)optp->level, (int)optp->name, (int)optp->len));
10500 10510          qreply(q, mpctl);
10501 10511          return (mp2ctl);
10502 10512  }
10503 10513  
10504 10514  /* Multicast routing virtual interface table. */
10505 10515  static mblk_t *
10506 10516  ip_snmp_get_mib2_virt_multi(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
10507 10517  {
10508 10518          struct opthdr           *optp;
10509 10519          mblk_t                  *mp2ctl;
10510 10520  
10511 10521          /*
10512 10522           * make a copy of the original message
10513 10523           */
10514 10524          mp2ctl = copymsg(mpctl);
10515 10525  
10516 10526          optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10517 10527          optp->level = EXPER_DVMRP;
10518 10528          optp->name = EXPER_DVMRP_VIF;
10519 10529          if (!ip_mroute_vif(mpctl->b_cont, ipst)) {
10520 10530                  ip0dbg(("ip_mroute_vif: failed\n"));
10521 10531          }
10522 10532          optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10523 10533          ip3dbg(("ip_snmp_get_mib2_virt_multi: level %d, name %d, len %d\n",
10524 10534              (int)optp->level, (int)optp->name, (int)optp->len));
10525 10535          qreply(q, mpctl);
10526 10536          return (mp2ctl);
10527 10537  }
10528 10538  
10529 10539  /* Multicast routing table. */
10530 10540  static mblk_t *
10531 10541  ip_snmp_get_mib2_multi_rtable(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
10532 10542  {
10533 10543          struct opthdr           *optp;
10534 10544          mblk_t                  *mp2ctl;
10535 10545  
10536 10546          /*
10537 10547           * make a copy of the original message
10538 10548           */
10539 10549          mp2ctl = copymsg(mpctl);
10540 10550  
10541 10551          optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10542 10552          optp->level = EXPER_DVMRP;
10543 10553          optp->name = EXPER_DVMRP_MRT;
10544 10554          if (!ip_mroute_mrt(mpctl->b_cont, ipst)) {
10545 10555                  ip0dbg(("ip_mroute_mrt: failed\n"));
10546 10556          }
10547 10557          optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10548 10558          ip3dbg(("ip_snmp_get_mib2_multi_rtable: level %d, name %d, len %d\n",
10549 10559              (int)optp->level, (int)optp->name, (int)optp->len));
10550 10560          qreply(q, mpctl);
10551 10561          return (mp2ctl);
10552 10562  }
10553 10563  
10554 10564  /*
10555 10565   * Return ipRouteEntryTable, ipNetToMediaEntryTable, and ipRouteAttributeTable
10556 10566   * in one IRE walk.
10557 10567   */
10558 10568  static mblk_t *
10559 10569  ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, int level,
10560 10570      ip_stack_t *ipst)
10561 10571  {
10562 10572          struct opthdr   *optp;
10563 10573          mblk_t          *mp2ctl;        /* Returned */
10564 10574          mblk_t          *mp3ctl;        /* nettomedia */
10565 10575          mblk_t          *mp4ctl;        /* routeattrs */
10566 10576          iproutedata_t   ird;
10567 10577          zoneid_t        zoneid;
10568 10578  
10569 10579          /*
10570 10580           * make copies of the original message
10571 10581           *      - mp2ctl is returned unchanged to the caller for its use
10572 10582           *      - mpctl is sent upstream as ipRouteEntryTable
10573 10583           *      - mp3ctl is sent upstream as ipNetToMediaEntryTable
10574 10584           *      - mp4ctl is sent upstream as ipRouteAttributeTable
10575 10585           */
10576 10586          mp2ctl = copymsg(mpctl);
10577 10587          mp3ctl = copymsg(mpctl);
10578 10588          mp4ctl = copymsg(mpctl);
10579 10589          if (mp3ctl == NULL || mp4ctl == NULL) {
10580 10590                  freemsg(mp4ctl);
10581 10591                  freemsg(mp3ctl);
10582 10592                  freemsg(mp2ctl);
10583 10593                  freemsg(mpctl);
10584 10594                  return (NULL);
10585 10595          }
10586 10596  
10587 10597          bzero(&ird, sizeof (ird));
10588 10598  
10589 10599          ird.ird_route.lp_head = mpctl->b_cont;
10590 10600          ird.ird_netmedia.lp_head = mp3ctl->b_cont;
10591 10601          ird.ird_attrs.lp_head = mp4ctl->b_cont;
10592 10602          /*
10593 10603           * If the level has been set the special EXPER_IP_AND_ALL_IRES value,
10594 10604           * then also include ire_testhidden IREs and IRE_IF_CLONE.  This is
10595 10605           * intended a temporary solution until a proper MIB API is provided
10596 10606           * that provides complete filtering/caller-opt-in.
10597 10607           */
10598 10608          if (level == EXPER_IP_AND_ALL_IRES)
10599 10609                  ird.ird_flags |= IRD_REPORT_ALL;
10600 10610  
10601 10611          zoneid = Q_TO_CONN(q)->conn_zoneid;
10602 10612          ire_walk_v4(ip_snmp_get2_v4, &ird, zoneid, ipst);
10603 10613  
10604 10614          /* ipRouteEntryTable in mpctl */
10605 10615          optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10606 10616          optp->level = MIB2_IP;
10607 10617          optp->name = MIB2_IP_ROUTE;
10608 10618          optp->len = msgdsize(ird.ird_route.lp_head);
10609 10619          ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n",
10610 10620              (int)optp->level, (int)optp->name, (int)optp->len));
10611 10621          qreply(q, mpctl);
10612 10622  
10613 10623          /* ipNetToMediaEntryTable in mp3ctl */
10614 10624          ncec_walk(NULL, ip_snmp_get2_v4_media, &ird, ipst);
10615 10625  
10616 10626          optp = (struct opthdr *)&mp3ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10617 10627          optp->level = MIB2_IP;
10618 10628          optp->name = MIB2_IP_MEDIA;
10619 10629          optp->len = msgdsize(ird.ird_netmedia.lp_head);
10620 10630          ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n",
10621 10631              (int)optp->level, (int)optp->name, (int)optp->len));
10622 10632          qreply(q, mp3ctl);
10623 10633  
10624 10634          /* ipRouteAttributeTable in mp4ctl */
10625 10635          optp = (struct opthdr *)&mp4ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10626 10636          optp->level = MIB2_IP;
10627 10637          optp->name = EXPER_IP_RTATTR;
10628 10638          optp->len = msgdsize(ird.ird_attrs.lp_head);
10629 10639          ip3dbg(("ip_snmp_get_mib2_ip_route_media: level %d, name %d, len %d\n",
10630 10640              (int)optp->level, (int)optp->name, (int)optp->len));
10631 10641          if (optp->len == 0)
10632 10642                  freemsg(mp4ctl);
10633 10643          else
10634 10644                  qreply(q, mp4ctl);
10635 10645  
10636 10646          return (mp2ctl);
10637 10647  }
10638 10648  
10639 10649  /*
10640 10650   * Return ipv6RouteEntryTable and ipv6RouteAttributeTable in one IRE walk, and
10641 10651   * ipv6NetToMediaEntryTable in an NDP walk.
10642 10652   */
10643 10653  static mblk_t *
10644 10654  ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl, int level,
10645 10655      ip_stack_t *ipst)
10646 10656  {
10647 10657          struct opthdr   *optp;
10648 10658          mblk_t          *mp2ctl;        /* Returned */
10649 10659          mblk_t          *mp3ctl;        /* nettomedia */
10650 10660          mblk_t          *mp4ctl;        /* routeattrs */
10651 10661          iproutedata_t   ird;
10652 10662          zoneid_t        zoneid;
10653 10663  
10654 10664          /*
10655 10665           * make copies of the original message
10656 10666           *      - mp2ctl is returned unchanged to the caller for its use
10657 10667           *      - mpctl is sent upstream as ipv6RouteEntryTable
10658 10668           *      - mp3ctl is sent upstream as ipv6NetToMediaEntryTable
10659 10669           *      - mp4ctl is sent upstream as ipv6RouteAttributeTable
10660 10670           */
10661 10671          mp2ctl = copymsg(mpctl);
10662 10672          mp3ctl = copymsg(mpctl);
10663 10673          mp4ctl = copymsg(mpctl);
10664 10674          if (mp3ctl == NULL || mp4ctl == NULL) {
10665 10675                  freemsg(mp4ctl);
10666 10676                  freemsg(mp3ctl);
10667 10677                  freemsg(mp2ctl);
10668 10678                  freemsg(mpctl);
10669 10679                  return (NULL);
10670 10680          }
10671 10681  
10672 10682          bzero(&ird, sizeof (ird));
10673 10683  
10674 10684          ird.ird_route.lp_head = mpctl->b_cont;
10675 10685          ird.ird_netmedia.lp_head = mp3ctl->b_cont;
10676 10686          ird.ird_attrs.lp_head = mp4ctl->b_cont;
10677 10687          /*
10678 10688           * If the level has been set the special EXPER_IP_AND_ALL_IRES value,
10679 10689           * then also include ire_testhidden IREs and IRE_IF_CLONE.  This is
10680 10690           * intended a temporary solution until a proper MIB API is provided
10681 10691           * that provides complete filtering/caller-opt-in.
10682 10692           */
10683 10693          if (level == EXPER_IP_AND_ALL_IRES)
10684 10694                  ird.ird_flags |= IRD_REPORT_ALL;
10685 10695  
10686 10696          zoneid = Q_TO_CONN(q)->conn_zoneid;
10687 10697          ire_walk_v6(ip_snmp_get2_v6_route, &ird, zoneid, ipst);
10688 10698  
10689 10699          optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10690 10700          optp->level = MIB2_IP6;
10691 10701          optp->name = MIB2_IP6_ROUTE;
10692 10702          optp->len = msgdsize(ird.ird_route.lp_head);
10693 10703          ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n",
10694 10704              (int)optp->level, (int)optp->name, (int)optp->len));
10695 10705          qreply(q, mpctl);
10696 10706  
10697 10707          /* ipv6NetToMediaEntryTable in mp3ctl */
10698 10708          ncec_walk(NULL, ip_snmp_get2_v6_media, &ird, ipst);
10699 10709  
10700 10710          optp = (struct opthdr *)&mp3ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10701 10711          optp->level = MIB2_IP6;
10702 10712          optp->name = MIB2_IP6_MEDIA;
10703 10713          optp->len = msgdsize(ird.ird_netmedia.lp_head);
10704 10714          ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n",
10705 10715              (int)optp->level, (int)optp->name, (int)optp->len));
10706 10716          qreply(q, mp3ctl);
10707 10717  
10708 10718          /* ipv6RouteAttributeTable in mp4ctl */
10709 10719          optp = (struct opthdr *)&mp4ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10710 10720          optp->level = MIB2_IP6;
10711 10721          optp->name = EXPER_IP_RTATTR;
10712 10722          optp->len = msgdsize(ird.ird_attrs.lp_head);
10713 10723          ip3dbg(("ip_snmp_get_mib2_ip6_route_media: level %d, name %d, len %d\n",
10714 10724              (int)optp->level, (int)optp->name, (int)optp->len));
10715 10725          if (optp->len == 0)
10716 10726                  freemsg(mp4ctl);
10717 10727          else
10718 10728                  qreply(q, mp4ctl);
10719 10729  
10720 10730          return (mp2ctl);
10721 10731  }
10722 10732  
10723 10733  /*
10724 10734   * IPv6 mib: One per ill
10725 10735   */
10726 10736  static mblk_t *
10727 10737  ip_snmp_get_mib2_ip6(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst,
10728 10738      boolean_t legacy_req)
10729 10739  {
10730 10740          struct opthdr           *optp;
10731 10741          mblk_t                  *mp2ctl;
10732 10742          ill_t                   *ill;
10733 10743          ill_walk_context_t      ctx;
10734 10744          mblk_t                  *mp_tail = NULL;
10735 10745          mib2_ipv6AddrEntry_t    mae6;
10736 10746          mib2_ipIfStatsEntry_t   *ise;
10737 10747          size_t                  ise_size, iae_size;
10738 10748  
10739 10749          /*
10740 10750           * Make a copy of the original message
10741 10751           */
10742 10752          mp2ctl = copymsg(mpctl);
10743 10753  
10744 10754          /* fixed length IPv6 structure ... */
10745 10755  
10746 10756          if (legacy_req) {
10747 10757                  ise_size = LEGACY_MIB_SIZE(&ipst->ips_ip6_mib,
10748 10758                      mib2_ipIfStatsEntry_t);
10749 10759                  iae_size = LEGACY_MIB_SIZE(&mae6, mib2_ipv6AddrEntry_t);
10750 10760          } else {
10751 10761                  ise_size = sizeof (mib2_ipIfStatsEntry_t);
10752 10762                  iae_size = sizeof (mib2_ipv6AddrEntry_t);
10753 10763          }
10754 10764  
10755 10765          optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10756 10766          optp->level = MIB2_IP6;
10757 10767          optp->name = 0;
10758 10768          /* Include "unknown interface" ip6_mib */
10759 10769          ipst->ips_ip6_mib.ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6;
10760 10770          ipst->ips_ip6_mib.ipIfStatsIfIndex =
10761 10771              MIB2_UNKNOWN_INTERFACE; /* Flag to netstat */
10762 10772          SET_MIB(ipst->ips_ip6_mib.ipIfStatsForwarding,
10763 10773              ipst->ips_ipv6_forwarding ? 1 : 2);
10764 10774          SET_MIB(ipst->ips_ip6_mib.ipIfStatsDefaultHopLimit,
10765 10775              ipst->ips_ipv6_def_hops);
10766 10776          SET_MIB(ipst->ips_ip6_mib.ipIfStatsEntrySize,
10767 10777              sizeof (mib2_ipIfStatsEntry_t));
10768 10778          SET_MIB(ipst->ips_ip6_mib.ipIfStatsAddrEntrySize,
10769 10779              sizeof (mib2_ipv6AddrEntry_t));
10770 10780          SET_MIB(ipst->ips_ip6_mib.ipIfStatsRouteEntrySize,
10771 10781              sizeof (mib2_ipv6RouteEntry_t));
10772 10782          SET_MIB(ipst->ips_ip6_mib.ipIfStatsNetToMediaEntrySize,
10773 10783              sizeof (mib2_ipv6NetToMediaEntry_t));
10774 10784          SET_MIB(ipst->ips_ip6_mib.ipIfStatsMemberEntrySize,
10775 10785              sizeof (ipv6_member_t));
10776 10786          SET_MIB(ipst->ips_ip6_mib.ipIfStatsGroupSourceEntrySize,
10777 10787              sizeof (ipv6_grpsrc_t));
10778 10788  
10779 10789          /*
10780 10790           * Synchronize 64- and 32-bit counters
10781 10791           */
10782 10792          SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsInReceives,
10783 10793              ipIfStatsHCInReceives);
10784 10794          SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsInDelivers,
10785 10795              ipIfStatsHCInDelivers);
10786 10796          SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsOutRequests,
10787 10797              ipIfStatsHCOutRequests);
10788 10798          SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsOutForwDatagrams,
10789 10799              ipIfStatsHCOutForwDatagrams);
10790 10800          SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsOutMcastPkts,
10791 10801              ipIfStatsHCOutMcastPkts);
10792 10802          SYNC32_MIB(&ipst->ips_ip6_mib, ipIfStatsInMcastPkts,
10793 10803              ipIfStatsHCInMcastPkts);
10794 10804  
10795 10805          if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
10796 10806              (char *)&ipst->ips_ip6_mib, (int)ise_size)) {
10797 10807                  ip1dbg(("ip_snmp_get_mib2_ip6: failed to allocate %u bytes\n",
10798 10808                      (uint_t)ise_size));
10799 10809          } else if (legacy_req) {
10800 10810                  /* Adjust the EntrySize fields for legacy requests. */
10801 10811                  ise =
10802 10812                      (mib2_ipIfStatsEntry_t *)(mp_tail->b_wptr - (int)ise_size);
10803 10813                  SET_MIB(ise->ipIfStatsEntrySize, ise_size);
10804 10814                  SET_MIB(ise->ipIfStatsAddrEntrySize, iae_size);
10805 10815          }
10806 10816  
10807 10817          rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10808 10818          ill = ILL_START_WALK_V6(&ctx, ipst);
10809 10819          for (; ill != NULL; ill = ill_next(&ctx, ill)) {
10810 10820                  ill->ill_ip_mib->ipIfStatsIfIndex =
10811 10821                      ill->ill_phyint->phyint_ifindex;
10812 10822                  SET_MIB(ill->ill_ip_mib->ipIfStatsForwarding,
10813 10823                      ipst->ips_ipv6_forwarding ? 1 : 2);
10814 10824                  SET_MIB(ill->ill_ip_mib->ipIfStatsDefaultHopLimit,
10815 10825                      ill->ill_max_hops);
10816 10826  
10817 10827                  /*
10818 10828                   * Synchronize 64- and 32-bit counters
10819 10829                   */
10820 10830                  SYNC32_MIB(ill->ill_ip_mib, ipIfStatsInReceives,
10821 10831                      ipIfStatsHCInReceives);
10822 10832                  SYNC32_MIB(ill->ill_ip_mib, ipIfStatsInDelivers,
10823 10833                      ipIfStatsHCInDelivers);
10824 10834                  SYNC32_MIB(ill->ill_ip_mib, ipIfStatsOutRequests,
10825 10835                      ipIfStatsHCOutRequests);
10826 10836                  SYNC32_MIB(ill->ill_ip_mib, ipIfStatsOutForwDatagrams,
10827 10837                      ipIfStatsHCOutForwDatagrams);
10828 10838                  SYNC32_MIB(ill->ill_ip_mib, ipIfStatsOutMcastPkts,
10829 10839                      ipIfStatsHCOutMcastPkts);
10830 10840                  SYNC32_MIB(ill->ill_ip_mib, ipIfStatsInMcastPkts,
10831 10841                      ipIfStatsHCInMcastPkts);
10832 10842  
10833 10843                  if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
10834 10844                      (char *)ill->ill_ip_mib, (int)ise_size)) {
10835 10845                          ip1dbg(("ip_snmp_get_mib2_ip6: failed to allocate "
10836 10846                          "%u bytes\n", (uint_t)ise_size));
10837 10847                  } else if (legacy_req) {
10838 10848                          /* Adjust the EntrySize fields for legacy requests. */
10839 10849                          ise = (mib2_ipIfStatsEntry_t *)(mp_tail->b_wptr -
10840 10850                              (int)ise_size);
10841 10851                          SET_MIB(ise->ipIfStatsEntrySize, ise_size);
10842 10852                          SET_MIB(ise->ipIfStatsAddrEntrySize, iae_size);
10843 10853                  }
10844 10854          }
10845 10855          rw_exit(&ipst->ips_ill_g_lock);
10846 10856  
10847 10857          optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10848 10858          ip3dbg(("ip_snmp_get_mib2_ip6: level %d, name %d, len %d\n",
10849 10859              (int)optp->level, (int)optp->name, (int)optp->len));
10850 10860          qreply(q, mpctl);
10851 10861          return (mp2ctl);
10852 10862  }
10853 10863  
10854 10864  /*
10855 10865   * ICMPv6 mib: One per ill
10856 10866   */
10857 10867  static mblk_t *
10858 10868  ip_snmp_get_mib2_icmp6(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
10859 10869  {
10860 10870          struct opthdr           *optp;
10861 10871          mblk_t                  *mp2ctl;
10862 10872          ill_t                   *ill;
10863 10873          ill_walk_context_t      ctx;
10864 10874          mblk_t                  *mp_tail = NULL;
10865 10875          /*
10866 10876           * Make a copy of the original message
10867 10877           */
10868 10878          mp2ctl = copymsg(mpctl);
10869 10879  
10870 10880          /* fixed length ICMPv6 structure ... */
10871 10881  
10872 10882          optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)];
10873 10883          optp->level = MIB2_ICMP6;
10874 10884          optp->name = 0;
10875 10885          /* Include "unknown interface" icmp6_mib */
10876 10886          ipst->ips_icmp6_mib.ipv6IfIcmpIfIndex =
10877 10887              MIB2_UNKNOWN_INTERFACE; /* netstat flag */
10878 10888          ipst->ips_icmp6_mib.ipv6IfIcmpEntrySize =
10879 10889              sizeof (mib2_ipv6IfIcmpEntry_t);
10880 10890          if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
10881 10891              (char *)&ipst->ips_icmp6_mib,
10882 10892              (int)sizeof (ipst->ips_icmp6_mib))) {
10883 10893                  ip1dbg(("ip_snmp_get_mib2_icmp6: failed to allocate %u bytes\n",
10884 10894                      (uint_t)sizeof (ipst->ips_icmp6_mib)));
10885 10895          }
10886 10896  
10887 10897          rw_enter(&ipst->ips_ill_g_lock, RW_READER);
10888 10898          ill = ILL_START_WALK_V6(&ctx, ipst);
10889 10899          for (; ill != NULL; ill = ill_next(&ctx, ill)) {
10890 10900                  ill->ill_icmp6_mib->ipv6IfIcmpIfIndex =
10891 10901                      ill->ill_phyint->phyint_ifindex;
10892 10902                  if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
10893 10903                      (char *)ill->ill_icmp6_mib,
10894 10904                      (int)sizeof (*ill->ill_icmp6_mib))) {
10895 10905                          ip1dbg(("ip_snmp_get_mib2_icmp6: failed to allocate "
10896 10906                              "%u bytes\n",
10897 10907                              (uint_t)sizeof (*ill->ill_icmp6_mib)));
10898 10908                  }
10899 10909          }
10900 10910          rw_exit(&ipst->ips_ill_g_lock);
10901 10911  
10902 10912          optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
10903 10913          ip3dbg(("ip_snmp_get_mib2_icmp6: level %d, name %d, len %d\n",
10904 10914              (int)optp->level, (int)optp->name, (int)optp->len));
10905 10915          qreply(q, mpctl);
10906 10916          return (mp2ctl);
10907 10917  }
10908 10918  
10909 10919  /*
10910 10920   * ire_walk routine to create both ipRouteEntryTable and
10911 10921   * ipRouteAttributeTable in one IRE walk
10912 10922   */
10913 10923  static void
10914 10924  ip_snmp_get2_v4(ire_t *ire, iproutedata_t *ird)
10915 10925  {
10916 10926          ill_t                           *ill;
10917 10927          mib2_ipRouteEntry_t             *re;
10918 10928          mib2_ipAttributeEntry_t         iaes;
10919 10929          tsol_ire_gw_secattr_t           *attrp;
10920 10930          tsol_gc_t                       *gc = NULL;
10921 10931          tsol_gcgrp_t                    *gcgrp = NULL;
10922 10932          ip_stack_t                      *ipst = ire->ire_ipst;
10923 10933  
10924 10934          ASSERT(ire->ire_ipversion == IPV4_VERSION);
10925 10935  
10926 10936          if (!(ird->ird_flags & IRD_REPORT_ALL)) {
10927 10937                  if (ire->ire_testhidden)
10928 10938                          return;
10929 10939                  if (ire->ire_type & IRE_IF_CLONE)
10930 10940                          return;
10931 10941          }
10932 10942  
10933 10943          if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL)
10934 10944                  return;
10935 10945  
10936 10946          if ((attrp = ire->ire_gw_secattr) != NULL) {
10937 10947                  mutex_enter(&attrp->igsa_lock);
10938 10948                  if ((gc = attrp->igsa_gc) != NULL) {
10939 10949                          gcgrp = gc->gc_grp;
10940 10950                          ASSERT(gcgrp != NULL);
10941 10951                          rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
10942 10952                  }
10943 10953                  mutex_exit(&attrp->igsa_lock);
10944 10954          }
10945 10955          /*
10946 10956           * Return all IRE types for route table... let caller pick and choose
10947 10957           */
10948 10958          re->ipRouteDest = ire->ire_addr;
10949 10959          ill = ire->ire_ill;
10950 10960          re->ipRouteIfIndex.o_length = 0;
10951 10961          if (ill != NULL) {
10952 10962                  ill_get_name(ill, re->ipRouteIfIndex.o_bytes, OCTET_LENGTH);
10953 10963                  re->ipRouteIfIndex.o_length =
10954 10964                      mi_strlen(re->ipRouteIfIndex.o_bytes);
10955 10965          }
10956 10966          re->ipRouteMetric1 = -1;
10957 10967          re->ipRouteMetric2 = -1;
10958 10968          re->ipRouteMetric3 = -1;
10959 10969          re->ipRouteMetric4 = -1;
10960 10970  
10961 10971          re->ipRouteNextHop = ire->ire_gateway_addr;
10962 10972          /* indirect(4), direct(3), or invalid(2) */
10963 10973          if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE))
10964 10974                  re->ipRouteType = 2;
10965 10975          else if (ire->ire_type & IRE_ONLINK)
10966 10976                  re->ipRouteType = 3;
10967 10977          else
10968 10978                  re->ipRouteType = 4;
10969 10979  
10970 10980          re->ipRouteProto = -1;
10971 10981          re->ipRouteAge = gethrestime_sec() - ire->ire_create_time;
10972 10982          re->ipRouteMask = ire->ire_mask;
10973 10983          re->ipRouteMetric5 = -1;
10974 10984          re->ipRouteInfo.re_max_frag = ire->ire_metrics.iulp_mtu;
10975 10985          if (ire->ire_ill != NULL && re->ipRouteInfo.re_max_frag == 0)
10976 10986                  re->ipRouteInfo.re_max_frag = ire->ire_ill->ill_mtu;
10977 10987  
10978 10988          re->ipRouteInfo.re_frag_flag    = 0;
10979 10989          re->ipRouteInfo.re_rtt          = 0;
10980 10990          re->ipRouteInfo.re_src_addr     = 0;
10981 10991          re->ipRouteInfo.re_ref          = ire->ire_refcnt;
10982 10992          re->ipRouteInfo.re_obpkt        = ire->ire_ob_pkt_count;
10983 10993          re->ipRouteInfo.re_ibpkt        = ire->ire_ib_pkt_count;
10984 10994          re->ipRouteInfo.re_flags        = ire->ire_flags;
10985 10995  
10986 10996          /* Add the IRE_IF_CLONE's counters to their parent IRE_INTERFACE */
10987 10997          if (ire->ire_type & IRE_INTERFACE) {
10988 10998                  ire_t *child;
10989 10999  
10990 11000                  rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
10991 11001                  child = ire->ire_dep_children;
10992 11002                  while (child != NULL) {
10993 11003                          re->ipRouteInfo.re_obpkt += child->ire_ob_pkt_count;
10994 11004                          re->ipRouteInfo.re_ibpkt += child->ire_ib_pkt_count;
10995 11005                          child = child->ire_dep_sib_next;
10996 11006                  }
10997 11007                  rw_exit(&ipst->ips_ire_dep_lock);
10998 11008          }
10999 11009  
11000 11010          if (ire->ire_flags & RTF_DYNAMIC) {
11001 11011                  re->ipRouteInfo.re_ire_type     = IRE_HOST_REDIRECT;
11002 11012          } else {
11003 11013                  re->ipRouteInfo.re_ire_type     = ire->ire_type;
11004 11014          }
11005 11015  
11006 11016          if (!snmp_append_data2(ird->ird_route.lp_head, &ird->ird_route.lp_tail,
11007 11017              (char *)re, (int)sizeof (*re))) {
11008 11018                  ip1dbg(("ip_snmp_get2_v4: failed to allocate %u bytes\n",
11009 11019                      (uint_t)sizeof (*re)));
11010 11020          }
11011 11021  
11012 11022          if (gc != NULL) {
11013 11023                  iaes.iae_routeidx = ird->ird_idx;
11014 11024                  iaes.iae_doi = gc->gc_db->gcdb_doi;
11015 11025                  iaes.iae_slrange = gc->gc_db->gcdb_slrange;
11016 11026  
11017 11027                  if (!snmp_append_data2(ird->ird_attrs.lp_head,
11018 11028                      &ird->ird_attrs.lp_tail, (char *)&iaes, sizeof (iaes))) {
11019 11029                          ip1dbg(("ip_snmp_get2_v4: failed to allocate %u "
11020 11030                              "bytes\n", (uint_t)sizeof (iaes)));
11021 11031                  }
11022 11032          }
11023 11033  
11024 11034          /* bump route index for next pass */
11025 11035          ird->ird_idx++;
11026 11036  
11027 11037          kmem_free(re, sizeof (*re));
11028 11038          if (gcgrp != NULL)
11029 11039                  rw_exit(&gcgrp->gcgrp_rwlock);
11030 11040  }
11031 11041  
11032 11042  /*
11033 11043   * ire_walk routine to create ipv6RouteEntryTable and ipRouteEntryTable.
11034 11044   */
11035 11045  static void
11036 11046  ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird)
11037 11047  {
11038 11048          ill_t                           *ill;
11039 11049          mib2_ipv6RouteEntry_t           *re;
11040 11050          mib2_ipAttributeEntry_t         iaes;
11041 11051          tsol_ire_gw_secattr_t           *attrp;
11042 11052          tsol_gc_t                       *gc = NULL;
11043 11053          tsol_gcgrp_t                    *gcgrp = NULL;
11044 11054          ip_stack_t                      *ipst = ire->ire_ipst;
11045 11055  
11046 11056          ASSERT(ire->ire_ipversion == IPV6_VERSION);
11047 11057  
11048 11058          if (!(ird->ird_flags & IRD_REPORT_ALL)) {
11049 11059                  if (ire->ire_testhidden)
11050 11060                          return;
11051 11061                  if (ire->ire_type & IRE_IF_CLONE)
11052 11062                          return;
11053 11063          }
11054 11064  
11055 11065          if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL)
11056 11066                  return;
11057 11067  
11058 11068          if ((attrp = ire->ire_gw_secattr) != NULL) {
11059 11069                  mutex_enter(&attrp->igsa_lock);
11060 11070                  if ((gc = attrp->igsa_gc) != NULL) {
11061 11071                          gcgrp = gc->gc_grp;
11062 11072                          ASSERT(gcgrp != NULL);
11063 11073                          rw_enter(&gcgrp->gcgrp_rwlock, RW_READER);
11064 11074                  }
11065 11075                  mutex_exit(&attrp->igsa_lock);
11066 11076          }
11067 11077          /*
11068 11078           * Return all IRE types for route table... let caller pick and choose
11069 11079           */
11070 11080          re->ipv6RouteDest = ire->ire_addr_v6;
11071 11081          re->ipv6RoutePfxLength = ip_mask_to_plen_v6(&ire->ire_mask_v6);
11072 11082          re->ipv6RouteIndex = 0; /* Unique when multiple with same dest/plen */
11073 11083          re->ipv6RouteIfIndex.o_length = 0;
11074 11084          ill = ire->ire_ill;
11075 11085          if (ill != NULL) {
11076 11086                  ill_get_name(ill, re->ipv6RouteIfIndex.o_bytes, OCTET_LENGTH);
11077 11087                  re->ipv6RouteIfIndex.o_length =
11078 11088                      mi_strlen(re->ipv6RouteIfIndex.o_bytes);
11079 11089          }
11080 11090  
11081 11091          ASSERT(!(ire->ire_type & IRE_BROADCAST));
11082 11092  
11083 11093          mutex_enter(&ire->ire_lock);
11084 11094          re->ipv6RouteNextHop = ire->ire_gateway_addr_v6;
11085 11095          mutex_exit(&ire->ire_lock);
11086 11096  
11087 11097          /* remote(4), local(3), or discard(2) */
11088 11098          if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE))
11089 11099                  re->ipv6RouteType = 2;
11090 11100          else if (ire->ire_type & IRE_ONLINK)
11091 11101                  re->ipv6RouteType = 3;
11092 11102          else
11093 11103                  re->ipv6RouteType = 4;
11094 11104  
11095 11105          re->ipv6RouteProtocol   = -1;
11096 11106          re->ipv6RoutePolicy     = 0;
11097 11107          re->ipv6RouteAge        = gethrestime_sec() - ire->ire_create_time;
11098 11108          re->ipv6RouteNextHopRDI = 0;
11099 11109          re->ipv6RouteWeight     = 0;
11100 11110          re->ipv6RouteMetric     = 0;
11101 11111          re->ipv6RouteInfo.re_max_frag = ire->ire_metrics.iulp_mtu;
11102 11112          if (ire->ire_ill != NULL && re->ipv6RouteInfo.re_max_frag == 0)
11103 11113                  re->ipv6RouteInfo.re_max_frag = ire->ire_ill->ill_mtu;
11104 11114  
11105 11115          re->ipv6RouteInfo.re_frag_flag  = 0;
11106 11116          re->ipv6RouteInfo.re_rtt        = 0;
11107 11117          re->ipv6RouteInfo.re_src_addr   = ipv6_all_zeros;
11108 11118          re->ipv6RouteInfo.re_obpkt      = ire->ire_ob_pkt_count;
11109 11119          re->ipv6RouteInfo.re_ibpkt      = ire->ire_ib_pkt_count;
11110 11120          re->ipv6RouteInfo.re_ref        = ire->ire_refcnt;
11111 11121          re->ipv6RouteInfo.re_flags      = ire->ire_flags;
11112 11122  
11113 11123          /* Add the IRE_IF_CLONE's counters to their parent IRE_INTERFACE */
11114 11124          if (ire->ire_type & IRE_INTERFACE) {
11115 11125                  ire_t *child;
11116 11126  
11117 11127                  rw_enter(&ipst->ips_ire_dep_lock, RW_READER);
11118 11128                  child = ire->ire_dep_children;
11119 11129                  while (child != NULL) {
11120 11130                          re->ipv6RouteInfo.re_obpkt += child->ire_ob_pkt_count;
11121 11131                          re->ipv6RouteInfo.re_ibpkt += child->ire_ib_pkt_count;
11122 11132                          child = child->ire_dep_sib_next;
11123 11133                  }
11124 11134                  rw_exit(&ipst->ips_ire_dep_lock);
11125 11135          }
11126 11136          if (ire->ire_flags & RTF_DYNAMIC) {
11127 11137                  re->ipv6RouteInfo.re_ire_type   = IRE_HOST_REDIRECT;
11128 11138          } else {
11129 11139                  re->ipv6RouteInfo.re_ire_type   = ire->ire_type;
11130 11140          }
11131 11141  
11132 11142          if (!snmp_append_data2(ird->ird_route.lp_head, &ird->ird_route.lp_tail,
11133 11143              (char *)re, (int)sizeof (*re))) {
11134 11144                  ip1dbg(("ip_snmp_get2_v6: failed to allocate %u bytes\n",
11135 11145                      (uint_t)sizeof (*re)));
11136 11146          }
11137 11147  
11138 11148          if (gc != NULL) {
11139 11149                  iaes.iae_routeidx = ird->ird_idx;
11140 11150                  iaes.iae_doi = gc->gc_db->gcdb_doi;
11141 11151                  iaes.iae_slrange = gc->gc_db->gcdb_slrange;
11142 11152  
11143 11153                  if (!snmp_append_data2(ird->ird_attrs.lp_head,
11144 11154                      &ird->ird_attrs.lp_tail, (char *)&iaes, sizeof (iaes))) {
11145 11155                          ip1dbg(("ip_snmp_get2_v6: failed to allocate %u "
11146 11156                              "bytes\n", (uint_t)sizeof (iaes)));
11147 11157                  }
11148 11158          }
11149 11159  
11150 11160          /* bump route index for next pass */
11151 11161          ird->ird_idx++;
11152 11162  
11153 11163          kmem_free(re, sizeof (*re));
11154 11164          if (gcgrp != NULL)
11155 11165                  rw_exit(&gcgrp->gcgrp_rwlock);
11156 11166  }
11157 11167  
11158 11168  /*
11159 11169   * ncec_walk routine to create ipv6NetToMediaEntryTable
11160 11170   */
11161 11171  static int
11162 11172  ip_snmp_get2_v6_media(ncec_t *ncec, iproutedata_t *ird)
11163 11173  {
11164 11174          ill_t                           *ill;
11165 11175          mib2_ipv6NetToMediaEntry_t      ntme;
11166 11176  
11167 11177          ill = ncec->ncec_ill;
11168 11178          /* skip arpce entries, and loopback ncec entries */
11169 11179          if (ill->ill_isv6 == B_FALSE || ill->ill_net_type == IRE_LOOPBACK)
11170 11180                  return (0);
11171 11181          /*
11172 11182           * Neighbor cache entry attached to IRE with on-link
11173 11183           * destination.
11174 11184           * We report all IPMP groups on ncec_ill which is normally the upper.
11175 11185           */
11176 11186          ntme.ipv6NetToMediaIfIndex = ill->ill_phyint->phyint_ifindex;
11177 11187          ntme.ipv6NetToMediaNetAddress = ncec->ncec_addr;
11178 11188          ntme.ipv6NetToMediaPhysAddress.o_length = ill->ill_phys_addr_length;
11179 11189          if (ncec->ncec_lladdr != NULL) {
11180 11190                  bcopy(ncec->ncec_lladdr, ntme.ipv6NetToMediaPhysAddress.o_bytes,
11181 11191                      ntme.ipv6NetToMediaPhysAddress.o_length);
11182 11192          }
11183 11193          /*
11184 11194           * Note: Returns ND_* states. Should be:
11185 11195           * reachable(1), stale(2), delay(3), probe(4),
11186 11196           * invalid(5), unknown(6)
11187 11197           */
11188 11198          ntme.ipv6NetToMediaState = ncec->ncec_state;
11189 11199          ntme.ipv6NetToMediaLastUpdated = 0;
11190 11200  
11191 11201          /* other(1), dynamic(2), static(3), local(4) */
11192 11202          if (NCE_MYADDR(ncec)) {
11193 11203                  ntme.ipv6NetToMediaType = 4;
11194 11204          } else if (ncec->ncec_flags & NCE_F_PUBLISH) {
11195 11205                  ntme.ipv6NetToMediaType = 1; /* proxy */
11196 11206          } else if (ncec->ncec_flags & NCE_F_STATIC) {
11197 11207                  ntme.ipv6NetToMediaType = 3;
11198 11208          } else if (ncec->ncec_flags & (NCE_F_MCAST|NCE_F_BCAST)) {
11199 11209                  ntme.ipv6NetToMediaType = 1;
11200 11210          } else {
11201 11211                  ntme.ipv6NetToMediaType = 2;
11202 11212          }
11203 11213  
11204 11214          if (!snmp_append_data2(ird->ird_netmedia.lp_head,
11205 11215              &ird->ird_netmedia.lp_tail, (char *)&ntme, sizeof (ntme))) {
11206 11216                  ip1dbg(("ip_snmp_get2_v6_media: failed to allocate %u bytes\n",
11207 11217                      (uint_t)sizeof (ntme)));
11208 11218          }
11209 11219          return (0);
11210 11220  }
11211 11221  
11212 11222  int
11213 11223  nce2ace(ncec_t *ncec)
11214 11224  {
11215 11225          int flags = 0;
11216 11226  
11217 11227          if (NCE_ISREACHABLE(ncec))
11218 11228                  flags |= ACE_F_RESOLVED;
11219 11229          if (ncec->ncec_flags & NCE_F_AUTHORITY)
11220 11230                  flags |= ACE_F_AUTHORITY;
11221 11231          if (ncec->ncec_flags & NCE_F_PUBLISH)
11222 11232                  flags |= ACE_F_PUBLISH;
11223 11233          if ((ncec->ncec_flags & NCE_F_NONUD) != 0)
11224 11234                  flags |= ACE_F_PERMANENT;
11225 11235          if (NCE_MYADDR(ncec))
11226 11236                  flags |= (ACE_F_MYADDR | ACE_F_AUTHORITY);
11227 11237          if (ncec->ncec_flags & NCE_F_UNVERIFIED)
11228 11238                  flags |= ACE_F_UNVERIFIED;
11229 11239          if (ncec->ncec_flags & NCE_F_AUTHORITY)
11230 11240                  flags |= ACE_F_AUTHORITY;
11231 11241          if (ncec->ncec_flags & NCE_F_DELAYED)
11232 11242                  flags |= ACE_F_DELAYED;
11233 11243          return (flags);
11234 11244  }
11235 11245  
11236 11246  /*
11237 11247   * ncec_walk routine to create ipNetToMediaEntryTable
11238 11248   */
11239 11249  static int
11240 11250  ip_snmp_get2_v4_media(ncec_t *ncec, iproutedata_t *ird)
11241 11251  {
11242 11252          ill_t                           *ill;
11243 11253          mib2_ipNetToMediaEntry_t        ntme;
11244 11254          const char                      *name = "unknown";
11245 11255          ipaddr_t                        ncec_addr;
11246 11256  
11247 11257          ill = ncec->ncec_ill;
11248 11258          if (ill->ill_isv6 || (ncec->ncec_flags & NCE_F_BCAST) ||
11249 11259              ill->ill_net_type == IRE_LOOPBACK)
11250 11260                  return (0);
11251 11261  
11252 11262          /* We report all IPMP groups on ncec_ill which is normally the upper. */
11253 11263          name = ill->ill_name;
11254 11264          /* Based on RFC 4293: other(1), inval(2), dyn(3), stat(4) */
11255 11265          if (NCE_MYADDR(ncec)) {
11256 11266                  ntme.ipNetToMediaType = 4;
11257 11267          } else if (ncec->ncec_flags & (NCE_F_MCAST|NCE_F_BCAST|NCE_F_PUBLISH)) {
11258 11268                  ntme.ipNetToMediaType = 1;
11259 11269          } else {
11260 11270                  ntme.ipNetToMediaType = 3;
11261 11271          }
11262 11272          ntme.ipNetToMediaIfIndex.o_length = MIN(OCTET_LENGTH, strlen(name));
11263 11273          bcopy(name, ntme.ipNetToMediaIfIndex.o_bytes,
11264 11274              ntme.ipNetToMediaIfIndex.o_length);
11265 11275  
11266 11276          IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr);
11267 11277          bcopy(&ncec_addr, &ntme.ipNetToMediaNetAddress, sizeof (ncec_addr));
11268 11278  
11269 11279          ntme.ipNetToMediaInfo.ntm_mask.o_length = sizeof (ipaddr_t);
11270 11280          ncec_addr = INADDR_BROADCAST;
11271 11281          bcopy(&ncec_addr, ntme.ipNetToMediaInfo.ntm_mask.o_bytes,
11272 11282              sizeof (ncec_addr));
11273 11283          /*
11274 11284           * map all the flags to the ACE counterpart.
11275 11285           */
11276 11286          ntme.ipNetToMediaInfo.ntm_flags = nce2ace(ncec);
11277 11287  
11278 11288          ntme.ipNetToMediaPhysAddress.o_length =
11279 11289              MIN(OCTET_LENGTH, ill->ill_phys_addr_length);
11280 11290  
11281 11291          if (!NCE_ISREACHABLE(ncec))
11282 11292                  ntme.ipNetToMediaPhysAddress.o_length = 0;
11283 11293          else {
11284 11294                  if (ncec->ncec_lladdr != NULL) {
11285 11295                          bcopy(ncec->ncec_lladdr,
11286 11296                              ntme.ipNetToMediaPhysAddress.o_bytes,
11287 11297                              ntme.ipNetToMediaPhysAddress.o_length);
11288 11298                  }
11289 11299          }
11290 11300  
11291 11301          if (!snmp_append_data2(ird->ird_netmedia.lp_head,
11292 11302              &ird->ird_netmedia.lp_tail, (char *)&ntme, sizeof (ntme))) {
11293 11303                  ip1dbg(("ip_snmp_get2_v4_media: failed to allocate %u bytes\n",
11294 11304                      (uint_t)sizeof (ntme)));
11295 11305          }
11296 11306          return (0);
11297 11307  }
11298 11308  
11299 11309  /*
11300 11310   * return (0) if invalid set request, 1 otherwise, including non-tcp requests
11301 11311   */
11302 11312  /* ARGSUSED */
11303 11313  int
11304 11314  ip_snmp_set(queue_t *q, int level, int name, uchar_t *ptr, int len)
11305 11315  {
11306 11316          switch (level) {
11307 11317          case MIB2_IP:
11308 11318          case MIB2_ICMP:
11309 11319                  switch (name) {
11310 11320                  default:
11311 11321                          break;
11312 11322                  }
11313 11323                  return (1);
11314 11324          default:
11315 11325                  return (1);
11316 11326          }
11317 11327  }
11318 11328  
11319 11329  /*
11320 11330   * When there exists both a 64- and 32-bit counter of a particular type
11321 11331   * (i.e., InReceives), only the 64-bit counters are added.
11322 11332   */
11323 11333  void
11324 11334  ip_mib2_add_ip_stats(mib2_ipIfStatsEntry_t *o1, mib2_ipIfStatsEntry_t *o2)
11325 11335  {
11326 11336          UPDATE_MIB(o1, ipIfStatsInHdrErrors, o2->ipIfStatsInHdrErrors);
11327 11337          UPDATE_MIB(o1, ipIfStatsInTooBigErrors, o2->ipIfStatsInTooBigErrors);
11328 11338          UPDATE_MIB(o1, ipIfStatsInNoRoutes, o2->ipIfStatsInNoRoutes);
11329 11339          UPDATE_MIB(o1, ipIfStatsInAddrErrors, o2->ipIfStatsInAddrErrors);
11330 11340          UPDATE_MIB(o1, ipIfStatsInUnknownProtos, o2->ipIfStatsInUnknownProtos);
11331 11341          UPDATE_MIB(o1, ipIfStatsInTruncatedPkts, o2->ipIfStatsInTruncatedPkts);
11332 11342          UPDATE_MIB(o1, ipIfStatsInDiscards, o2->ipIfStatsInDiscards);
11333 11343          UPDATE_MIB(o1, ipIfStatsOutDiscards, o2->ipIfStatsOutDiscards);
11334 11344          UPDATE_MIB(o1, ipIfStatsOutFragOKs, o2->ipIfStatsOutFragOKs);
11335 11345          UPDATE_MIB(o1, ipIfStatsOutFragFails, o2->ipIfStatsOutFragFails);
11336 11346          UPDATE_MIB(o1, ipIfStatsOutFragCreates, o2->ipIfStatsOutFragCreates);
11337 11347          UPDATE_MIB(o1, ipIfStatsReasmReqds, o2->ipIfStatsReasmReqds);
11338 11348          UPDATE_MIB(o1, ipIfStatsReasmOKs, o2->ipIfStatsReasmOKs);
11339 11349          UPDATE_MIB(o1, ipIfStatsReasmFails, o2->ipIfStatsReasmFails);
11340 11350          UPDATE_MIB(o1, ipIfStatsOutNoRoutes, o2->ipIfStatsOutNoRoutes);
11341 11351          UPDATE_MIB(o1, ipIfStatsReasmDuplicates, o2->ipIfStatsReasmDuplicates);
11342 11352          UPDATE_MIB(o1, ipIfStatsReasmPartDups, o2->ipIfStatsReasmPartDups);
11343 11353          UPDATE_MIB(o1, ipIfStatsForwProhibits, o2->ipIfStatsForwProhibits);
11344 11354          UPDATE_MIB(o1, udpInCksumErrs, o2->udpInCksumErrs);
11345 11355          UPDATE_MIB(o1, udpInOverflows, o2->udpInOverflows);
11346 11356          UPDATE_MIB(o1, rawipInOverflows, o2->rawipInOverflows);
11347 11357          UPDATE_MIB(o1, ipIfStatsInWrongIPVersion,
11348 11358              o2->ipIfStatsInWrongIPVersion);
11349 11359          UPDATE_MIB(o1, ipIfStatsOutWrongIPVersion,
11350 11360              o2->ipIfStatsInWrongIPVersion);
11351 11361          UPDATE_MIB(o1, ipIfStatsOutSwitchIPVersion,
11352 11362              o2->ipIfStatsOutSwitchIPVersion);
11353 11363          UPDATE_MIB(o1, ipIfStatsHCInReceives, o2->ipIfStatsHCInReceives);
11354 11364          UPDATE_MIB(o1, ipIfStatsHCInOctets, o2->ipIfStatsHCInOctets);
11355 11365          UPDATE_MIB(o1, ipIfStatsHCInForwDatagrams,
11356 11366              o2->ipIfStatsHCInForwDatagrams);
11357 11367          UPDATE_MIB(o1, ipIfStatsHCInDelivers, o2->ipIfStatsHCInDelivers);
11358 11368          UPDATE_MIB(o1, ipIfStatsHCOutRequests, o2->ipIfStatsHCOutRequests);
11359 11369          UPDATE_MIB(o1, ipIfStatsHCOutForwDatagrams,
11360 11370              o2->ipIfStatsHCOutForwDatagrams);
11361 11371          UPDATE_MIB(o1, ipIfStatsOutFragReqds, o2->ipIfStatsOutFragReqds);
11362 11372          UPDATE_MIB(o1, ipIfStatsHCOutTransmits, o2->ipIfStatsHCOutTransmits);
11363 11373          UPDATE_MIB(o1, ipIfStatsHCOutOctets, o2->ipIfStatsHCOutOctets);
11364 11374          UPDATE_MIB(o1, ipIfStatsHCInMcastPkts, o2->ipIfStatsHCInMcastPkts);
11365 11375          UPDATE_MIB(o1, ipIfStatsHCInMcastOctets, o2->ipIfStatsHCInMcastOctets);
11366 11376          UPDATE_MIB(o1, ipIfStatsHCOutMcastPkts, o2->ipIfStatsHCOutMcastPkts);
11367 11377          UPDATE_MIB(o1, ipIfStatsHCOutMcastOctets,
11368 11378              o2->ipIfStatsHCOutMcastOctets);
11369 11379          UPDATE_MIB(o1, ipIfStatsHCInBcastPkts, o2->ipIfStatsHCInBcastPkts);
11370 11380          UPDATE_MIB(o1, ipIfStatsHCOutBcastPkts, o2->ipIfStatsHCOutBcastPkts);
11371 11381          UPDATE_MIB(o1, ipsecInSucceeded, o2->ipsecInSucceeded);
11372 11382          UPDATE_MIB(o1, ipsecInFailed, o2->ipsecInFailed);
11373 11383          UPDATE_MIB(o1, ipInCksumErrs, o2->ipInCksumErrs);
11374 11384          UPDATE_MIB(o1, tcpInErrs, o2->tcpInErrs);
11375 11385          UPDATE_MIB(o1, udpNoPorts, o2->udpNoPorts);
11376 11386  }
11377 11387  
11378 11388  void
11379 11389  ip_mib2_add_icmp6_stats(mib2_ipv6IfIcmpEntry_t *o1, mib2_ipv6IfIcmpEntry_t *o2)
11380 11390  {
11381 11391          UPDATE_MIB(o1, ipv6IfIcmpInMsgs, o2->ipv6IfIcmpInMsgs);
11382 11392          UPDATE_MIB(o1, ipv6IfIcmpInErrors, o2->ipv6IfIcmpInErrors);
11383 11393          UPDATE_MIB(o1, ipv6IfIcmpInDestUnreachs, o2->ipv6IfIcmpInDestUnreachs);
11384 11394          UPDATE_MIB(o1, ipv6IfIcmpInAdminProhibs, o2->ipv6IfIcmpInAdminProhibs);
11385 11395          UPDATE_MIB(o1, ipv6IfIcmpInTimeExcds, o2->ipv6IfIcmpInTimeExcds);
11386 11396          UPDATE_MIB(o1, ipv6IfIcmpInParmProblems, o2->ipv6IfIcmpInParmProblems);
11387 11397          UPDATE_MIB(o1, ipv6IfIcmpInPktTooBigs, o2->ipv6IfIcmpInPktTooBigs);
11388 11398          UPDATE_MIB(o1, ipv6IfIcmpInEchos, o2->ipv6IfIcmpInEchos);
11389 11399          UPDATE_MIB(o1, ipv6IfIcmpInEchoReplies, o2->ipv6IfIcmpInEchoReplies);
11390 11400          UPDATE_MIB(o1, ipv6IfIcmpInRouterSolicits,
11391 11401              o2->ipv6IfIcmpInRouterSolicits);
11392 11402          UPDATE_MIB(o1, ipv6IfIcmpInRouterAdvertisements,
11393 11403              o2->ipv6IfIcmpInRouterAdvertisements);
11394 11404          UPDATE_MIB(o1, ipv6IfIcmpInNeighborSolicits,
11395 11405              o2->ipv6IfIcmpInNeighborSolicits);
11396 11406          UPDATE_MIB(o1, ipv6IfIcmpInNeighborAdvertisements,
11397 11407              o2->ipv6IfIcmpInNeighborAdvertisements);
11398 11408          UPDATE_MIB(o1, ipv6IfIcmpInRedirects, o2->ipv6IfIcmpInRedirects);
11399 11409          UPDATE_MIB(o1, ipv6IfIcmpInGroupMembQueries,
11400 11410              o2->ipv6IfIcmpInGroupMembQueries);
11401 11411          UPDATE_MIB(o1, ipv6IfIcmpInGroupMembResponses,
11402 11412              o2->ipv6IfIcmpInGroupMembResponses);
11403 11413          UPDATE_MIB(o1, ipv6IfIcmpInGroupMembReductions,
11404 11414              o2->ipv6IfIcmpInGroupMembReductions);
11405 11415          UPDATE_MIB(o1, ipv6IfIcmpOutMsgs, o2->ipv6IfIcmpOutMsgs);
11406 11416          UPDATE_MIB(o1, ipv6IfIcmpOutErrors, o2->ipv6IfIcmpOutErrors);
11407 11417          UPDATE_MIB(o1, ipv6IfIcmpOutDestUnreachs,
11408 11418              o2->ipv6IfIcmpOutDestUnreachs);
11409 11419          UPDATE_MIB(o1, ipv6IfIcmpOutAdminProhibs,
11410 11420              o2->ipv6IfIcmpOutAdminProhibs);
11411 11421          UPDATE_MIB(o1, ipv6IfIcmpOutTimeExcds, o2->ipv6IfIcmpOutTimeExcds);
11412 11422          UPDATE_MIB(o1, ipv6IfIcmpOutParmProblems,
11413 11423              o2->ipv6IfIcmpOutParmProblems);
11414 11424          UPDATE_MIB(o1, ipv6IfIcmpOutPktTooBigs, o2->ipv6IfIcmpOutPktTooBigs);
11415 11425          UPDATE_MIB(o1, ipv6IfIcmpOutEchos, o2->ipv6IfIcmpOutEchos);
11416 11426          UPDATE_MIB(o1, ipv6IfIcmpOutEchoReplies, o2->ipv6IfIcmpOutEchoReplies);
11417 11427          UPDATE_MIB(o1, ipv6IfIcmpOutRouterSolicits,
11418 11428              o2->ipv6IfIcmpOutRouterSolicits);
11419 11429          UPDATE_MIB(o1, ipv6IfIcmpOutRouterAdvertisements,
11420 11430              o2->ipv6IfIcmpOutRouterAdvertisements);
11421 11431          UPDATE_MIB(o1, ipv6IfIcmpOutNeighborSolicits,
11422 11432              o2->ipv6IfIcmpOutNeighborSolicits);
11423 11433          UPDATE_MIB(o1, ipv6IfIcmpOutNeighborAdvertisements,
11424 11434              o2->ipv6IfIcmpOutNeighborAdvertisements);
11425 11435          UPDATE_MIB(o1, ipv6IfIcmpOutRedirects, o2->ipv6IfIcmpOutRedirects);
11426 11436          UPDATE_MIB(o1, ipv6IfIcmpOutGroupMembQueries,
11427 11437              o2->ipv6IfIcmpOutGroupMembQueries);
11428 11438          UPDATE_MIB(o1, ipv6IfIcmpOutGroupMembResponses,
11429 11439              o2->ipv6IfIcmpOutGroupMembResponses);
11430 11440          UPDATE_MIB(o1, ipv6IfIcmpOutGroupMembReductions,
11431 11441              o2->ipv6IfIcmpOutGroupMembReductions);
11432 11442          UPDATE_MIB(o1, ipv6IfIcmpInOverflows, o2->ipv6IfIcmpInOverflows);
11433 11443          UPDATE_MIB(o1, ipv6IfIcmpBadHoplimit, o2->ipv6IfIcmpBadHoplimit);
11434 11444          UPDATE_MIB(o1, ipv6IfIcmpInBadNeighborAdvertisements,
11435 11445              o2->ipv6IfIcmpInBadNeighborAdvertisements);
11436 11446          UPDATE_MIB(o1, ipv6IfIcmpInBadNeighborSolicitations,
11437 11447              o2->ipv6IfIcmpInBadNeighborSolicitations);
11438 11448          UPDATE_MIB(o1, ipv6IfIcmpInBadRedirects, o2->ipv6IfIcmpInBadRedirects);
11439 11449          UPDATE_MIB(o1, ipv6IfIcmpInGroupMembTotal,
11440 11450              o2->ipv6IfIcmpInGroupMembTotal);
11441 11451          UPDATE_MIB(o1, ipv6IfIcmpInGroupMembBadQueries,
11442 11452              o2->ipv6IfIcmpInGroupMembBadQueries);
11443 11453          UPDATE_MIB(o1, ipv6IfIcmpInGroupMembBadReports,
11444 11454              o2->ipv6IfIcmpInGroupMembBadReports);
11445 11455          UPDATE_MIB(o1, ipv6IfIcmpInGroupMembOurReports,
11446 11456              o2->ipv6IfIcmpInGroupMembOurReports);
11447 11457  }
11448 11458  
11449 11459  /*
11450 11460   * Called before the options are updated to check if this packet will
11451 11461   * be source routed from here.
11452 11462   * This routine assumes that the options are well formed i.e. that they
11453 11463   * have already been checked.
11454 11464   */
11455 11465  boolean_t
11456 11466  ip_source_routed(ipha_t *ipha, ip_stack_t *ipst)
11457 11467  {
11458 11468          ipoptp_t        opts;
11459 11469          uchar_t         *opt;
11460 11470          uint8_t         optval;
11461 11471          uint8_t         optlen;
11462 11472          ipaddr_t        dst;
11463 11473  
11464 11474          if (IS_SIMPLE_IPH(ipha)) {
11465 11475                  ip2dbg(("not source routed\n"));
11466 11476                  return (B_FALSE);
11467 11477          }
11468 11478          dst = ipha->ipha_dst;
11469 11479          for (optval = ipoptp_first(&opts, ipha);
11470 11480              optval != IPOPT_EOL;
11471 11481              optval = ipoptp_next(&opts)) {
11472 11482                  ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
11473 11483                  opt = opts.ipoptp_cur;
11474 11484                  optlen = opts.ipoptp_len;
11475 11485                  ip2dbg(("ip_source_routed: opt %d, len %d\n",
11476 11486                      optval, optlen));
11477 11487                  switch (optval) {
11478 11488                          uint32_t off;
11479 11489                  case IPOPT_SSRR:
11480 11490                  case IPOPT_LSRR:
11481 11491                          /*
11482 11492                           * If dst is one of our addresses and there are some
11483 11493                           * entries left in the source route return (true).
11484 11494                           */
11485 11495                          if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
11486 11496                                  ip2dbg(("ip_source_routed: not next"
11487 11497                                      " source route 0x%x\n",
11488 11498                                      ntohl(dst)));
11489 11499                                  return (B_FALSE);
11490 11500                          }
11491 11501                          off = opt[IPOPT_OFFSET];
11492 11502                          off--;
11493 11503                          if (optlen < IP_ADDR_LEN ||
11494 11504                              off > optlen - IP_ADDR_LEN) {
11495 11505                                  /* End of source route */
11496 11506                                  ip1dbg(("ip_source_routed: end of SR\n"));
11497 11507                                  return (B_FALSE);
11498 11508                          }
11499 11509                          return (B_TRUE);
11500 11510                  }
11501 11511          }
11502 11512          ip2dbg(("not source routed\n"));
11503 11513          return (B_FALSE);
11504 11514  }
11505 11515  
11506 11516  /*
11507 11517   * ip_unbind is called by the transports to remove a conn from
11508 11518   * the fanout table.
11509 11519   */
11510 11520  void
11511 11521  ip_unbind(conn_t *connp)
11512 11522  {
11513 11523  
11514 11524          ASSERT(!MUTEX_HELD(&connp->conn_lock));
11515 11525  
11516 11526          if (is_system_labeled() && connp->conn_anon_port) {
11517 11527                  (void) tsol_mlp_anon(crgetzone(connp->conn_cred),
11518 11528                      connp->conn_mlp_type, connp->conn_proto,
11519 11529                      ntohs(connp->conn_lport), B_FALSE);
11520 11530                  connp->conn_anon_port = 0;
11521 11531          }
11522 11532          connp->conn_mlp_type = mlptSingle;
11523 11533  
11524 11534          ipcl_hash_remove(connp);
11525 11535  }
11526 11536  
11527 11537  /*
11528 11538   * Used for deciding the MSS size for the upper layer. Thus
11529 11539   * we need to check the outbound policy values in the conn.
11530 11540   */
11531 11541  int
11532 11542  conn_ipsec_length(conn_t *connp)
11533 11543  {
11534 11544          ipsec_latch_t *ipl;
11535 11545  
11536 11546          ipl = connp->conn_latch;
11537 11547          if (ipl == NULL)
11538 11548                  return (0);
11539 11549  
11540 11550          if (connp->conn_ixa->ixa_ipsec_policy == NULL)
11541 11551                  return (0);
11542 11552  
11543 11553          return (connp->conn_ixa->ixa_ipsec_policy->ipsp_act->ipa_ovhd);
11544 11554  }
11545 11555  
11546 11556  /*
11547 11557   * Returns an estimate of the IPsec headers size. This is used if
11548 11558   * we don't want to call into IPsec to get the exact size.
11549 11559   */
11550 11560  int
11551 11561  ipsec_out_extra_length(ip_xmit_attr_t *ixa)
11552 11562  {
11553 11563          ipsec_action_t *a;
11554 11564  
11555 11565          if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE))
11556 11566                  return (0);
11557 11567  
11558 11568          a = ixa->ixa_ipsec_action;
11559 11569          if (a == NULL) {
11560 11570                  ASSERT(ixa->ixa_ipsec_policy != NULL);
11561 11571                  a = ixa->ixa_ipsec_policy->ipsp_act;
11562 11572          }
11563 11573          ASSERT(a != NULL);
11564 11574  
11565 11575          return (a->ipa_ovhd);
11566 11576  }
11567 11577  
11568 11578  /*
11569 11579   * If there are any source route options, return the true final
11570 11580   * destination. Otherwise, return the destination.
11571 11581   */
11572 11582  ipaddr_t
11573 11583  ip_get_dst(ipha_t *ipha)
11574 11584  {
11575 11585          ipoptp_t        opts;
11576 11586          uchar_t         *opt;
11577 11587          uint8_t         optval;
11578 11588          uint8_t         optlen;
11579 11589          ipaddr_t        dst;
11580 11590          uint32_t off;
11581 11591  
11582 11592          dst = ipha->ipha_dst;
11583 11593  
11584 11594          if (IS_SIMPLE_IPH(ipha))
11585 11595                  return (dst);
11586 11596  
11587 11597          for (optval = ipoptp_first(&opts, ipha);
11588 11598              optval != IPOPT_EOL;
11589 11599              optval = ipoptp_next(&opts)) {
11590 11600                  opt = opts.ipoptp_cur;
11591 11601                  optlen = opts.ipoptp_len;
11592 11602                  ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
11593 11603                  switch (optval) {
11594 11604                  case IPOPT_SSRR:
11595 11605                  case IPOPT_LSRR:
11596 11606                          off = opt[IPOPT_OFFSET];
11597 11607                          /*
11598 11608                           * If one of the conditions is true, it means
11599 11609                           * end of options and dst already has the right
11600 11610                           * value.
11601 11611                           */
11602 11612                          if (!(optlen < IP_ADDR_LEN || off > optlen - 3)) {
11603 11613                                  off = optlen - IP_ADDR_LEN;
11604 11614                                  bcopy(&opt[off], &dst, IP_ADDR_LEN);
11605 11615                          }
11606 11616                          return (dst);
11607 11617                  default:
11608 11618                          break;
11609 11619                  }
11610 11620          }
11611 11621  
11612 11622          return (dst);
11613 11623  }
11614 11624  
11615 11625  /*
11616 11626   * Outbound IP fragmentation routine.
11617 11627   * Assumes the caller has checked whether or not fragmentation should
11618 11628   * be allowed. Here we copy the DF bit from the header to all the generated
11619 11629   * fragments.
11620 11630   */
11621 11631  int
11622 11632  ip_fragment_v4(mblk_t *mp_orig, nce_t *nce, iaflags_t ixaflags,
11623 11633      uint_t pkt_len, uint32_t max_frag, uint32_t xmit_hint, zoneid_t szone,
11624 11634      zoneid_t nolzid, pfirepostfrag_t postfragfn, uintptr_t *ixa_cookie)
11625 11635  {
11626 11636          int             i1;
11627 11637          int             hdr_len;
11628 11638          mblk_t          *hdr_mp;
11629 11639          ipha_t          *ipha;
11630 11640          int             ip_data_end;
11631 11641          int             len;
11632 11642          mblk_t          *mp = mp_orig;
11633 11643          int             offset;
11634 11644          ill_t           *ill = nce->nce_ill;
11635 11645          ip_stack_t      *ipst = ill->ill_ipst;
11636 11646          mblk_t          *carve_mp;
11637 11647          uint32_t        frag_flag;
11638 11648          uint_t          priority = mp->b_band;
11639 11649          int             error = 0;
11640 11650  
11641 11651          BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragReqds);
11642 11652  
11643 11653          if (pkt_len != msgdsize(mp)) {
11644 11654                  ip0dbg(("Packet length mismatch: %d, %ld\n",
11645 11655                      pkt_len, msgdsize(mp)));
11646 11656                  freemsg(mp);
11647 11657                  return (EINVAL);
11648 11658          }
11649 11659  
11650 11660          if (max_frag == 0) {
11651 11661                  ip1dbg(("ip_fragment_v4: max_frag is zero. Dropping packet\n"));
11652 11662                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
11653 11663                  ip_drop_output("FragFails: zero max_frag", mp, ill);
11654 11664                  freemsg(mp);
11655 11665                  return (EINVAL);
11656 11666          }
11657 11667  
11658 11668          ASSERT(MBLKL(mp) >= sizeof (ipha_t));
11659 11669          ipha = (ipha_t *)mp->b_rptr;
11660 11670          ASSERT(ntohs(ipha->ipha_length) == pkt_len);
11661 11671          frag_flag = ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_DF;
11662 11672  
11663 11673          /*
11664 11674           * Establish the starting offset.  May not be zero if we are fragging
11665 11675           * a fragment that is being forwarded.
11666 11676           */
11667 11677          offset = ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_OFFSET;
11668 11678  
11669 11679          /* TODO why is this test needed? */
11670 11680          if (((max_frag - ntohs(ipha->ipha_length)) & ~7) < 8) {
11671 11681                  /* TODO: notify ulp somehow */
11672 11682                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
11673 11683                  ip_drop_output("FragFails: bad starting offset", mp, ill);
11674 11684                  freemsg(mp);
11675 11685                  return (EINVAL);
11676 11686          }
11677 11687  
11678 11688          hdr_len = IPH_HDR_LENGTH(ipha);
11679 11689          ipha->ipha_hdr_checksum = 0;
11680 11690  
11681 11691          /*
11682 11692           * Establish the number of bytes maximum per frag, after putting
11683 11693           * in the header.
11684 11694           */
11685 11695          len = (max_frag - hdr_len) & ~7;
11686 11696  
11687 11697          /* Get a copy of the header for the trailing frags */
11688 11698          hdr_mp = ip_fragment_copyhdr((uchar_t *)ipha, hdr_len, offset, ipst,
11689 11699              mp);
11690 11700          if (hdr_mp == NULL) {
11691 11701                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
11692 11702                  ip_drop_output("FragFails: no hdr_mp", mp, ill);
11693 11703                  freemsg(mp);
11694 11704                  return (ENOBUFS);
11695 11705          }
11696 11706  
11697 11707          /* Store the starting offset, with the MoreFrags flag. */
11698 11708          i1 = offset | IPH_MF | frag_flag;
11699 11709          ipha->ipha_fragment_offset_and_flags = htons((uint16_t)i1);
11700 11710  
11701 11711          /* Establish the ending byte offset, based on the starting offset. */
11702 11712          offset <<= 3;
11703 11713          ip_data_end = offset + ntohs(ipha->ipha_length) - hdr_len;
11704 11714  
11705 11715          /* Store the length of the first fragment in the IP header. */
11706 11716          i1 = len + hdr_len;
11707 11717          ASSERT(i1 <= IP_MAXPACKET);
11708 11718          ipha->ipha_length = htons((uint16_t)i1);
11709 11719  
11710 11720          /*
11711 11721           * Compute the IP header checksum for the first frag.  We have to
11712 11722           * watch out that we stop at the end of the header.
11713 11723           */
11714 11724          ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
11715 11725  
11716 11726          /*
11717 11727           * Now carve off the first frag.  Note that this will include the
11718 11728           * original IP header.
11719 11729           */
11720 11730          if (!(mp = ip_carve_mp(&mp_orig, i1))) {
11721 11731                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
11722 11732                  ip_drop_output("FragFails: could not carve mp", mp_orig, ill);
11723 11733                  freeb(hdr_mp);
11724 11734                  freemsg(mp_orig);
11725 11735                  return (ENOBUFS);
11726 11736          }
11727 11737  
11728 11738          BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates);
11729 11739  
11730 11740          error = postfragfn(mp, nce, ixaflags, i1, xmit_hint, szone, nolzid,
11731 11741              ixa_cookie);
11732 11742          if (error != 0 && error != EWOULDBLOCK) {
11733 11743                  /* No point in sending the other fragments */
11734 11744                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
11735 11745                  ip_drop_output("FragFails: postfragfn failed", mp_orig, ill);
11736 11746                  freeb(hdr_mp);
11737 11747                  freemsg(mp_orig);
11738 11748                  return (error);
11739 11749          }
11740 11750  
11741 11751          /* No need to redo state machine in loop */
11742 11752          ixaflags &= ~IXAF_REACH_CONF;
11743 11753  
11744 11754          /* Advance the offset to the second frag starting point. */
11745 11755          offset += len;
11746 11756          /*
11747 11757           * Update hdr_len from the copied header - there might be less options
11748 11758           * in the later fragments.
11749 11759           */
11750 11760          hdr_len = IPH_HDR_LENGTH(hdr_mp->b_rptr);
11751 11761          /* Loop until done. */
11752 11762          for (;;) {
11753 11763                  uint16_t        offset_and_flags;
11754 11764                  uint16_t        ip_len;
11755 11765  
11756 11766                  if (ip_data_end - offset > len) {
11757 11767                          /*
11758 11768                           * Carve off the appropriate amount from the original
11759 11769                           * datagram.
11760 11770                           */
11761 11771                          if (!(carve_mp = ip_carve_mp(&mp_orig, len))) {
11762 11772                                  mp = NULL;
11763 11773                                  break;
11764 11774                          }
11765 11775                          /*
11766 11776                           * More frags after this one.  Get another copy
11767 11777                           * of the header.
11768 11778                           */
11769 11779                          if (carve_mp->b_datap->db_ref == 1 &&
11770 11780                              hdr_mp->b_wptr - hdr_mp->b_rptr <
11771 11781                              carve_mp->b_rptr - carve_mp->b_datap->db_base) {
11772 11782                                  /* Inline IP header */
11773 11783                                  carve_mp->b_rptr -= hdr_mp->b_wptr -
11774 11784                                      hdr_mp->b_rptr;
11775 11785                                  bcopy(hdr_mp->b_rptr, carve_mp->b_rptr,
11776 11786                                      hdr_mp->b_wptr - hdr_mp->b_rptr);
11777 11787                                  mp = carve_mp;
11778 11788                          } else {
11779 11789                                  if (!(mp = copyb(hdr_mp))) {
11780 11790                                          freemsg(carve_mp);
11781 11791                                          break;
11782 11792                                  }
11783 11793                                  /* Get priority marking, if any. */
11784 11794                                  mp->b_band = priority;
11785 11795                                  mp->b_cont = carve_mp;
11786 11796                          }
11787 11797                          ipha = (ipha_t *)mp->b_rptr;
11788 11798                          offset_and_flags = IPH_MF;
11789 11799                  } else {
11790 11800                          /*
11791 11801                           * Last frag.  Consume the header. Set len to
11792 11802                           * the length of this last piece.
11793 11803                           */
11794 11804                          len = ip_data_end - offset;
11795 11805  
11796 11806                          /*
11797 11807                           * Carve off the appropriate amount from the original
11798 11808                           * datagram.
11799 11809                           */
11800 11810                          if (!(carve_mp = ip_carve_mp(&mp_orig, len))) {
11801 11811                                  mp = NULL;
11802 11812                                  break;
11803 11813                          }
11804 11814                          if (carve_mp->b_datap->db_ref == 1 &&
11805 11815                              hdr_mp->b_wptr - hdr_mp->b_rptr <
11806 11816                              carve_mp->b_rptr - carve_mp->b_datap->db_base) {
11807 11817                                  /* Inline IP header */
11808 11818                                  carve_mp->b_rptr -= hdr_mp->b_wptr -
11809 11819                                      hdr_mp->b_rptr;
11810 11820                                  bcopy(hdr_mp->b_rptr, carve_mp->b_rptr,
11811 11821                                      hdr_mp->b_wptr - hdr_mp->b_rptr);
11812 11822                                  mp = carve_mp;
11813 11823                                  freeb(hdr_mp);
11814 11824                                  hdr_mp = mp;
11815 11825                          } else {
11816 11826                                  mp = hdr_mp;
11817 11827                                  /* Get priority marking, if any. */
11818 11828                                  mp->b_band = priority;
11819 11829                                  mp->b_cont = carve_mp;
11820 11830                          }
11821 11831                          ipha = (ipha_t *)mp->b_rptr;
11822 11832                          /* A frag of a frag might have IPH_MF non-zero */
11823 11833                          offset_and_flags =
11824 11834                              ntohs(ipha->ipha_fragment_offset_and_flags) &
11825 11835                              IPH_MF;
11826 11836                  }
11827 11837                  offset_and_flags |= (uint16_t)(offset >> 3);
11828 11838                  offset_and_flags |= (uint16_t)frag_flag;
11829 11839                  /* Store the offset and flags in the IP header. */
11830 11840                  ipha->ipha_fragment_offset_and_flags = htons(offset_and_flags);
11831 11841  
11832 11842                  /* Store the length in the IP header. */
11833 11843                  ip_len = (uint16_t)(len + hdr_len);
11834 11844                  ipha->ipha_length = htons(ip_len);
11835 11845  
11836 11846                  /*
11837 11847                   * Set the IP header checksum.  Note that mp is just
11838 11848                   * the header, so this is easy to pass to ip_csum.
11839 11849                   */
11840 11850                  ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
11841 11851  
11842 11852                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates);
11843 11853  
11844 11854                  error = postfragfn(mp, nce, ixaflags, ip_len, xmit_hint, szone,
11845 11855                      nolzid, ixa_cookie);
11846 11856                  /* All done if we just consumed the hdr_mp. */
11847 11857                  if (mp == hdr_mp) {
11848 11858                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragOKs);
11849 11859                          return (error);
11850 11860                  }
11851 11861                  if (error != 0 && error != EWOULDBLOCK) {
11852 11862                          DTRACE_PROBE2(ip__xmit__frag__fail, ill_t *, ill,
11853 11863                              mblk_t *, hdr_mp);
11854 11864                          /* No point in sending the other fragments */
11855 11865                          break;
11856 11866                  }
11857 11867  
11858 11868                  /* Otherwise, advance and loop. */
11859 11869                  offset += len;
11860 11870          }
11861 11871          /* Clean up following allocation failure. */
11862 11872          BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails);
11863 11873          ip_drop_output("FragFails: loop ended", NULL, ill);
11864 11874          if (mp != hdr_mp)
11865 11875                  freeb(hdr_mp);
11866 11876          if (mp != mp_orig)
11867 11877                  freemsg(mp_orig);
11868 11878          return (error);
11869 11879  }
11870 11880  
11871 11881  /*
11872 11882   * Copy the header plus those options which have the copy bit set
11873 11883   */
11874 11884  static mblk_t *
11875 11885  ip_fragment_copyhdr(uchar_t *rptr, int hdr_len, int offset, ip_stack_t *ipst,
11876 11886      mblk_t *src)
11877 11887  {
11878 11888          mblk_t  *mp;
11879 11889          uchar_t *up;
11880 11890  
11881 11891          /*
11882 11892           * Quick check if we need to look for options without the copy bit
11883 11893           * set
11884 11894           */
11885 11895          mp = allocb_tmpl(ipst->ips_ip_wroff_extra + hdr_len, src);
11886 11896          if (!mp)
11887 11897                  return (mp);
11888 11898          mp->b_rptr += ipst->ips_ip_wroff_extra;
11889 11899          if (hdr_len == IP_SIMPLE_HDR_LENGTH || offset != 0) {
11890 11900                  bcopy(rptr, mp->b_rptr, hdr_len);
11891 11901                  mp->b_wptr += hdr_len + ipst->ips_ip_wroff_extra;
11892 11902                  return (mp);
11893 11903          }
11894 11904          up  = mp->b_rptr;
11895 11905          bcopy(rptr, up, IP_SIMPLE_HDR_LENGTH);
11896 11906          up += IP_SIMPLE_HDR_LENGTH;
11897 11907          rptr += IP_SIMPLE_HDR_LENGTH;
11898 11908          hdr_len -= IP_SIMPLE_HDR_LENGTH;
11899 11909          while (hdr_len > 0) {
11900 11910                  uint32_t optval;
11901 11911                  uint32_t optlen;
11902 11912  
11903 11913                  optval = *rptr;
11904 11914                  if (optval == IPOPT_EOL)
11905 11915                          break;
11906 11916                  if (optval == IPOPT_NOP)
11907 11917                          optlen = 1;
11908 11918                  else
11909 11919                          optlen = rptr[1];
11910 11920                  if (optval & IPOPT_COPY) {
11911 11921                          bcopy(rptr, up, optlen);
11912 11922                          up += optlen;
11913 11923                  }
11914 11924                  rptr += optlen;
11915 11925                  hdr_len -= optlen;
11916 11926          }
11917 11927          /*
11918 11928           * Make sure that we drop an even number of words by filling
11919 11929           * with EOL to the next word boundary.
11920 11930           */
11921 11931          for (hdr_len = up - (mp->b_rptr + IP_SIMPLE_HDR_LENGTH);
11922 11932              hdr_len & 0x3; hdr_len++)
11923 11933                  *up++ = IPOPT_EOL;
11924 11934          mp->b_wptr = up;
11925 11935          /* Update header length */
11926 11936          mp->b_rptr[0] = (uint8_t)((IP_VERSION << 4) | ((up - mp->b_rptr) >> 2));
11927 11937          return (mp);
11928 11938  }
11929 11939  
11930 11940  /*
11931 11941   * Update any source route, record route, or timestamp options when
11932 11942   * sending a packet back to ourselves.
11933 11943   * Check that we are at end of strict source route.
11934 11944   * The options have been sanity checked by ip_output_options().
11935 11945   */
11936 11946  void
11937 11947  ip_output_local_options(ipha_t *ipha, ip_stack_t *ipst)
11938 11948  {
11939 11949          ipoptp_t        opts;
11940 11950          uchar_t         *opt;
11941 11951          uint8_t         optval;
11942 11952          uint8_t         optlen;
11943 11953          ipaddr_t        dst;
11944 11954          uint32_t        ts;
11945 11955          timestruc_t     now;
11946 11956  
11947 11957          for (optval = ipoptp_first(&opts, ipha);
11948 11958              optval != IPOPT_EOL;
11949 11959              optval = ipoptp_next(&opts)) {
11950 11960                  opt = opts.ipoptp_cur;
11951 11961                  optlen = opts.ipoptp_len;
11952 11962                  ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0);
11953 11963                  switch (optval) {
11954 11964                          uint32_t off;
11955 11965                  case IPOPT_SSRR:
11956 11966                  case IPOPT_LSRR:
11957 11967                          off = opt[IPOPT_OFFSET];
11958 11968                          off--;
11959 11969                          if (optlen < IP_ADDR_LEN ||
11960 11970                              off > optlen - IP_ADDR_LEN) {
11961 11971                                  /* End of source route */
11962 11972                                  break;
11963 11973                          }
11964 11974                          /*
11965 11975                           * This will only happen if two consecutive entries
11966 11976                           * in the source route contains our address or if
11967 11977                           * it is a packet with a loose source route which
11968 11978                           * reaches us before consuming the whole source route
11969 11979                           */
11970 11980  
11971 11981                          if (optval == IPOPT_SSRR) {
11972 11982                                  return;
11973 11983                          }
11974 11984                          /*
11975 11985                           * Hack: instead of dropping the packet truncate the
11976 11986                           * source route to what has been used by filling the
11977 11987                           * rest with IPOPT_NOP.
11978 11988                           */
11979 11989                          opt[IPOPT_OLEN] = (uint8_t)off;
11980 11990                          while (off < optlen) {
11981 11991                                  opt[off++] = IPOPT_NOP;
11982 11992                          }
11983 11993                          break;
11984 11994                  case IPOPT_RR:
11985 11995                          off = opt[IPOPT_OFFSET];
11986 11996                          off--;
11987 11997                          if (optlen < IP_ADDR_LEN ||
11988 11998                              off > optlen - IP_ADDR_LEN) {
11989 11999                                  /* No more room - ignore */
11990 12000                                  ip1dbg((
11991 12001                                      "ip_output_local_options: end of RR\n"));
11992 12002                                  break;
11993 12003                          }
11994 12004                          dst = htonl(INADDR_LOOPBACK);
11995 12005                          bcopy(&dst, (char *)opt + off, IP_ADDR_LEN);
11996 12006                          opt[IPOPT_OFFSET] += IP_ADDR_LEN;
11997 12007                          break;
11998 12008                  case IPOPT_TS:
11999 12009                          /* Insert timestamp if there is romm */
12000 12010                          switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
12001 12011                          case IPOPT_TS_TSONLY:
12002 12012                                  off = IPOPT_TS_TIMELEN;
12003 12013                                  break;
12004 12014                          case IPOPT_TS_PRESPEC:
12005 12015                          case IPOPT_TS_PRESPEC_RFC791:
12006 12016                                  /* Verify that the address matched */
12007 12017                                  off = opt[IPOPT_OFFSET] - 1;
12008 12018                                  bcopy((char *)opt + off, &dst, IP_ADDR_LEN);
12009 12019                                  if (ip_type_v4(dst, ipst) != IRE_LOCAL) {
12010 12020                                          /* Not for us */
12011 12021                                          break;
12012 12022                                  }
12013 12023                                  /* FALLTHRU */
12014 12024                          case IPOPT_TS_TSANDADDR:
12015 12025                                  off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
12016 12026                                  break;
12017 12027                          default:
12018 12028                                  /*
12019 12029                                   * ip_*put_options should have already
12020 12030                                   * dropped this packet.
12021 12031                                   */
12022 12032                                  cmn_err(CE_PANIC, "ip_output_local_options: "
12023 12033                                      "unknown IT - bug in ip_output_options?\n");
12024 12034                                  return; /* Keep "lint" happy */
12025 12035                          }
12026 12036                          if (opt[IPOPT_OFFSET] - 1 + off > optlen) {
12027 12037                                  /* Increase overflow counter */
12028 12038                                  off = (opt[IPOPT_POS_OV_FLG] >> 4) + 1;
12029 12039                                  opt[IPOPT_POS_OV_FLG] = (uint8_t)
12030 12040                                      (opt[IPOPT_POS_OV_FLG] & 0x0F) |
12031 12041                                      (off << 4);
12032 12042                                  break;
12033 12043                          }
12034 12044                          off = opt[IPOPT_OFFSET] - 1;
12035 12045                          switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
12036 12046                          case IPOPT_TS_PRESPEC:
12037 12047                          case IPOPT_TS_PRESPEC_RFC791:
12038 12048                          case IPOPT_TS_TSANDADDR:
12039 12049                                  dst = htonl(INADDR_LOOPBACK);
12040 12050                                  bcopy(&dst, (char *)opt + off, IP_ADDR_LEN);
12041 12051                                  opt[IPOPT_OFFSET] += IP_ADDR_LEN;
12042 12052                                  /* FALLTHRU */
12043 12053                          case IPOPT_TS_TSONLY:
12044 12054                                  off = opt[IPOPT_OFFSET] - 1;
12045 12055                                  /* Compute # of milliseconds since midnight */
12046 12056                                  gethrestime(&now);
12047 12057                                  ts = (now.tv_sec % (24 * 60 * 60)) * 1000 +
12048 12058                                      NSEC2MSEC(now.tv_nsec);
12049 12059                                  bcopy(&ts, (char *)opt + off, IPOPT_TS_TIMELEN);
12050 12060                                  opt[IPOPT_OFFSET] += IPOPT_TS_TIMELEN;
12051 12061                                  break;
12052 12062                          }
12053 12063                          break;
12054 12064                  }
12055 12065          }
12056 12066  }
12057 12067  
12058 12068  /*
12059 12069   * Prepend an M_DATA fastpath header, and if none present prepend a
12060 12070   * DL_UNITDATA_REQ. Frees the mblk on failure.
12061 12071   *
12062 12072   * nce_dlur_mp and nce_fp_mp can not disappear once they have been set.
12063 12073   * If there is a change to them, the nce will be deleted (condemned) and
12064 12074   * a new nce_t will be created when packets are sent. Thus we need no locks
12065 12075   * to access those fields.
12066 12076   *
12067 12077   * We preserve b_band to support IPQoS. If a DL_UNITDATA_REQ is prepended
12068 12078   * we place b_band in dl_priority.dl_max.
12069 12079   */
12070 12080  static mblk_t *
12071 12081  ip_xmit_attach_llhdr(mblk_t *mp, nce_t *nce)
12072 12082  {
12073 12083          uint_t  hlen;
12074 12084          mblk_t *mp1;
12075 12085          uint_t  priority;
12076 12086          uchar_t *rptr;
12077 12087  
12078 12088          rptr = mp->b_rptr;
12079 12089  
12080 12090          ASSERT(DB_TYPE(mp) == M_DATA);
12081 12091          priority = mp->b_band;
12082 12092  
12083 12093          ASSERT(nce != NULL);
12084 12094          if ((mp1 = nce->nce_fp_mp) != NULL) {
12085 12095                  hlen = MBLKL(mp1);
12086 12096                  /*
12087 12097                   * Check if we have enough room to prepend fastpath
12088 12098                   * header
12089 12099                   */
12090 12100                  if (hlen != 0 && (rptr - mp->b_datap->db_base) >= hlen) {
12091 12101                          rptr -= hlen;
12092 12102                          bcopy(mp1->b_rptr, rptr, hlen);
12093 12103                          /*
12094 12104                           * Set the b_rptr to the start of the link layer
12095 12105                           * header
12096 12106                           */
12097 12107                          mp->b_rptr = rptr;
12098 12108                          return (mp);
12099 12109                  }
12100 12110                  mp1 = copyb(mp1);
12101 12111                  if (mp1 == NULL) {
12102 12112                          ill_t *ill = nce->nce_ill;
12103 12113  
12104 12114                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
12105 12115                          ip_drop_output("ipIfStatsOutDiscards", mp, ill);
12106 12116                          freemsg(mp);
12107 12117                          return (NULL);
12108 12118                  }
12109 12119                  mp1->b_band = priority;
12110 12120                  mp1->b_cont = mp;
12111 12121                  DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp);
12112 12122                  DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp);
12113 12123                  DB_CKSUMEND(mp1) = DB_CKSUMEND(mp);
12114 12124                  DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp);
12115 12125                  DB_LSOMSS(mp1) = DB_LSOMSS(mp);
12116 12126                  DTRACE_PROBE1(ip__xmit__copyb, (mblk_t *), mp1);
12117 12127                  /*
12118 12128                   * XXX disable ICK_VALID and compute checksum
12119 12129                   * here; can happen if nce_fp_mp changes and
12120 12130                   * it can't be copied now due to insufficient
12121 12131                   * space. (unlikely, fp mp can change, but it
12122 12132                   * does not increase in length)
12123 12133                   */
12124 12134                  return (mp1);
12125 12135          }
12126 12136          mp1 = copyb(nce->nce_dlur_mp);
12127 12137  
12128 12138          if (mp1 == NULL) {
12129 12139                  ill_t *ill = nce->nce_ill;
12130 12140  
12131 12141                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
12132 12142                  ip_drop_output("ipIfStatsOutDiscards", mp, ill);
12133 12143                  freemsg(mp);
12134 12144                  return (NULL);
12135 12145          }
12136 12146          mp1->b_cont = mp;
12137 12147          if (priority != 0) {
12138 12148                  mp1->b_band = priority;
12139 12149                  ((dl_unitdata_req_t *)(mp1->b_rptr))->dl_priority.dl_max =
12140 12150                      priority;
12141 12151          }
12142 12152          return (mp1);
12143 12153  }
12144 12154  
12145 12155  /*
12146 12156   * Finish the outbound IPsec processing. This function is called from
12147 12157   * ipsec_out_process() if the IPsec packet was processed
12148 12158   * synchronously, or from {ah,esp}_kcf_callback_outbound() if it was processed
12149 12159   * asynchronously.
12150 12160   *
12151 12161   * This is common to IPv4 and IPv6.
12152 12162   */
12153 12163  int
12154 12164  ip_output_post_ipsec(mblk_t *mp, ip_xmit_attr_t *ixa)
12155 12165  {
12156 12166          iaflags_t       ixaflags = ixa->ixa_flags;
12157 12167          uint_t          pktlen;
12158 12168  
12159 12169  
12160 12170          /* AH/ESP don't update ixa_pktlen when they modify the packet */
12161 12171          if (ixaflags & IXAF_IS_IPV4) {
12162 12172                  ipha_t          *ipha = (ipha_t *)mp->b_rptr;
12163 12173  
12164 12174                  ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
12165 12175                  pktlen = ntohs(ipha->ipha_length);
12166 12176          } else {
12167 12177                  ip6_t           *ip6h = (ip6_t *)mp->b_rptr;
12168 12178  
12169 12179                  ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV6_VERSION);
12170 12180                  pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
12171 12181          }
12172 12182  
12173 12183          /*
12174 12184           * We release any hard reference on the SAs here to make
12175 12185           * sure the SAs can be garbage collected. ipsr_sa has a soft reference
12176 12186           * on the SAs.
12177 12187           * If in the future we want the hard latching of the SAs in the
12178 12188           * ip_xmit_attr_t then we should remove this.
12179 12189           */
12180 12190          if (ixa->ixa_ipsec_esp_sa != NULL) {
12181 12191                  IPSA_REFRELE(ixa->ixa_ipsec_esp_sa);
12182 12192                  ixa->ixa_ipsec_esp_sa = NULL;
12183 12193          }
12184 12194          if (ixa->ixa_ipsec_ah_sa != NULL) {
12185 12195                  IPSA_REFRELE(ixa->ixa_ipsec_ah_sa);
12186 12196                  ixa->ixa_ipsec_ah_sa = NULL;
12187 12197          }
12188 12198  
12189 12199          /* Do we need to fragment? */
12190 12200          if ((ixa->ixa_flags & IXAF_IPV6_ADD_FRAGHDR) ||
12191 12201              pktlen > ixa->ixa_fragsize) {
12192 12202                  if (ixaflags & IXAF_IS_IPV4) {
12193 12203                          ASSERT(!(ixa->ixa_flags & IXAF_IPV6_ADD_FRAGHDR));
12194 12204                          /*
12195 12205                           * We check for the DF case in ipsec_out_process
12196 12206                           * hence this only handles the non-DF case.
12197 12207                           */
12198 12208                          return (ip_fragment_v4(mp, ixa->ixa_nce, ixa->ixa_flags,
12199 12209                              pktlen, ixa->ixa_fragsize,
12200 12210                              ixa->ixa_xmit_hint, ixa->ixa_zoneid,
12201 12211                              ixa->ixa_no_loop_zoneid, ixa->ixa_postfragfn,
12202 12212                              &ixa->ixa_cookie));
12203 12213                  } else {
12204 12214                          mp = ip_fraghdr_add_v6(mp, ixa->ixa_ident, ixa);
12205 12215                          if (mp == NULL) {
12206 12216                                  /* MIB and ip_drop_output already done */
12207 12217                                  return (ENOMEM);
12208 12218                          }
12209 12219                          pktlen += sizeof (ip6_frag_t);
12210 12220                          if (pktlen > ixa->ixa_fragsize) {
12211 12221                                  return (ip_fragment_v6(mp, ixa->ixa_nce,
12212 12222                                      ixa->ixa_flags, pktlen,
12213 12223                                      ixa->ixa_fragsize, ixa->ixa_xmit_hint,
12214 12224                                      ixa->ixa_zoneid, ixa->ixa_no_loop_zoneid,
12215 12225                                      ixa->ixa_postfragfn, &ixa->ixa_cookie));
12216 12226                          }
12217 12227                  }
12218 12228          }
12219 12229          return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixa->ixa_flags,
12220 12230              pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid,
12221 12231              ixa->ixa_no_loop_zoneid, NULL));
12222 12232  }
12223 12233  
12224 12234  /*
12225 12235   * Finish the inbound IPsec processing. This function is called from
12226 12236   * ipsec_out_process() if the IPsec packet was processed
12227 12237   * synchronously, or from {ah,esp}_kcf_callback_outbound() if it was processed
12228 12238   * asynchronously.
12229 12239   *
12230 12240   * This is common to IPv4 and IPv6.
12231 12241   */
12232 12242  void
12233 12243  ip_input_post_ipsec(mblk_t *mp, ip_recv_attr_t *ira)
12234 12244  {
12235 12245          iaflags_t       iraflags = ira->ira_flags;
12236 12246  
12237 12247          /* Length might have changed */
12238 12248          if (iraflags & IRAF_IS_IPV4) {
12239 12249                  ipha_t          *ipha = (ipha_t *)mp->b_rptr;
12240 12250  
12241 12251                  ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
12242 12252                  ira->ira_pktlen = ntohs(ipha->ipha_length);
12243 12253                  ira->ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
12244 12254                  ira->ira_protocol = ipha->ipha_protocol;
12245 12255  
12246 12256                  ip_fanout_v4(mp, ipha, ira);
12247 12257          } else {
12248 12258                  ip6_t           *ip6h = (ip6_t *)mp->b_rptr;
12249 12259                  uint8_t         *nexthdrp;
12250 12260  
12251 12261                  ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV6_VERSION);
12252 12262                  ira->ira_pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
12253 12263                  if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ira->ira_ip_hdr_length,
12254 12264                      &nexthdrp)) {
12255 12265                          /* Malformed packet */
12256 12266                          BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
12257 12267                          ip_drop_input("ipIfStatsInDiscards", mp, ira->ira_ill);
12258 12268                          freemsg(mp);
12259 12269                          return;
12260 12270                  }
12261 12271                  ira->ira_protocol = *nexthdrp;
12262 12272                  ip_fanout_v6(mp, ip6h, ira);
12263 12273          }
12264 12274  }
12265 12275  
12266 12276  /*
12267 12277   * Select which AH & ESP SA's to use (if any) for the outbound packet.
12268 12278   *
12269 12279   * If this function returns B_TRUE, the requested SA's have been filled
12270 12280   * into the ixa_ipsec_*_sa pointers.
12271 12281   *
12272 12282   * If the function returns B_FALSE, the packet has been "consumed", most
12273 12283   * likely by an ACQUIRE sent up via PF_KEY to a key management daemon.
12274 12284   *
12275 12285   * The SA references created by the protocol-specific "select"
12276 12286   * function will be released in ip_output_post_ipsec.
12277 12287   */
12278 12288  static boolean_t
12279 12289  ipsec_out_select_sa(mblk_t *mp, ip_xmit_attr_t *ixa)
12280 12290  {
12281 12291          boolean_t need_ah_acquire = B_FALSE, need_esp_acquire = B_FALSE;
12282 12292          ipsec_policy_t *pp;
12283 12293          ipsec_action_t *ap;
12284 12294  
12285 12295          ASSERT(ixa->ixa_flags & IXAF_IPSEC_SECURE);
12286 12296          ASSERT((ixa->ixa_ipsec_policy != NULL) ||
12287 12297              (ixa->ixa_ipsec_action != NULL));
12288 12298  
12289 12299          ap = ixa->ixa_ipsec_action;
12290 12300          if (ap == NULL) {
12291 12301                  pp = ixa->ixa_ipsec_policy;
12292 12302                  ASSERT(pp != NULL);
12293 12303                  ap = pp->ipsp_act;
12294 12304                  ASSERT(ap != NULL);
12295 12305          }
12296 12306  
12297 12307          /*
12298 12308           * We have an action.  now, let's select SA's.
12299 12309           * A side effect of setting ixa_ipsec_*_sa is that it will
12300 12310           * be cached in the conn_t.
12301 12311           */
12302 12312          if (ap->ipa_want_esp) {
12303 12313                  if (ixa->ixa_ipsec_esp_sa == NULL) {
12304 12314                          need_esp_acquire = !ipsec_outbound_sa(mp, ixa,
12305 12315                              IPPROTO_ESP);
12306 12316                  }
12307 12317                  ASSERT(need_esp_acquire || ixa->ixa_ipsec_esp_sa != NULL);
12308 12318          }
12309 12319  
12310 12320          if (ap->ipa_want_ah) {
12311 12321                  if (ixa->ixa_ipsec_ah_sa == NULL) {
12312 12322                          need_ah_acquire = !ipsec_outbound_sa(mp, ixa,
12313 12323                              IPPROTO_AH);
12314 12324                  }
12315 12325                  ASSERT(need_ah_acquire || ixa->ixa_ipsec_ah_sa != NULL);
12316 12326                  /*
12317 12327                   * The ESP and AH processing order needs to be preserved
12318 12328                   * when both protocols are required (ESP should be applied
12319 12329                   * before AH for an outbound packet). Force an ESP ACQUIRE
12320 12330                   * when both ESP and AH are required, and an AH ACQUIRE
12321 12331                   * is needed.
12322 12332                   */
12323 12333                  if (ap->ipa_want_esp && need_ah_acquire)
12324 12334                          need_esp_acquire = B_TRUE;
12325 12335          }
12326 12336  
12327 12337          /*
12328 12338           * Send an ACQUIRE (extended, regular, or both) if we need one.
12329 12339           * Release SAs that got referenced, but will not be used until we
12330 12340           * acquire _all_ of the SAs we need.
12331 12341           */
12332 12342          if (need_ah_acquire || need_esp_acquire) {
12333 12343                  if (ixa->ixa_ipsec_ah_sa != NULL) {
12334 12344                          IPSA_REFRELE(ixa->ixa_ipsec_ah_sa);
12335 12345                          ixa->ixa_ipsec_ah_sa = NULL;
12336 12346                  }
12337 12347                  if (ixa->ixa_ipsec_esp_sa != NULL) {
12338 12348                          IPSA_REFRELE(ixa->ixa_ipsec_esp_sa);
12339 12349                          ixa->ixa_ipsec_esp_sa = NULL;
12340 12350                  }
12341 12351  
12342 12352                  sadb_acquire(mp, ixa, need_ah_acquire, need_esp_acquire);
12343 12353                  return (B_FALSE);
12344 12354          }
12345 12355  
12346 12356          return (B_TRUE);
12347 12357  }
12348 12358  
12349 12359  /*
12350 12360   * Handle IPsec output processing.
12351 12361   * This function is only entered once for a given packet.
12352 12362   * We try to do things synchronously, but if we need to have user-level
12353 12363   * set up SAs, or ESP or AH uses asynchronous kEF, then the operation
12354 12364   * will be completed
12355 12365   *  - when the SAs are added in esp_add_sa_finish/ah_add_sa_finish
12356 12366   *  - when asynchronous ESP is done it will do AH
12357 12367   *
12358 12368   * In all cases we come back in ip_output_post_ipsec() to fragment and
12359 12369   * send out the packet.
12360 12370   */
12361 12371  int
12362 12372  ipsec_out_process(mblk_t *mp, ip_xmit_attr_t *ixa)
12363 12373  {
12364 12374          ill_t           *ill = ixa->ixa_nce->nce_ill;
12365 12375          ip_stack_t      *ipst = ixa->ixa_ipst;
12366 12376          ipsec_stack_t   *ipss;
12367 12377          ipsec_policy_t  *pp;
12368 12378          ipsec_action_t  *ap;
12369 12379  
12370 12380          ASSERT(ixa->ixa_flags & IXAF_IPSEC_SECURE);
12371 12381  
12372 12382          ASSERT((ixa->ixa_ipsec_policy != NULL) ||
12373 12383              (ixa->ixa_ipsec_action != NULL));
12374 12384  
12375 12385          ipss = ipst->ips_netstack->netstack_ipsec;
12376 12386          if (!ipsec_loaded(ipss)) {
12377 12387                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
12378 12388                  ip_drop_packet(mp, B_TRUE, ill,
12379 12389                      DROPPER(ipss, ipds_ip_ipsec_not_loaded),
12380 12390                      &ipss->ipsec_dropper);
12381 12391                  return (ENOTSUP);
12382 12392          }
12383 12393  
12384 12394          ap = ixa->ixa_ipsec_action;
12385 12395          if (ap == NULL) {
12386 12396                  pp = ixa->ixa_ipsec_policy;
12387 12397                  ASSERT(pp != NULL);
12388 12398                  ap = pp->ipsp_act;
12389 12399                  ASSERT(ap != NULL);
12390 12400          }
12391 12401  
12392 12402          /* Handle explicit drop action and bypass. */
12393 12403          switch (ap->ipa_act.ipa_type) {
12394 12404          case IPSEC_ACT_DISCARD:
12395 12405          case IPSEC_ACT_REJECT:
12396 12406                  ip_drop_packet(mp, B_FALSE, ill,
12397 12407                      DROPPER(ipss, ipds_spd_explicit), &ipss->ipsec_spd_dropper);
12398 12408                  return (EHOSTUNREACH);  /* IPsec policy failure */
12399 12409          case IPSEC_ACT_BYPASS:
12400 12410                  return (ip_output_post_ipsec(mp, ixa));
12401 12411          }
12402 12412  
12403 12413          /*
12404 12414           * The order of processing is first insert a IP header if needed.
12405 12415           * Then insert the ESP header and then the AH header.
12406 12416           */
12407 12417          if ((ixa->ixa_flags & IXAF_IS_IPV4) && ap->ipa_want_se) {
12408 12418                  /*
12409 12419                   * First get the outer IP header before sending
12410 12420                   * it to ESP.
12411 12421                   */
12412 12422                  ipha_t *oipha, *iipha;
12413 12423                  mblk_t *outer_mp, *inner_mp;
12414 12424  
12415 12425                  if ((outer_mp = allocb(sizeof (ipha_t), BPRI_HI)) == NULL) {
12416 12426                          (void) mi_strlog(ill->ill_rq, 0,
12417 12427                              SL_ERROR|SL_TRACE|SL_CONSOLE,
12418 12428                              "ipsec_out_process: "
12419 12429                              "Self-Encapsulation failed: Out of memory\n");
12420 12430                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
12421 12431                          ip_drop_output("ipIfStatsOutDiscards", mp, ill);
12422 12432                          freemsg(mp);
12423 12433                          return (ENOBUFS);
12424 12434                  }
12425 12435                  inner_mp = mp;
12426 12436                  ASSERT(inner_mp->b_datap->db_type == M_DATA);
12427 12437                  oipha = (ipha_t *)outer_mp->b_rptr;
12428 12438                  iipha = (ipha_t *)inner_mp->b_rptr;
12429 12439                  *oipha = *iipha;
12430 12440                  outer_mp->b_wptr += sizeof (ipha_t);
12431 12441                  oipha->ipha_length = htons(ntohs(iipha->ipha_length) +
12432 12442                      sizeof (ipha_t));
12433 12443                  oipha->ipha_protocol = IPPROTO_ENCAP;
12434 12444                  oipha->ipha_version_and_hdr_length =
12435 12445                      IP_SIMPLE_HDR_VERSION;
12436 12446                  oipha->ipha_hdr_checksum = 0;
12437 12447                  oipha->ipha_hdr_checksum = ip_csum_hdr(oipha);
12438 12448                  outer_mp->b_cont = inner_mp;
12439 12449                  mp = outer_mp;
12440 12450  
12441 12451                  ixa->ixa_flags |= IXAF_IPSEC_TUNNEL;
12442 12452          }
12443 12453  
12444 12454          /* If we need to wait for a SA then we can't return any errno */
12445 12455          if (((ap->ipa_want_ah && (ixa->ixa_ipsec_ah_sa == NULL)) ||
12446 12456              (ap->ipa_want_esp && (ixa->ixa_ipsec_esp_sa == NULL))) &&
12447 12457              !ipsec_out_select_sa(mp, ixa))
12448 12458                  return (0);
12449 12459  
12450 12460          /*
12451 12461           * By now, we know what SA's to use.  Toss over to ESP & AH
12452 12462           * to do the heavy lifting.
12453 12463           */
12454 12464          if (ap->ipa_want_esp) {
12455 12465                  ASSERT(ixa->ixa_ipsec_esp_sa != NULL);
12456 12466  
12457 12467                  mp = ixa->ixa_ipsec_esp_sa->ipsa_output_func(mp, ixa);
12458 12468                  if (mp == NULL) {
12459 12469                          /*
12460 12470                           * Either it failed or is pending. In the former case
12461 12471                           * ipIfStatsInDiscards was increased.
12462 12472                           */
12463 12473                          return (0);
12464 12474                  }
12465 12475          }
12466 12476  
12467 12477          if (ap->ipa_want_ah) {
12468 12478                  ASSERT(ixa->ixa_ipsec_ah_sa != NULL);
12469 12479  
12470 12480                  mp = ixa->ixa_ipsec_ah_sa->ipsa_output_func(mp, ixa);
12471 12481                  if (mp == NULL) {
12472 12482                          /*
12473 12483                           * Either it failed or is pending. In the former case
12474 12484                           * ipIfStatsInDiscards was increased.
12475 12485                           */
12476 12486                          return (0);
12477 12487                  }
12478 12488          }
12479 12489          /*
12480 12490           * We are done with IPsec processing. Send it over
12481 12491           * the wire.
12482 12492           */
12483 12493          return (ip_output_post_ipsec(mp, ixa));
12484 12494  }
12485 12495  
12486 12496  /*
12487 12497   * ioctls that go through a down/up sequence may need to wait for the down
12488 12498   * to complete. This involves waiting for the ire and ipif refcnts to go down
12489 12499   * to zero. Subsequently the ioctl is restarted from ipif_ill_refrele_tail.
12490 12500   */
12491 12501  /* ARGSUSED */
12492 12502  void
12493 12503  ip_reprocess_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
12494 12504  {
12495 12505          struct iocblk *iocp;
12496 12506          mblk_t *mp1;
12497 12507          ip_ioctl_cmd_t *ipip;
12498 12508          int err;
12499 12509          sin_t   *sin;
12500 12510          struct lifreq *lifr;
12501 12511          struct ifreq *ifr;
12502 12512  
12503 12513          iocp = (struct iocblk *)mp->b_rptr;
12504 12514          ASSERT(ipsq != NULL);
12505 12515          /* Existence of mp1 verified in ip_wput_nondata */
12506 12516          mp1 = mp->b_cont->b_cont;
12507 12517          ipip = ip_sioctl_lookup(iocp->ioc_cmd);
12508 12518          if (ipip->ipi_cmd == SIOCSLIFNAME || ipip->ipi_cmd == IF_UNITSEL) {
12509 12519                  /*
12510 12520                   * Special case where ipx_current_ipif is not set:
12511 12521                   * ill_phyint_reinit merged the v4 and v6 into a single ipsq.
12512 12522                   * We are here as were not able to complete the operation in
12513 12523                   * ipif_set_values because we could not become exclusive on
12514 12524                   * the new ipsq.
12515 12525                   */
12516 12526                  ill_t *ill = q->q_ptr;
12517 12527                  ipsq_current_start(ipsq, ill->ill_ipif, ipip->ipi_cmd);
12518 12528          }
12519 12529          ASSERT(ipsq->ipsq_xop->ipx_current_ipif != NULL);
12520 12530  
12521 12531          if (ipip->ipi_cmd_type == IF_CMD) {
12522 12532                  /* This a old style SIOC[GS]IF* command */
12523 12533                  ifr = (struct ifreq *)mp1->b_rptr;
12524 12534                  sin = (sin_t *)&ifr->ifr_addr;
12525 12535          } else if (ipip->ipi_cmd_type == LIF_CMD) {
12526 12536                  /* This a new style SIOC[GS]LIF* command */
12527 12537                  lifr = (struct lifreq *)mp1->b_rptr;
12528 12538                  sin = (sin_t *)&lifr->lifr_addr;
12529 12539          } else {
12530 12540                  sin = NULL;
12531 12541          }
12532 12542  
12533 12543          err = (*ipip->ipi_func_restart)(ipsq->ipsq_xop->ipx_current_ipif, sin,
12534 12544              q, mp, ipip, mp1->b_rptr);
12535 12545  
12536 12546          DTRACE_PROBE4(ipif__ioctl, char *, "ip_reprocess_ioctl finish",
12537 12547              int, ipip->ipi_cmd,
12538 12548              ill_t *, ipsq->ipsq_xop->ipx_current_ipif->ipif_ill,
12539 12549              ipif_t *, ipsq->ipsq_xop->ipx_current_ipif);
12540 12550  
12541 12551          ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq);
12542 12552  }
12543 12553  
12544 12554  /*
12545 12555   * ioctl processing
12546 12556   *
12547 12557   * ioctl processing starts with ip_sioctl_copyin_setup(), which looks up
12548 12558   * the ioctl command in the ioctl tables, determines the copyin data size
12549 12559   * from the ipi_copyin_size field, and does an mi_copyin() of that size.
12550 12560   *
12551 12561   * ioctl processing then continues when the M_IOCDATA makes its way down to
12552 12562   * ip_wput_nondata().  The ioctl is looked up again in the ioctl table, its
12553 12563   * associated 'conn' is refheld till the end of the ioctl and the general
12554 12564   * ioctl processing function ip_process_ioctl() is called to extract the
12555 12565   * arguments and process the ioctl.  To simplify extraction, ioctl commands
12556 12566   * are "typed" based on the arguments they take (e.g., LIF_CMD which takes a
12557 12567   * `struct lifreq'), and a common extract function (e.g., ip_extract_lifreq())
12558 12568   * is used to extract the ioctl's arguments.
12559 12569   *
12560 12570   * ip_process_ioctl determines if the ioctl needs to be serialized, and if
12561 12571   * so goes thru the serialization primitive ipsq_try_enter. Then the
12562 12572   * appropriate function to handle the ioctl is called based on the entry in
12563 12573   * the ioctl table. ioctl completion is encapsulated in ip_ioctl_finish
12564 12574   * which also refreleases the 'conn' that was refheld at the start of the
12565 12575   * ioctl. Finally ipsq_exit is called if needed to exit the ipsq.
12566 12576   *
12567 12577   * Many exclusive ioctls go thru an internal down up sequence as part of
12568 12578   * the operation. For example an attempt to change the IP address of an
12569 12579   * ipif entails ipif_down, set address, ipif_up. Bringing down the interface
12570 12580   * does all the cleanup such as deleting all ires that use this address.
12571 12581   * Then we need to wait till all references to the interface go away.
12572 12582   */
12573 12583  void
12574 12584  ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
12575 12585  {
12576 12586          struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
12577 12587          ip_ioctl_cmd_t *ipip = arg;
12578 12588          ip_extract_func_t *extract_funcp;
12579 12589          ill_t *ill;
12580 12590          cmd_info_t ci;
12581 12591          int err;
12582 12592          boolean_t entered_ipsq = B_FALSE;
12583 12593  
12584 12594          ip3dbg(("ip_process_ioctl: ioctl %X\n", iocp->ioc_cmd));
12585 12595  
12586 12596          if (ipip == NULL)
12587 12597                  ipip = ip_sioctl_lookup(iocp->ioc_cmd);
12588 12598  
12589 12599          /*
12590 12600           * SIOCLIFADDIF needs to go thru a special path since the
12591 12601           * ill may not exist yet. This happens in the case of lo0
12592 12602           * which is created using this ioctl.
12593 12603           */
12594 12604          if (ipip->ipi_cmd == SIOCLIFADDIF) {
12595 12605                  err = ip_sioctl_addif(NULL, NULL, q, mp, NULL, NULL);
12596 12606                  DTRACE_PROBE4(ipif__ioctl, char *, "ip_process_ioctl finish",
12597 12607                      int, ipip->ipi_cmd, ill_t *, NULL, ipif_t *, NULL);
12598 12608                  ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL);
12599 12609                  return;
12600 12610          }
12601 12611  
12602 12612          ci.ci_ipif = NULL;
12603 12613          switch (ipip->ipi_cmd_type) {
12604 12614          case MISC_CMD:
12605 12615          case MSFILT_CMD:
12606 12616                  /*
12607 12617                   * All MISC_CMD ioctls come in here -- e.g. SIOCGLIFCONF.
12608 12618                   */
12609 12619                  if (ipip->ipi_cmd == IF_UNITSEL) {
12610 12620                          /* ioctl comes down the ill */
12611 12621                          ci.ci_ipif = ((ill_t *)q->q_ptr)->ill_ipif;
12612 12622                          ipif_refhold(ci.ci_ipif);
12613 12623                  }
12614 12624                  err = 0;
12615 12625                  ci.ci_sin = NULL;
12616 12626                  ci.ci_sin6 = NULL;
12617 12627                  ci.ci_lifr = NULL;
12618 12628                  extract_funcp = NULL;
12619 12629                  break;
12620 12630  
12621 12631          case IF_CMD:
12622 12632          case LIF_CMD:
12623 12633                  extract_funcp = ip_extract_lifreq;
12624 12634                  break;
12625 12635  
12626 12636          case ARP_CMD:
12627 12637          case XARP_CMD:
12628 12638                  extract_funcp = ip_extract_arpreq;
12629 12639                  break;
12630 12640  
12631 12641          default:
12632 12642                  ASSERT(0);
12633 12643          }
12634 12644  
12635 12645          if (extract_funcp != NULL) {
12636 12646                  err = (*extract_funcp)(q, mp, ipip, &ci);
12637 12647                  if (err != 0) {
12638 12648                          DTRACE_PROBE4(ipif__ioctl,
12639 12649                              char *, "ip_process_ioctl finish err",
12640 12650                              int, ipip->ipi_cmd, ill_t *, NULL, ipif_t *, NULL);
12641 12651                          ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL);
12642 12652                          return;
12643 12653                  }
12644 12654  
12645 12655                  /*
12646 12656                   * All of the extraction functions return a refheld ipif.
12647 12657                   */
12648 12658                  ASSERT(ci.ci_ipif != NULL);
12649 12659          }
12650 12660  
12651 12661          if (!(ipip->ipi_flags & IPI_WR)) {
12652 12662                  /*
12653 12663                   * A return value of EINPROGRESS means the ioctl is
12654 12664                   * either queued and waiting for some reason or has
12655 12665                   * already completed.
12656 12666                   */
12657 12667                  err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip,
12658 12668                      ci.ci_lifr);
12659 12669                  if (ci.ci_ipif != NULL) {
12660 12670                          DTRACE_PROBE4(ipif__ioctl,
12661 12671                              char *, "ip_process_ioctl finish RD",
12662 12672                              int, ipip->ipi_cmd, ill_t *, ci.ci_ipif->ipif_ill,
12663 12673                              ipif_t *, ci.ci_ipif);
12664 12674                          ipif_refrele(ci.ci_ipif);
12665 12675                  } else {
12666 12676                          DTRACE_PROBE4(ipif__ioctl,
12667 12677                              char *, "ip_process_ioctl finish RD",
12668 12678                              int, ipip->ipi_cmd, ill_t *, NULL, ipif_t *, NULL);
12669 12679                  }
12670 12680                  ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL);
12671 12681                  return;
12672 12682          }
12673 12683  
12674 12684          ASSERT(ci.ci_ipif != NULL);
12675 12685  
12676 12686          /*
12677 12687           * If ipsq is non-NULL, we are already being called exclusively
12678 12688           */
12679 12689          ASSERT(ipsq == NULL || IAM_WRITER_IPSQ(ipsq));
12680 12690          if (ipsq == NULL) {
12681 12691                  ipsq = ipsq_try_enter(ci.ci_ipif, NULL, q, mp, ip_process_ioctl,
12682 12692                      NEW_OP, B_TRUE);
12683 12693                  if (ipsq == NULL) {
12684 12694                          ipif_refrele(ci.ci_ipif);
12685 12695                          return;
12686 12696                  }
12687 12697                  entered_ipsq = B_TRUE;
12688 12698          }
12689 12699          /*
12690 12700           * Release the ipif so that ipif_down and friends that wait for
12691 12701           * references to go away are not misled about the current ipif_refcnt
12692 12702           * values. We are writer so we can access the ipif even after releasing
12693 12703           * the ipif.
12694 12704           */
12695 12705          ipif_refrele(ci.ci_ipif);
12696 12706  
12697 12707          ipsq_current_start(ipsq, ci.ci_ipif, ipip->ipi_cmd);
12698 12708  
12699 12709          /*
12700 12710           * We need to cache the ill_t that we're going to use as the argument
12701 12711           * to the ipif-ioctl DTrace probe (below) because the ci_ipif can be
12702 12712           * blown away by calling ipi_func.
12703 12713           */
12704 12714          ill = ci.ci_ipif == NULL ? NULL : ci.ci_ipif->ipif_ill;
12705 12715  
12706 12716          /*
12707 12717           * A return value of EINPROGRESS means the ioctl is
12708 12718           * either queued and waiting for some reason or has
12709 12719           * already completed.
12710 12720           */
12711 12721          err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, ci.ci_lifr);
12712 12722  
12713 12723          DTRACE_PROBE4(ipif__ioctl, char *, "ip_process_ioctl finish WR",
12714 12724              int, ipip->ipi_cmd, ill_t *, ill, ipif_t *, ci.ci_ipif);
12715 12725          ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq);
12716 12726  
12717 12727          if (entered_ipsq)
12718 12728                  ipsq_exit(ipsq);
12719 12729  }
12720 12730  
12721 12731  /*
12722 12732   * Complete the ioctl. Typically ioctls use the mi package and need to
12723 12733   * do mi_copyout/mi_copy_done.
12724 12734   */
12725 12735  void
12726 12736  ip_ioctl_finish(queue_t *q, mblk_t *mp, int err, int mode, ipsq_t *ipsq)
12727 12737  {
12728 12738          conn_t  *connp = NULL;
12729 12739  
12730 12740          if (err == EINPROGRESS)
12731 12741                  return;
12732 12742  
12733 12743          if (CONN_Q(q)) {
12734 12744                  connp = Q_TO_CONN(q);
12735 12745                  ASSERT(connp->conn_ref >= 2);
12736 12746          }
12737 12747  
12738 12748          switch (mode) {
12739 12749          case COPYOUT:
12740 12750                  if (err == 0)
12741 12751                          mi_copyout(q, mp);
12742 12752                  else
12743 12753                          mi_copy_done(q, mp, err);
12744 12754                  break;
12745 12755  
12746 12756          case NO_COPYOUT:
12747 12757                  mi_copy_done(q, mp, err);
12748 12758                  break;
12749 12759  
12750 12760          default:
12751 12761                  ASSERT(mode == CONN_CLOSE);     /* aborted through CONN_CLOSE */
12752 12762                  break;
12753 12763          }
12754 12764  
12755 12765          /*
12756 12766           * The conn refhold and ioctlref placed on the conn at the start of the
12757 12767           * ioctl are released here.
12758 12768           */
12759 12769          if (connp != NULL) {
12760 12770                  CONN_DEC_IOCTLREF(connp);
12761 12771                  CONN_OPER_PENDING_DONE(connp);
12762 12772          }
12763 12773  
12764 12774          if (ipsq != NULL)
12765 12775                  ipsq_current_finish(ipsq);
12766 12776  }
12767 12777  
12768 12778  /* Handles all non data messages */
12769 12779  void
12770 12780  ip_wput_nondata(queue_t *q, mblk_t *mp)
12771 12781  {
12772 12782          mblk_t          *mp1;
12773 12783          struct iocblk   *iocp;
12774 12784          ip_ioctl_cmd_t  *ipip;
12775 12785          conn_t          *connp;
12776 12786          cred_t          *cr;
12777 12787          char            *proto_str;
12778 12788  
12779 12789          if (CONN_Q(q))
12780 12790                  connp = Q_TO_CONN(q);
12781 12791          else
12782 12792                  connp = NULL;
12783 12793  
12784 12794          switch (DB_TYPE(mp)) {
12785 12795          case M_IOCTL:
12786 12796                  /*
12787 12797                   * IOCTL processing begins in ip_sioctl_copyin_setup which
12788 12798                   * will arrange to copy in associated control structures.
12789 12799                   */
12790 12800                  ip_sioctl_copyin_setup(q, mp);
12791 12801                  return;
12792 12802          case M_IOCDATA:
12793 12803                  /*
12794 12804                   * Ensure that this is associated with one of our trans-
12795 12805                   * parent ioctls.  If it's not ours, discard it if we're
12796 12806                   * running as a driver, or pass it on if we're a module.
12797 12807                   */
12798 12808                  iocp = (struct iocblk *)mp->b_rptr;
12799 12809                  ipip = ip_sioctl_lookup(iocp->ioc_cmd);
12800 12810                  if (ipip == NULL) {
12801 12811                          if (q->q_next == NULL) {
12802 12812                                  goto nak;
12803 12813                          } else {
12804 12814                                  putnext(q, mp);
12805 12815                          }
12806 12816                          return;
12807 12817                  }
12808 12818                  if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) {
12809 12819                          /*
12810 12820                           * The ioctl is one we recognise, but is not consumed
12811 12821                           * by IP as a module and we are a module, so we drop
12812 12822                           */
12813 12823                          goto nak;
12814 12824                  }
12815 12825  
12816 12826                  /* IOCTL continuation following copyin or copyout. */
12817 12827                  if (mi_copy_state(q, mp, NULL) == -1) {
12818 12828                          /*
12819 12829                           * The copy operation failed.  mi_copy_state already
12820 12830                           * cleaned up, so we're out of here.
12821 12831                           */
12822 12832                          return;
12823 12833                  }
12824 12834                  /*
12825 12835                   * If we just completed a copy in, we become writer and
12826 12836                   * continue processing in ip_sioctl_copyin_done.  If it
12827 12837                   * was a copy out, we call mi_copyout again.  If there is
12828 12838                   * nothing more to copy out, it will complete the IOCTL.
12829 12839                   */
12830 12840                  if (MI_COPY_DIRECTION(mp) == MI_COPY_IN) {
12831 12841                          if (!(mp1 = mp->b_cont) || !(mp1 = mp1->b_cont)) {
12832 12842                                  mi_copy_done(q, mp, EPROTO);
12833 12843                                  return;
12834 12844                          }
12835 12845                          /*
12836 12846                           * Check for cases that need more copying.  A return
12837 12847                           * value of 0 means a second copyin has been started,
12838 12848                           * so we return; a return value of 1 means no more
12839 12849                           * copying is needed, so we continue.
12840 12850                           */
12841 12851                          if (ipip->ipi_cmd_type == MSFILT_CMD &&
12842 12852                              MI_COPY_COUNT(mp) == 1) {
12843 12853                                  if (ip_copyin_msfilter(q, mp) == 0)
12844 12854                                          return;
12845 12855                          }
12846 12856                          /*
12847 12857                           * Refhold the conn, till the ioctl completes. This is
12848 12858                           * needed in case the ioctl ends up in the pending mp
12849 12859                           * list. Every mp in the ipx_pending_mp list must have
12850 12860                           * a refhold on the conn to resume processing. The
12851 12861                           * refhold is released when the ioctl completes
12852 12862                           * (whether normally or abnormally). An ioctlref is also
12853 12863                           * placed on the conn to prevent TCP from removing the
12854 12864                           * queue needed to send the ioctl reply back.
12855 12865                           * In all cases ip_ioctl_finish is called to finish
12856 12866                           * the ioctl and release the refholds.
12857 12867                           */
12858 12868                          if (connp != NULL) {
12859 12869                                  /* This is not a reentry */
12860 12870                                  CONN_INC_REF(connp);
12861 12871                                  CONN_INC_IOCTLREF(connp);
12862 12872                          } else {
12863 12873                                  if (!(ipip->ipi_flags & IPI_MODOK)) {
12864 12874                                          mi_copy_done(q, mp, EINVAL);
12865 12875                                          return;
12866 12876                                  }
12867 12877                          }
12868 12878  
12869 12879                          ip_process_ioctl(NULL, q, mp, ipip);
12870 12880  
12871 12881                  } else {
12872 12882                          mi_copyout(q, mp);
12873 12883                  }
12874 12884                  return;
12875 12885  
12876 12886          case M_IOCNAK:
12877 12887                  /*
12878 12888                   * The only way we could get here is if a resolver didn't like
12879 12889                   * an IOCTL we sent it.  This shouldn't happen.
12880 12890                   */
12881 12891                  (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
12882 12892                      "ip_wput_nondata: unexpected M_IOCNAK, ioc_cmd 0x%x",
12883 12893                      ((struct iocblk *)mp->b_rptr)->ioc_cmd);
12884 12894                  freemsg(mp);
12885 12895                  return;
12886 12896          case M_IOCACK:
12887 12897                  /* /dev/ip shouldn't see this */
12888 12898                  goto nak;
12889 12899          case M_FLUSH:
12890 12900                  if (*mp->b_rptr & FLUSHW)
12891 12901                          flushq(q, FLUSHALL);
12892 12902                  if (q->q_next) {
12893 12903                          putnext(q, mp);
12894 12904                          return;
12895 12905                  }
12896 12906                  if (*mp->b_rptr & FLUSHR) {
12897 12907                          *mp->b_rptr &= ~FLUSHW;
12898 12908                          qreply(q, mp);
12899 12909                          return;
12900 12910                  }
12901 12911                  freemsg(mp);
12902 12912                  return;
12903 12913          case M_CTL:
12904 12914                  break;
12905 12915          case M_PROTO:
12906 12916          case M_PCPROTO:
12907 12917                  /*
12908 12918                   * The only PROTO messages we expect are SNMP-related.
12909 12919                   */
12910 12920                  switch (((union T_primitives *)mp->b_rptr)->type) {
12911 12921                  case T_SVR4_OPTMGMT_REQ:
12912 12922                          ip2dbg(("ip_wput_nondata: T_SVR4_OPTMGMT_REQ "
12913 12923                              "flags %x\n",
12914 12924                              ((struct T_optmgmt_req *)mp->b_rptr)->MGMT_flags));
12915 12925  
12916 12926                          if (connp == NULL) {
12917 12927                                  proto_str = "T_SVR4_OPTMGMT_REQ";
12918 12928                                  goto protonak;
12919 12929                          }
12920 12930  
12921 12931                          /*
12922 12932                           * All Solaris components should pass a db_credp
12923 12933                           * for this TPI message, hence we ASSERT.
12924 12934                           * But in case there is some other M_PROTO that looks
12925 12935                           * like a TPI message sent by some other kernel
12926 12936                           * component, we check and return an error.
12927 12937                           */
12928 12938                          cr = msg_getcred(mp, NULL);
12929 12939                          ASSERT(cr != NULL);
12930 12940                          if (cr == NULL) {
12931 12941                                  mp = mi_tpi_err_ack_alloc(mp, TSYSERR, EINVAL);
12932 12942                                  if (mp != NULL)
12933 12943                                          qreply(q, mp);
12934 12944                                  return;
12935 12945                          }
12936 12946  
12937 12947                          if (!snmpcom_req(q, mp, ip_snmp_set, ip_snmp_get, cr)) {
12938 12948                                  proto_str = "Bad SNMPCOM request?";
12939 12949                                  goto protonak;
12940 12950                          }
12941 12951                          return;
12942 12952                  default:
12943 12953                          ip1dbg(("ip_wput_nondata: dropping M_PROTO prim %u\n",
12944 12954                              (int)*(uint_t *)mp->b_rptr));
12945 12955                          freemsg(mp);
12946 12956                          return;
12947 12957                  }
12948 12958          default:
12949 12959                  break;
12950 12960          }
12951 12961          if (q->q_next) {
12952 12962                  putnext(q, mp);
12953 12963          } else
12954 12964                  freemsg(mp);
12955 12965          return;
12956 12966  
12957 12967  nak:
12958 12968          iocp->ioc_error = EINVAL;
12959 12969          mp->b_datap->db_type = M_IOCNAK;
12960 12970          iocp->ioc_count = 0;
12961 12971          qreply(q, mp);
12962 12972          return;
12963 12973  
12964 12974  protonak:
12965 12975          cmn_err(CE_NOTE, "IP doesn't process %s as a module", proto_str);
12966 12976          if ((mp = mi_tpi_err_ack_alloc(mp, TPROTO, EINVAL)) != NULL)
12967 12977                  qreply(q, mp);
12968 12978  }
12969 12979  
12970 12980  /*
12971 12981   * Process IP options in an outbound packet.  Verify that the nexthop in a
12972 12982   * strict source route is onlink.
12973 12983   * Returns non-zero if something fails in which case an ICMP error has been
12974 12984   * sent and mp freed.
12975 12985   *
12976 12986   * Assumes the ULP has called ip_massage_options to move nexthop into ipha_dst.
12977 12987   */
12978 12988  int
12979 12989  ip_output_options(mblk_t *mp, ipha_t *ipha, ip_xmit_attr_t *ixa, ill_t *ill)
12980 12990  {
12981 12991          ipoptp_t        opts;
12982 12992          uchar_t         *opt;
12983 12993          uint8_t         optval;
12984 12994          uint8_t         optlen;
12985 12995          ipaddr_t        dst;
12986 12996          intptr_t        code = 0;
12987 12997          ire_t           *ire;
12988 12998          ip_stack_t      *ipst = ixa->ixa_ipst;
12989 12999          ip_recv_attr_t  iras;
12990 13000  
12991 13001          ip2dbg(("ip_output_options\n"));
12992 13002  
12993 13003          dst = ipha->ipha_dst;
12994 13004          for (optval = ipoptp_first(&opts, ipha);
12995 13005              optval != IPOPT_EOL;
12996 13006              optval = ipoptp_next(&opts)) {
12997 13007                  opt = opts.ipoptp_cur;
12998 13008                  optlen = opts.ipoptp_len;
12999 13009                  ip2dbg(("ip_output_options: opt %d, len %d\n",
13000 13010                      optval, optlen));
13001 13011                  switch (optval) {
13002 13012                          uint32_t off;
13003 13013                  case IPOPT_SSRR:
13004 13014                  case IPOPT_LSRR:
13005 13015                          if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
13006 13016                                  ip1dbg((
13007 13017                                      "ip_output_options: bad option offset\n"));
13008 13018                                  code = (char *)&opt[IPOPT_OLEN] -
13009 13019                                      (char *)ipha;
13010 13020                                  goto param_prob;
13011 13021                          }
13012 13022                          off = opt[IPOPT_OFFSET];
13013 13023                          ip1dbg(("ip_output_options: next hop 0x%x\n",
13014 13024                              ntohl(dst)));
13015 13025                          /*
13016 13026                           * For strict: verify that dst is directly
13017 13027                           * reachable.
13018 13028                           */
13019 13029                          if (optval == IPOPT_SSRR) {
13020 13030                                  ire = ire_ftable_lookup_v4(dst, 0, 0,
13021 13031                                      IRE_INTERFACE, NULL, ALL_ZONES,
13022 13032                                      ixa->ixa_tsl,
13023 13033                                      MATCH_IRE_TYPE | MATCH_IRE_SECATTR, 0, ipst,
13024 13034                                      NULL);
13025 13035                                  if (ire == NULL) {
13026 13036                                          ip1dbg(("ip_output_options: SSRR not"
13027 13037                                              " directly reachable: 0x%x\n",
13028 13038                                              ntohl(dst)));
13029 13039                                          goto bad_src_route;
13030 13040                                  }
13031 13041                                  ire_refrele(ire);
13032 13042                          }
13033 13043                          break;
13034 13044                  case IPOPT_RR:
13035 13045                          if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
13036 13046                                  ip1dbg((
13037 13047                                      "ip_output_options: bad option offset\n"));
13038 13048                                  code = (char *)&opt[IPOPT_OLEN] -
13039 13049                                      (char *)ipha;
13040 13050                                  goto param_prob;
13041 13051                          }
13042 13052                          break;
13043 13053                  case IPOPT_TS:
13044 13054                          /*
13045 13055                           * Verify that length >=5 and that there is either
13046 13056                           * room for another timestamp or that the overflow
13047 13057                           * counter is not maxed out.
13048 13058                           */
13049 13059                          code = (char *)&opt[IPOPT_OLEN] - (char *)ipha;
13050 13060                          if (optlen < IPOPT_MINLEN_IT) {
13051 13061                                  goto param_prob;
13052 13062                          }
13053 13063                          if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
13054 13064                                  ip1dbg((
13055 13065                                      "ip_output_options: bad option offset\n"));
13056 13066                                  code = (char *)&opt[IPOPT_OFFSET] -
13057 13067                                      (char *)ipha;
13058 13068                                  goto param_prob;
13059 13069                          }
13060 13070                          switch (opt[IPOPT_POS_OV_FLG] & 0x0F) {
13061 13071                          case IPOPT_TS_TSONLY:
13062 13072                                  off = IPOPT_TS_TIMELEN;
13063 13073                                  break;
13064 13074                          case IPOPT_TS_TSANDADDR:
13065 13075                          case IPOPT_TS_PRESPEC:
13066 13076                          case IPOPT_TS_PRESPEC_RFC791:
13067 13077                                  off = IP_ADDR_LEN + IPOPT_TS_TIMELEN;
13068 13078                                  break;
13069 13079                          default:
13070 13080                                  code = (char *)&opt[IPOPT_POS_OV_FLG] -
13071 13081                                      (char *)ipha;
13072 13082                                  goto param_prob;
13073 13083                          }
13074 13084                          if (opt[IPOPT_OFFSET] - 1 + off > optlen &&
13075 13085                              (opt[IPOPT_POS_OV_FLG] & 0xF0) == 0xF0) {
13076 13086                                  /*
13077 13087                                   * No room and the overflow counter is 15
13078 13088                                   * already.
13079 13089                                   */
13080 13090                                  goto param_prob;
13081 13091                          }
13082 13092                          break;
13083 13093                  }
13084 13094          }
13085 13095  
13086 13096          if ((opts.ipoptp_flags & IPOPTP_ERROR) == 0)
13087 13097                  return (0);
13088 13098  
13089 13099          ip1dbg(("ip_output_options: error processing IP options."));
13090 13100          code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha;
13091 13101  
13092 13102  param_prob:
13093 13103          bzero(&iras, sizeof (iras));
13094 13104          iras.ira_ill = iras.ira_rill = ill;
13095 13105          iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
13096 13106          iras.ira_rifindex = iras.ira_ruifindex;
13097 13107          iras.ira_flags = IRAF_IS_IPV4;
13098 13108  
13099 13109          ip_drop_output("ip_output_options", mp, ill);
13100 13110          icmp_param_problem(mp, (uint8_t)code, &iras);
13101 13111          ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
13102 13112          return (-1);
13103 13113  
13104 13114  bad_src_route:
13105 13115          bzero(&iras, sizeof (iras));
13106 13116          iras.ira_ill = iras.ira_rill = ill;
13107 13117          iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
13108 13118          iras.ira_rifindex = iras.ira_ruifindex;
13109 13119          iras.ira_flags = IRAF_IS_IPV4;
13110 13120  
13111 13121          ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ill);
13112 13122          icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, &iras);
13113 13123          ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
13114 13124          return (-1);
13115 13125  }
13116 13126  
13117 13127  /*
13118 13128   * The maximum value of conn_drain_list_cnt is CONN_MAXDRAINCNT.
13119 13129   * conn_drain_list_cnt can be changed by setting conn_drain_nthreads
13120 13130   * thru /etc/system.
13121 13131   */
13122 13132  #define CONN_MAXDRAINCNT        64
13123 13133  
13124 13134  static void
13125 13135  conn_drain_init(ip_stack_t *ipst)
13126 13136  {
13127 13137          int i, j;
13128 13138          idl_tx_list_t *itl_tx;
13129 13139  
13130 13140          ipst->ips_conn_drain_list_cnt = conn_drain_nthreads;
13131 13141  
13132 13142          if ((ipst->ips_conn_drain_list_cnt == 0) ||
13133 13143              (ipst->ips_conn_drain_list_cnt > CONN_MAXDRAINCNT)) {
13134 13144                  /*
13135 13145                   * Default value of the number of drainers is the
13136 13146                   * number of cpus, subject to maximum of 8 drainers.
13137 13147                   */
13138 13148                  if (boot_max_ncpus != -1)
13139 13149                          ipst->ips_conn_drain_list_cnt = MIN(boot_max_ncpus, 8);
13140 13150                  else
13141 13151                          ipst->ips_conn_drain_list_cnt = MIN(max_ncpus, 8);
13142 13152          }
13143 13153  
13144 13154          ipst->ips_idl_tx_list =
13145 13155              kmem_zalloc(TX_FANOUT_SIZE * sizeof (idl_tx_list_t), KM_SLEEP);
13146 13156          for (i = 0; i < TX_FANOUT_SIZE; i++) {
13147 13157                  itl_tx =  &ipst->ips_idl_tx_list[i];
13148 13158                  itl_tx->txl_drain_list =
13149 13159                      kmem_zalloc(ipst->ips_conn_drain_list_cnt *
13150 13160                      sizeof (idl_t), KM_SLEEP);
13151 13161                  mutex_init(&itl_tx->txl_lock, NULL, MUTEX_DEFAULT, NULL);
13152 13162                  for (j = 0; j < ipst->ips_conn_drain_list_cnt; j++) {
13153 13163                          mutex_init(&itl_tx->txl_drain_list[j].idl_lock, NULL,
13154 13164                              MUTEX_DEFAULT, NULL);
13155 13165                          itl_tx->txl_drain_list[j].idl_itl = itl_tx;
13156 13166                  }
13157 13167          }
13158 13168  }
13159 13169  
13160 13170  static void
13161 13171  conn_drain_fini(ip_stack_t *ipst)
13162 13172  {
13163 13173          int i;
13164 13174          idl_tx_list_t *itl_tx;
13165 13175  
13166 13176          for (i = 0; i < TX_FANOUT_SIZE; i++) {
13167 13177                  itl_tx =  &ipst->ips_idl_tx_list[i];
13168 13178                  kmem_free(itl_tx->txl_drain_list,
13169 13179                      ipst->ips_conn_drain_list_cnt * sizeof (idl_t));
13170 13180          }
13171 13181          kmem_free(ipst->ips_idl_tx_list,
13172 13182              TX_FANOUT_SIZE * sizeof (idl_tx_list_t));
13173 13183          ipst->ips_idl_tx_list = NULL;
13174 13184  }
13175 13185  
13176 13186  /*
13177 13187   * Flow control has blocked us from proceeding.  Insert the given conn in one
13178 13188   * of the conn drain lists.  When flow control is unblocked, either ip_wsrv()
13179 13189   * (STREAMS) or ill_flow_enable() (direct) will be called back, which in turn
13180 13190   * will call conn_walk_drain().  See the flow control notes at the top of this
13181 13191   * file for more details.
13182 13192   */
13183 13193  void
13184 13194  conn_drain_insert(conn_t *connp, idl_tx_list_t *tx_list)
13185 13195  {
13186 13196          idl_t   *idl = tx_list->txl_drain_list;
13187 13197          uint_t  index;
13188 13198          ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
13189 13199  
13190 13200          mutex_enter(&connp->conn_lock);
13191 13201          if (connp->conn_state_flags & CONN_CLOSING) {
13192 13202                  /*
13193 13203                   * The conn is closing as a result of which CONN_CLOSING
13194 13204                   * is set. Return.
13195 13205                   */
13196 13206                  mutex_exit(&connp->conn_lock);
13197 13207                  return;
13198 13208          } else if (connp->conn_idl == NULL) {
13199 13209                  /*
13200 13210                   * Assign the next drain list round robin. We dont' use
13201 13211                   * a lock, and thus it may not be strictly round robin.
13202 13212                   * Atomicity of load/stores is enough to make sure that
13203 13213                   * conn_drain_list_index is always within bounds.
13204 13214                   */
13205 13215                  index = tx_list->txl_drain_index;
13206 13216                  ASSERT(index < ipst->ips_conn_drain_list_cnt);
13207 13217                  connp->conn_idl = &tx_list->txl_drain_list[index];
13208 13218                  index++;
13209 13219                  if (index == ipst->ips_conn_drain_list_cnt)
13210 13220                          index = 0;
13211 13221                  tx_list->txl_drain_index = index;
13212 13222          } else {
13213 13223                  ASSERT(connp->conn_idl->idl_itl == tx_list);
13214 13224          }
13215 13225          mutex_exit(&connp->conn_lock);
13216 13226  
13217 13227          idl = connp->conn_idl;
13218 13228          mutex_enter(&idl->idl_lock);
13219 13229          if ((connp->conn_drain_prev != NULL) ||
13220 13230              (connp->conn_state_flags & CONN_CLOSING)) {
13221 13231                  /*
13222 13232                   * The conn is either already in the drain list or closing.
13223 13233                   * (We needed to check for CONN_CLOSING again since close can
13224 13234                   * sneak in between dropping conn_lock and acquiring idl_lock.)
13225 13235                   */
13226 13236                  mutex_exit(&idl->idl_lock);
13227 13237                  return;
13228 13238          }
13229 13239  
13230 13240          /*
13231 13241           * The conn is not in the drain list. Insert it at the
13232 13242           * tail of the drain list. The drain list is circular
13233 13243           * and doubly linked. idl_conn points to the 1st element
13234 13244           * in the list.
13235 13245           */
13236 13246          if (idl->idl_conn == NULL) {
13237 13247                  idl->idl_conn = connp;
13238 13248                  connp->conn_drain_next = connp;
13239 13249                  connp->conn_drain_prev = connp;
13240 13250          } else {
13241 13251                  conn_t *head = idl->idl_conn;
13242 13252  
13243 13253                  connp->conn_drain_next = head;
13244 13254                  connp->conn_drain_prev = head->conn_drain_prev;
13245 13255                  head->conn_drain_prev->conn_drain_next = connp;
13246 13256                  head->conn_drain_prev = connp;
13247 13257          }
13248 13258          /*
13249 13259           * For non streams based sockets assert flow control.
13250 13260           */
13251 13261          conn_setqfull(connp, NULL);
13252 13262          mutex_exit(&idl->idl_lock);
13253 13263  }
13254 13264  
13255 13265  static void
13256 13266  conn_drain_remove(conn_t *connp)
13257 13267  {
13258 13268          idl_t *idl = connp->conn_idl;
13259 13269  
13260 13270          if (idl != NULL) {
13261 13271                  /*
13262 13272                   * Remove ourself from the drain list.
13263 13273                   */
13264 13274                  if (connp->conn_drain_next == connp) {
13265 13275                          /* Singleton in the list */
13266 13276                          ASSERT(connp->conn_drain_prev == connp);
13267 13277                          idl->idl_conn = NULL;
13268 13278                  } else {
13269 13279                          connp->conn_drain_prev->conn_drain_next =
13270 13280                              connp->conn_drain_next;
13271 13281                          connp->conn_drain_next->conn_drain_prev =
13272 13282                              connp->conn_drain_prev;
13273 13283                          if (idl->idl_conn == connp)
13274 13284                                  idl->idl_conn = connp->conn_drain_next;
13275 13285                  }
13276 13286  
13277 13287                  /*
13278 13288                   * NOTE: because conn_idl is associated with a specific drain
13279 13289                   * list which in turn is tied to the index the TX ring
13280 13290                   * (txl_cookie) hashes to, and because the TX ring can change
13281 13291                   * over the lifetime of the conn_t, we must clear conn_idl so
13282 13292                   * a subsequent conn_drain_insert() will set conn_idl again
13283 13293                   * based on the latest txl_cookie.
13284 13294                   */
13285 13295                  connp->conn_idl = NULL;
13286 13296          }
13287 13297          connp->conn_drain_next = NULL;
13288 13298          connp->conn_drain_prev = NULL;
13289 13299  
13290 13300          conn_clrqfull(connp, NULL);
13291 13301          /*
13292 13302           * For streams based sockets open up flow control.
13293 13303           */
13294 13304          if (!IPCL_IS_NONSTR(connp))
13295 13305                  enableok(connp->conn_wq);
13296 13306  }
13297 13307  
13298 13308  /*
13299 13309   * This conn is closing, and we are called from ip_close. OR
13300 13310   * this conn is draining because flow-control on the ill has been relieved.
13301 13311   *
13302 13312   * We must also need to remove conn's on this idl from the list, and also
13303 13313   * inform the sockfs upcalls about the change in flow-control.
13304 13314   */
13305 13315  static void
13306 13316  conn_drain(conn_t *connp, boolean_t closing)
13307 13317  {
13308 13318          idl_t *idl;
13309 13319          conn_t *next_connp;
13310 13320  
13311 13321          /*
13312 13322           * connp->conn_idl is stable at this point, and no lock is needed
13313 13323           * to check it. If we are called from ip_close, close has already
13314 13324           * set CONN_CLOSING, thus freezing the value of conn_idl, and
13315 13325           * called us only because conn_idl is non-null. If we are called thru
13316 13326           * service, conn_idl could be null, but it cannot change because
13317 13327           * service is single-threaded per queue, and there cannot be another
13318 13328           * instance of service trying to call conn_drain_insert on this conn
13319 13329           * now.
13320 13330           */
13321 13331          ASSERT(!closing || connp == NULL || connp->conn_idl != NULL);
13322 13332  
13323 13333          /*
13324 13334           * If the conn doesn't exist or is not on a drain list, bail.
13325 13335           */
13326 13336          if (connp == NULL || connp->conn_idl == NULL ||
13327 13337              connp->conn_drain_prev == NULL) {
13328 13338                  return;
13329 13339          }
13330 13340  
13331 13341          idl = connp->conn_idl;
13332 13342          ASSERT(MUTEX_HELD(&idl->idl_lock));
13333 13343  
13334 13344          if (!closing) {
13335 13345                  next_connp = connp->conn_drain_next;
13336 13346                  while (next_connp != connp) {
13337 13347                          conn_t *delconnp = next_connp;
13338 13348  
13339 13349                          next_connp = next_connp->conn_drain_next;
13340 13350                          conn_drain_remove(delconnp);
13341 13351                  }
13342 13352                  ASSERT(connp->conn_drain_next == idl->idl_conn);
13343 13353          }
13344 13354          conn_drain_remove(connp);
13345 13355  }
13346 13356  
13347 13357  /*
13348 13358   * Write service routine. Shared perimeter entry point.
13349 13359   * The device queue's messages has fallen below the low water mark and STREAMS
13350 13360   * has backenabled the ill_wq. Send sockfs notification about flow-control on
13351 13361   * each waiting conn.
13352 13362   */
13353 13363  void
13354 13364  ip_wsrv(queue_t *q)
13355 13365  {
13356 13366          ill_t   *ill;
13357 13367  
13358 13368          ill = (ill_t *)q->q_ptr;
13359 13369          if (ill->ill_state_flags == 0) {
13360 13370                  ip_stack_t *ipst = ill->ill_ipst;
13361 13371  
13362 13372                  /*
13363 13373                   * The device flow control has opened up.
13364 13374                   * Walk through conn drain lists and qenable the
13365 13375                   * first conn in each list. This makes sense only
13366 13376                   * if the stream is fully plumbed and setup.
13367 13377                   * Hence the ill_state_flags check above.
13368 13378                   */
13369 13379                  ip1dbg(("ip_wsrv: walking\n"));
13370 13380                  conn_walk_drain(ipst, &ipst->ips_idl_tx_list[0]);
13371 13381                  enableok(ill->ill_wq);
13372 13382          }
13373 13383  }
13374 13384  
13375 13385  /*
13376 13386   * Callback to disable flow control in IP.
13377 13387   *
13378 13388   * This is a mac client callback added when the DLD_CAPAB_DIRECT capability
13379 13389   * is enabled.
13380 13390   *
13381 13391   * When MAC_TX() is not able to send any more packets, dld sets its queue
13382 13392   * to QFULL and enable the STREAMS flow control. Later, when the underlying
13383 13393   * driver is able to continue to send packets, it calls mac_tx_(ring_)update()
13384 13394   * function and wakes up corresponding mac worker threads, which in turn
13385 13395   * calls this callback function, and disables flow control.
13386 13396   */
13387 13397  void
13388 13398  ill_flow_enable(void *arg, ip_mac_tx_cookie_t cookie)
13389 13399  {
13390 13400          ill_t *ill = (ill_t *)arg;
13391 13401          ip_stack_t *ipst = ill->ill_ipst;
13392 13402          idl_tx_list_t *idl_txl;
13393 13403  
13394 13404          idl_txl = &ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)];
13395 13405          mutex_enter(&idl_txl->txl_lock);
13396 13406          /* add code to to set a flag to indicate idl_txl is enabled */
13397 13407          conn_walk_drain(ipst, idl_txl);
13398 13408          mutex_exit(&idl_txl->txl_lock);
13399 13409  }
13400 13410  
13401 13411  /*
13402 13412   * Flow control has been relieved and STREAMS has backenabled us; drain
13403 13413   * all the conn lists on `tx_list'.
13404 13414   */
13405 13415  static void
13406 13416  conn_walk_drain(ip_stack_t *ipst, idl_tx_list_t *tx_list)
13407 13417  {
13408 13418          int i;
13409 13419          idl_t *idl;
13410 13420  
13411 13421          IP_STAT(ipst, ip_conn_walk_drain);
13412 13422  
13413 13423          for (i = 0; i < ipst->ips_conn_drain_list_cnt; i++) {
13414 13424                  idl = &tx_list->txl_drain_list[i];
13415 13425                  mutex_enter(&idl->idl_lock);
13416 13426                  conn_drain(idl->idl_conn, B_FALSE);
13417 13427                  mutex_exit(&idl->idl_lock);
13418 13428          }
13419 13429  }
13420 13430  
13421 13431  /*
13422 13432   * Determine if the ill and multicast aspects of that packets
13423 13433   * "matches" the conn.
13424 13434   */
13425 13435  boolean_t
13426 13436  conn_wantpacket(conn_t *connp, ip_recv_attr_t *ira, ipha_t *ipha)
13427 13437  {
13428 13438          ill_t           *ill = ira->ira_rill;
13429 13439          zoneid_t        zoneid = ira->ira_zoneid;
13430 13440          uint_t          in_ifindex;
13431 13441          ipaddr_t        dst, src;
13432 13442  
13433 13443          dst = ipha->ipha_dst;
13434 13444          src = ipha->ipha_src;
13435 13445  
13436 13446          /*
13437 13447           * conn_incoming_ifindex is set by IP_BOUND_IF which limits
13438 13448           * unicast, broadcast and multicast reception to
13439 13449           * conn_incoming_ifindex.
13440 13450           * conn_wantpacket is called for unicast, broadcast and
13441 13451           * multicast packets.
13442 13452           */
13443 13453          in_ifindex = connp->conn_incoming_ifindex;
13444 13454  
13445 13455          /* mpathd can bind to the under IPMP interface, which we allow */
13446 13456          if (in_ifindex != 0 && in_ifindex != ill->ill_phyint->phyint_ifindex) {
13447 13457                  if (!IS_UNDER_IPMP(ill))
13448 13458                          return (B_FALSE);
13449 13459  
13450 13460                  if (in_ifindex != ipmp_ill_get_ipmp_ifindex(ill))
13451 13461                          return (B_FALSE);
13452 13462          }
13453 13463  
13454 13464          if (!IPCL_ZONE_MATCH(connp, zoneid))
13455 13465                  return (B_FALSE);
13456 13466  
13457 13467          if (!(ira->ira_flags & IRAF_MULTICAST))
13458 13468                  return (B_TRUE);
13459 13469  
13460 13470          if (connp->conn_multi_router) {
13461 13471                  /* multicast packet and multicast router socket: send up */
13462 13472                  return (B_TRUE);
13463 13473          }
13464 13474  
13465 13475          if (ipha->ipha_protocol == IPPROTO_PIM ||
13466 13476              ipha->ipha_protocol == IPPROTO_RSVP)
13467 13477                  return (B_TRUE);
13468 13478  
13469 13479          return (conn_hasmembers_ill_withsrc_v4(connp, dst, src, ira->ira_ill));
13470 13480  }
13471 13481  
13472 13482  void
13473 13483  conn_setqfull(conn_t *connp, boolean_t *flow_stopped)
13474 13484  {
13475 13485          if (IPCL_IS_NONSTR(connp)) {
13476 13486                  (*connp->conn_upcalls->su_txq_full)
13477 13487                      (connp->conn_upper_handle, B_TRUE);
13478 13488                  if (flow_stopped != NULL)
13479 13489                          *flow_stopped = B_TRUE;
13480 13490          } else {
13481 13491                  queue_t *q = connp->conn_wq;
13482 13492  
13483 13493                  ASSERT(q != NULL);
13484 13494                  if (!(q->q_flag & QFULL)) {
13485 13495                          mutex_enter(QLOCK(q));
13486 13496                          if (!(q->q_flag & QFULL)) {
13487 13497                                  /* still need to set QFULL */
13488 13498                                  q->q_flag |= QFULL;
13489 13499                                  /* set flow_stopped to true under QLOCK */
13490 13500                                  if (flow_stopped != NULL)
13491 13501                                          *flow_stopped = B_TRUE;
13492 13502                                  mutex_exit(QLOCK(q));
13493 13503                          } else {
13494 13504                                  /* flow_stopped is left unchanged */
13495 13505                                  mutex_exit(QLOCK(q));
13496 13506                          }
13497 13507                  }
13498 13508          }
13499 13509  }
13500 13510  
13501 13511  void
13502 13512  conn_clrqfull(conn_t *connp, boolean_t *flow_stopped)
13503 13513  {
13504 13514          if (IPCL_IS_NONSTR(connp)) {
13505 13515                  (*connp->conn_upcalls->su_txq_full)
13506 13516                      (connp->conn_upper_handle, B_FALSE);
13507 13517                  if (flow_stopped != NULL)
13508 13518                          *flow_stopped = B_FALSE;
13509 13519          } else {
13510 13520                  queue_t *q = connp->conn_wq;
13511 13521  
13512 13522                  ASSERT(q != NULL);
13513 13523                  if (q->q_flag & QFULL) {
13514 13524                          mutex_enter(QLOCK(q));
13515 13525                          if (q->q_flag & QFULL) {
13516 13526                                  q->q_flag &= ~QFULL;
13517 13527                                  /* set flow_stopped to false under QLOCK */
13518 13528                                  if (flow_stopped != NULL)
13519 13529                                          *flow_stopped = B_FALSE;
13520 13530                                  mutex_exit(QLOCK(q));
13521 13531                                  if (q->q_flag & QWANTW)
13522 13532                                          qbackenable(q, 0);
13523 13533                          } else {
13524 13534                                  /* flow_stopped is left unchanged */
13525 13535                                  mutex_exit(QLOCK(q));
13526 13536                          }
13527 13537                  }
13528 13538          }
13529 13539  
13530 13540          mutex_enter(&connp->conn_lock);
13531 13541          connp->conn_blocked = B_FALSE;
13532 13542          mutex_exit(&connp->conn_lock);
13533 13543  }
13534 13544  
13535 13545  /*
13536 13546   * Return the length in bytes of the IPv4 headers (base header, label, and
13537 13547   * other IP options) that will be needed based on the
13538 13548   * ip_pkt_t structure passed by the caller.
13539 13549   *
13540 13550   * The returned length does not include the length of the upper level
13541 13551   * protocol (ULP) header.
13542 13552   * The caller needs to check that the length doesn't exceed the max for IPv4.
13543 13553   */
13544 13554  int
13545 13555  ip_total_hdrs_len_v4(const ip_pkt_t *ipp)
13546 13556  {
13547 13557          int len;
13548 13558  
13549 13559          len = IP_SIMPLE_HDR_LENGTH;
13550 13560          if (ipp->ipp_fields & IPPF_LABEL_V4) {
13551 13561                  ASSERT(ipp->ipp_label_len_v4 != 0);
13552 13562                  /* We need to round up here */
13553 13563                  len += (ipp->ipp_label_len_v4 + 3) & ~3;
13554 13564          }
13555 13565  
13556 13566          if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) {
13557 13567                  ASSERT(ipp->ipp_ipv4_options_len != 0);
13558 13568                  ASSERT((ipp->ipp_ipv4_options_len & 3) == 0);
13559 13569                  len += ipp->ipp_ipv4_options_len;
13560 13570          }
13561 13571          return (len);
13562 13572  }
13563 13573  
13564 13574  /*
13565 13575   * All-purpose routine to build an IPv4 header with options based
13566 13576   * on the abstract ip_pkt_t.
13567 13577   *
13568 13578   * The caller has to set the source and destination address as well as
13569 13579   * ipha_length. The caller has to massage any source route and compensate
13570 13580   * for the ULP pseudo-header checksum due to the source route.
13571 13581   */
13572 13582  void
13573 13583  ip_build_hdrs_v4(uchar_t *buf, uint_t buf_len, const ip_pkt_t *ipp,
13574 13584      uint8_t protocol)
13575 13585  {
13576 13586          ipha_t  *ipha = (ipha_t *)buf;
13577 13587          uint8_t *cp;
13578 13588  
13579 13589          /* Initialize IPv4 header */
13580 13590          ipha->ipha_type_of_service = ipp->ipp_type_of_service;
13581 13591          ipha->ipha_length = 0;  /* Caller will set later */
13582 13592          ipha->ipha_ident = 0;
13583 13593          ipha->ipha_fragment_offset_and_flags = 0;
13584 13594          ipha->ipha_ttl = ipp->ipp_unicast_hops;
13585 13595          ipha->ipha_protocol = protocol;
13586 13596          ipha->ipha_hdr_checksum = 0;
13587 13597  
13588 13598          if ((ipp->ipp_fields & IPPF_ADDR) &&
13589 13599              IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr))
13590 13600                  ipha->ipha_src = ipp->ipp_addr_v4;
13591 13601  
13592 13602          cp = (uint8_t *)&ipha[1];
13593 13603          if (ipp->ipp_fields & IPPF_LABEL_V4) {
13594 13604                  ASSERT(ipp->ipp_label_len_v4 != 0);
13595 13605                  bcopy(ipp->ipp_label_v4, cp, ipp->ipp_label_len_v4);
13596 13606                  cp += ipp->ipp_label_len_v4;
13597 13607                  /* We need to round up here */
13598 13608                  while ((uintptr_t)cp & 0x3) {
13599 13609                          *cp++ = IPOPT_NOP;
13600 13610                  }
13601 13611          }
13602 13612  
13603 13613          if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) {
13604 13614                  ASSERT(ipp->ipp_ipv4_options_len != 0);
13605 13615                  ASSERT((ipp->ipp_ipv4_options_len & 3) == 0);
13606 13616                  bcopy(ipp->ipp_ipv4_options, cp, ipp->ipp_ipv4_options_len);
13607 13617                  cp += ipp->ipp_ipv4_options_len;
13608 13618          }
13609 13619          ipha->ipha_version_and_hdr_length =
13610 13620              (uint8_t)((IP_VERSION << 4) + buf_len / 4);
13611 13621  
13612 13622          ASSERT((int)(cp - buf) == buf_len);
13613 13623  }
13614 13624  
13615 13625  /* Allocate the private structure */
13616 13626  static int
13617 13627  ip_priv_alloc(void **bufp)
13618 13628  {
13619 13629          void    *buf;
13620 13630  
13621 13631          if ((buf = kmem_alloc(sizeof (ip_priv_t), KM_NOSLEEP)) == NULL)
13622 13632                  return (ENOMEM);
13623 13633  
13624 13634          *bufp = buf;
13625 13635          return (0);
13626 13636  }
13627 13637  
13628 13638  /* Function to delete the private structure */
13629 13639  void
13630 13640  ip_priv_free(void *buf)
13631 13641  {
13632 13642          ASSERT(buf != NULL);
13633 13643          kmem_free(buf, sizeof (ip_priv_t));
13634 13644  }
13635 13645  
13636 13646  /*
13637 13647   * The entry point for IPPF processing.
13638 13648   * If the classifier (IPGPC_CLASSIFY) is not loaded and configured, the
13639 13649   * routine just returns.
13640 13650   *
13641 13651   * When called, ip_process generates an ipp_packet_t structure
13642 13652   * which holds the state information for this packet and invokes the
13643 13653   * the classifier (via ipp_packet_process). The classification, depending on
13644 13654   * configured filters, results in a list of actions for this packet. Invoking
13645 13655   * an action may cause the packet to be dropped, in which case we return NULL.
13646 13656   * proc indicates the callout position for
13647 13657   * this packet and ill is the interface this packet arrived on or will leave
13648 13658   * on (inbound and outbound resp.).
13649 13659   *
13650 13660   * We do the processing on the rill (mapped to the upper if ipmp), but MIB
13651 13661   * on the ill corrsponding to the destination IP address.
13652 13662   */
13653 13663  mblk_t *
13654 13664  ip_process(ip_proc_t proc, mblk_t *mp, ill_t *rill, ill_t *ill)
13655 13665  {
13656 13666          ip_priv_t       *priv;
13657 13667          ipp_action_id_t aid;
13658 13668          int             rc = 0;
13659 13669          ipp_packet_t    *pp;
13660 13670  
13661 13671          /* If the classifier is not loaded, return  */
13662 13672          if ((aid = ipp_action_lookup(IPGPC_CLASSIFY)) == IPP_ACTION_INVAL) {
13663 13673                  return (mp);
13664 13674          }
13665 13675  
13666 13676          ASSERT(mp != NULL);
13667 13677  
13668 13678          /* Allocate the packet structure */
13669 13679          rc = ipp_packet_alloc(&pp, "ip", aid);
13670 13680          if (rc != 0)
13671 13681                  goto drop;
13672 13682  
13673 13683          /* Allocate the private structure */
13674 13684          rc = ip_priv_alloc((void **)&priv);
13675 13685          if (rc != 0) {
13676 13686                  ipp_packet_free(pp);
13677 13687                  goto drop;
13678 13688          }
13679 13689          priv->proc = proc;
13680 13690          priv->ill_index = ill_get_upper_ifindex(rill);
13681 13691  
13682 13692          ipp_packet_set_private(pp, priv, ip_priv_free);
13683 13693          ipp_packet_set_data(pp, mp);
13684 13694  
13685 13695          /* Invoke the classifier */
13686 13696          rc = ipp_packet_process(&pp);
13687 13697          if (pp != NULL) {
13688 13698                  mp = ipp_packet_get_data(pp);
13689 13699                  ipp_packet_free(pp);
13690 13700                  if (rc != 0)
13691 13701                          goto drop;
13692 13702                  return (mp);
13693 13703          } else {
13694 13704                  /* No mp to trace in ip_drop_input/ip_drop_output  */
13695 13705                  mp = NULL;
13696 13706          }
13697 13707  drop:
13698 13708          if (proc == IPP_LOCAL_IN || proc == IPP_FWD_IN) {
13699 13709                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
13700 13710                  ip_drop_input("ip_process", mp, ill);
13701 13711          } else {
13702 13712                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
13703 13713                  ip_drop_output("ip_process", mp, ill);
13704 13714          }
13705 13715          freemsg(mp);
13706 13716          return (NULL);
13707 13717  }
13708 13718  
13709 13719  /*
13710 13720   * Propagate a multicast group membership operation (add/drop) on
13711 13721   * all the interfaces crossed by the related multirt routes.
13712 13722   * The call is considered successful if the operation succeeds
13713 13723   * on at least one interface.
13714 13724   *
13715 13725   * This assumes that a set of IRE_HOST/RTF_MULTIRT has been created for the
13716 13726   * multicast addresses with the ire argument being the first one.
13717 13727   * We walk the bucket to find all the of those.
13718 13728   *
13719 13729   * Common to IPv4 and IPv6.
13720 13730   */
13721 13731  static int
13722 13732  ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t,
13723 13733      const in6_addr_t *, ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *),
13724 13734      ire_t *ire, conn_t *connp, boolean_t checkonly, const in6_addr_t *v6group,
13725 13735      mcast_record_t fmode, const in6_addr_t *v6src)
13726 13736  {
13727 13737          ire_t           *ire_gw;
13728 13738          irb_t           *irb;
13729 13739          int             ifindex;
13730 13740          int             error = 0;
13731 13741          int             result;
13732 13742          ip_stack_t      *ipst = ire->ire_ipst;
13733 13743          ipaddr_t        group;
13734 13744          boolean_t       isv6;
13735 13745          int             match_flags;
13736 13746  
13737 13747          if (IN6_IS_ADDR_V4MAPPED(v6group)) {
13738 13748                  IN6_V4MAPPED_TO_IPADDR(v6group, group);
13739 13749                  isv6 = B_FALSE;
13740 13750          } else {
13741 13751                  isv6 = B_TRUE;
13742 13752          }
13743 13753  
13744 13754          irb = ire->ire_bucket;
13745 13755          ASSERT(irb != NULL);
13746 13756  
13747 13757          result = 0;
13748 13758          irb_refhold(irb);
13749 13759          for (; ire != NULL; ire = ire->ire_next) {
13750 13760                  if ((ire->ire_flags & RTF_MULTIRT) == 0)
13751 13761                          continue;
13752 13762  
13753 13763                  /* We handle -ifp routes by matching on the ill if set */
13754 13764                  match_flags = MATCH_IRE_TYPE;
13755 13765                  if (ire->ire_ill != NULL)
13756 13766                          match_flags |= MATCH_IRE_ILL;
13757 13767  
13758 13768                  if (isv6) {
13759 13769                          if (!IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, v6group))
13760 13770                                  continue;
13761 13771  
13762 13772                          ire_gw = ire_ftable_lookup_v6(&ire->ire_gateway_addr_v6,
13763 13773                              0, 0, IRE_INTERFACE, ire->ire_ill, ALL_ZONES, NULL,
13764 13774                              match_flags, 0, ipst, NULL);
13765 13775                  } else {
13766 13776                          if (ire->ire_addr != group)
13767 13777                                  continue;
13768 13778  
13769 13779                          ire_gw = ire_ftable_lookup_v4(ire->ire_gateway_addr,
13770 13780                              0, 0, IRE_INTERFACE, ire->ire_ill, ALL_ZONES, NULL,
13771 13781                              match_flags, 0, ipst, NULL);
13772 13782                  }
13773 13783                  /* No interface route exists for the gateway; skip this ire. */
13774 13784                  if (ire_gw == NULL)
13775 13785                          continue;
13776 13786                  if (ire_gw->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
13777 13787                          ire_refrele(ire_gw);
13778 13788                          continue;
13779 13789                  }
13780 13790                  ASSERT(ire_gw->ire_ill != NULL);        /* IRE_INTERFACE */
13781 13791                  ifindex = ire_gw->ire_ill->ill_phyint->phyint_ifindex;
13782 13792  
13783 13793                  /*
13784 13794                   * The operation is considered a success if
13785 13795                   * it succeeds at least once on any one interface.
13786 13796                   */
13787 13797                  error = fn(connp, checkonly, v6group, INADDR_ANY, ifindex,
13788 13798                      fmode, v6src);
13789 13799                  if (error == 0)
13790 13800                          result = CGTP_MCAST_SUCCESS;
13791 13801  
13792 13802                  ire_refrele(ire_gw);
13793 13803          }
13794 13804          irb_refrele(irb);
13795 13805          /*
13796 13806           * Consider the call as successful if we succeeded on at least
13797 13807           * one interface. Otherwise, return the last encountered error.
13798 13808           */
13799 13809          return (result == CGTP_MCAST_SUCCESS ? 0 : error);
13800 13810  }
13801 13811  
13802 13812  /*
13803 13813   * Return the expected CGTP hooks version number.
13804 13814   */
13805 13815  int
13806 13816  ip_cgtp_filter_supported(void)
13807 13817  {
13808 13818          return (ip_cgtp_filter_rev);
13809 13819  }
13810 13820  
13811 13821  /*
13812 13822   * CGTP hooks can be registered by invoking this function.
13813 13823   * Checks that the version number matches.
13814 13824   */
13815 13825  int
13816 13826  ip_cgtp_filter_register(netstackid_t stackid, cgtp_filter_ops_t *ops)
13817 13827  {
13818 13828          netstack_t *ns;
13819 13829          ip_stack_t *ipst;
13820 13830  
13821 13831          if (ops->cfo_filter_rev != CGTP_FILTER_REV)
13822 13832                  return (ENOTSUP);
13823 13833  
13824 13834          ns = netstack_find_by_stackid(stackid);
13825 13835          if (ns == NULL)
13826 13836                  return (EINVAL);
13827 13837          ipst = ns->netstack_ip;
13828 13838          ASSERT(ipst != NULL);
13829 13839  
13830 13840          if (ipst->ips_ip_cgtp_filter_ops != NULL) {
13831 13841                  netstack_rele(ns);
13832 13842                  return (EALREADY);
13833 13843          }
13834 13844  
13835 13845          ipst->ips_ip_cgtp_filter_ops = ops;
13836 13846  
13837 13847          ill_set_inputfn_all(ipst);
13838 13848  
13839 13849          netstack_rele(ns);
13840 13850          return (0);
13841 13851  }
13842 13852  
13843 13853  /*
13844 13854   * CGTP hooks can be unregistered by invoking this function.
13845 13855   * Returns ENXIO if there was no registration.
13846 13856   * Returns EBUSY if the ndd variable has not been turned off.
13847 13857   */
13848 13858  int
13849 13859  ip_cgtp_filter_unregister(netstackid_t stackid)
13850 13860  {
13851 13861          netstack_t *ns;
13852 13862          ip_stack_t *ipst;
13853 13863  
13854 13864          ns = netstack_find_by_stackid(stackid);
13855 13865          if (ns == NULL)
13856 13866                  return (EINVAL);
13857 13867          ipst = ns->netstack_ip;
13858 13868          ASSERT(ipst != NULL);
13859 13869  
13860 13870          if (ipst->ips_ip_cgtp_filter) {
13861 13871                  netstack_rele(ns);
13862 13872                  return (EBUSY);
13863 13873          }
13864 13874  
13865 13875          if (ipst->ips_ip_cgtp_filter_ops == NULL) {
13866 13876                  netstack_rele(ns);
13867 13877                  return (ENXIO);
13868 13878          }
13869 13879          ipst->ips_ip_cgtp_filter_ops = NULL;
13870 13880  
13871 13881          ill_set_inputfn_all(ipst);
13872 13882  
13873 13883          netstack_rele(ns);
13874 13884          return (0);
13875 13885  }
13876 13886  
13877 13887  /*
13878 13888   * Check whether there is a CGTP filter registration.
13879 13889   * Returns non-zero if there is a registration, otherwise returns zero.
13880 13890   * Note: returns zero if bad stackid.
13881 13891   */
13882 13892  int
13883 13893  ip_cgtp_filter_is_registered(netstackid_t stackid)
13884 13894  {
13885 13895          netstack_t *ns;
13886 13896          ip_stack_t *ipst;
13887 13897          int ret;
13888 13898  
13889 13899          ns = netstack_find_by_stackid(stackid);
13890 13900          if (ns == NULL)
13891 13901                  return (0);
13892 13902          ipst = ns->netstack_ip;
13893 13903          ASSERT(ipst != NULL);
13894 13904  
13895 13905          if (ipst->ips_ip_cgtp_filter_ops != NULL)
13896 13906                  ret = 1;
13897 13907          else
13898 13908                  ret = 0;
13899 13909  
13900 13910          netstack_rele(ns);
13901 13911          return (ret);
13902 13912  }
13903 13913  
13904 13914  static int
13905 13915  ip_squeue_switch(int val)
13906 13916  {
13907 13917          int rval;
13908 13918  
13909 13919          switch (val) {
13910 13920          case IP_SQUEUE_ENTER_NODRAIN:
13911 13921                  rval = SQ_NODRAIN;
13912 13922                  break;
13913 13923          case IP_SQUEUE_ENTER:
13914 13924                  rval = SQ_PROCESS;
13915 13925                  break;
13916 13926          case IP_SQUEUE_FILL:
13917 13927          default:
13918 13928                  rval = SQ_FILL;
13919 13929                  break;
13920 13930          }
13921 13931          return (rval);
13922 13932  }
13923 13933  
13924 13934  static void *
13925 13935  ip_kstat2_init(netstackid_t stackid, ip_stat_t *ip_statisticsp)
13926 13936  {
13927 13937          kstat_t *ksp;
13928 13938  
13929 13939          ip_stat_t template = {
13930 13940                  { "ip_udp_fannorm",             KSTAT_DATA_UINT64 },
13931 13941                  { "ip_udp_fanmb",               KSTAT_DATA_UINT64 },
13932 13942                  { "ip_recv_pullup",             KSTAT_DATA_UINT64 },
13933 13943                  { "ip_db_ref",                  KSTAT_DATA_UINT64 },
13934 13944                  { "ip_notaligned",              KSTAT_DATA_UINT64 },
13935 13945                  { "ip_multimblk",               KSTAT_DATA_UINT64 },
13936 13946                  { "ip_opt",                     KSTAT_DATA_UINT64 },
13937 13947                  { "ipsec_proto_ahesp",          KSTAT_DATA_UINT64 },
13938 13948                  { "ip_conn_flputbq",            KSTAT_DATA_UINT64 },
13939 13949                  { "ip_conn_walk_drain",         KSTAT_DATA_UINT64 },
13940 13950                  { "ip_out_sw_cksum",            KSTAT_DATA_UINT64 },
13941 13951                  { "ip_out_sw_cksum_bytes",      KSTAT_DATA_UINT64 },
13942 13952                  { "ip_in_sw_cksum",             KSTAT_DATA_UINT64 },
13943 13953                  { "ip_ire_reclaim_calls",       KSTAT_DATA_UINT64 },
13944 13954                  { "ip_ire_reclaim_deleted",     KSTAT_DATA_UINT64 },
13945 13955                  { "ip_nce_reclaim_calls",       KSTAT_DATA_UINT64 },
13946 13956                  { "ip_nce_reclaim_deleted",     KSTAT_DATA_UINT64 },
13947 13957                  { "ip_dce_reclaim_calls",       KSTAT_DATA_UINT64 },
13948 13958                  { "ip_dce_reclaim_deleted",     KSTAT_DATA_UINT64 },
13949 13959                  { "ip_tcp_in_full_hw_cksum_err",        KSTAT_DATA_UINT64 },
13950 13960                  { "ip_tcp_in_part_hw_cksum_err",        KSTAT_DATA_UINT64 },
13951 13961                  { "ip_tcp_in_sw_cksum_err",             KSTAT_DATA_UINT64 },
13952 13962                  { "ip_udp_in_full_hw_cksum_err",        KSTAT_DATA_UINT64 },
13953 13963                  { "ip_udp_in_part_hw_cksum_err",        KSTAT_DATA_UINT64 },
13954 13964                  { "ip_udp_in_sw_cksum_err",     KSTAT_DATA_UINT64 },
13955 13965                  { "conn_in_recvdstaddr",        KSTAT_DATA_UINT64 },
13956 13966                  { "conn_in_recvopts",           KSTAT_DATA_UINT64 },
13957 13967                  { "conn_in_recvif",             KSTAT_DATA_UINT64 },
13958 13968                  { "conn_in_recvslla",           KSTAT_DATA_UINT64 },
13959 13969                  { "conn_in_recvucred",          KSTAT_DATA_UINT64 },
13960 13970                  { "conn_in_recvttl",            KSTAT_DATA_UINT64 },
13961 13971                  { "conn_in_recvhopopts",        KSTAT_DATA_UINT64 },
13962 13972                  { "conn_in_recvhoplimit",       KSTAT_DATA_UINT64 },
13963 13973                  { "conn_in_recvdstopts",        KSTAT_DATA_UINT64 },
13964 13974                  { "conn_in_recvrthdrdstopts",   KSTAT_DATA_UINT64 },
13965 13975                  { "conn_in_recvrthdr",          KSTAT_DATA_UINT64 },
13966 13976                  { "conn_in_recvpktinfo",        KSTAT_DATA_UINT64 },
13967 13977                  { "conn_in_recvtclass",         KSTAT_DATA_UINT64 },
13968 13978                  { "conn_in_timestamp",          KSTAT_DATA_UINT64 },
13969 13979          };
13970 13980  
13971 13981          ksp = kstat_create_netstack("ip", 0, "ipstat", "net",
13972 13982              KSTAT_TYPE_NAMED, sizeof (template) / sizeof (kstat_named_t),
13973 13983              KSTAT_FLAG_VIRTUAL, stackid);
13974 13984  
13975 13985          if (ksp == NULL)
13976 13986                  return (NULL);
13977 13987  
13978 13988          bcopy(&template, ip_statisticsp, sizeof (template));
13979 13989          ksp->ks_data = (void *)ip_statisticsp;
13980 13990          ksp->ks_private = (void *)(uintptr_t)stackid;
13981 13991  
13982 13992          kstat_install(ksp);
13983 13993          return (ksp);
13984 13994  }
13985 13995  
13986 13996  static void
13987 13997  ip_kstat2_fini(netstackid_t stackid, kstat_t *ksp)
13988 13998  {
13989 13999          if (ksp != NULL) {
13990 14000                  ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
13991 14001                  kstat_delete_netstack(ksp, stackid);
13992 14002          }
13993 14003  }
13994 14004  
13995 14005  static void *
13996 14006  ip_kstat_init(netstackid_t stackid, ip_stack_t *ipst)
13997 14007  {
13998 14008          kstat_t *ksp;
13999 14009  
14000 14010          ip_named_kstat_t template = {
14001 14011                  { "forwarding",         KSTAT_DATA_UINT32, 0 },
14002 14012                  { "defaultTTL",         KSTAT_DATA_UINT32, 0 },
14003 14013                  { "inReceives",         KSTAT_DATA_UINT64, 0 },
14004 14014                  { "inHdrErrors",        KSTAT_DATA_UINT32, 0 },
14005 14015                  { "inAddrErrors",       KSTAT_DATA_UINT32, 0 },
14006 14016                  { "forwDatagrams",      KSTAT_DATA_UINT64, 0 },
14007 14017                  { "inUnknownProtos",    KSTAT_DATA_UINT32, 0 },
14008 14018                  { "inDiscards",         KSTAT_DATA_UINT32, 0 },
14009 14019                  { "inDelivers",         KSTAT_DATA_UINT64, 0 },
14010 14020                  { "outRequests",        KSTAT_DATA_UINT64, 0 },
14011 14021                  { "outDiscards",        KSTAT_DATA_UINT32, 0 },
14012 14022                  { "outNoRoutes",        KSTAT_DATA_UINT32, 0 },
14013 14023                  { "reasmTimeout",       KSTAT_DATA_UINT32, 0 },
14014 14024                  { "reasmReqds",         KSTAT_DATA_UINT32, 0 },
14015 14025                  { "reasmOKs",           KSTAT_DATA_UINT32, 0 },
14016 14026                  { "reasmFails",         KSTAT_DATA_UINT32, 0 },
14017 14027                  { "fragOKs",            KSTAT_DATA_UINT32, 0 },
14018 14028                  { "fragFails",          KSTAT_DATA_UINT32, 0 },
14019 14029                  { "fragCreates",        KSTAT_DATA_UINT32, 0 },
14020 14030                  { "addrEntrySize",      KSTAT_DATA_INT32, 0 },
14021 14031                  { "routeEntrySize",     KSTAT_DATA_INT32, 0 },
14022 14032                  { "netToMediaEntrySize",        KSTAT_DATA_INT32, 0 },
14023 14033                  { "routingDiscards",    KSTAT_DATA_UINT32, 0 },
14024 14034                  { "inErrs",             KSTAT_DATA_UINT32, 0 },
14025 14035                  { "noPorts",            KSTAT_DATA_UINT32, 0 },
14026 14036                  { "inCksumErrs",        KSTAT_DATA_UINT32, 0 },
14027 14037                  { "reasmDuplicates",    KSTAT_DATA_UINT32, 0 },
14028 14038                  { "reasmPartDups",      KSTAT_DATA_UINT32, 0 },
14029 14039                  { "forwProhibits",      KSTAT_DATA_UINT32, 0 },
14030 14040                  { "udpInCksumErrs",     KSTAT_DATA_UINT32, 0 },
14031 14041                  { "udpInOverflows",     KSTAT_DATA_UINT32, 0 },
14032 14042                  { "rawipInOverflows",   KSTAT_DATA_UINT32, 0 },
14033 14043                  { "ipsecInSucceeded",   KSTAT_DATA_UINT32, 0 },
14034 14044                  { "ipsecInFailed",      KSTAT_DATA_INT32, 0 },
14035 14045                  { "memberEntrySize",    KSTAT_DATA_INT32, 0 },
14036 14046                  { "inIPv6",             KSTAT_DATA_UINT32, 0 },
14037 14047                  { "outIPv6",            KSTAT_DATA_UINT32, 0 },
14038 14048                  { "outSwitchIPv6",      KSTAT_DATA_UINT32, 0 },
14039 14049          };
14040 14050  
14041 14051          ksp = kstat_create_netstack("ip", 0, "ip", "mib2", KSTAT_TYPE_NAMED,
14042 14052              NUM_OF_FIELDS(ip_named_kstat_t), 0, stackid);
14043 14053          if (ksp == NULL || ksp->ks_data == NULL)
14044 14054                  return (NULL);
14045 14055  
14046 14056          template.forwarding.value.ui32 = WE_ARE_FORWARDING(ipst) ? 1:2;
14047 14057          template.defaultTTL.value.ui32 = (uint32_t)ipst->ips_ip_def_ttl;
14048 14058          template.reasmTimeout.value.ui32 = ipst->ips_ip_reassembly_timeout;
14049 14059          template.addrEntrySize.value.i32 = sizeof (mib2_ipAddrEntry_t);
14050 14060          template.routeEntrySize.value.i32 = sizeof (mib2_ipRouteEntry_t);
14051 14061  
14052 14062          template.netToMediaEntrySize.value.i32 =
14053 14063              sizeof (mib2_ipNetToMediaEntry_t);
14054 14064  
14055 14065          template.memberEntrySize.value.i32 = sizeof (ipv6_member_t);
14056 14066  
14057 14067          bcopy(&template, ksp->ks_data, sizeof (template));
14058 14068          ksp->ks_update = ip_kstat_update;
14059 14069          ksp->ks_private = (void *)(uintptr_t)stackid;
14060 14070  
14061 14071          kstat_install(ksp);
14062 14072          return (ksp);
14063 14073  }
14064 14074  
14065 14075  static void
14066 14076  ip_kstat_fini(netstackid_t stackid, kstat_t *ksp)
14067 14077  {
14068 14078          if (ksp != NULL) {
14069 14079                  ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
14070 14080                  kstat_delete_netstack(ksp, stackid);
14071 14081          }
14072 14082  }
14073 14083  
14074 14084  static int
14075 14085  ip_kstat_update(kstat_t *kp, int rw)
14076 14086  {
14077 14087          ip_named_kstat_t *ipkp;
14078 14088          mib2_ipIfStatsEntry_t ipmib;
14079 14089          ill_walk_context_t ctx;
14080 14090          ill_t *ill;
14081 14091          netstackid_t    stackid = (zoneid_t)(uintptr_t)kp->ks_private;
14082 14092          netstack_t      *ns;
14083 14093          ip_stack_t      *ipst;
14084 14094  
14085 14095          if (kp == NULL || kp->ks_data == NULL)
14086 14096                  return (EIO);
14087 14097  
14088 14098          if (rw == KSTAT_WRITE)
14089 14099                  return (EACCES);
14090 14100  
14091 14101          ns = netstack_find_by_stackid(stackid);
14092 14102          if (ns == NULL)
14093 14103                  return (-1);
14094 14104          ipst = ns->netstack_ip;
14095 14105          if (ipst == NULL) {
14096 14106                  netstack_rele(ns);
14097 14107                  return (-1);
14098 14108          }
14099 14109          ipkp = (ip_named_kstat_t *)kp->ks_data;
14100 14110  
14101 14111          bcopy(&ipst->ips_ip_mib, &ipmib, sizeof (ipmib));
14102 14112          rw_enter(&ipst->ips_ill_g_lock, RW_READER);
14103 14113          ill = ILL_START_WALK_V4(&ctx, ipst);
14104 14114          for (; ill != NULL; ill = ill_next(&ctx, ill))
14105 14115                  ip_mib2_add_ip_stats(&ipmib, ill->ill_ip_mib);
14106 14116          rw_exit(&ipst->ips_ill_g_lock);
14107 14117  
14108 14118          ipkp->forwarding.value.ui32 =           ipmib.ipIfStatsForwarding;
14109 14119          ipkp->defaultTTL.value.ui32 =           ipmib.ipIfStatsDefaultTTL;
14110 14120          ipkp->inReceives.value.ui64 =           ipmib.ipIfStatsHCInReceives;
14111 14121          ipkp->inHdrErrors.value.ui32 =          ipmib.ipIfStatsInHdrErrors;
14112 14122          ipkp->inAddrErrors.value.ui32 =         ipmib.ipIfStatsInAddrErrors;
14113 14123          ipkp->forwDatagrams.value.ui64 = ipmib.ipIfStatsHCOutForwDatagrams;
14114 14124          ipkp->inUnknownProtos.value.ui32 =      ipmib.ipIfStatsInUnknownProtos;
14115 14125          ipkp->inDiscards.value.ui32 =           ipmib.ipIfStatsInDiscards;
14116 14126          ipkp->inDelivers.value.ui64 =           ipmib.ipIfStatsHCInDelivers;
14117 14127          ipkp->outRequests.value.ui64 =          ipmib.ipIfStatsHCOutRequests;
14118 14128          ipkp->outDiscards.value.ui32 =          ipmib.ipIfStatsOutDiscards;
14119 14129          ipkp->outNoRoutes.value.ui32 =          ipmib.ipIfStatsOutNoRoutes;
14120 14130          ipkp->reasmTimeout.value.ui32 =         ipst->ips_ip_reassembly_timeout;
14121 14131          ipkp->reasmReqds.value.ui32 =           ipmib.ipIfStatsReasmReqds;
14122 14132          ipkp->reasmOKs.value.ui32 =             ipmib.ipIfStatsReasmOKs;
14123 14133          ipkp->reasmFails.value.ui32 =           ipmib.ipIfStatsReasmFails;
14124 14134          ipkp->fragOKs.value.ui32 =              ipmib.ipIfStatsOutFragOKs;
14125 14135          ipkp->fragFails.value.ui32 =            ipmib.ipIfStatsOutFragFails;
14126 14136          ipkp->fragCreates.value.ui32 =          ipmib.ipIfStatsOutFragCreates;
14127 14137  
14128 14138          ipkp->routingDiscards.value.ui32 =      0;
14129 14139          ipkp->inErrs.value.ui32 =               ipmib.tcpIfStatsInErrs;
14130 14140          ipkp->noPorts.value.ui32 =              ipmib.udpIfStatsNoPorts;
14131 14141          ipkp->inCksumErrs.value.ui32 =          ipmib.ipIfStatsInCksumErrs;
14132 14142          ipkp->reasmDuplicates.value.ui32 =      ipmib.ipIfStatsReasmDuplicates;
14133 14143          ipkp->reasmPartDups.value.ui32 =        ipmib.ipIfStatsReasmPartDups;
14134 14144          ipkp->forwProhibits.value.ui32 =        ipmib.ipIfStatsForwProhibits;
14135 14145          ipkp->udpInCksumErrs.value.ui32 =       ipmib.udpIfStatsInCksumErrs;
14136 14146          ipkp->udpInOverflows.value.ui32 =       ipmib.udpIfStatsInOverflows;
14137 14147          ipkp->rawipInOverflows.value.ui32 =     ipmib.rawipIfStatsInOverflows;
14138 14148          ipkp->ipsecInSucceeded.value.ui32 =     ipmib.ipsecIfStatsInSucceeded;
14139 14149          ipkp->ipsecInFailed.value.i32 =         ipmib.ipsecIfStatsInFailed;
14140 14150  
14141 14151          ipkp->inIPv6.value.ui32 =       ipmib.ipIfStatsInWrongIPVersion;
14142 14152          ipkp->outIPv6.value.ui32 =      ipmib.ipIfStatsOutWrongIPVersion;
14143 14153          ipkp->outSwitchIPv6.value.ui32 = ipmib.ipIfStatsOutSwitchIPVersion;
14144 14154  
14145 14155          netstack_rele(ns);
14146 14156  
14147 14157          return (0);
14148 14158  }
14149 14159  
14150 14160  static void *
14151 14161  icmp_kstat_init(netstackid_t stackid)
14152 14162  {
14153 14163          kstat_t *ksp;
14154 14164  
14155 14165          icmp_named_kstat_t template = {
14156 14166                  { "inMsgs",             KSTAT_DATA_UINT32 },
14157 14167                  { "inErrors",           KSTAT_DATA_UINT32 },
14158 14168                  { "inDestUnreachs",     KSTAT_DATA_UINT32 },
14159 14169                  { "inTimeExcds",        KSTAT_DATA_UINT32 },
14160 14170                  { "inParmProbs",        KSTAT_DATA_UINT32 },
14161 14171                  { "inSrcQuenchs",       KSTAT_DATA_UINT32 },
14162 14172                  { "inRedirects",        KSTAT_DATA_UINT32 },
14163 14173                  { "inEchos",            KSTAT_DATA_UINT32 },
14164 14174                  { "inEchoReps",         KSTAT_DATA_UINT32 },
14165 14175                  { "inTimestamps",       KSTAT_DATA_UINT32 },
14166 14176                  { "inTimestampReps",    KSTAT_DATA_UINT32 },
14167 14177                  { "inAddrMasks",        KSTAT_DATA_UINT32 },
14168 14178                  { "inAddrMaskReps",     KSTAT_DATA_UINT32 },
14169 14179                  { "outMsgs",            KSTAT_DATA_UINT32 },
14170 14180                  { "outErrors",          KSTAT_DATA_UINT32 },
14171 14181                  { "outDestUnreachs",    KSTAT_DATA_UINT32 },
14172 14182                  { "outTimeExcds",       KSTAT_DATA_UINT32 },
14173 14183                  { "outParmProbs",       KSTAT_DATA_UINT32 },
14174 14184                  { "outSrcQuenchs",      KSTAT_DATA_UINT32 },
14175 14185                  { "outRedirects",       KSTAT_DATA_UINT32 },
14176 14186                  { "outEchos",           KSTAT_DATA_UINT32 },
14177 14187                  { "outEchoReps",        KSTAT_DATA_UINT32 },
14178 14188                  { "outTimestamps",      KSTAT_DATA_UINT32 },
14179 14189                  { "outTimestampReps",   KSTAT_DATA_UINT32 },
14180 14190                  { "outAddrMasks",       KSTAT_DATA_UINT32 },
14181 14191                  { "outAddrMaskReps",    KSTAT_DATA_UINT32 },
14182 14192                  { "inChksumErrs",       KSTAT_DATA_UINT32 },
14183 14193                  { "inUnknowns",         KSTAT_DATA_UINT32 },
14184 14194                  { "inFragNeeded",       KSTAT_DATA_UINT32 },
14185 14195                  { "outFragNeeded",      KSTAT_DATA_UINT32 },
14186 14196                  { "outDrops",           KSTAT_DATA_UINT32 },
14187 14197                  { "inOverFlows",        KSTAT_DATA_UINT32 },
14188 14198                  { "inBadRedirects",     KSTAT_DATA_UINT32 },
14189 14199          };
14190 14200  
14191 14201          ksp = kstat_create_netstack("ip", 0, "icmp", "mib2", KSTAT_TYPE_NAMED,
14192 14202              NUM_OF_FIELDS(icmp_named_kstat_t), 0, stackid);
14193 14203          if (ksp == NULL || ksp->ks_data == NULL)
14194 14204                  return (NULL);
14195 14205  
14196 14206          bcopy(&template, ksp->ks_data, sizeof (template));
14197 14207  
14198 14208          ksp->ks_update = icmp_kstat_update;
14199 14209          ksp->ks_private = (void *)(uintptr_t)stackid;
14200 14210  
14201 14211          kstat_install(ksp);
14202 14212          return (ksp);
14203 14213  }
14204 14214  
14205 14215  static void
14206 14216  icmp_kstat_fini(netstackid_t stackid, kstat_t *ksp)
14207 14217  {
14208 14218          if (ksp != NULL) {
14209 14219                  ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private);
14210 14220                  kstat_delete_netstack(ksp, stackid);
14211 14221          }
14212 14222  }
14213 14223  
14214 14224  static int
14215 14225  icmp_kstat_update(kstat_t *kp, int rw)
14216 14226  {
14217 14227          icmp_named_kstat_t *icmpkp;
14218 14228          netstackid_t    stackid = (zoneid_t)(uintptr_t)kp->ks_private;
14219 14229          netstack_t      *ns;
14220 14230          ip_stack_t      *ipst;
14221 14231  
14222 14232          if ((kp == NULL) || (kp->ks_data == NULL))
14223 14233                  return (EIO);
14224 14234  
14225 14235          if (rw == KSTAT_WRITE)
14226 14236                  return (EACCES);
14227 14237  
14228 14238          ns = netstack_find_by_stackid(stackid);
14229 14239          if (ns == NULL)
14230 14240                  return (-1);
14231 14241          ipst = ns->netstack_ip;
14232 14242          if (ipst == NULL) {
14233 14243                  netstack_rele(ns);
14234 14244                  return (-1);
14235 14245          }
14236 14246          icmpkp = (icmp_named_kstat_t *)kp->ks_data;
14237 14247  
14238 14248          icmpkp->inMsgs.value.ui32 =         ipst->ips_icmp_mib.icmpInMsgs;
14239 14249          icmpkp->inErrors.value.ui32 =       ipst->ips_icmp_mib.icmpInErrors;
14240 14250          icmpkp->inDestUnreachs.value.ui32 =
14241 14251              ipst->ips_icmp_mib.icmpInDestUnreachs;
14242 14252          icmpkp->inTimeExcds.value.ui32 =    ipst->ips_icmp_mib.icmpInTimeExcds;
14243 14253          icmpkp->inParmProbs.value.ui32 =    ipst->ips_icmp_mib.icmpInParmProbs;
14244 14254          icmpkp->inSrcQuenchs.value.ui32 =   ipst->ips_icmp_mib.icmpInSrcQuenchs;
14245 14255          icmpkp->inRedirects.value.ui32 =    ipst->ips_icmp_mib.icmpInRedirects;
14246 14256          icmpkp->inEchos.value.ui32 =        ipst->ips_icmp_mib.icmpInEchos;
14247 14257          icmpkp->inEchoReps.value.ui32 =     ipst->ips_icmp_mib.icmpInEchoReps;
14248 14258          icmpkp->inTimestamps.value.ui32 =   ipst->ips_icmp_mib.icmpInTimestamps;
14249 14259          icmpkp->inTimestampReps.value.ui32 =
14250 14260              ipst->ips_icmp_mib.icmpInTimestampReps;
14251 14261          icmpkp->inAddrMasks.value.ui32 =    ipst->ips_icmp_mib.icmpInAddrMasks;
14252 14262          icmpkp->inAddrMaskReps.value.ui32 =
14253 14263              ipst->ips_icmp_mib.icmpInAddrMaskReps;
14254 14264          icmpkp->outMsgs.value.ui32 =        ipst->ips_icmp_mib.icmpOutMsgs;
14255 14265          icmpkp->outErrors.value.ui32 =      ipst->ips_icmp_mib.icmpOutErrors;
14256 14266          icmpkp->outDestUnreachs.value.ui32 =
14257 14267              ipst->ips_icmp_mib.icmpOutDestUnreachs;
14258 14268          icmpkp->outTimeExcds.value.ui32 =   ipst->ips_icmp_mib.icmpOutTimeExcds;
14259 14269          icmpkp->outParmProbs.value.ui32 =   ipst->ips_icmp_mib.icmpOutParmProbs;
14260 14270          icmpkp->outSrcQuenchs.value.ui32 =
14261 14271              ipst->ips_icmp_mib.icmpOutSrcQuenchs;
14262 14272          icmpkp->outRedirects.value.ui32 =   ipst->ips_icmp_mib.icmpOutRedirects;
14263 14273          icmpkp->outEchos.value.ui32 =       ipst->ips_icmp_mib.icmpOutEchos;
14264 14274          icmpkp->outEchoReps.value.ui32 =    ipst->ips_icmp_mib.icmpOutEchoReps;
14265 14275          icmpkp->outTimestamps.value.ui32 =
14266 14276              ipst->ips_icmp_mib.icmpOutTimestamps;
14267 14277          icmpkp->outTimestampReps.value.ui32 =
14268 14278              ipst->ips_icmp_mib.icmpOutTimestampReps;
14269 14279          icmpkp->outAddrMasks.value.ui32 =
14270 14280              ipst->ips_icmp_mib.icmpOutAddrMasks;
14271 14281          icmpkp->outAddrMaskReps.value.ui32 =
14272 14282              ipst->ips_icmp_mib.icmpOutAddrMaskReps;
14273 14283          icmpkp->inCksumErrs.value.ui32 =    ipst->ips_icmp_mib.icmpInCksumErrs;
14274 14284          icmpkp->inUnknowns.value.ui32 =     ipst->ips_icmp_mib.icmpInUnknowns;
14275 14285          icmpkp->inFragNeeded.value.ui32 =   ipst->ips_icmp_mib.icmpInFragNeeded;
14276 14286          icmpkp->outFragNeeded.value.ui32 =
14277 14287              ipst->ips_icmp_mib.icmpOutFragNeeded;
14278 14288          icmpkp->outDrops.value.ui32 =       ipst->ips_icmp_mib.icmpOutDrops;
14279 14289          icmpkp->inOverflows.value.ui32 =    ipst->ips_icmp_mib.icmpInOverflows;
14280 14290          icmpkp->inBadRedirects.value.ui32 =
14281 14291              ipst->ips_icmp_mib.icmpInBadRedirects;
14282 14292  
14283 14293          netstack_rele(ns);
14284 14294          return (0);
14285 14295  }
14286 14296  
14287 14297  /*
14288 14298   * This is the fanout function for raw socket opened for SCTP.  Note
14289 14299   * that it is called after SCTP checks that there is no socket which
14290 14300   * wants a packet.  Then before SCTP handles this out of the blue packet,
14291 14301   * this function is called to see if there is any raw socket for SCTP.
14292 14302   * If there is and it is bound to the correct address, the packet will
14293 14303   * be sent to that socket.  Note that only one raw socket can be bound to
14294 14304   * a port.  This is assured in ipcl_sctp_hash_insert();
14295 14305   */
14296 14306  void
14297 14307  ip_fanout_sctp_raw(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, uint32_t ports,
14298 14308      ip_recv_attr_t *ira)
14299 14309  {
14300 14310          conn_t          *connp;
14301 14311          queue_t         *rq;
14302 14312          boolean_t       secure;
14303 14313          ill_t           *ill = ira->ira_ill;
14304 14314          ip_stack_t      *ipst = ill->ill_ipst;
14305 14315          ipsec_stack_t   *ipss = ipst->ips_netstack->netstack_ipsec;
14306 14316          sctp_stack_t    *sctps = ipst->ips_netstack->netstack_sctp;
14307 14317          iaflags_t       iraflags = ira->ira_flags;
14308 14318          ill_t           *rill = ira->ira_rill;
14309 14319  
14310 14320          secure = iraflags & IRAF_IPSEC_SECURE;
14311 14321  
14312 14322          connp = ipcl_classify_raw(mp, IPPROTO_SCTP, ports, ipha, ip6h,
14313 14323              ira, ipst);
14314 14324          if (connp == NULL) {
14315 14325                  /*
14316 14326                   * Although raw sctp is not summed, OOB chunks must be.
14317 14327                   * Drop the packet here if the sctp checksum failed.
14318 14328                   */
14319 14329                  if (iraflags & IRAF_SCTP_CSUM_ERR) {
14320 14330                          SCTPS_BUMP_MIB(sctps, sctpChecksumError);
14321 14331                          freemsg(mp);
14322 14332                          return;
14323 14333                  }
14324 14334                  ira->ira_ill = ira->ira_rill = NULL;
14325 14335                  sctp_ootb_input(mp, ira, ipst);
14326 14336                  ira->ira_ill = ill;
14327 14337                  ira->ira_rill = rill;
14328 14338                  return;
14329 14339          }
14330 14340          rq = connp->conn_rq;
14331 14341          if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld : !canputnext(rq)) {
14332 14342                  CONN_DEC_REF(connp);
14333 14343                  BUMP_MIB(ill->ill_ip_mib, rawipIfStatsInOverflows);
14334 14344                  freemsg(mp);
14335 14345                  return;
14336 14346          }
14337 14347          if (((iraflags & IRAF_IS_IPV4) ?
14338 14348              CONN_INBOUND_POLICY_PRESENT(connp, ipss) :
14339 14349              CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) ||
14340 14350              secure) {
14341 14351                  mp = ipsec_check_inbound_policy(mp, connp, ipha,
14342 14352                      ip6h, ira);
14343 14353                  if (mp == NULL) {
14344 14354                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
14345 14355                          /* Note that mp is NULL */
14346 14356                          ip_drop_input("ipIfStatsInDiscards", mp, ill);
14347 14357                          CONN_DEC_REF(connp);
14348 14358                          return;
14349 14359                  }
14350 14360          }
14351 14361  
14352 14362          if (iraflags & IRAF_ICMP_ERROR) {
14353 14363                  (connp->conn_recvicmp)(connp, mp, NULL, ira);
14354 14364          } else {
14355 14365                  ill_t *rill = ira->ira_rill;
14356 14366  
14357 14367                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers);
14358 14368                  /* This is the SOCK_RAW, IPPROTO_SCTP case. */
14359 14369                  ira->ira_ill = ira->ira_rill = NULL;
14360 14370                  (connp->conn_recv)(connp, mp, NULL, ira);
14361 14371                  ira->ira_ill = ill;
14362 14372                  ira->ira_rill = rill;
14363 14373          }
14364 14374          CONN_DEC_REF(connp);
14365 14375  }
14366 14376  
14367 14377  /*
14368 14378   * Free a packet that has the link-layer dl_unitdata_req_t or fast-path
14369 14379   * header before the ip payload.
14370 14380   */
14371 14381  static void
14372 14382  ip_xmit_flowctl_drop(ill_t *ill, mblk_t *mp, boolean_t is_fp_mp, int fp_mp_len)
14373 14383  {
14374 14384          int len = (mp->b_wptr - mp->b_rptr);
14375 14385          mblk_t *ip_mp;
14376 14386  
14377 14387          BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
14378 14388          if (is_fp_mp || len != fp_mp_len) {
14379 14389                  if (len > fp_mp_len) {
14380 14390                          /*
14381 14391                           * fastpath header and ip header in the first mblk
14382 14392                           */
14383 14393                          mp->b_rptr += fp_mp_len;
14384 14394                  } else {
14385 14395                          /*
14386 14396                           * ip_xmit_attach_llhdr had to prepend an mblk to
14387 14397                           * attach the fastpath header before ip header.
14388 14398                           */
14389 14399                          ip_mp = mp->b_cont;
14390 14400                          freeb(mp);
14391 14401                          mp = ip_mp;
14392 14402                          mp->b_rptr += (fp_mp_len - len);
14393 14403                  }
14394 14404          } else {
14395 14405                  ip_mp = mp->b_cont;
14396 14406                  freeb(mp);
14397 14407                  mp = ip_mp;
14398 14408          }
14399 14409          ip_drop_output("ipIfStatsOutDiscards - flow ctl", mp, ill);
14400 14410          freemsg(mp);
14401 14411  }
14402 14412  
14403 14413  /*
14404 14414   * Normal post fragmentation function.
14405 14415   *
14406 14416   * Send a packet using the passed in nce. This handles both IPv4 and IPv6
14407 14417   * using the same state machine.
14408 14418   *
14409 14419   * We return an error on failure. In particular we return EWOULDBLOCK
14410 14420   * when the driver flow controls. In that case this ensures that ip_wsrv runs
14411 14421   * (currently by canputnext failure resulting in backenabling from GLD.)
14412 14422   * This allows the callers of conn_ip_output() to use EWOULDBLOCK as an
14413 14423   * indication that they can flow control until ip_wsrv() tells them to restart.
14414 14424   *
14415 14425   * If the nce passed by caller is incomplete, this function
14416 14426   * queues the packet and if necessary, sends ARP request and bails.
14417 14427   * If the Neighbor Cache passed is fully resolved, we simply prepend
14418 14428   * the link-layer header to the packet, do ipsec hw acceleration
14419 14429   * work if necessary, and send the packet out on the wire.
              *
              * Arguments:
              *   mp         packet to send; asserted to be M_DATA with
              *              msgdsize(mp) == pkt_len.
              *   nce        neighbor entry; supplies the outgoing ill (nce_ill),
              *              the resolution state (nce_common) and nce_fp_mp.
              *   ixaflags   transmit flags; IXAF_NO_TRACE, IXAF_IS_IPV4,
              *              IXAF_NO_PFHOOK, IXAF_NO_DEV_FLOW_CTL and
              *              IXAF_REACH_CONF are examined here.
              *   pkt_len    packet length; recomputed after a firewall hook ran.
              *   xmit_hint  passed through to the DLD direct transmit function.
              *   szone      sending zone; mapped with IP_REAL_ZONEID() for ipobs.
              *   nolzid     not referenced in this function.
              *   ixacookie  if non-NULL, set on EWOULDBLOCK to the cookie returned
              *              by the DLD direct transmit function, or to 0 when
              *              canputnext() failed.
              *
              * Returns:
              *   0            packet sent, queued on the ncec awaiting resolution,
              *                or dropped because the ncec is ND_UNREACHABLE.
              *   error        value set by FW_HOOKS/FW_HOOKS6 when the hook left
              *                mp NULL.
              *   ENOBUFS      ip_xmit_attach_llhdr() returned NULL.
              *   EWOULDBLOCK  flow controlled; in the canputnext() case the packet
              *                has been dropped by ip_xmit_flowctl_drop().
              *   ENETUNREACH  unexpected ncec state; the packet is freed.
14420 14430   */
14421 14431  /* ARGSUSED6 */
14422 14432  int
14423 14433  ip_xmit(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, uint_t pkt_len,
14424 14434      uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, uintptr_t *ixacookie)
14425 14435  {
14426 14436          queue_t         *wq;
14427 14437          ill_t           *ill = nce->nce_ill;
14428 14438          ip_stack_t      *ipst = ill->ill_ipst;
14429 14439          uint64_t        delta;
14430 14440          boolean_t       isv6 = ill->ill_isv6;
14431 14441          boolean_t       fp_mp;
14432 14442          ncec_t          *ncec = nce->nce_common;
14433 14443          int64_t         now = LBOLT_FASTPATH64;
14434 14444          boolean_t       is_probe;
14435 14445  
14436 14446          DTRACE_PROBE1(ip__xmit, nce_t *, nce);
14437 14447  
14438 14448          ASSERT(mp != NULL);
14439 14449          ASSERT(mp->b_datap->db_type == M_DATA);
14440 14450          ASSERT(pkt_len == msgdsize(mp));
14441 14451  
14442 14452          /*
14443 14453           * If we have already been here and are coming back after ARP/ND.
14444 14454           * the IXAF_NO_TRACE flag is set. We skip FW_HOOKS, DTRACE and ipobs
14445 14455           * in that case since they have seen the packet when it came here
14446 14456           * the first time.
14447 14457           */
14448 14458          if (ixaflags & IXAF_NO_TRACE)
14449 14459                  goto sendit;
14450 14460  
14451 14461          if (ixaflags & IXAF_IS_IPV4) {
14452 14462                  ipha_t *ipha = (ipha_t *)mp->b_rptr;
14453 14463  
14454 14464                  ASSERT(!isv6);
14455 14465                  ASSERT(pkt_len == ntohs(((ipha_t *)mp->b_rptr)->ipha_length));
14456 14466                  if (HOOKS4_INTERESTED_PHYSICAL_OUT(ipst) &&
14457 14467                      !(ixaflags & IXAF_NO_PFHOOK)) {
14458 14468                          int     error;
14459 14469  
14460 14470                          FW_HOOKS(ipst->ips_ip4_physical_out_event,
14461 14471                              ipst->ips_ipv4firewall_physical_out,
14462 14472                              NULL, ill, ipha, mp, mp, 0, ipst, error);
14463 14473                          DTRACE_PROBE1(ip4__physical__out__end,
14464 14474                              mblk_t *, mp);
                                     /* A NULL mp means the hook kept the packet. */
14465 14475                          if (mp == NULL)
14466 14476                                  return (error);
14467 14477  
14468 14478                          /* The length could have changed */
14469 14479                          pkt_len = msgdsize(mp);
14470 14480                  }
14471 14481                  if (ipst->ips_ip4_observe.he_interested) {
14472 14482                          /*
14473 14483                           * Note that for TX the zoneid is the sending
14474 14484                           * zone, whether or not MLP is in play.
14475 14485                           * Since the szone argument is the IP zoneid (i.e.,
14476 14486                           * zero for exclusive-IP zones) and ipobs wants
14477 14487                           * the system zoneid, we map it here.
14478 14488                           */
14479 14489                          szone = IP_REAL_ZONEID(szone, ipst);
14480 14490  
14481 14491                          /*
14482 14492                           * On the outbound path the destination zone will be
14483 14493                           * unknown as we're sending this packet out on the
14484 14494                           * wire.
14485 14495                           */
14486 14496                          ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, ALL_ZONES,
14487 14497                              ill, ipst);
14488 14498                  }
14489 14499                  DTRACE_IP7(send, mblk_t *, mp,  conn_t *, NULL,
14490 14500                      void_ip_t *, ipha,  __dtrace_ipsr_ill_t *, ill,
14491 14501                      ipha_t *, ipha, ip6_t *, NULL, int, 0);
14492 14502          } else {
14493 14503                  ip6_t *ip6h = (ip6_t *)mp->b_rptr;
14494 14504  
14495 14505                  ASSERT(isv6);
14496 14506                  ASSERT(pkt_len ==
14497 14507                      ntohs(((ip6_t *)mp->b_rptr)->ip6_plen) + IPV6_HDR_LEN);
14498 14508                  if (HOOKS6_INTERESTED_PHYSICAL_OUT(ipst) &&
14499 14509                      !(ixaflags & IXAF_NO_PFHOOK)) {
14500 14510                          int     error;
14501 14511  
14502 14512                          FW_HOOKS6(ipst->ips_ip6_physical_out_event,
14503 14513                              ipst->ips_ipv6firewall_physical_out,
14504 14514                              NULL, ill, ip6h, mp, mp, 0, ipst, error);
14505 14515                          DTRACE_PROBE1(ip6__physical__out__end,
14506 14516                              mblk_t *, mp);
                                     /* A NULL mp means the hook kept the packet. */
14507 14517                          if (mp == NULL)
14508 14518                                  return (error);
14509 14519  
14510 14520                          /* The length could have changed */
14511 14521                          pkt_len = msgdsize(mp);
14512 14522                  }
14513 14523                  if (ipst->ips_ip6_observe.he_interested) {
14514 14524                          /* See above */
14515 14525                          szone = IP_REAL_ZONEID(szone, ipst);
14516 14526  
14517 14527                          ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, ALL_ZONES,
14518 14528                              ill, ipst);
14519 14529                  }
14520 14530                  DTRACE_IP7(send, mblk_t *, mp,  conn_t *, NULL,
14521 14531                      void_ip_t *, ip6h,  __dtrace_ipsr_ill_t *, ill,
14522 14532                      ipha_t *, NULL, ip6_t *, ip6h, int, 0);
14523 14533          }
14524 14534  
14525 14535  sendit:
14526 14536          /*
14527 14537           * We check the state without a lock because the state can never
14528 14538           * move "backwards" to initial or incomplete.
14529 14539           */
14530 14540          switch (ncec->ncec_state) {
14531 14541          case ND_REACHABLE:
14532 14542          case ND_STALE:
14533 14543          case ND_DELAY:
14534 14544          case ND_PROBE:
14535 14545                  mp = ip_xmit_attach_llhdr(mp, nce);
14536 14546                  if (mp == NULL) {
14537 14547                          /*
14538 14548                           * ip_xmit_attach_llhdr has increased
14539 14549                           * ipIfStatsOutDiscards and called ip_drop_output()
14540 14550                           */
14541 14551                          return (ENOBUFS);
14542 14552                  }
14543 14553                  /*
14544 14554                   * check if nce_fastpath completed and we tagged on a
14545 14555                   * copy of nce_fp_mp in ip_xmit_attach_llhdr().
14546 14556                   */
14547 14557                  fp_mp = (mp->b_datap->db_type == M_DATA);
14548 14558  
14549 14559                  if (fp_mp &&
14550 14560                      (ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT)) {
14551 14561                          ill_dld_direct_t *idd;
14552 14562  
14553 14563                          idd = &ill->ill_dld_capab->idc_direct;
14554 14564                          /*
14555 14565                           * Send the packet directly to DLD, where it
14556 14566                           * may be queued depending on the availability
14557 14567                           * of transmit resources at the media layer.
14558 14568                           * Return value should be taken into
14559 14569                           * account and flow control the TCP.
14560 14570                           */
14561 14571                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits);
14562 14572                          UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets,
14563 14573                              pkt_len);
14564 14574  
14565 14575                          if (ixaflags & IXAF_NO_DEV_FLOW_CTL) {
14566 14576                                  (void) idd->idd_tx_df(idd->idd_tx_dh, mp,
14567 14577                                      (uintptr_t)xmit_hint, IP_DROP_ON_NO_DESC);
14568 14578                          } else {
14569 14579                                  uintptr_t cookie;
14570 14580  
                                             /*
                                              * A non-zero cookie is reported to the
                                              * caller as flow control.
                                              */
14571 14581                                  if ((cookie = idd->idd_tx_df(idd->idd_tx_dh,
14572 14582                                      mp, (uintptr_t)xmit_hint, 0)) != 0) {
14573 14583                                          if (ixacookie != NULL)
14574 14584                                                  *ixacookie = cookie;
14575 14585                                          return (EWOULDBLOCK);
14576 14586                                  }
14577 14587                          }
14578 14588                  } else {
14579 14589                          wq = ill->ill_wq;
14580 14590  
14581 14591                          if (!(ixaflags & IXAF_NO_DEV_FLOW_CTL) &&
14582 14592                              !canputnext(wq)) {
                                             /*
                                              * The packet is dropped here; the caller
                                              * only sees EWOULDBLOCK and a zero cookie.
                                              */
14583 14593                                  if (ixacookie != NULL)
14584 14594                                          *ixacookie = 0;
14585 14595                                  ip_xmit_flowctl_drop(ill, mp, fp_mp,
14586 14596                                      nce->nce_fp_mp != NULL ?
14587 14597                                      MBLKL(nce->nce_fp_mp) : 0);
14588 14598                                  return (EWOULDBLOCK);
14589 14599                          }
14590 14600                          BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits);
14591 14601                          UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets,
14592 14602                              pkt_len);
14593 14603                          putnext(wq, mp);
14594 14604                  }
14595 14605  
14596 14606                  /*
14597 14607                   * The rest of this function implements Neighbor Unreachability
14598 14608                   * detection. Determine if the ncec is eligible for NUD.
14599 14609                   */
14600 14610                  if (ncec->ncec_flags & NCE_F_NONUD)
14601 14611                          return (0);
14602 14612  
14603 14613                  ASSERT(ncec->ncec_state != ND_INCOMPLETE);
14604 14614  
14605 14615                  /*
14606 14616                   * Check for upper layer advice
14607 14617                   */
14608 14618                  if (ixaflags & IXAF_REACH_CONF) {
14609 14619                          timeout_id_t tid;
14610 14620  
14611 14621                          /*
14612 14622                           * It should be o.k. to check the state without
14613 14623                           * a lock here, at most we lose an advice.
14614 14624                           */
14615 14625                          ncec->ncec_last = TICK_TO_MSEC(now);
14616 14626                          if (ncec->ncec_state != ND_REACHABLE) {
14617 14627                                  mutex_enter(&ncec->ncec_lock);
14618 14628                                  ncec->ncec_state = ND_REACHABLE;
14619 14629                                  tid = ncec->ncec_timeout_id;
14620 14630                                  ncec->ncec_timeout_id = 0;
14621 14631                                  mutex_exit(&ncec->ncec_lock);
                                             /* untimeout() is called after the lock is dropped. */
14622 14632                                  (void) untimeout(tid);
14623 14633                                  if (ip_debug > 2) {
14624 14634                                          /* ip1dbg */
14625 14635                                          pr_addr_dbg("ip_xmit: state"
14626 14636                                              " for %s changed to"
14627 14637                                              " REACHABLE\n", AF_INET6,
14628 14638                                              &ncec->ncec_addr);
14629 14639                                  }
14630 14640                          }
14631 14641                          return (0);
14632 14642                  }
14633 14643  
                             /* Milliseconds since ncec_last was last updated. */
14634 14644                  delta =  TICK_TO_MSEC(now) - ncec->ncec_last;
14635 14645                  ip1dbg(("ip_xmit: delta = %" PRId64
14636 14646                      " ill_reachable_time = %d \n", delta,
14637 14647                      ill->ill_reachable_time));
14638 14648                  if (delta > (uint64_t)ill->ill_reachable_time) {
14639 14649                          mutex_enter(&ncec->ncec_lock);
14640 14650                          switch (ncec->ncec_state) {
14641 14651                          case ND_REACHABLE:
14642 14652                                  ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0);
14643 14653                                  /* FALLTHROUGH */
14644 14654                          case ND_STALE:
14645 14655                                  /*
14646 14656                                   * ND_REACHABLE is identical to
14647 14657                                   * ND_STALE in this specific case. If
14648 14658                                   * reachable time has expired for this
14649 14659                                   * neighbor (delta is greater than
14650 14660                                   * reachable time), conceptually, the
14651 14661                                   * neighbor cache is no longer in
14652 14662                                   * REACHABLE state, but already in
14653 14663                                   * STALE state.  So the correct
14654 14664                                   * transition here is to ND_DELAY.
14655 14665                                   */
14656 14666                                  ncec->ncec_state = ND_DELAY;
14657 14667                                  mutex_exit(&ncec->ncec_lock);
14658 14668                                  nce_restart_timer(ncec,
14659 14669                                      ipst->ips_delay_first_probe_time);
14660 14670                                  if (ip_debug > 3) {
14661 14671                                          /* ip2dbg */
14662 14672                                          pr_addr_dbg("ip_xmit: state"
14663 14673                                              " for %s changed to"
14664 14674                                              " DELAY\n", AF_INET6,
14665 14675                                              &ncec->ncec_addr);
14666 14676                                  }
14667 14677                                  break;
14668 14678                          case ND_DELAY:
14669 14679                          case ND_PROBE:
14670 14680                                  mutex_exit(&ncec->ncec_lock);
14671 14681                                  /* Timers have already started */
14672 14682                                  break;
14673 14683                          case ND_UNREACHABLE:
14674 14684                                  /*
14675 14685                                   * nce_timer has detected that this ncec
14676 14686                                   * is unreachable and initiated deleting
14677 14687                                   * this ncec.
14678 14688                                   * This is a harmless race where we found the
14679 14689                                   * ncec before it was deleted and have
14680 14690                                   * just sent out a packet using this
14681 14691                                   * unreachable ncec.
14682 14692                                   */
14683 14693                                  mutex_exit(&ncec->ncec_lock);
14684 14694                                  break;
14685 14695                          default:
14686 14696                                  ASSERT(0);
14687 14697                                  mutex_exit(&ncec->ncec_lock);
14688 14698                          }
14689 14699                  }
14690 14700                  return (0);
14691 14701  
14692 14702          case ND_INCOMPLETE:
14693 14703                  /*
14694 14704                   * the state could have changed since we didn't hold the lock.
14695 14705                   * Re-verify state under lock.
14696 14706                   */
14697 14707                  is_probe = ipmp_packet_is_probe(mp, nce->nce_ill);
14698 14708                  mutex_enter(&ncec->ncec_lock);
14699 14709                  if (NCE_ISREACHABLE(ncec)) {
14700 14710                          mutex_exit(&ncec->ncec_lock);
14701 14711                          goto sendit;
14702 14712                  }
14703 14713                  /* queue the packet */
14704 14714                  nce_queue_mp(ncec, mp, is_probe);
14705 14715                  mutex_exit(&ncec->ncec_lock);
14706 14716                  DTRACE_PROBE2(ip__xmit__incomplete,
14707 14717                      (ncec_t *), ncec, (mblk_t *), mp);
14708 14718                  return (0);
14709 14719  
14710 14720          case ND_INITIAL:
14711 14721                  /*
14712 14722                   * State could have changed since we didn't hold the lock, so
14713 14723                   * re-verify state.
14714 14724                   */
14715 14725                  is_probe = ipmp_packet_is_probe(mp, nce->nce_ill);
14716 14726                  mutex_enter(&ncec->ncec_lock);
14717 14727                  if (NCE_ISREACHABLE(ncec))  {
14718 14728                          mutex_exit(&ncec->ncec_lock);
14719 14729                          goto sendit;
14720 14730                  }
14721 14731                  nce_queue_mp(ncec, mp, is_probe);
14722 14732                  if (ncec->ncec_state == ND_INITIAL) {
14723 14733                          ncec->ncec_state = ND_INCOMPLETE;
14724 14734                          mutex_exit(&ncec->ncec_lock);
14725 14735                          /*
14726 14736                           * figure out the source we want to use
14727 14737                           * and resolve it.
14728 14738                           */
14729 14739                          ip_ndp_resolve(ncec);
14730 14740                  } else  {
14731 14741                          mutex_exit(&ncec->ncec_lock);
14732 14742                  }
14733 14743                  return (0);
14734 14744  
14735 14745          case ND_UNREACHABLE:
                             /* The drop is counted, but the caller still gets 0. */
14736 14746                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
14737 14747                  ip_drop_output("ipIfStatsOutDiscards - ND_UNREACHABLE",
14738 14748                      mp, ill);
14739 14749                  freemsg(mp);
14740 14750                  return (0);
14741 14751  
14742 14752          default:
14743 14753                  ASSERT(0);
14744 14754                  BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
14745 14755                  ip_drop_output("ipIfStatsOutDiscards - ND_other",
14746 14756                      mp, ill);
14747 14757                  freemsg(mp);
14748 14758                  return (ENETUNREACH);
14749 14759          }
14750 14760  }
14751 14761  
14752 14762  /*
14753 14763   * Return B_TRUE if the buffers differ in length or content.
14754 14764   * This is used for comparing extension header buffers.
14755 14765   * Note that an extension header would be declared different
14756 14766   * even if all that changed was the next header value in that header i.e.
14757 14767   * what really changed is the next extension header.
14758 14768   */
14759 14769  boolean_t
14760 14770  ip_cmpbuf(const void *abuf, uint_t alen, boolean_t b_valid, const void *bbuf,
14761 14771      uint_t blen)
14762 14772  {
14763 14773          if (!b_valid)
14764 14774                  blen = 0;
14765 14775  
14766 14776          if (alen != blen)
14767 14777                  return (B_TRUE);
14768 14778          if (alen == 0)
14769 14779                  return (B_FALSE);       /* Both zero length */
14770 14780          return (bcmp(abuf, bbuf, alen));
14771 14781  }
14772 14782  
14773 14783  /*
14774 14784   * Preallocate memory for ip_savebuf(). Returns B_TRUE if ok.
14775 14785   * Return B_FALSE if memory allocation fails - don't change any state!
14776 14786   */
14777 14787  boolean_t
14778 14788  ip_allocbuf(void **dstp, uint_t *dstlenp, boolean_t src_valid,
14779 14789      const void *src, uint_t srclen)
14780 14790  {
14781 14791          void *dst;
14782 14792  
14783 14793          if (!src_valid)
14784 14794                  srclen = 0;
14785 14795  
14786 14796          ASSERT(*dstlenp == 0);
14787 14797          if (src != NULL && srclen != 0) {
14788 14798                  dst = mi_alloc(srclen, BPRI_MED);
14789 14799                  if (dst == NULL)
14790 14800                          return (B_FALSE);
14791 14801          } else {
14792 14802                  dst = NULL;
14793 14803          }
14794 14804          if (*dstp != NULL)
14795 14805                  mi_free(*dstp);
14796 14806          *dstp = dst;
14797 14807          *dstlenp = dst == NULL ? 0 : srclen;
14798 14808          return (B_TRUE);
14799 14809  }
14800 14810  
14801 14811  /*
14802 14812   * Replace what is in *dst, *dstlen with the source.
14803 14813   * Assumes ip_allocbuf has already been called.
14804 14814   */
14805 14815  void
14806 14816  ip_savebuf(void **dstp, uint_t *dstlenp, boolean_t src_valid,
14807 14817      const void *src, uint_t srclen)
14808 14818  {
14809 14819          if (!src_valid)
14810 14820                  srclen = 0;
14811 14821  
14812 14822          ASSERT(*dstlenp == srclen);
14813 14823          if (src != NULL && srclen != 0)
14814 14824                  bcopy(src, *dstp, srclen);
14815 14825  }
14816 14826  
14817 14827  /*
14818 14828   * Free the storage pointed to by the members of an ip_pkt_t.
14819 14829   */
14820 14830  void
14821 14831  ip_pkt_free(ip_pkt_t *ipp)
14822 14832  {
14823 14833          uint_t  fields = ipp->ipp_fields;
14824 14834  
14825 14835          if (fields & IPPF_HOPOPTS) {
14826 14836                  kmem_free(ipp->ipp_hopopts, ipp->ipp_hopoptslen);
14827 14837                  ipp->ipp_hopopts = NULL;
14828 14838                  ipp->ipp_hopoptslen = 0;
14829 14839          }
14830 14840          if (fields & IPPF_RTHDRDSTOPTS) {
14831 14841                  kmem_free(ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen);
14832 14842                  ipp->ipp_rthdrdstopts = NULL;
14833 14843                  ipp->ipp_rthdrdstoptslen = 0;
14834 14844          }
14835 14845          if (fields & IPPF_DSTOPTS) {
14836 14846                  kmem_free(ipp->ipp_dstopts, ipp->ipp_dstoptslen);
14837 14847                  ipp->ipp_dstopts = NULL;
14838 14848                  ipp->ipp_dstoptslen = 0;
14839 14849          }
14840 14850          if (fields & IPPF_RTHDR) {
14841 14851                  kmem_free(ipp->ipp_rthdr, ipp->ipp_rthdrlen);
14842 14852                  ipp->ipp_rthdr = NULL;
14843 14853                  ipp->ipp_rthdrlen = 0;
14844 14854          }
14845 14855          if (fields & IPPF_IPV4_OPTIONS) {
14846 14856                  kmem_free(ipp->ipp_ipv4_options, ipp->ipp_ipv4_options_len);
14847 14857                  ipp->ipp_ipv4_options = NULL;
14848 14858                  ipp->ipp_ipv4_options_len = 0;
14849 14859          }
14850 14860          if (fields & IPPF_LABEL_V4) {
14851 14861                  kmem_free(ipp->ipp_label_v4, ipp->ipp_label_len_v4);
14852 14862                  ipp->ipp_label_v4 = NULL;
14853 14863                  ipp->ipp_label_len_v4 = 0;
14854 14864          }
14855 14865          if (fields & IPPF_LABEL_V6) {
14856 14866                  kmem_free(ipp->ipp_label_v6, ipp->ipp_label_len_v6);
14857 14867                  ipp->ipp_label_v6 = NULL;
14858 14868                  ipp->ipp_label_len_v6 = 0;
14859 14869          }
14860 14870          ipp->ipp_fields &= ~(IPPF_HOPOPTS | IPPF_RTHDRDSTOPTS | IPPF_DSTOPTS |
14861 14871              IPPF_RTHDR | IPPF_IPV4_OPTIONS | IPPF_LABEL_V4 | IPPF_LABEL_V6);
14862 14872  }
14863 14873  
14864 14874  /*
14865 14875   * Copy from src to dst and allocate as needed.
14866 14876   * Returns zero or ENOMEM.
14867 14877   *
14868 14878   * The caller must initialize dst to zero.
14869 14879   */
14870 14880  int
14871 14881  ip_pkt_copy(ip_pkt_t *src, ip_pkt_t *dst, int kmflag)
14872 14882  {
14873 14883          uint_t  fields = src->ipp_fields;
14874 14884  
14875 14885          /* Start with fields that don't require memory allocation */
14876 14886          dst->ipp_fields = fields &
14877 14887              ~(IPPF_HOPOPTS | IPPF_RTHDRDSTOPTS | IPPF_DSTOPTS |
14878 14888              IPPF_RTHDR | IPPF_IPV4_OPTIONS | IPPF_LABEL_V4 | IPPF_LABEL_V6);
14879 14889  
14880 14890          dst->ipp_addr = src->ipp_addr;
14881 14891          dst->ipp_unicast_hops = src->ipp_unicast_hops;
14882 14892          dst->ipp_hoplimit = src->ipp_hoplimit;
14883 14893          dst->ipp_tclass = src->ipp_tclass;
14884 14894          dst->ipp_type_of_service = src->ipp_type_of_service;
14885 14895  
14886 14896          if (!(fields & (IPPF_HOPOPTS | IPPF_RTHDRDSTOPTS | IPPF_DSTOPTS |
14887 14897              IPPF_RTHDR | IPPF_IPV4_OPTIONS | IPPF_LABEL_V4 | IPPF_LABEL_V6)))
14888 14898                  return (0);
14889 14899  
14890 14900          if (fields & IPPF_HOPOPTS) {
14891 14901                  dst->ipp_hopopts = kmem_alloc(src->ipp_hopoptslen, kmflag);
14892 14902                  if (dst->ipp_hopopts == NULL) {
14893 14903                          ip_pkt_free(dst);
14894 14904                          return (ENOMEM);
14895 14905                  }
14896 14906                  dst->ipp_fields |= IPPF_HOPOPTS;
14897 14907                  bcopy(src->ipp_hopopts, dst->ipp_hopopts,
14898 14908                      src->ipp_hopoptslen);
14899 14909                  dst->ipp_hopoptslen = src->ipp_hopoptslen;
14900 14910          }
14901 14911          if (fields & IPPF_RTHDRDSTOPTS) {
14902 14912                  dst->ipp_rthdrdstopts = kmem_alloc(src->ipp_rthdrdstoptslen,
14903 14913                      kmflag);
14904 14914                  if (dst->ipp_rthdrdstopts == NULL) {
14905 14915                          ip_pkt_free(dst);
14906 14916                          return (ENOMEM);
14907 14917                  }
14908 14918                  dst->ipp_fields |= IPPF_RTHDRDSTOPTS;
14909 14919                  bcopy(src->ipp_rthdrdstopts, dst->ipp_rthdrdstopts,
14910 14920                      src->ipp_rthdrdstoptslen);
14911 14921                  dst->ipp_rthdrdstoptslen = src->ipp_rthdrdstoptslen;
14912 14922          }
14913 14923          if (fields & IPPF_DSTOPTS) {
14914 14924                  dst->ipp_dstopts = kmem_alloc(src->ipp_dstoptslen, kmflag);
14915 14925                  if (dst->ipp_dstopts == NULL) {
14916 14926                          ip_pkt_free(dst);
14917 14927                          return (ENOMEM);
14918 14928                  }
14919 14929                  dst->ipp_fields |= IPPF_DSTOPTS;
14920 14930                  bcopy(src->ipp_dstopts, dst->ipp_dstopts,
14921 14931                      src->ipp_dstoptslen);
14922 14932                  dst->ipp_dstoptslen = src->ipp_dstoptslen;
14923 14933          }
14924 14934          if (fields & IPPF_RTHDR) {
14925 14935                  dst->ipp_rthdr = kmem_alloc(src->ipp_rthdrlen, kmflag);
14926 14936                  if (dst->ipp_rthdr == NULL) {
14927 14937                          ip_pkt_free(dst);
14928 14938                          return (ENOMEM);
14929 14939                  }
14930 14940                  dst->ipp_fields |= IPPF_RTHDR;
14931 14941                  bcopy(src->ipp_rthdr, dst->ipp_rthdr,
14932 14942                      src->ipp_rthdrlen);
14933 14943                  dst->ipp_rthdrlen = src->ipp_rthdrlen;
14934 14944          }
14935 14945          if (fields & IPPF_IPV4_OPTIONS) {
14936 14946                  dst->ipp_ipv4_options = kmem_alloc(src->ipp_ipv4_options_len,
14937 14947                      kmflag);
14938 14948                  if (dst->ipp_ipv4_options == NULL) {
14939 14949                          ip_pkt_free(dst);
14940 14950                          return (ENOMEM);
14941 14951                  }
14942 14952                  dst->ipp_fields |= IPPF_IPV4_OPTIONS;
14943 14953                  bcopy(src->ipp_ipv4_options, dst->ipp_ipv4_options,
14944 14954                      src->ipp_ipv4_options_len);
14945 14955                  dst->ipp_ipv4_options_len = src->ipp_ipv4_options_len;
14946 14956          }
14947 14957          if (fields & IPPF_LABEL_V4) {
14948 14958                  dst->ipp_label_v4 = kmem_alloc(src->ipp_label_len_v4, kmflag);
14949 14959                  if (dst->ipp_label_v4 == NULL) {
14950 14960                          ip_pkt_free(dst);
14951 14961                          return (ENOMEM);
14952 14962                  }
14953 14963                  dst->ipp_fields |= IPPF_LABEL_V4;
14954 14964                  bcopy(src->ipp_label_v4, dst->ipp_label_v4,
14955 14965                      src->ipp_label_len_v4);
14956 14966                  dst->ipp_label_len_v4 = src->ipp_label_len_v4;
14957 14967          }
14958 14968          if (fields & IPPF_LABEL_V6) {
14959 14969                  dst->ipp_label_v6 = kmem_alloc(src->ipp_label_len_v6, kmflag);
14960 14970                  if (dst->ipp_label_v6 == NULL) {
14961 14971                          ip_pkt_free(dst);
14962 14972                          return (ENOMEM);
14963 14973                  }
14964 14974                  dst->ipp_fields |= IPPF_LABEL_V6;
14965 14975                  bcopy(src->ipp_label_v6, dst->ipp_label_v6,
14966 14976                      src->ipp_label_len_v6);
14967 14977                  dst->ipp_label_len_v6 = src->ipp_label_len_v6;
14968 14978          }
14969 14979          if (fields & IPPF_FRAGHDR) {
14970 14980                  dst->ipp_fraghdr = kmem_alloc(src->ipp_fraghdrlen, kmflag);
14971 14981                  if (dst->ipp_fraghdr == NULL) {
14972 14982                          ip_pkt_free(dst);
14973 14983                          return (ENOMEM);
14974 14984                  }
14975 14985                  dst->ipp_fields |= IPPF_FRAGHDR;
14976 14986                  bcopy(src->ipp_fraghdr, dst->ipp_fraghdr,
14977 14987                      src->ipp_fraghdrlen);
14978 14988                  dst->ipp_fraghdrlen = src->ipp_fraghdrlen;
14979 14989          }
14980 14990          return (0);
14981 14991  }
14982 14992  
14983 14993  /*
14984 14994   * Returns INADDR_ANY if no source route
14985 14995   */
14986 14996  ipaddr_t
14987 14997  ip_pkt_source_route_v4(const ip_pkt_t *ipp)
14988 14998  {
14989 14999          ipaddr_t        nexthop = INADDR_ANY;
14990 15000          ipoptp_t        opts;
14991 15001          uchar_t         *opt;
14992 15002          uint8_t         optval;
14993 15003          uint8_t         optlen;
14994 15004          uint32_t        totallen;
14995 15005  
14996 15006          if (!(ipp->ipp_fields & IPPF_IPV4_OPTIONS))
14997 15007                  return (INADDR_ANY);
14998 15008  
14999 15009          totallen = ipp->ipp_ipv4_options_len;
15000 15010          if (totallen & 0x3)
15001 15011                  return (INADDR_ANY);
15002 15012  
15003 15013          for (optval = ipoptp_first2(&opts, totallen, ipp->ipp_ipv4_options);
15004 15014              optval != IPOPT_EOL;
15005 15015              optval = ipoptp_next(&opts)) {
15006 15016                  opt = opts.ipoptp_cur;
15007 15017                  switch (optval) {
15008 15018                          uint8_t off;
15009 15019                  case IPOPT_SSRR:
15010 15020                  case IPOPT_LSRR:
15011 15021                          if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
15012 15022                                  break;
15013 15023                          }
15014 15024                          optlen = opts.ipoptp_len;
15015 15025                          off = opt[IPOPT_OFFSET];
15016 15026                          off--;
15017 15027                          if (optlen < IP_ADDR_LEN ||
15018 15028                              off > optlen - IP_ADDR_LEN) {
15019 15029                                  /* End of source route */
15020 15030                                  break;
15021 15031                          }
15022 15032                          bcopy((char *)opt + off, &nexthop, IP_ADDR_LEN);
15023 15033                          if (nexthop == htonl(INADDR_LOOPBACK)) {
15024 15034                                  /* Ignore */
15025 15035                                  nexthop = INADDR_ANY;
15026 15036                                  break;
15027 15037                          }
15028 15038                          break;
15029 15039                  }
15030 15040          }
15031 15041          return (nexthop);
15032 15042  }
15033 15043  
15034 15044  /*
15035 15045   * Reverse a source route.
15036 15046   */
15037 15047  void
15038 15048  ip_pkt_source_route_reverse_v4(ip_pkt_t *ipp)
15039 15049  {
15040 15050          ipaddr_t        tmp;
15041 15051          ipoptp_t        opts;
15042 15052          uchar_t         *opt;
15043 15053          uint8_t         optval;
15044 15054          uint32_t        totallen;
15045 15055  
15046 15056          if (!(ipp->ipp_fields & IPPF_IPV4_OPTIONS))
15047 15057                  return;
15048 15058  
15049 15059          totallen = ipp->ipp_ipv4_options_len;
15050 15060          if (totallen & 0x3)
15051 15061                  return;
15052 15062  
15053 15063          for (optval = ipoptp_first2(&opts, totallen, ipp->ipp_ipv4_options);
15054 15064              optval != IPOPT_EOL;
15055 15065              optval = ipoptp_next(&opts)) {
15056 15066                  uint8_t off1, off2;
15057 15067  
15058 15068                  opt = opts.ipoptp_cur;
15059 15069                  switch (optval) {
15060 15070                  case IPOPT_SSRR:
15061 15071                  case IPOPT_LSRR:
15062 15072                          if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) {
15063 15073                                  break;
15064 15074                          }
15065 15075                          off1 = IPOPT_MINOFF_SR - 1;
15066 15076                          off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1;
15067 15077                          while (off2 > off1) {
15068 15078                                  bcopy(opt + off2, &tmp, IP_ADDR_LEN);
15069 15079                                  bcopy(opt + off1, opt + off2, IP_ADDR_LEN);
15070 15080                                  bcopy(&tmp, opt + off2, IP_ADDR_LEN);
15071 15081                                  off2 -= IP_ADDR_LEN;
15072 15082                                  off1 += IP_ADDR_LEN;
15073 15083                          }
15074 15084                          opt[IPOPT_OFFSET] = IPOPT_MINOFF_SR;
15075 15085                          break;
15076 15086                  }
15077 15087          }
15078 15088  }
15079 15089  
15080 15090  /*
15081 15091   * Returns NULL if no routing header
15082 15092   */
15083 15093  in6_addr_t *
15084 15094  ip_pkt_source_route_v6(const ip_pkt_t *ipp)
15085 15095  {
15086 15096          in6_addr_t      *nexthop = NULL;
15087 15097          ip6_rthdr0_t    *rthdr;
15088 15098  
15089 15099          if (!(ipp->ipp_fields & IPPF_RTHDR))
15090 15100                  return (NULL);
15091 15101  
15092 15102          rthdr = (ip6_rthdr0_t *)ipp->ipp_rthdr;
15093 15103          if (rthdr->ip6r0_segleft == 0)
15094 15104                  return (NULL);
15095 15105  
15096 15106          nexthop = (in6_addr_t *)((char *)rthdr + sizeof (*rthdr));
15097 15107          return (nexthop);
15098 15108  }
15099 15109  
15100 15110  zoneid_t
15101 15111  ip_get_zoneid_v4(ipaddr_t addr, mblk_t *mp, ip_recv_attr_t *ira,
15102 15112      zoneid_t lookup_zoneid)
15103 15113  {
15104 15114          ip_stack_t      *ipst = ira->ira_ill->ill_ipst;
15105 15115          ire_t           *ire;
15106 15116          int             ire_flags = MATCH_IRE_TYPE;
15107 15117          zoneid_t        zoneid = ALL_ZONES;
15108 15118  
15109 15119          if (is_system_labeled() && !tsol_can_accept_raw(mp, ira, B_FALSE))
15110 15120                  return (ALL_ZONES);
15111 15121  
15112 15122          if (lookup_zoneid != ALL_ZONES)
15113 15123                  ire_flags |= MATCH_IRE_ZONEONLY;
15114 15124          ire = ire_ftable_lookup_v4(addr, NULL, NULL, IRE_LOCAL | IRE_LOOPBACK,
15115 15125              NULL, lookup_zoneid, NULL, ire_flags, 0, ipst, NULL);
15116 15126          if (ire != NULL) {
15117 15127                  zoneid = IP_REAL_ZONEID(ire->ire_zoneid, ipst);
15118 15128                  ire_refrele(ire);
15119 15129          }
15120 15130          return (zoneid);
15121 15131  }
15122 15132  
15123 15133  zoneid_t
15124 15134  ip_get_zoneid_v6(in6_addr_t *addr, mblk_t *mp, const ill_t *ill,
15125 15135      ip_recv_attr_t *ira, zoneid_t lookup_zoneid)
15126 15136  {
15127 15137          ip_stack_t      *ipst = ira->ira_ill->ill_ipst;
15128 15138          ire_t           *ire;
15129 15139          int             ire_flags = MATCH_IRE_TYPE;
15130 15140          zoneid_t        zoneid = ALL_ZONES;
15131 15141  
15132 15142          if (is_system_labeled() && !tsol_can_accept_raw(mp, ira, B_FALSE))
15133 15143                  return (ALL_ZONES);
15134 15144  
15135 15145          if (IN6_IS_ADDR_LINKLOCAL(addr))
15136 15146                  ire_flags |= MATCH_IRE_ILL;
15137 15147  
15138 15148          if (lookup_zoneid != ALL_ZONES)
15139 15149                  ire_flags |= MATCH_IRE_ZONEONLY;
15140 15150          ire = ire_ftable_lookup_v6(addr, NULL, NULL, IRE_LOCAL | IRE_LOOPBACK,
15141 15151              ill, lookup_zoneid, NULL, ire_flags, 0, ipst, NULL);
15142 15152          if (ire != NULL) {
15143 15153                  zoneid = IP_REAL_ZONEID(ire->ire_zoneid, ipst);
15144 15154                  ire_refrele(ire);
15145 15155          }
15146 15156          return (zoneid);
15147 15157  }
15148 15158  
15149 15159  /*
15150 15160   * IP obserability hook support functions.
15151 15161   */
15152 15162  static void
15153 15163  ipobs_init(ip_stack_t *ipst)
15154 15164  {
15155 15165          netid_t id;
15156 15166  
15157 15167          id = net_getnetidbynetstackid(ipst->ips_netstack->netstack_stackid);
15158 15168  
15159 15169          ipst->ips_ip4_observe_pr = net_protocol_lookup(id, NHF_INET);
15160 15170          VERIFY(ipst->ips_ip4_observe_pr != NULL);
15161 15171  
15162 15172          ipst->ips_ip6_observe_pr = net_protocol_lookup(id, NHF_INET6);
15163 15173          VERIFY(ipst->ips_ip6_observe_pr != NULL);
15164 15174  }
15165 15175  
15166 15176  static void
15167 15177  ipobs_fini(ip_stack_t *ipst)
15168 15178  {
15169 15179  
15170 15180          VERIFY(net_protocol_release(ipst->ips_ip4_observe_pr) == 0);
15171 15181          VERIFY(net_protocol_release(ipst->ips_ip6_observe_pr) == 0);
15172 15182  }
15173 15183  
15174 15184  /*
15175 15185   * hook_pkt_observe_t is composed in network byte order so that the
15176 15186   * entire mblk_t chain handed into hook_run can be used as-is.
15177 15187   * The caveat is that use of the fields, such as the zone fields,
15178 15188   * requires conversion into host byte order first.
15179 15189   */
15180 15190  void
15181 15191  ipobs_hook(mblk_t *mp, int htype, zoneid_t zsrc, zoneid_t zdst,
15182 15192      const ill_t *ill, ip_stack_t *ipst)
15183 15193  {
15184 15194          hook_pkt_observe_t *hdr;
15185 15195          uint64_t grifindex;
15186 15196          mblk_t *imp;
15187 15197  
15188 15198          imp = allocb(sizeof (*hdr), BPRI_HI);
15189 15199          if (imp == NULL)
15190 15200                  return;
15191 15201  
15192 15202          hdr = (hook_pkt_observe_t *)imp->b_rptr;
15193 15203          /*
15194 15204           * b_wptr is set to make the apparent size of the data in the mblk_t
15195 15205           * to exclude the pointers at the end of hook_pkt_observer_t.
15196 15206           */
15197 15207          imp->b_wptr = imp->b_rptr + sizeof (dl_ipnetinfo_t);
15198 15208          imp->b_cont = mp;
15199 15209  
15200 15210          ASSERT(DB_TYPE(mp) == M_DATA);
15201 15211  
15202 15212          if (IS_UNDER_IPMP(ill))
15203 15213                  grifindex = ipmp_ill_get_ipmp_ifindex(ill);
15204 15214          else
15205 15215                  grifindex = 0;
15206 15216  
15207 15217          hdr->hpo_version = 1;
15208 15218          hdr->hpo_htype = htons(htype);
15209 15219          hdr->hpo_pktlen = htonl((ulong_t)msgdsize(mp));
15210 15220          hdr->hpo_ifindex = htonl(ill->ill_phyint->phyint_ifindex);
15211 15221          hdr->hpo_grifindex = htonl(grifindex);
15212 15222          hdr->hpo_zsrc = htonl(zsrc);
15213 15223          hdr->hpo_zdst = htonl(zdst);
15214 15224          hdr->hpo_pkt = imp;
15215 15225          hdr->hpo_ctx = ipst->ips_netstack;
15216 15226  
15217 15227          if (ill->ill_isv6) {
15218 15228                  hdr->hpo_family = AF_INET6;
15219 15229                  (void) hook_run(ipst->ips_ipv6_net_data->netd_hooks,
15220 15230                      ipst->ips_ipv6observing, (hook_data_t)hdr);
15221 15231          } else {
15222 15232                  hdr->hpo_family = AF_INET;
15223 15233                  (void) hook_run(ipst->ips_ipv4_net_data->netd_hooks,
15224 15234                      ipst->ips_ipv4observing, (hook_data_t)hdr);
15225 15235          }
15226 15236  
15227 15237          imp->b_cont = NULL;
15228 15238          freemsg(imp);
15229 15239  }
15230 15240  
15231 15241  /*
15232 15242   * Utility routine that checks if `v4srcp' is a valid address on underlying
15233 15243   * interface `ill'.  If `ipifp' is non-NULL, it's set to a held ipif
15234 15244   * associated with `v4srcp' on success.  NOTE: if this is not called from
15235 15245   * inside the IPSQ (ill_g_lock is not held), `ill' may be removed from the
15236 15246   * group during or after this lookup.
15237 15247   */
15238 15248  boolean_t
15239 15249  ipif_lookup_testaddr_v4(ill_t *ill, const in_addr_t *v4srcp, ipif_t **ipifp)
15240 15250  {
15241 15251          ipif_t *ipif;
15242 15252  
15243 15253          ipif = ipif_lookup_addr_exact(*v4srcp, ill, ill->ill_ipst);
15244 15254          if (ipif != NULL) {
15245 15255                  if (ipifp != NULL)
15246 15256                          *ipifp = ipif;
15247 15257                  else
15248 15258                          ipif_refrele(ipif);
15249 15259                  return (B_TRUE);
15250 15260          }
15251 15261  
15252 15262          ip1dbg(("ipif_lookup_testaddr_v4: cannot find ipif for src %x\n",
15253 15263              *v4srcp));
15254 15264          return (B_FALSE);
15255 15265  }
15256 15266  
15257 15267  /*
15258 15268   * Transport protocol call back function for CPU state change.
15259 15269   */
15260 15270  /* ARGSUSED */
15261 15271  static int
15262 15272  ip_tp_cpu_update(cpu_setup_t what, int id, void *arg)
15263 15273  {
15264 15274          processorid_t cpu_seqid;
15265 15275          netstack_handle_t nh;
15266 15276          netstack_t *ns;
15267 15277  
15268 15278          ASSERT(MUTEX_HELD(&cpu_lock));
15269 15279  
15270 15280          switch (what) {
15271 15281          case CPU_CONFIG:
15272 15282          case CPU_ON:
15273 15283          case CPU_INIT:
15274 15284          case CPU_CPUPART_IN:
15275 15285                  cpu_seqid = cpu[id]->cpu_seqid;
15276 15286                  netstack_next_init(&nh);
15277 15287                  while ((ns = netstack_next(&nh)) != NULL) {
15278 15288                          tcp_stack_cpu_add(ns->netstack_tcp, cpu_seqid);
15279 15289                          sctp_stack_cpu_add(ns->netstack_sctp, cpu_seqid);
15280 15290                          udp_stack_cpu_add(ns->netstack_udp, cpu_seqid);
15281 15291                          netstack_rele(ns);
15282 15292                  }
15283 15293                  netstack_next_fini(&nh);
15284 15294                  break;
15285 15295          case CPU_UNCONFIG:
15286 15296          case CPU_OFF:
15287 15297          case CPU_CPUPART_OUT:
15288 15298                  /*
15289 15299                   * Nothing to do.  We don't remove the per CPU stats from
15290 15300                   * the IP stack even when the CPU goes offline.
15291 15301                   */
15292 15302                  break;
15293 15303          default:
15294 15304                  break;
15295 15305          }
15296 15306          return (0);
15297 15307  }

↓ open down ↓

5565 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX