Print this page
OS-7184 prototype
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/inet/ip/ip_if.c
+++ new/usr/src/uts/common/inet/ip/ip_if.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 1990 Mentat Inc.
24 24 * Copyright (c) 2013 by Delphix. All rights reserved.
25 25 * Copyright 2019 Joyent, Inc.
26 26 * Copyright (c) 2014, OmniTI Computer Consulting, Inc. All rights reserved.
27 27 */
28 28
29 29 /*
30 30 * This file contains the interface control functions for IP.
31 31 */
32 32
33 33 #include <sys/types.h>
34 34 #include <sys/stream.h>
35 35 #include <sys/dlpi.h>
36 36 #include <sys/stropts.h>
37 37 #include <sys/strsun.h>
38 38 #include <sys/sysmacros.h>
39 39 #include <sys/strsubr.h>
40 40 #include <sys/strlog.h>
41 41 #include <sys/ddi.h>
42 42 #include <sys/sunddi.h>
43 43 #include <sys/cmn_err.h>
44 44 #include <sys/kstat.h>
45 45 #include <sys/debug.h>
46 46 #include <sys/zone.h>
47 47 #include <sys/sunldi.h>
48 48 #include <sys/file.h>
49 49 #include <sys/bitmap.h>
50 50 #include <sys/cpuvar.h>
51 51 #include <sys/time.h>
52 52 #include <sys/ctype.h>
53 53 #include <sys/kmem.h>
54 54 #include <sys/systm.h>
55 55 #include <sys/param.h>
56 56 #include <sys/socket.h>
57 57 #include <sys/isa_defs.h>
58 58 #include <net/if.h>
59 59 #include <net/if_arp.h>
60 60 #include <net/if_types.h>
61 61 #include <net/if_dl.h>
62 62 #include <net/route.h>
63 63 #include <sys/sockio.h>
64 64 #include <netinet/in.h>
65 65 #include <netinet/ip6.h>
66 66 #include <netinet/icmp6.h>
67 67 #include <netinet/igmp_var.h>
68 68 #include <sys/policy.h>
69 69 #include <sys/ethernet.h>
70 70 #include <sys/callb.h>
71 71 #include <sys/md5.h>
72 72
73 73 #include <inet/common.h> /* for various inet/mi.h and inet/nd.h needs */
74 74 #include <inet/mi.h>
75 75 #include <inet/nd.h>
76 76 #include <inet/tunables.h>
77 77 #include <inet/arp.h>
78 78 #include <inet/ip_arp.h>
79 79 #include <inet/mib2.h>
80 80 #include <inet/ip.h>
81 81 #include <inet/ip6.h>
82 82 #include <inet/ip6_asp.h>
83 83 #include <inet/tcp.h>
84 84 #include <inet/ip_multi.h>
85 85 #include <inet/ip_ire.h>
86 86 #include <inet/ip_ftable.h>
87 87 #include <inet/ip_rts.h>
88 88 #include <inet/ip_ndp.h>
89 89 #include <inet/ip_if.h>
90 90 #include <inet/ip_impl.h>
91 91 #include <inet/sctp_ip.h>
92 92 #include <inet/ip_netinfo.h>
93 93 #include <inet/ilb_ip.h>
94 94
95 95 #include <netinet/igmp.h>
96 96 #include <inet/ip_listutils.h>
97 97 #include <inet/ipclassifier.h>
98 98 #include <sys/mac_client.h>
99 99 #include <sys/dld.h>
100 100 #include <sys/mac_flow.h>
101 101
102 102 #include <sys/systeminfo.h>
103 103 #include <sys/bootconf.h>
104 104
105 105 #include <sys/tsol/tndb.h>
106 106 #include <sys/tsol/tnet.h>
107 107
108 108 #include <inet/rawip_impl.h> /* needed for icmp_stack_t */
109 109 #include <inet/udp_impl.h> /* needed for udp_stack_t */
110 110
111 111 /* The character which tells where the ill_name ends */
112 112 #define IPIF_SEPARATOR_CHAR ':'
113 113
114 114 /* IP ioctl function table entry */
115 115 typedef struct ipft_s {
116 116 int ipft_cmd;
117 117 pfi_t ipft_pfi;
118 118 int ipft_min_size;
119 119 int ipft_flags;
120 120 } ipft_t;
121 121 #define IPFT_F_NO_REPLY 0x1 /* IP ioctl does not expect any reply */
122 122 #define IPFT_F_SELF_REPLY 0x2 /* ioctl callee does the ioctl reply */
123 123
124 124 static int nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *);
125 125 static int nd_ill_forward_set(queue_t *q, mblk_t *mp,
126 126 char *value, caddr_t cp, cred_t *ioc_cr);
127 127
128 128 static boolean_t ill_is_quiescent(ill_t *);
129 129 static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask);
130 130 static ip_m_t *ip_m_lookup(t_uscalar_t mac_type);
131 131 static int ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
132 132 mblk_t *mp, boolean_t need_up);
133 133 static int ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
134 134 mblk_t *mp, boolean_t need_up);
135 135 static int ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
136 136 queue_t *q, mblk_t *mp, boolean_t need_up);
137 137 static int ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q,
138 138 mblk_t *mp);
139 139 static int ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
140 140 mblk_t *mp);
141 141 static int ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t,
142 142 queue_t *q, mblk_t *mp, boolean_t need_up);
143 143 static int ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp,
144 144 int ioccmd, struct linkblk *li);
145 145 static ipaddr_t ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *);
146 146 static void ip_wput_ioctl(queue_t *q, mblk_t *mp);
147 147 static void ipsq_flush(ill_t *ill);
148 148
149 149 static int ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
150 150 queue_t *q, mblk_t *mp, boolean_t need_up);
151 151 static void ipsq_delete(ipsq_t *);
152 152
153 153 static ipif_t *ipif_allocate(ill_t *ill, int id, uint_t ire_type,
154 154 boolean_t initialize, boolean_t insert, int *errorp);
155 155 static ire_t **ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep);
156 156 static void ipif_delete_bcast_ires(ipif_t *ipif);
157 157 static int ipif_add_ires_v4(ipif_t *, boolean_t);
158 158 static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif,
159 159 boolean_t isv6);
160 160 static int ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
161 161 static void ipif_free(ipif_t *ipif);
162 162 static void ipif_free_tail(ipif_t *ipif);
163 163 static void ipif_set_default(ipif_t *ipif);
164 164 static int ipif_set_values(queue_t *q, mblk_t *mp,
165 165 char *interf_name, uint_t *ppa);
166 166 static int ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
167 167 queue_t *q);
168 168 static ipif_t *ipif_lookup_on_name(char *name, size_t namelen,
169 169 boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid,
170 170 ip_stack_t *);
171 171 static ipif_t *ipif_lookup_on_name_async(char *name, size_t namelen,
172 172 boolean_t isv6, zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func,
173 173 int *error, ip_stack_t *);
174 174
175 175 static int ill_alloc_ppa(ill_if_t *, ill_t *);
176 176 static void ill_delete_interface_type(ill_if_t *);
177 177 static int ill_dl_up(ill_t *ill, ipif_t *ipif);
178 178 static void ill_dl_down(ill_t *ill);
179 179 static void ill_down(ill_t *ill);
180 180 static void ill_down_ipifs(ill_t *, boolean_t);
181 181 static void ill_free_mib(ill_t *ill);
182 182 static void ill_glist_delete(ill_t *);
183 183 static void ill_phyint_reinit(ill_t *ill);
184 184 static void ill_set_nce_router_flags(ill_t *, boolean_t);
185 185 static void ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *);
186 186 static void ill_replumb_tail(ipsq_t *, queue_t *, mblk_t *, void *);
187 187
188 188 static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid;
189 189 static ip_v6intfid_func_t ip_ipv4_v6intfid, ip_ipv6_v6intfid;
190 190 static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid;
191 191 static ip_v6intfid_func_t ip_ipv4_v6destintfid, ip_ipv6_v6destintfid;
192 192 static ip_v4mapinfo_func_t ip_ether_v4_mapping;
193 193 static ip_v6mapinfo_func_t ip_ether_v6_mapping;
194 194 static ip_v4mapinfo_func_t ip_ib_v4_mapping;
195 195 static ip_v6mapinfo_func_t ip_ib_v6_mapping;
196 196 static ip_v4mapinfo_func_t ip_mbcast_mapping;
197 197 static void ip_cgtp_bcast_add(ire_t *, ip_stack_t *);
198 198 static void ip_cgtp_bcast_delete(ire_t *, ip_stack_t *);
199 199 static void phyint_free(phyint_t *);
200 200
201 201 static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *);
202 202 static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
203 203 static void ill_capability_vrrp_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
204 204 static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
205 205 static void ill_capability_hcksum_reset_fill(ill_t *, mblk_t *);
206 206 static void ill_capability_zerocopy_ack(ill_t *, mblk_t *,
207 207 dl_capability_sub_t *);
208 208 static void ill_capability_zerocopy_reset_fill(ill_t *, mblk_t *);
209 209 static void ill_capability_dld_reset_fill(ill_t *, mblk_t *);
210 210 static void ill_capability_dld_ack(ill_t *, mblk_t *,
211 211 dl_capability_sub_t *);
212 212 static void ill_capability_dld_enable(ill_t *);
213 213 static void ill_capability_ack_thr(void *);
214 214 static void ill_capability_lso_enable(ill_t *);
215 215
216 216 static ill_t *ill_prev_usesrc(ill_t *);
217 217 static int ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
218 218 static void ill_disband_usesrc_group(ill_t *);
219 219 static void ip_sioctl_garp_reply(mblk_t *, ill_t *, void *, int);
220 220
221 221 #ifdef DEBUG
222 222 static void ill_trace_cleanup(const ill_t *);
223 223 static void ipif_trace_cleanup(const ipif_t *);
224 224 #endif
225 225
226 226 static void ill_dlpi_clear_deferred(ill_t *ill);
227 227
228 228 static void phyint_flags_init(phyint_t *, t_uscalar_t);
229 229
230 230 /*
231 231 * if we go over the memory footprint limit more than once in this msec
232 232 * interval, we'll start pruning aggressively.
233 233 */
234 234 int ip_min_frag_prune_time = 0;
235 235
/*
 * Dispatch table for IP-private ioctls: command, handler, minimum payload
 * size, and IPFT_F_* flags describing who generates the reply (presumably
 * consumed by ip_wput_ioctl() — verify against its definition).  The
 * zero-filled entry terminates the table.
 */
static ipft_t ip_ioctl_ftbl[] = {
	{ IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 },
	{ IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t),
		IPFT_F_NO_REPLY },
	{ IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY },
	{ 0 }
};
243 243
244 244 /* Simple ICMP IP Header Template */
245 245 static ipha_t icmp_ipha = {
246 246 IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
247 247 };
248 248
249 249 static uchar_t ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
250 250
/*
 * Media table: one entry per DLPI mac type.  Each row gives the DLPI type,
 * the corresponding ifnet interface type (IFT_*), the v4 and v6 SAP values,
 * and the per-medium functions for v4/v6 multicast address mapping and for
 * deriving the IPv6 interface-id (and, for tunnels, the destination
 * interface-id).  Looked up by mac type (see ip_m_lookup()); the DL_OTHER
 * row serves as the catch-all default.
 */
static ip_m_t   ip_m_tbl[] = {
	{ DL_ETHER, IFT_ETHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_CSMACD, IFT_ISO88023, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_TPB, IFT_ISO88024, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_TPR, IFT_ISO88025, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_FDDI, IFT_FDDI, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_IB, IFT_IB, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ib_v4_mapping, ip_ib_v6_mapping, ip_ib_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_IPV4, IFT_IPV4, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
	    ip_ipv4_v6destintfid },
	{ DL_IPV6, IFT_IPV6, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv6_v6intfid,
	    ip_ipv6_v6destintfid },
	{ DL_6TO4, IFT_6TO4, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
	    ip_nodef_v6intfid },
	{ SUNW_DL_VNI, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    NULL, NULL, ip_nodef_v6intfid, ip_nodef_v6intfid },
	{ SUNW_DL_IPMP, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    NULL, NULL, ip_ipmp_v6intfid, ip_nodef_v6intfid },
	{ DL_OTHER, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid }
};
287 287
288 288 char ipif_loopback_name[] = "lo0";
289 289
290 290 /* These are used by all IP network modules. */
291 291 sin6_t sin6_null; /* Zero address for quick clears */
292 292 sin_t sin_null; /* Zero address for quick clears */
293 293
294 294 /* When set search for unused ipif_seqid */
295 295 static ipif_t ipif_zero;
296 296
297 297 /*
298 298 * ppa arena is created after these many
299 299 * interfaces have been plumbed.
300 300 */
301 301 uint_t ill_no_arena = 12; /* Setable in /etc/system */
302 302
303 303 /*
304 304 * Allocate per-interface mibs.
305 305 * Returns true if ok. False otherwise.
306 306 * ipsq may not yet be allocated (loopback case ).
307 307 */
308 308 static boolean_t
309 309 ill_allocate_mibs(ill_t *ill)
310 310 {
311 311 /* Already allocated? */
312 312 if (ill->ill_ip_mib != NULL) {
313 313 if (ill->ill_isv6)
314 314 ASSERT(ill->ill_icmp6_mib != NULL);
315 315 return (B_TRUE);
316 316 }
317 317
318 318 ill->ill_ip_mib = kmem_zalloc(sizeof (*ill->ill_ip_mib),
319 319 KM_NOSLEEP);
320 320 if (ill->ill_ip_mib == NULL) {
321 321 return (B_FALSE);
322 322 }
323 323
324 324 /* Setup static information */
325 325 SET_MIB(ill->ill_ip_mib->ipIfStatsEntrySize,
326 326 sizeof (mib2_ipIfStatsEntry_t));
327 327 if (ill->ill_isv6) {
328 328 ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6;
329 329 SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
330 330 sizeof (mib2_ipv6AddrEntry_t));
331 331 SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
332 332 sizeof (mib2_ipv6RouteEntry_t));
333 333 SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
334 334 sizeof (mib2_ipv6NetToMediaEntry_t));
335 335 SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
336 336 sizeof (ipv6_member_t));
337 337 SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
338 338 sizeof (ipv6_grpsrc_t));
339 339 } else {
340 340 ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4;
341 341 SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
342 342 sizeof (mib2_ipAddrEntry_t));
343 343 SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
344 344 sizeof (mib2_ipRouteEntry_t));
345 345 SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
346 346 sizeof (mib2_ipNetToMediaEntry_t));
347 347 SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
348 348 sizeof (ip_member_t));
349 349 SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
350 350 sizeof (ip_grpsrc_t));
351 351
352 352 /*
353 353 * For a v4 ill, we are done at this point, because per ill
354 354 * icmp mibs are only used for v6.
355 355 */
356 356 return (B_TRUE);
357 357 }
358 358
359 359 ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib),
360 360 KM_NOSLEEP);
361 361 if (ill->ill_icmp6_mib == NULL) {
362 362 kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
363 363 ill->ill_ip_mib = NULL;
364 364 return (B_FALSE);
365 365 }
366 366 /* static icmp info */
367 367 ill->ill_icmp6_mib->ipv6IfIcmpEntrySize =
368 368 sizeof (mib2_ipv6IfIcmpEntry_t);
369 369 /*
370 370 * The ipIfStatsIfindex and ipv6IfIcmpIndex will be assigned later
371 371 * after the phyint merge occurs in ipif_set_values -> ill_glist_insert
372 372 * -> ill_phyint_reinit
373 373 */
374 374 return (B_TRUE);
375 375 }
376 376
/*
 * Completely vaporize a lower level tap and all associated interfaces.
 * ill_delete is called only out of ip_close when the device control
 * stream is being closed.
 */
void
ill_delete(ill_t *ill)
{
	ipif_t	*ipif;
	ill_t	*prev_ill;
	ip_stack_t	*ipst = ill->ill_ipst;

	/*
	 * ill_delete may be forcibly entering the ipsq. The previous
	 * ioctl may not have completed and may need to be aborted.
	 * ipsq_flush takes care of it. If we don't need to enter the
	 * the ipsq forcibly, the 2nd invocation of ipsq_flush in
	 * ill_delete_tail is sufficient.
	 */
	ipsq_flush(ill);

	/*
	 * Nuke all interfaces. ipif_free will take down the interface,
	 * remove it from the list, and free the data structure.
	 * Walk down the ipif list and remove the logical interfaces
	 * first before removing the main ipif. We can't unplumb
	 * zeroth interface first in the case of IPv6 as update_conn_ill
	 * -> ip_ll_multireq de-references ill_ipif for checking
	 * POINTOPOINT.
	 *
	 * If ill_ipif was not properly initialized (i.e low on memory),
	 * then no interfaces to clean up. In this case just clean up the
	 * ill.
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		ipif_free(ipif);

	/*
	 * clean out all the nce_t entries that depend on this
	 * ill for the ill_phys_addr.
	 */
	nce_flush(ill, B_TRUE);

	/* Clean up msgs on pending upcalls for mrouted */
	reset_mrt_ill(ill);

	/* Clean up conn-level references to this ill. */
	update_conn_ill(ill, ipst);

	/*
	 * Remove multicast references added as a result of calls to
	 * ip_join_allmulti().
	 */
	ip_purge_allmulti(ill);

	/*
	 * If the ill being deleted is under IPMP, boot it out of the illgrp.
	 */
	if (IS_UNDER_IPMP(ill))
		ipmp_ill_leave_illgrp(ill);

	/*
	 * ill_down will arrange to blow off any IRE's dependent on this
	 * ILL, and shut down fragmentation reassembly.
	 */
	ill_down(ill);

	/* Let SCTP know, so that it can remove this from its list. */
	sctp_update_ill(ill, SCTP_ILL_REMOVE);

	/*
	 * Walk all CONNs that can have a reference on an ire or nce for this
	 * ill (we actually walk all that now have stale references).
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst);

	/* With IPv6 we have dce_ifindex. Cleanup for neatness */
	if (ill->ill_isv6)
		dce_cleanup(ill->ill_phyint->phyint_ifindex, ipst);

	/*
	 * If an address on this ILL is being used as a source address then
	 * clear out the pointers in other ILLs that point to this ILL.
	 * The usesrc list is a singly linked list; unlinking requires the
	 * global usesrc lock held as writer.
	 */
	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
	if (ill->ill_usesrc_grp_next != NULL) {
		if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */
			ill_disband_usesrc_group(ill);
		} else {	/* consumer of the usesrc ILL */
			/* Splice this ill out of the usesrc chain. */
			prev_ill = ill_prev_usesrc(ill);
			prev_ill->ill_usesrc_grp_next =
			    ill->ill_usesrc_grp_next;
		}
	}
	rw_exit(&ipst->ips_ill_g_usesrc_lock);
}
472 472
473 473 static void
474 474 ipif_non_duplicate(ipif_t *ipif)
475 475 {
476 476 ill_t *ill = ipif->ipif_ill;
477 477 mutex_enter(&ill->ill_lock);
478 478 if (ipif->ipif_flags & IPIF_DUPLICATE) {
479 479 ipif->ipif_flags &= ~IPIF_DUPLICATE;
480 480 ASSERT(ill->ill_ipif_dup_count > 0);
481 481 ill->ill_ipif_dup_count--;
482 482 }
483 483 mutex_exit(&ill->ill_lock);
484 484 }
485 485
/*
 * ill_delete_tail is called from ip_modclose after all references
 * to the closing ill are gone. The wait is done in ip_modclose
 */
void
ill_delete_tail(ill_t *ill)
{
	mblk_t	**mpp;
	ipif_t	*ipif;
	ip_stack_t *ipst = ill->ill_ipst;

	/* Finish taking down each ipif (ill_delete started the process). */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		(void) ipif_down_tail(ipif);
	}

	/* ipif_non_duplicate() above cleared every duplicate ipif. */
	ASSERT(ill->ill_ipif_dup_count == 0);

	/*
	 * If polling capability is enabled (which signifies direct
	 * upcall into IP and driver has ill saved as a handle),
	 * we need to make sure that unbind has completed before we
	 * let the ill disappear and driver no longer has any reference
	 * to this ill.
	 */
	mutex_enter(&ill->ill_lock);
	while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS)
		cv_wait(&ill->ill_cv, &ill->ill_lock);
	mutex_exit(&ill->ill_lock);
	ASSERT(!(ill->ill_capabilities &
	    (ILL_CAPAB_DLD | ILL_CAPAB_DLD_POLL | ILL_CAPAB_DLD_DIRECT)));

	/* Loopback ills presumably have no driver read queue to turn off. */
	if (ill->ill_net_type != IRE_LOOPBACK)
		qprocsoff(ill->ill_rq);

	/*
	 * We do an ipsq_flush once again now. New messages could have
	 * landed up from below (M_ERROR or M_HANGUP). Similarly ioctls
	 * could also have landed up if an ioctl thread had looked up
	 * the ill before we set the ILL_CONDEMNED flag, but not yet
	 * enqueued the ioctl when we did the ipsq_flush last time.
	 */
	ipsq_flush(ill);

	/*
	 * Free capabilities.
	 */
	if (ill->ill_hcksum_capab != NULL) {
		kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t));
		ill->ill_hcksum_capab = NULL;
	}

	if (ill->ill_zerocopy_capab != NULL) {
		kmem_free(ill->ill_zerocopy_capab,
		    sizeof (ill_zerocopy_capab_t));
		ill->ill_zerocopy_capab = NULL;
	}

	if (ill->ill_lso_capab != NULL) {
		kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
		ill->ill_lso_capab = NULL;
	}

	if (ill->ill_dld_capab != NULL) {
		kmem_free(ill->ill_dld_capab, sizeof (ill_dld_capab_t));
		ill->ill_dld_capab = NULL;
	}

	/* Clean up ill_allowed_ips* related state */
	if (ill->ill_allowed_ips != NULL) {
		ASSERT(ill->ill_allowed_ips_cnt > 0);
		kmem_free(ill->ill_allowed_ips,
		    ill->ill_allowed_ips_cnt * sizeof (in6_addr_t));
		ill->ill_allowed_ips = NULL;
		ill->ill_allowed_ips_cnt = 0;
	}

	/* ipif_free_tail() unlinks the ipif from ill_ipif as it frees it. */
	while (ill->ill_ipif != NULL)
		ipif_free_tail(ill->ill_ipif);

	/*
	 * We have removed all references to ilm from conn and the ones joined
	 * within the kernel.
	 *
	 * We don't walk conns, mrts and ires because
	 *
	 * 1) update_conn_ill and reset_mrt_ill cleans up conns and mrts.
	 * 2) ill_down ->ill_downi walks all the ires and cleans up
	 *    ill references.
	 */

	/*
	 * If this ill is an IPMP meta-interface, blow away the illgrp. This
	 * is safe to do because the illgrp has already been unlinked from the
	 * group by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find it.
	 */
	if (IS_IPMP(ill)) {
		ipmp_illgrp_destroy(ill->ill_grp);
		ill->ill_grp = NULL;
	}

	/* Free the chain of recorded physical addresses, if any. */
	if (ill->ill_mphysaddr_list != NULL) {
		multiphysaddr_t *mpa, *tmpa;

		mpa = ill->ill_mphysaddr_list;
		ill->ill_mphysaddr_list = NULL;
		while (mpa) {
			tmpa = mpa->mpa_next;
			kmem_free(mpa, sizeof (*mpa));
			mpa = tmpa;
		}
	}
	/*
	 * Take us out of the list of ILLs. ill_glist_delete -> phyint_free
	 * could free the phyint. No more reference to the phyint after this
	 * point.
	 */
	(void) ill_glist_delete(ill);

	/* Tear down the fragment reassembly hash table and its locks. */
	if (ill->ill_frag_ptr != NULL) {
		uint_t count;

		for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
			mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock);
		}
		mi_free(ill->ill_frag_ptr);
		ill->ill_frag_ptr = NULL;
		ill->ill_frag_hash_tbl = NULL;
	}

	freemsg(ill->ill_nd_lla_mp);
	/*
	 * Free all retained control messages.  The retained mblks sit on
	 * the chains between ill_first_mp_to_free and ill_last_mp_to_free;
	 * clear b_next/b_prev on each before freemsg so no stale chain
	 * pointers survive.
	 */
	mpp = &ill->ill_first_mp_to_free;
	do {
		while (mpp[0]) {
			mblk_t  *mp;
			mblk_t  *mp1;

			mp = mpp[0];
			mpp[0] = mp->b_next;
			for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
				mp1->b_next = NULL;
				mp1->b_prev = NULL;
			}
			freemsg(mp);
		}
	} while (mpp++ != &ill->ill_last_mp_to_free);

	/* Roll the interface counters into the global MIBs, then free. */
	ill_free_mib(ill);

#ifdef DEBUG
	ill_trace_cleanup(ill);
#endif

	/* The default multicast interface might have changed */
	ire_increment_multicast_generation(ipst, ill->ill_isv6);

	/* Drop refcnt here */
	netstack_rele(ill->ill_ipst->ips_netstack);
	ill->ill_ipst = NULL;
}
647 647
648 648 static void
649 649 ill_free_mib(ill_t *ill)
650 650 {
651 651 ip_stack_t *ipst = ill->ill_ipst;
652 652
653 653 /*
654 654 * MIB statistics must not be lost, so when an interface
655 655 * goes away the counter values will be added to the global
656 656 * MIBs.
657 657 */
658 658 if (ill->ill_ip_mib != NULL) {
659 659 if (ill->ill_isv6) {
660 660 ip_mib2_add_ip_stats(&ipst->ips_ip6_mib,
661 661 ill->ill_ip_mib);
662 662 } else {
663 663 ip_mib2_add_ip_stats(&ipst->ips_ip_mib,
664 664 ill->ill_ip_mib);
665 665 }
666 666
667 667 kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
668 668 ill->ill_ip_mib = NULL;
669 669 }
670 670 if (ill->ill_icmp6_mib != NULL) {
671 671 ip_mib2_add_icmp6_stats(&ipst->ips_icmp6_mib,
672 672 ill->ill_icmp6_mib);
673 673 kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib));
674 674 ill->ill_icmp6_mib = NULL;
675 675 }
676 676 }
677 677
678 678 /*
679 679 * Concatenate together a physical address and a sap.
680 680 *
681 681 * Sap_lengths are interpreted as follows:
682 682 * sap_length == 0 ==> no sap
683 683 * sap_length > 0 ==> sap is at the head of the dlpi address
684 684 * sap_length < 0 ==> sap is at the tail of the dlpi address
685 685 */
686 686 static void
687 687 ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length,
688 688 t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst)
689 689 {
690 690 uint16_t sap_addr = (uint16_t)sap_src;
691 691
692 692 if (sap_length == 0) {
693 693 if (phys_src == NULL)
694 694 bzero(dst, phys_length);
695 695 else
696 696 bcopy(phys_src, dst, phys_length);
697 697 } else if (sap_length < 0) {
698 698 if (phys_src == NULL)
699 699 bzero(dst, phys_length);
700 700 else
701 701 bcopy(phys_src, dst, phys_length);
702 702 bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr));
703 703 } else {
704 704 bcopy(&sap_addr, dst, sizeof (sap_addr));
705 705 if (phys_src == NULL)
706 706 bzero((char *)dst + sap_length, phys_length);
707 707 else
708 708 bcopy(phys_src, (char *)dst + sap_length, phys_length);
709 709 }
710 710 }
711 711
712 712 /*
713 713 * Generate a dl_unitdata_req mblk for the device and address given.
714 714 * addr_length is the length of the physical portion of the address.
715 715 * If addr is NULL include an all zero address of the specified length.
716 716 * TRUE? In any case, addr_length is taken to be the entire length of the
717 717 * dlpi address, including the absolute value of sap_length.
718 718 */
719 719 mblk_t *
720 720 ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap,
721 721 t_scalar_t sap_length)
722 722 {
723 723 dl_unitdata_req_t *dlur;
724 724 mblk_t *mp;
725 725 t_scalar_t abs_sap_length; /* absolute value */
726 726
727 727 abs_sap_length = ABS(sap_length);
728 728 mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length,
729 729 DL_UNITDATA_REQ);
730 730 if (mp == NULL)
731 731 return (NULL);
732 732 dlur = (dl_unitdata_req_t *)mp->b_rptr;
733 733 /* HACK: accomodate incompatible DLPI drivers */
734 734 if (addr_length == 8)
735 735 addr_length = 6;
736 736 dlur->dl_dest_addr_length = addr_length + abs_sap_length;
737 737 dlur->dl_dest_addr_offset = sizeof (*dlur);
738 738 dlur->dl_priority.dl_min = 0;
739 739 dlur->dl_priority.dl_max = 0;
740 740 ill_dlur_copy_address(addr, addr_length, sap, sap_length,
741 741 (uchar_t *)&dlur[1]);
742 742 return (mp);
743 743 }
744 744
/*
 * Add the pending mp to the list. There can be only 1 pending mp
 * in the list. Any exclusive ioctl that needs to wait for a response
 * from another module or driver needs to use this function to set
 * the ipx_pending_mp to the ioctl mblk and wait for the response from
 * the other module/driver. This is also used while waiting for the
 * ipif/ill/ire refcnts to drop to zero in bringing down an ipif.
 *
 * Returns B_TRUE if the mp was queued, B_FALSE if the conn is closing
 * (in which case the caller must not queue and should error out).
 * Caller must be the ipsq writer for `ipif' and hold ill_lock; if
 * `connp' is non-NULL the caller must also hold conn_lock.
 */
boolean_t
ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
    int waitfor)
{
	ipxop_t	*ipx = ipif->ipif_ill->ill_phyint->phyint_ipsq->ipsq_xop;

	ASSERT(IAM_WRITER_IPIF(ipif));
	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
	ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
	ASSERT(ipx->ipx_pending_mp == NULL);
	/*
	 * The caller may be using a different ipif than the one passed into
	 * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4
	 * ill needs to wait for the V6 ill to quiesce).  So we can't ASSERT
	 * that `ipx_current_ipif == ipif'.
	 */
	ASSERT(ipx->ipx_current_ipif != NULL);

	/*
	 * M_IOCDATA from ioctls, M_ERROR/M_HANGUP/M_PROTO/M_PCPROTO from the
	 * driver.
	 */
	ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_ERROR) ||
	    (DB_TYPE(add_mp) == M_HANGUP) || (DB_TYPE(add_mp) == M_PROTO) ||
	    (DB_TYPE(add_mp) == M_PCPROTO));

	if (connp != NULL) {
		ASSERT(MUTEX_HELD(&connp->conn_lock));
		/*
		 * Return error if the conn has started closing. The conn
		 * could have finished cleaning up the pending mp list,
		 * If so we should not add another mp to the list negating
		 * the cleanup.
		 */
		if (connp->conn_state_flags & CONN_CLOSING)
			return (B_FALSE);
	}
	mutex_enter(&ipx->ipx_lock);
	ipx->ipx_pending_ipif = ipif;
	/*
	 * Note down the queue in b_queue. This will be returned by
	 * ipsq_pending_mp_get. Caller will then use these values to restart
	 * the processing
	 */
	add_mp->b_next = NULL;
	add_mp->b_queue = q;
	ipx->ipx_pending_mp = add_mp;
	ipx->ipx_waitfor = waitfor;
	mutex_exit(&ipx->ipx_lock);

	/* Record which ill this conn's pending operation is tied to. */
	if (connp != NULL)
		connp->conn_oper_pending_ill = ipif->ipif_ill;

	return (B_TRUE);
}
808 808
809 809 /*
810 810 * Retrieve the ipx_pending_mp and return it. There can be only 1 mp
811 811 * queued in the list.
812 812 */
mblk_t *
ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp)
{
	mblk_t	*curr = NULL;
	ipxop_t	*ipx = ipsq->ipsq_xop;

	/* Default to "no conn"; overwritten below if the queue is a conn. */
	*connpp = NULL;
	mutex_enter(&ipx->ipx_lock);
	if (ipx->ipx_pending_mp == NULL) {
		mutex_exit(&ipx->ipx_lock);
		return (NULL);
	}

	/* There can be only 1 such excl message */
	curr = ipx->ipx_pending_mp;
	ASSERT(curr->b_next == NULL);
	/* Clear all pending-operation state under ipx_lock as one unit. */
	ipx->ipx_pending_ipif = NULL;
	ipx->ipx_pending_mp = NULL;
	ipx->ipx_waitfor = 0;
	mutex_exit(&ipx->ipx_lock);

	if (CONN_Q(curr->b_queue)) {
		/*
		 * This mp did a refhold on the conn, at the start of the ioctl.
		 * So we can safely return a pointer to the conn to the caller.
		 */
		*connpp = Q_TO_CONN(curr->b_queue);
	} else {
		*connpp = NULL;
	}
	/* Detach the mblk from any list linkage before handing it back. */
	curr->b_next = NULL;
	curr->b_prev = NULL;
	return (curr);
}
847 847
848 848 /*
849 849 * Cleanup the ioctl mp queued in ipx_pending_mp
850 850 * - Called in the ill_delete path
851 851 * - Called in the M_ERROR or M_HANGUP path on the ill.
852 852 * - Called in the conn close path.
853 853 *
854 854 * Returns success on finding the pending mblk associated with the ioctl or
855 855 * exclusive operation in progress, failure otherwise.
856 856 */
boolean_t
ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
{
	mblk_t	*mp;
	ipxop_t	*ipx;
	queue_t	*q;
	ipif_t	*ipif;
	int	cmd;

	ASSERT(IAM_WRITER_ILL(ill));
	ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;

	mutex_enter(&ipx->ipx_lock);
	mp = ipx->ipx_pending_mp;
	if (connp != NULL) {
		if (mp == NULL || mp->b_queue != CONNP_TO_WQ(connp)) {
			/*
			 * Nothing to clean since the conn that is closing
			 * does not have a matching pending mblk in
			 * ipx_pending_mp.
			 */
			mutex_exit(&ipx->ipx_lock);
			return (B_FALSE);
		}
	} else {
		/*
		 * A non-zero ill_error signifies we are called in the
		 * M_ERROR or M_HANGUP path and we need to unconditionally
		 * abort any current ioctl and do the corresponding cleanup.
		 * A zero ill_error means we are in the ill_delete path and
		 * we do the cleanup only if there is a pending mp.
		 */
		if (mp == NULL && ill->ill_error == 0) {
			mutex_exit(&ipx->ipx_lock);
			return (B_FALSE);
		}
	}

	/* Now remove from the ipx_pending_mp */
	ipx->ipx_pending_mp = NULL;
	ipif = ipx->ipx_pending_ipif;
	ipx->ipx_pending_ipif = NULL;
	ipx->ipx_waitfor = 0;
	ipx->ipx_current_ipif = NULL;
	/* Remember the aborted ioctl command for the DTrace probe below. */
	cmd = ipx->ipx_current_ioctl;
	ipx->ipx_current_ioctl = 0;
	ipx->ipx_current_done = B_TRUE;
	mutex_exit(&ipx->ipx_lock);

	/* M_ERROR/M_HANGUP path with no ioctl actually pending. */
	if (mp == NULL)
		return (B_FALSE);

	/* Detach the mblk from queue/list linkage before completing it. */
	q = mp->b_queue;
	mp->b_next = NULL;
	mp->b_prev = NULL;
	mp->b_queue = NULL;

	if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) {
		DTRACE_PROBE4(ipif__ioctl,
		    char *, "ipsq_pending_mp_cleanup",
		    int, cmd, ill_t *, ipif == NULL ? NULL : ipif->ipif_ill,
		    ipif_t *, ipif);
		if (connp == NULL) {
			/* ill delete / M_ERROR path: nothing to copy out. */
			ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL);
		} else {
			ip_ioctl_finish(q, mp, ENXIO, CONN_CLOSE, NULL);
			mutex_enter(&ipif->ipif_ill->ill_lock);
			ipif->ipif_state_flags &= ~IPIF_CHANGING;
			mutex_exit(&ipif->ipif_ill->ill_lock);
		}
	} else {
		inet_freemsg(mp);
	}
	return (B_TRUE);
}
932 932
933 933 /*
934 934 * Called in the conn close path and ill delete path
935 935 */
static void
ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp)
{
	ipsq_t	*ipsq;
	mblk_t	*prev;
	mblk_t	*curr;
	mblk_t	*next;
	queue_t	*wq, *rq = NULL;
	mblk_t	*tmp_list = NULL;

	ASSERT(IAM_WRITER_ILL(ill));
	/* connp != NULL: flush only that conn's mp; else flush the ill's. */
	if (connp != NULL)
		wq = CONNP_TO_WQ(connp);
	else
		wq = ill->ill_wq;

	/*
	 * In the case of lo0 being unplumbed, ill_wq will be NULL. Guard
	 * against this here.
	 */
	if (wq != NULL)
		rq = RD(wq);

	ipsq = ill->ill_phyint->phyint_ipsq;
	/*
	 * Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any.
	 * In the case of ioctl from a conn, there can be only 1 mp
	 * queued on the ipsq. If an ill is being unplumbed flush all
	 * the messages.
	 */
	mutex_enter(&ipsq->ipsq_lock);
	for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL;
	    curr = next) {
		next = curr->b_next;
		/* connp == NULL means unplumb: every queued mp matches. */
		if (connp == NULL ||
		    (curr->b_queue == wq || curr->b_queue == rq)) {
			/* Unlink the mblk from the pending mp list */
			if (prev != NULL) {
				prev->b_next = curr->b_next;
			} else {
				ASSERT(ipsq->ipsq_xopq_mphead == curr);
				ipsq->ipsq_xopq_mphead = curr->b_next;
			}
			if (ipsq->ipsq_xopq_mptail == curr)
				ipsq->ipsq_xopq_mptail = prev;
			/*
			 * Create a temporary list and release the ipsq lock
			 * New elements are added to the head of the tmp_list
			 */
			curr->b_next = tmp_list;
			tmp_list = curr;
		} else {
			prev = curr;
		}
	}
	mutex_exit(&ipsq->ipsq_lock);

	/* Complete/free the collected mblks without holding ipsq_lock. */
	while (tmp_list != NULL) {
		curr = tmp_list;
		tmp_list = curr->b_next;
		curr->b_next = NULL;
		curr->b_prev = NULL;
		wq = curr->b_queue;
		curr->b_queue = NULL;
		if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) {
			DTRACE_PROBE4(ipif__ioctl,
			    char *, "ipsq_xopq_mp_cleanup",
			    int, 0, ill_t *, NULL, ipif_t *, NULL);
			ip_ioctl_finish(wq, curr, ENXIO, connp != NULL ?
			    CONN_CLOSE : NO_COPYOUT, NULL);
		} else {
			/*
			 * IP-MT XXX In the case of TLI/XTI bind / optmgmt
			 * this can't be just inet_freemsg. we have to
			 * restart it otherwise the thread will be stuck.
			 */
			inet_freemsg(curr);
		}
	}
}
1016 1016
1017 1017 /*
1018 1018 * This conn has started closing. Cleanup any pending ioctl from this conn.
1019 1019 * STREAMS ensures that there can be at most 1 active ioctl on a stream.
1020 1020 */
void
conn_ioctl_cleanup(conn_t *connp)
{
	ipsq_t	*ipsq;
	ill_t	*ill;
	boolean_t refheld;

	/*
	 * Check for a queued ioctl. If the ioctl has not yet started, the mp
	 * is pending in the list headed by ipsq_xopq_head. If the ioctl has
	 * started the mp could be present in ipx_pending_mp. Note that if
	 * conn_oper_pending_ill is NULL, the ioctl may still be in flight and
	 * not yet queued anywhere. In this case, the conn close code will wait
	 * until the conn_ref is dropped. If the stream was a tcp stream, then
	 * tcp_close will wait first until all ioctls have completed for this
	 * conn.
	 */
	mutex_enter(&connp->conn_lock);
	ill = connp->conn_oper_pending_ill;
	if (ill == NULL) {
		mutex_exit(&connp->conn_lock);
		return;
	}

	/*
	 * We may not be able to refhold the ill if the ill/ipif
	 * is changing. But we need to make sure that the ill will
	 * not vanish. So we just bump up the ill_waiter count.
	 */
	refheld = ill_waiter_inc(ill);
	mutex_exit(&connp->conn_lock);
	if (refheld) {
		/* NEW_OP: this cleanup runs as its own exclusive operation. */
		if (ipsq_enter(ill, B_TRUE, NEW_OP)) {
			ill_waiter_dcr(ill);
			/*
			 * Check whether this ioctl has started and is
			 * pending. If it is not found there then check
			 * whether this ioctl has not even started and is in
			 * the ipsq_xopq list.
			 */
			if (!ipsq_pending_mp_cleanup(ill, connp))
				ipsq_xopq_mp_cleanup(ill, connp);
			ipsq = ill->ill_phyint->phyint_ipsq;
			ipsq_exit(ipsq);
			return;
		}
	}

	/*
	 * The ill is also closing and we could not bump up the
	 * ill_waiter_count or we could not enter the ipsq. Leave
	 * the cleanup to ill_delete
	 */
	mutex_enter(&connp->conn_lock);
	while (connp->conn_oper_pending_ill != NULL)
		cv_wait(&connp->conn_refcv, &connp->conn_lock);
	mutex_exit(&connp->conn_lock);
	if (refheld)
		ill_waiter_dcr(ill);
}
1081 1081
1082 1082 /*
1083 1083 * ipcl_walk function for cleaning up conn_*_ill fields.
1084 1084 * Note that we leave ixa_multicast_ifindex, conn_incoming_ifindex, and
1085 1085 * conn_bound_if in place. We prefer dropping
1086 1086 * packets instead of sending them out the wrong interface, or accepting
1087 1087 * packets from the wrong ifindex.
1088 1088 */
1089 1089 static void
1090 1090 conn_cleanup_ill(conn_t *connp, caddr_t arg)
1091 1091 {
1092 1092 ill_t *ill = (ill_t *)arg;
1093 1093
1094 1094 mutex_enter(&connp->conn_lock);
1095 1095 if (connp->conn_dhcpinit_ill == ill) {
1096 1096 connp->conn_dhcpinit_ill = NULL;
1097 1097 ASSERT(ill->ill_dhcpinit != 0);
1098 1098 atomic_dec_32(&ill->ill_dhcpinit);
1099 1099 ill_set_inputfn(ill);
1100 1100 }
1101 1101 mutex_exit(&connp->conn_lock);
1102 1102 }
1103 1103
/*
 * Run the "down tail" processing for every ipif on the ill; stops and
 * returns the error from the first ipif_down_tail() that fails.
 */
static int
ill_down_ipifs_tail(ill_t *ill)
{
	ipif_t *ipif;
	int err;

	ASSERT(IAM_WRITER_ILL(ill));
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		/*
		 * ipif_down_tail will call arp_ll_down on the last ipif
		 * and typically return EINPROGRESS when the DL_UNBIND is sent.
		 */
		if ((err = ipif_down_tail(ipif)) != 0)
			return (err);
	}
	return (0);
}
1122 1122
/*
 * ipsq callback: finish downing all ipifs on the ill (q->q_ptr) once the
 * ill has quiesced, then consume the mblk and complete the current op.
 */
/* ARGSUSED */
void
ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	ASSERT(IAM_WRITER_IPSQ(ipsq));
	(void) ill_down_ipifs_tail(q->q_ptr);
	freemsg(mp);
	ipsq_current_finish(ipsq);
}
1132 1132
1133 1133 /*
1134 1134 * ill_down_start is called when we want to down this ill and bring it up again
1135 1135 * It is called when we receive an M_ERROR / M_HANGUP. In this case we shut down
1136 1136 * all interfaces, but don't tear down any plumbing.
1137 1137 */
boolean_t
ill_down_start(queue_t *q, mblk_t *mp)
{
	ill_t	*ill = q->q_ptr;
	ipif_t	*ipif;

	ASSERT(IAM_WRITER_ILL(ill));
	/*
	 * It is possible that some ioctl is already in progress while we
	 * received the M_ERROR / M_HANGUP in which case, we need to abort
	 * the ioctl. ill_down_start() is being processed as CUR_OP rather
	 * than as NEW_OP since the cause of the M_ERROR / M_HANGUP may prevent
	 * the in progress ioctl from ever completing.
	 *
	 * The thread that started the ioctl (if any) must have returned,
	 * since we are now executing as writer. After the 2 calls below,
	 * the state of the ipsq and the ill would reflect no trace of any
	 * pending operation. Subsequently if there is any response to the
	 * original ioctl from the driver, it would be discarded as an
	 * unsolicited message from the driver.
	 */
	(void) ipsq_pending_mp_cleanup(ill, NULL);
	ill_dlpi_clear_deferred(ill);

	/* Bring down every ipif on this ill. */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		(void) ipif_down(ipif, NULL, NULL);

	ill_down(ill);

	/*
	 * Walk all CONNs that can have a reference on an ire or nce for this
	 * ill (we actually walk all that now have stale references).
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ill->ill_ipst);

	/* With IPv6 we have dce_ifindex. Cleanup for neatness */
	if (ill->ill_isv6)
		dce_cleanup(ill->ill_phyint->phyint_ifindex, ill->ill_ipst);

	ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0);

	/*
	 * Atomically test and add the pending mp if references are active.
	 */
	mutex_enter(&ill->ill_lock);
	if (!ill_is_quiescent(ill)) {
		/* call cannot fail since `conn_t *' argument is NULL */
		(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
		    mp, ILL_DOWN);
		mutex_exit(&ill->ill_lock);
		/* B_FALSE: caller must wait for the deferred completion. */
		return (B_FALSE);
	}
	mutex_exit(&ill->ill_lock);
	return (B_TRUE);
}
1193 1193
/*
 * Tear down forwarding state that hangs off this ill: its IREs, any
 * conn_*_ill back-pointers, and the saved-IRE list.
 */
static void
ill_down(ill_t *ill)
{
	mblk_t	*mp;
	ip_stack_t	*ipst = ill->ill_ipst;

	/*
	 * Blow off any IREs dependent on this ILL.
	 * The caller needs to handle conn_ixa_cleanup
	 */
	ill_delete_ires(ill);

	ire_walk_ill(0, 0, ill_downi, ill, ill);

	/* Remove any conn_*_ill depending on this ill */
	ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst);

	/*
	 * Free state for additional IREs.
	 */
	mutex_enter(&ill->ill_saved_ire_lock);
	mp = ill->ill_saved_ire_mp;
	/* Detach the list under the lock; free it after dropping the lock. */
	ill->ill_saved_ire_mp = NULL;
	ill->ill_saved_ire_cnt = 0;
	mutex_exit(&ill->ill_saved_ire_lock);
	freemsg(mp);
}
1221 1221
1222 1222 /*
1223 1223 * ire_walk routine used to delete every IRE that depends on
1224 1224 * 'ill'. (Always called as writer, and may only be called from ire_walk.)
1225 1225 *
1226 1226 * Note: since the routes added by the kernel are deleted separately,
1227 1227 * this will only be 1) IRE_IF_CLONE and 2) manually added IRE_INTERFACE.
1228 1228 *
1229 1229 * We also remove references on ire_nce_cache entries that refer to the ill.
1230 1230 */
void
ill_downi(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;
	nce_t	*nce;

	/* Drop the cached nce reference if it points at this ill. */
	mutex_enter(&ire->ire_lock);
	nce = ire->ire_nce_cache;
	if (nce != NULL && nce->nce_ill == ill)
		ire->ire_nce_cache = NULL;
	else
		nce = NULL;
	mutex_exit(&ire->ire_lock);
	/* Release outside ire_lock; nce is non-NULL only when detached. */
	if (nce != NULL)
		nce_refrele(nce);
	if (ire->ire_ill == ill) {
		/*
		 * The existing interface binding for ire must be
		 * deleted before trying to bind the route to another
		 * interface. However, since we are using the contents of the
		 * ire after ire_delete, the caller has to ensure that
		 * CONDEMNED (deleted) ire's are not removed from the list
		 * when ire_delete() returns. Currently ill_downi() is
		 * only called as part of ire_walk*() routines, so that
		 * the irb_refhold() done by ire_walk*() will ensure that
		 * ire_delete() does not lead to ire_inactive().
		 */
		ASSERT(ire->ire_bucket->irb_refcnt > 0);
		ire_delete(ire);
		if (ire->ire_unbound)
			ire_rebind(ire);
	}
}
1264 1264
1265 1265 /* Remove IRE_IF_CLONE on this ill */
1266 1266 void
1267 1267 ill_downi_if_clone(ire_t *ire, char *ill_arg)
1268 1268 {
1269 1269 ill_t *ill = (ill_t *)ill_arg;
1270 1270
1271 1271 ASSERT(ire->ire_type & IRE_IF_CLONE);
1272 1272 if (ire->ire_ill == ill)
1273 1273 ire_delete(ire);
1274 1274 }
1275 1275
1276 1276 /* Consume an M_IOCACK of the fastpath probe. */
void
ill_fastpath_ack(ill_t *ill, mblk_t *mp)
{
	mblk_t	*mp1 = mp;

	/*
	 * If this was the first attempt turn on the fastpath probing.
	 */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS)
		ill->ill_dlpi_fastpath_state = IDS_OK;
	mutex_exit(&ill->ill_lock);

	/* Free the M_IOCACK mblk, hold on to the data */
	mp = mp->b_cont;
	freeb(mp1);
	if (mp == NULL)
		return;
	/* The driver's fastpath header follows the echoed dlur template. */
	if (mp->b_cont != NULL)
		nce_fastpath_update(ill, mp);
	else
		ip0dbg(("ill_fastpath_ack: no b_cont\n"));
	freemsg(mp);
}
1301 1301
1302 1302 /*
1303 1303 * Throw an M_IOCTL message downstream asking "do you know fastpath?"
1304 1304 * The data portion of the request is a dl_unitdata_req_t template for
1305 1305 * what we would send downstream in the absence of a fastpath confirmation.
1306 1306 */
int
ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp)
{
	struct iocblk	*ioc;
	mblk_t	*mp;

	/* No dl_unitdata_req template: nothing to probe with. */
	if (dlur_mp == NULL)
		return (EINVAL);

	mutex_enter(&ill->ill_lock);
	switch (ill->ill_dlpi_fastpath_state) {
	case IDS_FAILED:
		/*
		 * Driver NAKed the first fastpath ioctl - assume it doesn't
		 * support it.
		 */
		mutex_exit(&ill->ill_lock);
		return (ENOTSUP);
	case IDS_UNKNOWN:
		/* This is the first probe */
		ill->ill_dlpi_fastpath_state = IDS_INPROGRESS;
		break;
	default:
		break;
	}
	mutex_exit(&ill->ill_lock);

	if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL)
		return (EAGAIN);

	/* Attach a copy of the dlur template as the ioctl payload. */
	mp->b_cont = copyb(dlur_mp);
	if (mp->b_cont == NULL) {
		freeb(mp);
		return (EAGAIN);
	}

	ioc = (struct iocblk *)mp->b_rptr;
	ioc->ioc_count = msgdsize(mp->b_cont);

	DTRACE_PROBE3(ill__dlpi, char *, "ill_fastpath_probe",
	    char *, "DL_IOC_HDR_INFO", ill_t *, ill);
	putnext(ill->ill_wq, mp);
	return (0);
}
1351 1351
/*
 * Kick off DLPI capability negotiation by sending a DL_CAPABILITY_REQ
 * probe downstream. Only acts when negotiation has not started or a
 * previous attempt failed.
 */
void
ill_capability_probe(ill_t *ill)
{
	mblk_t	*mp;

	ASSERT(IAM_WRITER_ILL(ill));

	if (ill->ill_dlpi_capab_state != IDCS_UNKNOWN &&
	    ill->ill_dlpi_capab_state != IDCS_FAILED)
		return;

	/*
	 * We are starting a new cycle of capability negotiation.
	 * Free up the capab reset messages of any previous incarnation.
	 * We will do a fresh allocation when we get the response to our probe
	 */
	if (ill->ill_capab_reset_mp != NULL) {
		freemsg(ill->ill_capab_reset_mp);
		ill->ill_capab_reset_mp = NULL;
	}

	ip1dbg(("ill_capability_probe: starting capability negotiation\n"));

	mp = ip_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ);
	if (mp == NULL)
		return;

	ill_capability_send(ill, mp);
	ill->ill_dlpi_capab_state = IDCS_PROBE_SENT;
}
1382 1382
1383 1383 static boolean_t
1384 1384 ill_capability_wait(ill_t *ill)
1385 1385 {
1386 1386 /*
|
↓ open down ↓ |
1386 lines elided |
↑ open up ↑ |
1387 1387 * I'm in this ill's squeue, aka a writer. The ILL_CONDEMNED flag can
1388 1388 * only be set by someone who is the writer. Since we
1389 1389 * drop-and-reacquire the squeue in this loop, we need to check for
1390 1390 * ILL_CONDEMNED, which if set means nothing can signal our capability
1391 1391 * condition variable.
1392 1392 */
1393 1393 ASSERT(IAM_WRITER_ILL(ill));
1394 1394
1395 1395 while (ill->ill_capab_pending_cnt != 0 &&
1396 1396 (ill->ill_state_flags & ILL_CONDEMNED) == 0) {
1397 - mutex_enter(&ill->ill_dlpi_capab_lock);
1397 + /* This may enable blocked callers of ill_capability_done(). */
1398 1398 ipsq_exit(ill->ill_phyint->phyint_ipsq);
1399 - cv_wait(&ill->ill_dlpi_capab_cv, &ill->ill_dlpi_capab_lock);
1400 - mutex_exit(&ill->ill_dlpi_capab_lock);
1399 + /* Pause a bit (1msec) before we re-enter the squeue. */
1400 + delay(drv_usectohz(1000000));
1401 +
1401 1402 /*
1402 1403 * If ipsq_enter() fails, someone set ILL_CONDEMNED
1403 1404 * while we dropped the squeue. Indicate such to the caller.
1404 1405 */
1405 1406 if (!ipsq_enter(ill, B_FALSE, CUR_OP))
1406 1407 return (B_FALSE);
1407 1408 }
1408 1409
1409 1410 return ((ill->ill_state_flags & ILL_CONDEMNED) == 0);
1410 1411 }
1411 1412
/*
 * Send the pre-allocated capability reset message downstream, turning off
 * negotiated capabilities. reneg selects whether we expect to renegotiate
 * afterwards (IDCS_RENEG) or simply reset (IDCS_RESET_SENT).
 */
void
ill_capability_reset(ill_t *ill, boolean_t reneg)
{
	ASSERT(IAM_WRITER_ILL(ill));

	/* Nothing to reset unless negotiation previously completed. */
	if (ill->ill_dlpi_capab_state != IDCS_OK)
		return;

	ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT;

	ASSERT(ill->ill_capab_reset_mp != NULL);

	/* ill_capability_send consumes the mblk; drop our reference. */
	ill_capability_send(ill, ill->ill_capab_reset_mp);
	ill->ill_capab_reset_mp = NULL;
	/*
	 * We turn off all capabilities except those pertaining to
	 * direct function call capabilities viz. ILL_CAPAB_DLD*
	 * which will be turned off by the corresponding reset functions.
	 */
	ill->ill_capabilities &= ~(ILL_CAPAB_HCKSUM | ILL_CAPAB_ZEROCOPY);
}
1433 1434
/*
 * Pre-allocate and fill the DL_CAPABILITY_REQ mblk used later by
 * ill_capability_reset() to turn off the currently-enabled capabilities.
 * The message is sized from what is enabled right now and stashed in
 * ill_capab_reset_mp.
 */
static void
ill_capability_reset_alloc(ill_t *ill)
{
	mblk_t *mp;
	size_t	size = 0;
	int	err;
	dl_capability_req_t	*capb;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(ill->ill_capab_reset_mp == NULL);

	if (ILL_HCKSUM_CAPABLE(ill)) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_hcksum_t);
	}

	if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_zerocopy_t);
	}

	if (ill->ill_capabilities & ILL_CAPAB_DLD) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_dld_t);
	}

	/*
	 * NOTE(review): allocb_wait() with STR_NOSIG appears to block until
	 * the allocation succeeds, so mp is presumably never NULL here and
	 * err goes unexamined — confirm against strsubr semantics.
	 */
	mp = allocb_wait(size + sizeof (dl_capability_req_t), BPRI_MED,
	    STR_NOSIG, &err);

	mp->b_datap->db_type = M_PROTO;
	bzero(mp->b_rptr, size + sizeof (dl_capability_req_t));

	capb = (dl_capability_req_t *)mp->b_rptr;
	capb->dl_primitive = DL_CAPABILITY_REQ;
	capb->dl_sub_offset = sizeof (dl_capability_req_t);
	capb->dl_sub_length = size;

	mp->b_wptr += sizeof (dl_capability_req_t);

	/*
	 * Each handler fills in the corresponding dl_capability_sub_t
	 * inside the mblk,
	 */
	ill_capability_hcksum_reset_fill(ill, mp);
	ill_capability_zerocopy_reset_fill(ill, mp);
	ill_capability_dld_reset_fill(ill, mp);

	ill->ill_capab_reset_mp = mp;
}
1483 1484
1484 1485 static void
1485 1486 ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers)
1486 1487 {
1487 1488 dl_capab_id_t *id_ic;
1488 1489 uint_t sub_dl_cap = outers->dl_cap;
1489 1490 dl_capability_sub_t *inners;
1490 1491 uint8_t *capend;
1491 1492
1492 1493 ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER);
1493 1494
1494 1495 /*
1495 1496 * Note: range checks here are not absolutely sufficient to
1496 1497 * make us robust against malformed messages sent by drivers;
1497 1498 * this is in keeping with the rest of IP's dlpi handling.
1498 1499 * (Remember, it's coming from something else in the kernel
1499 1500 * address space)
1500 1501 */
1501 1502
1502 1503 capend = (uint8_t *)(outers + 1) + outers->dl_length;
1503 1504 if (capend > mp->b_wptr) {
1504 1505 cmn_err(CE_WARN, "ill_capability_id_ack: "
1505 1506 "malformed sub-capability too long for mblk");
1506 1507 return;
1507 1508 }
1508 1509
1509 1510 id_ic = (dl_capab_id_t *)(outers + 1);
1510 1511
1511 1512 if (outers->dl_length < sizeof (*id_ic) ||
1512 1513 (inners = &id_ic->id_subcap,
1513 1514 inners->dl_length > (outers->dl_length - sizeof (*inners)))) {
1514 1515 cmn_err(CE_WARN, "ill_capability_id_ack: malformed "
1515 1516 "encapsulated capab type %d too long for mblk",
1516 1517 inners->dl_cap);
1517 1518 return;
1518 1519 }
1519 1520
1520 1521 if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) {
1521 1522 ip1dbg(("ill_capability_id_ack: mid token for capab type %d "
1522 1523 "isn't as expected; pass-thru module(s) detected, "
1523 1524 "discarding capability\n", inners->dl_cap));
1524 1525 return;
1525 1526 }
1526 1527
1527 1528 /* Process the encapsulated sub-capability */
1528 1529 ill_capability_dispatch(ill, mp, inners);
1529 1530 }
1530 1531
1531 1532 static void
1532 1533 ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp)
1533 1534 {
1534 1535 dl_capability_sub_t *dl_subcap;
1535 1536
1536 1537 if (!(ill->ill_capabilities & ILL_CAPAB_DLD))
1537 1538 return;
1538 1539
1539 1540 /*
1540 1541 * The dl_capab_dld_t that follows the dl_capability_sub_t is not
1541 1542 * initialized below since it is not used by DLD.
1542 1543 */
1543 1544 dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
1544 1545 dl_subcap->dl_cap = DL_CAPAB_DLD;
1545 1546 dl_subcap->dl_length = sizeof (dl_capab_dld_t);
1546 1547
1547 1548 mp->b_wptr += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t);
1548 1549 }
1549 1550
/*
 * Route one sub-capability from a DL_CAPABILITY_ACK to its handler.
 */
static void
ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp)
{
	/*
	 * If no ipif was brought up over this ill, this DL_CAPABILITY_REQ/ACK
	 * is only to get the VRRP capability.
	 *
	 * Note that we cannot check ill_ipif_up_count here since
	 * ill_ipif_up_count is only incremented when the resolver is setup.
	 * That is done asynchronously, and can race with this function.
	 */
	if (!ill->ill_dl_up) {
		if (subp->dl_cap == DL_CAPAB_VRRP)
			ill_capability_vrrp_ack(ill, mp, subp);
		return;
	}

	switch (subp->dl_cap) {
	case DL_CAPAB_HCKSUM:
		ill_capability_hcksum_ack(ill, mp, subp);
		break;
	case DL_CAPAB_ZEROCOPY:
		ill_capability_zerocopy_ack(ill, mp, subp);
		break;
	case DL_CAPAB_DLD:
		ill_capability_dld_ack(ill, mp, subp);
		break;
	case DL_CAPAB_VRRP:
		/* Already handled above for the !ill_dl_up case; no-op here. */
		break;
	default:
		ip1dbg(("ill_capability_dispatch: unknown capab type %d\n",
		    subp->dl_cap));
	}
}
1584 1585
1585 1586 /*
1586 1587 * Process the vrrp capability received from a DLS Provider. isub must point
1587 1588 * to the sub-capability (DL_CAPAB_VRRP) of a DL_CAPABILITY_ACK message.
1588 1589 */
static void
ill_capability_vrrp_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	dl_capab_vrrp_t	*vrrp;
	uint_t		sub_dl_cap = isub->dl_cap;
	uint8_t		*capend;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(sub_dl_cap == DL_CAPAB_VRRP);

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(isub + 1) + isub->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_vrrp_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}
	vrrp = (dl_capab_vrrp_t *)(isub + 1);

	/*
	 * Compare the IP address family and set ILLF_VRRP for the right ill.
	 */
	if ((vrrp->vrrp_af == AF_INET6 && ill->ill_isv6) ||
	    (vrrp->vrrp_af == AF_INET && !ill->ill_isv6)) {
		ill->ill_flags |= ILLF_VRRP;
	}
}
1622 1623
1623 1624 /*
1624 1625 * Process a hardware checksum offload capability negotiation ack received
1625 1626 * from a DLS Provider.isub must point to the sub-capability (DL_CAPAB_HCKSUM)
1626 1627 * of a DL_CAPABILITY_ACK message.
1627 1628 */
static void
ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	dl_capability_req_t	*ocap;
	dl_capab_hcksum_t	*ihck, *ohck;
	ill_hcksum_capab_t	**ill_hcksum;
	mblk_t			*nmp = NULL;
	uint_t			sub_dl_cap = isub->dl_cap;
	uint8_t			*capend;

	ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM);

	ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab;

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(isub + 1) + isub->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}

	/*
	 * There are two types of acks we process here:
	 * 1. acks in reply to a (first form) generic capability req
	 *    (no ENABLE flag set)
	 * 2. acks in reply to a ENABLE capability req.
	 *    (ENABLE flag set)
	 */
	ihck = (dl_capab_hcksum_t *)(isub + 1);

	/* Only version 1 of the hcksum sub-capability is understood. */
	if (ihck->hcksum_version != HCKSUM_VERSION_1) {
		cmn_err(CE_CONT, "ill_capability_hcksum_ack: "
		    "unsupported hardware checksum "
		    "sub-capability (version %d, expected %d)",
		    ihck->hcksum_version, HCKSUM_VERSION_1);
		return;
	}

	if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) {
		ip1dbg(("ill_capability_hcksum_ack: mid token for hardware "
		    "checksum capability isn't as expected; pass-thru "
		    "module(s) detected, discarding capability\n"));
		return;
	}

#define	CURR_HCKSUM_CAPAB				\
	(HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 |	\
	HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM)

	if ((ihck->hcksum_txflags & HCKSUM_ENABLE) &&
	    (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) {
		/* do ENABLE processing */
		if (*ill_hcksum == NULL) {
			*ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t),
			    KM_NOSLEEP);

			if (*ill_hcksum == NULL) {
				cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
				    "could not enable hcksum version %d "
				    "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION,
				    ill->ill_name);
				return;
			}
		}

		/* Record the negotiated flags and mark the ill capable. */
		(*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version;
		(*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags;
		ill->ill_capabilities |= ILL_CAPAB_HCKSUM;
		ip1dbg(("ill_capability_hcksum_ack: interface %s "
		    "has enabled hardware checksumming\n ",
		    ill->ill_name));
	} else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) {
		/*
		 * Enabling hardware checksum offload
		 * Currently IP supports {TCP,UDP}/IPv4
		 * partial and full cksum offload and
		 * IPv4 header checksum offload.
		 * Allocate new mblk which will
		 * contain a new capability request
		 * to enable hardware checksum offload.
		 */
		uint_t	size;
		uchar_t	*rptr;

		size = sizeof (dl_capability_req_t) +
		    sizeof (dl_capability_sub_t) + isub->dl_length;

		if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
			cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
			    "could not enable hardware cksum for %s (ENOMEM)\n",
			    ill->ill_name);
			return;
		}

		rptr = nmp->b_rptr;
		/* initialize dl_capability_req_t */
		ocap = (dl_capability_req_t *)nmp->b_rptr;
		ocap->dl_sub_offset =
		    sizeof (dl_capability_req_t);
		ocap->dl_sub_length =
		    sizeof (dl_capability_sub_t) +
		    isub->dl_length;
		nmp->b_rptr += sizeof (dl_capability_req_t);

		/* initialize dl_capability_sub_t */
		bcopy(isub, nmp->b_rptr, sizeof (*isub));
		nmp->b_rptr += sizeof (*isub);

		/* initialize dl_capab_hcksum_t */
		ohck = (dl_capab_hcksum_t *)nmp->b_rptr;
		bcopy(ihck, ohck, sizeof (*ihck));

		/* Rewind b_rptr to the start of the assembled request. */
		nmp->b_rptr = rptr;
		ASSERT(nmp->b_wptr == (nmp->b_rptr + size));

		/* Set ENABLE flag */
		ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB;
		ohck->hcksum_txflags |= HCKSUM_ENABLE;

		/*
		 * nmp points to a DL_CAPABILITY_REQ message to enable
		 * hardware checksum acceleration.
		 */
		ill_capability_send(ill, nmp);
	} else {
		ip1dbg(("ill_capability_hcksum_ack: interface %s has "
		    "advertised %x hardware checksum capability flags\n",
		    ill->ill_name, ihck->hcksum_txflags));
	}
}
1765 1766
1766 1767 static void
1767 1768 ill_capability_hcksum_reset_fill(ill_t *ill, mblk_t *mp)
1768 1769 {
1769 1770 dl_capab_hcksum_t *hck_subcap;
1770 1771 dl_capability_sub_t *dl_subcap;
1771 1772
1772 1773 if (!ILL_HCKSUM_CAPABLE(ill))
1773 1774 return;
1774 1775
1775 1776 ASSERT(ill->ill_hcksum_capab != NULL);
1776 1777
1777 1778 dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
1778 1779 dl_subcap->dl_cap = DL_CAPAB_HCKSUM;
1779 1780 dl_subcap->dl_length = sizeof (*hck_subcap);
1780 1781
1781 1782 hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1);
1782 1783 hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version;
1783 1784 hck_subcap->hcksum_txflags = 0;
1784 1785
1785 1786 mp->b_wptr += sizeof (*dl_subcap) + sizeof (*hck_subcap);
1786 1787 }
1787 1788
/*
 * Process a DL_CAPAB_ZEROCOPY sub-capability found in a DL_CAPABILITY_ACK.
 * If the driver's ack already carries DL_CAPAB_VMSAFE_MEM, record the
 * capability on the ill; otherwise send a DL_CAPABILITY_REQ back down
 * asking the driver to enable zero-copy with VMSAFE_MEM set.
 */
static void
ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	mblk_t *nmp = NULL;
	dl_capability_req_t *oc;
	dl_capab_zerocopy_t *zc_ic, *zc_oc;
	ill_zerocopy_capab_t **ill_zerocopy_capab;
	uint_t sub_dl_cap = isub->dl_cap;
	uint8_t *capend;

	ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY);

	ill_zerocopy_capab = (ill_zerocopy_capab_t **)&ill->ill_zerocopy_capab;

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(isub + 1) + isub->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}

	/* Only ZEROCOPY_VERSION_1 is understood. */
	zc_ic = (dl_capab_zerocopy_t *)(isub + 1);
	if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) {
		cmn_err(CE_CONT, "ill_capability_zerocopy_ack: "
		    "unsupported ZEROCOPY sub-capability (version %d, "
		    "expected %d)", zc_ic->zerocopy_version,
		    ZEROCOPY_VERSION_1);
		return;
	}

	/*
	 * If a pass-thru module is interposed below us the mid token
	 * won't match; in that case the capability cannot be trusted.
	 */
	if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) {
		ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy "
		    "capability isn't as expected; pass-thru module(s) "
		    "detected, discarding capability\n"));
		return;
	}

	if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) {
		/* Driver has enabled it: record the capability locally. */
		if (*ill_zerocopy_capab == NULL) {
			*ill_zerocopy_capab =
			    kmem_zalloc(sizeof (ill_zerocopy_capab_t),
			    KM_NOSLEEP);

			if (*ill_zerocopy_capab == NULL) {
				cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
				    "could not enable Zero-copy version %d "
				    "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1,
				    ill->ill_name);
				return;
			}
		}

		ip1dbg(("ill_capability_zerocopy_ack: interface %s "
		    "supports Zero-copy version %d\n", ill->ill_name,
		    ZEROCOPY_VERSION_1));

		(*ill_zerocopy_capab)->ill_zerocopy_version =
		    zc_ic->zerocopy_version;
		(*ill_zerocopy_capab)->ill_zerocopy_flags =
		    zc_ic->zerocopy_flags;

		ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY;
	} else {
		/*
		 * Not yet enabled; build a DL_CAPABILITY_REQ echoing the
		 * driver's sub-capability with DL_CAPAB_VMSAFE_MEM set.
		 */
		uint_t size;
		uchar_t *rptr;

		size = sizeof (dl_capability_req_t) +
		    sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_zerocopy_t);

		if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
			cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
			    "could not enable zerocopy for %s (ENOMEM)\n",
			    ill->ill_name);
			return;
		}

		rptr = nmp->b_rptr;
		/* initialize dl_capability_req_t */
		oc = (dl_capability_req_t *)rptr;
		oc->dl_sub_offset = sizeof (dl_capability_req_t);
		oc->dl_sub_length = sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_zerocopy_t);
		rptr += sizeof (dl_capability_req_t);

		/* initialize dl_capability_sub_t */
		bcopy(isub, rptr, sizeof (*isub));
		rptr += sizeof (*isub);

		/* initialize dl_capab_zerocopy_t */
		zc_oc = (dl_capab_zerocopy_t *)rptr;
		*zc_oc = *zc_ic;

		ip1dbg(("ill_capability_zerocopy_ack: asking interface %s "
		    "to enable zero-copy version %d\n", ill->ill_name,
		    ZEROCOPY_VERSION_1));

		/* set VMSAFE_MEM flag */
		zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM;

		/* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */
		ill_capability_send(ill, nmp);
	}
}
1899 1900
1900 1901 static void
1901 1902 ill_capability_zerocopy_reset_fill(ill_t *ill, mblk_t *mp)
1902 1903 {
1903 1904 dl_capab_zerocopy_t *zerocopy_subcap;
1904 1905 dl_capability_sub_t *dl_subcap;
1905 1906
1906 1907 if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY))
1907 1908 return;
1908 1909
1909 1910 ASSERT(ill->ill_zerocopy_capab != NULL);
1910 1911
1911 1912 dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
1912 1913 dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY;
1913 1914 dl_subcap->dl_length = sizeof (*zerocopy_subcap);
1914 1915
1915 1916 zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1);
1916 1917 zerocopy_subcap->zerocopy_version =
1917 1918 ill->ill_zerocopy_capab->ill_zerocopy_version;
1918 1919 zerocopy_subcap->zerocopy_flags = 0;
1919 1920
1920 1921 mp->b_wptr += sizeof (*dl_subcap) + sizeof (*zerocopy_subcap);
1921 1922 }
1922 1923
1923 1924 /*
1924 1925 * DLD capability
1925 1926 * Refer to dld.h for more information regarding the purpose and usage
1926 1927 * of this capability.
1927 1928 */
1928 1929 static void
1929 1930 ill_capability_dld_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
1930 1931 {
1931 1932 dl_capab_dld_t *dld_ic, dld;
1932 1933 uint_t sub_dl_cap = isub->dl_cap;
1933 1934 uint8_t *capend;
1934 1935 ill_dld_capab_t *idc;
1935 1936
1936 1937 ASSERT(IAM_WRITER_ILL(ill));
1937 1938 ASSERT(sub_dl_cap == DL_CAPAB_DLD);
1938 1939
1939 1940 /*
1940 1941 * Note: range checks here are not absolutely sufficient to
1941 1942 * make us robust against malformed messages sent by drivers;
1942 1943 * this is in keeping with the rest of IP's dlpi handling.
1943 1944 * (Remember, it's coming from something else in the kernel
1944 1945 * address space)
1945 1946 */
1946 1947 capend = (uint8_t *)(isub + 1) + isub->dl_length;
1947 1948 if (capend > mp->b_wptr) {
1948 1949 cmn_err(CE_WARN, "ill_capability_dld_ack: "
1949 1950 "malformed sub-capability too long for mblk");
1950 1951 return;
1951 1952 }
1952 1953 dld_ic = (dl_capab_dld_t *)(isub + 1);
1953 1954 if (dld_ic->dld_version != DLD_CURRENT_VERSION) {
1954 1955 cmn_err(CE_CONT, "ill_capability_dld_ack: "
1955 1956 "unsupported DLD sub-capability (version %d, "
1956 1957 "expected %d)", dld_ic->dld_version,
1957 1958 DLD_CURRENT_VERSION);
1958 1959 return;
1959 1960 }
1960 1961 if (!dlcapabcheckqid(&dld_ic->dld_mid, ill->ill_lmod_rq)) {
1961 1962 ip1dbg(("ill_capability_dld_ack: mid token for dld "
1962 1963 "capability isn't as expected; pass-thru module(s) "
1963 1964 "detected, discarding capability\n"));
1964 1965 return;
1965 1966 }
1966 1967
1967 1968 /*
1968 1969 * Copy locally to ensure alignment.
1969 1970 */
1970 1971 bcopy(dld_ic, &dld, sizeof (dl_capab_dld_t));
1971 1972
1972 1973 if ((idc = ill->ill_dld_capab) == NULL) {
1973 1974 idc = kmem_zalloc(sizeof (ill_dld_capab_t), KM_NOSLEEP);
1974 1975 if (idc == NULL) {
1975 1976 cmn_err(CE_WARN, "ill_capability_dld_ack: "
1976 1977 "could not enable DLD version %d "
1977 1978 "for %s (ENOMEM)\n", DLD_CURRENT_VERSION,
1978 1979 ill->ill_name);
1979 1980 return;
1980 1981 }
1981 1982 ill->ill_dld_capab = idc;
1982 1983 }
1983 1984 idc->idc_capab_df = (ip_capab_func_t)dld.dld_capab;
1984 1985 idc->idc_capab_dh = (void *)dld.dld_capab_handle;
1985 1986 ip1dbg(("ill_capability_dld_ack: interface %s "
1986 1987 "supports DLD version %d\n", ill->ill_name, DLD_CURRENT_VERSION));
1987 1988
1988 1989 ill_capability_dld_enable(ill);
1989 1990 }
1990 1991
1991 1992 /*
1992 1993 * Typically capability negotiation between IP and the driver happens via
1993 1994 * DLPI message exchange. However GLD also offers a direct function call
1994 1995 * mechanism to exchange the DLD_DIRECT_CAPAB and DLD_POLL_CAPAB capabilities,
1995 1996 * But arbitrary function calls into IP or GLD are not permitted, since both
1996 1997 * of them are protected by their own perimeter mechanism. The perimeter can
1997 1998 * be viewed as a coarse lock or serialization mechanism. The hierarchy of
1998 1999 * these perimeters is IP -> MAC. Thus for example to enable the squeue
1999 2000 * polling, IP needs to enter its perimeter, then call ill_mac_perim_enter
2000 2001 * to enter the mac perimeter and then do the direct function calls into
2001 2002 * GLD to enable squeue polling. The ring related callbacks from the mac into
2002 2003 * the stack to add, bind, quiesce, restart or cleanup a ring are all
2003 2004 * protected by the mac perimeter.
2004 2005 */
2005 2006 static void
2006 2007 ill_mac_perim_enter(ill_t *ill, mac_perim_handle_t *mphp)
2007 2008 {
2008 2009 ill_dld_capab_t *idc = ill->ill_dld_capab;
2009 2010 int err;
2010 2011
2011 2012 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mphp,
2012 2013 DLD_ENABLE);
2013 2014 ASSERT(err == 0);
2014 2015 }
2015 2016
2016 2017 static void
2017 2018 ill_mac_perim_exit(ill_t *ill, mac_perim_handle_t mph)
2018 2019 {
2019 2020 ill_dld_capab_t *idc = ill->ill_dld_capab;
2020 2021 int err;
2021 2022
2022 2023 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mph,
2023 2024 DLD_DISABLE);
2024 2025 ASSERT(err == 0);
2025 2026 }
2026 2027
2027 2028 boolean_t
2028 2029 ill_mac_perim_held(ill_t *ill)
2029 2030 {
2030 2031 ill_dld_capab_t *idc = ill->ill_dld_capab;
2031 2032
2032 2033 return (idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, NULL,
2033 2034 DLD_QUERY));
2034 2035 }
2035 2036
/*
 * Enable the DLD_CAPAB_DIRECT capability: direct function-call transmit
 * and receive between IP and GLD, bypassing STREAMS.  On success the
 * tx/tx-callback/flow-control entry points are recorded in idc_direct
 * and the flow enable callback is registered (once).  IPv4 only; must
 * run as writer.
 */
static void
ill_capability_direct_enable(ill_t *ill)
{
	ill_dld_capab_t *idc = ill->ill_dld_capab;
	ill_dld_direct_t *idd = &idc->idc_direct;
	dld_capab_direct_t direct;
	int rc;

	ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));

	/* Hand our receive entry point (ip_input) and handle to dld. */
	bzero(&direct, sizeof (direct));
	direct.di_rx_cf = (uintptr_t)ip_input;
	direct.di_rx_ch = ill;

	rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, &direct,
	    DLD_ENABLE);
	if (rc == 0) {
		/* Record the transmit-side entry points dld returned. */
		idd->idd_tx_df = (ip_dld_tx_t)direct.di_tx_df;
		idd->idd_tx_dh = direct.di_tx_dh;
		idd->idd_tx_cb_df = (ip_dld_callb_t)direct.di_tx_cb_df;
		idd->idd_tx_cb_dh = direct.di_tx_cb_dh;
		idd->idd_tx_fctl_df = (ip_dld_fctl_t)direct.di_tx_fctl_df;
		idd->idd_tx_fctl_dh = direct.di_tx_fctl_dh;
		ASSERT(idd->idd_tx_cb_df != NULL);
		ASSERT(idd->idd_tx_fctl_df != NULL);
		ASSERT(idd->idd_tx_df != NULL);
		/*
		 * One time registration of flow enable callback function
		 */
		ill->ill_flownotify_mh = idd->idd_tx_cb_df(idd->idd_tx_cb_dh,
		    ill_flow_enable, ill);
		ill->ill_capabilities |= ILL_CAPAB_DLD_DIRECT;
		DTRACE_PROBE1(direct_on, (ill_t *), ill);
	} else {
		cmn_err(CE_WARN, "warning: could not enable DIRECT "
		    "capability, rc = %d\n", rc);
		DTRACE_PROBE2(direct_off, (ill_t *), ill, (int), rc);
	}
}
2075 2076
/*
 * Enable the DLD_CAPAB_POLL capability: register IP's squeue ring
 * callbacks (add/remove/quiesce/restart/bind) with dld so the mac layer
 * can drive squeue polling.  IPv4 only; must run as writer.
 */
static void
ill_capability_poll_enable(ill_t *ill)
{
	ill_dld_capab_t *idc = ill->ill_dld_capab;
	dld_capab_poll_t poll;
	int rc;

	ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));

	/* Hand dld the squeue ring management entry points. */
	bzero(&poll, sizeof (poll));
	poll.poll_ring_add_cf = (uintptr_t)ip_squeue_add_ring;
	poll.poll_ring_remove_cf = (uintptr_t)ip_squeue_clean_ring;
	poll.poll_ring_quiesce_cf = (uintptr_t)ip_squeue_quiesce_ring;
	poll.poll_ring_restart_cf = (uintptr_t)ip_squeue_restart_ring;
	poll.poll_ring_bind_cf = (uintptr_t)ip_squeue_bind_ring;
	poll.poll_ring_ch = ill;
	rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, &poll,
	    DLD_ENABLE);
	if (rc == 0) {
		ill->ill_capabilities |= ILL_CAPAB_DLD_POLL;
		DTRACE_PROBE1(poll_on, (ill_t *), ill);
	} else {
		ip1dbg(("warning: could not enable POLL "
		    "capability, rc = %d\n", rc));
		DTRACE_PROBE2(poll_off, (ill_t *), ill, (int), rc);
	}
}
2103 2104
2104 2105 /*
2105 2106 * Enable the LSO capability.
2106 2107 */
2107 2108 static void
2108 2109 ill_capability_lso_enable(ill_t *ill)
2109 2110 {
2110 2111 ill_dld_capab_t *idc = ill->ill_dld_capab;
2111 2112 dld_capab_lso_t lso;
2112 2113 int rc;
2113 2114
2114 2115 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
2115 2116
2116 2117 if (ill->ill_lso_capab == NULL) {
2117 2118 ill->ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t),
2118 2119 KM_NOSLEEP);
2119 2120 if (ill->ill_lso_capab == NULL) {
2120 2121 cmn_err(CE_WARN, "ill_capability_lso_enable: "
2121 2122 "could not enable LSO for %s (ENOMEM)\n",
2122 2123 ill->ill_name);
2123 2124 return;
2124 2125 }
2125 2126 }
2126 2127
2127 2128 bzero(&lso, sizeof (lso));
2128 2129 if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, &lso,
2129 2130 DLD_ENABLE)) == 0) {
2130 2131 ill->ill_lso_capab->ill_lso_flags = lso.lso_flags;
2131 2132 ill->ill_lso_capab->ill_lso_max = lso.lso_max;
2132 2133 ill->ill_capabilities |= ILL_CAPAB_LSO;
2133 2134 ip1dbg(("ill_capability_lso_enable: interface %s "
2134 2135 "has enabled LSO\n ", ill->ill_name));
2135 2136 } else {
2136 2137 kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
2137 2138 ill->ill_lso_capab = NULL;
2138 2139 DTRACE_PROBE2(lso_off, (ill_t *), ill, (int), rc);
2139 2140 }
2140 2141 }
2141 2142
2142 2143 /*
2143 2144 * Check whether or not mac will prevent us from sending with a given IP
2144 2145 * address. This requires having the IPCHECK capability, which we should
2145 2146 * always be able to successfully negotiate, but if it's somehow missing
2146 2147 * then we just permit the caller to use the address, since mac does the
2147 2148 * actual enforcement and ip is just performing a courtesy check to help
2148 2149 * prevent users from unwittingly setting and attempting to use blocked
2149 2150 * addresses.
2150 2151 */
2151 2152 static boolean_t
2152 2153 ill_ipcheck_addr(ill_t *ill, in6_addr_t *v6addr)
2153 2154 {
2154 2155 if ((ill->ill_capabilities & ILL_CAPAB_DLD_IPCHECK) == 0)
2155 2156 return (B_TRUE);
2156 2157
2157 2158 ill_dld_ipcheck_t *idi = &ill->ill_dld_capab->idc_ipcheck;
2158 2159 ip_mac_ipcheck_t ipcheck = idi->idi_allowed_df;
2159 2160 return (ipcheck(idi->idi_allowed_dh, ill->ill_isv6, v6addr));
2160 2161 }
2161 2162
/*
 * Enable the DLD_CAPAB_IPCHECK capability: obtain from dld the entry
 * point (and handle) used by ill_ipcheck_addr() to ask mac whether an
 * IP address may be used on this ill.  Must run as writer.
 */
static void
ill_capability_ipcheck_enable(ill_t *ill)
{
	ill_dld_capab_t *idc = ill->ill_dld_capab;
	ill_dld_ipcheck_t *idi = &idc->idc_ipcheck;
	dld_capab_ipcheck_t spoof;
	int rc;

	ASSERT(IAM_WRITER_ILL(ill));

	bzero(&spoof, sizeof (spoof));
	if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_IPCHECK,
	    &spoof, DLD_ENABLE)) == 0) {
		/* Record the address-check entry point dld returned. */
		idi->idi_allowed_df = (ip_mac_ipcheck_t)spoof.ipc_allowed_df;
		idi->idi_allowed_dh = spoof.ipc_allowed_dh;
		ill->ill_capabilities |= ILL_CAPAB_DLD_IPCHECK;
	} else {
		cmn_err(CE_WARN, "warning: could not enable IPCHECK "
		    "capability, rc = %d\n", rc);
		DTRACE_PROBE2(ipcheck__off, (ill_t *), ill, (int), rc);
	}
}
2184 2185
/*
 * Enable the function-call based DLD capabilities.  The direct, poll and
 * LSO capabilities are IPv4-only; IPCHECK is enabled for both address
 * families.  All of the dld calls are made while holding the mac
 * perimeter (see the block comment above ill_mac_perim_enter()).
 * Must run as writer.
 */
static void
ill_capability_dld_enable(ill_t *ill)
{
	mac_perim_handle_t mph;

	ASSERT(IAM_WRITER_ILL(ill));

	ill_mac_perim_enter(ill, &mph);
	if (!ill->ill_isv6) {
		ill_capability_direct_enable(ill);
		ill_capability_poll_enable(ill);
		ill_capability_lso_enable(ill);
	}

	ill_capability_ipcheck_enable(ill);

	ill->ill_capabilities |= ILL_CAPAB_DLD;
	ill_mac_perim_exit(ill, mph);
}
2204 2205
/*
 * Disable all function-call based DLD capabilities (direct, poll, LSO,
 * IPCHECK) that are currently enabled on the ill, then clear
 * ILL_CAPAB_DLD itself.  All of the dld calls are made while holding
 * the mac perimeter.  Must run as writer; no-op if ILL_CAPAB_DLD is
 * not set.
 */
static void
ill_capability_dld_disable(ill_t *ill)
{
	ill_dld_capab_t	*idc;
	ill_dld_direct_t *idd;
	mac_perim_handle_t mph;

	ASSERT(IAM_WRITER_ILL(ill));

	if (!(ill->ill_capabilities & ILL_CAPAB_DLD))
		return;

	ill_mac_perim_enter(ill, &mph);

	idc = ill->ill_dld_capab;
	if ((ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) {
		/*
		 * For performance we avoid locks in the transmit data path
		 * and don't maintain a count of the number of threads using
		 * direct calls. Thus some threads could be using direct
		 * transmit calls to GLD, even after the capability mechanism
		 * turns it off. This is still safe since the handles used in
		 * the direct calls continue to be valid until the unplumb is
		 * completed. Remove the callback that was added (1-time) at
		 * capab enable time.
		 */
		mutex_enter(&ill->ill_lock);
		ill->ill_capabilities &= ~ILL_CAPAB_DLD_DIRECT;
		mutex_exit(&ill->ill_lock);
		if (ill->ill_flownotify_mh != NULL) {
			idd = &idc->idc_direct;
			idd->idd_tx_cb_df(idd->idd_tx_cb_dh, NULL,
			    ill->ill_flownotify_mh);
			ill->ill_flownotify_mh = NULL;
		}
		(void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT,
		    NULL, DLD_DISABLE);
	}

	if ((ill->ill_capabilities & ILL_CAPAB_DLD_POLL) != 0) {
		ill->ill_capabilities &= ~ILL_CAPAB_DLD_POLL;
		/* Quiesce all squeue rings before telling dld to stop. */
		ip_squeue_clean_all(ill);
		(void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL,
		    NULL, DLD_DISABLE);
	}

	if ((ill->ill_capabilities & ILL_CAPAB_LSO) != 0) {
		ASSERT(ill->ill_lso_capab != NULL);
		/*
		 * Clear the capability flag for LSO but retain the
		 * ill_lso_capab structure since it's possible that another
		 * thread is still referring to it. The structure only gets
		 * deallocated when we destroy the ill.
		 */

		ill->ill_capabilities &= ~ILL_CAPAB_LSO;
		(void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO,
		    NULL, DLD_DISABLE);
	}

	if ((ill->ill_capabilities & ILL_CAPAB_DLD_IPCHECK) != 0) {
		ASSERT(ill->ill_dld_capab->idc_ipcheck.idi_allowed_df != NULL);
		ASSERT(ill->ill_dld_capab->idc_ipcheck.idi_allowed_dh != NULL);

		ill->ill_capabilities &= ~ILL_CAPAB_DLD_IPCHECK;
		(void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_IPCHECK,
		    NULL, DLD_DISABLE);
	}

	ill->ill_capabilities &= ~ILL_CAPAB_DLD;
	ill_mac_perim_exit(ill, mph);
}
2277 2278
2278 2279 /*
2279 2280 * Capability Negotiation protocol
2280 2281 *
2281 2282 * We don't wait for DLPI capability operations to finish during interface
2282 2283 * bringup or teardown. Doing so would introduce more asynchrony and the
2283 2284 * interface up/down operations will need multiple return and restarts.
2284 2285 * Instead the 'ipsq_current_ipif' of the ipsq is not cleared as long as
2285 2286 * the 'ill_dlpi_deferred' chain is non-empty. This ensures that the next
2286 2287 * exclusive operation won't start until the DLPI operations of the previous
2287 2288 * exclusive operation complete.
2288 2289 *
2289 2290 * The capability state machine is shown below.
2290 2291 *
2291 2292 * state next state event, action
2292 2293 *
2293 2294 * IDCS_UNKNOWN IDCS_PROBE_SENT ill_capability_probe
2294 2295 * IDCS_PROBE_SENT IDCS_OK ill_capability_ack
2295 2296 * IDCS_PROBE_SENT IDCS_FAILED ip_rput_dlpi_writer (nack)
2296 2297 * IDCS_OK IDCS_RENEG Receipt of DL_NOTE_CAPAB_RENEG
2297 2298 * IDCS_OK IDCS_RESET_SENT ill_capability_reset
2298 2299 * IDCS_RESET_SENT IDCS_UNKNOWN ill_capability_ack_thr
2299 2300 * IDCS_RENEG IDCS_PROBE_SENT ill_capability_ack_thr ->
2300 2301 * ill_capability_probe.
2301 2302 */
2302 2303
2303 2304 /*
2304 2305 * Dedicated thread started from ip_stack_init that handles capability
2305 2306 * disable. This thread ensures the taskq dispatch does not fail by waiting
2306 2307 * for resources using TQ_SLEEP. The taskq mechanism is used to ensure
2307 2308 * that direct calls to DLD are done in a cv_waitable context.
2308 2309 */
2309 2310 void
2310 2311 ill_taskq_dispatch(ip_stack_t *ipst)
2311 2312 {
2312 2313 callb_cpr_t cprinfo;
2313 2314 char name[64];
2314 2315 mblk_t *mp;
2315 2316
2316 2317 (void) snprintf(name, sizeof (name), "ill_taskq_dispatch_%d",
2317 2318 ipst->ips_netstack->netstack_stackid);
2318 2319 CALLB_CPR_INIT(&cprinfo, &ipst->ips_capab_taskq_lock, callb_generic_cpr,
2319 2320 name);
2320 2321 mutex_enter(&ipst->ips_capab_taskq_lock);
2321 2322
2322 2323 for (;;) {
2323 2324 mp = ipst->ips_capab_taskq_head;
2324 2325 while (mp != NULL) {
2325 2326 ipst->ips_capab_taskq_head = mp->b_next;
2326 2327 if (ipst->ips_capab_taskq_head == NULL)
2327 2328 ipst->ips_capab_taskq_tail = NULL;
2328 2329 mutex_exit(&ipst->ips_capab_taskq_lock);
2329 2330 mp->b_next = NULL;
2330 2331
2331 2332 VERIFY(taskq_dispatch(system_taskq,
2332 2333 ill_capability_ack_thr, mp, TQ_SLEEP) !=
2333 2334 TASKQID_INVALID);
2334 2335 mutex_enter(&ipst->ips_capab_taskq_lock);
2335 2336 mp = ipst->ips_capab_taskq_head;
2336 2337 }
2337 2338
2338 2339 if (ipst->ips_capab_taskq_quit)
2339 2340 break;
2340 2341 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2341 2342 cv_wait(&ipst->ips_capab_taskq_cv, &ipst->ips_capab_taskq_lock);
2342 2343 CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_capab_taskq_lock);
2343 2344 }
2344 2345 VERIFY(ipst->ips_capab_taskq_head == NULL);
2345 2346 VERIFY(ipst->ips_capab_taskq_tail == NULL);
2346 2347 CALLB_CPR_EXIT(&cprinfo);
2347 2348 thread_exit();
2348 2349 }
2349 2350
2350 2351 /*
2351 2352 * Consume a new-style hardware capabilities negotiation ack.
2352 2353 * Called via taskq on receipt of DL_CAPABILITY_ACK.
2353 2354 */
2354 2355 static void
2355 2356 ill_capability_ack_thr(void *arg)
2356 2357 {
2357 2358 mblk_t *mp = arg;
2358 2359 dl_capability_ack_t *capp;
2359 2360 dl_capability_sub_t *subp, *endp;
2360 2361 ill_t *ill;
2361 2362 boolean_t reneg;
2362 2363
2363 2364 ill = (ill_t *)mp->b_prev;
2364 2365 mp->b_prev = NULL;
2365 2366
2366 2367 VERIFY(ipsq_enter(ill, B_FALSE, CUR_OP) == B_TRUE);
2367 2368
2368 2369 if (ill->ill_dlpi_capab_state == IDCS_RESET_SENT ||
2369 2370 ill->ill_dlpi_capab_state == IDCS_RENEG) {
2370 2371 /*
2371 2372 * We have received the ack for our DL_CAPAB reset request.
2372 2373 * There isnt' anything in the message that needs processing.
2373 2374 * All message based capabilities have been disabled, now
2374 2375 * do the function call based capability disable.
2375 2376 */
2376 2377 reneg = ill->ill_dlpi_capab_state == IDCS_RENEG;
2377 2378 ill_capability_dld_disable(ill);
2378 2379 ill->ill_dlpi_capab_state = IDCS_UNKNOWN;
2379 2380 if (reneg)
2380 2381 ill_capability_probe(ill);
2381 2382 goto done;
2382 2383 }
2383 2384
2384 2385 if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT)
2385 2386 ill->ill_dlpi_capab_state = IDCS_OK;
2386 2387
2387 2388 capp = (dl_capability_ack_t *)mp->b_rptr;
2388 2389
2389 2390 if (capp->dl_sub_length == 0) {
2390 2391 /* no new-style capabilities */
2391 2392 goto done;
2392 2393 }
2393 2394
2394 2395 /* make sure the driver supplied correct dl_sub_length */
2395 2396 if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) {
2396 2397 ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, "
2397 2398 "invalid dl_sub_length (%d)\n", capp->dl_sub_length));
2398 2399 goto done;
2399 2400 }
2400 2401
2401 2402 #define SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset))
2402 2403 /*
2403 2404 * There are sub-capabilities. Process the ones we know about.
2404 2405 * Loop until we don't have room for another sub-cap header..
2405 2406 */
2406 2407 for (subp = SC(capp, capp->dl_sub_offset),
2407 2408 endp = SC(subp, capp->dl_sub_length - sizeof (*subp));
2408 2409 subp <= endp;
2409 2410 subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) {
2410 2411
2411 2412 switch (subp->dl_cap) {
2412 2413 case DL_CAPAB_ID_WRAPPER:
2413 2414 ill_capability_id_ack(ill, mp, subp);
2414 2415 break;
2415 2416 default:
2416 2417 ill_capability_dispatch(ill, mp, subp);
2417 2418 break;
2418 2419 }
2419 2420 }
2420 2421 #undef SC
2421 2422 done:
2422 2423 inet_freemsg(mp);
2423 2424 ill_capability_done(ill);
2424 2425 ipsq_exit(ill->ill_phyint->phyint_ipsq);
2425 2426 }
2426 2427
2427 2428 /*
2428 2429 * This needs to be started in a taskq thread to provide a cv_waitable
2429 2430 * context.
2430 2431 */
2431 2432 void
2432 2433 ill_capability_ack(ill_t *ill, mblk_t *mp)
2433 2434 {
2434 2435 ip_stack_t *ipst = ill->ill_ipst;
2435 2436
2436 2437 mp->b_prev = (mblk_t *)ill;
2437 2438 ASSERT(mp->b_next == NULL);
2438 2439
2439 2440 if (taskq_dispatch(system_taskq, ill_capability_ack_thr, mp,
2440 2441 TQ_NOSLEEP) != TASKQID_INVALID)
2441 2442 return;
2442 2443
2443 2444 /*
2444 2445 * The taskq dispatch failed. Signal the ill_taskq_dispatch thread
2445 2446 * which will do the dispatch using TQ_SLEEP to guarantee success.
2446 2447 */
2447 2448 mutex_enter(&ipst->ips_capab_taskq_lock);
2448 2449 if (ipst->ips_capab_taskq_head == NULL) {
2449 2450 ASSERT(ipst->ips_capab_taskq_tail == NULL);
2450 2451 ipst->ips_capab_taskq_head = mp;
2451 2452 } else {
2452 2453 ipst->ips_capab_taskq_tail->b_next = mp;
2453 2454 }
2454 2455 ipst->ips_capab_taskq_tail = mp;
2455 2456
2456 2457 cv_signal(&ipst->ips_capab_taskq_cv);
2457 2458 mutex_exit(&ipst->ips_capab_taskq_lock);
2458 2459 }
2459 2460
2460 2461 /*
2461 2462 * This routine is called to scan the fragmentation reassembly table for
2462 2463 * the specified ILL for any packets that are starting to smell.
2463 2464 * dead_interval is the maximum time in seconds that will be tolerated. It
2464 2465 * will either be the value specified in ip_g_frag_timeout, or zero if the
2465 2466 * ILL is shutting down and it is time to blow everything off.
2466 2467 *
2467 2468 * It returns the number of seconds (as a time_t) that the next frag timer
2468 2469 * should be scheduled for, 0 meaning that the timer doesn't need to be
2469 2470 * re-started. Note that the method of calculating next_timeout isn't
2470 2471 * entirely accurate since time will flow between the time we grab
2471 2472 * current_time and the time we schedule the next timeout. This isn't a
2472 2473 * big problem since this is the timer for sending an ICMP reassembly time
2473 2474 * exceeded messages, and it doesn't have to be exactly accurate.
2474 2475 *
2475 2476 * This function is
2476 2477 * sometimes called as writer, although this is not required.
2477 2478 */
2478 2479 time_t
2479 2480 ill_frag_timeout(ill_t *ill, time_t dead_interval)
2480 2481 {
2481 2482 ipfb_t *ipfb;
2482 2483 ipfb_t *endp;
2483 2484 ipf_t *ipf;
2484 2485 ipf_t *ipfnext;
2485 2486 mblk_t *mp;
2486 2487 time_t current_time = gethrestime_sec();
2487 2488 time_t next_timeout = 0;
2488 2489 uint32_t hdr_length;
2489 2490 mblk_t *send_icmp_head;
2490 2491 mblk_t *send_icmp_head_v6;
2491 2492 ip_stack_t *ipst = ill->ill_ipst;
2492 2493 ip_recv_attr_t iras;
2493 2494
2494 2495 bzero(&iras, sizeof (iras));
2495 2496 iras.ira_flags = 0;
2496 2497 iras.ira_ill = iras.ira_rill = ill;
2497 2498 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
2498 2499 iras.ira_rifindex = iras.ira_ruifindex;
2499 2500
2500 2501 ipfb = ill->ill_frag_hash_tbl;
2501 2502 if (ipfb == NULL)
2502 2503 return (B_FALSE);
2503 2504 endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT];
2504 2505 /* Walk the frag hash table. */
2505 2506 for (; ipfb < endp; ipfb++) {
2506 2507 send_icmp_head = NULL;
2507 2508 send_icmp_head_v6 = NULL;
2508 2509 mutex_enter(&ipfb->ipfb_lock);
2509 2510 while ((ipf = ipfb->ipfb_ipf) != 0) {
2510 2511 time_t frag_time = current_time - ipf->ipf_timestamp;
2511 2512 time_t frag_timeout;
2512 2513
2513 2514 if (frag_time < dead_interval) {
2514 2515 /*
2515 2516 * There are some outstanding fragments
2516 2517 * that will timeout later. Make note of
2517 2518 * the time so that we can reschedule the
2518 2519 * next timeout appropriately.
2519 2520 */
2520 2521 frag_timeout = dead_interval - frag_time;
2521 2522 if (next_timeout == 0 ||
2522 2523 frag_timeout < next_timeout) {
2523 2524 next_timeout = frag_timeout;
2524 2525 }
2525 2526 break;
2526 2527 }
2527 2528 /* Time's up. Get it out of here. */
2528 2529 hdr_length = ipf->ipf_nf_hdr_len;
2529 2530 ipfnext = ipf->ipf_hash_next;
2530 2531 if (ipfnext)
2531 2532 ipfnext->ipf_ptphn = ipf->ipf_ptphn;
2532 2533 *ipf->ipf_ptphn = ipfnext;
2533 2534 mp = ipf->ipf_mp->b_cont;
2534 2535 for (; mp; mp = mp->b_cont) {
2535 2536 /* Extra points for neatness. */
2536 2537 IP_REASS_SET_START(mp, 0);
2537 2538 IP_REASS_SET_END(mp, 0);
2538 2539 }
2539 2540 mp = ipf->ipf_mp->b_cont;
2540 2541 atomic_add_32(&ill->ill_frag_count, -ipf->ipf_count);
2541 2542 ASSERT(ipfb->ipfb_count >= ipf->ipf_count);
2542 2543 ipfb->ipfb_count -= ipf->ipf_count;
2543 2544 ASSERT(ipfb->ipfb_frag_pkts > 0);
2544 2545 ipfb->ipfb_frag_pkts--;
2545 2546 /*
2546 2547 * We do not send any icmp message from here because
2547 2548 * we currently are holding the ipfb_lock for this
2548 2549 * hash chain. If we try and send any icmp messages
2549 2550 * from here we may end up via a put back into ip
2550 2551 * trying to get the same lock, causing a recursive
2551 2552 * mutex panic. Instead we build a list and send all
2552 2553 * the icmp messages after we have dropped the lock.
2553 2554 */
2554 2555 if (ill->ill_isv6) {
2555 2556 if (hdr_length != 0) {
2556 2557 mp->b_next = send_icmp_head_v6;
2557 2558 send_icmp_head_v6 = mp;
2558 2559 } else {
2559 2560 freemsg(mp);
2560 2561 }
2561 2562 } else {
2562 2563 if (hdr_length != 0) {
2563 2564 mp->b_next = send_icmp_head;
2564 2565 send_icmp_head = mp;
2565 2566 } else {
2566 2567 freemsg(mp);
2567 2568 }
2568 2569 }
2569 2570 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails);
2570 2571 ip_drop_input("ipIfStatsReasmFails", ipf->ipf_mp, ill);
2571 2572 freeb(ipf->ipf_mp);
2572 2573 }
2573 2574 mutex_exit(&ipfb->ipfb_lock);
2574 2575 /*
2575 2576 * Now need to send any icmp messages that we delayed from
2576 2577 * above.
2577 2578 */
2578 2579 while (send_icmp_head_v6 != NULL) {
2579 2580 ip6_t *ip6h;
2580 2581
2581 2582 mp = send_icmp_head_v6;
2582 2583 send_icmp_head_v6 = send_icmp_head_v6->b_next;
2583 2584 mp->b_next = NULL;
2584 2585 ip6h = (ip6_t *)mp->b_rptr;
2585 2586 iras.ira_flags = 0;
2586 2587 /*
2587 2588 * This will result in an incorrect ALL_ZONES zoneid
2588 2589 * for multicast packets, but we
2589 2590 * don't send ICMP errors for those in any case.
2590 2591 */
2591 2592 iras.ira_zoneid =
2592 2593 ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst,
2593 2594 ill, ipst);
2594 2595 ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill);
2595 2596 icmp_time_exceeded_v6(mp,
2596 2597 ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE,
2597 2598 &iras);
2598 2599 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2599 2600 }
2600 2601 while (send_icmp_head != NULL) {
2601 2602 ipaddr_t dst;
2602 2603
2603 2604 mp = send_icmp_head;
2604 2605 send_icmp_head = send_icmp_head->b_next;
2605 2606 mp->b_next = NULL;
2606 2607
2607 2608 dst = ((ipha_t *)mp->b_rptr)->ipha_dst;
2608 2609
2609 2610 iras.ira_flags = IRAF_IS_IPV4;
2610 2611 /*
2611 2612 * This will result in an incorrect ALL_ZONES zoneid
2612 2613 * for broadcast and multicast packets, but we
2613 2614 * don't send ICMP errors for those in any case.
2614 2615 */
2615 2616 iras.ira_zoneid = ipif_lookup_addr_zoneid(dst,
2616 2617 ill, ipst);
2617 2618 ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill);
2618 2619 icmp_time_exceeded(mp,
2619 2620 ICMP_REASSEMBLY_TIME_EXCEEDED, &iras);
2620 2621 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2621 2622 }
2622 2623 }
2623 2624 /*
2624 2625 * A non-dying ILL will use the return value to decide whether to
2625 2626 * restart the frag timer, and for how long.
2626 2627 */
2627 2628 return (next_timeout);
2628 2629 }
2629 2630
/*
 * This routine is called when the approximate count of mblk memory used
 * for the specified ILL has exceeded max_count.  It prunes the reassembly
 * queues in two passes: first a fixed number of the oldest packets per
 * hash bucket (scaled up if we are called again too soon), then the
 * globally-oldest queue, one packet at a time, until under max_count.
 */
void
ill_frag_prune(ill_t *ill, uint_t max_count)
{
	ipfb_t *ipfb;
	ipf_t *ipf;
	size_t count;
	clock_t now;

	/*
	 * If we are here within ip_min_frag_prune_time msecs remove
	 * ill_frag_free_num_pkts oldest packets from each bucket and increment
	 * ill_frag_free_num_pkts.
	 */
	mutex_enter(&ill->ill_lock);
	now = ddi_get_lbolt();
	if (TICK_TO_MSEC(now - ill->ill_last_frag_clean_time) <=
	    (ip_min_frag_prune_time != 0 ?
	    ip_min_frag_prune_time : msec_per_tick)) {

		ill->ill_frag_free_num_pkts++;

	} else {
		ill->ill_frag_free_num_pkts = 0;
	}
	ill->ill_last_frag_clean_time = now;
	mutex_exit(&ill->ill_lock);

	/*
	 * free ill_frag_free_num_pkts oldest packets from each bucket.
	 * NOTE(review): ill_frag_free_num_pkts is read here without
	 * ill_lock; presumably a stale value is acceptable since this is
	 * only a pruning heuristic -- confirm.
	 */
	if (ill->ill_frag_free_num_pkts != 0) {
		int ix;

		for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) {
			ipfb = &ill->ill_frag_hash_tbl[ix];
			mutex_enter(&ipfb->ipfb_lock);
			if (ipfb->ipfb_ipf != NULL) {
				ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf,
				    ill->ill_frag_free_num_pkts);
			}
			mutex_exit(&ipfb->ipfb_lock);
		}
	}
	/*
	 * While the reassembly list for this ILL is too big, prune a fragment
	 * queue by age, oldest first.
	 */
	while (ill->ill_frag_count > max_count) {
		int ix;
		ipfb_t *oipfb = NULL;
		uint_t oldest = UINT_MAX;

		/* Scan all buckets for the queue with the lowest ipf_gen. */
		count = 0;
		for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) {
			ipfb = &ill->ill_frag_hash_tbl[ix];
			mutex_enter(&ipfb->ipfb_lock);
			ipf = ipfb->ipfb_ipf;
			if (ipf != NULL && ipf->ipf_gen < oldest) {
				oldest = ipf->ipf_gen;
				oipfb = ipfb;
			}
			count += ipfb->ipfb_count;
			mutex_exit(&ipfb->ipfb_lock);
		}
		/* Nothing queued anywhere; we are done. */
		if (oipfb == NULL)
			break;

		if (count <= max_count)
			return;	/* Somebody beat us to it, nothing to do */
		/*
		 * Re-check under the bucket lock: the oldest queue may have
		 * been freed by a concurrent pruner since the scan above.
		 */
		mutex_enter(&oipfb->ipfb_lock);
		ipf = oipfb->ipfb_ipf;
		if (ipf != NULL) {
			ill_frag_free_pkts(ill, oipfb, ipf, 1);
		}
		mutex_exit(&oipfb->ipfb_lock);
	}
}
2711 2712
/*
 * free 'free_cnt' fragmented packets starting at ipf.
 *
 * Caller must hold ipfb->ipfb_lock; ipf must be on ipfb's hash chain.
 * Bucket and ILL accounting (ipfb_count, ipfb_frag_pkts, ill_frag_count)
 * is adjusted and ipIfStatsReasmFails bumped for each packet dropped.
 */
void
ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt)
{
	size_t count;
	mblk_t *mp;
	mblk_t *tmp;
	ipf_t **ipfp = ipf->ipf_ptphn;	/* pointer that points at ipf */

	ASSERT(MUTEX_HELD(&ipfb->ipfb_lock));
	ASSERT(ipfp != NULL);
	ASSERT(ipf != NULL);

	while (ipf != NULL && free_cnt-- > 0) {
		count = ipf->ipf_count;
		mp = ipf->ipf_mp;
		/* Advance before freeing; mp chain includes ipf itself. */
		ipf = ipf->ipf_hash_next;
		/* Clear reassembly metadata stashed in each fragment. */
		for (tmp = mp; tmp; tmp = tmp->b_cont) {
			IP_REASS_SET_START(tmp, 0);
			IP_REASS_SET_END(tmp, 0);
		}
		atomic_add_32(&ill->ill_frag_count, -count);
		ASSERT(ipfb->ipfb_count >= count);
		ipfb->ipfb_count -= count;
		ASSERT(ipfb->ipfb_frag_pkts > 0);
		ipfb->ipfb_frag_pkts--;
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails);
		ip_drop_input("ipIfStatsReasmFails", mp, ill);
		freemsg(mp);
	}

	/*
	 * Relink the survivors: the pointer that used to reference the
	 * first freed packet now references the first surviving one.
	 */
	if (ipf)
		ipf->ipf_ptphn = ipfp;
	ipfp[0] = ipf;
}
2749 2750
2750 2751 /*
2751 2752 * Helper function for ill_forward_set().
2752 2753 */
2753 2754 static void
2754 2755 ill_forward_set_on_ill(ill_t *ill, boolean_t enable)
2755 2756 {
2756 2757 ip_stack_t *ipst = ill->ill_ipst;
2757 2758
2758 2759 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock));
2759 2760
2760 2761 ip1dbg(("ill_forward_set: %s %s forwarding on %s",
2761 2762 (enable ? "Enabling" : "Disabling"),
2762 2763 (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name));
2763 2764 mutex_enter(&ill->ill_lock);
2764 2765 if (enable)
2765 2766 ill->ill_flags |= ILLF_ROUTER;
2766 2767 else
2767 2768 ill->ill_flags &= ~ILLF_ROUTER;
2768 2769 mutex_exit(&ill->ill_lock);
2769 2770 if (ill->ill_isv6)
2770 2771 ill_set_nce_router_flags(ill, enable);
2771 2772 /* Notify routing socket listeners of this change. */
2772 2773 if (ill->ill_ipif != NULL)
2773 2774 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
2774 2775 }
2775 2776
2776 2777 /*
2777 2778 * Set an ill's ILLF_ROUTER flag appropriately. Send up RTS_IFINFO routing
2778 2779 * socket messages for each interface whose flags we change.
2779 2780 */
2780 2781 int
2781 2782 ill_forward_set(ill_t *ill, boolean_t enable)
2782 2783 {
2783 2784 ipmp_illgrp_t *illg;
2784 2785 ip_stack_t *ipst = ill->ill_ipst;
2785 2786
2786 2787 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock));
2787 2788
2788 2789 if ((enable && (ill->ill_flags & ILLF_ROUTER)) ||
2789 2790 (!enable && !(ill->ill_flags & ILLF_ROUTER)))
2790 2791 return (0);
2791 2792
2792 2793 if (IS_LOOPBACK(ill))
2793 2794 return (EINVAL);
2794 2795
2795 2796 if (enable && ill->ill_allowed_ips_cnt > 0)
2796 2797 return (EPERM);
2797 2798
2798 2799 if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) {
2799 2800 /*
2800 2801 * Update all of the interfaces in the group.
2801 2802 */
2802 2803 illg = ill->ill_grp;
2803 2804 ill = list_head(&illg->ig_if);
2804 2805 for (; ill != NULL; ill = list_next(&illg->ig_if, ill))
2805 2806 ill_forward_set_on_ill(ill, enable);
2806 2807
2807 2808 /*
2808 2809 * Update the IPMP meta-interface.
2809 2810 */
2810 2811 ill_forward_set_on_ill(ipmp_illgrp_ipmp_ill(illg), enable);
2811 2812 return (0);
2812 2813 }
2813 2814
2814 2815 ill_forward_set_on_ill(ill, enable);
2815 2816 return (0);
2816 2817 }
2817 2818
2818 2819 /*
2819 2820 * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for
2820 2821 * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately
2821 2822 * set or clear.
2822 2823 */
2823 2824 static void
2824 2825 ill_set_nce_router_flags(ill_t *ill, boolean_t enable)
2825 2826 {
2826 2827 ipif_t *ipif;
2827 2828 ncec_t *ncec;
2828 2829 nce_t *nce;
2829 2830
2830 2831 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
2831 2832 /*
2832 2833 * NOTE: we match across the illgrp because nce's for
2833 2834 * addresses on IPMP interfaces have an nce_ill that points to
2834 2835 * the bound underlying ill.
2835 2836 */
2836 2837 nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr);
2837 2838 if (nce != NULL) {
2838 2839 ncec = nce->nce_common;
2839 2840 mutex_enter(&ncec->ncec_lock);
2840 2841 if (enable)
2841 2842 ncec->ncec_flags |= NCE_F_ISROUTER;
2842 2843 else
2843 2844 ncec->ncec_flags &= ~NCE_F_ISROUTER;
2844 2845 mutex_exit(&ncec->ncec_lock);
2845 2846 nce_refrele(nce);
2846 2847 }
2847 2848 }
2848 2849 }
2849 2850
2850 2851 /*
2851 2852 * Intializes the context structure and returns the first ill in the list
2852 2853 * cuurently start_list and end_list can have values:
2853 2854 * MAX_G_HEADS Traverse both IPV4 and IPV6 lists.
2854 2855 * IP_V4_G_HEAD Traverse IPV4 list only.
2855 2856 * IP_V6_G_HEAD Traverse IPV6 list only.
2856 2857 */
2857 2858
2858 2859 /*
2859 2860 * We don't check for CONDEMNED ills here. Caller must do that if
2860 2861 * necessary under the ill lock.
2861 2862 */
2862 2863 ill_t *
2863 2864 ill_first(int start_list, int end_list, ill_walk_context_t *ctx,
2864 2865 ip_stack_t *ipst)
2865 2866 {
2866 2867 ill_if_t *ifp;
2867 2868 ill_t *ill;
2868 2869 avl_tree_t *avl_tree;
2869 2870
2870 2871 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
2871 2872 ASSERT(end_list <= MAX_G_HEADS && start_list >= 0);
2872 2873
2873 2874 /*
2874 2875 * setup the lists to search
2875 2876 */
2876 2877 if (end_list != MAX_G_HEADS) {
2877 2878 ctx->ctx_current_list = start_list;
2878 2879 ctx->ctx_last_list = end_list;
2879 2880 } else {
2880 2881 ctx->ctx_last_list = MAX_G_HEADS - 1;
2881 2882 ctx->ctx_current_list = 0;
2882 2883 }
2883 2884
2884 2885 while (ctx->ctx_current_list <= ctx->ctx_last_list) {
2885 2886 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst);
2886 2887 if (ifp != (ill_if_t *)
2887 2888 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) {
2888 2889 avl_tree = &ifp->illif_avl_by_ppa;
2889 2890 ill = avl_first(avl_tree);
2890 2891 /*
2891 2892 * ill is guaranteed to be non NULL or ifp should have
2892 2893 * not existed.
2893 2894 */
2894 2895 ASSERT(ill != NULL);
2895 2896 return (ill);
2896 2897 }
2897 2898 ctx->ctx_current_list++;
2898 2899 }
2899 2900
2900 2901 return (NULL);
2901 2902 }
2902 2903
2903 2904 /*
2904 2905 * returns the next ill in the list. ill_first() must have been called
2905 2906 * before calling ill_next() or bad things will happen.
2906 2907 */
2907 2908
2908 2909 /*
2909 2910 * We don't check for CONDEMNED ills here. Caller must do that if
2910 2911 * necessary under the ill lock.
2911 2912 */
2912 2913 ill_t *
2913 2914 ill_next(ill_walk_context_t *ctx, ill_t *lastill)
2914 2915 {
2915 2916 ill_if_t *ifp;
2916 2917 ill_t *ill;
2917 2918 ip_stack_t *ipst = lastill->ill_ipst;
2918 2919
2919 2920 ASSERT(lastill->ill_ifptr != (ill_if_t *)
2920 2921 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst));
2921 2922 if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill,
2922 2923 AVL_AFTER)) != NULL) {
2923 2924 return (ill);
2924 2925 }
2925 2926
2926 2927 /* goto next ill_ifp in the list. */
2927 2928 ifp = lastill->ill_ifptr->illif_next;
2928 2929
2929 2930 /* make sure not at end of circular list */
2930 2931 while (ifp ==
2931 2932 (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) {
2932 2933 if (++ctx->ctx_current_list > ctx->ctx_last_list)
2933 2934 return (NULL);
2934 2935 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst);
2935 2936 }
2936 2937
2937 2938 return (avl_first(&ifp->illif_avl_by_ppa));
2938 2939 }
2939 2940
/*
 * Check interface name for correct format: [a-zA-Z]+[a-zA-Z0-9._]*[0-9]+
 * The final number (PPA) must not have any leading zeros. Upon success, a
 * pointer to the start of the PPA is returned; otherwise NULL is returned.
 */
static char *
ill_get_ppa_ptr(char *name)
{
	int len = strlen(name);
	int last = len - 1;
	int start, i;

	/* Must begin with a letter and end with a digit. */
	if (len == 0 || !isalpha(name[0]) || !isdigit(name[last]))
		return (NULL);

	/* Scan backwards to find where the trailing digit run begins. */
	start = last;
	while (start > 0 && isdigit(name[start - 1]))
		start--;

	/* A multi-digit PPA must not have a leading zero. */
	if (name[start] == '0' && start < last)
		return (NULL);

	/*
	 * The characters between the leading letter and the PPA must all
	 * be drawn from [a-zA-Z0-9._].
	 */
	for (i = 1; i < start; i++) {
		if (!isalpha(name[i]) && !isdigit(name[i]) &&
		    name[i] != '.' && name[i] != '_') {
			return (NULL);
		}
	}

	return (name + start);
}
2981 2982
/*
 * use avl tree to locate the ill.
 *
 * `name' must be of the form <prefix><ppa> (validated by
 * ill_get_ppa_ptr()).  The per-type list for the prefix is searched,
 * then the ppa is looked up in that type's AVL tree.  On success the
 * ill is returned with a reference held (caller must ill_refrele());
 * otherwise NULL.
 */
static ill_t *
ill_find_by_name(char *name, boolean_t isv6, ip_stack_t *ipst)
{
	char *ppa_ptr = NULL;
	int len;
	uint_t ppa;
	ill_t *ill = NULL;
	ill_if_t *ifp;
	int list;

	/*
	 * get ppa ptr
	 */
	if (isv6)
		list = IP_V6_G_HEAD;
	else
		list = IP_V4_G_HEAD;

	if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) {
		return (NULL);
	}

	/* Length of the type prefix plus one (matches illif_name_len). */
	len = ppa_ptr - name + 1;

	ppa = stoi(&ppa_ptr);

	ifp = IP_VX_ILL_G_LIST(list, ipst);

	while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) {
		/*
		 * match is done on len - 1 as the name is not null
		 * terminated it contains ppa in addition to the interface
		 * name.
		 */
		if ((ifp->illif_name_len == len) &&
		    bcmp(ifp->illif_name, name, len - 1) == 0) {
			break;
		} else {
			ifp = ifp->illif_next;
		}
	}

	if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) {
		/*
		 * Even the interface type does not exist.
		 */
		return (NULL);
	}

	ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL);
	if (ill != NULL) {
		mutex_enter(&ill->ill_lock);
		/* Only hand back ills that are still eligible for lookup. */
		if (ILL_CAN_LOOKUP(ill)) {
			ill_refhold_locked(ill);
			mutex_exit(&ill->ill_lock);
			return (ill);
		}
		mutex_exit(&ill->ill_lock);
	}
	return (NULL);
}
3046 3047
3047 3048 /*
3048 3049 * comparison function for use with avl.
3049 3050 */
3050 3051 static int
3051 3052 ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr)
3052 3053 {
3053 3054 uint_t ppa;
3054 3055 uint_t ill_ppa;
3055 3056
3056 3057 ASSERT(ppa_ptr != NULL && ill_ptr != NULL);
3057 3058
3058 3059 ppa = *((uint_t *)ppa_ptr);
3059 3060 ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa;
3060 3061 /*
3061 3062 * We want the ill with the lowest ppa to be on the
3062 3063 * top.
3063 3064 */
3064 3065 if (ill_ppa < ppa)
3065 3066 return (1);
3066 3067 if (ill_ppa > ppa)
3067 3068 return (-1);
3068 3069 return (0);
3069 3070 }
3070 3071
3071 3072 /*
3072 3073 * remove an interface type from the global list.
3073 3074 */
3074 3075 static void
3075 3076 ill_delete_interface_type(ill_if_t *interface)
3076 3077 {
3077 3078 ASSERT(interface != NULL);
3078 3079 ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0);
3079 3080
3080 3081 avl_destroy(&interface->illif_avl_by_ppa);
3081 3082 if (interface->illif_ppa_arena != NULL)
3082 3083 vmem_destroy(interface->illif_ppa_arena);
3083 3084
3084 3085 remque(interface);
3085 3086
3086 3087 mi_free(interface);
3087 3088 }
3088 3089
/*
 * remove ill from the global list.
 *
 * Unlinks the ill from its interface type's AVL tree (freeing its ppa
 * and, if that was the last ill of the type, the ill_if_t itself),
 * dispatches a final NE_UNPLUMB event, and detaches the ill from its
 * phyint.  When no ill (v4 or v6) remains on the phyint, the phyint is
 * removed from the global index/name AVL trees and freed as well.
 */
static void
ill_glist_delete(ill_t *ill)
{
	ip_stack_t *ipst;
	phyint_t *phyi;

	if (ill == NULL)
		return;
	ipst = ill->ill_ipst;
	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);

	/*
	 * If the ill was never inserted into the AVL tree
	 * we skip the if branch.
	 */
	if (ill->ill_ifptr != NULL) {
		/*
		 * remove from AVL tree and free ppa number
		 */
		avl_remove(&ill->ill_ifptr->illif_avl_by_ppa, ill);

		if (ill->ill_ifptr->illif_ppa_arena != NULL) {
			/* ppa is stored biased by one; see ill_alloc_ppa() */
			vmem_free(ill->ill_ifptr->illif_ppa_arena,
			    (void *)(uintptr_t)(ill->ill_ppa+1), 1);
		}
		if (avl_numnodes(&ill->ill_ifptr->illif_avl_by_ppa) == 0) {
			/* Last ill of this interface type; drop the type. */
			ill_delete_interface_type(ill->ill_ifptr);
		}

		/*
		 * Indicate ill is no longer in the list.
		 */
		ill->ill_ifptr = NULL;
		ill->ill_name_length = 0;
		ill->ill_name[0] = '\0';
		ill->ill_ppa = UINT_MAX;
	}

	/* Generate one last event for this ill. */
	ill_nic_event_dispatch(ill, 0, NE_UNPLUMB, ill->ill_name,
	    ill->ill_name_length);

	ASSERT(ill->ill_phyint != NULL);
	phyi = ill->ill_phyint;
	ill->ill_phyint = NULL;

	/*
	 * ill_init allocates a phyint always to store the copy
	 * of flags relevant to phyint. At that point in time, we could
	 * not assign the name and hence phyint_illv4/v6 could not be
	 * initialized. Later in ipif_set_values, we assign the name to
	 * the ill, at which point in time we assign phyint_illv4/v6.
	 * Thus we don't rely on phyint_illv6 to be initialized always.
	 */
	if (ill->ill_flags & ILLF_IPV6)
		phyi->phyint_illv6 = NULL;
	else
		phyi->phyint_illv4 = NULL;

	/* The other protocol's ill may still reference this phyint. */
	if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) {
		rw_exit(&ipst->ips_ill_g_lock);
		return;
	}

	/*
	 * There are no ills left on this phyint; pull it out of the phyint
	 * avl trees, and free it.
	 */
	if (phyi->phyint_ifindex > 0) {
		avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
		    phyi);
		avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
		    phyi);
	}
	rw_exit(&ipst->ips_ill_g_lock);

	phyint_free(phyi);
}
3170 3171
/*
 * allocate a ppa, if the number of plumbed interfaces of this type are
 * less than ill_no_arena do a linear search to find a unused ppa.
 * When the number goes beyond ill_no_arena switch to using an arena.
 * Note: ppa value of zero cannot be allocated from vmem_arena as it
 * is the return value for an error condition, so allocation starts at one
 * and is decremented by one.
 *
 * If ill->ill_ppa is UINT_MAX, any free ppa will do (EAGAIN if none is
 * available); otherwise the specific ppa in ill->ill_ppa is requested
 * (EEXIST if it is taken).  Returns 0 with ill->ill_ppa set on success.
 */
static int
ill_alloc_ppa(ill_if_t *ifp, ill_t *ill)
{
	ill_t *tmp_ill;
	uint_t start, end;
	int ppa;

	if (ifp->illif_ppa_arena == NULL &&
	    (avl_numnodes(&ifp->illif_avl_by_ppa) + 1 > ill_no_arena)) {
		/*
		 * Create an arena.
		 */
		ifp->illif_ppa_arena = vmem_create(ifp->illif_name,
		    (void *)1, UINT_MAX - 1, 1, NULL, NULL,
		    NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
		/* allocate what has already been assigned */
		for (tmp_ill = avl_first(&ifp->illif_avl_by_ppa);
		    tmp_ill != NULL; tmp_ill = avl_walk(&ifp->illif_avl_by_ppa,
		    tmp_ill, AVL_AFTER)) {
			/*
			 * Reserve exactly [ppa+1, ppa+2) so the arena
			 * records this ppa as in use (biased by one).
			 */
			ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena,
			    1,		/* size */
			    1,		/* align/quantum */
			    0,		/* phase */
			    0,		/* nocross */
			    /* minaddr */
			    (void *)((uintptr_t)tmp_ill->ill_ppa + 1),
			    /* maxaddr */
			    (void *)((uintptr_t)tmp_ill->ill_ppa + 2),
			    VM_NOSLEEP|VM_FIRSTFIT);
			if (ppa == 0) {
				ip1dbg(("ill_alloc_ppa: ppa allocation"
				    " failed while switching"));
				/* Give up on the arena; fall back below. */
				vmem_destroy(ifp->illif_ppa_arena);
				ifp->illif_ppa_arena = NULL;
				break;
			}
		}
	}

	if (ifp->illif_ppa_arena != NULL) {
		if (ill->ill_ppa == UINT_MAX) {
			/* Any free ppa will do. */
			ppa = (int)(uintptr_t)vmem_alloc(ifp->illif_ppa_arena,
			    1, VM_NOSLEEP|VM_FIRSTFIT);
			if (ppa == 0)
				return (EAGAIN);
			ill->ill_ppa = --ppa;	/* undo the +1 bias */
		} else {
			/* A specific ppa was requested; try just that one. */
			ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena,
			    1, 		/* size */
			    1, 		/* align/quantum */
			    0, 		/* phase */
			    0, 		/* nocross */
			    (void *)(uintptr_t)(ill->ill_ppa + 1), /* minaddr */
			    (void *)(uintptr_t)(ill->ill_ppa + 2), /* maxaddr */
			    VM_NOSLEEP|VM_FIRSTFIT);
			/*
			 * Most likely the allocation failed because
			 * the requested ppa was in use.
			 */
			if (ppa == 0)
				return (EEXIST);
		}
		return (0);
	}

	/*
	 * No arena is in use and not enough (>ill_no_arena) interfaces have
	 * been plumbed to create one. Do a linear search to get a unused ppa.
	 */
	if (ill->ill_ppa == UINT_MAX) {
		end = UINT_MAX - 1;
		start = 0;
	} else {
		end = start = ill->ill_ppa;
	}

	/*
	 * Walk the ills in ppa order from `start', looking for the first
	 * value that is not already taken (the tree is sorted by ppa).
	 */
	tmp_ill = avl_find(&ifp->illif_avl_by_ppa, (void *)&start, NULL);
	while (tmp_ill != NULL && tmp_ill->ill_ppa == start) {
		if (start++ >= end) {
			if (ill->ill_ppa == UINT_MAX)
				return (EAGAIN);
			else
				return (EEXIST);
		}
		tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, tmp_ill, AVL_AFTER);
	}
	ill->ill_ppa = start;
	return (0);
}
3268 3269
3269 3270 /*
3270 3271 * Insert ill into the list of configured ill's. Once this function completes,
3271 3272 * the ill is globally visible and is available through lookups. More precisely
3272 3273 * this happens after the caller drops the ill_g_lock.
3273 3274 */
3274 3275 static int
3275 3276 ill_glist_insert(ill_t *ill, char *name, boolean_t isv6)
3276 3277 {
3277 3278 ill_if_t *ill_interface;
3278 3279 avl_index_t where = 0;
3279 3280 int error;
3280 3281 int name_length;
3281 3282 int index;
3282 3283 boolean_t check_length = B_FALSE;
3283 3284 ip_stack_t *ipst = ill->ill_ipst;
3284 3285
3285 3286 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
3286 3287
3287 3288 name_length = mi_strlen(name) + 1;
3288 3289
3289 3290 if (isv6)
3290 3291 index = IP_V6_G_HEAD;
3291 3292 else
3292 3293 index = IP_V4_G_HEAD;
3293 3294
3294 3295 ill_interface = IP_VX_ILL_G_LIST(index, ipst);
3295 3296 /*
3296 3297 * Search for interface type based on name
3297 3298 */
3298 3299 while (ill_interface != (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) {
3299 3300 if ((ill_interface->illif_name_len == name_length) &&
3300 3301 (strcmp(ill_interface->illif_name, name) == 0)) {
3301 3302 break;
3302 3303 }
3303 3304 ill_interface = ill_interface->illif_next;
3304 3305 }
3305 3306
3306 3307 /*
3307 3308 * Interface type not found, create one.
3308 3309 */
3309 3310 if (ill_interface == (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) {
3310 3311 ill_g_head_t ghead;
3311 3312
3312 3313 /*
3313 3314 * allocate ill_if_t structure
3314 3315 */
3315 3316 ill_interface = (ill_if_t *)mi_zalloc(sizeof (ill_if_t));
3316 3317 if (ill_interface == NULL) {
3317 3318 return (ENOMEM);
3318 3319 }
3319 3320
3320 3321 (void) strcpy(ill_interface->illif_name, name);
3321 3322 ill_interface->illif_name_len = name_length;
3322 3323
3323 3324 avl_create(&ill_interface->illif_avl_by_ppa,
3324 3325 ill_compare_ppa, sizeof (ill_t),
3325 3326 offsetof(struct ill_s, ill_avl_byppa));
3326 3327
3327 3328 /*
3328 3329 * link the structure in the back to maintain order
3329 3330 * of configuration for ifconfig output.
3330 3331 */
3331 3332 ghead = ipst->ips_ill_g_heads[index];
3332 3333 insque(ill_interface, ghead.ill_g_list_tail);
3333 3334 }
3334 3335
3335 3336 if (ill->ill_ppa == UINT_MAX)
3336 3337 check_length = B_TRUE;
3337 3338
3338 3339 error = ill_alloc_ppa(ill_interface, ill);
3339 3340 if (error != 0) {
3340 3341 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0)
3341 3342 ill_delete_interface_type(ill->ill_ifptr);
3342 3343 return (error);
3343 3344 }
3344 3345
3345 3346 /*
3346 3347 * When the ppa is choosen by the system, check that there is
3347 3348 * enough space to insert ppa. if a specific ppa was passed in this
3348 3349 * check is not required as the interface name passed in will have
3349 3350 * the right ppa in it.
3350 3351 */
3351 3352 if (check_length) {
3352 3353 /*
3353 3354 * UINT_MAX - 1 should fit in 10 chars, alloc 12 chars.
3354 3355 */
3355 3356 char buf[sizeof (uint_t) * 3];
3356 3357
3357 3358 /*
3358 3359 * convert ppa to string to calculate the amount of space
3359 3360 * required for it in the name.
3360 3361 */
3361 3362 numtos(ill->ill_ppa, buf);
3362 3363
3363 3364 /* Do we have enough space to insert ppa ? */
3364 3365
3365 3366 if ((mi_strlen(name) + mi_strlen(buf) + 1) > LIFNAMSIZ) {
3366 3367 /* Free ppa and interface type struct */
3367 3368 if (ill_interface->illif_ppa_arena != NULL) {
3368 3369 vmem_free(ill_interface->illif_ppa_arena,
3369 3370 (void *)(uintptr_t)(ill->ill_ppa+1), 1);
3370 3371 }
3371 3372 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0)
3372 3373 ill_delete_interface_type(ill->ill_ifptr);
3373 3374
3374 3375 return (EINVAL);
3375 3376 }
3376 3377 }
3377 3378
3378 3379 (void) sprintf(ill->ill_name, "%s%u", name, ill->ill_ppa);
3379 3380 ill->ill_name_length = mi_strlen(ill->ill_name) + 1;
3380 3381
3381 3382 (void) avl_find(&ill_interface->illif_avl_by_ppa, &ill->ill_ppa,
3382 3383 &where);
3383 3384 ill->ill_ifptr = ill_interface;
3384 3385 avl_insert(&ill_interface->illif_avl_by_ppa, ill, where);
3385 3386
3386 3387 ill_phyint_reinit(ill);
3387 3388 return (0);
3388 3389 }
3389 3390
/*
 * Initialize the per phyint ipsq used for serialization.  If `enter' is
 * set the calling thread becomes the ipsq writer immediately.  Returns
 * B_FALSE on allocation failure, B_TRUE otherwise.
 */
static boolean_t
ipsq_init(ill_t *ill, boolean_t enter)
{
	ipsq_t *ipsq;
	ipxop_t *ipx;

	if ((ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP)) == NULL)
		return (B_FALSE);

	ill->ill_phyint->phyint_ipsq = ipsq;
	/* The ipsq starts out using its own embedded exclusive-op struct. */
	ipx = ipsq->ipsq_xop = &ipsq->ipsq_ownxop;
	ipx->ipx_ipsq = ipsq;
	ipsq->ipsq_next = ipsq;		/* singleton circular list */
	ipsq->ipsq_phyint = ill->ill_phyint;
	mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0);
	mutex_init(&ipx->ipx_lock, NULL, MUTEX_DEFAULT, 0);
	ipsq->ipsq_ipst = ill->ill_ipst;	/* No netstack_hold */
	if (enter) {
		/* Make the caller the current (sole) writer. */
		ipx->ipx_writer = curthread;
		ipx->ipx_forced = B_FALSE;
		ipx->ipx_reentry_cnt = 1;
#ifdef DEBUG
		ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
#endif
	}
	return (B_TRUE);
}
3418 3419
/*
 * Here we perform initialisation of the ill_t common to both regular
 * interface ILLs and the special loopback ILL created by ill_lookup_on_name.
 *
 * `q' is the device read queue (unused for loopback); `ipsq_enter' makes
 * the calling thread the ipsq writer on success.  Returns 0 or ENOMEM;
 * on failure everything allocated here is freed again.
 */
static int
ill_init_common(ill_t *ill, queue_t *q, boolean_t isv6, boolean_t is_loopback,
    boolean_t ipsq_enter)
{
	int count;
	uchar_t *frag_ptr;

	mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0);
	mutex_init(&ill->ill_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL);
	ill->ill_saved_ire_cnt = 0;

	if (is_loopback) {
		ill->ill_max_frag = isv6 ? ip_loopback_mtu_v6plus :
		    ip_loopback_mtuplus;
		/*
		 * No resolver here.
		 */
		ill->ill_net_type = IRE_LOOPBACK;
	} else {
		ill->ill_rq = q;
		ill->ill_wq = WR(q);
		/* UINT_MAX: no ppa assigned yet (see ill_alloc_ppa). */
		ill->ill_ppa = UINT_MAX;
	}

	ill->ill_isv6 = isv6;

	/*
	 * Allocate sufficient space to contain our fragment hash table and
	 * the device name.
	 */
	frag_ptr = (uchar_t *)mi_zalloc(ILL_FRAG_HASH_TBL_SIZE + 2 * LIFNAMSIZ);
	if (frag_ptr == NULL)
		return (ENOMEM);
	ill->ill_frag_ptr = frag_ptr;
	ill->ill_frag_free_num_pkts = 0;
	ill->ill_last_frag_clean_time = 0;
	/* The hash table sits first; the name area follows it. */
	ill->ill_frag_hash_tbl = (ipfb_t *)frag_ptr;
	ill->ill_name = (char *)(frag_ptr + ILL_FRAG_HASH_TBL_SIZE);
	for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
		mutex_init(&ill->ill_frag_hash_tbl[count].ipfb_lock,
		    NULL, MUTEX_DEFAULT, NULL);
	}

	ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t));
	if (ill->ill_phyint == NULL) {
		mi_free(frag_ptr);
		return (ENOMEM);
	}

	mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0);
	if (isv6) {
		ill->ill_phyint->phyint_illv6 = ill;
	} else {
		ill->ill_phyint->phyint_illv4 = ill;
	}
	if (is_loopback) {
		phyint_flags_init(ill->ill_phyint, DL_LOOP);
	}

	list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node));

	ill_set_inputfn(ill);

	if (!ipsq_init(ill, ipsq_enter)) {
		/* Undo the two allocations above before failing. */
		mi_free(frag_ptr);
		mi_free(ill->ill_phyint);
		return (ENOMEM);
	}

	/* Frag queue limit stuff */
	ill->ill_frag_count = 0;
	ill->ill_ipf_gen = 0;

	rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL);
	ill->ill_global_timer = INFINITY;
	ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0;
	ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0;
	ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
	ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL;

	/*
	 * Initialize IPv6 configuration variables. The IP module is always
	 * opened as an IPv4 module. Instead tracking down the cases where
	 * it switches to do ipv6, we'll just initialize the IPv6 configuration
	 * here for convenience, this has no effect until the ill is set to do
	 * IPv6.
	 */
	ill->ill_reachable_time = ND_REACHABLE_TIME;
	ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT;
	ill->ill_max_buf = ND_MAX_Q;
	ill->ill_refcnt = 0;

	cv_init(&ill->ill_dlpi_capab_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&ill->ill_dlpi_capab_lock, NULL, MUTEX_DEFAULT, NULL);

	return (0);
}
3521 3519
3522 3520 /*
3523 3521 * ill_init is called by ip_open when a device control stream is opened.
3524 3522 * It does a few initializations, and shoots a DL_INFO_REQ message down
3525 3523 * to the driver. The response is later picked up in ip_rput_dlpi and
3526 3524 * used to set up default mechanisms for talking to the driver. (Always
3527 3525 * called as writer.)
3528 3526 *
3529 3527 * If this function returns error, ip_open will call ip_close which in
3530 3528 * turn will call ill_delete to clean up any memory allocated here that
3531 3529 * is not yet freed.
3532 3530 *
3533 3531 * Note: ill_ipst and ill_zoneid must be set before calling ill_init.
3534 3532 */
3535 3533 int
3536 3534 ill_init(queue_t *q, ill_t *ill)
3537 3535 {
3538 3536 int ret;
3539 3537 dl_info_req_t *dlir;
3540 3538 mblk_t *info_mp;
3541 3539
/*
 * Sized for the larger of the request and the ack; presumably the same
 * mblk can then carry the DL_INFO_ACK reply — TODO confirm against the
 * DLPI provider behavior.
 */
3542 3540 info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)),
3543 3541 BPRI_HI);
3544 3542 if (info_mp == NULL)
3545 3543 return (ENOMEM);
3546 3544
3547 3545 /*
3548 3546 * For now pretend this is a v4 ill. We need to set phyint_ill*
3549 3547 * at this point because of the following reason. If we can't
3550 3548 * enter the ipsq at some point and cv_wait, the writer that
3551 3549 * wakes us up tries to locate us using the list of all phyints
3552 3550 * in an ipsq and the ills from the phyint thru the phyint_ill*.
3553 3551 * If we don't set it now, we risk a missed wakeup.
3554 3552 */
3555 3553 if ((ret = ill_init_common(ill, q, B_FALSE, B_FALSE, B_TRUE)) != 0) {
3556 3554 freemsg(info_mp);
3557 3555 return (ret);
3558 3556 }
3559 3557
/* Link-layer subnet details are unknown until the DL_INFO_ACK arrives. */
3560 3558 ill->ill_state_flags |= ILL_LL_SUBNET_PENDING;
3561 3559
3562 3560 /* Send down the Info Request to the driver. */
3563 3561 info_mp->b_datap->db_type = M_PCPROTO;
3564 3562 dlir = (dl_info_req_t *)info_mp->b_rptr;
3565 3563 info_mp->b_wptr = (uchar_t *)&dlir[1];
3566 3564 dlir->dl_primitive = DL_INFO_REQ;
3567 3565
/* No DLPI exchange is outstanding yet. */
3568 3566 ill->ill_dlpi_pending = DL_PRIM_INVAL;
3569 3567
/* Turn on queue processing before pushing the first message down. */
3570 3568 qprocson(q);
3571 3569 ill_dlpi_send(ill, info_mp);
3572 3570
3573 3571 return (0);
3574 3572 }
3575 3573
3576 3574 /*
3577 3575 * ill_dls_info
3578 3576 * creates datalink socket info from the device.
3579 3577 */
3580 3578 int
3581 3579 ill_dls_info(struct sockaddr_dl *sdl, const ill_t *ill)
3582 3580 {
3583 3581 size_t len;
3584 3582
3585 3583 sdl->sdl_family = AF_LINK;
3586 3584 sdl->sdl_index = ill_get_upper_ifindex(ill);
3587 3585 sdl->sdl_type = ill->ill_type;
3588 3586 ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data));
3589 3587 len = strlen(sdl->sdl_data);
3590 3588 ASSERT(len < 256);
3591 3589 sdl->sdl_nlen = (uchar_t)len;
3592 3590 sdl->sdl_alen = ill->ill_phys_addr_length;
3593 3591 sdl->sdl_slen = 0;
3594 3592 if (ill->ill_phys_addr_length != 0 && ill->ill_phys_addr != NULL)
3595 3593 bcopy(ill->ill_phys_addr, &sdl->sdl_data[len], sdl->sdl_alen);
3596 3594
3597 3595 return (sizeof (struct sockaddr_dl));
3598 3596 }
3599 3597
3600 3598 /*
3601 3599 * ill_xarp_info
3602 3600 * creates xarp info from the device.
3603 3601 */
3604 3602 static int
3605 3603 ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill)
3606 3604 {
3607 3605 sdl->sdl_family = AF_LINK;
3608 3606 sdl->sdl_index = ill->ill_phyint->phyint_ifindex;
3609 3607 sdl->sdl_type = ill->ill_type;
3610 3608 ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data));
3611 3609 sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data);
3612 3610 sdl->sdl_alen = ill->ill_phys_addr_length;
3613 3611 sdl->sdl_slen = 0;
3614 3612 return (sdl->sdl_nlen);
3615 3613 }
3616 3614
3617 3615 static int
3618 3616 loopback_kstat_update(kstat_t *ksp, int rw)
3619 3617 {
3620 3618 kstat_named_t *kn;
3621 3619 netstackid_t stackid;
3622 3620 netstack_t *ns;
3623 3621 ip_stack_t *ipst;
3624 3622
3625 3623 if (ksp == NULL || ksp->ks_data == NULL)
3626 3624 return (EIO);
3627 3625
3628 3626 if (rw == KSTAT_WRITE)
3629 3627 return (EACCES);
3630 3628
3631 3629 kn = KSTAT_NAMED_PTR(ksp);
3632 3630 stackid = (zoneid_t)(uintptr_t)ksp->ks_private;
3633 3631
3634 3632 ns = netstack_find_by_stackid(stackid);
3635 3633 if (ns == NULL)
3636 3634 return (-1);
3637 3635
3638 3636 ipst = ns->netstack_ip;
3639 3637 if (ipst == NULL) {
3640 3638 netstack_rele(ns);
3641 3639 return (-1);
3642 3640 }
3643 3641 kn[0].value.ui32 = ipst->ips_loopback_packets;
3644 3642 kn[1].value.ui32 = ipst->ips_loopback_packets;
3645 3643 netstack_rele(ns);
3646 3644 return (0);
3647 3645 }
3648 3646
3649 3647 /*
3650 3648 * Has ifindex been plumbed already?
3651 3649 */
3652 3650 static boolean_t
3653 3651 phyint_exists(uint_t index, ip_stack_t *ipst)
3654 3652 {
3655 3653 ASSERT(index != 0);
3656 3654 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
3657 3655
3658 3656 return (avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
3659 3657 &index, NULL) != NULL);
3660 3658 }
3661 3659
3662 3660 /*
3663 3661 * Pick a unique ifindex.
3664 3662 * When the index counter passes IF_INDEX_MAX for the first time, the wrap
3665 3663 * flag is set so that next time ip_assign_ifindex() is called, it
3666 3664 * falls through and resets the index counter back to 1, the minimum value
3667 3665 * for the interface index. The logic below assumes that ips_ill_index
3668 3666 * can hold a value of IF_INDEX_MAX+1 without there being any loss
3669 3667 * (i.e. reset back to 0.)
3670 3668 */
3671 3669 boolean_t
3672 3670 ip_assign_ifindex(uint_t *indexp, ip_stack_t *ipst)
3673 3671 {
3674 3672 uint_t loops;
3675 3673
3676 3674 if (!ipst->ips_ill_index_wrap) {
3677 3675 *indexp = ipst->ips_ill_index++;
3678 3676 if (ipst->ips_ill_index > IF_INDEX_MAX) {
3679 3677 /*
3680 3678 * Reached the maximum ifindex value, set the wrap
3681 3679 * flag to indicate that it is no longer possible
3682 3680 * to assume that a given index is unallocated.
3683 3681 */
3684 3682 ipst->ips_ill_index_wrap = B_TRUE;
3685 3683 }
3686 3684 return (B_TRUE);
3687 3685 }
3688 3686
3689 3687 if (ipst->ips_ill_index > IF_INDEX_MAX)
3690 3688 ipst->ips_ill_index = 1;
3691 3689
3692 3690 /*
3693 3691 * Start reusing unused indexes. Note that we hold the ill_g_lock
3694 3692 * at this point and don't want to call any function that attempts
3695 3693 * to get the lock again.
3696 3694 */
3697 3695 for (loops = IF_INDEX_MAX; loops > 0; loops--) {
3698 3696 if (!phyint_exists(ipst->ips_ill_index, ipst)) {
3699 3697 /* found unused index - use it */
3700 3698 *indexp = ipst->ips_ill_index;
3701 3699 return (B_TRUE);
3702 3700 }
3703 3701
3704 3702 ipst->ips_ill_index++;
3705 3703 if (ipst->ips_ill_index > IF_INDEX_MAX)
3706 3704 ipst->ips_ill_index = 1;
3707 3705 }
3708 3706
3709 3707 /*
3710 3708 * all interface indicies are inuse.
3711 3709 */
3712 3710 return (B_FALSE);
3713 3711 }
3714 3712
3715 3713 /*
3716 3714 * Assign a unique interface index for the phyint.
3717 3715 */
3718 3716 static boolean_t
3719 3717 phyint_assign_ifindex(phyint_t *phyi, ip_stack_t *ipst)
3720 3718 {
/* Callers must pass a phyint that has not yet been assigned an index. */
3721 3719 ASSERT(phyi->phyint_ifindex == 0);
3722 3720 return (ip_assign_ifindex(&phyi->phyint_ifindex, ipst));
3723 3721 }
3724 3722
3725 3723 /*
3726 3724 * Initialize the flags on `phyi' as per the provided mactype.
3727 3725 */
3728 3726 static void
3729 3727 phyint_flags_init(phyint_t *phyi, t_uscalar_t mactype)
3730 3728 {
3731 3729 uint64_t flags = 0;
3732 3730
3733 3731 /*
3734 3732 * Initialize PHYI_RUNNING and PHYI_FAILED. For non-IPMP interfaces,
3735 3733 * we always presume the underlying hardware is working and set
3736 3734 * PHYI_RUNNING (if it's not, the driver will subsequently send a
3737 3735 * DL_NOTE_LINK_DOWN message). For IPMP interfaces, at initialization
3738 3736 * there are no active interfaces in the group so we set PHYI_FAILED.
3739 3737 */
3740 3738 if (mactype == SUNW_DL_IPMP)
3741 3739 flags |= PHYI_FAILED;
3742 3740 else
3743 3741 flags |= PHYI_RUNNING;
3744 3742
3745 3743 switch (mactype) {
3746 3744 case SUNW_DL_VNI:
3747 3745 flags |= PHYI_VIRTUAL;
3748 3746 break;
3749 3747 case SUNW_DL_IPMP:
3750 3748 flags |= PHYI_IPMP;
3751 3749 break;
3752 3750 case DL_LOOP:
3753 3751 flags |= (PHYI_LOOPBACK | PHYI_VIRTUAL);
3754 3752 break;
3755 3753 }
3756 3754
3757 3755 mutex_enter(&phyi->phyint_lock);
3758 3756 phyi->phyint_flags |= flags;
3759 3757 mutex_exit(&phyi->phyint_lock);
3760 3758 }
3761 3759
3762 3760 /*
3763 3761 * Return a pointer to the ill which matches the supplied name. Note that
3764 3762 * the ill name length includes the null termination character. (May be
3765 3763 * called as writer.)
3766 3764 * If do_alloc and the interface is "lo0" it will be automatically created.
3767 3765 * Cannot bump up reference on condemned ills. So dup detect can't be done
3768 3766 * using this func.
3769 3767 */
3770 3768 ill_t *
3771 3769 ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
3772 3770 boolean_t *did_alloc, ip_stack_t *ipst)
3773 3771 {
3774 3772 ill_t *ill;
3775 3773 ipif_t *ipif;
3776 3774 ipsq_t *ipsq;
3777 3775 kstat_named_t *kn;
3778 3776 boolean_t isloopback;
3779 3777 in6_addr_t ov6addr;
3780 3778
3781 3779 isloopback = mi_strcmp(name, ipif_loopback_name) == 0;
3782 3780
/* Common case: the interface already exists; look it up as a reader. */
3783 3781 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3784 3782 ill = ill_find_by_name(name, isv6, ipst);
3785 3783 rw_exit(&ipst->ips_ill_g_lock);
3786 3784 if (ill != NULL)
3787 3785 return (ill);
3788 3786
3789 3787 /*
3790 3788 * Couldn't find it. Does this happen to be a lookup for the
3791 3789 * loopback device and are we allowed to allocate it?
3792 3790 */
3793 3791 if (!isloopback || !do_alloc)
3794 3792 return (NULL);
3795 3793
/*
 * Re-check under the writer lock: another thread may have created the
 * loopback ill between our read-locked lookup and here.
 */
3796 3794 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
3797 3795 ill = ill_find_by_name(name, isv6, ipst);
3798 3796 if (ill != NULL) {
3799 3797 rw_exit(&ipst->ips_ill_g_lock);
3800 3798 return (ill);
3801 3799 }
3802 3800
3803 3801 /* Create the loopback device on demand */
3804 3802 ill = (ill_t *)(mi_alloc(sizeof (ill_t) +
3805 3803 sizeof (ipif_loopback_name), BPRI_MED));
3806 3804 if (ill == NULL)
3807 3805 goto done;
3808 3806
3809 3807 bzero(ill, sizeof (*ill));
3810 3808 ill->ill_ipst = ipst;
3811 3809 netstack_hold(ipst->ips_netstack);
3812 3810 /*
3813 3811 * For exclusive stacks we set the zoneid to zero
3814 3812 * to make IP operate as if in the global zone.
3815 3813 */
3816 3814 ill->ill_zoneid = GLOBAL_ZONEID;
3817 3815
3818 3816 if (ill_init_common(ill, NULL, isv6, B_TRUE, B_FALSE) != 0)
3819 3817 goto done;
3820 3818
3821 3819 if (!ill_allocate_mibs(ill))
3822 3820 goto done;
3823 3821
3824 3822 ill->ill_current_frag = ill->ill_max_frag;
3825 3823 ill->ill_mtu = ill->ill_max_frag; /* Initial value */
3826 3824 ill->ill_mc_mtu = ill->ill_mtu;
3827 3825 /*
3828 3826 * ipif_loopback_name can't be pointed at directly because its used
3829 3827 * by both the ipv4 and ipv6 interfaces. When the ill is removed
3830 3828 * from the glist, ill_glist_delete() sets the first character of
3831 3829 * ill_name to '\0'.
3832 3830 */
3833 3831 ill->ill_name = (char *)ill + sizeof (*ill);
3834 3832 (void) strcpy(ill->ill_name, ipif_loopback_name);
3835 3833 ill->ill_name_length = sizeof (ipif_loopback_name);
3836 3834 /* Set ill_dlpi_pending for ipsq_current_finish() to work properly */
3837 3835 ill->ill_dlpi_pending = DL_PRIM_INVAL;
3838 3836
3839 3837 ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE, B_TRUE, NULL);
3840 3838 if (ipif == NULL)
3841 3839 goto done;
3842 3840
3843 3841 ill->ill_flags = ILLF_MULTICAST;
3844 3842
3845 3843 ov6addr = ipif->ipif_v6lcl_addr;
3846 3844 /* Set up default loopback address and mask. */
3847 3845 if (!isv6) {
3848 3846 ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK);
3849 3847
3850 3848 IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr);
3851 3849 V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask);
3852 3850 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
3853 3851 ipif->ipif_v6subnet);
3854 3852 ill->ill_flags |= ILLF_IPV4;
3855 3853 } else {
3856 3854 ipif->ipif_v6lcl_addr = ipv6_loopback;
3857 3855 ipif->ipif_v6net_mask = ipv6_all_ones;
3858 3856 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
3859 3857 ipif->ipif_v6subnet);
3860 3858 ill->ill_flags |= ILLF_IPV6;
3861 3859 }
3862 3860
3863 3861 /*
3864 3862 * Chain us in at the end of the ill list. hold the ill
3865 3863 * before we make it globally visible. 1 for the lookup.
3866 3864 */
3867 3865 ill_refhold(ill);
3868 3866
3869 3867 ipsq = ill->ill_phyint->phyint_ipsq;
3870 3868
3871 3869 if (ill_glist_insert(ill, "lo", isv6) != 0)
3872 3870 cmn_err(CE_PANIC, "cannot insert loopback interface");
3873 3871
3874 3872 /* Let SCTP know so that it can add this to its list */
3875 3873 sctp_update_ill(ill, SCTP_ILL_INSERT);
3876 3874
3877 3875 /*
3878 3876 * We have already assigned ipif_v6lcl_addr above, but we need to
3879 3877 * call sctp_update_ipif_addr() after SCTP_ILL_INSERT, which
3880 3878 * requires to be after ill_glist_insert() since we need the
3881 3879 * ill_index set. Pass on ipv6_loopback as the old address.
3882 3880 */
3883 3881 sctp_update_ipif_addr(ipif, ov6addr);
3884 3882
3885 3883 ip_rts_newaddrmsg(RTM_CHGADDR, 0, ipif, RTSQ_DEFAULT);
3886 3884
3887 3885 /*
3888 3886 * ill_glist_insert() -> ill_phyint_reinit() may have merged IPSQs.
3889 3887 * If so, free our original one.
3890 3888 */
3891 3889 if (ipsq != ill->ill_phyint->phyint_ipsq)
3892 3890 ipsq_delete(ipsq);
3893 3891
3894 3892 if (ipst->ips_loopback_ksp == NULL) {
3895 3893 /* Export loopback interface statistics */
3896 3894 ipst->ips_loopback_ksp = kstat_create_netstack("lo", 0,
3897 3895 ipif_loopback_name, "net",
3898 3896 KSTAT_TYPE_NAMED, 2, 0,
3899 3897 ipst->ips_netstack->netstack_stackid);
3900 3898 if (ipst->ips_loopback_ksp != NULL) {
3901 3899 ipst->ips_loopback_ksp->ks_update =
3902 3900 loopback_kstat_update;
3903 3901 kn = KSTAT_NAMED_PTR(ipst->ips_loopback_ksp);
3904 3902 kstat_named_init(&kn[0], "ipackets", KSTAT_DATA_UINT32);
3905 3903 kstat_named_init(&kn[1], "opackets", KSTAT_DATA_UINT32);
3906 3904 ipst->ips_loopback_ksp->ks_private =
3907 3905 (void *)(uintptr_t)ipst->ips_netstack->
3908 3906 netstack_stackid;
3909 3907 kstat_install(ipst->ips_loopback_ksp);
3910 3908 }
3911 3909 }
3912 3910
3913 3911 *did_alloc = B_TRUE;
3914 3912 rw_exit(&ipst->ips_ill_g_lock);
3915 3913 ill_nic_event_dispatch(ill, MAP_IPIF_ID(ill->ill_ipif->ipif_id),
3916 3914 NE_PLUMB, ill->ill_name, ill->ill_name_length);
3917 3915 return (ill);
/*
 * Allocation-failure cleanup: tear down whatever of the partially
 * constructed ill exists, in reverse order of construction.
 */
3918 3916 done:
3919 3917 if (ill != NULL) {
3920 3918 if (ill->ill_phyint != NULL) {
3921 3919 ipsq = ill->ill_phyint->phyint_ipsq;
3922 3920 if (ipsq != NULL) {
3923 3921 ipsq->ipsq_phyint = NULL;
3924 3922 ipsq_delete(ipsq);
3925 3923 }
3926 3924 mi_free(ill->ill_phyint);
3927 3925 }
3928 3926 ill_free_mib(ill);
3929 3927 if (ill->ill_ipst != NULL)
3930 3928 netstack_rele(ill->ill_ipst->ips_netstack);
3931 3929 mi_free(ill);
3932 3930 }
3933 3931 rw_exit(&ipst->ips_ill_g_lock);
3934 3932 return (NULL);
3935 3933 }
3936 3934
3937 3935 /*
3938 3936 * For IPP calls - use the ip_stack_t for global stack.
3939 3937 */
3940 3938 ill_t *
3941 3939 ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6)
3942 3940 {
3943 3941 ip_stack_t *ipst;
3944 3942 ill_t *ill;
3945 3943 netstack_t *ns;
3946 3944
3947 3945 ns = netstack_find_by_stackid(GLOBAL_NETSTACKID);
3948 3946
3949 3947 if ((ipst = ns->netstack_ip) == NULL) {
3950 3948 cmn_err(CE_WARN, "No ip_stack_t for zoneid zero!\n");
3951 3949 netstack_rele(ns);
3952 3950 return (NULL);
3953 3951 }
3954 3952
3955 3953 ill = ill_lookup_on_ifindex(index, isv6, ipst);
3956 3954 netstack_rele(ns);
3957 3955 return (ill);
3958 3956 }
3959 3957
3960 3958 /*
3961 3959 * Return a pointer to the ill which matches the index and IP version type.
3962 3960 */
3963 3961 ill_t *
3964 3962 ill_lookup_on_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst)
3965 3963 {
3966 3964 ill_t *ill;
3967 3965 phyint_t *phyi;
3968 3966
3969 3967 /*
3970 3968 * Indexes are stored in the phyint - a common structure
3971 3969 * to both IPv4 and IPv6.
3972 3970 */
3973 3971 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3974 3972 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
3975 3973 (void *) &index, NULL);
3976 3974 if (phyi != NULL) {
3977 3975 ill = isv6 ? phyi->phyint_illv6: phyi->phyint_illv4;
3978 3976 if (ill != NULL) {
3979 3977 mutex_enter(&ill->ill_lock);
3980 3978 if (!ILL_IS_CONDEMNED(ill)) {
3981 3979 ill_refhold_locked(ill);
3982 3980 mutex_exit(&ill->ill_lock);
3983 3981 rw_exit(&ipst->ips_ill_g_lock);
3984 3982 return (ill);
3985 3983 }
3986 3984 mutex_exit(&ill->ill_lock);
3987 3985 }
3988 3986 }
3989 3987 rw_exit(&ipst->ips_ill_g_lock);
3990 3988 return (NULL);
3991 3989 }
3992 3990
3993 3991 /*
3994 3992 * Verify whether or not an interface index is valid for the specified zoneid
3995 3993 * to transmit packets.
3996 3994 * It can be zero (meaning "reset") or an interface index assigned
3997 3995 * to a non-VNI interface. (We don't use VNI interface to send packets.)
3998 3996 */
3999 3997 boolean_t
4000 3998 ip_xmit_ifindex_valid(uint_t ifindex, zoneid_t zoneid, boolean_t isv6,
4001 3999 ip_stack_t *ipst)
4002 4000 {
4003 4001 ill_t *ill;
4004 4002
4005 4003 if (ifindex == 0)
4006 4004 return (B_TRUE);
4007 4005
4008 4006 ill = ill_lookup_on_ifindex_zoneid(ifindex, zoneid, isv6, ipst);
4009 4007 if (ill == NULL)
4010 4008 return (B_FALSE);
4011 4009 if (IS_VNI(ill)) {
4012 4010 ill_refrele(ill);
4013 4011 return (B_FALSE);
4014 4012 }
4015 4013 ill_refrele(ill);
4016 4014 return (B_TRUE);
4017 4015 }
4018 4016
4019 4017 /*
4020 4018 * Return the ifindex next in sequence after the passed in ifindex.
4021 4019 * If there is no next ifindex for the given protocol, return 0.
4022 4020 */
4023 4021 uint_t
4024 4022 ill_get_next_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst)
4025 4023 {
4026 4024 phyint_t *phyi;
4027 4025 phyint_t *phyi_initial;
4028 4026 uint_t ifindex;
4029 4027
4030 4028 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4031 4029
4032 4030 if (index == 0) {
4033 4031 phyi = avl_first(
4034 4032 &ipst->ips_phyint_g_list->phyint_list_avl_by_index);
4035 4033 } else {
4036 4034 phyi = phyi_initial = avl_find(
4037 4035 &ipst->ips_phyint_g_list->phyint_list_avl_by_index,
4038 4036 (void *) &index, NULL);
4039 4037 }
4040 4038
4041 4039 for (; phyi != NULL;
4042 4040 phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
4043 4041 phyi, AVL_AFTER)) {
4044 4042 /*
4045 4043 * If we're not returning the first interface in the tree
4046 4044 * and we still haven't moved past the phyint_t that
4047 4045 * corresponds to index, avl_walk needs to be called again
4048 4046 */
4049 4047 if (!((index != 0) && (phyi == phyi_initial))) {
4050 4048 if (isv6) {
4051 4049 if ((phyi->phyint_illv6) &&
4052 4050 ILL_CAN_LOOKUP(phyi->phyint_illv6) &&
4053 4051 (phyi->phyint_illv6->ill_isv6 == 1))
4054 4052 break;
4055 4053 } else {
4056 4054 if ((phyi->phyint_illv4) &&
4057 4055 ILL_CAN_LOOKUP(phyi->phyint_illv4) &&
4058 4056 (phyi->phyint_illv4->ill_isv6 == 0))
4059 4057 break;
4060 4058 }
4061 4059 }
4062 4060 }
4063 4061
4064 4062 rw_exit(&ipst->ips_ill_g_lock);
4065 4063
4066 4064 if (phyi != NULL)
4067 4065 ifindex = phyi->phyint_ifindex;
4068 4066 else
4069 4067 ifindex = 0;
4070 4068
4071 4069 return (ifindex);
4072 4070 }
4073 4071
4074 4072 /*
4075 4073 * Return the ifindex for the named interface.
4076 4074 * If there is no next ifindex for the interface, return 0.
4077 4075 */
4078 4076 uint_t
4079 4077 ill_get_ifindex_by_name(char *name, ip_stack_t *ipst)
4080 4078 {
4081 4079 phyint_t *phyi;
4082 4080 avl_index_t where = 0;
4083 4081 uint_t ifindex;
4084 4082
4085 4083 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4086 4084
4087 4085 if ((phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
4088 4086 name, &where)) == NULL) {
4089 4087 rw_exit(&ipst->ips_ill_g_lock);
4090 4088 return (0);
4091 4089 }
4092 4090
4093 4091 ifindex = phyi->phyint_ifindex;
4094 4092
4095 4093 rw_exit(&ipst->ips_ill_g_lock);
4096 4094
4097 4095 return (ifindex);
4098 4096 }
4099 4097
4100 4098 /*
4101 4099 * Return the ifindex to be used by upper layer protocols for instance
4102 4100 * for IPV6_RECVPKTINFO. If IPMP this is the one for the upper ill.
4103 4101 */
4104 4102 uint_t
4105 4103 ill_get_upper_ifindex(const ill_t *ill)
4106 4104 {
4107 4105 if (IS_UNDER_IPMP(ill))
4108 4106 return (ipmp_ill_get_ipmp_ifindex(ill));
4109 4107 else
4110 4108 return (ill->ill_phyint->phyint_ifindex);
4111 4109 }
4112 4110
4113 4111
4114 4112 /*
4115 4113 * Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt
4116 4114 * that gives a running thread a reference to the ill. This reference must be
4117 4115 * released by the thread when it is done accessing the ill and related
4118 4116 * objects. ill_refcnt can not be used to account for static references
4119 4117 * such as other structures pointing to an ill. Callers must generally
4120 4118 * check whether an ill can be refheld by using ILL_CAN_LOOKUP macros
4121 4119 * or be sure that the ill is not being deleted or changing state before
4122 4120 * calling the refhold functions. A non-zero ill_refcnt ensures that the
4123 4121 * ill won't change any of its critical state such as address, netmask etc.
4124 4122 */
4125 4123 void
4126 4124 ill_refhold(ill_t *ill)
4127 4125 {
4128 4126 mutex_enter(&ill->ill_lock);
4129 4127 ill->ill_refcnt++;
4130 4128 ILL_TRACE_REF(ill);
4131 4129 mutex_exit(&ill->ill_lock);
4132 4130 }
4133 4131
/* As ill_refhold(), for callers that already hold ill_lock. */
4134 4132 void
4135 4133 ill_refhold_locked(ill_t *ill)
4136 4134 {
4137 4135 ASSERT(MUTEX_HELD(&ill->ill_lock));
4138 4136 ill->ill_refcnt++;
4139 4137 ILL_TRACE_REF(ill);
4140 4138 }
4141 4139
4142 4140 /* Returns true if we managed to get a refhold */
4143 4141 boolean_t
4144 4142 ill_check_and_refhold(ill_t *ill)
4145 4143 {
4146 4144 mutex_enter(&ill->ill_lock);
4147 4145 if (!ILL_IS_CONDEMNED(ill)) {
4148 4146 ill_refhold_locked(ill);
4149 4147 mutex_exit(&ill->ill_lock);
4150 4148 return (B_TRUE);
4151 4149 }
4152 4150 mutex_exit(&ill->ill_lock);
4153 4151 return (B_FALSE);
4154 4152 }
4155 4153
4156 4154 /*
4157 4155 * Must not be called while holding any locks. Otherwise if this is
4158 4156 * the last reference to be released, there is a chance of recursive mutex
4159 4157 * panic due to ill_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
4160 4158 * to restart an ioctl.
4161 4159 */
4162 4160 void
4163 4161 ill_refrele(ill_t *ill)
4164 4162 {
4165 4163 mutex_enter(&ill->ill_lock);
4166 4164 ASSERT(ill->ill_refcnt != 0);
4167 4165 ill->ill_refcnt--;
4168 4166 ILL_UNTRACE_REF(ill);
4169 4167 if (ill->ill_refcnt != 0) {
4170 4168 /* Every ire pointing to the ill adds 1 to ill_refcnt */
4171 4169 mutex_exit(&ill->ill_lock);
4172 4170 return;
4173 4171 }
4174 4172
/*
 * Last reference dropped: let the tail routine finish teardown work.
 * Note it is called with ill_lock held and releases it itself.
 */
4175 4173 /* Drops the ill_lock */
4176 4174 ipif_ill_refrele_tail(ill);
4177 4175 }
4178 4176
4179 4177 /*
4180 4178 * Obtain a weak reference count on the ill. This reference ensures the
4181 4179 * ill won't be freed, but the ill may change any of its critical state
4182 4180 * such as netmask, address etc. Returns an error if the ill has started
4183 4181 * closing.
4184 4182 */
4185 4183 boolean_t
4186 4184 ill_waiter_inc(ill_t *ill)
4187 4185 {
4188 4186 mutex_enter(&ill->ill_lock);
4189 4187 if (ill->ill_state_flags & ILL_CONDEMNED) {
4190 4188 mutex_exit(&ill->ill_lock);
4191 4189 return (B_FALSE);
4192 4190 }
4193 4191 ill->ill_waiters++;
4194 4192 mutex_exit(&ill->ill_lock);
4195 4193 return (B_TRUE);
4196 4194 }
4197 4195
4198 4196 void
4199 4197 ill_waiter_dcr(ill_t *ill)
4200 4198 {
4201 4199 mutex_enter(&ill->ill_lock);
4202 4200 ill->ill_waiters--;
4203 4201 if (ill->ill_waiters == 0)
4204 4202 cv_broadcast(&ill->ill_cv);
4205 4203 mutex_exit(&ill->ill_lock);
4206 4204 }
4207 4205
4208 4206 /*
4209 4207 * ip_ll_subnet_defaults is called when we get the DL_INFO_ACK back from the
4210 4208 * driver. We construct best guess defaults for lower level information that
4211 4209 * we need. If an interface is brought up without injection of any overriding
4212 4210 * information from outside, we have to be ready to go with these defaults.
4213 4211 * When we get the first DL_INFO_ACK (from ip_open() sending a DL_INFO_REQ)
4214 4212 * we primarily want the dl_provider_style.
4215 4213 * The subsequent DL_INFO_ACK is received after doing a DL_ATTACH and DL_BIND
4216 4214 * at which point we assume the other part of the information is valid.
4217 4215 */
4218 4216 void
4219 4217 ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp)
4220 4218 {
4221 4219 uchar_t *brdcst_addr;
4222 4220 uint_t brdcst_addr_length, phys_addr_length;
4223 4221 t_scalar_t sap_length;
4224 4222 dl_info_ack_t *dlia;
4225 4223 ip_m_t *ipm;
4226 4224 dl_qos_cl_sel1_t *sel1;
4227 4225 int min_mtu;
4228 4226
4229 4227 ASSERT(IAM_WRITER_ILL(ill));
4230 4228
4231 4229 /*
4232 4230 * Till the ill is fully up the ill is not globally visible.
4233 4231 * So no need for a lock.
4234 4232 */
4235 4233 dlia = (dl_info_ack_t *)mp->b_rptr;
4236 4234 ill->ill_mactype = dlia->dl_mac_type;
4237 4235
/* Unknown mactypes fall back to the DL_OTHER media entry. */
4238 4236 ipm = ip_m_lookup(dlia->dl_mac_type);
4239 4237 if (ipm == NULL) {
4240 4238 ipm = ip_m_lookup(DL_OTHER);
4241 4239 ASSERT(ipm != NULL);
4242 4240 }
4243 4241 ill->ill_media = ipm;
4244 4242
4245 4243 /*
4246 4244 * When the new DLPI stuff is ready we'll pull lengths
4247 4245 * from dlia.
4248 4246 */
4249 4247 if (dlia->dl_version == DL_VERSION_2) {
4250 4248 brdcst_addr_length = dlia->dl_brdcst_addr_length;
4251 4249 brdcst_addr = mi_offset_param(mp, dlia->dl_brdcst_addr_offset,
4252 4250 brdcst_addr_length);
4253 4251 if (brdcst_addr == NULL) {
4254 4252 brdcst_addr_length = 0;
4255 4253 }
4256 4254 sap_length = dlia->dl_sap_length;
4257 4255 phys_addr_length = dlia->dl_addr_length - ABS(sap_length);
4258 4256 ip1dbg(("ip: bcast_len %d, sap_len %d, phys_len %d\n",
4259 4257 brdcst_addr_length, sap_length, phys_addr_length));
4260 4258 } else {
/* Pre-DL_VERSION_2 provider: use 6-byte all-ones broadcast defaults. */
4261 4259 brdcst_addr_length = 6;
4262 4260 brdcst_addr = ip_six_byte_all_ones;
4263 4261 sap_length = -2;
4264 4262 phys_addr_length = brdcst_addr_length;
4265 4263 }
4266 4264
4267 4265 ill->ill_bcast_addr_length = brdcst_addr_length;
4268 4266 ill->ill_phys_addr_length = phys_addr_length;
4269 4267 ill->ill_sap_length = sap_length;
4270 4268
4271 4269 /*
4272 4270 * Synthetic DLPI types such as SUNW_DL_IPMP specify a zero SDU,
4273 4271 * but we must ensure a minimum IP MTU is used since other bits of
4274 4272 * IP will fly apart otherwise.
4275 4273 */
4276 4274 min_mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU;
4277 4275 ill->ill_max_frag = MAX(min_mtu, dlia->dl_max_sdu);
4278 4276 ill->ill_current_frag = ill->ill_max_frag;
4279 4277 ill->ill_mtu = ill->ill_max_frag;
4280 4278 ill->ill_mc_mtu = ill->ill_mtu; /* Overridden by DL_NOTE_SDU_SIZE2 */
4281 4279
4282 4280 ill->ill_type = ipm->ip_m_type;
4283 4281
/* First DL_INFO_ACK (from ip_open()): record the provider style. */
4284 4282 if (!ill->ill_dlpi_style_set) {
4285 4283 if (dlia->dl_provider_style == DL_STYLE2)
4286 4284 ill->ill_needs_attach = 1;
4287 4285
4288 4286 phyint_flags_init(ill->ill_phyint, ill->ill_mactype);
4289 4287
4290 4288 /*
4291 4289 * Allocate the first ipif on this ill. We don't delay it
4292 4290 * further as ioctl handling assumes at least one ipif exists.
4293 4291 *
4294 4292 * At this point we don't know whether the ill is v4 or v6.
4295 4293 * We will know this when the SIOCSLIFNAME happens and
4296 4294 * the correct value for ill_isv6 will be assigned in
4297 4295 * ipif_set_values(). We need to hold the ill lock and
4298 4296 * clear the ILL_LL_SUBNET_PENDING flag and atomically do
4299 4297 * the wakeup.
4300 4298 */
4301 4299 (void) ipif_allocate(ill, 0, IRE_LOCAL,
4302 4300 dlia->dl_provider_style != DL_STYLE2, B_TRUE, NULL);
4303 4301 mutex_enter(&ill->ill_lock);
4304 4302 ASSERT(ill->ill_dlpi_style_set == 0);
4305 4303 ill->ill_dlpi_style_set = 1;
4306 4304 ill->ill_state_flags &= ~ILL_LL_SUBNET_PENDING;
4307 4305 cv_broadcast(&ill->ill_cv);
4308 4306 mutex_exit(&ill->ill_lock);
4309 4307 freemsg(mp);
4310 4308 return;
4311 4309 }
4312 4310 ASSERT(ill->ill_ipif != NULL);
4313 4311 /*
4314 4312 * We know whether it is IPv4 or IPv6 now, as this is the
4315 4313 * second DL_INFO_ACK we are receiving in response to the
4316 4314 * DL_INFO_REQ sent in ipif_set_values.
4317 4315 */
4318 4316 ill->ill_sap = (ill->ill_isv6) ? ipm->ip_m_ipv6sap : ipm->ip_m_ipv4sap;
4319 4317 /*
4320 4318 * Clear all the flags that were set based on ill_bcast_addr_length
4321 4319 * and ill_phys_addr_length (in ipif_set_values) as these could have
4322 4320 * changed now and we need to re-evaluate.
4323 4321 */
4324 4322 ill->ill_flags &= ~(ILLF_MULTICAST | ILLF_NONUD | ILLF_NOARP);
4325 4323 ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT);
4326 4324
4327 4325 /*
4328 4326 * Free ill_bcast_mp as things could have changed now.
4329 4327 *
4330 4328 * NOTE: The IPMP meta-interface is special-cased because it starts
4331 4329 * with no underlying interfaces (and thus an unknown broadcast
4332 4330 * address length), but we enforce that an interface is broadcast-
4333 4331 * capable as part of allowing it to join a group.
4334 4332 */
4335 4333 if (ill->ill_bcast_addr_length == 0 && !IS_IPMP(ill)) {
4336 4334 if (ill->ill_bcast_mp != NULL)
4337 4335 freemsg(ill->ill_bcast_mp);
4338 4336 ill->ill_net_type = IRE_IF_NORESOLVER;
4339 4337
4340 4338 ill->ill_bcast_mp = ill_dlur_gen(NULL,
4341 4339 ill->ill_phys_addr_length,
4342 4340 ill->ill_sap,
4343 4341 ill->ill_sap_length);
4344 4342
4345 4343 if (ill->ill_isv6)
4346 4344 /*
4347 4345 * Note: xresolv interfaces will eventually need NOARP
4348 4346 * set here as well, but that will require those
4349 4347 * external resolvers to have some knowledge of
4350 4348 * that flag and act appropriately. Not to be changed
4351 4349 * at present.
4352 4350 */
4353 4351 ill->ill_flags |= ILLF_NONUD;
4354 4352 else
4355 4353 ill->ill_flags |= ILLF_NOARP;
4356 4354
4357 4355 if (ill->ill_mactype == SUNW_DL_VNI) {
4358 4356 ill->ill_ipif->ipif_flags |= IPIF_NOXMIT;
4359 4357 } else if (ill->ill_phys_addr_length == 0 ||
4360 4358 ill->ill_mactype == DL_IPV4 ||
4361 4359 ill->ill_mactype == DL_IPV6) {
4362 4360 /*
4363 4361 * The underlying link is point-to-point, so mark the
4364 4362 * interface as such. We can do IP multicast over
4365 4363 * such a link since it transmits all network-layer
4366 4364 * packets to the remote side the same way.
4367 4365 */
4368 4366 ill->ill_flags |= ILLF_MULTICAST;
4369 4367 ill->ill_ipif->ipif_flags |= IPIF_POINTOPOINT;
4370 4368 }
4371 4369 } else {
4372 4370 ill->ill_net_type = IRE_IF_RESOLVER;
4373 4371 if (ill->ill_bcast_mp != NULL)
4374 4372 freemsg(ill->ill_bcast_mp);
4375 4373 ill->ill_bcast_mp = ill_dlur_gen(brdcst_addr,
4376 4374 ill->ill_bcast_addr_length, ill->ill_sap,
4377 4375 ill->ill_sap_length);
4378 4376 /*
4379 4377 * Later detect lack of DLPI driver multicast
4380 4378 * capability by catching DL_ENABMULTI errors in
4381 4379 * ip_rput_dlpi.
4382 4380 */
4383 4381 ill->ill_flags |= ILLF_MULTICAST;
4384 4382 if (!ill->ill_isv6)
4385 4383 ill->ill_ipif->ipif_flags |= IPIF_BROADCAST;
4386 4384 }
4387 4385
4388 4386 /* For IPMP, PHYI_IPMP should already be set by phyint_flags_init() */
4389 4387 if (ill->ill_mactype == SUNW_DL_IPMP)
4390 4388 ASSERT(ill->ill_phyint->phyint_flags & PHYI_IPMP);
4391 4389
4392 4390 /* By default an interface does not support any CoS marking */
4393 4391 ill->ill_flags &= ~ILLF_COS_ENABLED;
4394 4392
4395 4393 /*
4396 4394 * If we get QoS information in DL_INFO_ACK, the device supports
4397 4395 * some form of CoS marking, set ILLF_COS_ENABLED.
4398 4396 */
4399 4397 sel1 = (dl_qos_cl_sel1_t *)mi_offset_param(mp, dlia->dl_qos_offset,
4400 4398 dlia->dl_qos_length);
4401 4399 if ((sel1 != NULL) && (sel1->dl_qos_type == DL_QOS_CL_SEL1)) {
4402 4400 ill->ill_flags |= ILLF_COS_ENABLED;
4403 4401 }
4404 4402
4405 4403 /* Clear any previous error indication. */
4406 4404 ill->ill_error = 0;
4407 4405 freemsg(mp);
4408 4406 }
4409 4407
4410 4408 /*
4411 4409 * Perform various checks to verify that an address would make sense as a
4412 4410 * local, remote, or subnet interface address.
4413 4411 */
4414 4412 static boolean_t
4415 4413 ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask)
4416 4414 {
4417 4415 ipaddr_t net_mask;
4418 4416
4419 4417 /*
4420 4418 * Don't allow all zeroes, or all ones, but allow
4421 4419 * all ones netmask.
4422 4420 */
4423 4421 if ((net_mask = ip_net_mask(addr)) == 0)
4424 4422 return (B_FALSE);
4425 4423 /* A given netmask overrides the "guess" netmask */
4426 4424 if (subnet_mask != 0)
4427 4425 net_mask = subnet_mask;
4428 4426 if ((net_mask != ~(ipaddr_t)0) && ((addr == (addr & net_mask)) ||
4429 4427 (addr == (addr | ~net_mask)))) {
4430 4428 return (B_FALSE);
4431 4429 }
4432 4430
4433 4431 /*
4434 4432 * Even if the netmask is all ones, we do not allow address to be
4435 4433 * 255.255.255.255
4436 4434 */
4437 4435 if (addr == INADDR_BROADCAST)
4438 4436 return (B_FALSE);
4439 4437
4440 4438 if (CLASSD(addr))
4441 4439 return (B_FALSE);
4442 4440
4443 4441 return (B_TRUE);
4444 4442 }
4445 4443
4446 4444 #define V6_IPIF_LINKLOCAL(p) \
4447 4445 IN6_IS_ADDR_LINKLOCAL(&(p)->ipif_v6lcl_addr)
4448 4446
4449 4447 /*
4450 4448 * Compare two given ipifs and check if the second one is better than
4451 4449 * the first one using the order of preference (not taking deprecated
 * into account) specified in ipif_lookup_multicast().
4453 4451 */
4454 4452 static boolean_t
4455 4453 ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6)
4456 4454 {
4457 4455 /* Check the least preferred first. */
4458 4456 if (IS_LOOPBACK(old_ipif->ipif_ill)) {
4459 4457 /* If both ipifs are the same, use the first one. */
4460 4458 if (IS_LOOPBACK(new_ipif->ipif_ill))
4461 4459 return (B_FALSE);
4462 4460 else
4463 4461 return (B_TRUE);
4464 4462 }
4465 4463
4466 4464 /* For IPv6, check for link local address. */
4467 4465 if (isv6 && V6_IPIF_LINKLOCAL(old_ipif)) {
4468 4466 if (IS_LOOPBACK(new_ipif->ipif_ill) ||
4469 4467 V6_IPIF_LINKLOCAL(new_ipif)) {
4470 4468 /* The second one is equal or less preferred. */
4471 4469 return (B_FALSE);
4472 4470 } else {
4473 4471 return (B_TRUE);
4474 4472 }
4475 4473 }
4476 4474
4477 4475 /* Then check for point to point interface. */
4478 4476 if (old_ipif->ipif_flags & IPIF_POINTOPOINT) {
4479 4477 if (IS_LOOPBACK(new_ipif->ipif_ill) ||
4480 4478 (isv6 && V6_IPIF_LINKLOCAL(new_ipif)) ||
4481 4479 (new_ipif->ipif_flags & IPIF_POINTOPOINT)) {
4482 4480 return (B_FALSE);
4483 4481 } else {
4484 4482 return (B_TRUE);
4485 4483 }
4486 4484 }
4487 4485
4488 4486 /* old_ipif is a normal interface, so no need to use the new one. */
4489 4487 return (B_FALSE);
4490 4488 }
4491 4489
4492 4490 /*
 * Find a multicast-capable ipif given an IP instance and zoneid.
 * The ipif must be up, and its ill must be multicast-capable, not
4495 4493 * condemned, not an underlying interface in an IPMP group, and
4496 4494 * not a VNI interface. Order of preference:
4497 4495 *
4498 4496 * 1a. normal
4499 4497 * 1b. normal, but deprecated
4500 4498 * 2a. point to point
4501 4499 * 2b. point to point, but deprecated
4502 4500 * 3a. link local
4503 4501 * 3b. link local, but deprecated
4504 4502 * 4. loopback.
4505 4503 */
static ipif_t *
ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6)
{
	ill_t	*ill;
	ill_walk_context_t ctx;
	ipif_t	*ipif;
	ipif_t	*saved_ipif = NULL;	/* best non-deprecated candidate */
	ipif_t	*dep_ipif = NULL;	/* best deprecated candidate */

	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	if (isv6)
		ill = ILL_START_WALK_V6(&ctx, ipst);
	else
		ill = ILL_START_WALK_V4(&ctx, ipst);

	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		mutex_enter(&ill->ill_lock);
		/* Skip ills that can never satisfy this lookup. */
		if (IS_VNI(ill) || IS_UNDER_IPMP(ill) ||
		    ILL_IS_CONDEMNED(ill) ||
		    !(ill->ill_flags & ILLF_MULTICAST)) {
			mutex_exit(&ill->ill_lock);
			continue;
		}
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			/* The ipif must be visible from `zoneid'. */
			if (zoneid != ipif->ipif_zoneid &&
			    zoneid != ALL_ZONES &&
			    ipif->ipif_zoneid != ALL_ZONES) {
				continue;
			}
			if (!(ipif->ipif_flags & IPIF_UP) ||
			    IPIF_IS_CONDEMNED(ipif)) {
				continue;
			}

			/*
			 * Found one candidate. If it is deprecated,
			 * remember it in dep_ipif. If it is not deprecated,
			 * remember it in saved_ipif.
			 */
			if (ipif->ipif_flags & IPIF_DEPRECATED) {
				if (dep_ipif == NULL) {
					dep_ipif = ipif;
				} else if (ipif_comp_multi(dep_ipif, ipif,
				    isv6)) {
					/*
					 * If the previous dep_ipif does not
					 * belong to the same ill, we've done
					 * a ipif_refhold() on it. So we need
					 * to release it.
					 */
					if (dep_ipif->ipif_ill != ill)
						ipif_refrele(dep_ipif);
					dep_ipif = ipif;
				}
				continue;
			}
			if (saved_ipif == NULL) {
				saved_ipif = ipif;
			} else {
				if (ipif_comp_multi(saved_ipif, ipif, isv6)) {
					if (saved_ipif->ipif_ill != ill)
						ipif_refrele(saved_ipif);
					saved_ipif = ipif;
				}
			}
		}
		/*
		 * Before going to the next ill, do a ipif_refhold() on the
		 * saved ones.  (While we stay on the same ill, ill_lock
		 * keeps the candidates alive without a hold.)
		 */
		if (saved_ipif != NULL && saved_ipif->ipif_ill == ill)
			ipif_refhold_locked(saved_ipif);
		if (dep_ipif != NULL && dep_ipif->ipif_ill == ill)
			ipif_refhold_locked(dep_ipif);
		mutex_exit(&ill->ill_lock);
	}
	rw_exit(&ipst->ips_ill_g_lock);

	/*
	 * If we have only the saved_ipif, return it. But if we have both
	 * saved_ipif and dep_ipif, check to see which one is better.
	 * Exactly one reference is returned held; the loser is released.
	 */
	if (saved_ipif != NULL) {
		if (dep_ipif != NULL) {
			if (ipif_comp_multi(saved_ipif, dep_ipif, isv6)) {
				ipif_refrele(saved_ipif);
				return (dep_ipif);
			} else {
				ipif_refrele(dep_ipif);
				return (saved_ipif);
			}
		}
		return (saved_ipif);
	} else {
		return (dep_ipif);
	}
}
4604 4602
4605 4603 ill_t *
4606 4604 ill_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6)
4607 4605 {
4608 4606 ipif_t *ipif;
4609 4607 ill_t *ill;
4610 4608
4611 4609 ipif = ipif_lookup_multicast(ipst, zoneid, isv6);
4612 4610 if (ipif == NULL)
4613 4611 return (NULL);
4614 4612
4615 4613 ill = ipif->ipif_ill;
4616 4614 ill_refhold(ill);
4617 4615 ipif_refrele(ipif);
4618 4616 return (ill);
4619 4617 }
4620 4618
4621 4619 /*
4622 4620 * This function is called when an application does not specify an interface
4623 4621 * to be used for multicast traffic (joining a group/sending data). It
4624 4622 * calls ire_lookup_multi() to look for an interface route for the
4625 4623 * specified multicast group. Doing this allows the administrator to add
4626 4624 * prefix routes for multicast to indicate which interface to be used for
4627 4625 * multicast traffic in the above scenario. The route could be for all
4628 4626 * multicast (224.0/4), for a single multicast group (a /32 route) or
4629 4627 * anything in between. If there is no such multicast route, we just find
4630 4628 * any multicast capable interface and return it. The returned ipif
4631 4629 * is refhold'ed.
4632 4630 *
4633 4631 * We support MULTIRT and RTF_SETSRC on the multicast routes added to the
4634 4632 * unicast table. This is used by CGTP.
4635 4633 */
4636 4634 ill_t *
4637 4635 ill_lookup_group_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst,
4638 4636 boolean_t *multirtp, ipaddr_t *setsrcp)
4639 4637 {
4640 4638 ill_t *ill;
4641 4639
4642 4640 ill = ire_lookup_multi_ill_v4(group, zoneid, ipst, multirtp, setsrcp);
4643 4641 if (ill != NULL)
4644 4642 return (ill);
4645 4643
4646 4644 return (ill_lookup_multicast(ipst, zoneid, B_FALSE));
4647 4645 }
4648 4646
4649 4647 /*
4650 4648 * Look for an ipif with the specified interface address and destination.
4651 4649 * The destination address is used only for matching point-to-point interfaces.
4652 4650 */
ipif_t *
ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, ip_stack_t *ipst)
{
	ipif_t	*ipif;
	ill_t	*ill;
	ill_walk_context_t ctx;

	/*
	 * First match all the point-to-point interfaces
	 * before looking at non-point-to-point interfaces.
	 * This is done to avoid returning non-point-to-point
	 * ipif instead of unnumbered point-to-point ipif.
	 */
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V4(&ctx, ipst);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		mutex_enter(&ill->ill_lock);
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			/* Allow the ipif to be down */
			if ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
			    (ipif->ipif_lcl_addr == if_addr) &&
			    (ipif->ipif_pp_dst_addr == dst)) {
				if (!IPIF_IS_CONDEMNED(ipif)) {
					/* Returned held; caller refreles. */
					ipif_refhold_locked(ipif);
					mutex_exit(&ill->ill_lock);
					rw_exit(&ipst->ips_ill_g_lock);
					return (ipif);
				}
			}
		}
		mutex_exit(&ill->ill_lock);
	}
	rw_exit(&ipst->ips_ill_g_lock);

	/* lookup the ipif based on interface address */
	ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, ipst);
	ASSERT(ipif == NULL || !ipif->ipif_isv6);
	return (ipif);
}
4693 4691
4694 4692 /*
4695 4693 * Common function for ipif_lookup_addr() and ipif_lookup_addr_exact().
4696 4694 */
static ipif_t *
ipif_lookup_addr_common(ipaddr_t addr, ill_t *match_ill, uint32_t match_flags,
    zoneid_t zoneid, ip_stack_t *ipst)
{
	ipif_t	*ipif;
	ill_t	*ill;
	boolean_t ptp = B_FALSE;
	ill_walk_context_t ctx;
	boolean_t match_illgrp = (match_flags & IPIF_MATCH_ILLGRP);
	boolean_t no_duplicate = (match_flags & IPIF_MATCH_NONDUP);

	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	/*
	 * Repeat twice, first based on local addresses and
	 * next time for pointopoint.
	 */
repeat:
	ill = ILL_START_WALK_V4(&ctx, ipst);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		/* Restrict to match_ill (or its illgrp) when requested. */
		if (match_ill != NULL && ill != match_ill &&
		    (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) {
			continue;
		}
		mutex_enter(&ill->ill_lock);
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			/* The ipif must be visible from `zoneid'. */
			if (zoneid != ALL_ZONES &&
			    zoneid != ipif->ipif_zoneid &&
			    ipif->ipif_zoneid != ALL_ZONES)
				continue;

			/* IPIF_MATCH_NONDUP callers only want usable ipifs */
			if (no_duplicate && !(ipif->ipif_flags & IPIF_UP))
				continue;

			/* Allow the ipif to be down */
			if ((!ptp && (ipif->ipif_lcl_addr == addr) &&
			    ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
			    (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) &&
			    (ipif->ipif_pp_dst_addr == addr))) {
				if (!IPIF_IS_CONDEMNED(ipif)) {
					/* Returned held; caller refreles. */
					ipif_refhold_locked(ipif);
					mutex_exit(&ill->ill_lock);
					rw_exit(&ipst->ips_ill_g_lock);
					return (ipif);
				}
			}
		}
		mutex_exit(&ill->ill_lock);
	}

	/* If we already did the ptp case, then we are done */
	if (ptp) {
		rw_exit(&ipst->ips_ill_g_lock);
		return (NULL);
	}
	ptp = B_TRUE;
	goto repeat;
}
4755 4753
4756 4754 /*
4757 4755 * Lookup an ipif with the specified address. For point-to-point links we
4758 4756 * look for matches on either the destination address or the local address,
4759 4757 * but we skip the local address check if IPIF_UNNUMBERED is set. If the
4760 4758 * `match_ill' argument is non-NULL, the lookup is restricted to that ill
4761 4759 * (or illgrp if `match_ill' is in an IPMP group).
4762 4760 */
4763 4761 ipif_t *
4764 4762 ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid,
4765 4763 ip_stack_t *ipst)
4766 4764 {
4767 4765 return (ipif_lookup_addr_common(addr, match_ill, IPIF_MATCH_ILLGRP,
4768 4766 zoneid, ipst));
4769 4767 }
4770 4768
4771 4769 /*
4772 4770 * Lookup an ipif with the specified address. Similar to ipif_lookup_addr,
4773 4771 * except that we will only return an address if it is not marked as
4774 4772 * IPIF_DUPLICATE
4775 4773 */
4776 4774 ipif_t *
4777 4775 ipif_lookup_addr_nondup(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid,
4778 4776 ip_stack_t *ipst)
4779 4777 {
4780 4778 return (ipif_lookup_addr_common(addr, match_ill,
4781 4779 (IPIF_MATCH_ILLGRP | IPIF_MATCH_NONDUP),
4782 4780 zoneid, ipst));
4783 4781 }
4784 4782
4785 4783 /*
4786 4784 * Special abbreviated version of ipif_lookup_addr() that doesn't match
4787 4785 * `match_ill' across the IPMP group. This function is only needed in some
4788 4786 * corner-cases; almost everything should use ipif_lookup_addr().
4789 4787 */
4790 4788 ipif_t *
4791 4789 ipif_lookup_addr_exact(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst)
4792 4790 {
4793 4791 ASSERT(match_ill != NULL);
4794 4792 return (ipif_lookup_addr_common(addr, match_ill, 0, ALL_ZONES,
4795 4793 ipst));
4796 4794 }
4797 4795
4798 4796 /*
 * Look for an ipif with the specified address. For point-to-point links
 * we look for matches on either the destination address or the local
4801 4799 * address, but we ignore the check on the local address if IPIF_UNNUMBERED
4802 4800 * is set.
4803 4801 * If the `match_ill' argument is non-NULL, the lookup is restricted to that
4804 4802 * ill (or illgrp if `match_ill' is in an IPMP group).
4805 4803 * Return the zoneid for the ipif which matches. ALL_ZONES if no match.
4806 4804 */
4807 4805 zoneid_t
4808 4806 ipif_lookup_addr_zoneid(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst)
4809 4807 {
4810 4808 zoneid_t zoneid;
4811 4809 ipif_t *ipif;
4812 4810 ill_t *ill;
4813 4811 boolean_t ptp = B_FALSE;
4814 4812 ill_walk_context_t ctx;
4815 4813
4816 4814 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4817 4815 /*
4818 4816 * Repeat twice, first based on local addresses and
4819 4817 * next time for pointopoint.
4820 4818 */
4821 4819 repeat:
4822 4820 ill = ILL_START_WALK_V4(&ctx, ipst);
4823 4821 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
4824 4822 if (match_ill != NULL && ill != match_ill &&
4825 4823 !IS_IN_SAME_ILLGRP(ill, match_ill)) {
4826 4824 continue;
4827 4825 }
4828 4826 mutex_enter(&ill->ill_lock);
4829 4827 for (ipif = ill->ill_ipif; ipif != NULL;
4830 4828 ipif = ipif->ipif_next) {
4831 4829 /* Allow the ipif to be down */
4832 4830 if ((!ptp && (ipif->ipif_lcl_addr == addr) &&
4833 4831 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
4834 4832 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) &&
4835 4833 (ipif->ipif_pp_dst_addr == addr)) &&
4836 4834 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
4837 4835 zoneid = ipif->ipif_zoneid;
4838 4836 mutex_exit(&ill->ill_lock);
4839 4837 rw_exit(&ipst->ips_ill_g_lock);
4840 4838 /*
4841 4839 * If ipif_zoneid was ALL_ZONES then we have
4842 4840 * a trusted extensions shared IP address.
4843 4841 * In that case GLOBAL_ZONEID works to send.
4844 4842 */
4845 4843 if (zoneid == ALL_ZONES)
4846 4844 zoneid = GLOBAL_ZONEID;
4847 4845 return (zoneid);
4848 4846 }
4849 4847 }
4850 4848 mutex_exit(&ill->ill_lock);
4851 4849 }
4852 4850
4853 4851 /* If we already did the ptp case, then we are done */
4854 4852 if (ptp) {
4855 4853 rw_exit(&ipst->ips_ill_g_lock);
4856 4854 return (ALL_ZONES);
4857 4855 }
4858 4856 ptp = B_TRUE;
4859 4857 goto repeat;
4860 4858 }
4861 4859
4862 4860 /*
4863 4861 * Look for an ipif that matches the specified remote address i.e. the
4864 4862 * ipif that would receive the specified packet.
4865 4863 * First look for directly connected interfaces and then do a recursive
4866 4864 * IRE lookup and pick the first ipif corresponding to the source address in the
4867 4865 * ire.
4868 4866 * Returns: held ipif
4869 4867 *
4870 4868 * This is only used for ICMP_ADDRESS_MASK_REQUESTs
4871 4869 */
ipif_t *
ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid)
{
	ipif_t	*ipif;

	ASSERT(!ill->ill_isv6);

	/*
	 * Someone could be changing this ipif currently or change it
	 * after we return this. Thus a few packets could use the old
	 * values. However structure updates/creates (ire, ilg, ilm etc)
	 * will atomically be updated or cleaned up with the new value
	 * Thus we don't need a lock to check the flags or other attrs below.
	 */
	mutex_enter(&ill->ill_lock);
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if (IPIF_IS_CONDEMNED(ipif))
			continue;
		/* The ipif must be visible from `zoneid'. */
		if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid &&
		    ipif->ipif_zoneid != ALL_ZONES)
			continue;
		/* Allow the ipif to be down */
		if (ipif->ipif_flags & IPIF_POINTOPOINT) {
			/* Match the remote (or unnumbered local) address. */
			if ((ipif->ipif_pp_dst_addr == addr) ||
			    (!(ipif->ipif_flags & IPIF_UNNUMBERED) &&
			    ipif->ipif_lcl_addr == addr)) {
				ipif_refhold_locked(ipif);
				mutex_exit(&ill->ill_lock);
				return (ipif);
			}
		} else if (ipif->ipif_subnet == (addr & ipif->ipif_net_mask)) {
			/* `addr' is directly connected via this ipif. */
			ipif_refhold_locked(ipif);
			mutex_exit(&ill->ill_lock);
			return (ipif);
		}
	}
	mutex_exit(&ill->ill_lock);
	/*
	 * For a remote destination it isn't possible to nail down a particular
	 * ipif.
	 */

	/* Pick the first interface */
	ipif = ipif_get_next_ipif(NULL, ill);
	return (ipif);
}
4918 4916
4919 4917 /*
4920 4918 * This func does not prevent refcnt from increasing. But if
4921 4919 * the caller has taken steps to that effect, then this func
4922 4920 * can be used to determine whether the ill has become quiescent
4923 4921 */
4924 4922 static boolean_t
4925 4923 ill_is_quiescent(ill_t *ill)
4926 4924 {
4927 4925 ipif_t *ipif;
4928 4926
4929 4927 ASSERT(MUTEX_HELD(&ill->ill_lock));
4930 4928
4931 4929 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
4932 4930 if (ipif->ipif_refcnt != 0)
4933 4931 return (B_FALSE);
4934 4932 }
4935 4933 if (!ILL_DOWN_OK(ill) || ill->ill_refcnt != 0) {
4936 4934 return (B_FALSE);
4937 4935 }
4938 4936 return (B_TRUE);
4939 4937 }
4940 4938
4941 4939 boolean_t
4942 4940 ill_is_freeable(ill_t *ill)
4943 4941 {
4944 4942 ipif_t *ipif;
4945 4943
4946 4944 ASSERT(MUTEX_HELD(&ill->ill_lock));
4947 4945
4948 4946 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
4949 4947 if (ipif->ipif_refcnt != 0) {
4950 4948 return (B_FALSE);
4951 4949 }
4952 4950 }
4953 4951 if (!ILL_FREE_OK(ill) || ill->ill_refcnt != 0) {
4954 4952 return (B_FALSE);
4955 4953 }
4956 4954 return (B_TRUE);
4957 4955 }
4958 4956
4959 4957 /*
4960 4958 * This func does not prevent refcnt from increasing. But if
4961 4959 * the caller has taken steps to that effect, then this func
4962 4960 * can be used to determine whether the ipif has become quiescent
4963 4961 */
4964 4962 static boolean_t
4965 4963 ipif_is_quiescent(ipif_t *ipif)
4966 4964 {
4967 4965 ill_t *ill;
4968 4966
4969 4967 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
4970 4968
4971 4969 if (ipif->ipif_refcnt != 0)
4972 4970 return (B_FALSE);
4973 4971
4974 4972 ill = ipif->ipif_ill;
4975 4973 if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 ||
4976 4974 ill->ill_logical_down) {
4977 4975 return (B_TRUE);
4978 4976 }
4979 4977
4980 4978 /* This is the last ipif going down or being deleted on this ill */
4981 4979 if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0) {
4982 4980 return (B_FALSE);
4983 4981 }
4984 4982
4985 4983 return (B_TRUE);
4986 4984 }
4987 4985
4988 4986 /*
4989 4987 * return true if the ipif can be destroyed: the ipif has to be quiescent
4990 4988 * with zero references from ire/ilm to it.
4991 4989 */
static boolean_t
ipif_is_freeable(ipif_t *ipif)
{
	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
	/* ipif 0 is never freed; it goes away only with its ill. */
	ASSERT(ipif->ipif_id != 0);
	return (ipif->ipif_refcnt == 0);
}
4999 4997
5000 4998 /*
5001 4999 * The ipif/ill/ire has been refreled. Do the tail processing.
5002 5000 * Determine if the ipif or ill in question has become quiescent and if so
5003 5001 * wakeup close and/or restart any queued pending ioctl that is waiting
5004 5002 * for the ipif_down (or ill_down)
5005 5003 */
void
ipif_ill_refrele_tail(ill_t *ill)
{
	mblk_t	*mp;
	conn_t	*connp;
	ipsq_t	*ipsq;
	ipxop_t	*ipx;
	ipif_t	*ipif;
	dl_notify_ind_t *dlindp;

	ASSERT(MUTEX_HELD(&ill->ill_lock));

	if ((ill->ill_state_flags & ILL_CONDEMNED) && ill_is_freeable(ill)) {
		/* ip_modclose() may be waiting */
		cv_broadcast(&ill->ill_cv);
	}

	ipsq = ill->ill_phyint->phyint_ipsq;
	mutex_enter(&ipsq->ipsq_lock);
	ipx = ipsq->ipsq_xop;
	mutex_enter(&ipx->ipx_lock);
	if (ipx->ipx_waitfor == 0)	/* no one's waiting; bail */
		goto unlock;

	ASSERT(ipx->ipx_pending_mp != NULL && ipx->ipx_pending_ipif != NULL);

	ipif = ipx->ipx_pending_ipif;
	if (ipif->ipif_ill != ill)	/* wait is for another ill; bail */
		goto unlock;

	/* Has the condition being waited for been reached yet? */
	switch (ipx->ipx_waitfor) {
	case IPIF_DOWN:
		if (!ipif_is_quiescent(ipif))
			goto unlock;
		break;
	case IPIF_FREE:
		if (!ipif_is_freeable(ipif))
			goto unlock;
		break;
	case ILL_DOWN:
		if (!ill_is_quiescent(ill))
			goto unlock;
		break;
	case ILL_FREE:
		/*
		 * ILL_FREE is only for loopback; normal ill teardown waits
		 * synchronously in ip_modclose() without using ipx_waitfor,
		 * handled by the cv_broadcast() at the top of this function.
		 */
		if (!ill_is_freeable(ill))
			goto unlock;
		break;
	default:
		cmn_err(CE_PANIC, "ipsq: %p unknown ipx_waitfor %d\n",
		    (void *)ipsq, ipx->ipx_waitfor);
	}

	ill_refhold_locked(ill);	/* for qwriter_ip() call below */
	mutex_exit(&ipx->ipx_lock);
	mp = ipsq_pending_mp_get(ipsq, &connp);
	mutex_exit(&ipsq->ipsq_lock);
	mutex_exit(&ill->ill_lock);

	ASSERT(mp != NULL);
	/*
	 * NOTE: all of the qwriter_ip() calls below use CUR_OP since
	 * we can only get here when the current operation decides it
	 * needs to quiesce via ipsq_pending_mp_add().
	 */
	switch (mp->b_datap->db_type) {
	case M_PCPROTO:
	case M_PROTO:
		/*
		 * For now, only DL_NOTIFY_IND messages can use this facility.
		 */
		dlindp = (dl_notify_ind_t *)mp->b_rptr;
		ASSERT(dlindp->dl_primitive == DL_NOTIFY_IND);

		switch (dlindp->dl_notification) {
		case DL_NOTE_PHYS_ADDR:
			qwriter_ip(ill, ill->ill_rq, mp,
			    ill_set_phys_addr_tail, CUR_OP, B_TRUE);
			return;
		case DL_NOTE_REPLUMB:
			qwriter_ip(ill, ill->ill_rq, mp,
			    ill_replumb_tail, CUR_OP, B_TRUE);
			return;
		default:
			ASSERT(0);
			ill_refrele(ill);
		}
		break;

	case M_ERROR:
	case M_HANGUP:
		qwriter_ip(ill, ill->ill_rq, mp, ipif_all_down_tail, CUR_OP,
		    B_TRUE);
		return;

	case M_IOCTL:
	case M_IOCDATA:
		qwriter_ip(ill, (connp != NULL ? CONNP_TO_WQ(connp) :
		    ill->ill_wq), mp, ip_reprocess_ioctl, CUR_OP, B_TRUE);
		return;

	default:
		cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p "
		    "db_type %d\n", (void *)mp, mp->b_datap->db_type);
	}
	return;
unlock:
	mutex_exit(&ipsq->ipsq_lock);
	mutex_exit(&ipx->ipx_lock);
	mutex_exit(&ill->ill_lock);
}
5121 5119
5122 5120 #ifdef DEBUG
5123 5121 /* Reuse trace buffer from beginning (if reached the end) and record trace */
5124 5122 static void
5125 5123 th_trace_rrecord(th_trace_t *th_trace)
5126 5124 {
5127 5125 tr_buf_t *tr_buf;
5128 5126 uint_t lastref;
5129 5127
5130 5128 lastref = th_trace->th_trace_lastref;
5131 5129 lastref++;
5132 5130 if (lastref == TR_BUF_MAX)
5133 5131 lastref = 0;
5134 5132 th_trace->th_trace_lastref = lastref;
5135 5133 tr_buf = &th_trace->th_trbuf[lastref];
5136 5134 tr_buf->tr_time = ddi_get_lbolt();
5137 5135 tr_buf->tr_depth = getpcstack(tr_buf->tr_stack, TR_STACK_DEPTH);
5138 5136 }
5139 5137
5140 5138 static void
5141 5139 th_trace_free(void *value)
5142 5140 {
5143 5141 th_trace_t *th_trace = value;
5144 5142
5145 5143 ASSERT(th_trace->th_refcnt == 0);
5146 5144 kmem_free(th_trace, sizeof (*th_trace));
5147 5145 }
5148 5146
5149 5147 /*
5150 5148 * Find or create the per-thread hash table used to track object references.
5151 5149 * The ipst argument is NULL if we shouldn't allocate.
5152 5150 *
5153 5151 * Accesses per-thread data, so there's no need to lock here.
5154 5152 */
static mod_hash_t *
th_trace_gethash(ip_stack_t *ipst)
{
	th_hash_t *thh;

	/* Allocate lazily on first use by this thread (if ipst != NULL). */
	if ((thh = tsd_get(ip_thread_data)) == NULL && ipst != NULL) {
		mod_hash_t *mh;
		char name[256];
		size_t objsize, rshift;
		int retv;

		if ((thh = kmem_alloc(sizeof (*thh), KM_NOSLEEP)) == NULL)
			return (NULL);
		(void) snprintf(name, sizeof (name), "th_trace_%p",
		    (void *)curthread);

		/*
		 * We use mod_hash_create_extended here rather than the more
		 * obvious mod_hash_create_ptrhash because the latter has a
		 * hard-coded KM_SLEEP, and we'd prefer to fail rather than
		 * block.
		 */
		objsize = MAX(MAX(sizeof (ill_t), sizeof (ipif_t)),
		    MAX(sizeof (ire_t), sizeof (ncec_t)));
		/* Shift off the low pointer bits common to all traced types */
		rshift = highbit(objsize);
		mh = mod_hash_create_extended(name, 64, mod_hash_null_keydtor,
		    th_trace_free, mod_hash_byptr, (void *)rshift,
		    mod_hash_ptrkey_cmp, KM_NOSLEEP);
		if (mh == NULL) {
			kmem_free(thh, sizeof (*thh));
			return (NULL);
		}
		thh->thh_hash = mh;
		thh->thh_ipst = ipst;
		/*
		 * We trace ills, ipifs, ires, and nces. All of these are
		 * per-IP-stack, so the lock on the thread list is as well.
		 */
		rw_enter(&ip_thread_rwlock, RW_WRITER);
		list_insert_tail(&ip_thread_list, thh);
		rw_exit(&ip_thread_rwlock);
		retv = tsd_set(ip_thread_data, thh);
		ASSERT(retv == 0);
	}
	return (thh != NULL ? thh->thh_hash : NULL);
}
5201 5199
5202 5200 boolean_t
5203 5201 th_trace_ref(const void *obj, ip_stack_t *ipst)
5204 5202 {
5205 5203 th_trace_t *th_trace;
5206 5204 mod_hash_t *mh;
5207 5205 mod_hash_val_t val;
5208 5206
5209 5207 if ((mh = th_trace_gethash(ipst)) == NULL)
5210 5208 return (B_FALSE);
5211 5209
5212 5210 /*
5213 5211 * Attempt to locate the trace buffer for this obj and thread.
5214 5212 * If it does not exist, then allocate a new trace buffer and
5215 5213 * insert into the hash.
5216 5214 */
5217 5215 if (mod_hash_find(mh, (mod_hash_key_t)obj, &val) == MH_ERR_NOTFOUND) {
5218 5216 th_trace = kmem_zalloc(sizeof (th_trace_t), KM_NOSLEEP);
5219 5217 if (th_trace == NULL)
5220 5218 return (B_FALSE);
5221 5219
5222 5220 th_trace->th_id = curthread;
5223 5221 if (mod_hash_insert(mh, (mod_hash_key_t)obj,
5224 5222 (mod_hash_val_t)th_trace) != 0) {
5225 5223 kmem_free(th_trace, sizeof (th_trace_t));
5226 5224 return (B_FALSE);
5227 5225 }
5228 5226 } else {
5229 5227 th_trace = (th_trace_t *)val;
5230 5228 }
5231 5229
5232 5230 ASSERT(th_trace->th_refcnt >= 0 &&
5233 5231 th_trace->th_refcnt < TR_BUF_MAX - 1);
5234 5232
5235 5233 th_trace->th_refcnt++;
5236 5234 th_trace_rrecord(th_trace);
5237 5235 return (B_TRUE);
5238 5236 }
5239 5237
5240 5238 /*
5241 5239 * For the purpose of tracing a reference release, we assume that global
5242 5240 * tracing is always on and that the same thread initiated the reference hold
5243 5241 * is releasing.
5244 5242 */
5245 5243 void
5246 5244 th_trace_unref(const void *obj)
5247 5245 {
5248 5246 int retv;
5249 5247 mod_hash_t *mh;
5250 5248 th_trace_t *th_trace;
5251 5249 mod_hash_val_t val;
5252 5250
5253 5251 mh = th_trace_gethash(NULL);
5254 5252 retv = mod_hash_find(mh, (mod_hash_key_t)obj, &val);
5255 5253 ASSERT(retv == 0);
5256 5254 th_trace = (th_trace_t *)val;
5257 5255
5258 5256 ASSERT(th_trace->th_refcnt > 0);
5259 5257 th_trace->th_refcnt--;
5260 5258 th_trace_rrecord(th_trace);
5261 5259 }
5262 5260
5263 5261 /*
5264 5262 * If tracing has been disabled, then we assume that the reference counts are
5265 5263 * now useless, and we clear them out before destroying the entries.
5266 5264 */
5267 5265 void
5268 5266 th_trace_cleanup(const void *obj, boolean_t trace_disable)
5269 5267 {
5270 5268 th_hash_t *thh;
5271 5269 mod_hash_t *mh;
5272 5270 mod_hash_val_t val;
5273 5271 th_trace_t *th_trace;
5274 5272 int retv;
5275 5273
5276 5274 rw_enter(&ip_thread_rwlock, RW_READER);
5277 5275 for (thh = list_head(&ip_thread_list); thh != NULL;
5278 5276 thh = list_next(&ip_thread_list, thh)) {
5279 5277 if (mod_hash_find(mh = thh->thh_hash, (mod_hash_key_t)obj,
5280 5278 &val) == 0) {
5281 5279 th_trace = (th_trace_t *)val;
5282 5280 if (trace_disable)
5283 5281 th_trace->th_refcnt = 0;
5284 5282 retv = mod_hash_destroy(mh, (mod_hash_key_t)obj);
5285 5283 ASSERT(retv == 0);
5286 5284 }
5287 5285 }
5288 5286 rw_exit(&ip_thread_rwlock);
5289 5287 }
5290 5288
5291 5289 void
5292 5290 ipif_trace_ref(ipif_t *ipif)
5293 5291 {
5294 5292 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
5295 5293
5296 5294 if (ipif->ipif_trace_disable)
5297 5295 return;
5298 5296
5299 5297 if (!th_trace_ref(ipif, ipif->ipif_ill->ill_ipst)) {
5300 5298 ipif->ipif_trace_disable = B_TRUE;
5301 5299 ipif_trace_cleanup(ipif);
5302 5300 }
5303 5301 }
5304 5302
5305 5303 void
5306 5304 ipif_untrace_ref(ipif_t *ipif)
5307 5305 {
5308 5306 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
5309 5307
5310 5308 if (!ipif->ipif_trace_disable)
5311 5309 th_trace_unref(ipif);
5312 5310 }
5313 5311
5314 5312 void
5315 5313 ill_trace_ref(ill_t *ill)
5316 5314 {
5317 5315 ASSERT(MUTEX_HELD(&ill->ill_lock));
5318 5316
5319 5317 if (ill->ill_trace_disable)
5320 5318 return;
5321 5319
5322 5320 if (!th_trace_ref(ill, ill->ill_ipst)) {
5323 5321 ill->ill_trace_disable = B_TRUE;
5324 5322 ill_trace_cleanup(ill);
5325 5323 }
5326 5324 }
5327 5325
5328 5326 void
5329 5327 ill_untrace_ref(ill_t *ill)
5330 5328 {
5331 5329 ASSERT(MUTEX_HELD(&ill->ill_lock));
5332 5330
5333 5331 if (!ill->ill_trace_disable)
5334 5332 th_trace_unref(ill);
5335 5333 }
5336 5334
5337 5335 /*
5338 5336 * Called when ipif is unplumbed or when memory alloc fails. Note that on
5339 5337 * failure, ipif_trace_disable is set.
5340 5338 */
static void
ipif_trace_cleanup(const ipif_t *ipif)
{
	/* Drop all per-thread trace state recorded against this ipif. */
	th_trace_cleanup(ipif, ipif->ipif_trace_disable);
}
5346 5344
/*
 * Called when ill is unplumbed or when memory alloc fails. Note that on
 * failure, ill_trace_disable is set.  Discards all reference-trace
 * records held for this ill; when tracing was disabled due to an
 * allocation failure, the per-thread refcounts are zeroed as well.
 */
static void
ill_trace_cleanup(const ill_t *ill)
{
	th_trace_cleanup(ill, ill->ill_trace_disable);
}
5356 5354 #endif /* DEBUG */
5357 5355
/*
 * Place a reference on 'ipif'.  The caller must already hold the owning
 * ill's ill_lock, which serializes updates to ipif_refcnt.
 */
void
ipif_refhold_locked(ipif_t *ipif)
{
	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
	ipif->ipif_refcnt++;
	IPIF_TRACE_REF(ipif);	/* record the hold when DEBUG tracing is on */
}
5365 5363
5366 5364 void
5367 5365 ipif_refhold(ipif_t *ipif)
5368 5366 {
5369 5367 ill_t *ill;
5370 5368
5371 5369 ill = ipif->ipif_ill;
5372 5370 mutex_enter(&ill->ill_lock);
5373 5371 ipif->ipif_refcnt++;
5374 5372 IPIF_TRACE_REF(ipif);
5375 5373 mutex_exit(&ill->ill_lock);
5376 5374 }
5377 5375
/*
 * Must not be called while holding any locks. Otherwise if this is
 * the last reference to be released there is a chance of recursive mutex
 * panic due to ipif_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
 * to restart an ioctl.
 */
void
ipif_refrele(ipif_t *ipif)
{
	ill_t	*ill;

	ill = ipif->ipif_ill;

	mutex_enter(&ill->ill_lock);
	ASSERT(ipif->ipif_refcnt != 0);
	ipif->ipif_refcnt--;
	IPIF_UNTRACE_REF(ipif);
	/* Other references remain; nothing further to do. */
	if (ipif->ipif_refcnt != 0) {
		mutex_exit(&ill->ill_lock);
		return;
	}

	/*
	 * Last reference on this ipif: hand off to the common tail
	 * routine, which takes ownership of (and drops) the ill_lock.
	 */
	/* Drops the ill_lock */
	ipif_ill_refrele_tail(ill);
}
5403 5401
5404 5402 ipif_t *
5405 5403 ipif_get_next_ipif(ipif_t *curr, ill_t *ill)
5406 5404 {
5407 5405 ipif_t *ipif;
5408 5406
5409 5407 mutex_enter(&ill->ill_lock);
5410 5408 for (ipif = (curr == NULL ? ill->ill_ipif : curr->ipif_next);
5411 5409 ipif != NULL; ipif = ipif->ipif_next) {
5412 5410 if (IPIF_IS_CONDEMNED(ipif))
5413 5411 continue;
5414 5412 ipif_refhold_locked(ipif);
5415 5413 mutex_exit(&ill->ill_lock);
5416 5414 return (ipif);
5417 5415 }
5418 5416 mutex_exit(&ill->ill_lock);
5419 5417 return (NULL);
5420 5418 }
5421 5419
5422 5420 /*
5423 5421 * TODO: make this table extendible at run time
5424 5422 * Return a pointer to the mac type info for 'mac_type'
5425 5423 */
5426 5424 static ip_m_t *
5427 5425 ip_m_lookup(t_uscalar_t mac_type)
5428 5426 {
5429 5427 ip_m_t *ipm;
5430 5428
5431 5429 for (ipm = ip_m_tbl; ipm < A_END(ip_m_tbl); ipm++)
5432 5430 if (ipm->ip_m_mac_type == mac_type)
5433 5431 return (ipm);
5434 5432 return (NULL);
5435 5433 }
5436 5434
5437 5435 /*
5438 5436 * Make a link layer address from the multicast IP address *addr.
5439 5437 * To form the link layer address, invoke the ip_m_v*mapping function
5440 5438 * associated with the link-layer type.
5441 5439 */
5442 5440 void
5443 5441 ip_mcast_mapping(ill_t *ill, uchar_t *addr, uchar_t *hwaddr)
5444 5442 {
5445 5443 ip_m_t *ipm;
5446 5444
5447 5445 if (ill->ill_net_type == IRE_IF_NORESOLVER)
5448 5446 return;
5449 5447
5450 5448 ASSERT(addr != NULL);
5451 5449
5452 5450 ipm = ip_m_lookup(ill->ill_mactype);
5453 5451 if (ipm == NULL ||
5454 5452 (ill->ill_isv6 && ipm->ip_m_v6mapping == NULL) ||
5455 5453 (!ill->ill_isv6 && ipm->ip_m_v4mapping == NULL)) {
5456 5454 ip0dbg(("no mapping for ill %s mactype 0x%x\n",
5457 5455 ill->ill_name, ill->ill_mactype));
5458 5456 return;
5459 5457 }
5460 5458 if (ill->ill_isv6)
5461 5459 (*ipm->ip_m_v6mapping)(ill, addr, hwaddr);
5462 5460 else
5463 5461 (*ipm->ip_m_v4mapping)(ill, addr, hwaddr);
5464 5462 }
5465 5463
5466 5464 /*
5467 5465 * Returns B_FALSE if the IPv4 netmask pointed by `mask' is non-contiguous.
5468 5466 * Otherwise returns B_TRUE.
5469 5467 *
5470 5468 * The netmask can be verified to be contiguous with 32 shifts and or
5471 5469 * operations. Take the contiguous mask (in host byte order) and compute
5472 5470 * mask | mask << 1 | mask << 2 | ... | mask << 31
5473 5471 * the result will be the same as the 'mask' for contiguous mask.
5474 5472 */
5475 5473 static boolean_t
5476 5474 ip_contiguous_mask(uint32_t mask)
5477 5475 {
5478 5476 uint32_t m = mask;
5479 5477 int i;
5480 5478
5481 5479 for (i = 1; i < 32; i++)
5482 5480 m |= (mask << i);
5483 5481
5484 5482 return (m == mask);
5485 5483 }
5486 5484
/*
 * ip_rt_add is called to add an IPv4 route to the forwarding table.
 * ill is passed in to associate it with the correct interface; it may be
 * NULL when the caller supplied no RTA_IFP sockaddr.
 * If ire_arg is set, then we return the held IRE in that location
 * (the caller must ire_refrele() it when done).
 *
 * dst_addr/mask/gw_addr describe the route.  src_addr is consulted only
 * when RTF_SETSRC is present in flags.  sp carries optional gateway
 * security attributes (Trusted Extensions); it is only valid for
 * RTF_GATEWAY routes.  ioctl_msg is B_TRUE on the legacy SIOCADDRT path,
 * in which case an RTM_OLDADD routing-socket message is generated after
 * the route is installed.
 *
 * Returns 0 on success, otherwise an errno (e.g. ENOTSUP for a
 * non-contiguous netmask, ENETUNREACH for an unreachable or zero
 * gateway, EEXIST for a duplicate route, EINVAL, ENOMEM).
 */
int
ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
    ipaddr_t src_addr, int flags, ill_t *ill, ire_t **ire_arg,
    boolean_t ioctl_msg, struct rtsa_s *sp, ip_stack_t *ipst, zoneid_t zoneid)
{
	ire_t	*ire, *nire;
	ire_t	*gw_ire = NULL;
	ipif_t	*ipif = NULL;
	uint_t	type;
	int	match_flags = MATCH_IRE_TYPE;
	tsol_gc_t *gc = NULL;
	tsol_gcgrp_t *gcgrp = NULL;
	boolean_t gcgrp_xtraref = B_FALSE;
	boolean_t cgtp_broadcast;
	boolean_t unbound = B_FALSE;

	ip1dbg(("ip_rt_add:"));

	if (ire_arg != NULL)
		*ire_arg = NULL;

	/* disallow non-contiguous netmasks */
	if (!ip_contiguous_mask(ntohl(mask)))
		return (ENOTSUP);

	/*
	 * If this is the case of RTF_HOST being set, then we set the netmask
	 * to all ones (regardless if one was supplied).
	 */
	if (flags & RTF_HOST)
		mask = IP_HOST_MASK;

	/*
	 * Prevent routes with a zero gateway from being created (since
	 * interfaces can currently be plumbed and brought up with no
	 * assigned address).
	 */
	if (gw_addr == 0)
		return (ENETUNREACH);
	/*
	 * Get the ipif, if any, corresponding to the gw_addr.
	 * If -ifp was specified we restrict ourselves to the ill, otherwise
	 * we match on the gateway and destination to handle unnumbered pt-pt
	 * interfaces.
	 */
	if (ill != NULL)
		ipif = ipif_lookup_addr(gw_addr, ill, ALL_ZONES, ipst);
	else
		ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst);
	if (ipif != NULL) {
		/* Reject a gateway that resolves to a virtual (VNI) ill. */
		if (IS_VNI(ipif->ipif_ill)) {
			ipif_refrele(ipif);
			return (EINVAL);
		}
	}

	/*
	 * GateD will attempt to create routes with a loopback interface
	 * address as the gateway and with RTF_GATEWAY set.  We allow
	 * these routes to be added, but create them as interface routes
	 * since the gateway is an interface address.
	 */
	if ((ipif != NULL) && (ipif->ipif_ire_type == IRE_LOOPBACK)) {
		flags &= ~RTF_GATEWAY;
		if (gw_addr == INADDR_LOOPBACK && dst_addr == INADDR_LOOPBACK &&
		    mask == IP_HOST_MASK) {
			ire = ire_ftable_lookup_v4(dst_addr, 0, 0, IRE_LOOPBACK,
			    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst,
			    NULL);
			if (ire != NULL) {
				ire_refrele(ire);
				ipif_refrele(ipif);
				return (EEXIST);
			}
			ip1dbg(("ip_rt_add: 0x%p creating IRE 0x%x"
			    "for 0x%x\n", (void *)ipif,
			    ipif->ipif_ire_type,
			    ntohl(ipif->ipif_lcl_addr)));
			ire = ire_create(
			    (uchar_t *)&dst_addr,	/* dest address */
			    (uchar_t *)&mask,		/* mask */
			    NULL,			/* no gateway */
			    ipif->ipif_ire_type,	/* LOOPBACK */
			    ipif->ipif_ill,
			    zoneid,
			    (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE : 0,
			    NULL,
			    ipst);

			if (ire == NULL) {
				ipif_refrele(ipif);
				return (ENOMEM);
			}
			/* src address assigned by the caller? */
			if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
				ire->ire_setsrc_addr = src_addr;

			nire = ire_add(ire);
			if (nire == NULL) {
				/*
				 * In the event of failure, ire_add() will have
				 * already deleted the ire in question, so there
				 * is no need to do that here.
				 */
				ipif_refrele(ipif);
				return (ENOMEM);
			}
			/*
			 * Check if it was a duplicate entry. This handles
			 * the case of two racing route adds for the same route
			 */
			if (nire != ire) {
				ASSERT(nire->ire_identical_ref > 1);
				ire_delete(nire);
				ire_refrele(nire);
				ipif_refrele(ipif);
				return (EEXIST);
			}
			ire = nire;
			goto save_ire;
		}
	}

	/*
	 * The routes for multicast with CGTP are quite special in that
	 * the gateway is the local interface address, yet RTF_GATEWAY
	 * is set. We turn off RTF_GATEWAY to provide compatibility with
	 * this undocumented and unusual use of multicast routes.
	 */
	if ((flags & RTF_MULTIRT) && ipif != NULL)
		flags &= ~RTF_GATEWAY;

	/*
	 * Traditionally, interface routes are ones where RTF_GATEWAY isn't set
	 * and the gateway address provided is one of the system's interface
	 * addresses.  By using the routing socket interface and supplying an
	 * RTA_IFP sockaddr with an interface index, an alternate method of
	 * specifying an interface route to be created is available which uses
	 * the interface index that specifies the outgoing interface rather than
	 * the address of an outgoing interface (which may not be able to
	 * uniquely identify an interface).  When coupled with the RTF_GATEWAY
	 * flag, routes can be specified which not only specify the next-hop to
	 * be used when routing to a certain prefix, but also which outgoing
	 * interface should be used.
	 *
	 * Previously, interfaces would have unique addresses assigned to them
	 * and so the address assigned to a particular interface could be used
	 * to identify a particular interface.  One exception to this was the
	 * case of an unnumbered interface (where IPIF_UNNUMBERED was set).
	 *
	 * With the advent of IPv6 and its link-local addresses, this
	 * restriction was relaxed and interfaces could share addresses between
	 * themselves.  In fact, typically all of the link-local interfaces on
	 * an IPv6 node or router will have the same link-local address.  In
	 * order to differentiate between these interfaces, the use of an
	 * interface index is necessary and this index can be carried inside a
	 * RTA_IFP sockaddr (which is actually a sockaddr_dl).  One restriction
	 * of using the interface index, however, is that all of the ipif's that
	 * are part of an ill have the same index and so the RTA_IFP sockaddr
	 * cannot be used to differentiate between ipif's (or logical
	 * interfaces) that belong to the same ill (physical interface).
	 *
	 * For example, in the following case involving IPv4 interfaces and
	 * logical interfaces
	 *
	 *	192.0.2.32	255.255.255.224	192.0.2.33	U	if0
	 *	192.0.2.32	255.255.255.224	192.0.2.34	U	if0
	 *	192.0.2.32	255.255.255.224	192.0.2.35	U	if0
	 *
	 * the ipif's corresponding to each of these interface routes can be
	 * uniquely identified by the "gateway" (actually interface address).
	 *
	 * In this case involving multiple IPv6 default routes to a particular
	 * link-local gateway, the use of RTA_IFP is necessary to specify which
	 * default route is of interest:
	 *
	 *	default		fe80::123:4567:89ab:cdef	U	if0
	 *	default		fe80::123:4567:89ab:cdef	U	if1
	 */

	/* RTF_GATEWAY not set */
	if (!(flags & RTF_GATEWAY)) {
		if (sp != NULL) {
			ip2dbg(("ip_rt_add: gateway security attributes "
			    "cannot be set with interface route\n"));
			if (ipif != NULL)
				ipif_refrele(ipif);
			return (EINVAL);
		}

		/*
		 * Whether or not ill (RTA_IFP) is set, we require that
		 * the gateway is one of our local addresses.
		 */
		if (ipif == NULL)
			return (ENETUNREACH);

		/*
		 * We use MATCH_IRE_ILL here. If the caller specified an
		 * interface (from the RTA_IFP sockaddr) we use it, otherwise
		 * we use the ill derived from the gateway address.
		 * We can always match the gateway address since we record it
		 * in ire_gateway_addr.
		 * We don't allow RTA_IFP to specify a different ill than the
		 * one matching the ipif to make sure we can delete the route.
		 */
		match_flags |= MATCH_IRE_GW | MATCH_IRE_ILL;
		if (ill == NULL) {
			ill = ipif->ipif_ill;
		} else if (ill != ipif->ipif_ill) {
			ipif_refrele(ipif);
			return (EINVAL);
		}

		/*
		 * We check for an existing entry at this point.
		 *
		 * Since a netmask isn't passed in via the ioctl interface
		 * (SIOCADDRT), we don't check for a matching netmask in that
		 * case.
		 */
		if (!ioctl_msg)
			match_flags |= MATCH_IRE_MASK;
		ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr,
		    IRE_INTERFACE, ill, ALL_ZONES, NULL, match_flags, 0, ipst,
		    NULL);
		if (ire != NULL) {
			ire_refrele(ire);
			ipif_refrele(ipif);
			return (EEXIST);
		}

		/*
		 * Some software (for example, GateD and Sun Cluster) attempts
		 * to create (what amount to) IRE_PREFIX routes with the
		 * loopback address as the gateway.  This is primarily done to
		 * set up prefixes with the RTF_REJECT flag set (for example,
		 * when generating aggregate routes.)
		 *
		 * If the IRE type (as defined by ill->ill_net_type) would be
		 * IRE_LOOPBACK, then we map the request into a
		 * IRE_IF_NORESOLVER. We also OR in the RTF_BLACKHOLE flag as
		 * these interface routes, by definition, can only be that.
		 *
		 * Needless to say, the real IRE_LOOPBACK is NOT created by this
		 * routine, but rather using ire_create() directly.
		 *
		 */
		type = ill->ill_net_type;
		if (type == IRE_LOOPBACK) {
			type = IRE_IF_NORESOLVER;
			flags |= RTF_BLACKHOLE;
		}

		/*
		 * Create a copy of the IRE_IF_NORESOLVER or
		 * IRE_IF_RESOLVER with the modified address, netmask, and
		 * gateway.
		 */
		ire = ire_create(
		    (uchar_t *)&dst_addr,
		    (uint8_t *)&mask,
		    (uint8_t *)&gw_addr,
		    type,
		    ill,
		    zoneid,
		    flags,
		    NULL,
		    ipst);
		if (ire == NULL) {
			ipif_refrele(ipif);
			return (ENOMEM);
		}

		/* src address assigned by the caller? */
		if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
			ire->ire_setsrc_addr = src_addr;

		nire = ire_add(ire);
		if (nire == NULL) {
			/*
			 * In the event of failure, ire_add() will have
			 * already deleted the ire in question, so there
			 * is no need to do that here.
			 */
			ipif_refrele(ipif);
			return (ENOMEM);
		}
		/*
		 * Check if it was a duplicate entry. This handles
		 * the case of two racing route adds for the same route
		 */
		if (nire != ire) {
			ire_delete(nire);
			ire_refrele(nire);
			ipif_refrele(ipif);
			return (EEXIST);
		}
		ire = nire;
		goto save_ire;
	}

	/*
	 * Get an interface IRE for the specified gateway.
	 * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the
	 * gateway, it is currently unreachable and we fail the request
	 * accordingly.  We reject any RTF_GATEWAY routes where the gateway
	 * is an IRE_LOCAL or IRE_LOOPBACK.
	 * If RTA_IFP was specified we look on that particular ill.
	 */
	if (ill != NULL)
		match_flags |= MATCH_IRE_ILL;

	/* Check whether the gateway is reachable. */
again:
	type = IRE_INTERFACE | IRE_LOCAL | IRE_LOOPBACK;
	if (flags & RTF_INDIRECT)
		type |= IRE_OFFLINK;

	gw_ire = ire_ftable_lookup_v4(gw_addr, 0, 0, type, ill,
	    ALL_ZONES, NULL, match_flags, 0, ipst, NULL);
	if (gw_ire == NULL) {
		/*
		 * With IPMP, we allow host routes to influence in.mpathd's
		 * target selection.  However, if the test addresses are on
		 * their own network, the above lookup will fail since the
		 * underlying IRE_INTERFACEs are marked hidden.  So allow
		 * hidden test IREs to be found and try again.
		 */
		if (!(match_flags & MATCH_IRE_TESTHIDDEN)) {
			match_flags |= MATCH_IRE_TESTHIDDEN;
			goto again;
		}
		if (ipif != NULL)
			ipif_refrele(ipif);
		return (ENETUNREACH);
	}
	if (gw_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) {
		ire_refrele(gw_ire);
		if (ipif != NULL)
			ipif_refrele(ipif);
		return (ENETUNREACH);
	}

	/*
	 * No RTA_IFP and no RTF_INDIRECT: the route is not bound to an
	 * interface.  With strict source multihoming, pin it to the ill
	 * of the gateway's interface IRE anyway.
	 */
	if (ill == NULL && !(flags & RTF_INDIRECT)) {
		unbound = B_TRUE;
		if (ipst->ips_ip_strict_src_multihoming > 0)
			ill = gw_ire->ire_ill;
	}

	/*
	 * We create one of three types of IREs as a result of this request
	 * based on the netmask.  A netmask of all ones (which is automatically
	 * assumed when RTF_HOST is set) results in an IRE_HOST being created.
	 * An all zeroes netmask implies a default route so an IRE_DEFAULT is
	 * created.  Otherwise, an IRE_PREFIX route is created for the
	 * destination prefix.
	 */
	if (mask == IP_HOST_MASK)
		type = IRE_HOST;
	else if (mask == 0)
		type = IRE_DEFAULT;
	else
		type = IRE_PREFIX;

	/* check for a duplicate entry */
	ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill,
	    ALL_ZONES, NULL, match_flags | MATCH_IRE_MASK | MATCH_IRE_GW,
	    0, ipst, NULL);
	if (ire != NULL) {
		if (ipif != NULL)
			ipif_refrele(ipif);
		ire_refrele(gw_ire);
		ire_refrele(ire);
		return (EEXIST);
	}

	/* Security attribute exists */
	if (sp != NULL) {
		tsol_gcgrp_addr_t ga;

		/* find or create the gateway credentials group */
		ga.ga_af = AF_INET;
		IN6_IPADDR_TO_V4MAPPED(gw_addr, &ga.ga_addr);

		/* we hold reference to it upon success */
		gcgrp = gcgrp_lookup(&ga, B_TRUE);
		if (gcgrp == NULL) {
			if (ipif != NULL)
				ipif_refrele(ipif);
			ire_refrele(gw_ire);
			return (ENOMEM);
		}

		/*
		 * Create and add the security attribute to the group; a
		 * reference to the group is made upon allocating a new
		 * entry successfully.  If it finds an already-existing
		 * entry for the security attribute in the group, it simply
		 * returns it and no new reference is made to the group.
		 */
		gc = gc_create(sp, gcgrp, &gcgrp_xtraref);
		if (gc == NULL) {
			if (ipif != NULL)
				ipif_refrele(ipif);
			/* release reference held by gcgrp_lookup */
			GCGRP_REFRELE(gcgrp);
			ire_refrele(gw_ire);
			return (ENOMEM);
		}
	}

	/* Create the IRE. */
	ire = ire_create(
	    (uchar_t *)&dst_addr,		/* dest address */
	    (uchar_t *)&mask,			/* mask */
	    (uchar_t *)&gw_addr,		/* gateway address */
	    (ushort_t)type,			/* IRE type */
	    ill,
	    zoneid,
	    flags,
	    gc,					/* security attribute */
	    ipst);

	/*
	 * The ire holds a reference to the 'gc' and the 'gc' holds a
	 * reference to the 'gcgrp'. We can now release the extra reference
	 * the 'gcgrp' acquired in the gcgrp_lookup, if it was not used.
	 */
	if (gcgrp_xtraref)
		GCGRP_REFRELE(gcgrp);
	if (ire == NULL) {
		if (gc != NULL)
			GC_REFRELE(gc);
		if (ipif != NULL)
			ipif_refrele(ipif);
		ire_refrele(gw_ire);
		return (ENOMEM);
	}

	/* Before we add, check if an extra CGTP broadcast is needed */
	cgtp_broadcast = ((flags & RTF_MULTIRT) &&
	    ip_type_v4(ire->ire_addr, ipst) == IRE_BROADCAST);

	/* src address assigned by the caller? */
	if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
		ire->ire_setsrc_addr = src_addr;

	ire->ire_unbound = unbound;

	/*
	 * POLICY: should we allow an RTF_HOST with address INADDR_ANY?
	 * SUN/OS socket stuff does but do we really want to allow 0.0.0.0?
	 */

	/* Add the new IRE. */
	nire = ire_add(ire);
	if (nire == NULL) {
		/*
		 * In the event of failure, ire_add() will have
		 * already deleted the ire in question, so there
		 * is no need to do that here.
		 */
		if (ipif != NULL)
			ipif_refrele(ipif);
		ire_refrele(gw_ire);
		return (ENOMEM);
	}
	/*
	 * Check if it was a duplicate entry. This handles
	 * the case of two racing route adds for the same route
	 */
	if (nire != ire) {
		ire_delete(nire);
		ire_refrele(nire);
		if (ipif != NULL)
			ipif_refrele(ipif);
		ire_refrele(gw_ire);
		return (EEXIST);
	}
	ire = nire;

	if (flags & RTF_MULTIRT) {
		/*
		 * Invoke the CGTP (multirouting) filtering module
		 * to add the dst address in the filtering database.
		 * Replicated inbound packets coming from that address
		 * will be filtered to discard the duplicates.
		 * It is not necessary to call the CGTP filter hook
		 * when the dst address is a broadcast or multicast,
		 * because an IP source address cannot be a broadcast
		 * or a multicast.
		 */
		if (cgtp_broadcast) {
			ip_cgtp_bcast_add(ire, ipst);
			goto save_ire;
		}
		if (ipst->ips_ip_cgtp_filter_ops != NULL &&
		    !CLASSD(ire->ire_addr)) {
			int res;
			ipif_t *src_ipif;

			/* Find the source address corresponding to gw_ire */
			src_ipif = ipif_lookup_addr(gw_ire->ire_gateway_addr,
			    NULL, zoneid, ipst);
			if (src_ipif != NULL) {
				res = ipst->ips_ip_cgtp_filter_ops->
				    cfo_add_dest_v4(
				    ipst->ips_netstack->netstack_stackid,
				    ire->ire_addr,
				    ire->ire_gateway_addr,
				    ire->ire_setsrc_addr,
				    src_ipif->ipif_lcl_addr);
				ipif_refrele(src_ipif);
			} else {
				res = EADDRNOTAVAIL;
			}
			if (res != 0) {
				if (ipif != NULL)
					ipif_refrele(ipif);
				ire_refrele(gw_ire);
				ire_delete(ire);
				ire_refrele(ire);	/* Held in ire_add */
				return (res);
			}
		}
	}

save_ire:
	if (gw_ire != NULL) {
		ire_refrele(gw_ire);
		gw_ire = NULL;
	}
	if (ill != NULL) {
		/*
		 * Save enough information so that we can recreate the IRE if
		 * the interface goes down and then up.  The metrics associated
		 * with the route will be saved as well when rts_setmetrics() is
		 * called after the IRE has been created.  In the case where
		 * memory cannot be allocated, none of this information will be
		 * saved.
		 */
		ill_save_ire(ill, ire);
	}
	if (ioctl_msg)
		ip_rts_rtmsg(RTM_OLDADD, ire, 0, ipst);
	if (ire_arg != NULL) {
		/*
		 * Store the ire that was successfully added into where ire_arg
		 * points to so that callers don't have to look it up
		 * themselves (but they are responsible for ire_refrele()ing
		 * the ire when they are finished with it).
		 */
		*ire_arg = ire;
	} else {
		ire_refrele(ire);	/* Held in ire_add */
	}
	if (ipif != NULL)
		ipif_refrele(ipif);
	return (0);
}
6053 6051
/*
 * ip_rt_delete is called to delete an IPv4 route.
 * ill is passed in to associate it with the correct interface; it may be
 * NULL when the caller supplied no RTA_IFP sockaddr.
 *
 * rtm_addrs tells which sockaddrs the caller supplied (RTA_NETMASK
 * determines whether the mask participates in matching).  ioctl_msg is
 * B_TRUE on the legacy SIOCDELRT path, in which case an RTM_OLDDEL
 * routing-socket message is generated after the route is removed.
 *
 * Returns 0 on success, ESRCH if no matching route exists, or an error
 * from the CGTP filter hook.
 */
/* ARGSUSED4 */
int
ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
    uint_t rtm_addrs, int flags, ill_t *ill, boolean_t ioctl_msg,
    ip_stack_t *ipst, zoneid_t zoneid)
{
	ire_t	*ire = NULL;
	ipif_t	*ipif;
	uint_t	type;
	uint_t	match_flags = MATCH_IRE_TYPE;
	int	err = 0;

	ip1dbg(("ip_rt_delete:"));
	/*
	 * If this is the case of RTF_HOST being set, then we set the netmask
	 * to all ones.  Otherwise, we use the netmask if one was supplied.
	 */
	if (flags & RTF_HOST) {
		mask = IP_HOST_MASK;
		match_flags |= MATCH_IRE_MASK;
	} else if (rtm_addrs & RTA_NETMASK) {
		match_flags |= MATCH_IRE_MASK;
	}

	/*
	 * Note that RTF_GATEWAY is never set on a delete, therefore
	 * we check if the gateway address is one of our interfaces first,
	 * and fall back on RTF_GATEWAY routes.
	 *
	 * This makes it possible to delete an original
	 * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1.
	 * However, we have RTF_KERNEL set on the ones created by ipif_up
	 * and those can not be deleted here.
	 *
	 * We use MATCH_IRE_ILL if we know the interface.  If the caller
	 * specified an interface (from the RTA_IFP sockaddr) we use it,
	 * otherwise we use the ill derived from the gateway address.
	 * We can always match the gateway address since we record it
	 * in ire_gateway_addr.
	 *
	 * For more detail on specifying routes by gateway address and by
	 * interface index, see the comments in ip_rt_add().
	 */
	ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst);
	if (ipif != NULL) {
		ill_t	*ill_match;

		if (ill != NULL)
			ill_match = ill;
		else
			ill_match = ipif->ipif_ill;

		match_flags |= MATCH_IRE_ILL;
		/* Loopback ipifs carry IRE_LOOPBACK routes, not interface
		 * routes, so try those first. */
		if (ipif->ipif_ire_type == IRE_LOOPBACK) {
			ire = ire_ftable_lookup_v4(dst_addr, mask, 0,
			    IRE_LOOPBACK, ill_match, ALL_ZONES, NULL,
			    match_flags, 0, ipst, NULL);
		}
		if (ire == NULL) {
			match_flags |= MATCH_IRE_GW;
			ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr,
			    IRE_INTERFACE, ill_match, ALL_ZONES, NULL,
			    match_flags, 0, ipst, NULL);
		}
		/* Avoid deleting routes created by kernel from an ipif */
		if (ire != NULL && (ire->ire_flags & RTF_KERNEL)) {
			ire_refrele(ire);
			ire = NULL;
		}

		/* Restore in case we didn't find a match */
		match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_ILL);
	}

	if (ire == NULL) {
		/*
		 * At this point, the gateway address is not one of our own
		 * addresses or a matching interface route was not found.  We
		 * set the IRE type to lookup based on whether
		 * this is a host route, a default route or just a prefix.
		 *
		 * If an ill was passed in, then the lookup is based on an
		 * interface index so MATCH_IRE_ILL is added to match_flags.
		 */
		match_flags |= MATCH_IRE_GW;
		if (ill != NULL)
			match_flags |= MATCH_IRE_ILL;
		if (mask == IP_HOST_MASK)
			type = IRE_HOST;
		else if (mask == 0)
			type = IRE_DEFAULT;
		else
			type = IRE_PREFIX;
		ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill,
		    ALL_ZONES, NULL, match_flags, 0, ipst, NULL);
	}

	if (ipif != NULL) {
		ipif_refrele(ipif);
		ipif = NULL;
	}

	if (ire == NULL)
		return (ESRCH);

	if (ire->ire_flags & RTF_MULTIRT) {
		/*
		 * Invoke the CGTP (multirouting) filtering module
		 * to remove the dst address from the filtering database.
		 * Packets coming from that address will no longer be
		 * filtered to remove duplicates.
		 */
		if (ipst->ips_ip_cgtp_filter_ops != NULL) {
			err = ipst->ips_ip_cgtp_filter_ops->cfo_del_dest_v4(
			    ipst->ips_netstack->netstack_stackid,
			    ire->ire_addr, ire->ire_gateway_addr);
		}
		ip_cgtp_bcast_delete(ire, ipst);
	}

	/* Drop the recreate-on-replumb state saved by ill_save_ire(). */
	ill = ire->ire_ill;
	if (ill != NULL)
		ill_remove_saved_ire(ill, ire);
	if (ioctl_msg)
		ip_rts_rtmsg(RTM_OLDDEL, ire, 0, ipst);
	ire_delete(ire);
	ire_refrele(ire);
	return (err);
}
6187 6185
6188 6186 /*
6189 6187 * ip_siocaddrt is called to complete processing of an SIOCADDRT IOCTL.
6190 6188 */
6191 6189 /* ARGSUSED */
6192 6190 int
6193 6191 ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
6194 6192 ip_ioctl_cmd_t *ipip, void *dummy_if_req)
6195 6193 {
6196 6194 ipaddr_t dst_addr;
6197 6195 ipaddr_t gw_addr;
6198 6196 ipaddr_t mask;
6199 6197 int error = 0;
6200 6198 mblk_t *mp1;
6201 6199 struct rtentry *rt;
6202 6200 ipif_t *ipif = NULL;
6203 6201 ip_stack_t *ipst;
6204 6202
6205 6203 ASSERT(q->q_next == NULL);
6206 6204 ipst = CONNQ_TO_IPST(q);
6207 6205
6208 6206 ip1dbg(("ip_siocaddrt:"));
6209 6207 /* Existence of mp1 verified in ip_wput_nondata */
6210 6208 mp1 = mp->b_cont->b_cont;
6211 6209 rt = (struct rtentry *)mp1->b_rptr;
6212 6210
6213 6211 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr;
6214 6212 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr;
6215 6213
6216 6214 /*
6217 6215 * If the RTF_HOST flag is on, this is a request to assign a gateway
6218 6216 * to a particular host address. In this case, we set the netmask to
6219 6217 * all ones for the particular destination address. Otherwise,
6220 6218 * determine the netmask to be used based on dst_addr and the interfaces
6221 6219 * in use.
6222 6220 */
6223 6221 if (rt->rt_flags & RTF_HOST) {
6224 6222 mask = IP_HOST_MASK;
6225 6223 } else {
6226 6224 /*
6227 6225 * Note that ip_subnet_mask returns a zero mask in the case of
6228 6226 * default (an all-zeroes address).
6229 6227 */
6230 6228 mask = ip_subnet_mask(dst_addr, &ipif, ipst);
6231 6229 }
6232 6230
6233 6231 error = ip_rt_add(dst_addr, mask, gw_addr, 0, rt->rt_flags, NULL, NULL,
6234 6232 B_TRUE, NULL, ipst, ALL_ZONES);
6235 6233 if (ipif != NULL)
6236 6234 ipif_refrele(ipif);
6237 6235 return (error);
6238 6236 }
6239 6237
6240 6238 /*
6241 6239 * ip_siocdelrt is called to complete processing of an SIOCDELRT IOCTL.
6242 6240 */
6243 6241 /* ARGSUSED */
6244 6242 int
6245 6243 ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
6246 6244 ip_ioctl_cmd_t *ipip, void *dummy_if_req)
6247 6245 {
6248 6246 ipaddr_t dst_addr;
6249 6247 ipaddr_t gw_addr;
6250 6248 ipaddr_t mask;
6251 6249 int error;
6252 6250 mblk_t *mp1;
6253 6251 struct rtentry *rt;
6254 6252 ipif_t *ipif = NULL;
6255 6253 ip_stack_t *ipst;
6256 6254
6257 6255 ASSERT(q->q_next == NULL);
6258 6256 ipst = CONNQ_TO_IPST(q);
6259 6257
6260 6258 ip1dbg(("ip_siocdelrt:"));
6261 6259 /* Existence of mp1 verified in ip_wput_nondata */
6262 6260 mp1 = mp->b_cont->b_cont;
6263 6261 rt = (struct rtentry *)mp1->b_rptr;
6264 6262
6265 6263 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr;
6266 6264 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr;
6267 6265
6268 6266 /*
6269 6267 * If the RTF_HOST flag is on, this is a request to delete a gateway
6270 6268 * to a particular host address. In this case, we set the netmask to
6271 6269 * all ones for the particular destination address. Otherwise,
6272 6270 * determine the netmask to be used based on dst_addr and the interfaces
6273 6271 * in use.
6274 6272 */
6275 6273 if (rt->rt_flags & RTF_HOST) {
6276 6274 mask = IP_HOST_MASK;
6277 6275 } else {
6278 6276 /*
6279 6277 * Note that ip_subnet_mask returns a zero mask in the case of
6280 6278 * default (an all-zeroes address).
6281 6279 */
6282 6280 mask = ip_subnet_mask(dst_addr, &ipif, ipst);
6283 6281 }
6284 6282
6285 6283 error = ip_rt_delete(dst_addr, mask, gw_addr,
6286 6284 RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, B_TRUE,
6287 6285 ipst, ALL_ZONES);
6288 6286 if (ipif != NULL)
6289 6287 ipif_refrele(ipif);
6290 6288 return (error);
6291 6289 }
6292 6290
6293 6291 /*
6294 6292 * Enqueue the mp onto the ipsq, chained by b_next.
6295 6293 * b_prev stores the function to be executed later, and b_queue the queue
6296 6294 * where this mp originated.
6297 6295 */
6298 6296 void
6299 6297 ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type,
6300 6298 ill_t *pending_ill)
6301 6299 {
6302 6300 conn_t *connp;
6303 6301 ipxop_t *ipx = ipsq->ipsq_xop;
6304 6302
6305 6303 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock));
6306 6304 ASSERT(MUTEX_HELD(&ipx->ipx_lock));
6307 6305 ASSERT(func != NULL);
6308 6306
6309 6307 mp->b_queue = q;
6310 6308 mp->b_prev = (void *)func;
6311 6309 mp->b_next = NULL;
6312 6310
6313 6311 switch (type) {
6314 6312 case CUR_OP:
6315 6313 if (ipx->ipx_mptail != NULL) {
6316 6314 ASSERT(ipx->ipx_mphead != NULL);
6317 6315 ipx->ipx_mptail->b_next = mp;
6318 6316 } else {
6319 6317 ASSERT(ipx->ipx_mphead == NULL);
6320 6318 ipx->ipx_mphead = mp;
6321 6319 }
6322 6320 ipx->ipx_mptail = mp;
6323 6321 break;
6324 6322
6325 6323 case NEW_OP:
6326 6324 if (ipsq->ipsq_xopq_mptail != NULL) {
6327 6325 ASSERT(ipsq->ipsq_xopq_mphead != NULL);
6328 6326 ipsq->ipsq_xopq_mptail->b_next = mp;
6329 6327 } else {
6330 6328 ASSERT(ipsq->ipsq_xopq_mphead == NULL);
6331 6329 ipsq->ipsq_xopq_mphead = mp;
6332 6330 }
6333 6331 ipsq->ipsq_xopq_mptail = mp;
6334 6332 ipx->ipx_ipsq_queued = B_TRUE;
6335 6333 break;
6336 6334
6337 6335 case SWITCH_OP:
6338 6336 ASSERT(ipsq->ipsq_swxop != NULL);
6339 6337 /* only one switch operation is currently allowed */
6340 6338 ASSERT(ipsq->ipsq_switch_mp == NULL);
6341 6339 ipsq->ipsq_switch_mp = mp;
6342 6340 ipx->ipx_ipsq_queued = B_TRUE;
6343 6341 break;
6344 6342 default:
6345 6343 cmn_err(CE_PANIC, "ipsq_enq %d type \n", type);
6346 6344 }
6347 6345
6348 6346 if (CONN_Q(q) && pending_ill != NULL) {
6349 6347 connp = Q_TO_CONN(q);
6350 6348 ASSERT(MUTEX_HELD(&connp->conn_lock));
6351 6349 connp->conn_oper_pending_ill = pending_ill;
6352 6350 }
6353 6351 }
6354 6352
6355 6353 /*
6356 6354 * Dequeue the next message that requested exclusive access to this IPSQ's
6357 6355 * xop. Specifically:
6358 6356 *
6359 6357 * 1. If we're still processing the current operation on `ipsq', then
6360 6358 * dequeue the next message for the operation (from ipx_mphead), or
6361 6359 * return NULL if there are no queued messages for the operation.
6362 6360 * These messages are queued via CUR_OP to qwriter_ip() and friends.
6363 6361 *
6364 6362 * 2. If the current operation on `ipsq' has completed (ipx_current_ipif is
6365 6363 * not set) see if the ipsq has requested an xop switch. If so, switch
6366 6364 * `ipsq' to a different xop. Xop switches only happen when joining or
6367 6365 * leaving IPMP groups and require a careful dance -- see the comments
6368 6366 * in-line below for details. If we're leaving a group xop or if we're
6369 6367 * joining a group xop and become writer on it, then we proceed to (3).
6370 6368 * Otherwise, we return NULL and exit the xop.
6371 6369 *
6372 6370 * 3. For each IPSQ in the xop, return any switch operation stored on
6373 6371 * ipsq_switch_mp (set via SWITCH_OP); these must be processed before
6374 6372 * any other messages queued on the IPSQ. Otherwise, dequeue the next
6375 6373 * exclusive operation (queued via NEW_OP) stored on ipsq_xopq_mphead.
6376 6374 * Note that if the phyint tied to `ipsq' is not using IPMP there will
6377 6375 * only be one IPSQ in the xop. Otherwise, there will be one IPSQ for
6378 6376 * each phyint in the group, including the IPMP meta-interface phyint.
6379 6377 */
6380 6378 static mblk_t *
6381 6379 ipsq_dq(ipsq_t *ipsq)
6382 6380 {
6383 6381 ill_t *illv4, *illv6;
6384 6382 mblk_t *mp;
6385 6383 ipsq_t *xopipsq;
6386 6384 ipsq_t *leftipsq = NULL;
6387 6385 ipxop_t *ipx;
6388 6386 phyint_t *phyi = ipsq->ipsq_phyint;
6389 6387 ip_stack_t *ipst = ipsq->ipsq_ipst;
6390 6388 boolean_t emptied = B_FALSE;
6391 6389
6392 6390 /*
6393 6391 * Grab all the locks we need in the defined order (ill_g_lock ->
6394 6392 * ipsq_lock -> ipx_lock); ill_g_lock is needed to use ipsq_next.
6395 6393 */
6396 6394 rw_enter(&ipst->ips_ill_g_lock,
6397 6395 ipsq->ipsq_swxop != NULL ? RW_WRITER : RW_READER);
6398 6396 mutex_enter(&ipsq->ipsq_lock);
6399 6397 ipx = ipsq->ipsq_xop;
6400 6398 mutex_enter(&ipx->ipx_lock);
6401 6399
6402 6400 /*
6403 6401 * Dequeue the next message associated with the current exclusive
6404 6402 * operation, if any.
6405 6403 */
6406 6404 if ((mp = ipx->ipx_mphead) != NULL) {
6407 6405 ipx->ipx_mphead = mp->b_next;
6408 6406 if (ipx->ipx_mphead == NULL)
6409 6407 ipx->ipx_mptail = NULL;
6410 6408 mp->b_next = (void *)ipsq;
6411 6409 goto out;
6412 6410 }
6413 6411
6414 6412 if (ipx->ipx_current_ipif != NULL)
6415 6413 goto empty;
6416 6414
6417 6415 if (ipsq->ipsq_swxop != NULL) {
6418 6416 /*
6419 6417 * The exclusive operation that is now being completed has
6420 6418 * requested a switch to a different xop. This happens
6421 6419 * when an interface joins or leaves an IPMP group. Joins
6422 6420 * happen through SIOCSLIFGROUPNAME (ip_sioctl_groupname()).
6423 6421 * Leaves happen via SIOCSLIFGROUPNAME, interface unplumb
6424 6422 * (phyint_free()), or interface plumb for an ill type
6425 6423 * not in the IPMP group (ip_rput_dlpi_writer()).
6426 6424 *
6427 6425 * Xop switches are not allowed on the IPMP meta-interface.
6428 6426 */
6429 6427 ASSERT(phyi == NULL || !(phyi->phyint_flags & PHYI_IPMP));
6430 6428 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
6431 6429 DTRACE_PROBE1(ipsq__switch, (ipsq_t *), ipsq);
6432 6430
6433 6431 if (ipsq->ipsq_swxop == &ipsq->ipsq_ownxop) {
6434 6432 /*
6435 6433 * We're switching back to our own xop, so we have two
6436 6434 * xop's to drain/exit: our own, and the group xop
6437 6435 * that we are leaving.
6438 6436 *
6439 6437 * First, pull ourselves out of the group ipsq list.
6440 6438 * This is safe since we're writer on ill_g_lock.
6441 6439 */
6442 6440 ASSERT(ipsq->ipsq_xop != &ipsq->ipsq_ownxop);
6443 6441
6444 6442 xopipsq = ipx->ipx_ipsq;
6445 6443 while (xopipsq->ipsq_next != ipsq)
6446 6444 xopipsq = xopipsq->ipsq_next;
6447 6445
6448 6446 xopipsq->ipsq_next = ipsq->ipsq_next;
6449 6447 ipsq->ipsq_next = ipsq;
6450 6448 ipsq->ipsq_xop = ipsq->ipsq_swxop;
6451 6449 ipsq->ipsq_swxop = NULL;
6452 6450
6453 6451 /*
6454 6452 * Second, prepare to exit the group xop. The actual
6455 6453 * ipsq_exit() is done at the end of this function
6456 6454 * since we cannot hold any locks across ipsq_exit().
6457 6455 * Note that although we drop the group's ipx_lock, no
6458 6456 * threads can proceed since we're still ipx_writer.
6459 6457 */
6460 6458 leftipsq = xopipsq;
6461 6459 mutex_exit(&ipx->ipx_lock);
6462 6460
6463 6461 /*
6464 6462 * Third, set ipx to point to our own xop (which was
6465 6463 * inactive and therefore can be entered).
6466 6464 */
6467 6465 ipx = ipsq->ipsq_xop;
6468 6466 mutex_enter(&ipx->ipx_lock);
6469 6467 ASSERT(ipx->ipx_writer == NULL);
6470 6468 ASSERT(ipx->ipx_current_ipif == NULL);
6471 6469 } else {
6472 6470 /*
6473 6471 * We're switching from our own xop to a group xop.
6474 6472 * The requestor of the switch must ensure that the
6475 6473 * group xop cannot go away (e.g. by ensuring the
6476 6474 * phyint associated with the xop cannot go away).
6477 6475 *
6478 6476 * If we can become writer on our new xop, then we'll
6479 6477 * do the drain. Otherwise, the current writer of our
6480 6478 * new xop will do the drain when it exits.
6481 6479 *
6482 6480 * First, splice ourselves into the group IPSQ list.
6483 6481 * This is safe since we're writer on ill_g_lock.
6484 6482 */
6485 6483 ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop);
6486 6484
6487 6485 xopipsq = ipsq->ipsq_swxop->ipx_ipsq;
6488 6486 while (xopipsq->ipsq_next != ipsq->ipsq_swxop->ipx_ipsq)
6489 6487 xopipsq = xopipsq->ipsq_next;
6490 6488
6491 6489 xopipsq->ipsq_next = ipsq;
6492 6490 ipsq->ipsq_next = ipsq->ipsq_swxop->ipx_ipsq;
6493 6491 ipsq->ipsq_xop = ipsq->ipsq_swxop;
6494 6492 ipsq->ipsq_swxop = NULL;
6495 6493
6496 6494 /*
6497 6495 * Second, exit our own xop, since it's now unused.
6498 6496 * This is safe since we've got the only reference.
6499 6497 */
6500 6498 ASSERT(ipx->ipx_writer == curthread);
6501 6499 ipx->ipx_writer = NULL;
6502 6500 VERIFY(--ipx->ipx_reentry_cnt == 0);
6503 6501 ipx->ipx_ipsq_queued = B_FALSE;
6504 6502 mutex_exit(&ipx->ipx_lock);
6505 6503
6506 6504 /*
6507 6505 * Third, set ipx to point to our new xop, and check
6508 6506 * if we can become writer on it. If we cannot, then
6509 6507 * the current writer will drain the IPSQ group when
6510 6508 * it exits. Our ipsq_xop is guaranteed to be stable
6511 6509 * because we're still holding ipsq_lock.
6512 6510 */
6513 6511 ipx = ipsq->ipsq_xop;
6514 6512 mutex_enter(&ipx->ipx_lock);
6515 6513 if (ipx->ipx_writer != NULL ||
6516 6514 ipx->ipx_current_ipif != NULL) {
6517 6515 goto out;
6518 6516 }
6519 6517 }
6520 6518
6521 6519 /*
6522 6520 * Fourth, become writer on our new ipx before we continue
6523 6521 * with the drain. Note that we never dropped ipsq_lock
6524 6522 * above, so no other thread could've raced with us to
6525 6523 * become writer first. Also, we're holding ipx_lock, so
6526 6524 * no other thread can examine the ipx right now.
6527 6525 */
6528 6526 ASSERT(ipx->ipx_current_ipif == NULL);
6529 6527 ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL);
6530 6528 VERIFY(ipx->ipx_reentry_cnt++ == 0);
6531 6529 ipx->ipx_writer = curthread;
6532 6530 ipx->ipx_forced = B_FALSE;
6533 6531 #ifdef DEBUG
6534 6532 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
6535 6533 #endif
6536 6534 }
6537 6535
6538 6536 xopipsq = ipsq;
6539 6537 do {
6540 6538 /*
6541 6539 * So that other operations operate on a consistent and
6542 6540 * complete phyint, a switch message on an IPSQ must be
6543 6541 * handled prior to any other operations on that IPSQ.
6544 6542 */
6545 6543 if ((mp = xopipsq->ipsq_switch_mp) != NULL) {
6546 6544 xopipsq->ipsq_switch_mp = NULL;
6547 6545 ASSERT(mp->b_next == NULL);
6548 6546 mp->b_next = (void *)xopipsq;
6549 6547 goto out;
6550 6548 }
6551 6549
6552 6550 if ((mp = xopipsq->ipsq_xopq_mphead) != NULL) {
6553 6551 xopipsq->ipsq_xopq_mphead = mp->b_next;
6554 6552 if (xopipsq->ipsq_xopq_mphead == NULL)
6555 6553 xopipsq->ipsq_xopq_mptail = NULL;
6556 6554 mp->b_next = (void *)xopipsq;
6557 6555 goto out;
6558 6556 }
6559 6557 } while ((xopipsq = xopipsq->ipsq_next) != ipsq);
6560 6558 empty:
6561 6559 /*
6562 6560 * There are no messages. Further, we are holding ipx_lock, hence no
6563 6561 * new messages can end up on any IPSQ in the xop.
6564 6562 */
6565 6563 ipx->ipx_writer = NULL;
6566 6564 ipx->ipx_forced = B_FALSE;
6567 6565 VERIFY(--ipx->ipx_reentry_cnt == 0);
6568 6566 ipx->ipx_ipsq_queued = B_FALSE;
6569 6567 emptied = B_TRUE;
6570 6568 #ifdef DEBUG
6571 6569 ipx->ipx_depth = 0;
6572 6570 #endif
6573 6571 out:
6574 6572 mutex_exit(&ipx->ipx_lock);
6575 6573 mutex_exit(&ipsq->ipsq_lock);
6576 6574
6577 6575 /*
6578 6576 * If we completely emptied the xop, then wake up any threads waiting
6579 6577 * to enter any of the IPSQ's associated with it.
6580 6578 */
6581 6579 if (emptied) {
6582 6580 xopipsq = ipsq;
6583 6581 do {
6584 6582 if ((phyi = xopipsq->ipsq_phyint) == NULL)
6585 6583 continue;
6586 6584
6587 6585 illv4 = phyi->phyint_illv4;
6588 6586 illv6 = phyi->phyint_illv6;
6589 6587
6590 6588 GRAB_ILL_LOCKS(illv4, illv6);
6591 6589 if (illv4 != NULL)
6592 6590 cv_broadcast(&illv4->ill_cv);
6593 6591 if (illv6 != NULL)
6594 6592 cv_broadcast(&illv6->ill_cv);
6595 6593 RELEASE_ILL_LOCKS(illv4, illv6);
6596 6594 } while ((xopipsq = xopipsq->ipsq_next) != ipsq);
6597 6595 }
6598 6596 rw_exit(&ipst->ips_ill_g_lock);
6599 6597
6600 6598 /*
6601 6599 * Now that all locks are dropped, exit the IPSQ we left.
6602 6600 */
6603 6601 if (leftipsq != NULL)
6604 6602 ipsq_exit(leftipsq);
6605 6603
6606 6604 return (mp);
6607 6605 }
6608 6606
6609 6607 /*
6610 6608 * Return completion status of previously initiated DLPI operations on
6611 6609 * ills in the purview of an ipsq.
6612 6610 */
6613 6611 static boolean_t
6614 6612 ipsq_dlpi_done(ipsq_t *ipsq)
6615 6613 {
6616 6614 ipsq_t *ipsq_start;
6617 6615 phyint_t *phyi;
6618 6616 ill_t *ill;
6619 6617
6620 6618 ASSERT(RW_LOCK_HELD(&ipsq->ipsq_ipst->ips_ill_g_lock));
6621 6619 ipsq_start = ipsq;
6622 6620
6623 6621 do {
6624 6622 /*
6625 6623 * The only current users of this function are ipsq_try_enter
6626 6624 * and ipsq_enter which have made sure that ipsq_writer is
6627 6625 * NULL before we reach here. ill_dlpi_pending is modified
6628 6626 * only by an ipsq writer
6629 6627 */
6630 6628 ASSERT(ipsq->ipsq_xop->ipx_writer == NULL);
6631 6629 phyi = ipsq->ipsq_phyint;
6632 6630 /*
6633 6631 * phyi could be NULL if a phyint that is part of an
6634 6632 * IPMP group is being unplumbed. A more detailed
6635 6633 * comment is in ipmp_grp_update_kstats()
6636 6634 */
6637 6635 if (phyi != NULL) {
6638 6636 ill = phyi->phyint_illv4;
6639 6637 if (ill != NULL &&
6640 6638 (ill->ill_dlpi_pending != DL_PRIM_INVAL ||
6641 6639 ill->ill_arl_dlpi_pending))
6642 6640 return (B_FALSE);
6643 6641
6644 6642 ill = phyi->phyint_illv6;
6645 6643 if (ill != NULL &&
6646 6644 ill->ill_dlpi_pending != DL_PRIM_INVAL)
6647 6645 return (B_FALSE);
6648 6646 }
6649 6647
6650 6648 } while ((ipsq = ipsq->ipsq_next) != ipsq_start);
6651 6649
6652 6650 return (B_TRUE);
6653 6651 }
6654 6652
6655 6653 /*
6656 6654 * Enter the ipsq corresponding to ill, by waiting synchronously till
6657 6655 * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq
6658 6656 * will have to drain completely before ipsq_enter returns success.
6659 6657 * ipx_current_ipif will be set if some exclusive op is in progress,
6660 6658 * and the ipsq_exit logic will start the next enqueued op after
6661 6659 * completion of the current op. If 'force' is used, we don't wait
6662 6660 * for the enqueued ops. This is needed when a conn_close wants to
6663 6661 * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb
6664 6662 * of an ill can also use this option. But we dont' use it currently.
6665 6663 */
6666 6664 #define ENTER_SQ_WAIT_TICKS 100
6667 6665 boolean_t
6668 6666 ipsq_enter(ill_t *ill, boolean_t force, int type)
6669 6667 {
6670 6668 ipsq_t *ipsq;
6671 6669 ipxop_t *ipx;
6672 6670 boolean_t waited_enough = B_FALSE;
6673 6671 ip_stack_t *ipst = ill->ill_ipst;
6674 6672
6675 6673 /*
6676 6674 * Note that the relationship between ill and ipsq is fixed as long as
6677 6675 * the ill is not ILL_CONDEMNED. Holding ipsq_lock ensures the
6678 6676 * relationship between the IPSQ and xop cannot change. However,
6679 6677 * since we cannot hold ipsq_lock across the cv_wait(), it may change
6680 6678 * while we're waiting. We wait on ill_cv and rely on ipsq_exit()
6681 6679 * waking up all ills in the xop when it becomes available.
6682 6680 */
6683 6681 for (;;) {
6684 6682 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
6685 6683 mutex_enter(&ill->ill_lock);
6686 6684 if (ill->ill_state_flags & ILL_CONDEMNED) {
6687 6685 mutex_exit(&ill->ill_lock);
6688 6686 rw_exit(&ipst->ips_ill_g_lock);
6689 6687 return (B_FALSE);
6690 6688 }
6691 6689
6692 6690 ipsq = ill->ill_phyint->phyint_ipsq;
6693 6691 mutex_enter(&ipsq->ipsq_lock);
6694 6692 ipx = ipsq->ipsq_xop;
6695 6693 mutex_enter(&ipx->ipx_lock);
6696 6694
6697 6695 if (ipx->ipx_writer == NULL && (type == CUR_OP ||
6698 6696 (ipx->ipx_current_ipif == NULL && ipsq_dlpi_done(ipsq)) ||
6699 6697 waited_enough))
6700 6698 break;
6701 6699
6702 6700 rw_exit(&ipst->ips_ill_g_lock);
6703 6701
6704 6702 if (!force || ipx->ipx_writer != NULL) {
6705 6703 mutex_exit(&ipx->ipx_lock);
6706 6704 mutex_exit(&ipsq->ipsq_lock);
6707 6705 cv_wait(&ill->ill_cv, &ill->ill_lock);
6708 6706 } else {
6709 6707 mutex_exit(&ipx->ipx_lock);
6710 6708 mutex_exit(&ipsq->ipsq_lock);
6711 6709 (void) cv_reltimedwait(&ill->ill_cv,
6712 6710 &ill->ill_lock, ENTER_SQ_WAIT_TICKS, TR_CLOCK_TICK);
6713 6711 waited_enough = B_TRUE;
6714 6712 }
6715 6713 mutex_exit(&ill->ill_lock);
6716 6714 }
6717 6715
6718 6716 ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL);
6719 6717 ASSERT(ipx->ipx_reentry_cnt == 0);
6720 6718 ipx->ipx_writer = curthread;
6721 6719 ipx->ipx_forced = (ipx->ipx_current_ipif != NULL);
6722 6720 ipx->ipx_reentry_cnt++;
6723 6721 #ifdef DEBUG
6724 6722 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
6725 6723 #endif
6726 6724 mutex_exit(&ipx->ipx_lock);
6727 6725 mutex_exit(&ipsq->ipsq_lock);
6728 6726 mutex_exit(&ill->ill_lock);
6729 6727 rw_exit(&ipst->ips_ill_g_lock);
6730 6728
6731 6729 return (B_TRUE);
6732 6730 }
6733 6731
6734 6732 /*
6735 6733 * ipif_set_values() has a constraint that it cannot drop the ips_ill_g_lock
6736 6734 * across the call to the core interface ipsq_try_enter() and hence calls this
6737 6735 * function directly. This is explained more fully in ipif_set_values().
6738 6736 * In order to support the above constraint, ipsq_try_enter is implemented as
6739 6737 * a wrapper that grabs the ips_ill_g_lock and calls this function subsequently
6740 6738 */
6741 6739 static ipsq_t *
6742 6740 ipsq_try_enter_internal(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func,
6743 6741 int type, boolean_t reentry_ok)
6744 6742 {
6745 6743 ipsq_t *ipsq;
6746 6744 ipxop_t *ipx;
6747 6745 ip_stack_t *ipst = ill->ill_ipst;
6748 6746
6749 6747 /*
6750 6748 * lock ordering:
6751 6749 * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock.
6752 6750 *
6753 6751 * ipx of an ipsq can't change when ipsq_lock is held.
6754 6752 */
6755 6753 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
6756 6754 GRAB_CONN_LOCK(q);
6757 6755 mutex_enter(&ill->ill_lock);
6758 6756 ipsq = ill->ill_phyint->phyint_ipsq;
6759 6757 mutex_enter(&ipsq->ipsq_lock);
6760 6758 ipx = ipsq->ipsq_xop;
6761 6759 mutex_enter(&ipx->ipx_lock);
6762 6760
6763 6761 /*
6764 6762 * 1. Enter the ipsq if we are already writer and reentry is ok.
6765 6763 * (Note: If the caller does not specify reentry_ok then neither
6766 6764 * 'func' nor any of its callees must ever attempt to enter the ipsq
6767 6765 * again. Otherwise it can lead to an infinite loop
6768 6766 * 2. Enter the ipsq if there is no current writer and this attempted
6769 6767 * entry is part of the current operation
6770 6768 * 3. Enter the ipsq if there is no current writer and this is a new
6771 6769 * operation and the operation queue is empty and there is no
6772 6770 * operation currently in progress and if all previously initiated
6773 6771 * DLPI operations have completed.
6774 6772 */
6775 6773 if ((ipx->ipx_writer == curthread && reentry_ok) ||
6776 6774 (ipx->ipx_writer == NULL && (type == CUR_OP || (type == NEW_OP &&
6777 6775 !ipx->ipx_ipsq_queued && ipx->ipx_current_ipif == NULL &&
6778 6776 ipsq_dlpi_done(ipsq))))) {
6779 6777 /* Success. */
6780 6778 ipx->ipx_reentry_cnt++;
6781 6779 ipx->ipx_writer = curthread;
6782 6780 ipx->ipx_forced = B_FALSE;
6783 6781 mutex_exit(&ipx->ipx_lock);
6784 6782 mutex_exit(&ipsq->ipsq_lock);
6785 6783 mutex_exit(&ill->ill_lock);
6786 6784 RELEASE_CONN_LOCK(q);
6787 6785 #ifdef DEBUG
6788 6786 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
6789 6787 #endif
6790 6788 return (ipsq);
6791 6789 }
6792 6790
6793 6791 if (func != NULL)
6794 6792 ipsq_enq(ipsq, q, mp, func, type, ill);
6795 6793
6796 6794 mutex_exit(&ipx->ipx_lock);
6797 6795 mutex_exit(&ipsq->ipsq_lock);
6798 6796 mutex_exit(&ill->ill_lock);
6799 6797 RELEASE_CONN_LOCK(q);
6800 6798 return (NULL);
6801 6799 }
6802 6800
6803 6801 /*
6804 6802 * The ipsq_t (ipsq) is the synchronization data structure used to serialize
6805 6803 * certain critical operations like plumbing (i.e. most set ioctls), etc.
6806 6804 * There is one ipsq per phyint. The ipsq
6807 6805 * serializes exclusive ioctls issued by applications on a per ipsq basis in
6808 6806 * ipsq_xopq_mphead. It also protects against multiple threads executing in
6809 6807 * the ipsq. Responses from the driver pertain to the current ioctl (say a
6810 6808 * DL_BIND_ACK in response to a DL_BIND_REQ initiated as part of bringing
6811 6809 * up the interface) and are enqueued in ipx_mphead.
6812 6810 *
 * If a thread does not want to reenter the ipsq when it is already writer,
 * it must ensure that neither the specified reentry point to be called
 * later when the ipsq is empty, nor any code path starting from that
 * reentry point, ever tries to enter the ipsq again. Otherwise it can
 * lead to an infinite loop. The reentry point ip_rput_dlpi_writer is an
 * example.
6818 6816 * When the thread that is currently exclusive finishes, it (ipsq_exit)
6819 6817 * dequeues the requests waiting to become exclusive in ipx_mphead and calls
6820 6818 * the reentry point. When the list at ipx_mphead becomes empty ipsq_exit
6821 6819 * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next
6822 6820 * ioctl if the current ioctl has completed. If the current ioctl is still
6823 6821 * in progress it simply returns. The current ioctl could be waiting for
 * a response from another module (the driver) or could be waiting for
 * the ipif/ill/ire refcnts to drop to zero. In such a case the ipx_pending_mp
6826 6824 * and ipx_pending_ipif are set. ipx_current_ipif is set throughout the
6827 6825 * execution of the ioctl and ipsq_exit does not start the next ioctl unless
6828 6826 * ipx_current_ipif is NULL which happens only once the ioctl is complete and
6829 6827 * all associated DLPI operations have completed.
6830 6828 */
6831 6829
6832 6830 /*
6833 6831 * Try to enter the IPSQ corresponding to `ipif' or `ill' exclusively (`ipif'
6834 6832 * and `ill' cannot both be specified). Returns a pointer to the entered IPSQ
6835 6833 * on success, or NULL on failure. The caller ensures ipif/ill is valid by
6836 6834 * refholding it as necessary. If the IPSQ cannot be entered and `func' is
6837 6835 * non-NULL, then `func' will be called back with `q' and `mp' once the IPSQ
6838 6836 * can be entered. If `func' is NULL, then `q' and `mp' are ignored.
6839 6837 */
6840 6838 ipsq_t *
6841 6839 ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp,
6842 6840 ipsq_func_t func, int type, boolean_t reentry_ok)
6843 6841 {
6844 6842 ip_stack_t *ipst;
6845 6843 ipsq_t *ipsq;
6846 6844
6847 6845 /* Only 1 of ipif or ill can be specified */
6848 6846 ASSERT((ipif != NULL) ^ (ill != NULL));
6849 6847
6850 6848 if (ipif != NULL)
6851 6849 ill = ipif->ipif_ill;
6852 6850 ipst = ill->ill_ipst;
6853 6851
6854 6852 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
6855 6853 ipsq = ipsq_try_enter_internal(ill, q, mp, func, type, reentry_ok);
6856 6854 rw_exit(&ipst->ips_ill_g_lock);
6857 6855
6858 6856 return (ipsq);
6859 6857 }
6860 6858
6861 6859 /*
6862 6860 * Try to enter the IPSQ corresponding to `ill' as writer. The caller ensures
6863 6861 * ill is valid by refholding it if necessary; we will refrele. If the IPSQ
6864 6862 * cannot be entered, the mp is queued for completion.
6865 6863 */
6866 6864 void
6867 6865 qwriter_ip(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type,
6868 6866 boolean_t reentry_ok)
6869 6867 {
6870 6868 ipsq_t *ipsq;
6871 6869
6872 6870 ipsq = ipsq_try_enter(NULL, ill, q, mp, func, type, reentry_ok);
6873 6871
6874 6872 /*
6875 6873 * Drop the caller's refhold on the ill. This is safe since we either
6876 6874 * entered the IPSQ (and thus are exclusive), or failed to enter the
6877 6875 * IPSQ, in which case we return without accessing ill anymore. This
6878 6876 * is needed because func needs to see the correct refcount.
6879 6877 * e.g. removeif can work only then.
6880 6878 */
6881 6879 ill_refrele(ill);
6882 6880 if (ipsq != NULL) {
6883 6881 (*func)(ipsq, q, mp, NULL);
6884 6882 ipsq_exit(ipsq);
6885 6883 }
6886 6884 }
6887 6885
6888 6886 /*
6889 6887 * Exit the specified IPSQ. If this is the final exit on it then drain it
6890 6888 * prior to exiting. Caller must be writer on the specified IPSQ.
6891 6889 */
6892 6890 void
6893 6891 ipsq_exit(ipsq_t *ipsq)
6894 6892 {
6895 6893 mblk_t *mp;
6896 6894 ipsq_t *mp_ipsq;
6897 6895 queue_t *q;
6898 6896 phyint_t *phyi;
6899 6897 ipsq_func_t func;
6900 6898
6901 6899 ASSERT(IAM_WRITER_IPSQ(ipsq));
6902 6900
6903 6901 ASSERT(ipsq->ipsq_xop->ipx_reentry_cnt >= 1);
6904 6902 if (ipsq->ipsq_xop->ipx_reentry_cnt != 1) {
6905 6903 ipsq->ipsq_xop->ipx_reentry_cnt--;
6906 6904 return;
6907 6905 }
6908 6906
6909 6907 for (;;) {
6910 6908 phyi = ipsq->ipsq_phyint;
6911 6909 mp = ipsq_dq(ipsq);
6912 6910 mp_ipsq = (mp == NULL) ? NULL : (ipsq_t *)mp->b_next;
6913 6911
6914 6912 /*
6915 6913 * If we've changed to a new IPSQ, and the phyint associated
6916 6914 * with the old one has gone away, free the old IPSQ. Note
6917 6915 * that this cannot happen while the IPSQ is in a group.
6918 6916 */
6919 6917 if (mp_ipsq != ipsq && phyi == NULL) {
6920 6918 ASSERT(ipsq->ipsq_next == ipsq);
6921 6919 ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop);
6922 6920 ipsq_delete(ipsq);
6923 6921 }
6924 6922
6925 6923 if (mp == NULL)
6926 6924 break;
6927 6925
6928 6926 q = mp->b_queue;
6929 6927 func = (ipsq_func_t)mp->b_prev;
6930 6928 ipsq = mp_ipsq;
6931 6929 mp->b_next = mp->b_prev = NULL;
6932 6930 mp->b_queue = NULL;
6933 6931
6934 6932 /*
6935 6933 * If 'q' is an conn queue, it is valid, since we did a
6936 6934 * a refhold on the conn at the start of the ioctl.
6937 6935 * If 'q' is an ill queue, it is valid, since close of an
6938 6936 * ill will clean up its IPSQ.
6939 6937 */
6940 6938 (*func)(ipsq, q, mp, NULL);
6941 6939 }
6942 6940 }
6943 6941
6944 6942 /*
6945 6943 * Used to start any igmp or mld timers that could not be started
6946 6944 * while holding ill_mcast_lock. The timers can't be started while holding
6947 6945 * the lock, since mld/igmp_start_timers may need to call untimeout()
6948 6946 * which can't be done while holding the lock which the timeout handler
6949 6947 * acquires. Otherwise
6950 6948 * there could be a deadlock since the timeout handlers
6951 6949 * mld_timeout_handler_per_ill/igmp_timeout_handler_per_ill also acquire
6952 6950 * ill_mcast_lock.
6953 6951 */
6954 6952 void
6955 6953 ill_mcast_timer_start(ip_stack_t *ipst)
6956 6954 {
6957 6955 int next;
6958 6956
6959 6957 mutex_enter(&ipst->ips_igmp_timer_lock);
6960 6958 next = ipst->ips_igmp_deferred_next;
6961 6959 ipst->ips_igmp_deferred_next = INFINITY;
6962 6960 mutex_exit(&ipst->ips_igmp_timer_lock);
6963 6961
6964 6962 if (next != INFINITY)
6965 6963 igmp_start_timers(next, ipst);
6966 6964
6967 6965 mutex_enter(&ipst->ips_mld_timer_lock);
6968 6966 next = ipst->ips_mld_deferred_next;
6969 6967 ipst->ips_mld_deferred_next = INFINITY;
6970 6968 mutex_exit(&ipst->ips_mld_timer_lock);
6971 6969
6972 6970 if (next != INFINITY)
6973 6971 mld_start_timers(next, ipst);
6974 6972 }
6975 6973
6976 6974 /*
6977 6975 * Start the current exclusive operation on `ipsq'; associate it with `ipif'
6978 6976 * and `ioccmd'.
6979 6977 */
6980 6978 void
6981 6979 ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd)
6982 6980 {
6983 6981 ill_t *ill = ipif->ipif_ill;
6984 6982 ipxop_t *ipx = ipsq->ipsq_xop;
6985 6983
6986 6984 ASSERT(IAM_WRITER_IPSQ(ipsq));
6987 6985 ASSERT(ipx->ipx_current_ipif == NULL);
6988 6986 ASSERT(ipx->ipx_current_ioctl == 0);
6989 6987
6990 6988 ipx->ipx_current_done = B_FALSE;
6991 6989 ipx->ipx_current_ioctl = ioccmd;
6992 6990 mutex_enter(&ipx->ipx_lock);
6993 6991 ipx->ipx_current_ipif = ipif;
6994 6992 mutex_exit(&ipx->ipx_lock);
6995 6993
6996 6994 /*
6997 6995 * Set IPIF_CHANGING on one or more ipifs associated with the
6998 6996 * current exclusive operation. IPIF_CHANGING prevents any new
6999 6997 * references to the ipif (so that the references will eventually
7000 6998 * drop to zero) and also prevents any "get" operations (e.g.,
7001 6999 * SIOCGLIFFLAGS) from being able to access the ipif until the
7002 7000 * operation has completed and the ipif is again in a stable state.
7003 7001 *
7004 7002 * For ioctls, IPIF_CHANGING is set on the ipif associated with the
7005 7003 * ioctl. For internal operations (where ioccmd is zero), all ipifs
7006 7004 * on the ill are marked with IPIF_CHANGING since it's unclear which
7007 7005 * ipifs will be affected.
7008 7006 *
7009 7007 * Note that SIOCLIFREMOVEIF is a special case as it sets
7010 7008 * IPIF_CONDEMNED internally after identifying the right ipif to
7011 7009 * operate on.
7012 7010 */
7013 7011 switch (ioccmd) {
7014 7012 case SIOCLIFREMOVEIF:
7015 7013 break;
7016 7014 case 0:
7017 7015 mutex_enter(&ill->ill_lock);
7018 7016 ipif = ipif->ipif_ill->ill_ipif;
7019 7017 for (; ipif != NULL; ipif = ipif->ipif_next)
7020 7018 ipif->ipif_state_flags |= IPIF_CHANGING;
7021 7019 mutex_exit(&ill->ill_lock);
7022 7020 break;
7023 7021 default:
7024 7022 mutex_enter(&ill->ill_lock);
7025 7023 ipif->ipif_state_flags |= IPIF_CHANGING;
7026 7024 mutex_exit(&ill->ill_lock);
7027 7025 }
7028 7026 }
7029 7027
7030 7028 /*
7031 7029 * Finish the current exclusive operation on `ipsq'. Usually, this will allow
7032 7030 * the next exclusive operation to begin once we ipsq_exit(). However, if
7033 7031 * pending DLPI operations remain, then we will wait for the queue to drain
7034 7032 * before allowing the next exclusive operation to begin. This ensures that
7035 7033 * DLPI operations from one exclusive operation are never improperly processed
7036 7034 * as part of a subsequent exclusive operation.
7037 7035 */
7038 7036 void
7039 7037 ipsq_current_finish(ipsq_t *ipsq)
7040 7038 {
7041 7039 ipxop_t *ipx = ipsq->ipsq_xop;
7042 7040 t_uscalar_t dlpi_pending = DL_PRIM_INVAL;
7043 7041 ipif_t *ipif = ipx->ipx_current_ipif;
7044 7042
7045 7043 ASSERT(IAM_WRITER_IPSQ(ipsq));
7046 7044
7047 7045 /*
7048 7046 * For SIOCLIFREMOVEIF, the ipif has been already been blown away
7049 7047 * (but in that case, IPIF_CHANGING will already be clear and no
7050 7048 * pending DLPI messages can remain).
7051 7049 */
7052 7050 if (ipx->ipx_current_ioctl != SIOCLIFREMOVEIF) {
7053 7051 ill_t *ill = ipif->ipif_ill;
7054 7052
7055 7053 mutex_enter(&ill->ill_lock);
7056 7054 dlpi_pending = ill->ill_dlpi_pending;
7057 7055 if (ipx->ipx_current_ioctl == 0) {
7058 7056 ipif = ill->ill_ipif;
7059 7057 for (; ipif != NULL; ipif = ipif->ipif_next)
7060 7058 ipif->ipif_state_flags &= ~IPIF_CHANGING;
7061 7059 } else {
7062 7060 ipif->ipif_state_flags &= ~IPIF_CHANGING;
7063 7061 }
7064 7062 mutex_exit(&ill->ill_lock);
7065 7063 }
7066 7064
7067 7065 ASSERT(!ipx->ipx_current_done);
7068 7066 ipx->ipx_current_done = B_TRUE;
7069 7067 ipx->ipx_current_ioctl = 0;
7070 7068 if (dlpi_pending == DL_PRIM_INVAL) {
7071 7069 mutex_enter(&ipx->ipx_lock);
7072 7070 ipx->ipx_current_ipif = NULL;
7073 7071 mutex_exit(&ipx->ipx_lock);
7074 7072 }
7075 7073 }
7076 7074
/*
 * The ill is closing. Flush all messages on the ipsq that originated
 * from this ill. Usually there won't be any messages on the ipsq_xopq_mphead
 * for this ill since ipsq_enter could not have entered until then.
 * New messages can't be queued since the CONDEMNED flag is set.
 */
static void
ipsq_flush(ill_t *ill)
{
	queue_t *q;
	mblk_t *prev;
	mblk_t *mp;
	mblk_t *mp_next;
	ipxop_t *ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;

	ASSERT(IAM_WRITER_ILL(ill));

	/*
	 * Flush any messages sent up by the driver.  Walk the singly-linked
	 * ipx_mphead list and free every mblk whose queue belongs to this
	 * ill, keeping `prev' pointing at the last retained element so we
	 * can unlink in place.
	 */
	mutex_enter(&ipx->ipx_lock);
	for (prev = NULL, mp = ipx->ipx_mphead; mp != NULL; mp = mp_next) {
		mp_next = mp->b_next;
		q = mp->b_queue;
		if (q == ill->ill_rq || q == ill->ill_wq) {
			/* dequeue mp */
			if (prev == NULL)
				ipx->ipx_mphead = mp->b_next;
			else
				prev->b_next = mp->b_next;
			/* fix up the tail pointer if we removed the tail */
			if (ipx->ipx_mptail == mp) {
				ASSERT(mp_next == NULL);
				ipx->ipx_mptail = prev;
			}
			inet_freemsg(mp);
		} else {
			prev = mp;
		}
	}
	mutex_exit(&ipx->ipx_lock);
	/* Also discard this ill's pending and queued exclusive-op messages */
	(void) ipsq_pending_mp_cleanup(ill, NULL);
	ipsq_xopq_mp_cleanup(ill, NULL);
}
7120 7118
/*
 * Parse an ifreq or lifreq struct coming down ioctls and refhold
 * and return the associated ipif.
 * Return value:
 * Non zero: An error has occurred. ci may not be filled out.
 * zero : ci is filled out with the ioctl cmd in ci.ci_name, and
 * a held ipif in ci.ci_ipif.
 */
int
ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
    cmd_info_t *ci)
{
	char *name;
	struct ifreq *ifr;
	struct lifreq *lifr;
	ipif_t *ipif = NULL;
	ill_t *ill;
	conn_t *connp;
	boolean_t isv6;
	int err;
	mblk_t *mp1;
	zoneid_t zoneid;
	ip_stack_t *ipst;

	if (q->q_next != NULL) {
		/* The ioctl came down an ill (module) stream. */
		ill = (ill_t *)q->q_ptr;
		isv6 = ill->ill_isv6;
		connp = NULL;
		zoneid = ALL_ZONES;
		ipst = ill->ill_ipst;
	} else {
		/* The ioctl came down a conn (driver) stream. */
		ill = NULL;
		connp = Q_TO_CONN(q);
		isv6 = (connp->conn_family == AF_INET6);
		zoneid = connp->conn_zoneid;
		if (zoneid == GLOBAL_ZONEID) {
			/* global zone can access ipifs in all zones */
			zoneid = ALL_ZONES;
		}
		ipst = connp->conn_netstack->netstack_ip;
	}

	/* Has been checked in ip_wput_nondata */
	mp1 = mp->b_cont->b_cont;

	if (ipip->ipi_cmd_type == IF_CMD) {
		/* This is an old style SIOC[GS]IF* command */
		ifr = (struct ifreq *)mp1->b_rptr;
		/*
		 * Null terminate the string to protect against buffer
		 * overrun. String was generated by user code and may not
		 * be trusted.
		 */
		ifr->ifr_name[IFNAMSIZ - 1] = '\0';
		name = ifr->ifr_name;
		ci->ci_sin = (sin_t *)&ifr->ifr_addr;
		ci->ci_sin6 = NULL;
		ci->ci_lifr = (struct lifreq *)ifr;
	} else {
		/* This is a new style SIOC[GS]LIF* command */
		ASSERT(ipip->ipi_cmd_type == LIF_CMD);
		lifr = (struct lifreq *)mp1->b_rptr;
		/*
		 * Null terminate the string to protect against buffer
		 * overrun. String was generated by user code and may not
		 * be trusted.
		 */
		lifr->lifr_name[LIFNAMSIZ - 1] = '\0';
		name = lifr->lifr_name;
		ci->ci_sin = (sin_t *)&lifr->lifr_addr;
		ci->ci_sin6 = (sin6_t *)&lifr->lifr_addr;
		ci->ci_lifr = lifr;
	}

	if (ipip->ipi_cmd == SIOCSLIFNAME) {
		/*
		 * SIOCSLIFNAME is only valid on an ill stream; fail the
		 * ioctl if it came down a conn stream.
		 */
		if (ill == NULL) {
			/* Not an ill queue; fail with ENXIO. */
			return (ENXIO);
		}
		ipif = ill->ill_ipif;
		ipif_refhold(ipif);
	} else {
		/*
		 * Ensure that ioctls don't see any internal state changes
		 * caused by set ioctls by deferring them if IPIF_CHANGING is
		 * set.
		 */
		ipif = ipif_lookup_on_name_async(name, mi_strlen(name),
		    isv6, zoneid, q, mp, ip_process_ioctl, &err, ipst);
		if (ipif == NULL) {
			/*
			 * EINPROGRESS means the lookup deferred the ioctl;
			 * it will be redriven later via ip_process_ioctl.
			 */
			if (err == EINPROGRESS)
				return (err);
			err = 0;	/* Ensure we don't use it below */
		}
	}

	/*
	 * Old style [GS]IFCMD does not admit IPv6 ipif
	 */
	if (ipif != NULL && ipif->ipif_isv6 && ipip->ipi_cmd_type == IF_CMD) {
		ipif_refrele(ipif);
		return (ENXIO);
	}

	if (ipif == NULL && ill != NULL && ill->ill_ipif != NULL &&
	    name[0] == '\0') {
		/*
		 * Handle a SIOC?IF* ioctl with a null name
		 * during plumb (on the ill queue before the I_PLINK).
		 */
		ipif = ill->ill_ipif;
		ipif_refhold(ipif);
	}

	if (ipif == NULL)
		return (ENXIO);

	DTRACE_PROBE4(ipif__ioctl, char *, "ip_extract_lifreq",
	    int, ipip->ipi_cmd, ill_t *, ipif->ipif_ill, ipif_t *, ipif);

	/* Success: hand the held ipif back to the caller. */
	ci->ci_ipif = ipif;
	return (0);
}
7251 7249
7252 7250 /*
7253 7251 * Return the total number of ipifs.
7254 7252 */
7255 7253 static uint_t
7256 7254 ip_get_numifs(zoneid_t zoneid, ip_stack_t *ipst)
7257 7255 {
7258 7256 uint_t numifs = 0;
7259 7257 ill_t *ill;
7260 7258 ill_walk_context_t ctx;
7261 7259 ipif_t *ipif;
7262 7260
7263 7261 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
7264 7262 ill = ILL_START_WALK_V4(&ctx, ipst);
7265 7263 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
7266 7264 if (IS_UNDER_IPMP(ill))
7267 7265 continue;
7268 7266 for (ipif = ill->ill_ipif; ipif != NULL;
7269 7267 ipif = ipif->ipif_next) {
7270 7268 if (ipif->ipif_zoneid == zoneid ||
7271 7269 ipif->ipif_zoneid == ALL_ZONES)
7272 7270 numifs++;
7273 7271 }
7274 7272 }
7275 7273 rw_exit(&ipst->ips_ill_g_lock);
7276 7274 return (numifs);
7277 7275 }
7278 7276
/*
 * Return the number of ipifs matching `family' (AF_INET, AF_INET6, or any
 * other value for both) that pass the LIFC_* filters in `lifn_flags' and
 * are visible from `zoneid'.
 */
static uint_t
ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid, ip_stack_t *ipst)
{
	uint_t numifs = 0;
	ill_t *ill;
	ipif_t *ipif;
	ill_walk_context_t ctx;

	ip1dbg(("ip_get_numlifs(%d %u %d)\n", family, lifn_flags, (int)zoneid));

	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	if (family == AF_INET)
		ill = ILL_START_WALK_V4(&ctx, ipst);
	else if (family == AF_INET6)
		ill = ILL_START_WALK_V6(&ctx, ipst);
	else
		ill = ILL_START_WALK_ALL(&ctx, ipst);

	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		/* underlying IPMP ills are counted only on request */
		if (IS_UNDER_IPMP(ill) && !(lifn_flags & LIFC_UNDER_IPMP))
			continue;

		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			/* IPIF_NOXMIT ipifs are omitted unless requested */
			if ((ipif->ipif_flags & IPIF_NOXMIT) &&
			    !(lifn_flags & LIFC_NOXMIT))
				continue;
			/* likewise for IPIF_TEMPORARY ipifs */
			if ((ipif->ipif_flags & IPIF_TEMPORARY) &&
			    !(lifn_flags & LIFC_TEMPORARY))
				continue;
			/*
			 * LIFC_EXTERNAL_SOURCE restricts the count to ipifs
			 * usable as an external source address: up, not
			 * loopback, and none of NOXMIT/NOLOCAL/DEPRECATED.
			 */
			if (((ipif->ipif_flags &
			    (IPIF_NOXMIT|IPIF_NOLOCAL|
			    IPIF_DEPRECATED)) ||
			    IS_LOOPBACK(ill) ||
			    !(ipif->ipif_flags & IPIF_UP)) &&
			    (lifn_flags & LIFC_EXTERNAL_SOURCE))
				continue;

			/*
			 * Zone visibility: count the ipif if it is in our
			 * zone or ALL_ZONES, or if the global zone asked
			 * for all zones via LIFC_ALLZONES.
			 */
			if (zoneid != ipif->ipif_zoneid &&
			    ipif->ipif_zoneid != ALL_ZONES &&
			    (zoneid != GLOBAL_ZONEID ||
			    !(lifn_flags & LIFC_ALLZONES)))
				continue;

			numifs++;
		}
	}
	rw_exit(&ipst->ips_ill_g_lock);
	return (numifs);
}
7332 7330
/*
 * Count the other ills in `ill''s usesrc group, i.e. the interfaces using
 * an address hosted on `ill' as a source address.  Only a group head
 * (ill_usesrc_ifindex == 0) with a non-empty group yields a non-zero count.
 */
uint_t
ip_get_lifsrcofnum(ill_t *ill)
{
	uint_t numifs = 0;
	ill_t *ill_head = ill;
	ip_stack_t *ipst = ill->ill_ipst;

	/*
	 * ill_g_usesrc_lock protects ill_usesrc_grp_next, for example, some
	 * other thread may be trying to relink the ILLs in this usesrc group
	 * and adjusting the ill_usesrc_grp_next pointers
	 */
	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER);
	if ((ill->ill_usesrc_ifindex == 0) &&
	    (ill->ill_usesrc_grp_next != NULL)) {
		/* walk the group list until it wraps back to the head */
		for (; (ill != NULL) && (ill->ill_usesrc_grp_next != ill_head);
		    ill = ill->ill_usesrc_grp_next)
			numifs++;
	}
	rw_exit(&ipst->ips_ill_g_usesrc_lock);

	return (numifs);
}
7356 7354
7357 7355 /* Null values are passed in for ipif, sin, and ifreq */
7358 7356 /* ARGSUSED */
7359 7357 int
7360 7358 ip_sioctl_get_ifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
7361 7359 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
7362 7360 {
7363 7361 int *nump;
7364 7362 conn_t *connp = Q_TO_CONN(q);
7365 7363
7366 7364 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */
7367 7365
7368 7366 /* Existence of b_cont->b_cont checked in ip_wput_nondata */
7369 7367 nump = (int *)mp->b_cont->b_cont->b_rptr;
7370 7368
7371 7369 *nump = ip_get_numifs(connp->conn_zoneid,
7372 7370 connp->conn_netstack->netstack_ip);
7373 7371 ip1dbg(("ip_sioctl_get_ifnum numifs %d", *nump));
7374 7372 return (0);
7375 7373 }
7376 7374
7377 7375 /* Null values are passed in for ipif, sin, and ifreq */
7378 7376 /* ARGSUSED */
7379 7377 int
7380 7378 ip_sioctl_get_lifnum(ipif_t *dummy_ipif, sin_t *dummy_sin,
7381 7379 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
7382 7380 {
7383 7381 struct lifnum *lifn;
7384 7382 mblk_t *mp1;
7385 7383 conn_t *connp = Q_TO_CONN(q);
7386 7384
7387 7385 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */
7388 7386
7389 7387 /* Existence checked in ip_wput_nondata */
7390 7388 mp1 = mp->b_cont->b_cont;
7391 7389
7392 7390 lifn = (struct lifnum *)mp1->b_rptr;
7393 7391 switch (lifn->lifn_family) {
7394 7392 case AF_UNSPEC:
7395 7393 case AF_INET:
7396 7394 case AF_INET6:
7397 7395 break;
7398 7396 default:
7399 7397 return (EAFNOSUPPORT);
7400 7398 }
7401 7399
7402 7400 lifn->lifn_count = ip_get_numlifs(lifn->lifn_family, lifn->lifn_flags,
7403 7401 connp->conn_zoneid, connp->conn_netstack->netstack_ip);
7404 7402 ip1dbg(("ip_sioctl_get_lifnum numifs %d", lifn->lifn_count));
7405 7403 return (0);
7406 7404 }
7407 7405
/*
 * SIOCGIFCONF handler: copy out the name and IPv4 address of every visible
 * ipif into the caller-supplied buffer.  Handles both the I_STR and the
 * TRANSPARENT forms of the ioctl (see the block comment below).
 */
/* ARGSUSED */
int
ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
    mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
{
	STRUCT_HANDLE(ifconf, ifc);
	mblk_t *mp1;
	struct iocblk *iocp;
	struct ifreq *ifr;
	ill_walk_context_t ctx;
	ill_t *ill;
	ipif_t *ipif;
	struct sockaddr_in *sin;
	int32_t ifclen;
	zoneid_t zoneid;
	ip_stack_t *ipst = CONNQ_TO_IPST(q);

	ASSERT(q->q_next == NULL); /* not valid ioctls for ip as a module */

	ip1dbg(("ip_sioctl_get_ifconf"));
	/* Existence verified in ip_wput_nondata */
	mp1 = mp->b_cont->b_cont;
	iocp = (struct iocblk *)mp->b_rptr;
	zoneid = Q_TO_CONN(q)->conn_zoneid;

	/*
	 * The original SIOCGIFCONF passed in a struct ifconf which specified
	 * the user buffer address and length into which the list of struct
	 * ifreqs was to be copied. Since AT&T Streams does not seem to
	 * allow M_COPYOUT to be used in conjunction with I_STR IOCTLS,
	 * the SIOCGIFCONF operation was redefined to simply provide
	 * a large output buffer into which we are supposed to jam the ifreq
	 * array. The same ioctl command code was used, despite the fact that
	 * both the applications and the kernel code had to change, thus making
	 * it impossible to support both interfaces.
	 *
	 * For reasons not good enough to try to explain, the following
	 * algorithm is used for deciding what to do with one of these:
	 * If the IOCTL comes in as an I_STR, it is assumed to be of the new
	 * form with the output buffer coming down as the continuation message.
	 * If it arrives as a TRANSPARENT IOCTL, it is assumed to be old style,
	 * and we have to copy in the ifconf structure to find out how big the
	 * output buffer is and where to copy out to. Sure no problem...
	 *
	 */
	STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, NULL);
	if ((mp1->b_wptr - mp1->b_rptr) == STRUCT_SIZE(ifc)) {
		int numifs = 0;
		size_t ifc_bufsize;

		/*
		 * Must be (better be!) continuation of a TRANSPARENT
		 * IOCTL. We just copied in the ifconf structure.
		 */
		STRUCT_SET_HANDLE(ifc, iocp->ioc_flag,
		    (struct ifconf *)mp1->b_rptr);

		/*
		 * Allocate a buffer to hold requested information.
		 *
		 * If ifc_len is larger than what is needed, we only
		 * allocate what we will use.
		 *
		 * If ifc_len is smaller than what is needed, return
		 * EINVAL.
		 *
		 * XXX: the ill_t structure can have 2 counters, for
		 * v4 and v6 (not just ill_ipif_up_count) to store the
		 * number of interfaces for a device, so we don't need
		 * to count them here...
		 */
		numifs = ip_get_numifs(zoneid, ipst);

		ifclen = STRUCT_FGET(ifc, ifc_len);
		ifc_bufsize = numifs * sizeof (struct ifreq);
		if (ifc_bufsize > ifclen) {
			if (iocp->ioc_cmd == O_SIOCGIFCONF) {
				/* old behaviour */
				return (EINVAL);
			} else {
				ifc_bufsize = ifclen;
			}
		}

		mp1 = mi_copyout_alloc(q, mp,
		    STRUCT_FGETP(ifc, ifc_buf), ifc_bufsize, B_FALSE);
		if (mp1 == NULL)
			return (ENOMEM);

		mp1->b_wptr = mp1->b_rptr + ifc_bufsize;
	}
	bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr);
	/*
	 * the SIOCGIFCONF ioctl only knows about
	 * IPv4 addresses, so don't try to tell
	 * it about interfaces with IPv6-only
	 * addresses. (Last parm 'isv6' is B_FALSE)
	 */

	ifr = (struct ifreq *)mp1->b_rptr;

	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V4(&ctx, ipst);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		/* underlying IPMP interfaces are not reported */
		if (IS_UNDER_IPMP(ill))
			continue;
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if (zoneid != ipif->ipif_zoneid &&
			    ipif->ipif_zoneid != ALL_ZONES)
				continue;
			/* stop (or fail) once the output buffer is full */
			if ((uchar_t *)&ifr[1] > mp1->b_wptr) {
				if (iocp->ioc_cmd == O_SIOCGIFCONF) {
					/* old behaviour */
					rw_exit(&ipst->ips_ill_g_lock);
					return (EINVAL);
				} else {
					goto if_copydone;
				}
			}
			ipif_get_name(ipif, ifr->ifr_name,
			    sizeof (ifr->ifr_name));
			sin = (sin_t *)&ifr->ifr_addr;
			*sin = sin_null;
			sin->sin_family = AF_INET;
			sin->sin_addr.s_addr = ipif->ipif_lcl_addr;
			ifr++;
		}
	}
if_copydone:
	rw_exit(&ipst->ips_ill_g_lock);
	mp1->b_wptr = (uchar_t *)ifr;

	/* report the number of bytes actually filled in */
	if (STRUCT_BUF(ifc) != NULL) {
		STRUCT_FSET(ifc, ifc_len,
		    (int)((uchar_t *)ifr - mp1->b_rptr));
	}
	return (0);
}
7547 7545
/*
 * Get the interfaces using the address hosted on the interface passed in,
 * as a source address
 */
/* ARGSUSED */
int
ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
    mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
{
	mblk_t *mp1;
	ill_t *ill, *ill_head;
	ipif_t *ipif, *orig_ipif;
	int numlifs = 0;
	size_t lifs_bufsize, lifsmaxlen;
	struct lifreq *lifr;
	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
	uint_t ifindex;
	zoneid_t zoneid;
	boolean_t isv6 = B_FALSE;
	struct sockaddr_in *sin;
	struct sockaddr_in6 *sin6;
	STRUCT_HANDLE(lifsrcof, lifs);
	ip_stack_t *ipst;

	ipst = CONNQ_TO_IPST(q);

	ASSERT(q->q_next == NULL);

	zoneid = Q_TO_CONN(q)->conn_zoneid;

	/* Existence verified in ip_wput_nondata */
	mp1 = mp->b_cont->b_cont;

	/*
	 * Must be (better be!) continuation of a TRANSPARENT
	 * IOCTL. We just copied in the lifsrcof structure.
	 */
	STRUCT_SET_HANDLE(lifs, iocp->ioc_flag,
	    (struct lifsrcof *)mp1->b_rptr);

	if (MBLKL(mp1) != STRUCT_SIZE(lifs))
		return (EINVAL);

	/* Locate the ipif named by the caller-supplied ifindex. */
	ifindex = STRUCT_FGET(lifs, lifs_ifindex);
	isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6;
	ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, ipst);
	if (ipif == NULL) {
		ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n",
		    ifindex));
		return (ENXIO);
	}

	/* Allocate a buffer to hold requested information */
	numlifs = ip_get_lifsrcofnum(ipif->ipif_ill);
	lifs_bufsize = numlifs * sizeof (struct lifreq);
	lifsmaxlen = STRUCT_FGET(lifs, lifs_maxlen);
	/* The actual size needed is always returned in lifs_len */
	STRUCT_FSET(lifs, lifs_len, lifs_bufsize);

	/* If the amount we need is more than what is passed in, abort */
	if (lifs_bufsize > lifsmaxlen || lifs_bufsize == 0) {
		ipif_refrele(ipif);
		return (0);
	}

	mp1 = mi_copyout_alloc(q, mp,
	    STRUCT_FGETP(lifs, lifs_buf), lifs_bufsize, B_FALSE);
	if (mp1 == NULL) {
		ipif_refrele(ipif);
		return (ENOMEM);
	}

	mp1->b_wptr = mp1->b_rptr + lifs_bufsize;
	bzero(mp1->b_rptr, lifs_bufsize);

	lifr = (struct lifreq *)mp1->b_rptr;

	ill = ill_head = ipif->ipif_ill;
	orig_ipif = ipif;

	/* ill_g_usesrc_lock protects ill_usesrc_grp_next */
	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER);
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);

	/*
	 * Walk the usesrc group, filling in one lifreq (name, address,
	 * prefix length) per member ill, until the group wraps back to
	 * the head or the output buffer is exhausted.
	 */
	ill = ill->ill_usesrc_grp_next; /* start from next ill */
	for (; (ill != NULL) && (ill != ill_head);
	    ill = ill->ill_usesrc_grp_next) {

		if ((uchar_t *)&lifr[1] > mp1->b_wptr)
			break;

		ipif = ill->ill_ipif;
		ipif_get_name(ipif, lifr->lifr_name, sizeof (lifr->lifr_name));
		if (ipif->ipif_isv6) {
			sin6 = (sin6_t *)&lifr->lifr_addr;
			*sin6 = sin6_null;
			sin6->sin6_family = AF_INET6;
			sin6->sin6_addr = ipif->ipif_v6lcl_addr;
			lifr->lifr_addrlen = ip_mask_to_plen_v6(
			    &ipif->ipif_v6net_mask);
		} else {
			sin = (sin_t *)&lifr->lifr_addr;
			*sin = sin_null;
			sin->sin_family = AF_INET;
			sin->sin_addr.s_addr = ipif->ipif_lcl_addr;
			lifr->lifr_addrlen = ip_mask_to_plen(
			    ipif->ipif_net_mask);
		}
		lifr++;
	}
	rw_exit(&ipst->ips_ill_g_lock);
	rw_exit(&ipst->ips_ill_g_usesrc_lock);
	ipif_refrele(orig_ipif);
	mp1->b_wptr = (uchar_t *)lifr;
	/* report the number of bytes actually filled in */
	STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr));

	return (0);
}
7666 7664
/*
 * SIOCGLIFCONF handler: the extended, address-family-aware version of
 * SIOCGIFCONF (see the block comment below for the LIFC_* flag semantics).
 */
/* ARGSUSED */
int
ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
    mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
{
	mblk_t *mp1;
	int list;
	ill_t *ill;
	ipif_t *ipif;
	int flags;
	int numlifs = 0;
	size_t lifc_bufsize;
	struct lifreq *lifr;
	sa_family_t family;
	struct sockaddr_in *sin;
	struct sockaddr_in6 *sin6;
	ill_walk_context_t ctx;
	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
	int32_t lifclen;
	zoneid_t zoneid;
	STRUCT_HANDLE(lifconf, lifc);
	ip_stack_t *ipst = CONNQ_TO_IPST(q);

	ip1dbg(("ip_sioctl_get_lifconf"));

	ASSERT(q->q_next == NULL);

	zoneid = Q_TO_CONN(q)->conn_zoneid;

	/* Existence verified in ip_wput_nondata */
	mp1 = mp->b_cont->b_cont;

	/*
	 * An extended version of SIOCGIFCONF that takes an
	 * additional address family and flags field.
	 * AF_UNSPEC retrieve both IPv4 and IPv6.
	 * Unless LIFC_NOXMIT is specified the IPIF_NOXMIT
	 * interfaces are omitted.
	 * Similarly, IPIF_TEMPORARY interfaces are omitted
	 * unless LIFC_TEMPORARY is specified.
	 * If LIFC_EXTERNAL_SOURCE is specified, IPIF_NOXMIT,
	 * IPIF_NOLOCAL, PHYI_LOOPBACK, IPIF_DEPRECATED and
	 * not IPIF_UP interfaces are omitted. LIFC_EXTERNAL_SOURCE
	 * has priority over LIFC_NOXMIT.
	 */
	STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, NULL);

	if ((mp1->b_wptr - mp1->b_rptr) != STRUCT_SIZE(lifc))
		return (EINVAL);

	/*
	 * Must be (better be!) continuation of a TRANSPARENT
	 * IOCTL. We just copied in the lifconf structure.
	 */
	STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, (struct lifconf *)mp1->b_rptr);

	family = STRUCT_FGET(lifc, lifc_family);
	flags = STRUCT_FGET(lifc, lifc_flags);

	switch (family) {
	case AF_UNSPEC:
		/*
		 * walk all ILL's.
		 */
		list = MAX_G_HEADS;
		break;
	case AF_INET:
		/*
		 * walk only IPV4 ILL's.
		 */
		list = IP_V4_G_HEAD;
		break;
	case AF_INET6:
		/*
		 * walk only IPV6 ILL's.
		 */
		list = IP_V6_G_HEAD;
		break;
	default:
		return (EAFNOSUPPORT);
	}

	/*
	 * Allocate a buffer to hold requested information.
	 *
	 * If lifc_len is larger than what is needed, we only
	 * allocate what we will use.
	 *
	 * If lifc_len is smaller than what is needed, return
	 * EINVAL.
	 */
	numlifs = ip_get_numlifs(family, flags, zoneid, ipst);
	lifc_bufsize = numlifs * sizeof (struct lifreq);
	lifclen = STRUCT_FGET(lifc, lifc_len);
	if (lifc_bufsize > lifclen) {
		if (iocp->ioc_cmd == O_SIOCGLIFCONF)
			return (EINVAL);
		else
			lifc_bufsize = lifclen;
	}

	mp1 = mi_copyout_alloc(q, mp,
	    STRUCT_FGETP(lifc, lifc_buf), lifc_bufsize, B_FALSE);
	if (mp1 == NULL)
		return (ENOMEM);

	mp1->b_wptr = mp1->b_rptr + lifc_bufsize;
	bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr);

	lifr = (struct lifreq *)mp1->b_rptr;

	/*
	 * Walk the selected ills, applying the same LIFC_* filters that
	 * ip_get_numlifs() used to size the buffer, and fill in one
	 * lifreq (name, type, address, prefix length) per matching ipif.
	 */
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	ill = ill_first(list, list, &ctx, ipst);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		if (IS_UNDER_IPMP(ill) && !(flags & LIFC_UNDER_IPMP))
			continue;

		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if ((ipif->ipif_flags & IPIF_NOXMIT) &&
			    !(flags & LIFC_NOXMIT))
				continue;

			if ((ipif->ipif_flags & IPIF_TEMPORARY) &&
			    !(flags & LIFC_TEMPORARY))
				continue;

			if (((ipif->ipif_flags &
			    (IPIF_NOXMIT|IPIF_NOLOCAL|
			    IPIF_DEPRECATED)) ||
			    IS_LOOPBACK(ill) ||
			    !(ipif->ipif_flags & IPIF_UP)) &&
			    (flags & LIFC_EXTERNAL_SOURCE))
				continue;

			if (zoneid != ipif->ipif_zoneid &&
			    ipif->ipif_zoneid != ALL_ZONES &&
			    (zoneid != GLOBAL_ZONEID ||
			    !(flags & LIFC_ALLZONES)))
				continue;

			/* stop (or fail) once the output buffer is full */
			if ((uchar_t *)&lifr[1] > mp1->b_wptr) {
				if (iocp->ioc_cmd == O_SIOCGLIFCONF) {
					rw_exit(&ipst->ips_ill_g_lock);
					return (EINVAL);
				} else {
					goto lif_copydone;
				}
			}

			ipif_get_name(ipif, lifr->lifr_name,
			    sizeof (lifr->lifr_name));
			lifr->lifr_type = ill->ill_type;
			if (ipif->ipif_isv6) {
				sin6 = (sin6_t *)&lifr->lifr_addr;
				*sin6 = sin6_null;
				sin6->sin6_family = AF_INET6;
				sin6->sin6_addr =
				    ipif->ipif_v6lcl_addr;
				lifr->lifr_addrlen =
				    ip_mask_to_plen_v6(
				    &ipif->ipif_v6net_mask);
			} else {
				sin = (sin_t *)&lifr->lifr_addr;
				*sin = sin_null;
				sin->sin_family = AF_INET;
				sin->sin_addr.s_addr =
				    ipif->ipif_lcl_addr;
				lifr->lifr_addrlen =
				    ip_mask_to_plen(
				    ipif->ipif_net_mask);
			}
			lifr++;
		}
	}
lif_copydone:
	rw_exit(&ipst->ips_ill_g_lock);

	mp1->b_wptr = (uchar_t *)lifr;
	/* report the number of bytes actually filled in */
	if (STRUCT_BUF(lifc) != NULL) {
		STRUCT_FSET(lifc, lifc_len,
		    (int)((uchar_t *)lifr - mp1->b_rptr));
	}
	return (0);
}
7852 7850
/*
 * Handle the SIOC[GS]IP6ADDRPOLICY ioctls, which read or replace the IPv6
 * address-selection policy table.  Both are I_STR-only ioctls; TRANSPARENT
 * invocations are rejected with EINVAL.
 */
static void
ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp)
{
	ip6_asp_t *table;
	size_t table_size;
	mblk_t *data_mp;
	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
	ip_stack_t *ipst;

	/* resolve the IP stack from either a conn or an ill queue */
	if (q->q_next == NULL)
		ipst = CONNQ_TO_IPST(q);
	else
		ipst = ILLQ_TO_IPST(q);

	/* These two ioctls are I_STR only */
	if (iocp->ioc_count == TRANSPARENT) {
		miocnak(q, mp, 0, EINVAL);
		return;
	}

	data_mp = mp->b_cont;
	if (data_mp == NULL) {
		/* The user passed us a NULL argument */
		table = NULL;
		table_size = iocp->ioc_count;
	} else {
		/*
		 * The user provided a table. The stream head
		 * may have copied in the user data in chunks,
		 * so make sure everything is pulled up
		 * properly.
		 */
		if (MBLKL(data_mp) < iocp->ioc_count) {
			mblk_t *new_data_mp;
			if ((new_data_mp = msgpullup(data_mp, -1)) ==
			    NULL) {
				miocnak(q, mp, 0, ENOMEM);
				return;
			}
			freemsg(data_mp);
			data_mp = new_data_mp;
			mp->b_cont = data_mp;
		}
		table = (ip6_asp_t *)data_mp->b_rptr;
		table_size = iocp->ioc_count;
	}

	switch (iocp->ioc_cmd) {
	case SIOCGIP6ADDRPOLICY:
		iocp->ioc_rval = ip6_asp_get(table, table_size, ipst);
		if (iocp->ioc_rval == -1)
			iocp->ioc_error = EINVAL;
#if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4
		else if (table != NULL &&
		    (iocp->ioc_flag & IOC_MODELS) == IOC_ILP32) {
			ip6_asp_t *src = table;
			ip6_asp32_t *dst = (void *)table;
			int count = table_size / sizeof (ip6_asp_t);
			int i;

			/*
			 * We need to do an in-place shrink of the array
			 * to match the alignment attributes of the
			 * 32-bit ABI looking at it.
			 */
			/* LINTED: logical expression always true: op "||" */
			ASSERT(sizeof (*src) > sizeof (*dst));
			/* entry 0 is already in place; pack the rest down */
			for (i = 1; i < count; i++)
				bcopy(src + i, dst + i, sizeof (*dst));
		}
#endif
		break;

	case SIOCSIP6ADDRPOLICY:
		ASSERT(mp->b_prev == NULL);
		/* stash the reply queue for the asynchronous completion */
		mp->b_prev = (void *)q;
#if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4
		/*
		 * We pass in the datamodel here so that the ip6_asp_replace()
		 * routine can handle converting from 32-bit to native formats
		 * where necessary.
		 *
		 * A better way to handle this might be to convert the inbound
		 * data structure here, and hang it off a new 'mp'; thus the
		 * ip6_asp_replace() logic would always be dealing with native
		 * format data structures..
		 *
		 * (An even simpler way to handle these ioctls is to just
		 * add a 32-bit trailing 'pad' field to the ip6_asp_t structure
		 * and just recompile everything that depends on it.)
		 */
#endif
		ip6_asp_replace(mp, table, table_size, B_FALSE, ipst,
		    iocp->ioc_flag & IOC_MODELS);
		/* ip6_asp_replace() takes care of the reply; don't ack here */
		return;
	}

	DB_TYPE(mp) = (iocp->ioc_error == 0) ? M_IOCACK : M_IOCNAK;
	qreply(q, mp);
}
7953 7951
/*
 * Handler for the SIOCGDSTINFO ioctl (I_STR only): for each struct
 * dstinforeq packed into the data mblk, report whether the destination
 * is reachable from this zone and, if so, the source address the stack
 * would select for it, plus the scope and policy label of both addresses
 * (RFC 3484-style information; presumably consumed by userland address
 * ordering such as getaddrinfo() -- confirm against callers).
 * Always acks/naks `mp' itself; results are written in place into the
 * dstinforeq array.
 */
static void
ip_sioctl_dstinfo(queue_t *q, mblk_t *mp)
{
	mblk_t		*data_mp;
	struct dstinforeq	*dir;
	uint8_t		*end, *cur;
	in6_addr_t	*daddr, *saddr;
	ipaddr_t	v4daddr;
	ire_t		*ire;
	ipaddr_t	v4setsrc;
	in6_addr_t	v6setsrc;
	char		*slabel, *dlabel;
	boolean_t	isipv4;
	int		match_ire;
	ill_t		*dst_ill;
	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
	conn_t		*connp = Q_TO_CONN(q);
	zoneid_t	zoneid = IPCL_ZONEID(connp);
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint64_t	ipif_flags;

	ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */

	/*
	 * This ioctl is I_STR only, and must have a
	 * data mblk following the M_IOCTL mblk.
	 */
	data_mp = mp->b_cont;
	if (iocp->ioc_count == TRANSPARENT || data_mp == NULL) {
		miocnak(q, mp, 0, EINVAL);
		return;
	}

	/*
	 * If the payload is split across mblks, coalesce it so the
	 * dstinforeq array below can be walked with simple pointers.
	 */
	if (MBLKL(data_mp) < iocp->ioc_count) {
		mblk_t *new_data_mp;

		if ((new_data_mp = msgpullup(data_mp, -1)) == NULL) {
			miocnak(q, mp, 0, ENOMEM);
			return;
		}
		freemsg(data_mp);
		data_mp = new_data_mp;
		mp->b_cont = data_mp;
	}
	match_ire = MATCH_IRE_DSTONLY;

	/* Walk every complete dstinforeq record in the payload. */
	for (cur = data_mp->b_rptr, end = data_mp->b_wptr;
	    end - cur >= sizeof (struct dstinforeq);
	    cur += sizeof (struct dstinforeq)) {
		dir = (struct dstinforeq *)cur;
		daddr = &dir->dir_daddr;
		saddr = &dir->dir_saddr;

		/*
		 * ip_addr_scope_v6() and ip6_asp_lookup() handle
		 * v4 mapped addresses; ire_ftable_lookup_v6()
		 * and ip_select_source_v6() do not.
		 */
		dir->dir_dscope = ip_addr_scope_v6(daddr);
		dlabel = ip6_asp_lookup(daddr, &dir->dir_precedence, ipst);

		isipv4 = IN6_IS_ADDR_V4MAPPED(daddr);
		if (isipv4) {
			IN6_V4MAPPED_TO_IPADDR(daddr, v4daddr);
			v4setsrc = INADDR_ANY;
			ire = ire_route_recursive_v4(v4daddr, 0, NULL, zoneid,
			    NULL, match_ire, IRR_ALLOCATE, 0, ipst, &v4setsrc,
			    NULL, NULL);
		} else {
			v6setsrc = ipv6_all_zeros;
			ire = ire_route_recursive_v6(daddr, 0, NULL, zoneid,
			    NULL, match_ire, IRR_ALLOCATE, 0, ipst, &v6setsrc,
			    NULL, NULL);
		}
		/* ire_route_recursive_v[46] never returns NULL here. */
		ASSERT(ire != NULL);
		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
			ire_refrele(ire);
			dir->dir_dreachable = 0;

			/* move on to next dst addr */
			continue;
		}
		dir->dir_dreachable = 1;

		dst_ill = ire_nexthop_ill(ire);
		if (dst_ill == NULL) {
			ire_refrele(ire);
			continue;
		}

		/* With ipmp we most likely look at the ipmp ill here */
		dir->dir_dmactype = dst_ill->ill_mactype;

		/*
		 * Pick the source address the stack would use toward this
		 * destination; on failure report the unspecified address.
		 */
		if (isipv4) {
			ipaddr_t v4saddr;

			if (ip_select_source_v4(dst_ill, v4setsrc, v4daddr,
			    connp->conn_ixa->ixa_multicast_ifaddr, zoneid, ipst,
			    &v4saddr, NULL, &ipif_flags) != 0) {
				v4saddr = INADDR_ANY;
				ipif_flags = 0;
			}
			IN6_IPADDR_TO_V4MAPPED(v4saddr, saddr);
		} else {
			if (ip_select_source_v6(dst_ill, &v6setsrc, daddr,
			    zoneid, ipst, B_FALSE, IPV6_PREFER_SRC_DEFAULT,
			    saddr, NULL, &ipif_flags) != 0) {
				*saddr = ipv6_all_zeros;
				ipif_flags = 0;
			}
		}

		dir->dir_sscope = ip_addr_scope_v6(saddr);
		slabel = ip6_asp_lookup(saddr, NULL, ipst);
		dir->dir_labelmatch = ip6_asp_labelcmp(dlabel, slabel);
		dir->dir_sdeprecated = (ipif_flags & IPIF_DEPRECATED) ? 1 : 0;
		/* Drop the references taken above for this record. */
		ire_refrele(ire);
		ill_refrele(dst_ill);
	}
	miocack(q, mp, iocp->ioc_count, 0);
}
8075 8073
8076 8074 /*
8077 8075 * Check if this is an address assigned to this machine.
8078 8076 * Skips interfaces that are down by using ire checks.
8079 8077 * Translates mapped addresses to v4 addresses and then
8080 8078 * treats them as such, returning true if the v4 address
8081 8079 * associated with this mapped address is configured.
8082 8080 * Note: Applications will have to be careful what they do
8083 8081 * with the response; use of mapped addresses limits
8084 8082 * what can be done with the socket, especially with
8085 8083 * respect to socket options and ioctls - neither IPv4
8086 8084 * options nor IPv6 sticky options/ancillary data options
8087 8085 * may be used.
8088 8086 */
8089 8087 /* ARGSUSED */
8090 8088 int
8091 8089 ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
8092 8090 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
8093 8091 {
8094 8092 struct sioc_addrreq *sia;
8095 8093 sin_t *sin;
8096 8094 ire_t *ire;
8097 8095 mblk_t *mp1;
8098 8096 zoneid_t zoneid;
8099 8097 ip_stack_t *ipst;
8100 8098
8101 8099 ip1dbg(("ip_sioctl_tmyaddr"));
8102 8100
8103 8101 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */
8104 8102 zoneid = Q_TO_CONN(q)->conn_zoneid;
8105 8103 ipst = CONNQ_TO_IPST(q);
8106 8104
8107 8105 /* Existence verified in ip_wput_nondata */
8108 8106 mp1 = mp->b_cont->b_cont;
8109 8107 sia = (struct sioc_addrreq *)mp1->b_rptr;
8110 8108 sin = (sin_t *)&sia->sa_addr;
8111 8109 switch (sin->sin_family) {
8112 8110 case AF_INET6: {
8113 8111 sin6_t *sin6 = (sin6_t *)sin;
8114 8112
8115 8113 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
8116 8114 ipaddr_t v4_addr;
8117 8115
8118 8116 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr,
8119 8117 v4_addr);
8120 8118 ire = ire_ftable_lookup_v4(v4_addr, 0, 0,
8121 8119 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, NULL,
8122 8120 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL);
8123 8121 } else {
8124 8122 in6_addr_t v6addr;
8125 8123
8126 8124 v6addr = sin6->sin6_addr;
8127 8125 ire = ire_ftable_lookup_v6(&v6addr, 0, 0,
8128 8126 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, NULL,
8129 8127 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL);
8130 8128 }
8131 8129 break;
8132 8130 }
8133 8131 case AF_INET: {
8134 8132 ipaddr_t v4addr;
8135 8133
8136 8134 v4addr = sin->sin_addr.s_addr;
8137 8135 ire = ire_ftable_lookup_v4(v4addr, 0, 0,
8138 8136 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid,
8139 8137 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL);
8140 8138 break;
8141 8139 }
8142 8140 default:
8143 8141 return (EAFNOSUPPORT);
8144 8142 }
8145 8143 if (ire != NULL) {
8146 8144 sia->sa_res = 1;
8147 8145 ire_refrele(ire);
8148 8146 } else {
8149 8147 sia->sa_res = 0;
8150 8148 }
8151 8149 return (0);
8152 8150 }
8153 8151
8154 8152 /*
8155 8153 * Check if this is an address assigned on-link i.e. neighbor,
8156 8154 * and makes sure it's reachable from the current zone.
8157 8155 * Returns true for my addresses as well.
8158 8156 * Translates mapped addresses to v4 addresses and then
8159 8157 * treats them as such, returning true if the v4 address
8160 8158 * associated with this mapped address is configured.
8161 8159 * Note: Applications will have to be careful what they do
8162 8160 * with the response; use of mapped addresses limits
8163 8161 * what can be done with the socket, especially with
8164 8162 * respect to socket options and ioctls - neither IPv4
8165 8163 * options nor IPv6 sticky options/ancillary data options
8166 8164 * may be used.
8167 8165 */
8168 8166 /* ARGSUSED */
8169 8167 int
8170 8168 ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
8171 8169 ip_ioctl_cmd_t *ipip, void *duymmy_ifreq)
8172 8170 {
8173 8171 struct sioc_addrreq *sia;
8174 8172 sin_t *sin;
8175 8173 mblk_t *mp1;
8176 8174 ire_t *ire = NULL;
8177 8175 zoneid_t zoneid;
8178 8176 ip_stack_t *ipst;
8179 8177
8180 8178 ip1dbg(("ip_sioctl_tonlink"));
8181 8179
8182 8180 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */
8183 8181 zoneid = Q_TO_CONN(q)->conn_zoneid;
8184 8182 ipst = CONNQ_TO_IPST(q);
8185 8183
8186 8184 /* Existence verified in ip_wput_nondata */
8187 8185 mp1 = mp->b_cont->b_cont;
8188 8186 sia = (struct sioc_addrreq *)mp1->b_rptr;
8189 8187 sin = (sin_t *)&sia->sa_addr;
8190 8188
8191 8189 /*
8192 8190 * We check for IRE_ONLINK and exclude IRE_BROADCAST|IRE_MULTICAST
8193 8191 * to make sure we only look at on-link unicast address.
8194 8192 */
8195 8193 switch (sin->sin_family) {
8196 8194 case AF_INET6: {
8197 8195 sin6_t *sin6 = (sin6_t *)sin;
8198 8196
8199 8197 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
8200 8198 ipaddr_t v4_addr;
8201 8199
8202 8200 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr,
8203 8201 v4_addr);
8204 8202 if (!CLASSD(v4_addr)) {
8205 8203 ire = ire_ftable_lookup_v4(v4_addr, 0, 0, 0,
8206 8204 NULL, zoneid, NULL, MATCH_IRE_DSTONLY,
8207 8205 0, ipst, NULL);
8208 8206 }
8209 8207 } else {
8210 8208 in6_addr_t v6addr;
8211 8209
8212 8210 v6addr = sin6->sin6_addr;
8213 8211 if (!IN6_IS_ADDR_MULTICAST(&v6addr)) {
8214 8212 ire = ire_ftable_lookup_v6(&v6addr, 0, 0, 0,
8215 8213 NULL, zoneid, NULL, MATCH_IRE_DSTONLY, 0,
8216 8214 ipst, NULL);
8217 8215 }
8218 8216 }
8219 8217 break;
8220 8218 }
8221 8219 case AF_INET: {
8222 8220 ipaddr_t v4addr;
8223 8221
8224 8222 v4addr = sin->sin_addr.s_addr;
8225 8223 if (!CLASSD(v4addr)) {
8226 8224 ire = ire_ftable_lookup_v4(v4addr, 0, 0, 0, NULL,
8227 8225 zoneid, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
8228 8226 }
8229 8227 break;
8230 8228 }
8231 8229 default:
8232 8230 return (EAFNOSUPPORT);
8233 8231 }
8234 8232 sia->sa_res = 0;
8235 8233 if (ire != NULL) {
8236 8234 ASSERT(!(ire->ire_type & IRE_MULTICAST));
8237 8235
8238 8236 if ((ire->ire_type & IRE_ONLINK) &&
8239 8237 !(ire->ire_type & IRE_BROADCAST))
8240 8238 sia->sa_res = 1;
8241 8239 ire_refrele(ire);
8242 8240 }
8243 8241 return (0);
8244 8242 }
8245 8243
8246 8244 /*
 * TBD: implement when the kernel maintains a list of site prefixes.
8248 8246 */
8249 8247 /* ARGSUSED */
8250 8248 int
8251 8249 ip_sioctl_tmysite(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
8252 8250 ip_ioctl_cmd_t *ipip, void *ifreq)
8253 8251 {
8254 8252 return (ENXIO);
8255 8253 }
8256 8254
8257 8255 /* ARP IOCTLs. */
/*
 * Common handler for the SIOC{S,G,D}ARP and SIOC{S,G,D}XARP ioctls.
 * Parses the (x)arpreq already copied in by the framework, translates the
 * ATF_* flags to NCE_* flags, applies the IPMP restrictions described
 * below, and then performs the requested operation on the neighbor cache
 * entry (NCE) for the address.  The result is recorded in
 * iocp->ioc_error and also returned.
 */
/* ARGSUSED */
int
ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
{
	int		err;
	ipaddr_t	ipaddr;
	struct iocblk	*iocp;
	conn_t		*connp;
	struct arpreq	*ar;
	struct xarpreq	*xar;
	int		arp_flags, flags, alength;
	uchar_t		*lladdr;
	ip_stack_t	*ipst;
	ill_t		*ill = ipif->ipif_ill;
	ill_t		*proxy_ill = NULL;
	ipmp_arpent_t	*entp = NULL;
	boolean_t	proxyarp = B_FALSE;
	boolean_t	if_arp_ioctl = B_FALSE;
	ncec_t		*ncec = NULL;
	nce_t		*nce;

	ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
	connp = Q_TO_CONN(q);
	ipst = connp->conn_netstack->netstack_ip;
	iocp = (struct iocblk *)mp->b_rptr;

	if (ipip->ipi_cmd_type == XARP_CMD) {
		/* We have a chain - M_IOCTL-->MI_COPY_MBLK-->XARPREQ_MBLK */
		xar = (struct xarpreq *)mp->b_cont->b_cont->b_rptr;
		ar = NULL;

		arp_flags = xar->xarp_flags;
		lladdr = (uchar_t *)LLADDR(&xar->xarp_ha);
		/* A non-empty sdl name means "operate on this interface". */
		if_arp_ioctl = (xar->xarp_ha.sdl_nlen != 0);
		/*
		 * Validate against user's link layer address length
		 * input and name and addr length limits.
		 */
		alength = ill->ill_phys_addr_length;
		if (ipip->ipi_cmd == SIOCSXARP) {
			if (alength != xar->xarp_ha.sdl_alen ||
			    (alength + xar->xarp_ha.sdl_nlen >
			    sizeof (xar->xarp_ha.sdl_data)))
				return (EINVAL);
		}
	} else {
		/* We have a chain - M_IOCTL-->MI_COPY_MBLK-->ARPREQ_MBLK */
		ar = (struct arpreq *)mp->b_cont->b_cont->b_rptr;
		xar = NULL;

		arp_flags = ar->arp_flags;
		lladdr = (uchar_t *)ar->arp_ha.sa_data;
		/*
		 * Theoretically, the sa_family could tell us what link
		 * layer type this operation is trying to deal with. By
		 * common usage AF_UNSPEC means ethernet. We'll assume
		 * any attempt to use the SIOC?ARP ioctls is for ethernet,
		 * for now. Our new SIOC*XARP ioctls can be used more
		 * generally.
		 *
		 * If the underlying media happens to have a non 6 byte
		 * address, arp module will fail set/get, but the del
		 * operation will succeed.
		 */
		alength = 6;
		if ((ipip->ipi_cmd != SIOCDARP) &&
		    (alength != ill->ill_phys_addr_length)) {
			return (EINVAL);
		}
	}

	/* Translate ATF* flags to NCE* flags */
	flags = 0;
	if (arp_flags & ATF_AUTHORITY)
		flags |= NCE_F_AUTHORITY;
	if (arp_flags & ATF_PERM)
		flags |= NCE_F_NONUD; /* not subject to aging */
	if (arp_flags & ATF_PUBL)
		flags |= NCE_F_PUBLISH;

	/*
	 * IPMP ARP special handling:
	 *
	 * 1. Since ARP mappings must appear consistent across the group,
	 *    prohibit changing ARP mappings on the underlying interfaces.
	 *
	 * 2. Since ARP mappings for IPMP data addresses are maintained by
	 *    IP itself, prohibit changing them.
	 *
	 * 3. For proxy ARP, use a functioning hardware address in the group,
	 *    provided one exists.  If one doesn't, just add the entry as-is;
	 *    ipmp_illgrp_refresh_arpent() will refresh it if things change.
	 */
	if (IS_UNDER_IPMP(ill)) {
		/* Only the "get" variants are permitted on under-ills. */
		if (ipip->ipi_cmd != SIOCGARP && ipip->ipi_cmd != SIOCGXARP)
			return (EPERM);
	}
	if (IS_IPMP(ill)) {
		ipmp_illgrp_t *illg = ill->ill_grp;

		switch (ipip->ipi_cmd) {
		case SIOCSARP:
		case SIOCSXARP:
			/* Case (3) above: substitute an active hw address. */
			proxy_ill = ipmp_illgrp_find_ill(illg, lladdr, alength);
			if (proxy_ill != NULL) {
				proxyarp = B_TRUE;
				if (!ipmp_ill_is_active(proxy_ill))
					proxy_ill = ipmp_illgrp_next_ill(illg);
				if (proxy_ill != NULL)
					lladdr = proxy_ill->ill_phys_addr;
			}
			/* FALLTHRU */
		}
	}

	ipaddr = sin->sin_addr.s_addr;
	/*
	 * don't match across illgrp per case (1) and (2).
	 * XXX use IS_IPMP(ill) like ndp_sioc_update?
	 */
	nce = nce_lookup_v4(ill, &ipaddr);
	if (nce != NULL)
		ncec = nce->nce_common;

	switch (iocp->ioc_cmd) {
	case SIOCDARP:
	case SIOCDXARP: {
		/*
		 * Delete the NCE if any.
		 */
		if (ncec == NULL) {
			iocp->ioc_error = ENXIO;
			break;
		}
		/* Don't allow changes to arp mappings of local addresses. */
		if (NCE_MYADDR(ncec)) {
			nce_refrele(nce);
			return (ENOTSUP);
		}
		iocp->ioc_error = 0;

		/*
		 * Delete the nce_common which has ncec_ill set to ipmp_ill.
		 * This will delete all the nce entries on the under_ills.
		 */
		ncec_delete(ncec);
		/*
		 * Once the NCE has been deleted, then the ire_dep* consistency
		 * mechanism will find any IRE which depended on the now
		 * condemned NCE (as part of sending packets).
		 * That mechanism handles redirects by deleting redirects
		 * that refer to UNREACHABLE nces.
		 */
		break;
	}
	case SIOCGARP:
	case SIOCGXARP:
		/* Report the existing mapping, if any, via the reply mblk. */
		if (ncec != NULL) {
			lladdr = ncec->ncec_lladdr;
			flags = ncec->ncec_flags;
			iocp->ioc_error = 0;
			ip_sioctl_garp_reply(mp, ncec->ncec_ill, lladdr, flags);
		} else {
			iocp->ioc_error = ENXIO;
		}
		break;
	case SIOCSARP:
	case SIOCSXARP:
		/* Don't allow changes to arp mappings of local addresses. */
		if (ncec != NULL && NCE_MYADDR(ncec)) {
			nce_refrele(nce);
			return (ENOTSUP);
		}

		/* static arp entries will undergo NUD if ATF_PERM is not set */
		flags |= NCE_F_STATIC;
		if (!if_arp_ioctl) {
			ip_nce_lookup_and_update(&ipaddr, NULL, ipst,
			    lladdr, alength, flags);
		} else {
			ipif_t *ipif = ipif_get_next_ipif(NULL, ill);
			if (ipif != NULL) {
				ip_nce_lookup_and_update(&ipaddr, ipif, ipst,
				    lladdr, alength, flags);
				ipif_refrele(ipif);
			}
		}
		/* Drop the lookup reference before (re)adding the entry. */
		if (nce != NULL) {
			nce_refrele(nce);
			nce = NULL;
		}
		/*
		 * NCE_F_STATIC entries will be added in state ND_REACHABLE
		 * by nce_add_common()
		 */
		err = nce_lookup_then_add_v4(ill, lladdr,
		    ill->ill_phys_addr_length, &ipaddr, flags, ND_UNCHANGED,
		    &nce);
		/* EEXIST means an entry is already there: update it instead. */
		if (err == EEXIST) {
			ncec = nce->nce_common;
			mutex_enter(&ncec->ncec_lock);
			ncec->ncec_state = ND_REACHABLE;
			ncec->ncec_flags = flags;
			nce_update(ncec, ND_UNCHANGED, lladdr);
			mutex_exit(&ncec->ncec_lock);
			err = 0;
		}
		if (nce != NULL) {
			nce_refrele(nce);
			nce = NULL;
		}
		/* On IPMP, also record the entry in the illgrp (case (3)). */
		if (IS_IPMP(ill) && err == 0) {
			entp = ipmp_illgrp_create_arpent(ill->ill_grp,
			    proxyarp, ipaddr, lladdr, ill->ill_phys_addr_length,
			    flags);
			if (entp == NULL || (proxyarp && proxy_ill == NULL)) {
				iocp->ioc_error = (entp == NULL ? ENOMEM : 0);
				break;
			}
		}
		iocp->ioc_error = err;
	}

	if (nce != NULL) {
		nce_refrele(nce);
	}

	/*
	 * If we created an IPMP ARP entry, mark that we've notified ARP.
	 */
	if (entp != NULL)
		ipmp_illgrp_mark_arpent(ill->ill_grp, entp);

	return (iocp->ioc_error);
}
8494 8492
8495 8493 /*
8496 8494 * Parse an [x]arpreq structure coming down SIOC[GSD][X]ARP ioctls, identify
8497 8495 * the associated sin and refhold and return the associated ipif via `ci'.
8498 8496 */
int
ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
    cmd_info_t *ci)
{
	mblk_t	*mp1;
	sin_t	*sin;
	conn_t	*connp;
	ipif_t	*ipif;
	ire_t	*ire = NULL;
	ill_t	*ill = NULL;
	boolean_t exists;
	ip_stack_t *ipst;
	struct arpreq *ar;
	struct xarpreq *xar;
	struct sockaddr_dl *sdl;

	/* ioctl comes down on a conn */
	ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
	connp = Q_TO_CONN(q);
	/* ARP ioctls are IPv4 only. */
	if (connp->conn_family == AF_INET6)
		return (ENXIO);

	ipst = connp->conn_netstack->netstack_ip;

	/* Verified in ip_wput_nondata */
	mp1 = mp->b_cont->b_cont;

	if (ipip->ipi_cmd_type == XARP_CMD) {
		ASSERT(MBLKL(mp1) >= sizeof (struct xarpreq));
		xar = (struct xarpreq *)mp1->b_rptr;
		sin = (sin_t *)&xar->xarp_pa;
		sdl = &xar->xarp_ha;

		if (sdl->sdl_family != AF_LINK || sin->sin_family != AF_INET)
			return (ENXIO);
		if (sdl->sdl_nlen >= LIFNAMSIZ)
			return (EINVAL);
	} else {
		ASSERT(ipip->ipi_cmd_type == ARP_CMD);
		ASSERT(MBLKL(mp1) >= sizeof (struct arpreq));
		ar = (struct arpreq *)mp1->b_rptr;
		sin = (sin_t *)&ar->arp_pa;
	}

	/*
	 * An SIOC[GSD]XARP with a non-empty interface name: look the
	 * interface up by name (sdl is only set in the XARP_CMD branch,
	 * so the short-circuit below never reads it uninitialized).
	 */
	if (ipip->ipi_cmd_type == XARP_CMD && sdl->sdl_nlen != 0) {
		ipif = ipif_lookup_on_name(sdl->sdl_data, sdl->sdl_nlen,
		    B_FALSE, &exists, B_FALSE, ALL_ZONES, ipst);
		if (ipif == NULL)
			return (ENXIO);
		/* Only the first (id 0) logical interface is acceptable. */
		if (ipif->ipif_id != 0) {
			ipif_refrele(ipif);
			return (ENXIO);
		}
	} else {
		/*
		 * Either an SIOC[DGS]ARP or an SIOC[DGS]XARP with an sdl_nlen
		 * of 0: use the IP address to find the ipif.  If the IP
		 * address is an IPMP test address, ire_ftable_lookup() will
		 * find the wrong ill, so we first do an ipif_lookup_addr().
		 */
		ipif = ipif_lookup_addr(sin->sin_addr.s_addr, NULL, ALL_ZONES,
		    ipst);
		if (ipif == NULL) {
			/* Fall back to a resolver-route lookup. */
			ire = ire_ftable_lookup_v4(sin->sin_addr.s_addr,
			    0, 0, IRE_IF_RESOLVER, NULL, ALL_ZONES,
			    NULL, MATCH_IRE_TYPE, 0, ipst, NULL);
			if (ire == NULL || ((ill = ire->ire_ill) == NULL)) {
				if (ire != NULL)
					ire_refrele(ire);
				return (ENXIO);
			}
			ASSERT(ire != NULL && ill != NULL);
			ipif = ill->ill_ipif;
			/* Hold the ipif before dropping the ire reference. */
			ipif_refhold(ipif);
			ire_refrele(ire);
		}
	}

	/* ARP only makes sense on resolver (e.g. ethernet) interfaces. */
	if (ipif->ipif_ill->ill_net_type != IRE_IF_RESOLVER) {
		ipif_refrele(ipif);
		return (ENXIO);
	}

	/* Return the refheld ipif and the sin to the caller via ci. */
	ci->ci_sin = sin;
	ci->ci_ipif = ipif;
	return (0);
}
8586 8584
8587 8585 /*
8588 8586 * Link or unlink the illgrp on IPMP meta-interface `ill' depending on the
8589 8587 * value of `ioccmd'. While an illgrp is linked to an ipmp_grp_t, it is
8590 8588 * accessible from that ipmp_grp_t, which means SIOCSLIFGROUPNAME can look it
8591 8589 * up and thus an ill can join that illgrp.
8592 8590 *
8593 8591 * We use I_PLINK/I_PUNLINK to do the link/unlink operations rather than
8594 8592 * open()/close() primarily because close() is not allowed to fail or block
8595 8593 * forever. On the other hand, I_PUNLINK *can* fail, and there's no reason
8596 8594 * why anyone should ever need to I_PUNLINK an in-use IPMP stream. To ensure
 * symmetric behavior (e.g., doing an I_PLINK after an I_PUNLINK undoes the
8598 8596 * I_PUNLINK) we defer linking to I_PLINK. Separately, we also fail attempts
8599 8597 * to I_LINK since I_UNLINK is optional and we'd end up in an inconsistent
8600 8598 * state if I_UNLINK didn't occur.
8601 8599 *
8602 8600 * Note that for each plumb/unplumb operation, we may end up here more than
8603 8601 * once because of the way ifconfig works. However, it's OK to link the same
8604 8602 * illgrp more than once, or unlink an illgrp that's already unlinked.
8605 8603 */
8606 8604 static int
8607 8605 ip_sioctl_plink_ipmp(ill_t *ill, int ioccmd)
8608 8606 {
8609 8607 int err;
8610 8608 ip_stack_t *ipst = ill->ill_ipst;
8611 8609
8612 8610 ASSERT(IS_IPMP(ill));
8613 8611 ASSERT(IAM_WRITER_ILL(ill));
8614 8612
8615 8613 switch (ioccmd) {
8616 8614 case I_LINK:
8617 8615 return (ENOTSUP);
8618 8616
8619 8617 case I_PLINK:
8620 8618 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
8621 8619 ipmp_illgrp_link_grp(ill->ill_grp, ill->ill_phyint->phyint_grp);
8622 8620 rw_exit(&ipst->ips_ipmp_lock);
8623 8621 break;
8624 8622
8625 8623 case I_PUNLINK:
8626 8624 /*
8627 8625 * Require all UP ipifs be brought down prior to unlinking the
8628 8626 * illgrp so any associated IREs (and other state) is torched.
8629 8627 */
8630 8628 if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0)
8631 8629 return (EBUSY);
8632 8630
8633 8631 /*
8634 8632 * NOTE: We hold ipmp_lock across the unlink to prevent a race
8635 8633 * with an SIOCSLIFGROUPNAME request from an ill trying to
8636 8634 * join this group. Specifically: ills trying to join grab
8637 8635 * ipmp_lock and bump a "pending join" counter checked by
8638 8636 * ipmp_illgrp_unlink_grp(). During the unlink no new pending
8639 8637 * joins can occur (since we have ipmp_lock). Once we drop
8640 8638 * ipmp_lock, subsequent SIOCSLIFGROUPNAME requests will not
8641 8639 * find the illgrp (since we unlinked it) and will return
8642 8640 * EAFNOSUPPORT. This will then take them back through the
8643 8641 * IPMP meta-interface plumbing logic in ifconfig, and thus
8644 8642 * back through I_PLINK above.
8645 8643 */
8646 8644 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
8647 8645 err = ipmp_illgrp_unlink_grp(ill->ill_grp);
8648 8646 rw_exit(&ipst->ips_ipmp_lock);
8649 8647 return (err);
8650 8648 default:
8651 8649 break;
8652 8650 }
8653 8651 return (0);
8654 8652 }
8655 8653
8656 8654 /*
8657 8655 * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also
8658 8656 * atomically set/clear the muxids. Also complete the ioctl by acking or
8659 8657 * naking it. Note that the code is structured such that the link type,
8660 8658 * whether it's persistent or not, is treated equally. ifconfig(1M) and
8661 8659 * its clones use the persistent link, while pppd(1M) and perhaps many
8662 8660 * other daemons may use non-persistent link. When combined with some
8663 8661 * ill_t states, linking and unlinking lower streams may be used as
8664 8662 * indicators of dynamic re-plumbing events [see PSARC/1999/348].
8665 8663 */
8666 8664 /* ARGSUSED */
8667 8665 void
8668 8666 ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
8669 8667 {
8670 8668 mblk_t *mp1;
8671 8669 struct linkblk *li;
8672 8670 int ioccmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
8673 8671 int err = 0;
8674 8672
8675 8673 ASSERT(ioccmd == I_PLINK || ioccmd == I_PUNLINK ||
8676 8674 ioccmd == I_LINK || ioccmd == I_UNLINK);
8677 8675
8678 8676 mp1 = mp->b_cont; /* This is the linkblk info */
8679 8677 li = (struct linkblk *)mp1->b_rptr;
8680 8678
8681 8679 err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li);
8682 8680 if (err == EINPROGRESS)
8683 8681 return;
8684 8682 if (err == 0)
8685 8683 miocack(q, mp, 0, 0);
8686 8684 else
8687 8685 miocnak(q, mp, 0, err);
8688 8686
8689 8687 /* Conn was refheld in ip_sioctl_copyin_setup */
8690 8688 if (CONN_Q(q)) {
8691 8689 CONN_DEC_IOCTLREF(Q_TO_CONN(q));
8692 8690 CONN_OPER_PENDING_DONE(Q_TO_CONN(q));
8693 8691 }
8694 8692 }
8695 8693
8696 8694 /*
8697 8695 * Process I_{P}LINK and I_{P}UNLINK requests named by `ioccmd' and pointed to
8698 8696 * by `mp' and `li' for the IP module stream (if li->q_bot is in fact an IP
8699 8697 * module stream).
8700 8698 * Returns zero on success, EINPROGRESS if the operation is still pending, or
8701 8699 * an error code on failure.
8702 8700 */
static int
ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd,
    struct linkblk *li)
{
	int		err = 0;
	ill_t		*ill;
	queue_t		*ipwq, *dwq;
	const char	*name;
	struct qinit	*qinfo;
	boolean_t	islink = (ioccmd == I_PLINK || ioccmd == I_LINK);
	boolean_t	entered_ipsq = B_FALSE;
	boolean_t	is_ip = B_FALSE;
	arl_t		*arl;

	/*
	 * Walk the lower stream to verify it's the IP module stream.
	 * The IP module is identified by its name, wput function,
	 * and non-NULL q_next.  STREAMS ensures that the lower stream
	 * (li->l_qbot) will not vanish until this ioctl completes.
	 */
	for (ipwq = li->l_qbot; ipwq != NULL; ipwq = ipwq->q_next) {
		qinfo = ipwq->q_qinfo;
		name = qinfo->qi_minfo->mi_idname;
		if (name != NULL && strcmp(name, ip_mod_info.mi_idname) == 0 &&
		    qinfo->qi_putp != ip_lwput && ipwq->q_next != NULL) {
			is_ip = B_TRUE;
			break;
		}
		/* An ARP module stream is also accepted (is_ip stays false). */
		if (name != NULL && strcmp(name, arp_mod_info.mi_idname) == 0 &&
		    qinfo->qi_putp != ip_lwput && ipwq->q_next != NULL) {
			break;
		}
	}

	/*
	 * If this isn't an IP module stream, bail.
	 */
	if (ipwq == NULL)
		return (0);

	/*
	 * For the ARP case, arl_to_ill() returns a refheld ill that every
	 * exit path below must release; for the IP case the ill comes
	 * straight from q_ptr with no reference taken.
	 */
	if (!is_ip) {
		arl = (arl_t *)ipwq->q_ptr;
		ill = arl_to_ill(arl);
		if (ill == NULL)
			return (0);
	} else {
		ill = ipwq->q_ptr;
	}
	ASSERT(ill != NULL);

	/*
	 * Become exclusive on the ill's serialization queue if our caller
	 * hasn't already; EINPROGRESS means the operation was queued and
	 * will be re-dispatched via ip_sioctl_plink later.
	 */
	if (ipsq == NULL) {
		ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink,
		    NEW_OP, B_FALSE);
		if (ipsq == NULL) {
			if (!is_ip)
				ill_refrele(ill);
			return (EINPROGRESS);
		}
		entered_ipsq = B_TRUE;
	}
	ASSERT(IAM_WRITER_ILL(ill));
	mutex_enter(&ill->ill_lock);
	if (!is_ip) {
		if (islink && ill->ill_muxid == 0) {
			/*
			 * Plumbing has to be done with IP plumbed first, arp
			 * second, but here we have arp being plumbed first.
			 */
			mutex_exit(&ill->ill_lock);
			if (entered_ipsq)
				ipsq_exit(ipsq);
			ill_refrele(ill);
			return (EINVAL);
		}
	}
	mutex_exit(&ill->ill_lock);
	if (!is_ip) {
		/* Record (or clear) the ARP stream's mux id and finish. */
		arl->arl_muxid = islink ? li->l_index : 0;
		ill_refrele(ill);
		goto done;
	}

	if (IS_IPMP(ill) && (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0)
		goto done;

	/*
	 * As part of I_{P}LINKing, stash the number of downstream modules and
	 * the read queue of the module immediately below IP in the ill.
	 * These are used during the capability negotiation below.
	 */
	ill->ill_lmod_rq = NULL;
	ill->ill_lmod_cnt = 0;
	if (islink && ((dwq = ipwq->q_next) != NULL)) {
		ill->ill_lmod_rq = RD(dwq);
		for (; dwq != NULL; dwq = dwq->q_next)
			ill->ill_lmod_cnt++;
	}

	ill->ill_muxid = islink ? li->l_index : 0;

	/*
	 * Mark the ipsq busy until the capability operations initiated below
	 * complete. The PLINK/UNLINK ioctl itself completes when our caller
	 * returns, but the capability operation may complete asynchronously
	 * much later.
	 */
	ipsq_current_start(ipsq, ill->ill_ipif, ioccmd);
	/*
	 * If there's at least one up ipif on this ill, then we're bound to
	 * the underlying driver via DLPI. In that case, renegotiate
	 * capabilities to account for any possible change in modules
	 * interposed between IP and the driver.
	 */
	if (ill->ill_ipif_up_count > 0) {
		if (islink)
			ill_capability_probe(ill);
		else
			ill_capability_reset(ill, B_FALSE);
	}
	ipsq_current_finish(ipsq);
done:
	if (entered_ipsq)
		ipsq_exit(ipsq);

	return (err);
}
8829 8827
8830 8828 /*
8831 8829 * Search the ioctl command in the ioctl tables and return a pointer
8832 8830 * to the ioctl command information. The ioctl command tables are
8833 8831 * static and fully populated at compile time.
8834 8832 */
8835 8833 ip_ioctl_cmd_t *
8836 8834 ip_sioctl_lookup(int ioc_cmd)
8837 8835 {
8838 8836 int index;
8839 8837 ip_ioctl_cmd_t *ipip;
8840 8838 ip_ioctl_cmd_t *ipip_end;
8841 8839
8842 8840 if (ioc_cmd == IPI_DONTCARE)
8843 8841 return (NULL);
8844 8842
8845 8843 /*
8846 8844 * Do a 2 step search. First search the indexed table
8847 8845 * based on the least significant byte of the ioctl cmd.
8848 8846 * If we don't find a match, then search the misc table
8849 8847 * serially.
8850 8848 */
8851 8849 index = ioc_cmd & 0xFF;
8852 8850 if (index < ip_ndx_ioctl_count) {
8853 8851 ipip = &ip_ndx_ioctl_table[index];
8854 8852 if (ipip->ipi_cmd == ioc_cmd) {
8855 8853 /* Found a match in the ndx table */
8856 8854 return (ipip);
8857 8855 }
8858 8856 }
8859 8857
8860 8858 /* Search the misc table */
8861 8859 ipip_end = &ip_misc_ioctl_table[ip_misc_ioctl_count];
8862 8860 for (ipip = ip_misc_ioctl_table; ipip < ipip_end; ipip++) {
8863 8861 if (ipip->ipi_cmd == ioc_cmd)
8864 8862 /* Found a match in the misc table */
8865 8863 return (ipip);
8866 8864 }
8867 8865
8868 8866 return (NULL);
8869 8867 }
8870 8868
/*
 * Helper for ip_sioctl_getsetprop(): validate an incoming
 * SIOC{SET|GET}PROP M_IOCTL message — payload presence, size
 * consistency between ioc_count and the embedded mod_ioc_prop_t,
 * and the legality of the requested flag combination.
 * Returns B_TRUE iff the message is well-formed.
 */
static boolean_t
getset_ioctl_checks(mblk_t *mp)
{
	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
	mblk_t *mp1 = mp->b_cont;
	mod_ioc_prop_t *pioc;
	uint_t flags;
	uint_t pioc_size;

	/* do sanity checks on various arguments */
	if (mp1 == NULL || iocp->ioc_count == 0 ||
	    iocp->ioc_count == TRANSPARENT) {
		return (B_FALSE);
	}
	/*
	 * If the chain holds less data than the ioctl claims, pullupmsg()
	 * will fail and we reject the message.
	 * NOTE(review): this also assumes that on success the payload is
	 * contiguous in mp1 before the b_rptr cast below — confirm callers
	 * always deliver it that way.
	 */
	if (msgdsize(mp1) < iocp->ioc_count) {
		if (!pullupmsg(mp1, iocp->ioc_count))
			return (B_FALSE);
	}

	pioc = (mod_ioc_prop_t *)mp1->b_rptr;

	/*
	 * sanity checks on mpr_valsize: the structure embeds the first
	 * byte of the value buffer, hence the "- 1" when accounting for
	 * a non-empty value.
	 */
	pioc_size = sizeof (mod_ioc_prop_t);
	if (pioc->mpr_valsize != 0)
		pioc_size += pioc->mpr_valsize - 1;

	if (iocp->ioc_count != pioc_size)
		return (B_FALSE);

	flags = pioc->mpr_flags;
	if (iocp->ioc_cmd == SIOCSETPROP) {
		/*
		 * One can either reset the value to it's default value or
		 * change the current value or append/remove the value from
		 * a multi-valued properties.
		 */
		if ((flags & MOD_PROP_DEFAULT) != MOD_PROP_DEFAULT &&
		    flags != MOD_PROP_ACTIVE &&
		    flags != (MOD_PROP_ACTIVE|MOD_PROP_APPEND) &&
		    flags != (MOD_PROP_ACTIVE|MOD_PROP_REMOVE))
			return (B_FALSE);
	} else {
		ASSERT(iocp->ioc_cmd == SIOCGETPROP);

		/*
		 * One can retrieve only one kind of property information
		 * at a time.
		 */
		if ((flags & MOD_PROP_ACTIVE) != MOD_PROP_ACTIVE &&
		    (flags & MOD_PROP_DEFAULT) != MOD_PROP_DEFAULT &&
		    (flags & MOD_PROP_POSSIBLE) != MOD_PROP_POSSIBLE &&
		    (flags & MOD_PROP_PERM) != MOD_PROP_PERM)
			return (B_FALSE);
	}

	return (B_TRUE);
}
8931 8929
/*
 * Process the SIOC{SET|GET}PROP ioctls: validate the request, select the
 * property table for the protocol named in the request, look up the
 * property, and invoke its set or get handler.  The reply (ack or nak)
 * is always sent on queue q before returning.
 */
/* ARGSUSED */
static void
ip_sioctl_getsetprop(queue_t *q, mblk_t *mp)
{
	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
	mblk_t *mp1 = mp->b_cont;
	mod_ioc_prop_t *pioc;
	mod_prop_info_t *ptbl = NULL, *pinfo = NULL;
	ip_stack_t *ipst;
	netstack_t *stack;
	cred_t *cr;
	boolean_t set;
	int err;

	/* Must be the driver instance (not a module) on a conn queue. */
	ASSERT(q->q_next == NULL);
	ASSERT(CONN_Q(q));

	if (!getset_ioctl_checks(mp)) {
		miocnak(q, mp, 0, EINVAL);
		return;
	}
	ipst = CONNQ_TO_IPST(q);
	stack = ipst->ips_netstack;
	pioc = (mod_ioc_prop_t *)mp1->b_rptr;

	/* Select the property-info table for the requested protocol. */
	switch (pioc->mpr_proto) {
	case MOD_PROTO_IP:
	case MOD_PROTO_IPV4:
	case MOD_PROTO_IPV6:
		ptbl = ipst->ips_propinfo_tbl;
		break;
	case MOD_PROTO_RAWIP:
		ptbl = stack->netstack_icmp->is_propinfo_tbl;
		break;
	case MOD_PROTO_TCP:
		ptbl = stack->netstack_tcp->tcps_propinfo_tbl;
		break;
	case MOD_PROTO_UDP:
		ptbl = stack->netstack_udp->us_propinfo_tbl;
		break;
	case MOD_PROTO_SCTP:
		ptbl = stack->netstack_sctp->sctps_propinfo_tbl;
		break;
	default:
		miocnak(q, mp, 0, EINVAL);
		return;
	}

	pinfo = mod_prop_lookup(ptbl, pioc->mpr_name, pioc->mpr_proto);
	if (pinfo == NULL) {
		miocnak(q, mp, 0, ENOENT);
		return;
	}

	set = (iocp->ioc_cmd == SIOCSETPROP) ? B_TRUE : B_FALSE;
	if (set && pinfo->mpi_setf != NULL) {
		/* Prefer the dblk credential; fall back to ioc_cr. */
		cr = msg_getcred(mp, NULL);
		if (cr == NULL)
			cr = iocp->ioc_cr;
		err = pinfo->mpi_setf(stack, cr, pinfo, pioc->mpr_ifname,
		    pioc->mpr_val, pioc->mpr_flags);
	} else if (!set && pinfo->mpi_getf != NULL) {
		err = pinfo->mpi_getf(stack, pinfo, pioc->mpr_ifname,
		    pioc->mpr_val, pioc->mpr_valsize, pioc->mpr_flags);
	} else {
		/* Property exists but has no handler for this direction. */
		err = EPERM;
	}

	if (err != 0) {
		miocnak(q, mp, 0, err);
	} else {
		if (set)
			miocack(q, mp, 0, 0);
		else /* For get, we need to return back the data */
			miocack(q, mp, iocp->ioc_count, 0);
	}
}
9012 9010
/*
 * Process the legacy ND_GET, ND_SET ioctls just for {ip|ip6}_forwarding,
 * as several routing daemons have unfortunately used these 'unpublished'
 * but well-known ioctls.  The old ndd names are mapped onto the modern
 * "forwarding" property of the IPv4 or IPv6 protocol.
 */
/* ARGSUSED */
static void
ip_process_legacy_nddprop(queue_t *q, mblk_t *mp)
{
	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
	mblk_t *mp1 = mp->b_cont;
	char *pname, *pval, *buf;
	uint_t bufsize, proto;
	mod_prop_info_t *pinfo = NULL;
	ip_stack_t *ipst;
	int err = 0;

	ASSERT(CONN_Q(q));
	ipst = CONNQ_TO_IPST(q);

	if (iocp->ioc_count == 0 || mp1 == NULL) {
		miocnak(q, mp, 0, EINVAL);
		return;
	}

	mp1->b_datap->db_lim[-1] = '\0'; /* Force null termination */
	pval = buf = pname = (char *)mp1->b_rptr;
	bufsize = MBLKL(mp1);

	/* Only the two legacy forwarding names are accepted. */
	if (strcmp(pname, "ip_forwarding") == 0) {
		pname = "forwarding";
		proto = MOD_PROTO_IPV4;
	} else if (strcmp(pname, "ip6_forwarding") == 0) {
		pname = "forwarding";
		proto = MOD_PROTO_IPV6;
	} else {
		miocnak(q, mp, 0, EINVAL);
		return;
	}

	pinfo = mod_prop_lookup(ipst->ips_propinfo_tbl, pname, proto);

	switch (iocp->ioc_cmd) {
	case ND_GET:
		if ((err = pinfo->mpi_getf(ipst->ips_netstack, pinfo, NULL, buf,
		    bufsize, 0)) == 0) {
			miocack(q, mp, iocp->ioc_count, 0);
			return;
		}
		break;
	case ND_SET:
		/*
		 * buffer will have property name and value in the following
		 * format,
		 * <property name>'\0'<property value>'\0', extract them;
		 */
		while (*pval++)
			noop;	/* advance past the name and its NUL */

		/* Reject an empty value or one outside the message. */
		if (!*pval || pval >= (char *)mp1->b_wptr) {
			err = EINVAL;
		} else if ((err = pinfo->mpi_setf(ipst->ips_netstack, NULL,
		    pinfo, NULL, pval, 0)) == 0) {
			miocack(q, mp, 0, 0);
			return;
		}
		break;
	default:
		err = EINVAL;
		break;
	}
	miocnak(q, mp, 0, err);
}
9086 9084
/*
 * Wrapper function for resuming deferred ioctl processing: simply
 * restarts copyin handling from the top via ip_sioctl_copyin_setup().
 * Used for SIOCGDSTINFO, SIOCGIP6ADDRPOLICY, SIOCGMSFILTER,
 * SIOCSMSFILTER, SIOCGIPMSFILTER, and SIOCSIPMSFILTER currently.
 */
/* ARGSUSED */
void
ip_sioctl_copyin_resume(ipsq_t *dummy_ipsq, queue_t *q, mblk_t *mp,
    void *dummy_arg)
{
	ip_sioctl_copyin_setup(q, mp);
}
9099 9097
/*
 * ip_sioctl_copyin_setup is called by ip_wput_nondata with any M_IOCTL message
 * that arrives.  Most of the IOCTLs are "socket" IOCTLs which we handle
 * in either I_STR or TRANSPARENT form, using the mi_copy facility.
 * We establish here the size of the block to be copied in.  mi_copyin
 * arranges for this to happen, and processing continues in ip_wput_nondata
 * with an M_IOCDATA message.
 */
void
ip_sioctl_copyin_setup(queue_t *q, mblk_t *mp)
{
	int copyin_size;
	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
	ip_ioctl_cmd_t *ipip;
	cred_t *cr;
	ip_stack_t *ipst;

	/* q may belong to a conn (driver) or an ill (module) instance. */
	if (CONN_Q(q))
		ipst = CONNQ_TO_IPST(q);
	else
		ipst = ILLQ_TO_IPST(q);

	ipip = ip_sioctl_lookup(iocp->ioc_cmd);
	if (ipip == NULL) {
		/*
		 * The ioctl is not one we understand or own.
		 * Pass it along to be processed down stream,
		 * if this is a module instance of IP, else nak
		 * the ioctl.
		 */
		if (q->q_next == NULL) {
			goto nak;
		} else {
			putnext(q, mp);
			return;
		}
	}

	/*
	 * If this is deferred, then we will do all the checks when we
	 * come back.
	 */
	if ((iocp->ioc_cmd == SIOCGDSTINFO ||
	    iocp->ioc_cmd == SIOCGIP6ADDRPOLICY) && !ip6_asp_can_lookup(ipst)) {
		ip6_asp_pending_op(q, mp, ip_sioctl_copyin_resume);
		return;
	}

	/*
	 * Only allow a very small subset of IP ioctls on this stream if
	 * IP is a module and not a driver. Allowing ioctls to be processed
	 * in this case may cause assert failures or data corruption.
	 * Typically G[L]IFFLAGS, SLIFNAME/IF_UNITSEL are the only few
	 * ioctls allowed on an IP module stream, after which this stream
	 * normally becomes a multiplexor (at which time the stream head
	 * will fail all ioctls).
	 */
	if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) {
		goto nak;
	}

	/* Make sure we have ioctl data to process. */
	if (mp->b_cont == NULL && !(ipip->ipi_flags & IPI_NULL_BCONT))
		goto nak;

	/*
	 * Prefer dblk credential over ioctl credential; some synthesized
	 * ioctls have kcred set because there's no way to crhold()
	 * a credential in some contexts.  (ioc_cr is not crfree() by
	 * the framework; the caller of ioctl needs to hold the reference
	 * for the duration of the call).
	 */
	cr = msg_getcred(mp, NULL);
	if (cr == NULL)
		cr = iocp->ioc_cr;

	/* Make sure normal users don't send down privileged ioctls */
	if ((ipip->ipi_flags & IPI_PRIV) &&
	    (cr != NULL) && secpolicy_ip_config(cr, B_TRUE) != 0) {
		/* We checked the privilege earlier but log it here */
		miocnak(q, mp, 0, secpolicy_ip_config(cr, B_FALSE));
		return;
	}

	/*
	 * The ioctl command tables can only encode fixed length
	 * ioctl data.  If the length is variable, the table will
	 * encode the length as zero.  Such special cases are handled
	 * below in the switch.
	 */
	if (ipip->ipi_copyin_size != 0) {
		mi_copyin(q, mp, NULL, ipip->ipi_copyin_size);
		return;
	}

	switch (iocp->ioc_cmd) {
	case O_SIOCGIFCONF:
	case SIOCGIFCONF:
		/*
		 * This IOCTL is hilarious.  See comments in
		 * ip_sioctl_get_ifconf for the story.
		 */
		if (iocp->ioc_count == TRANSPARENT)
			copyin_size = SIZEOF_STRUCT(ifconf,
			    iocp->ioc_flag);
		else
			copyin_size = iocp->ioc_count;
		mi_copyin(q, mp, NULL, copyin_size);
		return;

	case O_SIOCGLIFCONF:
	case SIOCGLIFCONF:
		copyin_size = SIZEOF_STRUCT(lifconf, iocp->ioc_flag);
		mi_copyin(q, mp, NULL, copyin_size);
		return;

	case SIOCGLIFSRCOF:
		copyin_size = SIZEOF_STRUCT(lifsrcof, iocp->ioc_flag);
		mi_copyin(q, mp, NULL, copyin_size);
		return;

	case SIOCGIP6ADDRPOLICY:
		ip_sioctl_ip6addrpolicy(q, mp);
		/* Release the hold taken by ip6_asp_can_lookup() above. */
		ip6_asp_table_refrele(ipst);
		return;

	case SIOCSIP6ADDRPOLICY:
		ip_sioctl_ip6addrpolicy(q, mp);
		return;

	case SIOCGDSTINFO:
		ip_sioctl_dstinfo(q, mp);
		ip6_asp_table_refrele(ipst);
		return;

	case ND_SET:
	case ND_GET:
		ip_process_legacy_nddprop(q, mp);
		return;

	case SIOCSETPROP:
	case SIOCGETPROP:
		ip_sioctl_getsetprop(q, mp);
		return;

	case I_PLINK:
	case I_PUNLINK:
	case I_LINK:
	case I_UNLINK:
		/*
		 * We treat non-persistent link similarly as the persistent
		 * link case, in terms of plumbing/unplumbing, as well as
		 * dynamic re-plumbing events indicator.  See comments
		 * in ip_sioctl_plink() for more.
		 *
		 * Request can be enqueued in the 'ipsq' while waiting
		 * to become exclusive.  So bump up the conn ref.
		 */
		if (CONN_Q(q)) {
			CONN_INC_REF(Q_TO_CONN(q));
			CONN_INC_IOCTLREF(Q_TO_CONN(q))
		}
		ip_sioctl_plink(NULL, q, mp, NULL);
		return;

	case IP_IOCTL:
		ip_wput_ioctl(q, mp);
		return;

	case SIOCILB:
		/* The ioctl length varies depending on the ILB command. */
		copyin_size = iocp->ioc_count;
		if (copyin_size < sizeof (ilb_cmd_t))
			goto nak;
		mi_copyin(q, mp, NULL, copyin_size);
		return;

	default:
		cmn_err(CE_WARN, "Unknown ioctl %d/0x%x slipped through.",
		    iocp->ioc_cmd, iocp->ioc_cmd);
		/* FALLTHRU */
	}
nak:
	/* Common nak path: drop the data block and fail with EINVAL. */
	if (mp->b_cont != NULL) {
		freemsg(mp->b_cont);
		mp->b_cont = NULL;
	}
	iocp->ioc_error = EINVAL;
	mp->b_datap->db_type = M_IOCNAK;
	iocp->ioc_count = 0;
	qreply(q, mp);
}
9292 9290
/*
 * Fill in the reply for a SIOCG[X]ARP ioctl: translate the neighbor-cache
 * flags (NCE_F_*) into arp flags (ATF_*) and, when a hardware address is
 * known, copy it into the caller's arpreq/xarpreq.  Any error is reported
 * by setting iocp->ioc_error in place.
 */
static void
ip_sioctl_garp_reply(mblk_t *mp, ill_t *ill, void *hwaddr, int flags)
{
	struct arpreq *ar;
	struct xarpreq *xar;
	mblk_t *tmp;
	struct iocblk *iocp;
	int x_arp_ioctl = B_FALSE;
	int *flagsp;
	char *storage = NULL;

	ASSERT(ill != NULL);

	iocp = (struct iocblk *)mp->b_rptr;
	ASSERT(iocp->ioc_cmd == SIOCGXARP || iocp->ioc_cmd == SIOCGARP);

	tmp = (mp->b_cont)->b_cont; /* xarpreq/arpreq */
	if ((iocp->ioc_cmd == SIOCGXARP) ||
	    (iocp->ioc_cmd == SIOCSXARP)) {
		x_arp_ioctl = B_TRUE;
		xar = (struct xarpreq *)tmp->b_rptr;
		flagsp = &xar->xarp_flags;
		storage = xar->xarp_ha.sdl_data;
	} else {
		ar = (struct arpreq *)tmp->b_rptr;
		flagsp = &ar->arp_flags;
		storage = ar->arp_ha.sa_data;
	}

	/*
	 * For the extended form, fill in the sockaddr_dl header and make
	 * sure the name + hardware address fit in sdl_data.
	 */
	if (x_arp_ioctl) {
		storage += ill_xarp_info(&xar->xarp_ha, ill);
		if ((ill->ill_phys_addr_length + ill->ill_name_length) >
		    sizeof (xar->xarp_ha.sdl_data)) {
			iocp->ioc_error = EINVAL;
			return;
		}
	}
	*flagsp = ATF_INUSE;
	/*
	 * If /sbin/arp told us we are the authority using the "permanent"
	 * flag, or if this is one of my addresses print "permanent"
	 * in the /sbin/arp output.
	 */
	if ((flags & NCE_F_MYADDR) || (flags & NCE_F_AUTHORITY))
		*flagsp |= ATF_AUTHORITY;
	if (flags & NCE_F_NONUD)
		*flagsp |= ATF_PERM; /* not subject to aging */
	if (flags & NCE_F_PUBLISH)
		*flagsp |= ATF_PUBL;
	if (hwaddr != NULL) {
		/* Entry is complete: report the hardware address. */
		*flagsp |= ATF_COM;
		bcopy((char *)hwaddr, storage, ill->ill_phys_addr_length);
	}
}
9350 9348
/*
 * Create a new logical interface.  If ipif_id is zero (i.e. not a logical
 * interface) create the next available logical interface for this
 * physical interface.
 * If ipif is NULL (i.e. the lookup didn't find one) attempt to create an
 * ipif with the specified name.
 *
 * If the address family is not AF_UNSPEC then set the address as well.
 *
 * If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout)
 * is completed when the DL_BIND_ACK arrives in ip_rput_dlpi_writer.
 *
 * Executed as a writer on the ill.
 * So no lock is needed to traverse the ipif chain, or examine the
 * phyint flags.
 */
/* ARGSUSED */
int
ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq)
{
	mblk_t *mp1;
	struct lifreq *lifr;
	boolean_t isv6;
	boolean_t exists;
	char *name;
	char *endp;
	char *cp;
	int namelen;
	ipif_t *ipif;
	long id;
	ipsq_t *ipsq;
	ill_t *ill;
	sin_t *sin;
	int err = 0;
	boolean_t found_sep = B_FALSE;
	conn_t *connp;
	zoneid_t zoneid;
	ip_stack_t *ipst = CONNQ_TO_IPST(q);

	ASSERT(q->q_next == NULL);
	ip1dbg(("ip_sioctl_addif\n"));
	/* Existence of mp1 has been checked in ip_wput_nondata */
	mp1 = mp->b_cont->b_cont;
	/*
	 * Null terminate the string to protect against buffer
	 * overrun.  String was generated by user code and may not
	 * be trusted.
	 */
	lifr = (struct lifreq *)mp1->b_rptr;
	lifr->lifr_name[LIFNAMSIZ - 1] = '\0';
	name = lifr->lifr_name;
	ASSERT(CONN_Q(q));
	connp = Q_TO_CONN(q);
	isv6 = (connp->conn_family == AF_INET6);
	zoneid = connp->conn_zoneid;
	namelen = mi_strlen(name);
	if (namelen == 0)
		return (EINVAL);

	exists = B_FALSE;
	if ((namelen + 1 == sizeof (ipif_loopback_name)) &&
	    (mi_strcmp(name, ipif_loopback_name) == 0)) {
		/*
		 * Allow creating lo0 using SIOCLIFADDIF.
		 * can't be any other writer thread.  So can pass null below
		 * for the last 4 args to ipif_lookup_name.
		 */
		ipif = ipif_lookup_on_name(lifr->lifr_name, namelen, B_TRUE,
		    &exists, isv6, zoneid, ipst);
		/* Prevent any further action */
		if (ipif == NULL) {
			return (ENOBUFS);
		} else if (!exists) {
			/* We created the ipif now and as writer */
			ipif_refrele(ipif);
			return (0);
		} else {
			/*
			 * lo0 already exists; hold the ill so we can
			 * serialize on it below, then drop the ipif ref.
			 */
			ill = ipif->ipif_ill;
			ill_refhold(ill);
			ipif_refrele(ipif);
		}
	} else {
		/* Look for a colon in the name. */
		endp = &name[namelen];
		for (cp = endp; --cp > name; ) {
			if (*cp == IPIF_SEPARATOR_CHAR) {
				found_sep = B_TRUE;
				/*
				 * Reject any non-decimal aliases for plumbing
				 * of logical interfaces.  Aliases with leading
				 * zeroes are also rejected as they introduce
				 * ambiguity in the naming of the interfaces.
				 * Comparing with "0" takes care of all such
				 * cases.
				 */
				if ((strncmp("0", cp+1, 1)) == 0)
					return (EINVAL);

				if (ddi_strtol(cp+1, &endp, 10, &id) != 0 ||
				    id <= 0 || *endp != '\0') {
					return (EINVAL);
				}
				/* Temporarily split name at the separator. */
				*cp = '\0';
				break;
			}
		}
		ill = ill_lookup_on_name(name, B_FALSE, isv6, NULL, ipst);
		if (found_sep)
			*cp = IPIF_SEPARATOR_CHAR;
		if (ill == NULL)
			return (ENXIO);
	}

	ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_process_ioctl, NEW_OP,
	    B_TRUE);

	/*
	 * Release the refhold due to the lookup, now that we are excl
	 * or we are just returning
	 */
	ill_refrele(ill);

	if (ipsq == NULL)
		return (EINPROGRESS);

	/* We are now exclusive on the IPSQ */
	ASSERT(IAM_WRITER_ILL(ill));

	if (found_sep) {
		/* Now see if there is an IPIF with this unit number. */
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if (ipif->ipif_id == id) {
				err = EEXIST;
				goto done;
			}
		}
	}

	/*
	 * We use IRE_LOCAL for lo0:1 etc. for "receive only" use
	 * of lo0.  Plumbing for lo0:0 happens in ipif_lookup_on_name()
	 * instead.
	 */
	if ((ipif = ipif_allocate(ill, found_sep ? id : -1, IRE_LOCAL,
	    B_TRUE, B_TRUE, &err)) == NULL) {
		goto done;
	}

	/* Return created name with ioctl */
	(void) sprintf(lifr->lifr_name, "%s%c%d", ill->ill_name,
	    IPIF_SEPARATOR_CHAR, ipif->ipif_id);
	ip1dbg(("created %s\n", lifr->lifr_name));

	/* Set address */
	sin = (sin_t *)&lifr->lifr_addr;
	if (sin->sin_family != AF_UNSPEC) {
		err = ip_sioctl_addr(ipif, sin, q, mp,
		    &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr);
	}

done:
	ipsq_exit(ipsq);
	return (err);
}
9517 9515
/*
 * Remove an existing logical interface.  If ipif_id is zero (i.e. not a
 * logical interface) delete it based on the IP address (on this physical
 * interface).  Otherwise delete it based on the ipif_id.
 * Also, special handling to allow a removeif of lo0.
 */
/* ARGSUSED */
int
ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *dummy_if_req)
{
	conn_t *connp;
	ill_t *ill = ipif->ipif_ill;
	boolean_t success;
	ip_stack_t *ipst;

	ipst = CONNQ_TO_IPST(q);

	ASSERT(q->q_next == NULL);
	ip1dbg(("ip_sioctl_remove_if(%s:%u %p)\n",
	    ill->ill_name, ipif->ipif_id, (void *)ipif));
	ASSERT(IAM_WRITER_IPIF(ipif));

	connp = Q_TO_CONN(q);
	/*
	 * Special case for unplumbing lo0 (the loopback physical interface).
	 * If unplumbing lo0, the incoming address structure has been
	 * initialized to all zeros.  When unplumbing lo0, all its logical
	 * interfaces must be removed too.
	 *
	 * Note that this interface may be called to remove a specific
	 * loopback logical interface (eg, lo0:1).  But in that case
	 * ipif->ipif_id != 0 so that the code path for that case is the
	 * same as any other interface (meaning it skips the code directly
	 * below).
	 */
	if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) {
		if (sin->sin_family == AF_UNSPEC &&
		    (IN6_IS_ADDR_UNSPECIFIED(&((sin6_t *)sin)->sin6_addr))) {
			/*
			 * Mark it condemned.  No new ref. will be made to ill.
			 */
			mutex_enter(&ill->ill_lock);
			ill->ill_state_flags |= ILL_CONDEMNED;
			for (ipif = ill->ill_ipif; ipif != NULL;
			    ipif = ipif->ipif_next) {
				ipif->ipif_state_flags |= IPIF_CONDEMNED;
			}
			mutex_exit(&ill->ill_lock);

			ipif = ill->ill_ipif;
			/* unplumb the loopback interface */
			ill_delete(ill);
			/* Lock order: conn_lock before ill_lock. */
			mutex_enter(&connp->conn_lock);
			mutex_enter(&ill->ill_lock);

			/* Are any references to this ill active */
			if (ill_is_freeable(ill)) {
				mutex_exit(&ill->ill_lock);
				mutex_exit(&connp->conn_lock);
				ill_delete_tail(ill);
				mi_free(ill);
				return (0);
			}
			/*
			 * References remain; park the ioctl until the last
			 * ref goes away (restart happens via ILL_FREE).
			 */
			success = ipsq_pending_mp_add(connp, ipif,
			    CONNP_TO_WQ(connp), mp, ILL_FREE);
			mutex_exit(&connp->conn_lock);
			mutex_exit(&ill->ill_lock);
			if (success)
				return (EINPROGRESS);
			else
				return (EINTR);
		}
	}

	if (ipif->ipif_id == 0) {
		ipsq_t *ipsq;

		/* Find based on address */
		if (ipif->ipif_isv6) {
			sin6_t *sin6;

			if (sin->sin_family != AF_INET6)
				return (EAFNOSUPPORT);

			sin6 = (sin6_t *)sin;
			/* We are a writer, so we should be able to lookup */
			ipif = ipif_lookup_addr_exact_v6(&sin6->sin6_addr, ill,
			    ipst);
		} else {
			if (sin->sin_family != AF_INET)
				return (EAFNOSUPPORT);

			/* We are a writer, so we should be able to lookup */
			ipif = ipif_lookup_addr_exact(sin->sin_addr.s_addr, ill,
			    ipst);
		}
		if (ipif == NULL) {
			return (EADDRNOTAVAIL);
		}

		/*
		 * It is possible for a user to send an SIOCLIFREMOVEIF with
		 * lifr_name of the physical interface but with an ip address
		 * lifr_addr of a logical interface plumbed over it.
		 * So update ipx_current_ipif now that ipif points to the
		 * correct one.
		 */
		ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq;
		ipsq->ipsq_xop->ipx_current_ipif = ipif;

		/* This is a writer */
		ipif_refrele(ipif);
	}

	/*
	 * Can not delete instance zero since it is tied to the ill.
	 */
	if (ipif->ipif_id == 0)
		return (EBUSY);

	mutex_enter(&ill->ill_lock);
	ipif->ipif_state_flags |= IPIF_CONDEMNED;
	mutex_exit(&ill->ill_lock);

	ipif_free(ipif);

	mutex_enter(&connp->conn_lock);
	mutex_enter(&ill->ill_lock);

	/* Are any references to this ipif active */
	if (ipif_is_freeable(ipif)) {
		mutex_exit(&ill->ill_lock);
		mutex_exit(&connp->conn_lock);
		ipif_non_duplicate(ipif);
		(void) ipif_down_tail(ipif);
		ipif_free_tail(ipif); /* frees ipif */
		return (0);
	}
	/* References remain; defer the free until the refcnt drops to 0. */
	success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), mp,
	    IPIF_FREE);
	mutex_exit(&ill->ill_lock);
	mutex_exit(&connp->conn_lock);
	if (success)
		return (EINPROGRESS);
	else
		return (EINTR);
}
9666 9664
/*
 * Restart the removeif ioctl.  The refcnt has gone down to 0.
 * The ipif is already condemned.  So can't find it thru lookups.
 * Finishes the teardown that ip_sioctl_removeif() deferred.
 */
/* ARGSUSED */
int
ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q,
    mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_if_req)
{
	ill_t *ill = ipif->ipif_ill;

	ASSERT(IAM_WRITER_IPIF(ipif));
	ASSERT(ipif->ipif_state_flags & IPIF_CONDEMNED);

	ip1dbg(("ip_sioctl_removeif_restart(%s:%u %p)\n",
	    ill->ill_name, ipif->ipif_id, (void *)ipif));

	/* The whole-ill (loopback) case: free the entire ill. */
	if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) {
		ASSERT(ill->ill_state_flags & ILL_CONDEMNED);
		ill_delete_tail(ill);
		mi_free(ill);
		return (0);
	}

	ipif_non_duplicate(ipif);
	(void) ipif_down_tail(ipif);
	ipif_free_tail(ipif);

	return (0);
}
9697 9695
/*
 * Set the local interface address using the given prefix and ill_token.
 * The token bits are OR-ed into the caller-supplied prefix in place,
 * and the combined address is then set via ip_sioctl_addr().
 * IPv6 only.
 */
/* ARGSUSED */
int
ip_sioctl_prefix(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq)
{
	int err;
	in6_addr_t v6addr;
	sin6_t *sin6;
	ill_t *ill;
	int i;

	ip1dbg(("ip_sioctl_prefix(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

	ASSERT(IAM_WRITER_IPIF(ipif));

	if (!ipif->ipif_isv6)
		return (EINVAL);

	if (sin->sin_family != AF_INET6)
		return (EAFNOSUPPORT);

	sin6 = (sin6_t *)sin;
	v6addr = sin6->sin6_addr;
	ill = ipif->ipif_ill;

	/* Both a prefix and a token are required to form an address. */
	if (IN6_IS_ADDR_UNSPECIFIED(&v6addr) ||
	    IN6_IS_ADDR_UNSPECIFIED(&ill->ill_token))
		return (EADDRNOTAVAIL);

	/* Combine prefix and token; sin6 is then passed on as the address. */
	for (i = 0; i < 4; i++)
		sin6->sin6_addr.s6_addr32[i] |= ill->ill_token.s6_addr32[i];

	err = ip_sioctl_addr(ipif, sin, q, mp,
	    &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], dummy_ifreq);
	return (err);
}
9738 9736
/*
 * Restart entry point to restart the address set operation after the
 * refcounts have dropped to zero.  Delegates to ip_sioctl_addr_restart().
 */
/* ARGSUSED */
int
ip_sioctl_prefix_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	ip1dbg(("ip_sioctl_prefix_restart(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	return (ip_sioctl_addr_restart(ipif, sin, q, mp, ipip, ifreq));
}
9752 9750
9753 9751 /*
9754 9752 * Set the local interface address.
9755 9753 * Allow an address of all zero when the interface is down.
9756 9754 */
9757 9755 /* ARGSUSED */
9758 9756 int
9759 9757 ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9760 9758 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq)
9761 9759 {
9762 9760 int err = 0;
9763 9761 in6_addr_t v6addr;
9764 9762 boolean_t need_up = B_FALSE;
9765 9763 ill_t *ill;
9766 9764
9767 9765 ip1dbg(("ip_sioctl_addr(%s:%u %p)\n",
9768 9766 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
9769 9767
9770 9768 ASSERT(IAM_WRITER_IPIF(ipif));
9771 9769
9772 9770 ill = ipif->ipif_ill;
9773 9771 if (ipif->ipif_isv6) {
9774 9772 sin6_t *sin6;
9775 9773 phyint_t *phyi;
9776 9774
9777 9775 if (sin->sin_family != AF_INET6)
9778 9776 return (EAFNOSUPPORT);
9779 9777
9780 9778 sin6 = (sin6_t *)sin;
9781 9779 v6addr = sin6->sin6_addr;
9782 9780 phyi = ill->ill_phyint;
9783 9781
9784 9782 /*
9785 9783 * Enforce that true multicast interfaces have a link-local
9786 9784 * address for logical unit 0.
9787 9785 *
9788 9786 * However for those ipif's for which link-local address was
9789 9787 * not created by default, also allow setting :: as the address.
9790 9788 * This scenario would arise, when we delete an address on ipif
9791 9789 * with logical unit 0, we would want to set :: as the address.
9792 9790 */
9793 9791 if (ipif->ipif_id == 0 &&
9794 9792 (ill->ill_flags & ILLF_MULTICAST) &&
9795 9793 !(ipif->ipif_flags & (IPIF_POINTOPOINT)) &&
9796 9794 !(phyi->phyint_flags & (PHYI_LOOPBACK)) &&
9797 9795 !IN6_IS_ADDR_LINKLOCAL(&v6addr)) {
9798 9796
9799 9797 /*
9800 9798 * if default link-local was not created by kernel for
9801 9799 * this ill, allow setting :: as the address on ipif:0.
9802 9800 */
9803 9801 if (ill->ill_flags & ILLF_NOLINKLOCAL) {
9804 9802 if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr))
9805 9803 return (EADDRNOTAVAIL);
9806 9804 } else {
9807 9805 return (EADDRNOTAVAIL);
9808 9806 }
9809 9807 }
9810 9808
9811 9809 /*
9812 9810 * up interfaces shouldn't have the unspecified address
9813 9811 * unless they also have the IPIF_NOLOCAL flags set and
9814 9812 * have a subnet assigned.
9815 9813 */
9816 9814 if ((ipif->ipif_flags & IPIF_UP) &&
9817 9815 IN6_IS_ADDR_UNSPECIFIED(&v6addr) &&
9818 9816 (!(ipif->ipif_flags & IPIF_NOLOCAL) ||
9819 9817 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) {
9820 9818 return (EADDRNOTAVAIL);
9821 9819 }
9822 9820
9823 9821 if (!ip_local_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask))
9824 9822 return (EADDRNOTAVAIL);
9825 9823 } else {
9826 9824 ipaddr_t addr;
9827 9825
9828 9826 if (sin->sin_family != AF_INET)
9829 9827 return (EAFNOSUPPORT);
9830 9828
9831 9829 addr = sin->sin_addr.s_addr;
9832 9830
9833 9831 /* Allow INADDR_ANY as the local address. */
9834 9832 if (addr != INADDR_ANY &&
9835 9833 !ip_addr_ok_v4(addr, ipif->ipif_net_mask))
9836 9834 return (EADDRNOTAVAIL);
9837 9835
9838 9836 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
9839 9837 }
9840 9838 /* verify that the address being configured is permitted by mac */
9841 9839 if (!ill_ipcheck_addr(ill, &v6addr)) {
9842 9840 return (EPERM);
9843 9841 }
9844 9842 /*
9845 9843 * Even if there is no change we redo things just to rerun
9846 9844 * ipif_set_default.
9847 9845 */
9848 9846 if (ipif->ipif_flags & IPIF_UP) {
9849 9847 /*
9850 9848 * Setting a new local address, make sure
9851 9849 * we have net and subnet bcast ire's for
9852 9850 * the old address if we need them.
9853 9851 */
9854 9852 /*
9855 9853 * If the interface is already marked up,
9856 9854 * we call ipif_down which will take care
9857 9855 * of ditching any IREs that have been set
9858 9856 * up based on the old interface address.
9859 9857 */
9860 9858 err = ipif_logical_down(ipif, q, mp);
9861 9859 if (err == EINPROGRESS)
9862 9860 return (err);
9863 9861 (void) ipif_down_tail(ipif);
9864 9862 need_up = 1;
9865 9863 }
9866 9864
9867 9865 err = ip_sioctl_addr_tail(ipif, sin, q, mp, need_up);
9868 9866 return (err);
9869 9867 }
9870 9868
/*
 * Finish setting the local address of `ipif' to the sockaddr `sin' that was
 * already validated by ip_sioctl_addr().  Installs the new address under
 * ill_lock, notifies SCTP and routing-socket listeners, clears any
 * duplicate-address (DAD) state, and — when `need_up' is set — brings the
 * ipif back up.  May return EINPROGRESS, in which case the ioctl completes
 * later in ip_rput_dlpi when the DL_BIND_ACK arrives.
 * Runs as the exclusive writer on the ipif.
 */
int
ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    boolean_t need_up)
{
	in6_addr_t v6addr;
	in6_addr_t ov6addr;
	ipaddr_t addr;
	sin6_t	*sin6;
	int	sinlen;
	int	err = 0;
	ill_t	*ill = ipif->ipif_ill;
	boolean_t need_dl_down;
	boolean_t need_arp_down;
	struct iocblk *iocp;

	/* mp may be NULL when invoked internally with no ioctl in flight */
	iocp = (mp != NULL) ? (struct iocblk *)mp->b_rptr : NULL;

	ip1dbg(("ip_sioctl_addr_tail(%s:%u %p)\n",
	    ill->ill_name, ipif->ipif_id, (void *)ipif));
	ASSERT(IAM_WRITER_IPIF(ipif));

	/* Must cancel any pending timer before taking the ill_lock */
	if (ipif->ipif_recovery_id != 0)
		(void) untimeout(ipif->ipif_recovery_id);
	ipif->ipif_recovery_id = 0;

	/* Extract the new address and remember the sockaddr length. */
	if (ipif->ipif_isv6) {
		sin6 = (sin6_t *)sin;
		v6addr = sin6->sin6_addr;
		sinlen = sizeof (struct sockaddr_in6);
	} else {
		addr = sin->sin_addr.s_addr;
		/* v4 addresses are stored internally in v4-mapped v6 form */
		IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
		sinlen = sizeof (struct sockaddr_in);
	}
	mutex_enter(&ill->ill_lock);
	ov6addr = ipif->ipif_v6lcl_addr;
	ipif->ipif_v6lcl_addr = v6addr;
	/* Let SCTP rework its associations for the address change. */
	sctp_update_ipif_addr(ipif, ov6addr);
	/* New address has not yet passed duplicate-address detection. */
	ipif->ipif_addr_ready = 0;

	ip_rts_newaddrmsg(RTM_CHGADDR, 0, ipif, RTSQ_DEFAULT);

	/*
	 * If the interface was previously marked as a duplicate, then since
	 * we've now got a "new" address, it should no longer be considered a
	 * duplicate -- even if the "new" address is the same as the old one.
	 * Note that if all ipifs are down, we may have a pending ARP down
	 * event to handle. This is because we want to recover from duplicates
	 * and thus delay tearing down ARP until the duplicates have been
	 * removed or disabled.
	 */
	need_dl_down = need_arp_down = B_FALSE;
	if (ipif->ipif_flags & IPIF_DUPLICATE) {
		need_arp_down = !need_up;
		ipif->ipif_flags &= ~IPIF_DUPLICATE;
		if (--ill->ill_ipif_dup_count == 0 && !need_up &&
		    ill->ill_ipif_up_count == 0 && ill->ill_dl_up) {
			need_dl_down = B_TRUE;
		}
	}

	ipif_set_default(ipif);

	/*
	 * If we've just manually set the IPv6 link-local address (0th ipif),
	 * tag the ill so that future updates to the interface ID don't result
	 * in this address getting automatically reconfigured from under the
	 * administrator.
	 */
	if (ipif->ipif_isv6 && ipif->ipif_id == 0) {
		if (iocp == NULL || (iocp->ioc_cmd == SIOCSLIFADDR &&
		    !IN6_IS_ADDR_UNSPECIFIED(&v6addr)))
			ill->ill_manual_linklocal = 1;
	}

	/*
	 * When publishing an interface address change event, we only notify
	 * the event listeners of the new address.  It is assumed that if they
	 * actively care about the addresses assigned that they will have
	 * already discovered the previous address assigned (if there was one.)
	 *
	 * Don't attach nic event message for SIOCLIFADDIF ioctl.
	 */
	if (iocp != NULL && iocp->ioc_cmd != SIOCLIFADDIF) {
		ill_nic_event_dispatch(ill, MAP_IPIF_ID(ipif->ipif_id),
		    NE_ADDRESS_CHANGE, sin, sinlen);
	}

	mutex_exit(&ill->ill_lock);

	if (need_up) {
		/*
		 * Now bring the interface back up.  If this
		 * is the only IPIF for the ILL, ipif_up
		 * will have to re-bind to the device, so
		 * we may get back EINPROGRESS, in which
		 * case, this IOCTL will get completed in
		 * ip_rput_dlpi when we see the DL_BIND_ACK.
		 */
		err = ipif_up(ipif, q, mp);
	} else {
		/* Perhaps ilgs should use this ill */
		update_conn_ill(NULL, ill->ill_ipst);
	}

	if (need_dl_down)
		ill_dl_down(ill);

	if (need_arp_down && !ill->ill_isv6)
		(void) ipif_arp_down(ipif);

	/*
	 * The default multicast interface might have changed (for
	 * instance if the IPv6 scope of the address changed)
	 */
	ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6);

	return (err);
}
9991 9989
/*
 * Restart entry point to restart the address set operation after the
 * refcounts have dropped to zero.  Invoked from the ipsq machinery once
 * the ipif has quiesced; completes the deferred down and then redoes
 * the address assignment via ip_sioctl_addr_tail().
 */
/* ARGSUSED */
int
ip_sioctl_addr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	ip1dbg(("ip_sioctl_addr_restart(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	ASSERT(IAM_WRITER_IPIF(ipif));
	/* Finish the interrupted ipif_down before re-applying the address. */
	(void) ipif_down_tail(ipif);
	return (ip_sioctl_addr_tail(ipif, sin, q, mp, B_TRUE));
}
10007 10005
10008 10006 /* ARGSUSED */
10009 10007 int
10010 10008 ip_sioctl_get_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10011 10009 ip_ioctl_cmd_t *ipip, void *if_req)
10012 10010 {
10013 10011 sin6_t *sin6 = (struct sockaddr_in6 *)sin;
10014 10012 struct lifreq *lifr = (struct lifreq *)if_req;
10015 10013
10016 10014 ip1dbg(("ip_sioctl_get_addr(%s:%u %p)\n",
10017 10015 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10018 10016 /*
10019 10017 * The net mask and address can't change since we have a
10020 10018 * reference to the ipif. So no lock is necessary.
10021 10019 */
10022 10020 if (ipif->ipif_isv6) {
10023 10021 *sin6 = sin6_null;
10024 10022 sin6->sin6_family = AF_INET6;
10025 10023 sin6->sin6_addr = ipif->ipif_v6lcl_addr;
10026 10024 if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) {
10027 10025 sin6->sin6_scope_id =
10028 10026 ipif->ipif_ill->ill_phyint->phyint_ifindex;
10029 10027 }
10030 10028 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
10031 10029 lifr->lifr_addrlen =
10032 10030 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
10033 10031 } else {
10034 10032 *sin = sin_null;
10035 10033 sin->sin_family = AF_INET;
10036 10034 sin->sin_addr.s_addr = ipif->ipif_lcl_addr;
10037 10035 if (ipip->ipi_cmd_type == LIF_CMD) {
10038 10036 lifr->lifr_addrlen =
10039 10037 ip_mask_to_plen(ipif->ipif_net_mask);
10040 10038 }
10041 10039 }
10042 10040 return (0);
10043 10041 }
10044 10042
/*
 * Set the destination address for a pt-pt interface.
 *
 * Validates the sockaddr family and that the address is acceptable for the
 * ipif's netmask, then hands off to ip_sioctl_dstaddr_tail().  If the ipif
 * is currently up it is logically downed first so that IREs derived from
 * the old destination are torn down; EINPROGRESS may be returned, in which
 * case the ioctl completes later in ip_rput_dlpi_writer.
 */
/* ARGSUSED */
int
ip_sioctl_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	int err = 0;
	in6_addr_t v6addr;
	boolean_t need_up = B_FALSE;

	ip1dbg(("ip_sioctl_dstaddr(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	ASSERT(IAM_WRITER_IPIF(ipif));

	/* Validate family and address against the ipif's netmask. */
	if (ipif->ipif_isv6) {
		sin6_t *sin6;

		if (sin->sin_family != AF_INET6)
			return (EAFNOSUPPORT);

		sin6 = (sin6_t *)sin;
		v6addr = sin6->sin6_addr;

		if (!ip_remote_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask))
			return (EADDRNOTAVAIL);
	} else {
		ipaddr_t addr;

		if (sin->sin_family != AF_INET)
			return (EAFNOSUPPORT);

		addr = sin->sin_addr.s_addr;
		/* INADDR_ANY is allowed; anything else must pass mask check */
		if (addr != INADDR_ANY &&
		    !ip_addr_ok_v4(addr, ipif->ipif_net_mask)) {
			return (EADDRNOTAVAIL);
		}

		/* v4 destinations are stored in v4-mapped v6 form. */
		IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
	}

	if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, &v6addr))
		return (0);	/* No change */

	if (ipif->ipif_flags & IPIF_UP) {
		/*
		 * If the interface is already marked up,
		 * we call ipif_down which will take care
		 * of ditching any IREs that have been set
		 * up based on the old pp dst address.
		 */
		err = ipif_logical_down(ipif, q, mp);
		if (err == EINPROGRESS)
			return (err);
		(void) ipif_down_tail(ipif);
		need_up = B_TRUE;
	}
	/*
	 * could return EINPROGRESS. If so ioctl will complete in
	 * ip_rput_dlpi_writer
	 */
	err = ip_sioctl_dstaddr_tail(ipif, sin, q, mp, need_up);
	return (err);
}
10110 10108
/*
 * Finish setting the pt-pt destination address of `ipif' to the (already
 * validated) sockaddr `sin'.  Installs the new destination and mirrors it
 * into the subnet under ill_lock, clears duplicate-address state, and if
 * `need_up' is set brings the ipif back up (possibly returning EINPROGRESS,
 * completed later in ip_rput_dlpi).
 */
static int
ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    boolean_t need_up)
{
	in6_addr_t v6addr;
	ill_t	*ill = ipif->ipif_ill;
	int	err = 0;
	boolean_t need_dl_down;
	boolean_t need_arp_down;

	ip1dbg(("ip_sioctl_dstaddr_tail(%s:%u %p)\n", ill->ill_name,
	    ipif->ipif_id, (void *)ipif));

	/* Must cancel any pending timer before taking the ill_lock */
	if (ipif->ipif_recovery_id != 0)
		(void) untimeout(ipif->ipif_recovery_id);
	ipif->ipif_recovery_id = 0;

	/* Extract the destination; v4 is kept in v4-mapped v6 form. */
	if (ipif->ipif_isv6) {
		sin6_t *sin6;

		sin6 = (sin6_t *)sin;
		v6addr = sin6->sin6_addr;
	} else {
		ipaddr_t addr;

		addr = sin->sin_addr.s_addr;
		IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
	}
	mutex_enter(&ill->ill_lock);
	/* Set point to point destination address. */
	if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
		/*
		 * Allow this as a means of creating logical
		 * pt-pt interfaces on top of e.g. an Ethernet.
		 * XXX Undocumented HACK for testing.
		 * pt-pt interfaces are created with NUD disabled.
		 */
		ipif->ipif_flags |= IPIF_POINTOPOINT;
		ipif->ipif_flags &= ~IPIF_BROADCAST;
		if (ipif->ipif_isv6)
			ill->ill_flags |= ILLF_NONUD;
	}

	/*
	 * If the interface was previously marked as a duplicate, then since
	 * we've now got a "new" address, it should no longer be considered a
	 * duplicate -- even if the "new" address is the same as the old one.
	 * Note that if all ipifs are down, we may have a pending ARP down
	 * event to handle.
	 */
	need_dl_down = need_arp_down = B_FALSE;
	if (ipif->ipif_flags & IPIF_DUPLICATE) {
		need_arp_down = !need_up;
		ipif->ipif_flags &= ~IPIF_DUPLICATE;
		if (--ill->ill_ipif_dup_count == 0 && !need_up &&
		    ill->ill_ipif_up_count == 0 && ill->ill_dl_up) {
			need_dl_down = B_TRUE;
		}
	}

	/*
	 * If we've just manually set the IPv6 destination link-local address
	 * (0th ipif), tag the ill so that future updates to the destination
	 * interface ID (as can happen with interfaces over IP tunnels) don't
	 * result in this address getting automatically reconfigured from
	 * under the administrator.
	 */
	if (ipif->ipif_isv6 && ipif->ipif_id == 0)
		ill->ill_manual_dst_linklocal = 1;

	/* Set the new address. */
	ipif->ipif_v6pp_dst_addr = v6addr;
	/* Make sure subnet tracks pp_dst */
	ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
	mutex_exit(&ill->ill_lock);

	if (need_up) {
		/*
		 * Now bring the interface back up.  If this
		 * is the only IPIF for the ILL, ipif_up
		 * will have to re-bind to the device, so
		 * we may get back EINPROGRESS, in which
		 * case, this IOCTL will get completed in
		 * ip_rput_dlpi when we see the DL_BIND_ACK.
		 */
		err = ipif_up(ipif, q, mp);
	}

	if (need_dl_down)
		ill_dl_down(ill);
	if (need_arp_down && !ipif->ipif_isv6)
		(void) ipif_arp_down(ipif);

	return (err);
}
10207 10205
10208 10206 /*
10209 10207 * Restart entry point to restart the dstaddress set operation after the
10210 10208 * refcounts have dropped to zero.
10211 10209 */
10212 10210 /* ARGSUSED */
10213 10211 int
10214 10212 ip_sioctl_dstaddr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10215 10213 ip_ioctl_cmd_t *ipip, void *ifreq)
10216 10214 {
10217 10215 ip1dbg(("ip_sioctl_dstaddr_restart(%s:%u %p)\n",
10218 10216 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10219 10217 (void) ipif_down_tail(ipif);
10220 10218 return (ip_sioctl_dstaddr_tail(ipif, sin, q, mp, B_TRUE));
10221 10219 }
10222 10220
10223 10221 /* ARGSUSED */
10224 10222 int
10225 10223 ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10226 10224 ip_ioctl_cmd_t *ipip, void *if_req)
10227 10225 {
10228 10226 sin6_t *sin6 = (struct sockaddr_in6 *)sin;
10229 10227
10230 10228 ip1dbg(("ip_sioctl_get_dstaddr(%s:%u %p)\n",
10231 10229 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10232 10230 /*
10233 10231 * Get point to point destination address. The addresses can't
10234 10232 * change since we hold a reference to the ipif.
10235 10233 */
10236 10234 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0)
10237 10235 return (EADDRNOTAVAIL);
10238 10236
10239 10237 if (ipif->ipif_isv6) {
10240 10238 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
10241 10239 *sin6 = sin6_null;
10242 10240 sin6->sin6_family = AF_INET6;
10243 10241 sin6->sin6_addr = ipif->ipif_v6pp_dst_addr;
10244 10242 } else {
10245 10243 *sin = sin_null;
10246 10244 sin->sin_family = AF_INET;
10247 10245 sin->sin_addr.s_addr = ipif->ipif_pp_dst_addr;
10248 10246 }
10249 10247 return (0);
10250 10248 }
10251 10249
10252 10250 /*
10253 10251 * Check which flags will change by the given flags being set
10254 10252 * silently ignore flags which userland is not allowed to control.
10255 10253 * (Because these flags may change between SIOCGLIFFLAGS and
10256 10254 * SIOCSLIFFLAGS, and that's outside of userland's control,
10257 10255 * we need to silently ignore them rather than fail.)
10258 10256 */
10259 10257 static void
10260 10258 ip_sioctl_flags_onoff(ipif_t *ipif, uint64_t flags, uint64_t *onp,
10261 10259 uint64_t *offp)
10262 10260 {
10263 10261 ill_t *ill = ipif->ipif_ill;
10264 10262 phyint_t *phyi = ill->ill_phyint;
10265 10263 uint64_t cantchange_flags, intf_flags;
10266 10264 uint64_t turn_on, turn_off;
10267 10265
10268 10266 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags;
10269 10267 cantchange_flags = IFF_CANTCHANGE;
10270 10268 if (IS_IPMP(ill))
10271 10269 cantchange_flags |= IFF_IPMP_CANTCHANGE;
10272 10270 turn_on = (flags ^ intf_flags) & ~cantchange_flags;
10273 10271 turn_off = intf_flags & turn_on;
10274 10272 turn_on ^= turn_off;
10275 10273 *onp = turn_on;
10276 10274 *offp = turn_off;
10277 10275 }
10278 10276
10279 10277 /*
10280 10278 * Set interface flags. Many flags require special handling (e.g.,
10281 10279 * bringing the interface down); see below for details.
10282 10280 *
10283 10281 * NOTE : We really don't enforce that ipif_id zero should be used
10284 10282 * for setting any flags other than IFF_LOGINT_FLAGS. This
10285 10283 * is because applications generally does SICGLIFFLAGS and
10286 10284 * ORs in the new flags (that affects the logical) and does a
10287 10285 * SIOCSLIFFLAGS. Thus, "flags" below could contain bits other
10288 10286 * than IFF_LOGINT_FLAGS. One could check whether "turn_on" - the
10289 10287 * flags that will be turned on is correct with respect to
10290 10288 * ipif_id 0. For backward compatibility reasons, it is not done.
10291 10289 */
10292 10290 /* ARGSUSED */
10293 10291 int
10294 10292 ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10295 10293 ip_ioctl_cmd_t *ipip, void *if_req)
10296 10294 {
10297 10295 uint64_t turn_on;
10298 10296 uint64_t turn_off;
10299 10297 int err = 0;
10300 10298 phyint_t *phyi;
10301 10299 ill_t *ill;
10302 10300 conn_t *connp;
10303 10301 uint64_t intf_flags;
10304 10302 boolean_t phyint_flags_modified = B_FALSE;
10305 10303 uint64_t flags;
10306 10304 struct ifreq *ifr;
10307 10305 struct lifreq *lifr;
10308 10306 boolean_t set_linklocal = B_FALSE;
10309 10307
10310 10308 ip1dbg(("ip_sioctl_flags(%s:%u %p)\n",
10311 10309 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10312 10310
10313 10311 ASSERT(IAM_WRITER_IPIF(ipif));
10314 10312
10315 10313 ill = ipif->ipif_ill;
10316 10314 phyi = ill->ill_phyint;
10317 10315
10318 10316 if (ipip->ipi_cmd_type == IF_CMD) {
10319 10317 ifr = (struct ifreq *)if_req;
10320 10318 flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff);
10321 10319 } else {
10322 10320 lifr = (struct lifreq *)if_req;
10323 10321 flags = lifr->lifr_flags;
10324 10322 }
10325 10323
10326 10324 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags;
10327 10325
10328 10326 /*
10329 10327 * Have the flags been set correctly until now?
10330 10328 */
10331 10329 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0);
10332 10330 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0);
10333 10331 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0);
10334 10332 /*
10335 10333 * Compare the new flags to the old, and partition
10336 10334 * into those coming on and those going off.
10337 10335 * For the 16 bit command keep the bits above bit 16 unchanged.
10338 10336 */
10339 10337 if (ipip->ipi_cmd == SIOCSIFFLAGS)
10340 10338 flags |= intf_flags & ~0xFFFF;
10341 10339
10342 10340 /*
10343 10341 * Explicitly fail attempts to change flags that are always invalid on
10344 10342 * an IPMP meta-interface.
10345 10343 */
10346 10344 if (IS_IPMP(ill) && ((flags ^ intf_flags) & IFF_IPMP_INVALID))
10347 10345 return (EINVAL);
10348 10346
10349 10347 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off);
10350 10348 if ((turn_on|turn_off) == 0)
10351 10349 return (0); /* No change */
10352 10350
10353 10351 /*
10354 10352 * All test addresses must be IFF_DEPRECATED (to ensure source address
10355 10353 * selection avoids them) -- so force IFF_DEPRECATED on, and do not
10356 10354 * allow it to be turned off.
10357 10355 */
10358 10356 if ((turn_off & (IFF_DEPRECATED|IFF_NOFAILOVER)) == IFF_DEPRECATED &&
10359 10357 (turn_on|intf_flags) & IFF_NOFAILOVER)
10360 10358 return (EINVAL);
10361 10359
10362 10360 if ((connp = Q_TO_CONN(q)) == NULL)
10363 10361 return (EINVAL);
10364 10362
10365 10363 /*
10366 10364 * Only vrrp control socket is allowed to change IFF_UP and
10367 10365 * IFF_NOACCEPT flags when IFF_VRRP is set.
10368 10366 */
10369 10367 if ((intf_flags & IFF_VRRP) && ((turn_off | turn_on) & IFF_UP)) {
10370 10368 if (!connp->conn_isvrrp)
10371 10369 return (EINVAL);
10372 10370 }
10373 10371
10374 10372 /*
10375 10373 * The IFF_NOACCEPT flag can only be set on an IFF_VRRP IP address by
10376 10374 * VRRP control socket.
10377 10375 */
10378 10376 if ((turn_off | turn_on) & IFF_NOACCEPT) {
10379 10377 if (!connp->conn_isvrrp || !(intf_flags & IFF_VRRP))
10380 10378 return (EINVAL);
10381 10379 }
10382 10380
10383 10381 if (turn_on & IFF_NOFAILOVER) {
10384 10382 turn_on |= IFF_DEPRECATED;
10385 10383 flags |= IFF_DEPRECATED;
10386 10384 }
10387 10385
10388 10386 /*
10389 10387 * On underlying interfaces, only allow applications to manage test
10390 10388 * addresses -- otherwise, they may get confused when the address
10391 10389 * moves as part of being brought up. Likewise, prevent an
10392 10390 * application-managed test address from being converted to a data
10393 10391 * address. To prevent migration of administratively up addresses in
10394 10392 * the kernel, we don't allow them to be converted either.
10395 10393 */
10396 10394 if (IS_UNDER_IPMP(ill)) {
10397 10395 const uint64_t appflags = IFF_DHCPRUNNING | IFF_ADDRCONF;
10398 10396
10399 10397 if ((turn_on & appflags) && !(flags & IFF_NOFAILOVER))
10400 10398 return (EINVAL);
10401 10399
10402 10400 if ((turn_off & IFF_NOFAILOVER) &&
10403 10401 (flags & (appflags | IFF_UP | IFF_DUPLICATE)))
10404 10402 return (EINVAL);
10405 10403 }
10406 10404
10407 10405 /*
10408 10406 * Only allow IFF_TEMPORARY flag to be set on
10409 10407 * IPv6 interfaces.
10410 10408 */
10411 10409 if ((turn_on & IFF_TEMPORARY) && !(ipif->ipif_isv6))
10412 10410 return (EINVAL);
10413 10411
10414 10412 /*
10415 10413 * cannot turn off IFF_NOXMIT on VNI interfaces.
10416 10414 */
10417 10415 if ((turn_off & IFF_NOXMIT) && IS_VNI(ipif->ipif_ill))
10418 10416 return (EINVAL);
10419 10417
10420 10418 /*
10421 10419 * Don't allow the IFF_ROUTER flag to be turned on on loopback
10422 10420 * interfaces. It makes no sense in that context.
10423 10421 */
10424 10422 if ((turn_on & IFF_ROUTER) && (phyi->phyint_flags & PHYI_LOOPBACK))
10425 10423 return (EINVAL);
10426 10424
10427 10425 /*
10428 10426 * For IPv6 ipif_id 0, don't allow the interface to be up without
10429 10427 * a link local address if IFF_NOLOCAL or IFF_ANYCAST are not set.
10430 10428 * If the link local address isn't set, and can be set, it will get
10431 10429 * set later on in this function.
10432 10430 */
10433 10431 if (ipif->ipif_id == 0 && ipif->ipif_isv6 &&
10434 10432 (flags & IFF_UP) && !(flags & (IFF_NOLOCAL|IFF_ANYCAST)) &&
10435 10433 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) {
10436 10434 if (ipif_cant_setlinklocal(ipif))
10437 10435 return (EINVAL);
10438 10436 set_linklocal = B_TRUE;
10439 10437 }
10440 10438
10441 10439 /*
10442 10440 * If we modify physical interface flags, we'll potentially need to
10443 10441 * send up two routing socket messages for the changes (one for the
10444 10442 * IPv4 ill, and another for the IPv6 ill). Note that here.
10445 10443 */
10446 10444 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS)
10447 10445 phyint_flags_modified = B_TRUE;
10448 10446
10449 10447 /*
10450 10448 * All functioning PHYI_STANDBY interfaces start life PHYI_INACTIVE
10451 10449 * (otherwise, we'd immediately use them, defeating standby). Also,
10452 10450 * since PHYI_INACTIVE has a separate meaning when PHYI_STANDBY is not
10453 10451 * set, don't allow PHYI_STANDBY to be set if PHYI_INACTIVE is already
10454 10452 * set, and clear PHYI_INACTIVE if PHYI_STANDBY is being cleared. We
10455 10453 * also don't allow PHYI_STANDBY if VNI is enabled since its semantics
10456 10454 * will not be honored.
10457 10455 */
10458 10456 if (turn_on & PHYI_STANDBY) {
10459 10457 /*
10460 10458 * No need to grab ill_g_usesrc_lock here; see the
10461 10459 * synchronization notes in ip.c.
10462 10460 */
10463 10461 if (ill->ill_usesrc_grp_next != NULL ||
10464 10462 intf_flags & PHYI_INACTIVE)
10465 10463 return (EINVAL);
10466 10464 if (!(flags & PHYI_FAILED)) {
10467 10465 flags |= PHYI_INACTIVE;
10468 10466 turn_on |= PHYI_INACTIVE;
10469 10467 }
10470 10468 }
10471 10469
10472 10470 if (turn_off & PHYI_STANDBY) {
10473 10471 flags &= ~PHYI_INACTIVE;
10474 10472 turn_off |= PHYI_INACTIVE;
10475 10473 }
10476 10474
10477 10475 /*
10478 10476 * PHYI_FAILED and PHYI_INACTIVE are mutually exclusive; fail if both
10479 10477 * would end up on.
10480 10478 */
10481 10479 if ((flags & (PHYI_FAILED | PHYI_INACTIVE)) ==
10482 10480 (PHYI_FAILED | PHYI_INACTIVE))
10483 10481 return (EINVAL);
10484 10482
10485 10483 /*
10486 10484 * If ILLF_ROUTER changes, we need to change the ip forwarding
10487 10485 * status of the interface.
10488 10486 */
10489 10487 if ((turn_on | turn_off) & ILLF_ROUTER) {
10490 10488 err = ill_forward_set(ill, ((turn_on & ILLF_ROUTER) != 0));
10491 10489 if (err != 0)
10492 10490 return (err);
10493 10491 }
10494 10492
10495 10493 /*
10496 10494 * If the interface is not UP and we are not going to
10497 10495 * bring it UP, record the flags and return. When the
10498 10496 * interface comes UP later, the right actions will be
10499 10497 * taken.
10500 10498 */
10501 10499 if (!(ipif->ipif_flags & IPIF_UP) &&
10502 10500 !(turn_on & IPIF_UP)) {
10503 10501 /* Record new flags in their respective places. */
10504 10502 mutex_enter(&ill->ill_lock);
10505 10503 mutex_enter(&ill->ill_phyint->phyint_lock);
10506 10504 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS);
10507 10505 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS);
10508 10506 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS);
10509 10507 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS);
10510 10508 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS);
10511 10509 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS);
10512 10510 mutex_exit(&ill->ill_lock);
10513 10511 mutex_exit(&ill->ill_phyint->phyint_lock);
10514 10512
10515 10513 /*
10516 10514 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the
10517 10515 * same to the kernel: if any of them has been set by
10518 10516 * userland, the interface cannot be used for data traffic.
10519 10517 */
10520 10518 if ((turn_on|turn_off) &
10521 10519 (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) {
10522 10520 ASSERT(!IS_IPMP(ill));
10523 10521 /*
10524 10522 * It's possible the ill is part of an "anonymous"
10525 10523 * IPMP group rather than a real group. In that case,
10526 10524 * there are no other interfaces in the group and thus
10527 10525 * no need to call ipmp_phyint_refresh_active().
10528 10526 */
10529 10527 if (IS_UNDER_IPMP(ill))
10530 10528 ipmp_phyint_refresh_active(phyi);
10531 10529 }
10532 10530
10533 10531 if (phyint_flags_modified) {
10534 10532 if (phyi->phyint_illv4 != NULL) {
10535 10533 ip_rts_ifmsg(phyi->phyint_illv4->
10536 10534 ill_ipif, RTSQ_DEFAULT);
10537 10535 }
10538 10536 if (phyi->phyint_illv6 != NULL) {
10539 10537 ip_rts_ifmsg(phyi->phyint_illv6->
10540 10538 ill_ipif, RTSQ_DEFAULT);
10541 10539 }
10542 10540 }
10543 10541 /* The default multicast interface might have changed */
10544 10542 ire_increment_multicast_generation(ill->ill_ipst,
10545 10543 ill->ill_isv6);
10546 10544
10547 10545 return (0);
10548 10546 } else if (set_linklocal) {
10549 10547 mutex_enter(&ill->ill_lock);
10550 10548 if (set_linklocal)
10551 10549 ipif->ipif_state_flags |= IPIF_SET_LINKLOCAL;
10552 10550 mutex_exit(&ill->ill_lock);
10553 10551 }
10554 10552
10555 10553 /*
10556 10554 * Disallow IPv6 interfaces coming up that have the unspecified address,
10557 10555 * or point-to-point interfaces with an unspecified destination. We do
10558 10556 * allow the address to be unspecified for IPIF_NOLOCAL interfaces that
10559 10557 * have a subnet assigned, which is how in.ndpd currently manages its
10560 10558 * onlink prefix list when no addresses are configured with those
10561 10559 * prefixes.
10562 10560 */
10563 10561 if (ipif->ipif_isv6 &&
10564 10562 ((IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
10565 10563 (!(ipif->ipif_flags & IPIF_NOLOCAL) && !(turn_on & IPIF_NOLOCAL) ||
10566 10564 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) ||
10567 10565 ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
10568 10566 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)))) {
10569 10567 return (EINVAL);
10570 10568 }
10571 10569
10572 10570 /*
10573 10571 * Prevent IPv4 point-to-point interfaces with a 0.0.0.0 destination
10574 10572 * from being brought up.
10575 10573 */
10576 10574 if (!ipif->ipif_isv6 &&
10577 10575 ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
10578 10576 ipif->ipif_pp_dst_addr == INADDR_ANY)) {
10579 10577 return (EINVAL);
10580 10578 }
10581 10579
10582 10580 /*
10583 10581 * If we are going to change one or more of the flags that are
10584 10582 * IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, ILLF_NOARP,
10585 10583 * ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, IPIF_PREFERRED, and
10586 10584 * IPIF_NOFAILOVER, we will take special action. This is
10587 10585 * done by bring the ipif down, changing the flags and bringing
10588 10586 * it back up again. For IPIF_NOFAILOVER, the act of bringing it
10589 10587 * back up will trigger the address to be moved.
10590 10588 *
10591 10589 * If we are going to change IFF_NOACCEPT, we need to bring
10592 10590 * all the ipifs down then bring them up again. The act of
10593 10591 * bringing all the ipifs back up will trigger the local
10594 10592 * ires being recreated with "no_accept" set/cleared.
10595 10593 *
10596 10594 * Note that ILLF_NOACCEPT is always set separately from the
10597 10595 * other flags.
10598 10596 */
10599 10597 if ((turn_on|turn_off) &
10600 10598 (IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP|
10601 10599 ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED|
10602 10600 IPIF_NOFAILOVER)) {
10603 10601 /*
10604 10602 * ipif_down() will ire_delete bcast ire's for the subnet,
10605 10603 * while the ire_identical_ref tracks the case of IRE_BROADCAST
10606 10604 * entries shared between multiple ipifs on the same subnet.
10607 10605 */
10608 10606 if (((ipif->ipif_flags | turn_on) & IPIF_UP) &&
10609 10607 !(turn_off & IPIF_UP)) {
10610 10608 if (ipif->ipif_flags & IPIF_UP)
10611 10609 ill->ill_logical_down = 1;
10612 10610 turn_on &= ~IPIF_UP;
10613 10611 }
10614 10612 err = ipif_down(ipif, q, mp);
10615 10613 ip1dbg(("ipif_down returns %d err ", err));
10616 10614 if (err == EINPROGRESS)
10617 10615 return (err);
10618 10616 (void) ipif_down_tail(ipif);
10619 10617 } else if ((turn_on|turn_off) & ILLF_NOACCEPT) {
10620 10618 /*
10621 10619 * If we can quiesce the ill, then continue. If not, then
10622 10620 * ip_sioctl_flags_tail() will be called from
10623 10621 * ipif_ill_refrele_tail().
10624 10622 */
10625 10623 ill_down_ipifs(ill, B_TRUE);
10626 10624
10627 10625 mutex_enter(&connp->conn_lock);
10628 10626 mutex_enter(&ill->ill_lock);
10629 10627 if (!ill_is_quiescent(ill)) {
10630 10628 boolean_t success;
10631 10629
10632 10630 success = ipsq_pending_mp_add(connp, ill->ill_ipif,
10633 10631 q, mp, ILL_DOWN);
10634 10632 mutex_exit(&ill->ill_lock);
10635 10633 mutex_exit(&connp->conn_lock);
10636 10634 return (success ? EINPROGRESS : EINTR);
10637 10635 }
10638 10636 mutex_exit(&ill->ill_lock);
10639 10637 mutex_exit(&connp->conn_lock);
10640 10638 }
10641 10639 return (ip_sioctl_flags_tail(ipif, flags, q, mp));
10642 10640 }
10643 10641
/*
 * Second half of a SIOC[S]LIFFLAGS operation: apply the already-validated
 * flag changes to the ipif/ill/phyint under the appropriate locks, then
 * perform the follow-up work the change requires (link-local regeneration,
 * IPMP active-set refresh, bringing ipifs back up, routing socket and SCTP
 * notification).  Called from ip_sioctl_flags() and
 * ip_sioctl_flags_restart().  Returns 0, EINPROGRESS, or an errno from
 * ill_up_ipifs()/ipif_up().
 */
static int
ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp)
{
	ill_t		*ill;
	phyint_t	*phyi;
	uint64_t	turn_on, turn_off;
	boolean_t	phyint_flags_modified = B_FALSE;
	int		err = 0;
	boolean_t	set_linklocal = B_FALSE;

	ip1dbg(("ip_sioctl_flags_tail(%s:%u)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id));

	ASSERT(IAM_WRITER_IPIF(ipif));

	ill = ipif->ipif_ill;
	phyi = ill->ill_phyint;

	/* Split the request into the bits to set and the bits to clear. */
	ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off);

	/*
	 * IFF_UP is handled separately.
	 */
	turn_on &= ~IFF_UP;
	turn_off &= ~IFF_UP;

	if ((turn_on|turn_off) & IFF_PHYINT_FLAGS)
		phyint_flags_modified = B_TRUE;

	/*
	 * Now we change the flags. Track current value of
	 * other flags in their respective places.
	 */
	mutex_enter(&ill->ill_lock);
	mutex_enter(&phyi->phyint_lock);
	ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS);
	ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS);
	ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS);
	ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS);
	phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS);
	phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS);
	/* Latch and clear any pending request to reset the link-local. */
	if (ipif->ipif_state_flags & IPIF_SET_LINKLOCAL) {
		set_linklocal = B_TRUE;
		ipif->ipif_state_flags &= ~IPIF_SET_LINKLOCAL;
	}

	mutex_exit(&ill->ill_lock);
	mutex_exit(&phyi->phyint_lock);

	if (set_linklocal)
		(void) ipif_setlinklocal(ipif);

	/*
	 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the same to
	 * the kernel: if any of them has been set by userland, the interface
	 * cannot be used for data traffic.
	 */
	if ((turn_on|turn_off) & (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) {
		ASSERT(!IS_IPMP(ill));
		/*
		 * It's possible the ill is part of an "anonymous" IPMP group
		 * rather than a real group. In that case, there are no other
		 * interfaces in the group and thus no need for us to call
		 * ipmp_phyint_refresh_active().
		 */
		if (IS_UNDER_IPMP(ill))
			ipmp_phyint_refresh_active(phyi);
	}

	if ((turn_on|turn_off) & ILLF_NOACCEPT) {
		/*
		 * If the ILLF_NOACCEPT flag is changed, bring up all the
		 * ipifs that were brought down.
		 *
		 * The routing sockets messages are sent as the result
		 * of ill_up_ipifs(), further, SCTP's IPIF list was updated
		 * as well.
		 */
		err = ill_up_ipifs(ill, q, mp);
	} else if ((flags & IFF_UP) && !(ipif->ipif_flags & IPIF_UP)) {
		/*
		 * XXX ipif_up really does not know whether a phyint flags
		 * was modified or not. So, it sends up information on
		 * only one routing sockets message. As we don't bring up
		 * the interface and also set PHYI_ flags simultaneously
		 * it should be okay.
		 */
		err = ipif_up(ipif, q, mp);
	} else {
		/*
		 * Make sure routing socket sees all changes to the flags.
		 * ipif_up_done* handles this when we use ipif_up.
		 */
		if (phyint_flags_modified) {
			/* Notify for both IP versions on this phyint. */
			if (phyi->phyint_illv4 != NULL) {
				ip_rts_ifmsg(phyi->phyint_illv4->
				    ill_ipif, RTSQ_DEFAULT);
			}
			if (phyi->phyint_illv6 != NULL) {
				ip_rts_ifmsg(phyi->phyint_illv6->
				    ill_ipif, RTSQ_DEFAULT);
			}
		} else {
			ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
		}
		/*
		 * Update the flags in SCTP's IPIF list, ipif_up() will do
		 * this in need_up case.
		 */
		sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);
	}

	/* The default multicast interface might have changed */
	ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6);
	return (err);
}
10760 10758
10761 10759 /*
10762 10760 * Restart the flags operation now that the refcounts have dropped to zero.
10763 10761 */
10764 10762 /* ARGSUSED */
10765 10763 int
10766 10764 ip_sioctl_flags_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10767 10765 ip_ioctl_cmd_t *ipip, void *if_req)
10768 10766 {
10769 10767 uint64_t flags;
10770 10768 struct ifreq *ifr = if_req;
10771 10769 struct lifreq *lifr = if_req;
10772 10770 uint64_t turn_on, turn_off;
10773 10771
10774 10772 ip1dbg(("ip_sioctl_flags_restart(%s:%u %p)\n",
10775 10773 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10776 10774
10777 10775 if (ipip->ipi_cmd_type == IF_CMD) {
10778 10776 /* cast to uint16_t prevents unwanted sign extension */
10779 10777 flags = (uint16_t)ifr->ifr_flags;
10780 10778 } else {
10781 10779 flags = lifr->lifr_flags;
10782 10780 }
10783 10781
10784 10782 /*
10785 10783 * If this function call is a result of the ILLF_NOACCEPT flag
10786 10784 * change, do not call ipif_down_tail(). See ip_sioctl_flags().
10787 10785 */
10788 10786 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off);
10789 10787 if (!((turn_on|turn_off) & ILLF_NOACCEPT))
10790 10788 (void) ipif_down_tail(ipif);
10791 10789
10792 10790 return (ip_sioctl_flags_tail(ipif, flags, q, mp));
10793 10791 }
10794 10792
10795 10793 /*
10796 10794 * Can operate on either a module or a driver queue.
10797 10795 */
10798 10796 /* ARGSUSED */
10799 10797 int
10800 10798 ip_sioctl_get_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10801 10799 ip_ioctl_cmd_t *ipip, void *if_req)
10802 10800 {
10803 10801 /*
10804 10802 * Has the flags been set correctly till now ?
10805 10803 */
10806 10804 ill_t *ill = ipif->ipif_ill;
10807 10805 phyint_t *phyi = ill->ill_phyint;
10808 10806
10809 10807 ip1dbg(("ip_sioctl_get_flags(%s:%u %p)\n",
10810 10808 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10811 10809 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0);
10812 10810 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0);
10813 10811 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0);
10814 10812
10815 10813 /*
10816 10814 * Need a lock since some flags can be set even when there are
10817 10815 * references to the ipif.
10818 10816 */
10819 10817 mutex_enter(&ill->ill_lock);
10820 10818 if (ipip->ipi_cmd_type == IF_CMD) {
10821 10819 struct ifreq *ifr = (struct ifreq *)if_req;
10822 10820
10823 10821 /* Get interface flags (low 16 only). */
10824 10822 ifr->ifr_flags = ((ipif->ipif_flags |
10825 10823 ill->ill_flags | phyi->phyint_flags) & 0xffff);
10826 10824 } else {
10827 10825 struct lifreq *lifr = (struct lifreq *)if_req;
10828 10826
10829 10827 /* Get interface flags. */
10830 10828 lifr->lifr_flags = ipif->ipif_flags |
10831 10829 ill->ill_flags | phyi->phyint_flags;
10832 10830 }
10833 10831 mutex_exit(&ill->ill_lock);
10834 10832 return (0);
10835 10833 }
10836 10834
10837 10835 /*
10838 10836 * We allow the MTU to be set on an ILL, but not have it be different
10839 10837 * for different IPIFs since we don't actually send packets on IPIFs.
10840 10838 */
/* ARGSUSED */
int
ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	int mtu;
	int ip_min_mtu;
	struct ifreq *ifr;
	struct lifreq *lifr;
	ill_t *ill;

	ip1dbg(("ip_sioctl_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name,
	    ipif->ipif_id, (void *)ipif));
	/* Extract the requested MTU from the old- or new-style request. */
	if (ipip->ipi_cmd_type == IF_CMD) {
		ifr = (struct ifreq *)if_req;
		mtu = ifr->ifr_metric;
	} else {
		lifr = (struct lifreq *)if_req;
		mtu = lifr->lifr_mtu;
	}
	/* Only allow for logical unit zero i.e. not on "bge0:17" */
	if (ipif->ipif_id != 0)
		return (EINVAL);

	ill = ipif->ipif_ill;
	/* The floor depends on the address family of the interface. */
	if (ipif->ipif_isv6)
		ip_min_mtu = IPV6_MIN_MTU;
	else
		ip_min_mtu = IP_MIN_MTU;

	mutex_enter(&ill->ill_lock);
	/* Reject MTUs above what the link can carry or below the minimum. */
	if (mtu > ill->ill_max_frag || mtu < ip_min_mtu) {
		mutex_exit(&ill->ill_lock);
		return (EINVAL);
	}
	/* Avoid increasing ill_mc_mtu */
	if (ill->ill_mc_mtu > mtu)
		ill->ill_mc_mtu = mtu;

	/*
	 * The dce and fragmentation code can handle changes to ill_mtu
	 * concurrent with sending/fragmenting packets.
	 */
	ill->ill_mtu = mtu;

	/*
	 * ILLF_FIXEDMTU records that the MTU was administratively set,
	 * so later link updates do not override it.
	 */
	ill->ill_flags |= ILLF_FIXEDMTU;
	mutex_exit(&ill->ill_lock);

	/*
	 * Make sure all dce_generation checks find out
	 * that ill_mtu/ill_mc_mtu has changed.
	 */
	dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst);

	/*
	 * Refresh IPMP meta-interface MTU if necessary.
	 */
	if (IS_UNDER_IPMP(ill))
		ipmp_illgrp_refresh_mtu(ill->ill_grp);

	/* Update the MTU in SCTP's list */
	sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);
	return (0);
}
10904 10902
10905 10903 /* Get interface MTU. */
10906 10904 /* ARGSUSED */
10907 10905 int
10908 10906 ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10909 10907 ip_ioctl_cmd_t *ipip, void *if_req)
10910 10908 {
10911 10909 struct ifreq *ifr;
10912 10910 struct lifreq *lifr;
10913 10911
10914 10912 ip1dbg(("ip_sioctl_get_mtu(%s:%u %p)\n",
10915 10913 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10916 10914
10917 10915 /*
10918 10916 * We allow a get on any logical interface even though the set
10919 10917 * can only be done on logical unit 0.
10920 10918 */
10921 10919 if (ipip->ipi_cmd_type == IF_CMD) {
10922 10920 ifr = (struct ifreq *)if_req;
10923 10921 ifr->ifr_metric = ipif->ipif_ill->ill_mtu;
10924 10922 } else {
10925 10923 lifr = (struct lifreq *)if_req;
10926 10924 lifr->lifr_mtu = ipif->ipif_ill->ill_mtu;
10927 10925 }
10928 10926 return (0);
10929 10927 }
10930 10928
/* Set interface broadcast address. */
/* ARGSUSED2 */
int
ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	ipaddr_t addr;
	ire_t	*ire;
	ill_t	*ill = ipif->ipif_ill;
	ip_stack_t *ipst = ill->ill_ipst;

	ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ill->ill_name,
	    ipif->ipif_id));

	ASSERT(IAM_WRITER_IPIF(ipif));
	/* Only broadcast-capable interfaces carry a broadcast address. */
	if (!(ipif->ipif_flags & IPIF_BROADCAST))
		return (EADDRNOTAVAIL);

	ASSERT(!(ipif->ipif_isv6));	/* No IPv6 broadcast */

	if (sin->sin_family != AF_INET)
		return (EAFNOSUPPORT);

	addr = sin->sin_addr.s_addr;

	if (ipif->ipif_flags & IPIF_UP) {
		/*
		 * If we are already up, make sure the new
		 * broadcast address makes sense. If it does,
		 * there should be an IRE for it already.
		 */
		ire = ire_ftable_lookup_v4(addr, 0, 0, IRE_BROADCAST,
		    ill, ipif->ipif_zoneid, NULL,
		    (MATCH_IRE_ILL | MATCH_IRE_TYPE), 0, ipst, NULL);
		if (ire == NULL) {
			return (EINVAL);
		} else {
			/* Only needed the existence check; drop the hold. */
			ire_refrele(ire);
		}
	}
	/*
	 * Changing the broadcast addr for this ipif. Since the IRE_BROADCAST
	 * needs to already exist we never need to change the set of
	 * IRE_BROADCASTs when we are UP.
	 */
	if (addr != ipif->ipif_brd_addr)
		IN6_IPADDR_TO_V4MAPPED(addr, &ipif->ipif_v6brd_addr);

	return (0);
}
10981 10979
10982 10980 /* Get interface broadcast address. */
10983 10981 /* ARGSUSED */
10984 10982 int
10985 10983 ip_sioctl_get_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10986 10984 ip_ioctl_cmd_t *ipip, void *if_req)
10987 10985 {
10988 10986 ip1dbg(("ip_sioctl_get_brdaddr(%s:%u %p)\n",
10989 10987 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10990 10988 if (!(ipif->ipif_flags & IPIF_BROADCAST))
10991 10989 return (EADDRNOTAVAIL);
10992 10990
10993 10991 /* IPIF_BROADCAST not possible with IPv6 */
10994 10992 ASSERT(!ipif->ipif_isv6);
10995 10993 *sin = sin_null;
10996 10994 sin->sin_family = AF_INET;
10997 10995 sin->sin_addr.s_addr = ipif->ipif_brd_addr;
10998 10996 return (0);
10999 10997 }
11000 10998
/*
 * This routine is called to handle the SIOCS*IFNETMASK IOCTL.
 */
/* ARGSUSED */
int
ip_sioctl_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	int	err = 0;
	in6_addr_t v6mask;

	ip1dbg(("ip_sioctl_netmask(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

	ASSERT(IAM_WRITER_IPIF(ipif));

	/* Normalize the requested mask into V6 form (V4 gets mapped). */
	if (ipif->ipif_isv6) {
		sin6_t *sin6;

		if (sin->sin_family != AF_INET6)
			return (EAFNOSUPPORT);

		sin6 = (sin6_t *)sin;
		v6mask = sin6->sin6_addr;
	} else {
		ipaddr_t mask;

		if (sin->sin_family != AF_INET)
			return (EAFNOSUPPORT);

		mask = sin->sin_addr.s_addr;
		/* Only contiguous (prefix-style) V4 masks are supported. */
		if (!ip_contiguous_mask(ntohl(mask)))
			return (ENOTSUP);
		V4MASK_TO_V6(mask, v6mask);
	}

	/*
	 * No big deal if the interface isn't already up, or the mask
	 * isn't really changing, or this is pt-pt.
	 */
	if (!(ipif->ipif_flags & IPIF_UP) ||
	    IN6_ARE_ADDR_EQUAL(&v6mask, &ipif->ipif_v6net_mask) ||
	    (ipif->ipif_flags & IPIF_POINTOPOINT)) {
		ipif->ipif_v6net_mask = v6mask;
		if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
			/* Recompute the subnet prefix from addr & mask. */
			V6_MASK_COPY(ipif->ipif_v6lcl_addr,
			    ipif->ipif_v6net_mask,
			    ipif->ipif_v6subnet);
		}
		return (0);
	}
	/*
	 * Make sure we have valid net and subnet broadcast ire's
	 * for the old netmask, if needed by other logical interfaces.
	 */
	err = ipif_logical_down(ipif, q, mp);
	if (err == EINPROGRESS)
		return (err);
	/* Finish the down synchronously, then apply and bring back up. */
	(void) ipif_down_tail(ipif);
	err = ip_sioctl_netmask_tail(ipif, sin, q, mp);
	return (err);
}
11063 11061
/*
 * Apply the already-validated netmask to the ipif and bring it back up.
 * Called after the logical down performed by ip_sioctl_netmask() or on
 * restart from ip_sioctl_netmask_restart().
 */
static int
ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp)
{
	in6_addr_t v6mask;
	int err = 0;

	ip1dbg(("ip_sioctl_netmask_tail(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

	/* Normalize into V6 form; families were validated by the caller. */
	if (ipif->ipif_isv6) {
		sin6_t *sin6;

		sin6 = (sin6_t *)sin;
		v6mask = sin6->sin6_addr;
	} else {
		ipaddr_t mask;

		mask = sin->sin_addr.s_addr;
		V4MASK_TO_V6(mask, v6mask);
	}

	ipif->ipif_v6net_mask = v6mask;
	if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
		/* Recompute the subnet prefix from the new mask. */
		V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
		    ipif->ipif_v6subnet);
	}
	err = ipif_up(ipif, q, mp);

	if (err == 0 || err == EINPROGRESS) {
		/*
		 * The interface must be DL_BOUND if this packet has to
		 * go out on the wire. Since we only go through a logical
		 * down and are bound with the driver during an internal
		 * down/up that is satisfied.
		 */
		if (!ipif->ipif_isv6 && ipif->ipif_ill->ill_wq != NULL) {
			/* Potentially broadcast an address mask reply. */
			ipif_mask_reply(ipif);
		}
	}
	return (err);
}
11106 11104
/*
 * Restart a netmask change after the ipif has quiesced: finish the
 * deferred logical down, then apply the new mask and bring the ipif
 * back up via ip_sioctl_netmask_tail().
 */
/* ARGSUSED */
int
ip_sioctl_netmask_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	ip1dbg(("ip_sioctl_netmask_restart(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	(void) ipif_down_tail(ipif);
	return (ip_sioctl_netmask_tail(ipif, sin, q, mp));
}
11117 11115
11118 11116 /* Get interface net mask. */
11119 11117 /* ARGSUSED */
11120 11118 int
11121 11119 ip_sioctl_get_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11122 11120 ip_ioctl_cmd_t *ipip, void *if_req)
11123 11121 {
11124 11122 struct lifreq *lifr = (struct lifreq *)if_req;
11125 11123 struct sockaddr_in6 *sin6 = (sin6_t *)sin;
11126 11124
11127 11125 ip1dbg(("ip_sioctl_get_netmask(%s:%u %p)\n",
11128 11126 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11129 11127
11130 11128 /*
11131 11129 * net mask can't change since we have a reference to the ipif.
11132 11130 */
11133 11131 if (ipif->ipif_isv6) {
11134 11132 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
11135 11133 *sin6 = sin6_null;
11136 11134 sin6->sin6_family = AF_INET6;
11137 11135 sin6->sin6_addr = ipif->ipif_v6net_mask;
11138 11136 lifr->lifr_addrlen =
11139 11137 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
11140 11138 } else {
11141 11139 *sin = sin_null;
11142 11140 sin->sin_family = AF_INET;
11143 11141 sin->sin_addr.s_addr = ipif->ipif_net_mask;
11144 11142 if (ipip->ipi_cmd_type == LIF_CMD) {
11145 11143 lifr->lifr_addrlen =
11146 11144 ip_mask_to_plen(ipif->ipif_net_mask);
11147 11145 }
11148 11146 }
11149 11147 return (0);
11150 11148 }
11151 11149
11152 11150 /* ARGSUSED */
11153 11151 int
11154 11152 ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11155 11153 ip_ioctl_cmd_t *ipip, void *if_req)
11156 11154 {
11157 11155 ip1dbg(("ip_sioctl_metric(%s:%u %p)\n",
11158 11156 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11159 11157
11160 11158 /*
11161 11159 * Since no applications should ever be setting metrics on underlying
11162 11160 * interfaces, we explicitly fail to smoke 'em out.
11163 11161 */
11164 11162 if (IS_UNDER_IPMP(ipif->ipif_ill))
11165 11163 return (EINVAL);
11166 11164
11167 11165 /*
11168 11166 * Set interface metric. We don't use this for
11169 11167 * anything but we keep track of it in case it is
11170 11168 * important to routing applications or such.
11171 11169 */
11172 11170 if (ipip->ipi_cmd_type == IF_CMD) {
11173 11171 struct ifreq *ifr;
11174 11172
11175 11173 ifr = (struct ifreq *)if_req;
11176 11174 ipif->ipif_ill->ill_metric = ifr->ifr_metric;
11177 11175 } else {
11178 11176 struct lifreq *lifr;
11179 11177
11180 11178 lifr = (struct lifreq *)if_req;
11181 11179 ipif->ipif_ill->ill_metric = lifr->lifr_metric;
11182 11180 }
11183 11181 return (0);
11184 11182 }
11185 11183
11186 11184 /* ARGSUSED */
11187 11185 int
11188 11186 ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11189 11187 ip_ioctl_cmd_t *ipip, void *if_req)
11190 11188 {
11191 11189 /* Get interface metric. */
11192 11190 ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n",
11193 11191 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11194 11192
11195 11193 if (ipip->ipi_cmd_type == IF_CMD) {
11196 11194 struct ifreq *ifr;
11197 11195
11198 11196 ifr = (struct ifreq *)if_req;
11199 11197 ifr->ifr_metric = ipif->ipif_ill->ill_metric;
11200 11198 } else {
11201 11199 struct lifreq *lifr;
11202 11200
11203 11201 lifr = (struct lifreq *)if_req;
11204 11202 lifr->lifr_metric = ipif->ipif_ill->ill_metric;
11205 11203 }
11206 11204
11207 11205 return (0);
11208 11206 }
11209 11207
11210 11208 /* ARGSUSED */
11211 11209 int
11212 11210 ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11213 11211 ip_ioctl_cmd_t *ipip, void *if_req)
11214 11212 {
11215 11213 int arp_muxid;
11216 11214
11217 11215 ip1dbg(("ip_sioctl_muxid(%s:%u %p)\n",
11218 11216 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11219 11217 /*
11220 11218 * Set the muxid returned from I_PLINK.
11221 11219 */
11222 11220 if (ipip->ipi_cmd_type == IF_CMD) {
11223 11221 struct ifreq *ifr = (struct ifreq *)if_req;
11224 11222
11225 11223 ipif->ipif_ill->ill_muxid = ifr->ifr_ip_muxid;
11226 11224 arp_muxid = ifr->ifr_arp_muxid;
11227 11225 } else {
11228 11226 struct lifreq *lifr = (struct lifreq *)if_req;
11229 11227
11230 11228 ipif->ipif_ill->ill_muxid = lifr->lifr_ip_muxid;
11231 11229 arp_muxid = lifr->lifr_arp_muxid;
11232 11230 }
11233 11231 arl_set_muxid(ipif->ipif_ill, arp_muxid);
11234 11232 return (0);
11235 11233 }
11236 11234
11237 11235 /* ARGSUSED */
11238 11236 int
11239 11237 ip_sioctl_get_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11240 11238 ip_ioctl_cmd_t *ipip, void *if_req)
11241 11239 {
11242 11240 int arp_muxid = 0;
11243 11241
11244 11242 ip1dbg(("ip_sioctl_get_muxid(%s:%u %p)\n",
11245 11243 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11246 11244 /*
11247 11245 * Get the muxid saved in ill for I_PUNLINK.
11248 11246 */
11249 11247 arp_muxid = arl_get_muxid(ipif->ipif_ill);
11250 11248 if (ipip->ipi_cmd_type == IF_CMD) {
11251 11249 struct ifreq *ifr = (struct ifreq *)if_req;
11252 11250
11253 11251 ifr->ifr_ip_muxid = ipif->ipif_ill->ill_muxid;
11254 11252 ifr->ifr_arp_muxid = arp_muxid;
11255 11253 } else {
11256 11254 struct lifreq *lifr = (struct lifreq *)if_req;
11257 11255
11258 11256 lifr->lifr_ip_muxid = ipif->ipif_ill->ill_muxid;
11259 11257 lifr->lifr_arp_muxid = arp_muxid;
11260 11258 }
11261 11259 return (0);
11262 11260 }
11263 11261
/*
 * Set the subnet prefix. Does not modify the broadcast address.
 */
/* ARGSUSED */
int
ip_sioctl_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	int err = 0;
	in6_addr_t v6addr;
	in6_addr_t v6mask;
	boolean_t need_up = B_FALSE;	/* must the ipif be brought back up? */
	int addrlen;

	ip1dbg(("ip_sioctl_subnet(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

	ASSERT(IAM_WRITER_IPIF(ipif));
	addrlen = ((struct lifreq *)if_req)->lifr_addrlen;

	/* Validate the prefix and normalize it to V6 form. */
	if (ipif->ipif_isv6) {
		sin6_t *sin6;

		if (sin->sin_family != AF_INET6)
			return (EAFNOSUPPORT);

		sin6 = (sin6_t *)sin;
		v6addr = sin6->sin6_addr;
		if (!ip_remote_addr_ok_v6(&v6addr, &ipv6_all_ones))
			return (EADDRNOTAVAIL);
	} else {
		ipaddr_t addr;

		if (sin->sin_family != AF_INET)
			return (EAFNOSUPPORT);

		addr = sin->sin_addr.s_addr;
		if (!ip_addr_ok_v4(addr, 0xFFFFFFFF))
			return (EADDRNOTAVAIL);
		IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
		/* Add 96 bits */
		addrlen += IPV6_ABITS - IP_ABITS;
	}

	if (ip_plen_to_mask_v6(addrlen, &v6mask) == NULL)
		return (EINVAL);

	/* Check if bits in the address is set past the mask */
	if (!V6_MASK_EQ(v6addr, v6mask, v6addr))
		return (EINVAL);

	if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6subnet, &v6addr) &&
	    IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6net_mask, &v6mask))
		return (0);	/* No change */

	if (ipif->ipif_flags & IPIF_UP) {
		/*
		 * If the interface is already marked up,
		 * we call ipif_down which will take care
		 * of ditching any IREs that have been set
		 * up based on the old interface address.
		 */
		err = ipif_logical_down(ipif, q, mp);
		if (err == EINPROGRESS)
			return (err);
		(void) ipif_down_tail(ipif);
		need_up = B_TRUE;
	}

	err = ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, need_up);
	return (err);
}
11336 11334
/*
 * Install the new subnet prefix and mask on the ipif, then bring the
 * ipif back up if the caller had taken it down first.
 */
static int
ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t v6addr, in6_addr_t v6mask,
    queue_t *q, mblk_t *mp, boolean_t need_up)
{
	ill_t	*ill = ipif->ipif_ill;
	int	err = 0;

	ip1dbg(("ip_sioctl_subnet_tail(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

	/* Set the new address. */
	mutex_enter(&ill->ill_lock);
	ipif->ipif_v6net_mask = v6mask;
	if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
		/* Derive the subnet from the supplied prefix and mask. */
		V6_MASK_COPY(v6addr, ipif->ipif_v6net_mask,
		    ipif->ipif_v6subnet);
	}
	mutex_exit(&ill->ill_lock);

	if (need_up) {
		/*
		 * Now bring the interface back up. If this
		 * is the only IPIF for the ILL, ipif_up
		 * will have to re-bind to the device, so
		 * we may get back EINPROGRESS, in which
		 * case, this IOCTL will get completed in
		 * ip_rput_dlpi when we see the DL_BIND_ACK.
		 */
		err = ipif_up(ipif, q, mp);
		if (err == EINPROGRESS)
			return (err);
	}
	return (err);
}
11371 11369
11372 11370 /* ARGSUSED */
11373 11371 int
11374 11372 ip_sioctl_subnet_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11375 11373 ip_ioctl_cmd_t *ipip, void *if_req)
11376 11374 {
11377 11375 int addrlen;
11378 11376 in6_addr_t v6addr;
11379 11377 in6_addr_t v6mask;
11380 11378 struct lifreq *lifr = (struct lifreq *)if_req;
11381 11379
11382 11380 ip1dbg(("ip_sioctl_subnet_restart(%s:%u %p)\n",
11383 11381 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11384 11382 (void) ipif_down_tail(ipif);
11385 11383
11386 11384 addrlen = lifr->lifr_addrlen;
11387 11385 if (ipif->ipif_isv6) {
11388 11386 sin6_t *sin6;
11389 11387
11390 11388 sin6 = (sin6_t *)sin;
11391 11389 v6addr = sin6->sin6_addr;
11392 11390 } else {
11393 11391 ipaddr_t addr;
11394 11392
11395 11393 addr = sin->sin_addr.s_addr;
11396 11394 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
11397 11395 addrlen += IPV6_ABITS - IP_ABITS;
11398 11396 }
11399 11397 (void) ip_plen_to_mask_v6(addrlen, &v6mask);
11400 11398
11401 11399 return (ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, B_TRUE));
11402 11400 }
11403 11401
11404 11402 /* ARGSUSED */
11405 11403 int
11406 11404 ip_sioctl_get_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11407 11405 ip_ioctl_cmd_t *ipip, void *if_req)
11408 11406 {
11409 11407 struct lifreq *lifr = (struct lifreq *)if_req;
11410 11408 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin;
11411 11409
11412 11410 ip1dbg(("ip_sioctl_get_subnet(%s:%u %p)\n",
11413 11411 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11414 11412 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
11415 11413
11416 11414 if (ipif->ipif_isv6) {
11417 11415 *sin6 = sin6_null;
11418 11416 sin6->sin6_family = AF_INET6;
11419 11417 sin6->sin6_addr = ipif->ipif_v6subnet;
11420 11418 lifr->lifr_addrlen =
11421 11419 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
11422 11420 } else {
11423 11421 *sin = sin_null;
11424 11422 sin->sin_family = AF_INET;
11425 11423 sin->sin_addr.s_addr = ipif->ipif_subnet;
11426 11424 lifr->lifr_addrlen = ip_mask_to_plen(ipif->ipif_net_mask);
11427 11425 }
11428 11426 return (0);
11429 11427 }
11430 11428
/*
 * Set the IPv6 address token.
 */
/* ARGSUSED */
int
ip_sioctl_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipi, void *if_req)
{
	ill_t *ill = ipif->ipif_ill;
	int err;
	in6_addr_t v6addr;
	in6_addr_t v6mask;
	boolean_t need_up = B_FALSE;	/* must the ipif be brought back up? */
	int i;
	sin6_t *sin6 = (sin6_t *)sin;
	struct lifreq *lifr = (struct lifreq *)if_req;
	int addrlen;

	ip1dbg(("ip_sioctl_token(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	ASSERT(IAM_WRITER_IPIF(ipif));

	addrlen = lifr->lifr_addrlen;
	/* Only allow for logical unit zero i.e. not on "le0:17" */
	if (ipif->ipif_id != 0)
		return (EINVAL);

	/* Tokens only apply to IPv6 interfaces. */
	if (!ipif->ipif_isv6)
		return (EINVAL);

	if (addrlen > IPV6_ABITS)
		return (EINVAL);

	v6addr = sin6->sin6_addr;

	/*
	 * The length of the token is the length from the end. To get
	 * the proper mask for this, compute the mask of the bits not
	 * in the token; ie. the prefix, and then xor to get the mask.
	 */
	if (ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask) == NULL)
		return (EINVAL);
	for (i = 0; i < 4; i++) {
		v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff;
	}

	/* Nothing to do if neither the token bits nor its length change. */
	if (V6_MASK_EQ(v6addr, v6mask, ill->ill_token) &&
	    ill->ill_token_length == addrlen)
		return (0);	/* No change */

	if (ipif->ipif_flags & IPIF_UP) {
		/* Take the ipif down so dependent state can be rebuilt. */
		err = ipif_logical_down(ipif, q, mp);
		if (err == EINPROGRESS)
			return (err);
		(void) ipif_down_tail(ipif);
		need_up = B_TRUE;
	}
	err = ip_sioctl_token_tail(ipif, sin6, addrlen, q, mp, need_up);
	return (err);
}
11491 11489
/*
 * Install the new token on the ill, regenerate the link-local address
 * from it, and bring the ipif back up if the caller took it down.
 */
static int
ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, queue_t *q,
    mblk_t *mp, boolean_t need_up)
{
	in6_addr_t v6addr;
	in6_addr_t v6mask;
	ill_t	*ill = ipif->ipif_ill;
	int	i;
	int	err = 0;

	ip1dbg(("ip_sioctl_token_tail(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	v6addr = sin6->sin6_addr;
	/*
	 * The length of the token is the length from the end. To get
	 * the proper mask for this, compute the mask of the bits not
	 * in the token; ie. the prefix, and then xor to get the mask.
	 */
	(void) ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask);
	for (i = 0; i < 4; i++)
		v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff;

	mutex_enter(&ill->ill_lock);
	V6_MASK_COPY(v6addr, v6mask, ill->ill_token);
	ill->ill_token_length = addrlen;
	/* Remember the token was set manually rather than derived. */
	ill->ill_manual_token = 1;

	/* Reconfigure the link-local address based on this new token */
	ipif_setlinklocal(ill->ill_ipif);

	mutex_exit(&ill->ill_lock);

	if (need_up) {
		/*
		 * Now bring the interface back up. If this
		 * is the only IPIF for the ILL, ipif_up
		 * will have to re-bind to the device, so
		 * we may get back EINPROGRESS, in which
		 * case, this IOCTL will get completed in
		 * ip_rput_dlpi when we see the DL_BIND_ACK.
		 */
		err = ipif_up(ipif, q, mp);
		if (err == EINPROGRESS)
			return (err);
	}
	return (err);
}
11539 11537
11540 11538 /* ARGSUSED */
11541 11539 int
11542 11540 ip_sioctl_get_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11543 11541 ip_ioctl_cmd_t *ipi, void *if_req)
11544 11542 {
11545 11543 ill_t *ill;
11546 11544 sin6_t *sin6 = (sin6_t *)sin;
11547 11545 struct lifreq *lifr = (struct lifreq *)if_req;
11548 11546
11549 11547 ip1dbg(("ip_sioctl_get_token(%s:%u %p)\n",
11550 11548 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11551 11549 if (ipif->ipif_id != 0)
11552 11550 return (EINVAL);
11553 11551
11554 11552 ill = ipif->ipif_ill;
11555 11553 if (!ill->ill_isv6)
11556 11554 return (ENXIO);
11557 11555
11558 11556 *sin6 = sin6_null;
11559 11557 sin6->sin6_family = AF_INET6;
11560 11558 ASSERT(!IN6_IS_ADDR_V4MAPPED(&ill->ill_token));
11561 11559 sin6->sin6_addr = ill->ill_token;
11562 11560 lifr->lifr_addrlen = ill->ill_token_length;
11563 11561 return (0);
11564 11562 }
11565 11563
/*
 * Set (hardware) link specific information that might override
 * what was acquired through the DL_INFO_ACK.
 * Validates all supplied values first (zero means "leave unchanged"),
 * then applies them under ill_lock.  Returns 0 or EINVAL.
 */
/* ARGSUSED */
int
ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipi, void *if_req)
{
	ill_t *ill = ipif->ipif_ill;
	int ip_min_mtu;
	struct lifreq *lifr = (struct lifreq *)if_req;
	lif_ifinfo_req_t *lir;

	ip1dbg(("ip_sioctl_lnkinfo(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	lir = &lifr->lifr_ifinfo;
	ASSERT(IAM_WRITER_IPIF(ipif));

	/* Only allow for logical unit zero i.e. not on "bge0:17" */
	if (ipif->ipif_id != 0)
		return (EINVAL);

	/* Set interface MTU. */
	if (ipif->ipif_isv6)
		ip_min_mtu = IPV6_MIN_MTU;
	else
		ip_min_mtu = IP_MIN_MTU;

	/*
	 * Verify values before we set anything. Allow zero to
	 * mean unspecified.
	 *
	 * XXX We should be able to set the user-defined lir_mtu to some value
	 * that is greater than ill_current_frag but less than ill_max_frag- the
	 * ill_max_frag value tells us the max MTU that can be handled by the
	 * datalink, whereas the ill_current_frag is dynamically computed for
	 * some link-types like tunnels, based on the tunnel PMTU. However,
	 * since there is currently no way of distinguishing between
	 * administratively fixed link mtu values (e.g., those set via
	 * /sbin/dladm) and dynamically discovered MTUs (e.g., those discovered
	 * for tunnels) we conservatively choose the ill_current_frag as the
	 * upper-bound.
	 */
	if (lir->lir_maxmtu != 0 &&
	    (lir->lir_maxmtu > ill->ill_current_frag ||
	    lir->lir_maxmtu < ip_min_mtu))
		return (EINVAL);
	if (lir->lir_reachtime != 0 &&
	    lir->lir_reachtime > ND_MAX_REACHTIME)
		return (EINVAL);
	if (lir->lir_reachretrans != 0 &&
	    lir->lir_reachretrans > ND_MAX_REACHRETRANSTIME)
		return (EINVAL);

	mutex_enter(&ill->ill_lock);
	/*
	 * The dce and fragmentation code can handle changes to ill_mtu
	 * concurrent with sending/fragmenting packets.
	 */
	if (lir->lir_maxmtu != 0)
		ill->ill_user_mtu = lir->lir_maxmtu;

	if (lir->lir_reachtime != 0)
		ill->ill_reachable_time = lir->lir_reachtime;

	if (lir->lir_reachretrans != 0)
		ill->ill_reachable_retrans_time = lir->lir_reachretrans;

	ill->ill_max_hops = lir->lir_maxhops;
	ill->ill_max_buf = ND_MAX_Q;
	if (!(ill->ill_flags & ILLF_FIXEDMTU) && ill->ill_user_mtu != 0) {
		/*
		 * ill_mtu is the actual interface MTU, obtained as the min
		 * of user-configured mtu and the value announced by the
		 * driver (via DL_NOTE_SDU_SIZE/DL_INFO_ACK). Note that since
		 * we have already made the choice of requiring
		 * ill_user_mtu < ill_current_frag by the time we get here,
		 * the ill_mtu effectively gets assigned to the ill_user_mtu
		 * here.
		 */
		ill->ill_mtu = MIN(ill->ill_current_frag, ill->ill_user_mtu);
		ill->ill_mc_mtu = MIN(ill->ill_mc_mtu, ill->ill_user_mtu);
	}
	mutex_exit(&ill->ill_lock);

	/*
	 * Make sure all dce_generation checks find out
	 * that ill_mtu/ill_mc_mtu has changed.
	 */
	if (!(ill->ill_flags & ILLF_FIXEDMTU) && (lir->lir_maxmtu != 0))
		dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst);

	/*
	 * Refresh IPMP meta-interface MTU if necessary.
	 */
	if (IS_UNDER_IPMP(ill))
		ipmp_illgrp_refresh_mtu(ill->ill_grp);

	return (0);
}
11667 11665
11668 11666 /* ARGSUSED */
11669 11667 int
11670 11668 ip_sioctl_get_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11671 11669 ip_ioctl_cmd_t *ipi, void *if_req)
11672 11670 {
11673 11671 struct lif_ifinfo_req *lir;
11674 11672 ill_t *ill = ipif->ipif_ill;
11675 11673
11676 11674 ip1dbg(("ip_sioctl_get_lnkinfo(%s:%u %p)\n",
11677 11675 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11678 11676 if (ipif->ipif_id != 0)
11679 11677 return (EINVAL);
11680 11678
11681 11679 lir = &((struct lifreq *)if_req)->lifr_ifinfo;
11682 11680 lir->lir_maxhops = ill->ill_max_hops;
11683 11681 lir->lir_reachtime = ill->ill_reachable_time;
11684 11682 lir->lir_reachretrans = ill->ill_reachable_retrans_time;
11685 11683 lir->lir_maxmtu = ill->ill_mtu;
11686 11684
11687 11685 return (0);
11688 11686 }
11689 11687
/*
 * Return best guess as to the subnet mask for the specified address.
 * Based on the subnet masks for all the configured interfaces.
 *
 * We end up returning a zero mask in the case of default, multicast or
 * experimental.
 *
 * When a matching ipif is found it is returned through *ipifp with a
 * reference held (via ipif_refhold_locked); the caller must ipif_refrele
 * it.  Otherwise *ipifp is set to NULL.
 */
static ipaddr_t
ip_subnet_mask(ipaddr_t addr, ipif_t **ipifp, ip_stack_t *ipst)
{
	ipaddr_t net_mask;
	ill_t *ill;
	ipif_t *ipif;
	ill_walk_context_t ctx;
	ipif_t *fallback_ipif = NULL;

	net_mask = ip_net_mask(addr);
	if (net_mask == 0) {
		*ipifp = NULL;
		return (0);
	}

	/* Let's check to see if this is maybe a local subnet route. */
	/* this function only applies to IPv4 interfaces */
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V4(&ctx, ipst);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		mutex_enter(&ill->ill_lock);
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if (IPIF_IS_CONDEMNED(ipif))
				continue;
			if (!(ipif->ipif_flags & IPIF_UP))
				continue;
			if ((ipif->ipif_subnet & net_mask) ==
			    (addr & net_mask)) {
				/*
				 * Don't trust pt-pt interfaces if there are
				 * other interfaces.
				 */
				if (ipif->ipif_flags & IPIF_POINTOPOINT) {
					if (fallback_ipif == NULL) {
						ipif_refhold_locked(ipif);
						fallback_ipif = ipif;
					}
					continue;
				}

				/*
				 * Fine. Just assume the same net mask as the
				 * directly attached subnet interface is using.
				 */
				ipif_refhold_locked(ipif);
				mutex_exit(&ill->ill_lock);
				rw_exit(&ipst->ips_ill_g_lock);
				/* Drop the hold on any earlier pt-pt match. */
				if (fallback_ipif != NULL)
					ipif_refrele(fallback_ipif);
				*ipifp = ipif;
				return (ipif->ipif_net_mask);
			}
		}
		mutex_exit(&ill->ill_lock);
	}
	rw_exit(&ipst->ips_ill_g_lock);

	/* No direct match; fall back to a pt-pt match or the class mask. */
	*ipifp = fallback_ipif;
	return ((fallback_ipif != NULL) ?
	    fallback_ipif->ipif_net_mask : net_mask);
}
11759 11757
/*
 * ip_sioctl_copyin_setup calls ip_wput_ioctl to process the IP_IOCTL ioctl.
 */
static void
ip_wput_ioctl(queue_t *q, mblk_t *mp)
{
	IOCP iocp;
	ipft_t *ipft;
	ipllc_t *ipllc;
	mblk_t *mp1;
	cred_t *cr;
	int error = 0;
	conn_t *connp;

	ip1dbg(("ip_wput_ioctl"));
	iocp = (IOCP)mp->b_rptr;
	mp1 = mp->b_cont;
	if (mp1 == NULL) {
		/* No payload: NAK the ioctl immediately. */
		iocp->ioc_error = EINVAL;
		mp->b_datap->db_type = M_IOCNAK;
		iocp->ioc_count = 0;
		qreply(q, mp);
		return;
	}

	/*
	 * These IOCTLs provide various control capabilities to
	 * upstream agents such as ULPs and processes. There
	 * are currently two such IOCTLs implemented. They
	 * are used by TCP to provide update information for
	 * existing IREs and to forcibly delete an IRE for a
	 * host that is not responding, thereby forcing an
	 * attempt at a new route.
	 */
	iocp->ioc_error = EINVAL;
	if (!pullupmsg(mp1, sizeof (ipllc->ipllc_cmd)))
		goto done;

	/*
	 * Look the command up in the dispatch table; if nothing matches,
	 * ipft is left pointing at the terminating entry whose ipft_pfi
	 * is NULL, which the checks below treat as "no handler".
	 */
	ipllc = (ipllc_t *)mp1->b_rptr;
	for (ipft = ip_ioctl_ftbl; ipft->ipft_pfi; ipft++) {
		if (ipllc->ipllc_cmd == ipft->ipft_cmd)
			break;
	}
	/*
	 * prefer credential from mblk over ioctl;
	 * see ip_sioctl_copyin_setup
	 */
	cr = msg_getcred(mp, NULL);
	if (cr == NULL)
		cr = iocp->ioc_cr;

	/*
	 * Refhold the conn in case the request gets queued up in some lookup
	 */
	ASSERT(CONN_Q(q));
	connp = Q_TO_CONN(q);
	CONN_INC_REF(connp);
	CONN_INC_IOCTLREF(connp);
	if (ipft->ipft_pfi &&
	    ((mp1->b_wptr - mp1->b_rptr) >= ipft->ipft_min_size ||
	    pullupmsg(mp1, ipft->ipft_min_size))) {
		error = (*ipft->ipft_pfi)(q,
		    (ipft->ipft_flags & IPFT_F_SELF_REPLY) ? mp : mp1, cr);
	}
	if (ipft->ipft_flags & IPFT_F_SELF_REPLY) {
		/*
		 * CONN_OPER_PENDING_DONE happens in the function called
		 * through ipft_pfi above.
		 */
		return;
	}

	CONN_DEC_IOCTLREF(connp);
	CONN_OPER_PENDING_DONE(connp);
	if (ipft->ipft_flags & IPFT_F_NO_REPLY) {
		freemsg(mp);
		return;
	}
	iocp->ioc_error = error;

done:
	/* ACK the ioctl, carrying any error in ioc_error. */
	mp->b_datap->db_type = M_IOCACK;
	if (iocp->ioc_error)
		iocp->ioc_count = 0;
	qreply(q, mp);
}
11846 11844
11847 11845 /*
11848 11846 * Assign a unique id for the ipif. This is used by sctp_addr.c
11849 11847 * Note: remove if sctp_addr.c is redone to not shadow ill/ipif data structures.
11850 11848 */
11851 11849 static void
11852 11850 ipif_assign_seqid(ipif_t *ipif)
11853 11851 {
11854 11852 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
11855 11853
11856 11854 ipif->ipif_seqid = atomic_inc_64_nv(&ipst->ips_ipif_g_seqid);
11857 11855 }
11858 11856
11859 11857 /*
11860 11858 * Clone the contents of `sipif' to `dipif'. Requires that both ipifs are
11861 11859 * administratively down (i.e., no DAD), of the same type, and locked. Note
11862 11860 * that the clone is complete -- including the seqid -- and the expectation is
11863 11861 * that the caller will either free or overwrite `sipif' before it's unlocked.
11864 11862 */
11865 11863 static void
11866 11864 ipif_clone(const ipif_t *sipif, ipif_t *dipif)
11867 11865 {
11868 11866 ASSERT(MUTEX_HELD(&sipif->ipif_ill->ill_lock));
11869 11867 ASSERT(MUTEX_HELD(&dipif->ipif_ill->ill_lock));
11870 11868 ASSERT(!(sipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE)));
11871 11869 ASSERT(!(dipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE)));
11872 11870 ASSERT(sipif->ipif_ire_type == dipif->ipif_ire_type);
11873 11871
11874 11872 dipif->ipif_flags = sipif->ipif_flags;
11875 11873 dipif->ipif_zoneid = sipif->ipif_zoneid;
11876 11874 dipif->ipif_v6subnet = sipif->ipif_v6subnet;
11877 11875 dipif->ipif_v6lcl_addr = sipif->ipif_v6lcl_addr;
11878 11876 dipif->ipif_v6net_mask = sipif->ipif_v6net_mask;
11879 11877 dipif->ipif_v6brd_addr = sipif->ipif_v6brd_addr;
11880 11878 dipif->ipif_v6pp_dst_addr = sipif->ipif_v6pp_dst_addr;
11881 11879
11882 11880 /*
11883 11881 * As per the comment atop the function, we assume that these sipif
11884 11882 * fields will be changed before sipif is unlocked.
11885 11883 */
11886 11884 dipif->ipif_seqid = sipif->ipif_seqid;
11887 11885 dipif->ipif_state_flags = sipif->ipif_state_flags;
11888 11886 }
11889 11887
/*
 * Transfer the contents of `sipif' to `dipif', and then free (if `virgipif'
 * is NULL) or overwrite `sipif' with `virgipif', which must be a virgin
 * (unreferenced) ipif. Also, if `sipif' is used by the current xop, then
 * transfer the xop to `dipif'. Requires that all ipifs are administratively
 * down (i.e., no DAD), of the same type, and unlocked.
 */
static void
ipif_transfer(ipif_t *sipif, ipif_t *dipif, ipif_t *virgipif)
{
	ipsq_t *ipsq = sipif->ipif_ill->ill_phyint->phyint_ipsq;
	ipxop_t *ipx = ipsq->ipsq_xop;

	ASSERT(sipif != dipif);
	ASSERT(sipif != virgipif);

	/*
	 * Grab all of the locks that protect the ipif in a defined order.
	 */
	GRAB_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill);

	/* Copy sipif into dipif; if supplied, reset sipif from virgipif. */
	ipif_clone(sipif, dipif);
	if (virgipif != NULL) {
		ipif_clone(virgipif, sipif);
		mi_free(virgipif);
	}

	RELEASE_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill);

	/*
	 * Transfer ownership of the current xop, if necessary.
	 */
	if (ipx->ipx_current_ipif == sipif) {
		ASSERT(ipx->ipx_pending_ipif == NULL);
		mutex_enter(&ipx->ipx_lock);
		ipx->ipx_current_ipif = dipif;
		mutex_exit(&ipx->ipx_lock);
	}

	/* Without a virgin replacement, sipif's contents now live in dipif. */
	if (virgipif == NULL)
		mi_free(sipif);
}
11932 11930
11933 11931 /*
11934 11932 * checks if:
11935 11933 * - <ill_name>:<ipif_id> is at most LIFNAMSIZ - 1 and
11936 11934 * - logical interface is within the allowed range
11937 11935 */
11938 11936 static int
11939 11937 is_lifname_valid(ill_t *ill, unsigned int ipif_id)
11940 11938 {
11941 11939 if (snprintf(NULL, 0, "%s:%d", ill->ill_name, ipif_id) >= LIFNAMSIZ)
11942 11940 return (ENAMETOOLONG);
11943 11941
11944 11942 if (ipif_id >= ill->ill_ipst->ips_ip_addrs_per_if)
11945 11943 return (ERANGE);
11946 11944 return (0);
11947 11945 }
11948 11946
/*
 * Insert the ipif, so that the list of ipifs on the ill will be sorted
 * with respect to ipif_id. Note that an ipif with an ipif_id of -1 will
 * be inserted into the first space available in the list. The value of
 * ipif_id will then be set to the appropriate value for its position.
 *
 * Returns 0 on success, or the ENAMETOOLONG/ERANGE error from
 * is_lifname_valid() when the chosen or requested id is unusable.
 */
static int
ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock)
{
	ill_t *ill;
	ipif_t *tipif;
	ipif_t **tipifp;
	int id, err;
	ip_stack_t *ipst;

	ASSERT(ipif->ipif_ill->ill_net_type == IRE_LOOPBACK ||
	    IAM_WRITER_IPIF(ipif));

	ill = ipif->ipif_ill;
	ASSERT(ill != NULL);
	ipst = ill->ill_ipst;

	/*
	 * In the case of lo0:0 we already hold the ill_g_lock.
	 * ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate ->
	 * ipif_insert.
	 */
	if (acquire_g_lock)
		rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
	mutex_enter(&ill->ill_lock);
	id = ipif->ipif_id;
	tipifp = &(ill->ill_ipif);
	if (id == -1) {	/* need to find a real id */
		/* Scan the sorted list for the first unused id. */
		id = 0;
		while ((tipif = *tipifp) != NULL) {
			ASSERT(tipif->ipif_id >= id);
			if (tipif->ipif_id != id)
				break; /* non-consecutive id */
			id++;
			tipifp = &(tipif->ipif_next);
		}
		if ((err = is_lifname_valid(ill, id)) != 0) {
			mutex_exit(&ill->ill_lock);
			if (acquire_g_lock)
				rw_exit(&ipst->ips_ill_g_lock);
			return (err);
		}
		ipif->ipif_id = id; /* assign new id */
	} else if ((err = is_lifname_valid(ill, id)) == 0) {
		/* we have a real id; insert ipif in the right place */
		while ((tipif = *tipifp) != NULL) {
			ASSERT(tipif->ipif_id != id);
			if (tipif->ipif_id > id)
				break; /* found correct location */
			tipifp = &(tipif->ipif_next);
		}
	} else {
		mutex_exit(&ill->ill_lock);
		if (acquire_g_lock)
			rw_exit(&ipst->ips_ill_g_lock);
		return (err);
	}

	ASSERT(tipifp != &(ill->ill_ipif) || id == 0);

	/* Link the new ipif in ahead of tipif (tipif is NULL at the tail). */
	ipif->ipif_next = tipif;
	*tipifp = ipif;
	mutex_exit(&ill->ill_lock);
	if (acquire_g_lock)
		rw_exit(&ipst->ips_ill_g_lock);

	return (0);
}
12022 12020
12023 12021 static void
12024 12022 ipif_remove(ipif_t *ipif)
12025 12023 {
12026 12024 ipif_t **ipifp;
12027 12025 ill_t *ill = ipif->ipif_ill;
12028 12026
12029 12027 ASSERT(RW_WRITE_HELD(&ill->ill_ipst->ips_ill_g_lock));
12030 12028
12031 12029 mutex_enter(&ill->ill_lock);
12032 12030 ipifp = &ill->ill_ipif;
12033 12031 for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) {
12034 12032 if (*ipifp == ipif) {
12035 12033 *ipifp = ipif->ipif_next;
12036 12034 break;
12037 12035 }
12038 12036 }
12039 12037 mutex_exit(&ill->ill_lock);
12040 12038 }
12041 12039
/*
 * Allocate and initialize a new interface control structure. (Always
 * called as writer.)
 * When ipif_allocate() is called from ip_ll_subnet_defaults, the ill
 * is not part of the global linked list of ills. ipif_seqid is unique
 * in the system and to preserve the uniqueness, it is assigned only
 * when ill becomes part of the global list. At that point ill will
 * have a name. If it doesn't get assigned here, it will get assigned
 * in ipif_set_values() as part of SIOCSLIFNAME processing.
 * Additionally, if we come here from ip_ll_subnet_defaults, we don't set
 * the interface flags or any other information from the DL_INFO_ACK for
 * DL_STYLE2 drivers (initialize == B_FALSE), since we won't have them at
 * this point. The flags etc. will be set in ip_ll_subnet_defaults when the
 * second DL_INFO_ACK comes in from the driver.
 *
 * Returns the new ipif, or NULL on failure with the errno (ENOMEM or an
 * ipif_insert() error) stored through `errorp' when it is non-NULL.
 */
static ipif_t *
ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize,
    boolean_t insert, int *errorp)
{
	int err;
	ipif_t *ipif;
	ip_stack_t *ipst = ill->ill_ipst;

	ip1dbg(("ipif_allocate(%s:%d ill %p)\n",
	    ill->ill_name, id, (void *)ill));
	ASSERT(ire_type == IRE_LOOPBACK || IAM_WRITER_ILL(ill));

	if (errorp != NULL)
		*errorp = 0;

	if ((ipif = mi_alloc(sizeof (ipif_t), BPRI_MED)) == NULL) {
		if (errorp != NULL)
			*errorp = ENOMEM;
		return (NULL);
	}
	*ipif = ipif_zero;	/* start clean */

	ipif->ipif_ill = ill;
	ipif->ipif_id = id;	/* could be -1 */
	/*
	 * Inherit the zoneid from the ill; for the shared stack instance
	 * this is always the global zone
	 */
	ipif->ipif_zoneid = ill->ill_zoneid;

	ipif->ipif_refcnt = 0;

	if (insert) {
		if ((err = ipif_insert(ipif, ire_type != IRE_LOOPBACK)) != 0) {
			mi_free(ipif);
			if (errorp != NULL)
				*errorp = err;
			return (NULL);
		}
		/* -1 id should have been replaced by real id */
		id = ipif->ipif_id;
		ASSERT(id >= 0);
	}

	if (ill->ill_name[0] != '\0')
		ipif_assign_seqid(ipif);

	/*
	 * If this is the zeroth ipif on the IPMP ill, create the illgrp
	 * (which must not exist yet because the zeroth ipif is created once
	 * per ill). However, do not link it to the ipmp_grp_t until
	 * I_PLINK is called; see ip_sioctl_plink_ipmp() for details.
	 */
	if (id == 0 && IS_IPMP(ill)) {
		if (ipmp_illgrp_create(ill) == NULL) {
			/* Undo the insertion before bailing out. */
			if (insert) {
				rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
				ipif_remove(ipif);
				rw_exit(&ipst->ips_ill_g_lock);
			}
			mi_free(ipif);
			if (errorp != NULL)
				*errorp = ENOMEM;
			return (NULL);
		}
	}

	/*
	 * We grab ill_lock to protect the flag changes. The ipif is still
	 * not up and can't be looked up until the ioctl completes and the
	 * IPIF_CHANGING flag is cleared.
	 */
	mutex_enter(&ill->ill_lock);

	ipif->ipif_ire_type = ire_type;

	if (ipif->ipif_isv6) {
		ill->ill_flags |= ILLF_IPV6;
	} else {
		ipaddr_t inaddr_any = INADDR_ANY;

		ill->ill_flags |= ILLF_IPV4;

		/* Keep the IN6_IS_ADDR_V4MAPPED assertions happy */
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &ipif->ipif_v6lcl_addr);
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &ipif->ipif_v6subnet);
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &ipif->ipif_v6net_mask);
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &ipif->ipif_v6brd_addr);
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &ipif->ipif_v6pp_dst_addr);
	}

	/*
	 * Don't set the interface flags etc. now, will do it in
	 * ip_ll_subnet_defaults.
	 */
	if (!initialize)
		goto out;

	/*
	 * NOTE: The IPMP meta-interface is special-cased because it starts
	 * with no underlying interfaces (and thus an unknown broadcast
	 * address length), but all interfaces that can be placed into an IPMP
	 * group are required to be broadcast-capable.
	 */
	if (ill->ill_bcast_addr_length != 0 || IS_IPMP(ill)) {
		/*
		 * Later detect lack of DLPI driver multicast capability by
		 * catching DL_ENABMULTI_REQ errors in ip_rput_dlpi().
		 */
		ill->ill_flags |= ILLF_MULTICAST;
		if (!ipif->ipif_isv6)
			ipif->ipif_flags |= IPIF_BROADCAST;
	} else {
		if (ill->ill_net_type != IRE_LOOPBACK) {
			if (ipif->ipif_isv6)
				/*
				 * Note: xresolv interfaces will eventually need
				 * NOARP set here as well, but that will require
				 * those external resolvers to have some
				 * knowledge of that flag and act appropriately.
				 * Not to be changed at present.
				 */
				ill->ill_flags |= ILLF_NONUD;
			else
				ill->ill_flags |= ILLF_NOARP;
		}
		if (ill->ill_phys_addr_length == 0) {
			if (IS_VNI(ill)) {
				ipif->ipif_flags |= IPIF_NOXMIT;
			} else {
				/* pt-pt supports multicast. */
				ill->ill_flags |= ILLF_MULTICAST;
				if (ill->ill_net_type != IRE_LOOPBACK)
					ipif->ipif_flags |= IPIF_POINTOPOINT;
			}
		}
	}
out:
	mutex_exit(&ill->ill_lock);
	return (ipif);
}
12203 12201
/*
 * Remove the neighbor cache entries associated with this logical
 * interface.
 * Returns 0, or the error from arp_ll_down() when a full ARP teardown
 * is required and fails.
 */
int
ipif_arp_down(ipif_t *ipif)
{
	ill_t *ill = ipif->ipif_ill;
	int err = 0;

	ip1dbg(("ipif_arp_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
	ASSERT(IAM_WRITER_IPIF(ipif));

	DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_down",
	    ill_t *, ill, ipif_t *, ipif);
	ipif_nce_down(ipif);

	/*
	 * If this is the last ipif that is going down and there are no
	 * duplicate addresses we may yet attempt to re-probe, then we need to
	 * clean up ARP completely.
	 */
	if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 &&
	    !ill->ill_logical_down && ill->ill_net_type == IRE_IF_RESOLVER) {
		/*
		 * If this was the last ipif on an IPMP interface, purge any
		 * static ARP entries associated with it.
		 */
		if (IS_IPMP(ill))
			ipmp_illgrp_refresh_arpent(ill->ill_grp);

		/* UNBIND, DETACH */
		err = arp_ll_down(ill);
	}

	return (err);
}
12241 12239
/*
 * Get the resolver set up for a new IP address. (Always called as writer.)
 * Called both for IPv4 and IPv6 interfaces, though it only does some
 * basic DAD related initialization for IPv6. Honors ILLF_NOARP.
 *
 * The enumerated value res_act tunes the behavior:
 *	* Res_act_initial: set up all the resolver structures for a new
 *	  IP address.
 *	* Res_act_defend: tell ARP that it needs to send a single gratuitous
 *	  ARP message in defense of the address.
 *	* Res_act_rebind: tell ARP to change the hardware address for an IP
 *	  address (and issue gratuitous ARPs). Used by ipmp_ill_bind_ipif().
 *
 * Returns zero on success, or an errno upon failure.
 */
int
ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act)
{
	ill_t *ill = ipif->ipif_ill;
	int err;
	boolean_t was_dup;

	ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n",
	    ill->ill_name, ipif->ipif_id, (uint_t)ipif->ipif_flags));
	ASSERT(IAM_WRITER_IPIF(ipif));

	was_dup = B_FALSE;
	if (res_act == Res_act_initial) {
		ipif->ipif_addr_ready = 0;
		/*
		 * We're bringing an interface up here. There's no way that we
		 * should need to shut down ARP now.
		 */
		mutex_enter(&ill->ill_lock);
		if (ipif->ipif_flags & IPIF_DUPLICATE) {
			/* Clear the stale duplicate-address state. */
			ipif->ipif_flags &= ~IPIF_DUPLICATE;
			ill->ill_ipif_dup_count--;
			was_dup = B_TRUE;
		}
		mutex_exit(&ill->ill_lock);
	}
	/* Cancel any outstanding duplicate-address recovery timeout. */
	if (ipif->ipif_recovery_id != 0)
		(void) untimeout(ipif->ipif_recovery_id);
	ipif->ipif_recovery_id = 0;
	if (ill->ill_net_type != IRE_IF_RESOLVER) {
		/* Non-resolver links need no further resolver setup. */
		ipif->ipif_addr_ready = 1;
		return (0);
	}
	/* NDP will set the ipif_addr_ready flag when it's ready */
	if (ill->ill_isv6)
		return (0);

	err = ipif_arp_up(ipif, res_act, was_dup);
	return (err);
}
12297 12295
/*
 * This routine restarts IPv4/IPv6 duplicate address detection (DAD)
 * when a link has just gone back up.
 */
static void
ipif_nce_start_dad(ipif_t *ipif)
{
	ncec_t *ncec;
	ill_t *ill = ipif->ipif_ill;
	boolean_t isv6 = ill->ill_isv6;

	if (isv6) {
		ncec = ncec_lookup_illgrp_v6(ipif->ipif_ill,
		    &ipif->ipif_v6lcl_addr);
	} else {
		ipaddr_t v4addr;

		if (ill->ill_net_type != IRE_IF_RESOLVER ||
		    (ipif->ipif_flags & IPIF_UNNUMBERED) ||
		    ipif->ipif_lcl_addr == INADDR_ANY) {
			/*
			 * If we can't contact ARP for some reason,
			 * that's not really a problem. Just send
			 * out the routing socket notification that
			 * DAD completion would have done, and continue.
			 */
			ipif_mask_reply(ipif);
			ipif_up_notify(ipif);
			ipif->ipif_addr_ready = 1;
			return;
		}

		IN6_V4MAPPED_TO_IPADDR(&ipif->ipif_v6lcl_addr, v4addr);
		ncec = ncec_lookup_illgrp_v4(ipif->ipif_ill, &v4addr);
	}

	if (ncec == NULL) {
		ip1dbg(("couldn't find ncec for ipif %p leaving !ready\n",
		    (void *)ipif));
		return;
	}
	if (!nce_restart_dad(ncec)) {
		/*
		 * If we can't restart DAD for some reason, that's not really a
		 * problem. Just send out the routing socket notification that
		 * DAD completion would have done, and continue.
		 */
		ipif_up_notify(ipif);
		ipif->ipif_addr_ready = 1;
	}
	/* Drop the hold acquired by the ncec lookup above. */
	ncec_refrele(ncec);
}
12350 12348
/*
 * Restart duplicate address detection on all interfaces on the given ill.
 *
 * This is called when an interface transitions from down to up
 * (DL_NOTE_LINK_UP) or up to down (DL_NOTE_LINK_DOWN).
 *
 * Note that since the underlying physical link has transitioned, we must cause
 * at least one routing socket message to be sent here, either via DAD
 * completion or just by default on the first ipif. (If we don't do this, then
 * in.mpathd will see long delays when doing link-based failure recovery.)
 */
void
ill_restart_dad(ill_t *ill, boolean_t went_up)
{
	ipif_t *ipif;

	if (ill == NULL)
		return;

	/*
	 * If layer two doesn't support duplicate address detection, then just
	 * send the routing socket message now and be done with it.
	 */
	if (!ill->ill_isv6 && arp_no_defense) {
		ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
		return;
	}

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if (went_up) {

			if (ipif->ipif_flags & IPIF_UP) {
				/* Already up: just re-probe the address. */
				ipif_nce_start_dad(ipif);
			} else if (ipif->ipif_flags & IPIF_DUPLICATE) {
				/*
				 * kick off the bring-up process now.
				 */
				ipif_do_recovery(ipif);
			} else {
				/*
				 * Unfortunately, the first ipif is "special"
				 * and represents the underlying ill in the
				 * routing socket messages. Thus, when this
				 * one ipif is down, we must still notify so
				 * that the user knows the IFF_RUNNING status
				 * change. (If the first ipif is up, then
				 * we'll handle eventual routing socket
				 * notification via DAD completion.)
				 */
				if (ipif == ill->ill_ipif) {
					ip_rts_ifmsg(ill->ill_ipif,
					    RTSQ_DEFAULT);
				}
			}
		} else {
			/*
			 * After link down, we'll need to send a new routing
			 * message when the link comes back, so clear
			 * ipif_addr_ready.
			 */
			ipif->ipif_addr_ready = 0;
		}
	}

	/*
	 * If we've torn down links, then notify the user right away.
	 */
	if (!went_up)
		ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
}
12421 12419
12422 12420 static void
12423 12421 ipsq_delete(ipsq_t *ipsq)
12424 12422 {
12425 12423 ipxop_t *ipx = ipsq->ipsq_xop;
12426 12424
12427 12425 ipsq->ipsq_ipst = NULL;
12428 12426 ASSERT(ipsq->ipsq_phyint == NULL);
12429 12427 ASSERT(ipsq->ipsq_xop != NULL);
12430 12428 ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipx->ipx_mphead == NULL);
12431 12429 ASSERT(ipx->ipx_pending_mp == NULL);
12432 12430 kmem_free(ipsq, sizeof (ipsq_t));
12433 12431 }
12434 12432
/*
 * Bring back up, on `ill', every ipif that was up before the ill was
 * brought down (recorded via ipif_was_up in ill_down_ipifs()).  Returns 0
 * on success, or EINPROGRESS if an ipif_up() must complete asynchronously.
 * Note that on the EINPROGRESS path ill_up_ipifs remains B_TRUE and
 * ipif_was_up has already been cleared for the ipif being brought up.
 */
static int
ill_up_ipifs_on_ill(ill_t *ill, queue_t *q, mblk_t *mp)
{
	int err = 0;
	ipif_t *ipif;

	if (ill == NULL)
		return (0);

	ASSERT(IAM_WRITER_ILL(ill));
	ill->ill_up_ipifs = B_TRUE;
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if (ipif->ipif_was_up) {
			if (!(ipif->ipif_flags & IPIF_UP))
				err = ipif_up(ipif, q, mp);
			ipif->ipif_was_up = B_FALSE;
			if (err != 0) {
				/*
				 * Only asynchronous completion is expected
				 * here; the operation will be restarted later.
				 */
				ASSERT(err == EINPROGRESS);
				return (err);
			}
		}
	}
	ill->ill_up_ipifs = B_FALSE;
	return (0);
}
12460 12458
12461 12459 /*
12462 12460 * This function is called to bring up all the ipifs that were up before
12463 12461 * bringing the ill down via ill_down_ipifs().
12464 12462 */
int
ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp)
{
	int err;

	ASSERT(IAM_WRITER_ILL(ill));

	if (ill->ill_replumbing) {
		ill->ill_replumbing = 0;
		/*
		 * Send down REPLUMB_DONE notification followed by the
		 * BIND_REQ on the arp stream.
		 */
		if (!ill->ill_isv6)
			arp_send_replumb_conf(ill);
	}
	/*
	 * Bring up the v4 ipifs first, then the v6 ones; either call may
	 * return EINPROGRESS, in which case the caller retries later.
	 */
	err = ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv4, q, mp);
	if (err != 0)
		return (err);

	return (ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv6, q, mp));
}
12487 12485
12488 12486 /*
12489 12487 * Bring down any IPIF_UP ipifs on ill. If "logical" is B_TRUE, we bring
12490 12488 * down the ipifs without sending DL_UNBIND_REQ to the driver.
12491 12489 */
static void
ill_down_ipifs(ill_t *ill, boolean_t logical)
{
	ipif_t *ipif;

	ASSERT(IAM_WRITER_ILL(ill));

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		/*
		 * We go through the ipif_down logic even if the ipif
		 * is already down, since routes can be added based
		 * on down ipifs. Going through ipif_down once again
		 * will delete any IREs created based on these routes.
		 */
		if (ipif->ipif_flags & IPIF_UP)
			ipif->ipif_was_up = B_TRUE;	/* for ill_up_ipifs() */

		if (logical) {
			/* Logical-only: skip the DL_UNBIND_REQ to the driver */
			(void) ipif_logical_down(ipif, NULL, NULL);
			ipif_non_duplicate(ipif);
			(void) ipif_down_tail(ipif);
		} else {
			(void) ipif_down(ipif, NULL, NULL);
		}
	}
}
12518 12516
12519 12517 /*
12520 12518 * Redo source address selection. This makes IXAF_VERIFY_SOURCE take
12521 12519 * a look again at valid source addresses.
12522 12520 * This should be called each time after the set of source addresses has been
12523 12521 * changed.
12524 12522 */
void
ip_update_source_selection(ip_stack_t *ipst)
{
	/*
	 * We skip past SRC_GENERATION_VERIFY: it is a reserved sentinel
	 * value, so if the increment lands on it, bump the generation once
	 * more so consumers never observe the sentinel as a real generation.
	 */
	if (atomic_inc_32_nv(&ipst->ips_src_generation) ==
	    SRC_GENERATION_VERIFY)
		atomic_inc_32(&ipst->ips_src_generation);
}
12533 12531
12534 12532 /*
12535 12533 * Finish the group join started in ip_sioctl_groupname().
12536 12534 */
12537 12535 /* ARGSUSED */
static void
ip_join_illgrps(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy)
{
	ill_t		*ill = q->q_ptr;
	phyint_t	*phyi = ill->ill_phyint;
	ipmp_grp_t	*grp = phyi->phyint_grp;
	ip_stack_t	*ipst = ill->ill_ipst;

	/* IS_UNDER_IPMP() won't work until ipmp_ill_join_illgrp() is called */
	ASSERT(!IS_IPMP(ill) && grp != NULL);
	ASSERT(IAM_WRITER_IPSQ(ipsq));

	/*
	 * Drop the gr_pend* counts that ip_sioctl_groupname() bumped to keep
	 * the IPMP meta-interface ills alive until we got here.  VERIFY
	 * (unlike ASSERT) is compiled into non-DEBUG kernels, so the
	 * embedded decrement always executes.
	 */
	if (phyi->phyint_illv4 != NULL) {
		rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
		VERIFY(grp->gr_pendv4-- > 0);
		rw_exit(&ipst->ips_ipmp_lock);
		ipmp_ill_join_illgrp(phyi->phyint_illv4, grp->gr_v4);
	}
	if (phyi->phyint_illv6 != NULL) {
		rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
		VERIFY(grp->gr_pendv6-- > 0);
		rw_exit(&ipst->ips_ipmp_lock);
		ipmp_ill_join_illgrp(phyi->phyint_illv6, grp->gr_v6);
	}
	freemsg(mp);
}
12564 12562
12565 12563 /*
12566 12564 * Process an SIOCSLIFGROUPNAME request.
12567 12565 */
12568 12566 /* ARGSUSED */
int
ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	struct lifreq	*lifr = ifreq;
	ill_t		*ill = ipif->ipif_ill;
	ip_stack_t	*ipst = ill->ill_ipst;
	phyint_t	*phyi = ill->ill_phyint;
	ipmp_grp_t	*grp = phyi->phyint_grp;
	mblk_t		*ipsq_mp;
	int		err = 0;

	/*
	 * Note that phyint_grp can only change here, where we're exclusive.
	 */
	ASSERT(IAM_WRITER_ILL(ill));

	/* Only ipif 0 of a non-virtual, non-usesrc interface may be grouped */
	if (ipif->ipif_id != 0 || ill->ill_usesrc_grp_next != NULL ||
	    (phyi->phyint_flags & PHYI_VIRTUAL))
		return (EINVAL);

	/* Defensively NUL-terminate the caller-supplied group name */
	lifr->lifr_groupname[LIFGRNAMSIZ - 1] = '\0';

	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);

	/*
	 * If the name hasn't changed, there's nothing to do.
	 */
	if (grp != NULL && strcmp(grp->gr_name, lifr->lifr_groupname) == 0)
		goto unlock;

	/*
	 * Handle requests to rename an IPMP meta-interface.
	 *
	 * Note that creation of the IPMP meta-interface is handled in
	 * userland through the standard plumbing sequence.  As part of the
	 * plumbing the IPMP meta-interface, its initial groupname is set to
	 * the name of the interface (see ipif_set_values_tail()).
	 */
	if (IS_IPMP(ill)) {
		err = ipmp_grp_rename(grp, lifr->lifr_groupname);
		goto unlock;
	}

	/*
	 * Handle requests to add or remove an IP interface from a group.
	 */
	if (lifr->lifr_groupname[0] != '\0') {			/* add */
		/*
		 * Moves are handled by first removing the interface from
		 * its existing group, and then adding it to another group.
		 * So, fail if it's already in a group.
		 */
		if (IS_UNDER_IPMP(ill)) {
			err = EALREADY;
			goto unlock;
		}

		grp = ipmp_grp_lookup(lifr->lifr_groupname, ipst);
		if (grp == NULL) {
			err = ENOENT;
			goto unlock;
		}

		/*
		 * Check if the phyint and its ills are suitable for
		 * inclusion into the group.
		 */
		if ((err = ipmp_grp_vet_phyint(grp, phyi)) != 0)
			goto unlock;

		/*
		 * Checks pass; join the group, and enqueue the remaining
		 * illgrp joins for when we've become part of the group xop
		 * and are exclusive across its IPSQs.  Since qwriter_ip()
		 * requires an mblk_t to scribble on, and since `mp' will be
		 * freed as part of completing the ioctl, allocate another.
		 */
		if ((ipsq_mp = allocb(0, BPRI_MED)) == NULL) {
			err = ENOMEM;
			goto unlock;
		}

		/*
		 * Before we drop ipmp_lock, bump gr_pend* to ensure that the
		 * IPMP meta-interface ills needed by `phyi' cannot go away
		 * before ip_join_illgrps() is called back.  See the comments
		 * in ip_sioctl_plink_ipmp() for more.
		 */
		if (phyi->phyint_illv4 != NULL)
			grp->gr_pendv4++;
		if (phyi->phyint_illv6 != NULL)
			grp->gr_pendv6++;

		rw_exit(&ipst->ips_ipmp_lock);

		ipmp_phyint_join_grp(phyi, grp);
		ill_refhold(ill);
		/* qwriter_ip() does the ill_refrele() */
		qwriter_ip(ill, ill->ill_rq, ipsq_mp, ip_join_illgrps,
		    SWITCH_OP, B_FALSE);
		return (0);
	} else {
		/*
		 * Request to remove the interface from a group.  If the
		 * interface is not in a group, this trivially succeeds.
		 */
		rw_exit(&ipst->ips_ipmp_lock);
		if (IS_UNDER_IPMP(ill))
			ipmp_phyint_leave_grp(phyi);
		return (0);
	}
unlock:
	rw_exit(&ipst->ips_ipmp_lock);
	return (err);
}
12684 12682
12685 12683 /*
12686 12684 * Process an SIOCGLIFBINDING request.
12687 12685 */
12688 12686 /* ARGSUSED */
12689 12687 int
12690 12688 ip_sioctl_get_binding(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12691 12689 ip_ioctl_cmd_t *ipip, void *ifreq)
12692 12690 {
12693 12691 ill_t *ill;
12694 12692 struct lifreq *lifr = ifreq;
12695 12693 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
12696 12694
12697 12695 if (!IS_IPMP(ipif->ipif_ill))
12698 12696 return (EINVAL);
12699 12697
12700 12698 rw_enter(&ipst->ips_ipmp_lock, RW_READER);
12701 12699 if ((ill = ipif->ipif_bound_ill) == NULL)
12702 12700 lifr->lifr_binding[0] = '\0';
12703 12701 else
12704 12702 (void) strlcpy(lifr->lifr_binding, ill->ill_name, LIFNAMSIZ);
12705 12703 rw_exit(&ipst->ips_ipmp_lock);
12706 12704 return (0);
12707 12705 }
12708 12706
12709 12707 /*
12710 12708 * Process an SIOCGLIFGROUPNAME request.
12711 12709 */
12712 12710 /* ARGSUSED */
12713 12711 int
12714 12712 ip_sioctl_get_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12715 12713 ip_ioctl_cmd_t *ipip, void *ifreq)
12716 12714 {
12717 12715 ipmp_grp_t *grp;
12718 12716 struct lifreq *lifr = ifreq;
12719 12717 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
12720 12718
12721 12719 rw_enter(&ipst->ips_ipmp_lock, RW_READER);
12722 12720 if ((grp = ipif->ipif_ill->ill_phyint->phyint_grp) == NULL)
12723 12721 lifr->lifr_groupname[0] = '\0';
12724 12722 else
12725 12723 (void) strlcpy(lifr->lifr_groupname, grp->gr_name, LIFGRNAMSIZ);
12726 12724 rw_exit(&ipst->ips_ipmp_lock);
12727 12725 return (0);
12728 12726 }
12729 12727
12730 12728 /*
12731 12729 * Process an SIOCGLIFGROUPINFO request.
12732 12730 */
12733 12731 /* ARGSUSED */
12734 12732 int
12735 12733 ip_sioctl_groupinfo(ipif_t *dummy_ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12736 12734 ip_ioctl_cmd_t *ipip, void *dummy)
12737 12735 {
12738 12736 ipmp_grp_t *grp;
12739 12737 lifgroupinfo_t *lifgr;
12740 12738 ip_stack_t *ipst = CONNQ_TO_IPST(q);
12741 12739
12742 12740 /* ip_wput_nondata() verified mp->b_cont->b_cont */
12743 12741 lifgr = (lifgroupinfo_t *)mp->b_cont->b_cont->b_rptr;
12744 12742 lifgr->gi_grname[LIFGRNAMSIZ - 1] = '\0';
12745 12743
12746 12744 rw_enter(&ipst->ips_ipmp_lock, RW_READER);
12747 12745 if ((grp = ipmp_grp_lookup(lifgr->gi_grname, ipst)) == NULL) {
12748 12746 rw_exit(&ipst->ips_ipmp_lock);
12749 12747 return (ENOENT);
12750 12748 }
12751 12749 ipmp_grp_info(grp, lifgr);
12752 12750 rw_exit(&ipst->ips_ipmp_lock);
12753 12751 return (0);
12754 12752 }
12755 12753
/*
 * Take the data link down: disable DLPI capabilities, send the queued
 * DL_UNBIND_REQ, and wait for the capability teardown to complete.
 */
static void
ill_dl_down(ill_t *ill)
{
	DTRACE_PROBE2(ill__downup, char *, "ill_dl_down", ill_t *, ill);

	/*
	 * The ill is down; unbind but stay attached since we're still
	 * associated with a PPA. If we have negotiated DLPI capabilites
	 * with the data link service provider (IDS_OK) then reset them.
	 * The interval between unbinding and rebinding is potentially
	 * unbounded hence we cannot assume things will be the same.
	 * The DLPI capabilities will be probed again when the data link
	 * is brought up.
	 */
	mblk_t	*mp = ill->ill_unbind_mp;

	ip1dbg(("ill_dl_down(%s)\n", ill->ill_name));

	if (!ill->ill_replumbing) {
		/* Free all ilms for this ill */
		update_conn_ill(ill, ill->ill_ipst);
	} else {
		ill_leave_multicast(ill);
	}

	/* Consume ill_unbind_mp; it is sent down (once) below */
	ill->ill_unbind_mp = NULL;

	mutex_enter(&ill->ill_lock);
	ill->ill_dl_up = 0;
	ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0);
	mutex_exit(&ill->ill_lock);

	if (mp != NULL) {
		ip1dbg(("ill_dl_down: %s (%u) for %s\n",
		    dl_primstr(*(int *)mp->b_rptr), *(int *)mp->b_rptr,
		    ill->ill_name));
		mutex_enter(&ill->ill_lock);
		ill->ill_state_flags |= ILL_DL_UNBIND_IN_PROGRESS;
		mutex_exit(&ill->ill_lock);
		/*
		 * ip_rput does not pass up normal (M_PROTO) DLPI messages
		 * after ILL_CONDEMNED is set. So in the unplumb case, we call
		 * ill_capability_dld_disable disable rightaway. If this is not
		 * an unplumb operation then the disable happens on receipt of
		 * the capab ack via ip_rput_dlpi_writer ->
		 * ill_capability_ack_thr. In both cases the order of
		 * the operations seen by DLD is capability disable followed
		 * by DL_UNBIND. Also the DLD capability disable needs a
		 * cv_wait'able context.
		 */
		if (ill->ill_state_flags & ILL_CONDEMNED)
			ill_capability_dld_disable(ill);
		ill_capability_reset(ill, B_FALSE);
		ill_dlpi_send(ill, mp);

		/*
		 * Wait for the capability reset to finish.
		 * In this case, it doesn't matter WHY or HOW it finished.
		 */
		(void) ill_capability_wait(ill);
	}
}
12818 12816
/*
 * Actually send a DLPI control message down to the driver, recording it as
 * the pending primitive (ill_dlpi_pending) when an ACK/NAK is expected so
 * that ill_dlpi_done() can start the next queued message.
 */
void
ill_dlpi_dispatch(ill_t *ill, mblk_t *mp)
{
	union DL_primitives *dlp;
	t_uscalar_t prim;
	boolean_t waitack = B_FALSE;

	ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);

	dlp = (union DL_primitives *)mp->b_rptr;
	prim = dlp->dl_primitive;

	ip1dbg(("ill_dlpi_dispatch: sending %s (%u) to %s\n",
	    dl_primstr(prim), prim, ill->ill_name));

	switch (prim) {
	case DL_PHYS_ADDR_REQ:
	{
		/* Remember which address type was requested, for the ACK */
		dl_phys_addr_req_t *dlpap = (dl_phys_addr_req_t *)mp->b_rptr;
		ill->ill_phys_addr_pend = dlpap->dl_addr_type;
		break;
	}
	case DL_BIND_REQ:
		mutex_enter(&ill->ill_lock);
		ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS;
		mutex_exit(&ill->ill_lock);
		break;
	}

	/*
	 * Except for the ACKs for the M_PCPROTO messages, all other ACKs
	 * are dropped by ip_rput() if ILL_CONDEMNED is set. Therefore
	 * we only wait for the ACK of the DL_UNBIND_REQ.
	 */
	mutex_enter(&ill->ill_lock);
	if (!(ill->ill_state_flags & ILL_CONDEMNED) ||
	    (prim == DL_UNBIND_REQ)) {
		ill->ill_dlpi_pending = prim;
		waitack = B_TRUE;
	}

	mutex_exit(&ill->ill_lock);
	DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_dispatch",
	    char *, dl_primstr(prim), ill_t *, ill);
	putnext(ill->ill_wq, mp);

	/*
	 * There is no ack for DL_NOTIFY_CONF messages
	 */
	if (waitack && prim == DL_NOTIFY_CONF)
		ill_dlpi_done(ill, prim);
}
12871 12869
12872 12870 /*
12873 12871 * Helper function for ill_dlpi_send().
12874 12872 */
12875 12873 /* ARGSUSED */
12876 12874 static void
12877 12875 ill_dlpi_send_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
12878 12876 {
12879 12877 ill_dlpi_send(q->q_ptr, mp);
12880 12878 }
12881 12879
12882 12880 /*
12883 12881 * Send a DLPI control message to the driver but make sure there
12884 12882 * is only one outstanding message. Uses ill_dlpi_pending to tell
12885 12883 * when it must queue. ip_rput_dlpi_writer calls ill_dlpi_done()
12886 12884 * when an ACK or a NAK is received to process the next queued message.
12887 12885 */
void
ill_dlpi_send(ill_t *ill, mblk_t *mp)
{
	mblk_t **mpp;

	ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);

	/*
	 * To ensure that any DLPI requests for current exclusive operation
	 * are always completely sent before any DLPI messages for other
	 * operations, require writer access before enqueuing.
	 */
	if (!IAM_WRITER_ILL(ill)) {
		ill_refhold(ill);
		/* qwriter_ip() does the ill_refrele() */
		qwriter_ip(ill, ill->ill_wq, mp, ill_dlpi_send_writer,
		    NEW_OP, B_TRUE);
		return;
	}

	mutex_enter(&ill->ill_lock);
	if (ill->ill_dlpi_pending != DL_PRIM_INVAL) {
		/* Must queue message. Tail insertion */
		mpp = &ill->ill_dlpi_deferred;
		while (*mpp != NULL)
			mpp = &((*mpp)->b_next);

		ip1dbg(("ill_dlpi_send: deferring request for %s "
		    "while %s pending\n", ill->ill_name,
		    dl_primstr(ill->ill_dlpi_pending)));

		*mpp = mp;
		mutex_exit(&ill->ill_lock);
		return;
	}
	mutex_exit(&ill->ill_lock);
	/* Nothing pending; send it down now */
	ill_dlpi_dispatch(ill, mp);
}
12926 12924
/*
 * Send a DLPI capability message to the driver, counting it in
 * ill_capab_pending_cnt so ill_capability_done() can tell when the last
 * outstanding capability exchange has completed.
 */
void
ill_capability_send(ill_t *ill, mblk_t *mp)
{
	ill->ill_capab_pending_cnt++;
	ill_dlpi_send(ill, mp);
}
12933 12931
/*
 * Complete one outstanding capability exchange (counted by
 * ill_capability_send()).  Finishes the DL_CAPABILITY_REQ DLPI operation
 * and, once the last exchange has completed with the capabilities
 * negotiated OK (IDCS_OK), calls ill_capability_reset_alloc() --
 * presumably to pre-allocate the reset message; confirm against its
 * definition.
 */
void
ill_capability_done(ill_t *ill)
{
	ASSERT(ill->ill_capab_pending_cnt != 0);
	ASSERT(IAM_WRITER_ILL(ill));

	ill_dlpi_done(ill, DL_CAPABILITY_REQ);

	ill->ill_capab_pending_cnt--;
	if (ill->ill_capab_pending_cnt == 0 &&
	    ill->ill_dlpi_capab_state == IDCS_OK)
		ill_capability_reset_alloc(ill);
}
12950 12945
12951 12946 /*
12952 12947 * Send all deferred DLPI messages without waiting for their ACKs.
12953 12948 */
void
ill_dlpi_send_deferred(ill_t *ill)
{
	mblk_t *mp, *nextmp;

	/*
	 * Clear ill_dlpi_pending so that the message is not queued in
	 * ill_dlpi_send().
	 */
	mutex_enter(&ill->ill_lock);
	ill->ill_dlpi_pending = DL_PRIM_INVAL;
	mp = ill->ill_dlpi_deferred;
	ill->ill_dlpi_deferred = NULL;
	mutex_exit(&ill->ill_lock);

	/*
	 * Re-send each message via ill_dlpi_send(); note that messages
	 * after the first may simply be re-queued there if the first one
	 * sets ill_dlpi_pending again.
	 */
	for (; mp != NULL; mp = nextmp) {
		nextmp = mp->b_next;
		mp->b_next = NULL;
		ill_dlpi_send(ill, mp);
	}
}
12975 12970
12976 12971 /*
12977 12972 * Clear all the deferred DLPI messages. Called on receiving an M_ERROR
12978 12973 * or M_HANGUP
12979 12974 */
12980 12975 static void
12981 12976 ill_dlpi_clear_deferred(ill_t *ill)
12982 12977 {
12983 12978 mblk_t *mp, *nextmp;
12984 12979
12985 12980 mutex_enter(&ill->ill_lock);
12986 12981 ill->ill_dlpi_pending = DL_PRIM_INVAL;
12987 12982 mp = ill->ill_dlpi_deferred;
12988 12983 ill->ill_dlpi_deferred = NULL;
12989 12984 mutex_exit(&ill->ill_lock);
12990 12985
12991 12986 for (; mp != NULL; mp = nextmp) {
12992 12987 nextmp = mp->b_next;
12993 12988 inet_freemsg(mp);
12994 12989 }
12995 12990 }
12996 12991
12997 12992 /*
12998 12993 * Check if the DLPI primitive `prim' is pending; print a warning if not.
12999 12994 */
boolean_t
ill_dlpi_pending(ill_t *ill, t_uscalar_t prim)
{
	t_uscalar_t pending;

	mutex_enter(&ill->ill_lock);
	if (ill->ill_dlpi_pending == prim) {
		mutex_exit(&ill->ill_lock);
		return (B_TRUE);
	}

	/*
	 * During teardown, ill_dlpi_dispatch() will send DLPI requests
	 * without waiting, so don't print any warnings in that case.
	 */
	if (ill->ill_state_flags & ILL_CONDEMNED) {
		mutex_exit(&ill->ill_lock);
		return (B_FALSE);
	}
	/* Snapshot under the lock; log after dropping it */
	pending = ill->ill_dlpi_pending;
	mutex_exit(&ill->ill_lock);

	if (pending == DL_PRIM_INVAL) {
		(void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE,
		    "received unsolicited ack for %s on %s\n",
		    dl_primstr(prim), ill->ill_name);
	} else {
		(void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE,
		    "received unexpected ack for %s on %s (expecting %s)\n",
		    dl_primstr(prim), ill->ill_name, dl_primstr(pending));
	}
	return (B_FALSE);
}
13033 13028
13034 13029 /*
13035 13030 * Complete the current DLPI operation associated with `prim' on `ill' and
13036 13031 * start the next queued DLPI operation (if any). If there are no queued DLPI
13037 13032 * operations and the ill's current exclusive IPSQ operation has finished
13038 13033 * (i.e., ipsq_current_finish() was called), then clear ipsq_current_ipif to
13039 13034 * allow the next exclusive IPSQ operation to begin upon ipsq_exit(). See
13040 13035 * the comments above ipsq_current_finish() for details.
13041 13036 */
void
ill_dlpi_done(ill_t *ill, t_uscalar_t prim)
{
	mblk_t *mp;
	ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
	ipxop_t *ipx = ipsq->ipsq_xop;

	ASSERT(IAM_WRITER_IPSQ(ipsq));
	mutex_enter(&ill->ill_lock);

	ASSERT(prim != DL_PRIM_INVAL);
	ASSERT(ill->ill_dlpi_pending == prim);

	ip1dbg(("ill_dlpi_done: %s has completed %s (%u)\n", ill->ill_name,
	    dl_primstr(ill->ill_dlpi_pending), ill->ill_dlpi_pending));

	if ((mp = ill->ill_dlpi_deferred) == NULL) {
		/* No queued DLPI work; allow the next IPSQ op to proceed */
		ill->ill_dlpi_pending = DL_PRIM_INVAL;
		if (ipx->ipx_current_done) {
			mutex_enter(&ipx->ipx_lock);
			ipx->ipx_current_ipif = NULL;
			mutex_exit(&ipx->ipx_lock);
		}
		/* Wake any thread waiting for DLPI quiescence on this ill */
		cv_signal(&ill->ill_cv);
		mutex_exit(&ill->ill_lock);
		return;
	}

	/* Dequeue the next deferred message and send it down */
	ill->ill_dlpi_deferred = mp->b_next;
	mp->b_next = NULL;
	mutex_exit(&ill->ill_lock);

	ill_dlpi_dispatch(ill, mp);
}
13076 13071
13077 13072 /*
13078 13073 * Queue a (multicast) DLPI control message to be sent to the driver by
13079 13074 * later calling ill_dlpi_send_queued.
13080 13075 * We queue them while holding a lock (ill_mcast_lock) to ensure that they
13081 13076 * are sent in order i.e., prevent a DL_DISABMULTI_REQ and DL_ENABMULTI_REQ
13082 13077 * for the same group to race.
13083 13078 * We send DLPI control messages in order using ill_lock.
13084 13079 * For IPMP we should be called on the cast_ill.
13085 13080 */
13086 13081 void
13087 13082 ill_dlpi_queue(ill_t *ill, mblk_t *mp)
13088 13083 {
13089 13084 mblk_t **mpp;
13090 13085
13091 13086 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
13092 13087
13093 13088 mutex_enter(&ill->ill_lock);
13094 13089 /* Must queue message. Tail insertion */
13095 13090 mpp = &ill->ill_dlpi_deferred;
13096 13091 while (*mpp != NULL)
13097 13092 mpp = &((*mpp)->b_next);
13098 13093
13099 13094 *mpp = mp;
13100 13095 mutex_exit(&ill->ill_lock);
13101 13096 }
13102 13097
13103 13098 /*
13104 13099 * Send the messages that were queued. Make sure there is only
13105 13100 * one outstanding message. ip_rput_dlpi_writer calls ill_dlpi_done()
13106 13101 * when an ACK or a NAK is received to process the next queued message.
13107 13102 * For IPMP we are called on the upper ill, but when send what is queued
13108 13103 * on the cast_ill.
13109 13104 */
void
ill_dlpi_send_queued(ill_t *ill)
{
	mblk_t *mp;
	union DL_primitives *dlp;
	t_uscalar_t prim;
	ill_t *release_ill = NULL;

	if (IS_IPMP(ill)) {
		/* On the upper IPMP ill. */
		release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
		if (release_ill == NULL) {
			/* Avoid ever sending anything down to the ipmpstub */
			return;
		}
		ill = release_ill;
	}
	mutex_enter(&ill->ill_lock);
	while ((mp = ill->ill_dlpi_deferred) != NULL) {
		if (ill->ill_dlpi_pending != DL_PRIM_INVAL) {
			/* Can't send. Somebody else will send it */
			mutex_exit(&ill->ill_lock);
			goto done;
		}
		ill->ill_dlpi_deferred = mp->b_next;
		mp->b_next = NULL;
		if (!ill->ill_dl_up) {
			/*
			 * Nobody there. All multicast addresses will be
			 * re-joined when we get the DL_BIND_ACK bringing the
			 * interface up.
			 */
			freemsg(mp);
			continue;
		}
		dlp = (union DL_primitives *)mp->b_rptr;
		prim = dlp->dl_primitive;

		/*
		 * As in ill_dlpi_dispatch(): once condemned, only the
		 * DL_UNBIND_REQ ACK is still delivered, so only then do we
		 * mark a primitive pending.
		 */
		if (!(ill->ill_state_flags & ILL_CONDEMNED) ||
		    (prim == DL_UNBIND_REQ)) {
			ill->ill_dlpi_pending = prim;
		}
		mutex_exit(&ill->ill_lock);

		DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_send_queued",
		    char *, dl_primstr(prim), ill_t *, ill);
		/* Drop the lock around putnext(), then rescan the queue */
		putnext(ill->ill_wq, mp);
		mutex_enter(&ill->ill_lock);
	}
	mutex_exit(&ill->ill_lock);
done:
	if (release_ill != NULL)
		ill_refrele(release_ill);
}
13164 13159
13165 13160 /*
13166 13161 * Queue an IP (IGMP/MLD) message to be sent by IP from
13167 13162 * ill_mcast_send_queued
13168 13163 * We queue them while holding a lock (ill_mcast_lock) to ensure that they
13169 13164 * are sent in order i.e., prevent a IGMP leave and IGMP join for the same
13170 13165 * group to race.
13171 13166 * We send them in order using ill_lock.
13172 13167 * For IPMP we are called on the upper ill, but we queue on the cast_ill.
13173 13168 */
void
ill_mcast_queue(ill_t *ill, mblk_t *mp)
{
	mblk_t **mpp;
	ill_t *release_ill = NULL;

	/* Caller serializes IGMP/MLD ordering via ill_mcast_lock */
	ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));

	if (IS_IPMP(ill)) {
		/* On the upper IPMP ill. */
		release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
		if (release_ill == NULL) {
			/* Discard instead of queuing for the ipmp interface */
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards - no cast_ill",
			    mp, ill);
			freemsg(mp);
			return;
		}
		ill = release_ill;
	}

	mutex_enter(&ill->ill_lock);
	/* Must queue message. Tail insertion */
	mpp = &ill->ill_mcast_deferred;
	while (*mpp != NULL)
		mpp = &((*mpp)->b_next);

	*mpp = mp;
	mutex_exit(&ill->ill_lock);
	if (release_ill != NULL)
		ill_refrele(release_ill);
}
13207 13202
13208 13203 /*
13209 13204 * Send the IP packets that were queued by ill_mcast_queue.
13210 13205 * These are IGMP/MLD packets.
13211 13206 *
13212 13207 * For IPMP we are called on the upper ill, but when send what is queued
13213 13208 * on the cast_ill.
13214 13209 *
13215 13210 * Request loopback of the report if we are acting as a multicast
13216 13211 * router, so that the process-level routing demon can hear it.
13217 13212 * This will run multiple times for the same group if there are members
13218 13213 * on the same group for multiple ipif's on the same ill. The
13219 13214 * igmp_input/mld_input code will suppress this due to the loopback thus we
13220 13215 * always loopback membership report.
13221 13216 *
13222 13217 * We also need to make sure that this does not get load balanced
13223 13218 * by IPMP. We do this by passing an ill to ip_output_simple.
13224 13219 */
13225 13220 void
13226 13221 ill_mcast_send_queued(ill_t *ill)
13227 13222 {
13228 13223 mblk_t *mp;
13229 13224 ip_xmit_attr_t ixas;
13230 13225 ill_t *release_ill = NULL;
13231 13226
13232 13227 if (IS_IPMP(ill)) {
13233 13228 /* On the upper IPMP ill. */
13234 13229 release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
13235 13230 if (release_ill == NULL) {
13236 13231 /*
13237 13232 * We should have no messages on the ipmp interface
13238 13233 * but no point in trying to send them.
13239 13234 */
13240 13235 return;
13241 13236 }
13242 13237 ill = release_ill;
13243 13238 }
13244 13239 bzero(&ixas, sizeof (ixas));
13245 13240 ixas.ixa_zoneid = ALL_ZONES;
13246 13241 ixas.ixa_cred = kcred;
13247 13242 ixas.ixa_cpid = NOPID;
13248 13243 ixas.ixa_tsl = NULL;
13249 13244 /*
13250 13245 * Here we set ixa_ifindex. If IPMP it will be the lower ill which
13251 13246 * makes ip_select_route pick the IRE_MULTICAST for the cast_ill.
13252 13247 * That is necessary to handle IGMP/MLD snooping switches.
13253 13248 */
13254 13249 ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
13255 13250 ixas.ixa_ipst = ill->ill_ipst;
13256 13251
13257 13252 mutex_enter(&ill->ill_lock);
13258 13253 while ((mp = ill->ill_mcast_deferred) != NULL) {
13259 13254 ill->ill_mcast_deferred = mp->b_next;
13260 13255 mp->b_next = NULL;
13261 13256 if (!ill->ill_dl_up) {
13262 13257 /*
13263 13258 * Nobody there. Just drop the ip packets.
13264 13259 * IGMP/MLD will resend later, if this is a replumb.
13265 13260 */
13266 13261 freemsg(mp);
13267 13262 continue;
13268 13263 }
13269 13264 mutex_enter(&ill->ill_phyint->phyint_lock);
13270 13265 if (IS_UNDER_IPMP(ill) && !ipmp_ill_is_active(ill)) {
13271 13266 /*
13272 13267 * When the ill is getting deactivated, we only want to
13273 13268 * send the DLPI messages, so drop IGMP/MLD packets.
13274 13269 * DLPI messages are handled by ill_dlpi_send_queued()
13275 13270 */
13276 13271 mutex_exit(&ill->ill_phyint->phyint_lock);
13277 13272 freemsg(mp);
13278 13273 continue;
13279 13274 }
13280 13275 mutex_exit(&ill->ill_phyint->phyint_lock);
13281 13276 mutex_exit(&ill->ill_lock);
13282 13277
13283 13278 /* Check whether we are sending IPv4 or IPv6. */
13284 13279 if (ill->ill_isv6) {
13285 13280 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
13286 13281
13287 13282 ixas.ixa_multicast_ttl = ip6h->ip6_hops;
13288 13283 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6;
13289 13284 } else {
13290 13285 ipha_t *ipha = (ipha_t *)mp->b_rptr;
13291 13286
13292 13287 ixas.ixa_multicast_ttl = ipha->ipha_ttl;
13293 13288 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
13294 13289 ixas.ixa_flags &= ~IXAF_SET_ULP_CKSUM;
13295 13290 }
13296 13291 ixas.ixa_flags &= ~IXAF_VERIFY_SOURCE;
13297 13292 ixas.ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_SOURCE;
13298 13293 (void) ip_output_simple(mp, &ixas);
13299 13294 ixa_cleanup(&ixas);
13300 13295
13301 13296 mutex_enter(&ill->ill_lock);
13302 13297 }
13303 13298 mutex_exit(&ill->ill_lock);
13304 13299
13305 13300 done:
13306 13301 if (release_ill != NULL)
13307 13302 ill_refrele(release_ill);
13308 13303 }
13309 13304
13310 13305 /*
13311 13306 * Take down a specific interface, but don't lose any information about it.
13312 13307 * (Always called as writer.)
13313 13308 * This function goes through the down sequence even if the interface is
13314 13309 * already down. There are 2 reasons.
13315 13310 * a. Currently we permit interface routes that depend on down interfaces
13316 13311 * to be added. This behaviour itself is questionable. However it appears
13317 13312 * that both Solaris and 4.3 BSD have exhibited this behaviour for a long
13318 13313 * time. We go thru the cleanup in order to remove these routes.
13319 13314 * b. The bringup of the interface could fail in ill_dl_up i.e. we get
13320 13315 * DL_ERROR_ACK in response to the DL_BIND request. The interface is
13321 13316 * down, but we need to cleanup i.e. do ill_dl_down and
13322 13317 * ip_rput_dlpi_writer (DL_ERROR_ACK) -> ipif_down.
13323 13318 *
13324 13319 * IP-MT notes:
13325 13320 *
13326 13321 * Model of reference to interfaces.
13327 13322 *
13328 13323 * The following members in ipif_t track references to the ipif.
13329 13324 * int ipif_refcnt; Active reference count
13330 13325 *
13331 13326 * The following members in ill_t track references to the ill.
13332 13327 * int ill_refcnt; active refcnt
13333 13328 * uint_t ill_ire_cnt; Number of ires referencing ill
13334 13329 * uint_t ill_ncec_cnt; Number of ncecs referencing ill
13335 13330 * uint_t ill_nce_cnt; Number of nces referencing ill
13336 13331 * uint_t ill_ilm_cnt; Number of ilms referencing ill
13337 13332 *
13338 13333 * Reference to an ipif or ill can be obtained in any of the following ways.
13339 13334 *
13340 13335 * Through the lookup functions ipif_lookup_* / ill_lookup_* functions
13341 13336 * Pointers to ipif / ill from other data structures viz ire and conn.
13342 13337 * Implicit reference to the ipif / ill by holding a reference to the ire.
13343 13338 *
13344 13339 * The ipif/ill lookup functions return a reference held ipif / ill.
13345 13340 * ipif_refcnt and ill_refcnt track the reference counts respectively.
13346 13341 * This is a purely dynamic reference count associated with threads holding
13347 13342 * references to the ipif / ill. Pointers from other structures do not
13348 13343 * count towards this reference count.
13349 13344 *
13350 13345 * ill_ire_cnt is the number of ire's associated with the
13351 13346 * ill. This is incremented whenever a new ire is created referencing the
13352 13347 * ill. This is done atomically inside ire_add_v[46] where the ire is
13353 13348 * actually added to the ire hash table. The count is decremented in
13354 13349 * ire_inactive where the ire is destroyed.
13355 13350 *
13356 13351 * ill_ncec_cnt is the number of ncec's referencing the ill thru ncec_ill.
13357 13352 * This is incremented atomically in
13358 13353 * ndp_add_v4()/ndp_add_v6() where the nce is actually added to the
13359 13354 * table. Similarly it is decremented in ncec_inactive() where the ncec
13360 13355 * is destroyed.
13361 13356 *
13362 13357 * ill_nce_cnt is the number of nce's referencing the ill thru nce_ill. This is
13363 13358 * incremented atomically in nce_add() where the nce is actually added to the
13364 13359 * ill_nce. Similarly it is decremented in nce_inactive() where the nce
13365 13360 * is destroyed.
13366 13361 *
13367 13362 * ill_ilm_cnt is the ilm's reference to the ill. It is incremented in
13368 13363 * ilm_add() and decremented before the ilm is freed in ilm_delete().
13369 13364 *
13370 13365 * Flow of ioctls involving interface down/up
13371 13366 *
13372 13367 * The following is the sequence of an attempt to set some critical flags on an
13373 13368 * up interface.
13374 13369 * ip_sioctl_flags
13375 13370 * ipif_down
13376 13371 * wait for ipif to be quiescent
13377 13372 * ipif_down_tail
13378 13373 * ip_sioctl_flags_tail
13379 13374 *
13380 13375 * All set ioctls that involve down/up sequence would have a skeleton similar
13381 13376 * to the above. All the *tail functions are called after the refcounts have
13382 13377 * dropped to the appropriate values.
13383 13378 *
13384 13379 * SIOC ioctls during the IPIF_CHANGING interval.
13385 13380 *
13386 13381 * Threads handling SIOC set ioctls serialize on the squeue, but this
13387 13382 * is not done for SIOC get ioctls. Since a set ioctl can cause several
13388 13383 * steps of internal changes to the state, some of which are visible in
13389 13384 * ipif_flags (such as IFF_UP being cleared and later set), and we want
13390 13385 * the set ioctl to be atomic related to the get ioctls, the SIOC get code
13391 13386 * will wait and restart ioctls if IPIF_CHANGING is set. The mblk is then
13392 13387 * enqueued in the ipsq and the operation is restarted by ipsq_exit() when
13393 13388 * the current exclusive operation completes. The IPIF_CHANGING check
13394 13389 * and enqueue is atomic using the ill_lock and ipsq_lock. The
13395 13390 * lookup is done holding the ill_lock. Hence the ill/ipif state flags can't
13396 13391 * change while the ill_lock is held. Before dropping the ill_lock we acquire
13397 13392 * the ipsq_lock and call ipsq_enq. This ensures that ipsq_exit can't finish
13398 13393 * until we release the ipsq_lock, even though the ill/ipif state flags
13399 13394 * can change after we drop the ill_lock.
13400 13395 */
/*
 * Take the ipif down. Returns 0 when the down has fully completed,
 * EINPROGRESS when `mp' has been enqueued waiting for quiescence, or
 * EINTR when the conn is closing. See the block comment above for the
 * overall down/up model.
 */
int
ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
{
	ill_t		*ill = ipif->ipif_ill;
	conn_t		*connp;
	boolean_t	success;
	boolean_t	ipif_was_up = B_FALSE;
	ip_stack_t	*ipst = ill->ill_ipst;

	ASSERT(IAM_WRITER_IPIF(ipif));

	ip1dbg(("ipif_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));

	DTRACE_PROBE3(ipif__downup, char *, "ipif_down",
	    ill_t *, ill, ipif_t *, ipif);

	if (ipif->ipif_flags & IPIF_UP) {
		/* Clear IPIF_UP and drop the ill's up count under ill_lock. */
		mutex_enter(&ill->ill_lock);
		ipif->ipif_flags &= ~IPIF_UP;
		ASSERT(ill->ill_ipif_up_count > 0);
		--ill->ill_ipif_up_count;
		mutex_exit(&ill->ill_lock);
		ipif_was_up = B_TRUE;
		/* Update status in SCTP's list */
		sctp_update_ipif(ipif, SCTP_IPIF_DOWN);
		ill_nic_event_dispatch(ipif->ipif_ill,
		    MAP_IPIF_ID(ipif->ipif_id), NE_LIF_DOWN, NULL, 0);
	}

	/*
	 * Removal of the last ipif from an ill may result in a DL_UNBIND
	 * being sent to the driver, and we must not send any data packets to
	 * the driver after the DL_UNBIND_REQ. To ensure this, all the
	 * ire and nce entries used in the data path will be cleaned
	 * up, and we also set the ILL_DOWN_IN_PROGRESS bit to make
	 * sure no new entries will be added until the ill is bound
	 * again. The ILL_DOWN_IN_PROGRESS bit is turned off upon
	 * receipt of a DL_BIND_ACK.
	 */
	if (ill->ill_wq != NULL && !ill->ill_logical_down &&
	    ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 &&
	    ill->ill_dl_up) {
		ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS;
	}

	/*
	 * Blow away memberships we established in ipif_multicast_up().
	 */
	ipif_multicast_down(ipif);

	/*
	 * Remove from the mapping for __sin6_src_id. We insert only
	 * when the address is not INADDR_ANY. As IPv4 addresses are
	 * stored as mapped addresses, we need to check for mapped
	 * INADDR_ANY also.
	 */
	if (ipif_was_up && !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
	    !IN6_IS_ADDR_V4MAPPED_ANY(&ipif->ipif_v6lcl_addr) &&
	    !(ipif->ipif_flags & IPIF_NOLOCAL)) {
		int err;

		err = ip_srcid_remove(&ipif->ipif_v6lcl_addr,
		    ipif->ipif_zoneid, ipst);
		if (err != 0) {
			/* Best-effort: log and continue with the down. */
			ip0dbg(("ipif_down: srcid_remove %d\n", err));
		}
	}

	if (ipif_was_up) {
		/* only delete if we'd added ire's before */
		if (ipif->ipif_isv6)
			ipif_delete_ires_v6(ipif);
		else
			ipif_delete_ires_v4(ipif);
	}

	if (ipif_was_up && ill->ill_ipif_up_count == 0) {
		/*
		 * Since the interface is now down, it may have just become
		 * inactive. Note that this needs to be done even for a
		 * ill_logical_down(), or ARP entries will not get correctly
		 * restored when the interface comes back up.
		 */
		if (IS_UNDER_IPMP(ill))
			ipmp_ill_refresh_active(ill);
	}

	/*
	 * Clean up the neighbor-discovery or arp entries for this interface.
	 * The ipif has to be quiesced, so we walk all the nce's and delete
	 * those that point at the ipif->ipif_ill. At the same time, we also
	 * update IPMP so that ipifs for data addresses are unbound. We dont
	 * call ipif_arp_down to DL_UNBIND the arp stream itself here, but
	 * defer that for ipif_down_tail()
	 */
	ipif_nce_down(ipif);

	/*
	 * If this is the last ipif on the ill, we also need to remove
	 * any IREs with ire_ill set. Otherwise ipif_is_quiescent() will
	 * never succeed.
	 */
	if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0)
		ire_walk_ill(0, 0, ill_downi, ill, ill);

	/*
	 * Walk all CONNs that can have a reference on an ire for this
	 * ipif (we actually walk all that now have stale references).
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst);

	/*
	 * If mp is NULL the caller will wait for the appropriate refcnt.
	 * Eg. ip_sioctl_removeif -> ipif_free -> ipif_down
	 * and ill_delete -> ipif_free -> ipif_down
	 */
	if (mp == NULL) {
		ASSERT(q == NULL);
		return (0);
	}

	if (CONN_Q(q)) {
		connp = Q_TO_CONN(q);
		mutex_enter(&connp->conn_lock);
	} else {
		connp = NULL;
	}
	mutex_enter(&ill->ill_lock);
	/*
	 * Are there any ire's pointing to this ipif that are still active ?
	 * If this is the last ipif going down, are there any ire's pointing
	 * to this ill that are still active ?
	 */
	if (ipif_is_quiescent(ipif)) {
		mutex_exit(&ill->ill_lock);
		if (connp != NULL)
			mutex_exit(&connp->conn_lock);
		return (0);
	}

	ip1dbg(("ipif_down: need to wait, adding pending mp %s ill %p",
	    ill->ill_name, (void *)ill));
	/*
	 * Enqueue the mp atomically in ipsq_pending_mp. When the refcount
	 * drops down, the operation will be restarted by ipif_ill_refrele_tail
	 * which in turn is called by the last refrele on the ipif/ill/ire.
	 */
	success = ipsq_pending_mp_add(connp, ipif, q, mp, IPIF_DOWN);
	if (!success) {
		/* The conn is closing. So just return */
		ASSERT(connp != NULL);
		mutex_exit(&ill->ill_lock);
		mutex_exit(&connp->conn_lock);
		return (EINTR);
	}

	mutex_exit(&ill->ill_lock);
	if (connp != NULL)
		mutex_exit(&connp->conn_lock);
	return (EINPROGRESS);
}
13562 13557
13563 13558 int
13564 13559 ipif_down_tail(ipif_t *ipif)
13565 13560 {
13566 13561 ill_t *ill = ipif->ipif_ill;
13567 13562 int err = 0;
13568 13563
13569 13564 DTRACE_PROBE3(ipif__downup, char *, "ipif_down_tail",
13570 13565 ill_t *, ill, ipif_t *, ipif);
13571 13566
13572 13567 /*
13573 13568 * Skip any loopback interface (null wq).
13574 13569 * If this is the last logical interface on the ill
13575 13570 * have ill_dl_down tell the driver we are gone (unbind)
13576 13571 * Note that lun 0 can ipif_down even though
13577 13572 * there are other logical units that are up.
13578 13573 * This occurs e.g. when we change a "significant" IFF_ flag.
13579 13574 */
13580 13575 if (ill->ill_wq != NULL && !ill->ill_logical_down &&
13581 13576 ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 &&
13582 13577 ill->ill_dl_up) {
13583 13578 ill_dl_down(ill);
13584 13579 }
13585 13580 if (!ipif->ipif_isv6)
13586 13581 err = ipif_arp_down(ipif);
13587 13582
13588 13583 ill->ill_logical_down = 0;
13589 13584
13590 13585 ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
13591 13586 ip_rts_newaddrmsg(RTM_DELETE, 0, ipif, RTSQ_DEFAULT);
13592 13587 return (err);
13593 13588 }
13594 13589
13595 13590 /*
13596 13591 * Bring interface logically down without bringing the physical interface
13597 13592 * down e.g. when the netmask is changed. This avoids long lasting link
13598 13593 * negotiations between an ethernet interface and a certain switches.
13599 13594 */
13600 13595 static int
13601 13596 ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
13602 13597 {
13603 13598 DTRACE_PROBE3(ipif__downup, char *, "ipif_logical_down",
13604 13599 ill_t *, ipif->ipif_ill, ipif_t *, ipif);
13605 13600
13606 13601 /*
13607 13602 * The ill_logical_down flag is a transient flag. It is set here
13608 13603 * and is cleared once the down has completed in ipif_down_tail.
13609 13604 * This flag does not indicate whether the ill stream is in the
13610 13605 * DL_BOUND state with the driver. Instead this flag is used by
13611 13606 * ipif_down_tail to determine whether to DL_UNBIND the stream with
13612 13607 * the driver. The state of the ill stream i.e. whether it is
13613 13608 * DL_BOUND with the driver or not is indicated by the ill_dl_up flag.
13614 13609 */
13615 13610 ipif->ipif_ill->ill_logical_down = 1;
13616 13611 return (ipif_down(ipif, q, mp));
13617 13612 }
13618 13613
13619 13614 /*
13620 13615 * Initiate deallocate of an IPIF. Always called as writer. Called by
13621 13616 * ill_delete or ip_sioctl_removeif.
13622 13617 */
13623 13618 static void
13624 13619 ipif_free(ipif_t *ipif)
13625 13620 {
13626 13621 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
13627 13622
13628 13623 ASSERT(IAM_WRITER_IPIF(ipif));
13629 13624
13630 13625 if (ipif->ipif_recovery_id != 0)
13631 13626 (void) untimeout(ipif->ipif_recovery_id);
13632 13627 ipif->ipif_recovery_id = 0;
13633 13628
13634 13629 /*
13635 13630 * Take down the interface. We can be called either from ill_delete
13636 13631 * or from ip_sioctl_removeif.
13637 13632 */
13638 13633 (void) ipif_down(ipif, NULL, NULL);
13639 13634
13640 13635 /*
13641 13636 * Now that the interface is down, there's no chance it can still
13642 13637 * become a duplicate. Cancel any timer that may have been set while
13643 13638 * tearing down.
13644 13639 */
13645 13640 if (ipif->ipif_recovery_id != 0)
13646 13641 (void) untimeout(ipif->ipif_recovery_id);
13647 13642 ipif->ipif_recovery_id = 0;
13648 13643
13649 13644 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
13650 13645 /* Remove pointers to this ill in the multicast routing tables */
13651 13646 reset_mrt_vif_ipif(ipif);
13652 13647 /* If necessary, clear the cached source ipif rotor. */
13653 13648 if (ipif->ipif_ill->ill_src_ipif == ipif)
13654 13649 ipif->ipif_ill->ill_src_ipif = NULL;
13655 13650 rw_exit(&ipst->ips_ill_g_lock);
13656 13651 }
13657 13652
/*
 * Final stage of ipif deallocation: unlink the (now quiesced) ipif from
 * its ill and free the memory. The ipif must not be looked up again
 * after this returns.
 */
static void
ipif_free_tail(ipif_t *ipif)
{
	ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;

	/*
	 * Need to hold both ill_g_lock and ill_lock while
	 * inserting or removing an ipif from the linked list
	 * of ipifs hanging off the ill.
	 */
	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);

#ifdef DEBUG
	/* Release any outstanding reference-trace records. */
	ipif_trace_cleanup(ipif);
#endif

	/* Ask SCTP to take it out of its list */
	sctp_update_ipif(ipif, SCTP_IPIF_REMOVE);
	ip_rts_newaddrmsg(RTM_FREEADDR, 0, ipif, RTSQ_DEFAULT);

	/* Get it out of the ILL interface list. */
	ipif_remove(ipif);
	rw_exit(&ipst->ips_ill_g_lock);

	/* By now the ipif must be fully quiesced and torn down. */
	ASSERT(!(ipif->ipif_flags & (IPIF_UP | IPIF_DUPLICATE)));
	ASSERT(ipif->ipif_recovery_id == 0);
	ASSERT(ipif->ipif_ire_local == NULL);
	ASSERT(ipif->ipif_ire_if == NULL);

	/* Free the memory. */
	mi_free(ipif);
}
13690 13685
13691 13686 /*
13692 13687 * Sets `buf' to an ipif name of the form "ill_name:id", or "ill_name" if "id"
13693 13688 * is zero.
13694 13689 */
13695 13690 void
13696 13691 ipif_get_name(const ipif_t *ipif, char *buf, int len)
13697 13692 {
13698 13693 char lbuf[LIFNAMSIZ];
13699 13694 char *name;
13700 13695 size_t name_len;
13701 13696
13702 13697 buf[0] = '\0';
13703 13698 name = ipif->ipif_ill->ill_name;
13704 13699 name_len = ipif->ipif_ill->ill_name_length;
13705 13700 if (ipif->ipif_id != 0) {
13706 13701 (void) sprintf(lbuf, "%s%c%d", name, IPIF_SEPARATOR_CHAR,
13707 13702 ipif->ipif_id);
13708 13703 name = lbuf;
13709 13704 name_len = mi_strlen(name) + 1;
13710 13705 }
13711 13706 len -= 1;
13712 13707 buf[len] = '\0';
13713 13708 len = MIN(len, name_len);
13714 13709 bcopy(name, buf, len);
13715 13710 }
13716 13711
13717 13712 /*
13718 13713 * Sets `buf' to an ill name.
13719 13714 */
13720 13715 void
13721 13716 ill_get_name(const ill_t *ill, char *buf, int len)
13722 13717 {
13723 13718 char *name;
13724 13719 size_t name_len;
13725 13720
13726 13721 name = ill->ill_name;
13727 13722 name_len = ill->ill_name_length;
13728 13723 len -= 1;
13729 13724 buf[len] = '\0';
13730 13725 len = MIN(len, name_len);
13731 13726 bcopy(name, buf, len);
13732 13727 }
13733 13728
13734 13729 /*
13735 13730 * Find an IPIF based on the name passed in. Names can be of the form <phys>
13736 13731 * (e.g., le0) or <phys>:<#> (e.g., le0:1). When there is no colon, the
13737 13732 * implied unit id is zero. <phys> must correspond to the name of an ILL.
13738 13733 * (May be called as writer.)
13739 13734 */
static ipif_t *
ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc,
    boolean_t *exists, boolean_t isv6, zoneid_t zoneid, ip_stack_t *ipst)
{
	char	*cp;
	char	*endp;
	long	id;
	ill_t	*ill;
	ipif_t	*ipif;
	uint_t	ire_type;
	boolean_t did_alloc = B_FALSE;
	char	last;

	/*
	 * If the caller wants us to create the ipif, make sure we have a
	 * valid zoneid
	 */
	ASSERT(!do_alloc || zoneid != ALL_ZONES);

	if (namelen == 0) {
		return (NULL);
	}

	*exists = B_FALSE;
	/* Look for a colon in the name. */
	endp = &name[namelen];
	for (cp = endp; --cp > name; ) {
		if (*cp == IPIF_SEPARATOR_CHAR)
			break;
	}

	if (*cp == IPIF_SEPARATOR_CHAR) {
		/*
		 * Reject any non-decimal aliases for logical
		 * interfaces. Aliases with leading zeroes
		 * are also rejected as they introduce ambiguity
		 * in the naming of the interfaces.
		 * In order to conform to existing semantics,
		 * and to not break any programs/script relying
		 * on that behaviour, if<0>:0 is considered to be
		 * a valid interface.
		 *
		 * If alias has two or more digits and the first
		 * is zero, fail.
		 */
		if (&cp[2] < endp && cp[1] == '0') {
			return (NULL);
		}
	}

	/* No separator found: treat the whole string as the ill name. */
	if (cp <= name) {
		cp = endp;
	}
	/*
	 * Temporarily NUL-terminate the ill-name portion in place; the
	 * original byte is restored right after the lookup.
	 */
	last = *cp;
	*cp = '\0';

	/*
	 * Look up the ILL, based on the portion of the name
	 * before the slash. ill_lookup_on_name returns a held ill.
	 * Temporary to check whether ill exists already. If so
	 * ill_lookup_on_name will clear it.
	 */
	ill = ill_lookup_on_name(name, do_alloc, isv6,
	    &did_alloc, ipst);
	*cp = last;
	if (ill == NULL)
		return (NULL);

	/* Establish the unit number in the name. */
	id = 0;
	/*
	 * NOTE(review): this reads name[namelen]; assumes the caller's
	 * buffer is NUL-terminated at namelen -- confirm at call sites.
	 */
	if (cp < endp && *endp == '\0') {
		/* If there was a colon, the unit number follows. */
		cp++;
		if (ddi_strtol(cp, NULL, 0, &id) != 0) {
			ill_refrele(ill);
			return (NULL);
		}
	}

	mutex_enter(&ill->ill_lock);
	/* Now see if there is an IPIF with this unit number. */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if (ipif->ipif_id == id) {
			/* Enforce zone visibility before returning a match. */
			if (zoneid != ALL_ZONES &&
			    zoneid != ipif->ipif_zoneid &&
			    ipif->ipif_zoneid != ALL_ZONES) {
				mutex_exit(&ill->ill_lock);
				ill_refrele(ill);
				return (NULL);
			}
			if (IPIF_CAN_LOOKUP(ipif)) {
				/* Return the ipif held, ill released. */
				ipif_refhold_locked(ipif);
				mutex_exit(&ill->ill_lock);
				if (!did_alloc)
					*exists = B_TRUE;
				/*
				 * Drop locks before calling ill_refrele
				 * since it can potentially call into
				 * ipif_ill_refrele_tail which can end up
				 * in trying to acquire any lock.
				 */
				ill_refrele(ill);
				return (ipif);
			}
		}
	}

	if (!do_alloc) {
		mutex_exit(&ill->ill_lock);
		ill_refrele(ill);
		return (NULL);
	}

	/*
	 * If none found, atomically allocate and return a new one.
	 * Historically, we used IRE_LOOPBACK only for lun 0, and IRE_LOCAL
	 * to support "receive only" use of lo0:1 etc. as is still done
	 * below as an initial guess.
	 * However, this is now likely to be overriden later in ipif_up_done()
	 * when we know for sure what address has been configured on the
	 * interface, since we might have more than one loopback interface
	 * with a loopback address, e.g. in the case of zones, and all the
	 * interfaces with loopback addresses need to be marked IRE_LOOPBACK.
	 */
	if (ill->ill_net_type == IRE_LOOPBACK && id == 0)
		ire_type = IRE_LOOPBACK;
	else
		ire_type = IRE_LOCAL;
	ipif = ipif_allocate(ill, id, ire_type, B_TRUE, B_TRUE, NULL);
	if (ipif != NULL)
		ipif_refhold_locked(ipif);
	mutex_exit(&ill->ill_lock);
	ill_refrele(ill);
	return (ipif);
}
13875 13870
13876 13871 /*
13877 13872 * Variant of the above that queues the request on the ipsq when
13878 13873 * IPIF_CHANGING is set.
13879 13874 */
static ipif_t *
ipif_lookup_on_name_async(char *name, size_t namelen, boolean_t isv6,
    zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func, int *error,
    ip_stack_t *ipst)
{
	char	*cp;
	char	*endp;
	long	id;
	ill_t	*ill;
	ipif_t	*ipif;
	boolean_t did_alloc = B_FALSE;
	ipsq_t	*ipsq;

	/* `error' is optional; 0 means success when set. */
	if (error != NULL)
		*error = 0;

	if (namelen == 0) {
		if (error != NULL)
			*error = ENXIO;
		return (NULL);
	}

	/* Look for a colon in the name. */
	endp = &name[namelen];
	for (cp = endp; --cp > name; ) {
		if (*cp == IPIF_SEPARATOR_CHAR)
			break;
	}

	if (*cp == IPIF_SEPARATOR_CHAR) {
		/*
		 * Reject any non-decimal aliases for logical
		 * interfaces. Aliases with leading zeroes
		 * are also rejected as they introduce ambiguity
		 * in the naming of the interfaces.
		 * In order to conform to existing semantics,
		 * and to not break any programs/script relying
		 * on that behaviour, if<0>:0 is considered to be
		 * a valid interface.
		 *
		 * If alias has two or more digits and the first
		 * is zero, fail.
		 */
		if (&cp[2] < endp && cp[1] == '0') {
			if (error != NULL)
				*error = EINVAL;
			return (NULL);
		}
	}

	/*
	 * No separator: look up the whole string; otherwise temporarily
	 * NUL-terminate the ill-name portion (restored below).
	 */
	if (cp <= name) {
		cp = endp;
	} else {
		*cp = '\0';
	}

	/*
	 * Look up the ILL, based on the portion of the name
	 * before the slash. ill_lookup_on_name returns a held ill.
	 * Temporary to check whether ill exists already. If so
	 * ill_lookup_on_name will clear it.
	 */
	ill = ill_lookup_on_name(name, B_FALSE, isv6, &did_alloc, ipst);
	if (cp != endp)
		*cp = IPIF_SEPARATOR_CHAR;
	if (ill == NULL)
		return (NULL);

	/* Establish the unit number in the name. */
	id = 0;
	/*
	 * NOTE(review): this reads name[namelen]; assumes the caller's
	 * buffer is NUL-terminated at namelen -- confirm at call sites.
	 */
	if (cp < endp && *endp == '\0') {
		/* If there was a colon, the unit number follows. */
		cp++;
		if (ddi_strtol(cp, NULL, 0, &id) != 0) {
			ill_refrele(ill);
			if (error != NULL)
				*error = ENXIO;
			return (NULL);
		}
	}

	/* Hold the conn lock (if q is a conn queue) across the check/enq. */
	GRAB_CONN_LOCK(q);
	mutex_enter(&ill->ill_lock);
	/* Now see if there is an IPIF with this unit number. */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if (ipif->ipif_id == id) {
			/* Enforce zone visibility before returning a match. */
			if (zoneid != ALL_ZONES &&
			    zoneid != ipif->ipif_zoneid &&
			    ipif->ipif_zoneid != ALL_ZONES) {
				mutex_exit(&ill->ill_lock);
				RELEASE_CONN_LOCK(q);
				ill_refrele(ill);
				if (error != NULL)
					*error = ENXIO;
				return (NULL);
			}

			/* Usable now, or we are the exclusive writer. */
			if (!(IPIF_IS_CHANGING(ipif) ||
			    IPIF_IS_CONDEMNED(ipif)) ||
			    IAM_WRITER_IPIF(ipif)) {
				ipif_refhold_locked(ipif);
				mutex_exit(&ill->ill_lock);
				/*
				 * Drop locks before calling ill_refrele
				 * since it can potentially call into
				 * ipif_ill_refrele_tail which can end up
				 * in trying to acquire any lock.
				 */
				RELEASE_CONN_LOCK(q);
				ill_refrele(ill);
				return (ipif);
			} else if (q != NULL && !IPIF_IS_CONDEMNED(ipif)) {
				/*
				 * IPIF_CHANGING: park the request on the
				 * ipsq; ipsq_exit() will restart it when the
				 * current exclusive operation completes.
				 */
				ipsq = ill->ill_phyint->phyint_ipsq;
				mutex_enter(&ipsq->ipsq_lock);
				mutex_enter(&ipsq->ipsq_xop->ipx_lock);
				mutex_exit(&ill->ill_lock);
				ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
				mutex_exit(&ipsq->ipsq_xop->ipx_lock);
				mutex_exit(&ipsq->ipsq_lock);
				RELEASE_CONN_LOCK(q);
				ill_refrele(ill);
				if (error != NULL)
					*error = EINPROGRESS;
				return (NULL);
			}
		}
	}
	RELEASE_CONN_LOCK(q);
	mutex_exit(&ill->ill_lock);
	ill_refrele(ill);
	if (error != NULL)
		*error = ENXIO;
	return (NULL);
}
14014 14009
14015 14010 /*
14016 14011 * This routine is called whenever a new address comes up on an ipif. If
14017 14012 * we are configured to respond to address mask requests, then we are supposed
14018 14013 * to broadcast an address mask reply at this time. This routine is also
14019 14014 * called if we are already up, but a netmask change is made. This is legal
14020 14015 * but might not make the system manager very popular. (May be called
14021 14016 * as writer.)
14022 14017 */
void
ipif_mask_reply(ipif_t *ipif)
{
	icmph_t	*icmph;
	ipha_t	*ipha;
	mblk_t	*mp;
	ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
	ip_xmit_attr_t ixas;

/* Total on-wire size: IP header + ICMP header + the 4-byte mask payload. */
#define	REPLY_LEN	(sizeof (icmp_ipha) + sizeof (icmph_t) + IP_ADDR_LEN)

	/* Honor the stack-wide tunable before doing any work. */
	if (!ipst->ips_ip_respond_to_address_mask_broadcast)
		return;

	/* ICMP mask reply is IPv4 only */
	ASSERT(!ipif->ipif_isv6);
	/* ICMP mask reply is not for a loopback interface */
	ASSERT(ipif->ipif_ill->ill_wq != NULL);

	if (ipif->ipif_lcl_addr == INADDR_ANY)
		return;

	/* Allocation failure is silently tolerated; the reply is optional. */
	mp = allocb(REPLY_LEN, BPRI_HI);
	if (mp == NULL)
		return;
	mp->b_wptr = mp->b_rptr + REPLY_LEN;

	/* Build the IPv4 header from the icmp_ipha template. */
	ipha = (ipha_t *)mp->b_rptr;
	bzero(ipha, REPLY_LEN);
	*ipha = icmp_ipha;
	ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl;
	ipha->ipha_src = ipif->ipif_lcl_addr;
	ipha->ipha_dst = ipif->ipif_brd_addr;
	ipha->ipha_length = htons(REPLY_LEN);
	ipha->ipha_ident = 0;

	/* ICMP header follows the IP header; the mask follows the ICMP hdr. */
	icmph = (icmph_t *)&ipha[1];
	icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
	bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN);
	/* Checksum must be computed after all ICMP fields are filled in. */
	icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0);

	bzero(&ixas, sizeof (ixas));
	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
	ixas.ixa_zoneid = ALL_ZONES;
	ixas.ixa_ifindex = 0;
	ixas.ixa_ipst = ipst;
	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
	(void) ip_output_simple(mp, &ixas);
	ixa_cleanup(&ixas);
#undef	REPLY_LEN
}
14074 14069
/*
 * Join the ipif specific multicast groups.
 * Must be called after a mapping has been set up in the resolver. (Always
 * called as writer.)
 *
 * For IPv6 this joins all-hosts and the solicited-node group for the local
 * address; for IPv4 it joins the all-hosts (224.0.0.1) group. The resulting
 * ilms are stashed on the ipif so ipif_multicast_down() can undo them.
 */
void
ipif_multicast_up(ipif_t *ipif)
{
	int err;
	ill_t *ill;
	ilm_t *ilm;

	ASSERT(IAM_WRITER_IPIF(ipif));

	ill = ipif->ipif_ill;

	ip1dbg(("ipif_multicast_up\n"));
	/*
	 * Nothing to do if the link doesn't support multicast, or if we
	 * have already joined (ipif_allhosts_ilm is our join marker).
	 */
	if (!(ill->ill_flags & ILLF_MULTICAST) ||
	    ipif->ipif_allhosts_ilm != NULL)
		return;

	if (ipif->ipif_isv6) {
		in6_addr_t v6allmc = ipv6_all_hosts_mcast;
		in6_addr_t v6solmc = ipv6_solicited_node_mcast;

		/*
		 * Form the solicited-node group by folding the low 32 bits
		 * of the local address into the well-known prefix.
		 */
		v6solmc.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3];

		if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr))
			return;

		ip1dbg(("ipif_multicast_up - addmulti\n"));

		/*
		 * Join the all hosts multicast address. We skip this for
		 * underlying IPMP interfaces since they should be invisible.
		 */
		if (!IS_UNDER_IPMP(ill)) {
			ilm = ip_addmulti(&v6allmc, ill, ipif->ipif_zoneid,
			    &err);
			if (ilm == NULL) {
				ASSERT(err != 0);
				ip0dbg(("ipif_multicast_up: "
				    "all_hosts_mcast failed %d\n", err));
				return;
			}
			ipif->ipif_allhosts_ilm = ilm;
		}

		/*
		 * Enable multicast for the solicited node multicast address.
		 * If IPMP we need to put the membership on the upper ill.
		 */
		if (!(ipif->ipif_flags & IPIF_NOLOCAL)) {
			ill_t *mcast_ill = NULL;
			boolean_t need_refrele;

			if (IS_UNDER_IPMP(ill) &&
			    (mcast_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) {
				need_refrele = B_TRUE;
			} else {
				mcast_ill = ill;
				need_refrele = B_FALSE;
			}

			ilm = ip_addmulti(&v6solmc, mcast_ill,
			    ipif->ipif_zoneid, &err);
			if (need_refrele)
				ill_refrele(mcast_ill);

			if (ilm == NULL) {
				ASSERT(err != 0);
				ip0dbg(("ipif_multicast_up: solicited MC"
				    " failed %d\n", err));
				/*
				 * Unwind the earlier all-hosts join so a
				 * partial failure leaves no memberships.
				 */
				if ((ilm = ipif->ipif_allhosts_ilm) != NULL) {
					ipif->ipif_allhosts_ilm = NULL;
					(void) ip_delmulti(ilm);
				}
				return;
			}
			ipif->ipif_solmulti_ilm = ilm;
		}
	} else {
		in6_addr_t v6group;

		/*
		 * Skip if no local address yet, or for underlying IPMP
		 * interfaces (which should stay invisible).
		 */
		if (ipif->ipif_lcl_addr == INADDR_ANY || IS_UNDER_IPMP(ill))
			return;

		/* Join the all hosts multicast address */
		ip1dbg(("ipif_multicast_up - addmulti\n"));
		IN6_IPADDR_TO_V4MAPPED(htonl(INADDR_ALLHOSTS_GROUP), &v6group);

		ilm = ip_addmulti(&v6group, ill, ipif->ipif_zoneid, &err);
		if (ilm == NULL) {
			ASSERT(err != 0);
			ip0dbg(("ipif_multicast_up: failed %d\n", err));
			return;
		}
		ipif->ipif_allhosts_ilm = ilm;
	}
}
14175 14170
14176 14171 /*
14177 14172 * Blow away any multicast groups that we joined in ipif_multicast_up().
14178 14173 * (ilms from explicit memberships are handled in conn_update_ill.)
14179 14174 */
14180 14175 void
14181 14176 ipif_multicast_down(ipif_t *ipif)
14182 14177 {
14183 14178 ASSERT(IAM_WRITER_IPIF(ipif));
14184 14179
14185 14180 ip1dbg(("ipif_multicast_down\n"));
14186 14181
14187 14182 if (ipif->ipif_allhosts_ilm != NULL) {
14188 14183 (void) ip_delmulti(ipif->ipif_allhosts_ilm);
14189 14184 ipif->ipif_allhosts_ilm = NULL;
14190 14185 }
14191 14186 if (ipif->ipif_solmulti_ilm != NULL) {
14192 14187 (void) ip_delmulti(ipif->ipif_solmulti_ilm);
14193 14188 ipif->ipif_solmulti_ilm = NULL;
14194 14189 }
14195 14190 }
14196 14191
/*
 * Used when an interface comes up to recreate any extra routes on this
 * interface.
 *
 * Walks the ifrt_t entries saved on ill_saved_ire_mp (held under
 * ill_saved_ire_lock for the duration) and re-adds each as an IRE.
 * Returns 0, or ENOMEM if an IRE could not be created; in the ENOMEM
 * case earlier entries that were already re-added are left in place.
 */
int
ill_recover_saved_ire(ill_t *ill)
{
	mblk_t *mp;
	ip_stack_t *ipst = ill->ill_ipst;

	ip1dbg(("ill_recover_saved_ire(%s)", ill->ill_name));

	mutex_enter(&ill->ill_saved_ire_lock);
	for (mp = ill->ill_saved_ire_mp; mp != NULL; mp = mp->b_cont) {
		ire_t *ire, *nire;
		ifrt_t *ifrt;

		/* Each mblk in the chain holds one saved route record. */
		ifrt = (ifrt_t *)mp->b_rptr;
		/*
		 * Create a copy of the IRE with the saved address and netmask.
		 */
		if (ill->ill_isv6) {
			ire = ire_create_v6(
			    &ifrt->ifrt_v6addr,
			    &ifrt->ifrt_v6mask,
			    &ifrt->ifrt_v6gateway_addr,
			    ifrt->ifrt_type,
			    ill,
			    ifrt->ifrt_zoneid,
			    ifrt->ifrt_flags,
			    NULL,
			    ipst);
		} else {
			ire = ire_create(
			    (uint8_t *)&ifrt->ifrt_addr,
			    (uint8_t *)&ifrt->ifrt_mask,
			    (uint8_t *)&ifrt->ifrt_gateway_addr,
			    ifrt->ifrt_type,
			    ill,
			    ifrt->ifrt_zoneid,
			    ifrt->ifrt_flags,
			    NULL,
			    ipst);
		}
		if (ire == NULL) {
			mutex_exit(&ill->ill_saved_ire_lock);
			return (ENOMEM);
		}

		/* Restore any saved RTF_SETSRC source address override. */
		if (ifrt->ifrt_flags & RTF_SETSRC) {
			if (ill->ill_isv6) {
				ire->ire_setsrc_addr_v6 =
				    ifrt->ifrt_v6setsrc_addr;
			} else {
				ire->ire_setsrc_addr = ifrt->ifrt_setsrc_addr;
			}
		}

		/*
		 * Some software (for example, GateD and Sun Cluster) attempts
		 * to create (what amount to) IRE_PREFIX routes with the
		 * loopback address as the gateway. This is primarily done to
		 * set up prefixes with the RTF_REJECT flag set (for example,
		 * when generating aggregate routes.)
		 *
		 * If the IRE type (as defined by ill->ill_net_type) is
		 * IRE_LOOPBACK, then we map the request into a
		 * IRE_IF_NORESOLVER.
		 */
		if (ill->ill_net_type == IRE_LOOPBACK)
			ire->ire_type = IRE_IF_NORESOLVER;

		/*
		 * ire held by ire_add, will be refreled' towards the
		 * the end of ipif_up_done
		 */
		nire = ire_add(ire);
		/*
		 * Check if it was a duplicate entry. This handles
		 * the case of two racing route adds for the same route
		 */
		if (nire == NULL) {
			ip1dbg(("ill_recover_saved_ire: FAILED\n"));
		} else if (nire != ire) {
			ip1dbg(("ill_recover_saved_ire: duplicate ire %p\n",
			    (void *)nire));
			ire_delete(nire);
		} else {
			ip1dbg(("ill_recover_saved_ire: added ire %p\n",
			    (void *)nire));
		}
		if (nire != NULL)
			ire_refrele(nire);
	}
	mutex_exit(&ill->ill_saved_ire_lock);
	return (0);
}
14294 14289
/*
 * Used to set the netmask and broadcast address to default values when the
 * interface is brought up. (Always called as writer.)
 *
 * IPv4: default netmask is the classful natural mask of the local address;
 * IPv6: default netmask is all-ones. The subnet is then derived from the
 * local address and mask (or from the point-to-point destination), and for
 * broadcast-capable IPv4 interfaces the broadcast address is recomputed.
 */
static void
ipif_set_default(ipif_t *ipif)
{
	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));

	if (!ipif->ipif_isv6) {
		/*
		 * Interface holds an IPv4 address. Default
		 * mask is the natural netmask.
		 */
		if (!ipif->ipif_net_mask) {
			ipaddr_t v4mask;

			v4mask = ip_net_mask(ipif->ipif_lcl_addr);
			V4MASK_TO_V6(v4mask, ipif->ipif_v6net_mask);
		}
		if (ipif->ipif_flags & IPIF_POINTOPOINT) {
			/* ipif_subnet is ipif_pp_dst_addr for pt-pt */
			ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
		} else {
			/* Subnet = local address masked by the netmask. */
			V6_MASK_COPY(ipif->ipif_v6lcl_addr,
			    ipif->ipif_v6net_mask, ipif->ipif_v6subnet);
		}
		/*
		 * NOTE: SunOS 4.X does this even if the broadcast address
		 * has been already set thus we do the same here.
		 */
		if (ipif->ipif_flags & IPIF_BROADCAST) {
			ipaddr_t v4addr;

			/* Broadcast = subnet with all host bits set. */
			v4addr = ipif->ipif_subnet | ~ipif->ipif_net_mask;
			IN6_IPADDR_TO_V4MAPPED(v4addr, &ipif->ipif_v6brd_addr);
		}
	} else {
		/*
		 * Interface holds an IPv6-only address. Default
		 * mask is all-ones.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask))
			ipif->ipif_v6net_mask = ipv6_all_ones;
		if (ipif->ipif_flags & IPIF_POINTOPOINT) {
			/* ipif_subnet is ipif_pp_dst_addr for pt-pt */
			ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
		} else {
			/* Subnet = local address masked by the netmask. */
			V6_MASK_COPY(ipif->ipif_v6lcl_addr,
			    ipif->ipif_v6net_mask, ipif->ipif_v6subnet);
		}
	}
}
14348 14343
/*
 * Return 0 if this address can be used as local address without causing
 * duplicate address problems. Otherwise, return EADDRNOTAVAIL if the address
 * is already up on a different ill, and EADDRINUSE if it's up on the same ill.
 * Note that the same IPv6 link-local address is allowed as long as the ills
 * are not on the same link.
 *
 * As a side effect, point-to-point duplicates cause IPIF_UNNUMBERED to be
 * set on one of the two ipifs rather than being treated as a conflict.
 */
int
ip_addr_availability_check(ipif_t *new_ipif)
{
	in6_addr_t our_v6addr;
	ill_t *ill;
	ipif_t *ipif;
	ill_walk_context_t ctx;
	ip_stack_t *ipst = new_ipif->ipif_ill->ill_ipst;

	ASSERT(IAM_WRITER_IPIF(new_ipif));
	ASSERT(MUTEX_HELD(&ipst->ips_ip_addr_avail_lock));
	ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock));

	/* Start from a clean slate; we may re-mark UNNUMBERED below. */
	new_ipif->ipif_flags &= ~IPIF_UNNUMBERED;
	/* The unspecified address can never conflict. */
	if (IN6_IS_ADDR_UNSPECIFIED(&new_ipif->ipif_v6lcl_addr) ||
	    IN6_IS_ADDR_V4MAPPED_ANY(&new_ipif->ipif_v6lcl_addr))
		return (0);

	our_v6addr = new_ipif->ipif_v6lcl_addr;

	/* Walk every ill of the matching address family. */
	if (new_ipif->ipif_isv6)
		ill = ILL_START_WALK_V6(&ctx, ipst);
	else
		ill = ILL_START_WALK_V4(&ctx, ipst);

	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			/*
			 * Ignore ourselves, ipifs that are down or already
			 * unnumbered, and ipifs with a different address.
			 */
			if ((ipif == new_ipif) ||
			    !(ipif->ipif_flags & IPIF_UP) ||
			    (ipif->ipif_flags & IPIF_UNNUMBERED) ||
			    !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
			    &our_v6addr))
				continue;

			/*
			 * A duplicate involving a point-to-point ipif is
			 * resolved by marking one side unnumbered.  A v6
			 * link/site-local duplicate is fine on a different
			 * link, and loopback duplicates are fine across
			 * distinct zones.  Anything else is a real conflict.
			 */
			if (new_ipif->ipif_flags & IPIF_POINTOPOINT)
				new_ipif->ipif_flags |= IPIF_UNNUMBERED;
			else if (ipif->ipif_flags & IPIF_POINTOPOINT)
				ipif->ipif_flags |= IPIF_UNNUMBERED;
			else if ((IN6_IS_ADDR_LINKLOCAL(&our_v6addr) ||
			    IN6_IS_ADDR_SITELOCAL(&our_v6addr)) &&
			    !IS_ON_SAME_LAN(ill, new_ipif->ipif_ill))
				continue;
			else if (new_ipif->ipif_zoneid != ipif->ipif_zoneid &&
			    ipif->ipif_zoneid != ALL_ZONES && IS_LOOPBACK(ill))
				continue;
			else if (new_ipif->ipif_ill == ill)
				return (EADDRINUSE);
			else
				return (EADDRNOTAVAIL);
		}
	}

	return (0);
}
14411 14406
/*
 * Bring up an ipif: bring up arp/ndp, bring up the DLPI stream, and add
 * IREs for the ipif.
 * When the routine returns EINPROGRESS then mp has been consumed and
 * the ioctl will be acked from ip_rput_dlpi.
 */
int
ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp)
{
	ill_t *ill = ipif->ipif_ill;
	boolean_t isv6 = ipif->ipif_isv6;
	int err = 0;
	boolean_t success;
	uint_t ipif_orig_id;
	ip_stack_t *ipst = ill->ill_ipst;

	ASSERT(IAM_WRITER_IPIF(ipif));

	ip1dbg(("ipif_up(%s:%u)\n", ill->ill_name, ipif->ipif_id));
	DTRACE_PROBE3(ipif__downup, char *, "ipif_up",
	    ill_t *, ill, ipif_t *, ipif);

	/* Shouldn't get here if it is already up. */
	if (ipif->ipif_flags & IPIF_UP)
		return (EALREADY);

	/*
	 * If this is a request to bring up a data address on an interface
	 * under IPMP, then move the address to its IPMP meta-interface and
	 * try to bring it up. One complication is that the zeroth ipif for
	 * an ill is special, in that every ill always has one, and that code
	 * throughout IP dereferences ill->ill_ipif without holding any locks.
	 */
	if (IS_UNDER_IPMP(ill) && ipmp_ipif_is_dataaddr(ipif) &&
	    (!ipif->ipif_isv6 || !V6_IPIF_LINKLOCAL(ipif))) {
		ipif_t *stubipif = NULL, *moveipif = NULL;
		ill_t *ipmp_ill = ipmp_illgrp_ipmp_ill(ill->ill_grp);

		/*
		 * The ipif being brought up should be quiesced. If it's not,
		 * something has gone amiss and we need to bail out. (If it's
		 * quiesced, we know it will remain so via IPIF_CONDEMNED.)
		 */
		mutex_enter(&ill->ill_lock);
		if (!ipif_is_quiescent(ipif)) {
			mutex_exit(&ill->ill_lock);
			return (EINVAL);
		}
		mutex_exit(&ill->ill_lock);

		/*
		 * If we're going to need to allocate ipifs, do it prior
		 * to starting the move (and grabbing locks).
		 */
		if (ipif->ipif_id == 0) {
			if ((moveipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE,
			    B_FALSE, &err)) == NULL) {
				return (err);
			}
			if ((stubipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE,
			    B_FALSE, &err)) == NULL) {
				mi_free(moveipif);
				return (err);
			}
		}

		/*
		 * Grab or transfer the ipif to move. During the move, keep
		 * ill_g_lock held to prevent any ill walker threads from
		 * seeing things in an inconsistent state.
		 */
		rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
		if (ipif->ipif_id != 0) {
			ipif_remove(ipif);
		} else {
			/*
			 * Can't remove the zeroth ipif; swap its contents
			 * into moveipif and leave stubipif in its place.
			 */
			ipif_transfer(ipif, moveipif, stubipif);
			ipif = moveipif;
		}

		/*
		 * Place the ipif on the IPMP ill. If the zeroth ipif on
		 * the IPMP ill is a stub (0.0.0.0 down address) then we
		 * replace that one. Otherwise, pick the next available slot.
		 */
		ipif->ipif_ill = ipmp_ill;
		ipif_orig_id = ipif->ipif_id;

		if (ipmp_ipif_is_stubaddr(ipmp_ill->ill_ipif)) {
			ipif_transfer(ipif, ipmp_ill->ill_ipif, NULL);
			ipif = ipmp_ill->ill_ipif;
		} else {
			/* -1 asks ipif_insert() to pick the next free id. */
			ipif->ipif_id = -1;
			if ((err = ipif_insert(ipif, B_FALSE)) != 0) {
				/*
				 * No more available ipif_id's -- put it back
				 * on the original ill and fail the operation.
				 * Since we're writer on the ill, we can be
				 * sure our old slot is still available.
				 */
				ipif->ipif_id = ipif_orig_id;
				ipif->ipif_ill = ill;
				if (ipif_orig_id == 0) {
					ipif_transfer(ipif, ill->ill_ipif,
					    NULL);
				} else {
					VERIFY(ipif_insert(ipif, B_FALSE) == 0);
				}
				rw_exit(&ipst->ips_ill_g_lock);
				return (err);
			}
		}
		rw_exit(&ipst->ips_ill_g_lock);

		/*
		 * Tell SCTP that the ipif has moved. Note that even if we
		 * had to allocate a new ipif, the original sequence id was
		 * preserved and therefore SCTP won't know.
		 */
		sctp_move_ipif(ipif, ill, ipmp_ill);

		/*
		 * If the ipif being brought up was on slot zero, then we
		 * first need to bring up the placeholder we stuck there. In
		 * ip_rput_dlpi_writer(), arp_bringup_done(), or the recursive
		 * call to ipif_up() itself, if we successfully bring up the
		 * placeholder, we'll check ill_move_ipif and bring it up too.
		 */
		if (ipif_orig_id == 0) {
			ASSERT(ill->ill_move_ipif == NULL);
			ill->ill_move_ipif = ipif;
			if ((err = ipif_up(ill->ill_ipif, q, mp)) == 0)
				ASSERT(ill->ill_move_ipif == NULL);
			if (err != EINPROGRESS)
				ill->ill_move_ipif = NULL;
			return (err);
		}

		/*
		 * Bring it up on the IPMP ill.
		 */
		return (ipif_up(ipif, q, mp));
	}

	/* Skip arp/ndp for any loopback interface. */
	if (ill->ill_wq != NULL) {
		conn_t *connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL;
		ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;

		if (!ill->ill_dl_up) {
			/*
			 * ill_dl_up is not yet set. i.e. we are yet to
			 * DL_BIND with the driver and this is the first
			 * logical interface on the ill to become "up".
			 * Tell the driver to get going (via DL_BIND_REQ).
			 * Note that changing "significant" IFF_ flags
			 * address/netmask etc cause a down/up dance, but
			 * does not cause an unbind (DL_UNBIND) with the driver
			 */
			if ((err = ill_dl_up(ill, ipif)) != 0) {
				return (err);
			}
		}

		/* Reject bringing up interfaces with unusable IP addresses */
		if (!ill_ipcheck_addr(ill, &ipif->ipif_v6lcl_addr)) {
			return (EPERM);
		}

		/*
		 * ipif_resolver_up may end up needing to bind/attach
		 * the ARP stream, which in turn necessitates a
		 * DLPI message exchange with the driver. ioctls are
		 * serialized and so we cannot send more than one
		 * interface up message at a time. If ipif_resolver_up
		 * does need to wait for the DLPI handshake for the ARP stream,
		 * we get EINPROGRESS and we will complete in arp_bringup_done.
		 */

		ASSERT(connp != NULL || !CONN_Q(q));
		if (connp != NULL)
			mutex_enter(&connp->conn_lock);
		mutex_enter(&ill->ill_lock);
		/* Park mp on the ipsq so the async completion can ack it. */
		success = ipsq_pending_mp_add(connp, ipif, q, mp, 0);
		mutex_exit(&ill->ill_lock);
		if (connp != NULL)
			mutex_exit(&connp->conn_lock);
		if (!success)
			return (EINTR);

		/*
		 * Crank up IPv6 neighbor discovery. Unlike ARP, this should
		 * complete when ipif_ndp_up returns.
		 */
		err = ipif_resolver_up(ipif, Res_act_initial);
		if (err == EINPROGRESS) {
			/* We will complete it in arp_bringup_done() */
			return (err);
		}

		if (isv6 && err == 0)
			err = ipif_ndp_up(ipif, B_TRUE);

		/* Resolver finished synchronously; reclaim the pending mp. */
		ASSERT(err != EINPROGRESS);
		mp = ipsq_pending_mp_get(ipsq, &connp);
		ASSERT(mp != NULL);
		if (err != 0)
			return (err);
	} else {
		/*
		 * Interfaces without underlying hardware don't do duplicate
		 * address detection.
		 */
		ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
		ipif->ipif_addr_ready = 1;
		err = ill_add_ires(ill);
		/* allocation failure? */
		if (err != 0)
			return (err);
	}

	err = (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif));
	if (err == 0 && ill->ill_move_ipif != NULL) {
		/*
		 * The placeholder came up; now bring up the data address
		 * that was parked on ill_move_ipif (see IPMP logic above).
		 */
		ipif = ill->ill_move_ipif;
		ill->ill_move_ipif = NULL;
		return (ipif_up(ipif, q, mp));
	}
	return (err);
}
14640 14635
14641 14636 /*
14642 14637 * Add any IREs tied to the ill. For now this is just an IRE_MULTICAST.
14643 14638 * The identical set of IREs need to be removed in ill_delete_ires().
14644 14639 */
14645 14640 int
14646 14641 ill_add_ires(ill_t *ill)
14647 14642 {
14648 14643 ire_t *ire;
14649 14644 in6_addr_t dummy6 = {(uint32_t)V6_MCAST, 0, 0, 1};
14650 14645 in_addr_t dummy4 = htonl(INADDR_ALLHOSTS_GROUP);
14651 14646
14652 14647 if (ill->ill_ire_multicast != NULL)
14653 14648 return (0);
14654 14649
14655 14650 /*
14656 14651 * provide some dummy ire_addr for creating the ire.
14657 14652 */
14658 14653 if (ill->ill_isv6) {
14659 14654 ire = ire_create_v6(&dummy6, 0, 0, IRE_MULTICAST, ill,
14660 14655 ALL_ZONES, RTF_UP, NULL, ill->ill_ipst);
14661 14656 } else {
14662 14657 ire = ire_create((uchar_t *)&dummy4, 0, 0, IRE_MULTICAST, ill,
14663 14658 ALL_ZONES, RTF_UP, NULL, ill->ill_ipst);
14664 14659 }
14665 14660 if (ire == NULL)
14666 14661 return (ENOMEM);
14667 14662
14668 14663 ill->ill_ire_multicast = ire;
14669 14664 return (0);
14670 14665 }
14671 14666
14672 14667 void
14673 14668 ill_delete_ires(ill_t *ill)
14674 14669 {
14675 14670 if (ill->ill_ire_multicast != NULL) {
14676 14671 /*
14677 14672 * BIND/ATTACH completed; Release the ref for ill_ire_multicast
14678 14673 * which was taken without any th_tracing enabled.
14679 14674 * We also mark it as condemned (note that it was never added)
14680 14675 * so that caching conn's can move off of it.
14681 14676 */
14682 14677 ire_make_condemned(ill->ill_ire_multicast);
14683 14678 ire_refrele_notr(ill->ill_ire_multicast);
14684 14679 ill->ill_ire_multicast = NULL;
14685 14680 }
14686 14681 }
14687 14682
/*
 * Perform a bind for the physical device.
 *
 * When the routine returns successfully then dlpi has been bound and
 * capabilities negotiated. An unbind message will have been allocated
 * for later use in ipif_down.
 *
 * Returns 0 on success, ENOMEM on allocation failure, ENXIO if the ill
 * was condemned while waiting, or the saved DLPI bind error.
 */
static int
ill_dl_up(ill_t *ill, ipif_t *ipif)
{
	mblk_t *bind_mp = NULL;
	mblk_t *unbind_mp = NULL;
	int err;

	DTRACE_PROBE2(ill__downup, char *, "ill_dl_up", ill_t *, ill);

	ip1dbg(("ill_dl_up(%s)\n", ill->ill_name));
	ASSERT(IAM_WRITER_ILL(ill));

	/*
	 * Make sure we have an IRE_MULTICAST in case we immediately
	 * start receiving packets.
	 */
	err = ill_add_ires(ill);
	if (err != 0)
		goto bad;

	bind_mp = ip_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long),
	    DL_BIND_REQ);
	if (bind_mp == NULL)
		goto bad;
	((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ill->ill_sap;
	((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS;

	/*
	 * ill_unbind_mp would be non-null if the following sequence had
	 * happened:
	 * - send DL_BIND_REQ to driver, wait for response
	 * - multiple ioctls that need to bring the ipif up are encountered,
	 * but they cannot enter the ipsq due to the outstanding DL_BIND_REQ.
	 * These ioctls will then be enqueued on the ipsq
	 * - a DL_ERROR_ACK is returned for the DL_BIND_REQ
	 * At this point, the pending ioctls in the ipsq will be drained, and
	 * since ill->ill_dl_up was not set, ill_dl_up would be invoked with
	 * a non-null ill->ill_unbind_mp
	 */
	if (ill->ill_unbind_mp == NULL) {
		unbind_mp = ip_dlpi_alloc(sizeof (dl_unbind_req_t),
		    DL_UNBIND_REQ);
		if (unbind_mp == NULL)
			goto bad;
	}

	/*
	 * Save the unbind message for ill_dl_down(); it will be consumed when
	 * the interface goes down.
	 */
	if (ill->ill_unbind_mp == NULL)
		ill->ill_unbind_mp = unbind_mp;

	/* ill_dlpi_send() consumes bind_mp; don't touch it afterwards. */
	ill_dlpi_send(ill, bind_mp);
	/* Send down link-layer capabilities probe if not already done. */
	ill_capability_probe(ill);
	/*
	 * Wait for DLPI to be bound and the capability probe to finish.
	 * The call drops-and-reacquires the squeue. If it couldn't because
	 * ILL_CONDEMNED got set, bail.
	 * (unbind_mp is not leaked here: it was already stashed on
	 * ill->ill_unbind_mp above.)
	 */
	if (!ill_capability_wait(ill))
		return (ENXIO);

	/* DLPI failed to bind. Return the saved error */
	if (!ill->ill_dl_up) {
		return (ill->ill_dl_bind_err);
	}

	/*
	 * Sysid used to rely on the fact that netboots set domainname
	 * and the like. Now that miniroot boots aren't strictly netboots
	 * and miniroot network configuration is driven from userland
	 * these things still need to be set. This situation can be detected
	 * by comparing the interface being configured here to the one
	 * dhcifname was set to reference by the boot loader. Once sysid is
	 * converted to use dhcp_ipc_getinfo() this call can go away.
	 */
	if ((ipif->ipif_flags & IPIF_DHCPRUNNING) &&
	    (strcmp(ill->ill_name, dhcifname) == 0) &&
	    (strlen(srpc_domain) == 0)) {
		if (dhcpinit() != 0)
			cmn_err(CE_WARN, "no cached dhcp response");
	}

	return (0);
bad:
	ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name));

	/* freemsg() handles NULL, so partial allocations are safe here. */
	freemsg(bind_mp);
	freemsg(unbind_mp);
	return (ENOMEM);
}
14788 14783
/* Loopback MTU plus room for an IP header and a 20-byte TCP header */
uint_t ip_loopback_mtuplus = IP_LOOPBACK_MTU + IP_SIMPLE_HDR_LENGTH + 20;
14791 14786
14792 14787 /*
14793 14788 * DLPI and ARP is up.
14794 14789 * Create all the IREs associated with an interface. Bring up multicast.
14795 14790 * Set the interface flag and finish other initialization
14796 14791 * that potentially had to be deferred to after DL_BIND_ACK.
14797 14792 */
14798 14793 int
14799 14794 ipif_up_done(ipif_t *ipif)
14800 14795 {
14801 14796 ill_t *ill = ipif->ipif_ill;
14802 14797 int err = 0;
14803 14798 boolean_t loopback = B_FALSE;
14804 14799 boolean_t update_src_selection = B_TRUE;
14805 14800 ipif_t *tmp_ipif;
14806 14801
14807 14802 ip1dbg(("ipif_up_done(%s:%u)\n",
14808 14803 ipif->ipif_ill->ill_name, ipif->ipif_id));
14809 14804 DTRACE_PROBE3(ipif__downup, char *, "ipif_up_done",
14810 14805 ill_t *, ill, ipif_t *, ipif);
14811 14806
14812 14807 /* Check if this is a loopback interface */
14813 14808 if (ipif->ipif_ill->ill_wq == NULL)
14814 14809 loopback = B_TRUE;
14815 14810
14816 14811 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
14817 14812
14818 14813 /*
14819 14814 * If all other interfaces for this ill are down or DEPRECATED,
14820 14815 * or otherwise unsuitable for source address selection,
14821 14816 * reset the src generation numbers to make sure source
14822 14817 * address selection gets to take this new ipif into account.
14823 14818 * No need to hold ill_lock while traversing the ipif list since
14824 14819 * we are writer
14825 14820 */
14826 14821 for (tmp_ipif = ill->ill_ipif; tmp_ipif;
14827 14822 tmp_ipif = tmp_ipif->ipif_next) {
14828 14823 if (((tmp_ipif->ipif_flags &
14829 14824 (IPIF_NOXMIT|IPIF_ANYCAST|IPIF_NOLOCAL|IPIF_DEPRECATED)) ||
14830 14825 !(tmp_ipif->ipif_flags & IPIF_UP)) ||
14831 14826 (tmp_ipif == ipif))
14832 14827 continue;
14833 14828 /* first useable pre-existing interface */
14834 14829 update_src_selection = B_FALSE;
14835 14830 break;
14836 14831 }
14837 14832 if (update_src_selection)
14838 14833 ip_update_source_selection(ill->ill_ipst);
14839 14834
14840 14835 if (IS_LOOPBACK(ill) || ill->ill_net_type == IRE_IF_NORESOLVER) {
14841 14836 nce_t *loop_nce = NULL;
14842 14837 uint16_t flags = (NCE_F_MYADDR | NCE_F_AUTHORITY | NCE_F_NONUD);
14843 14838
14844 14839 /*
14845 14840 * lo0:1 and subsequent ipifs were marked IRE_LOCAL in
14846 14841 * ipif_lookup_on_name(), but in the case of zones we can have
14847 14842 * several loopback addresses on lo0. So all the interfaces with
14848 14843 * loopback addresses need to be marked IRE_LOOPBACK.
14849 14844 */
14850 14845 if (V4_PART_OF_V6(ipif->ipif_v6lcl_addr) ==
14851 14846 htonl(INADDR_LOOPBACK))
14852 14847 ipif->ipif_ire_type = IRE_LOOPBACK;
14853 14848 else
14854 14849 ipif->ipif_ire_type = IRE_LOCAL;
14855 14850 if (ill->ill_net_type != IRE_LOOPBACK)
14856 14851 flags |= NCE_F_PUBLISH;
14857 14852
14858 14853 /* add unicast nce for the local addr */
14859 14854 err = nce_lookup_then_add_v4(ill, NULL,
14860 14855 ill->ill_phys_addr_length, &ipif->ipif_lcl_addr, flags,
14861 14856 ND_REACHABLE, &loop_nce);
14862 14857 /* A shared-IP zone sees EEXIST for lo0:N */
14863 14858 if (err == 0 || err == EEXIST) {
14864 14859 ipif->ipif_added_nce = 1;
14865 14860 loop_nce->nce_ipif_cnt++;
14866 14861 nce_refrele(loop_nce);
14867 14862 err = 0;
14868 14863 } else {
14869 14864 ASSERT(loop_nce == NULL);
14870 14865 return (err);
14871 14866 }
14872 14867 }
14873 14868
14874 14869 /* Create all the IREs associated with this interface */
14875 14870 err = ipif_add_ires_v4(ipif, loopback);
14876 14871 if (err != 0) {
14877 14872 /*
14878 14873 * see comments about return value from
14879 14874 * ip_addr_availability_check() in ipif_add_ires_v4().
14880 14875 */
14881 14876 if (err != EADDRINUSE) {
14882 14877 (void) ipif_arp_down(ipif);
14883 14878 } else {
14884 14879 /*
14885 14880 * Make IPMP aware of the deleted ipif so that
14886 14881 * the needed ipmp cleanup (e.g., of ipif_bound_ill)
14887 14882 * can be completed. Note that we do not want to
14888 14883 * destroy the nce that was created on the ipmp_ill
14889 14884 * for the active copy of the duplicate address in
14890 14885 * use.
14891 14886 */
14892 14887 if (IS_IPMP(ill))
14893 14888 ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
14894 14889 err = EADDRNOTAVAIL;
14895 14890 }
14896 14891 return (err);
14897 14892 }
14898 14893
14899 14894 if (ill->ill_ipif_up_count == 1 && !loopback) {
14900 14895 /* Recover any additional IREs entries for this ill */
14901 14896 (void) ill_recover_saved_ire(ill);
14902 14897 }
14903 14898
14904 14899 if (ill->ill_need_recover_multicast) {
14905 14900 /*
14906 14901 * Need to recover all multicast memberships in the driver.
14907 14902 * This had to be deferred until we had attached. The same
14908 14903 * code exists in ipif_up_done_v6() to recover IPv6
14909 14904 * memberships.
14910 14905 *
14911 14906 * Note that it would be preferable to unconditionally do the
14912 14907 * ill_recover_multicast() in ill_dl_up(), but we cannot do
14913 14908 * that since ill_join_allmulti() depends on ill_dl_up being
14914 14909 * set, and it is not set until we receive a DL_BIND_ACK after
14915 14910 * having called ill_dl_up().
14916 14911 */
14917 14912 ill_recover_multicast(ill);
14918 14913 }
14919 14914
14920 14915 if (ill->ill_ipif_up_count == 1) {
14921 14916 /*
14922 14917 * Since the interface is now up, it may now be active.
14923 14918 */
14924 14919 if (IS_UNDER_IPMP(ill))
14925 14920 ipmp_ill_refresh_active(ill);
14926 14921
14927 14922 /*
14928 14923 * If this is an IPMP interface, we may now be able to
14929 14924 * establish ARP entries.
14930 14925 */
14931 14926 if (IS_IPMP(ill))
14932 14927 ipmp_illgrp_refresh_arpent(ill->ill_grp);
14933 14928 }
14934 14929
14935 14930 /* Join the allhosts multicast address */
14936 14931 ipif_multicast_up(ipif);
14937 14932
14938 14933 if (!loopback && !update_src_selection &&
14939 14934 !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED)))
14940 14935 ip_update_source_selection(ill->ill_ipst);
14941 14936
14942 14937 if (!loopback && ipif->ipif_addr_ready) {
14943 14938 /* Broadcast an address mask reply. */
14944 14939 ipif_mask_reply(ipif);
14945 14940 }
14946 14941 /* Perhaps ilgs should use this ill */
14947 14942 update_conn_ill(NULL, ill->ill_ipst);
14948 14943
14949 14944 /*
14950 14945 * This had to be deferred until we had bound. Tell routing sockets and
14951 14946 * others that this interface is up if it looks like the address has
14952 14947 * been validated. Otherwise, if it isn't ready yet, wait for
14953 14948 * duplicate address detection to do its thing.
14954 14949 */
14955 14950 if (ipif->ipif_addr_ready)
14956 14951 ipif_up_notify(ipif);
14957 14952 return (0);
14958 14953 }
14959 14954
14960 14955 /*
14961 14956 * Add the IREs associated with the ipif.
14962 14957 * Those MUST be explicitly removed in ipif_delete_ires_v4.
14963 14958 */
static int
ipif_add_ires_v4(ipif_t *ipif, boolean_t loopback)
{
	ill_t		*ill = ipif->ipif_ill;
	ip_stack_t	*ipst = ill->ill_ipst;
	/* Scratch array for broadcast IREs; filled by ipif_create_bcast_ires */
	ire_t		*ire_array[20];
	ire_t		**irep = ire_array;
	ire_t		**irep1;
	ipaddr_t	net_mask = 0;
	ipaddr_t	subnet_mask, route_mask;
	int		err;
	ire_t		*ire_local = NULL;	/* LOCAL or LOOPBACK */
	ire_t		*ire_if = NULL;		/* IRE_IF_{NO,}RESOLVER */
	uchar_t		*gw;

	if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
	    !(ipif->ipif_flags & IPIF_NOLOCAL)) {
		/*
		 * If we're on a labeled system then make sure that zone-
		 * private addresses have proper remote host database entries.
		 */
		if (is_system_labeled() &&
		    ipif->ipif_ire_type != IRE_LOOPBACK &&
		    !tsol_check_interface_address(ipif))
			return (EINVAL);

		/* Register the source address for __sin6_src_id */
		err = ip_srcid_insert(&ipif->ipif_v6lcl_addr,
		    ipif->ipif_zoneid, ipst);
		if (err != 0) {
			ip0dbg(("ipif_add_ires: srcid_insert %d\n", err));
			return (err);
		}

		/* Loopback IREs carry the local address as the gateway. */
		if (loopback)
			gw = (uchar_t *)&ipif->ipif_lcl_addr;
		else
			gw = NULL;

		/* If the interface address is set, create the local IRE. */
		ire_local = ire_create(
		    (uchar_t *)&ipif->ipif_lcl_addr,	/* dest address */
		    (uchar_t *)&ip_g_all_ones,		/* mask */
		    gw,					/* gateway */
		    ipif->ipif_ire_type,		/* LOCAL or LOOPBACK */
		    ipif->ipif_ill,
		    ipif->ipif_zoneid,
		    ((ipif->ipif_flags & IPIF_PRIVATE) ?
		    RTF_PRIVATE : 0) | RTF_KERNEL,
		    NULL,
		    ipst);
		ip1dbg(("ipif_add_ires: 0x%p creating IRE %p type 0x%x"
		    " for 0x%x\n", (void *)ipif, (void *)ire_local,
		    ipif->ipif_ire_type,
		    ntohl(ipif->ipif_lcl_addr)));
		if (ire_local == NULL) {
			ip1dbg(("ipif_up_done: NULL ire_local\n"));
			err = ENOMEM;
			goto bad;
		}
	} else {
		ip1dbg((
		    "ipif_add_ires: not creating IRE %d for 0x%x: flags 0x%x\n",
		    ipif->ipif_ire_type,
		    ntohl(ipif->ipif_lcl_addr),
		    (uint_t)ipif->ipif_flags));
	}
	/*
	 * Derive the classful (natural) netmask from the local address when
	 * one is usable; otherwise fall back to a class A mask so we never
	 * hand ip_net_mask() an unusable 0.0.0.0/NOLOCAL address.
	 */
	if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
	    !(ipif->ipif_flags & IPIF_NOLOCAL)) {
		net_mask = ip_net_mask(ipif->ipif_lcl_addr);
	} else {
		net_mask = htonl(IN_CLASSA_NET); /* fallback */
	}

	subnet_mask = ipif->ipif_net_mask;

	/*
	 * If mask was not specified, use natural netmask of
	 * interface address. Also, store this mask back into the
	 * ipif struct.
	 */
	if (subnet_mask == 0) {
		subnet_mask = net_mask;
		V4MASK_TO_V6(subnet_mask, ipif->ipif_v6net_mask);
		V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
		    ipif->ipif_v6subnet);
	}

	/* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. */
	if (!loopback && !(ipif->ipif_flags & IPIF_NOXMIT) &&
	    ipif->ipif_subnet != INADDR_ANY) {
		/* ipif_subnet is ipif_pp_dst_addr for pt-pt */

		if (ipif->ipif_flags & IPIF_POINTOPOINT) {
			route_mask = IP_HOST_MASK;
		} else {
			route_mask = subnet_mask;
		}

		ip1dbg(("ipif_add_ires: ipif 0x%p ill 0x%p "
		    "creating if IRE ill_net_type 0x%x for 0x%x\n",
		    (void *)ipif, (void *)ill, ill->ill_net_type,
		    ntohl(ipif->ipif_subnet)));
		ire_if = ire_create(
		    (uchar_t *)&ipif->ipif_subnet,
		    (uchar_t *)&route_mask,
		    (uchar_t *)&ipif->ipif_lcl_addr,
		    ill->ill_net_type,
		    ill,
		    ipif->ipif_zoneid,
		    ((ipif->ipif_flags & IPIF_PRIVATE) ?
		    RTF_PRIVATE: 0) | RTF_KERNEL,
		    NULL,
		    ipst);
		if (ire_if == NULL) {
			ip1dbg(("ipif_up_done: NULL ire_if\n"));
			err = ENOMEM;
			goto bad;
		}
	}

	/*
	 * Create any necessary broadcast IREs.
	 */
	if ((ipif->ipif_flags & IPIF_BROADCAST) &&
	    !(ipif->ipif_flags & IPIF_NOXMIT))
		irep = ipif_create_bcast_ires(ipif, irep);

	/* If an earlier ire_create failed, get out now */
	for (irep1 = irep; irep1 > ire_array; ) {
		irep1--;
		if (*irep1 == NULL) {
			ip1dbg(("ipif_up_done: NULL ire found in ire_array\n"));
			err = ENOMEM;
			goto bad;
		}
	}

	/*
	 * Need to atomically check for IP address availability under
	 * ip_addr_avail_lock. ill_g_lock is held as reader to ensure no new
	 * ills or new ipifs can be added while we are checking availability.
	 */
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	mutex_enter(&ipst->ips_ip_addr_avail_lock);
	/* Mark it up, and increment counters. */
	ipif->ipif_flags |= IPIF_UP;
	ill->ill_ipif_up_count++;
	err = ip_addr_availability_check(ipif);
	mutex_exit(&ipst->ips_ip_addr_avail_lock);
	rw_exit(&ipst->ips_ill_g_lock);

	if (err != 0) {
		/*
		 * Our address may already be up on the same ill. In this case,
		 * the ARP entry for our ipif replaced the one for the other
		 * ipif. So we don't want to delete it (otherwise the other ipif
		 * would be unable to send packets).
		 * ip_addr_availability_check() identifies this case for us and
		 * returns EADDRINUSE; Caller should turn it into EADDRNOTAVAIL
		 * which is the expected error code.
		 */
		ill->ill_ipif_up_count--;
		ipif->ipif_flags &= ~IPIF_UP;
		goto bad;
	}

	/*
	 * Add in all newly created IREs. ire_create_bcast() has
	 * already checked for duplicates of the IRE_BROADCAST type.
	 * We add the IRE_INTERFACE before the IRE_LOCAL to ensure
	 * that lookups find the IRE_LOCAL even if the IRE_INTERFACE is
	 * a /32 route.
	 */
	if (ire_if != NULL) {
		ire_if = ire_add(ire_if);
		if (ire_if == NULL) {
			err = ENOMEM;
			goto bad2;
		}
#ifdef DEBUG
		/*
		 * Swap the tracked hold from ire_add() for an untracked one;
		 * the cached pointer is released with ire_refrele_notr() in
		 * ipif_delete_ires_v4() and in the bad path below.
		 */
		ire_refhold_notr(ire_if);
		ire_refrele(ire_if);
#endif
	}
	if (ire_local != NULL) {
		ire_local = ire_add(ire_local);
		if (ire_local == NULL) {
			err = ENOMEM;
			goto bad2;
		}
#ifdef DEBUG
		/* Same tracked -> untracked hold conversion as for ire_if. */
		ire_refhold_notr(ire_local);
		ire_refrele(ire_local);
#endif
	}
	/* Publish the cached IREs on the ipif under ill_g_lock. */
	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
	if (ire_local != NULL)
		ipif->ipif_ire_local = ire_local;
	if (ire_if != NULL)
		ipif->ipif_ire_if = ire_if;
	rw_exit(&ipst->ips_ill_g_lock);
	ire_local = NULL;
	ire_if = NULL;

	/*
	 * We first add all of them, and if that succeeds we refrele the
	 * bunch. That enables us to delete all of them should any of the
	 * ire_adds fail.
	 */
	for (irep1 = irep; irep1 > ire_array; ) {
		irep1--;
		ASSERT(!MUTEX_HELD(&((*irep1)->ire_ill->ill_lock)));
		*irep1 = ire_add(*irep1);
		if (*irep1 == NULL) {
			err = ENOMEM;
			goto bad2;
		}
	}

	for (irep1 = irep; irep1 > ire_array; ) {
		irep1--;
		/* refheld by ire_add. */
		if (*irep1 != NULL) {
			ire_refrele(*irep1);
			*irep1 = NULL;
		}
	}

	if (!loopback) {
		/*
		 * If the broadcast address has been set, make sure it makes
		 * sense based on the interface address.
		 * Only match on ill since we are sharing broadcast addresses.
		 */
		if ((ipif->ipif_brd_addr != INADDR_ANY) &&
		    (ipif->ipif_flags & IPIF_BROADCAST)) {
			ire_t	*ire;

			ire = ire_ftable_lookup_v4(ipif->ipif_brd_addr, 0, 0,
			    IRE_BROADCAST, ipif->ipif_ill, ALL_ZONES, NULL,
			    (MATCH_IRE_TYPE | MATCH_IRE_ILL), 0, ipst, NULL);

			if (ire == NULL) {
				/*
				 * If there isn't a matching broadcast IRE,
				 * revert to the default for this netmask.
				 */
				ipif->ipif_v6brd_addr = ipv6_all_zeros;
				mutex_enter(&ipif->ipif_ill->ill_lock);
				ipif_set_default(ipif);
				mutex_exit(&ipif->ipif_ill->ill_lock);
			} else {
				ire_refrele(ire);
			}
		}

	}
	return (0);

bad2:
	/* Undo the IPIF_UP marking done before ire_add started failing. */
	ill->ill_ipif_up_count--;
	ipif->ipif_flags &= ~IPIF_UP;

bad:
	ip1dbg(("ipif_add_ires: FAILED \n"));
	if (ire_local != NULL)
		ire_delete(ire_local);
	if (ire_if != NULL)
		ire_delete(ire_if);

	/* Detach and destroy any IREs already published on the ipif. */
	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
	ire_local = ipif->ipif_ire_local;
	ipif->ipif_ire_local = NULL;
	ire_if = ipif->ipif_ire_if;
	ipif->ipif_ire_if = NULL;
	rw_exit(&ipst->ips_ill_g_lock);
	if (ire_local != NULL) {
		ire_delete(ire_local);
		ire_refrele_notr(ire_local);
	}
	if (ire_if != NULL) {
		ire_delete(ire_if);
		ire_refrele_notr(ire_if);
	}

	while (irep > ire_array) {
		irep--;
		if (*irep != NULL) {
			ire_delete(*irep);
		}
	}
	(void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst);

	return (err);
}
15260 15255
15261 15256 /* Remove all the IREs created by ipif_add_ires_v4 */
void
ipif_delete_ires_v4(ipif_t *ipif)
{
	ill_t		*ill = ipif->ipif_ill;
	ip_stack_t	*ipst = ill->ill_ipst;
	ire_t		*ire;

	/*
	 * Detach the cached IRE_LOCAL/IRE_LOOPBACK. The pointer is cleared
	 * under ill_g_lock (writer) to serialize with the store done in
	 * ipif_add_ires_v4().
	 */
	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
	ire = ipif->ipif_ire_local;
	ipif->ipif_ire_local = NULL;
	rw_exit(&ipst->ips_ill_g_lock);
	if (ire != NULL) {
		/*
		 * Move count to ipif so we don't lose the count due to
		 * a down/up dance.
		 */
		atomic_add_32(&ipif->ipif_ib_pkt_count, ire->ire_ib_pkt_count);

		ire_delete(ire);
		ire_refrele_notr(ire);	/* drop the untracked cached hold */
	}
	/* Likewise detach and destroy the cached interface (IRE_IF_*) IRE. */
	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
	ire = ipif->ipif_ire_if;
	ipif->ipif_ire_if = NULL;
	rw_exit(&ipst->ips_ill_g_lock);
	if (ire != NULL) {
		ire_delete(ire);
		ire_refrele_notr(ire);
	}

	/*
	 * Delete the broadcast IREs.
	 */
	if ((ipif->ipif_flags & IPIF_BROADCAST) &&
	    !(ipif->ipif_flags & IPIF_NOXMIT))
		ipif_delete_bcast_ires(ipif);
}
15299 15294
15300 15295 /*
 * Checks for availability of a usable source address (if there is one) when the
15302 15297 * destination ILL has the ill_usesrc_ifindex pointing to another ILL. Note
15303 15298 * this selection is done regardless of the destination.
15304 15299 */
15305 15300 boolean_t
15306 15301 ipif_zone_avail(uint_t ifindex, boolean_t isv6, zoneid_t zoneid,
15307 15302 ip_stack_t *ipst)
15308 15303 {
15309 15304 ipif_t *ipif = NULL;
15310 15305 ill_t *uill;
15311 15306
15312 15307 ASSERT(ifindex != 0);
15313 15308
15314 15309 uill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
15315 15310 if (uill == NULL)
15316 15311 return (B_FALSE);
15317 15312
15318 15313 mutex_enter(&uill->ill_lock);
15319 15314 for (ipif = uill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
15320 15315 if (IPIF_IS_CONDEMNED(ipif))
15321 15316 continue;
15322 15317 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
15323 15318 continue;
15324 15319 if (!(ipif->ipif_flags & IPIF_UP))
15325 15320 continue;
15326 15321 if (ipif->ipif_zoneid != zoneid)
15327 15322 continue;
15328 15323 if (isv6 ? IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) :
15329 15324 ipif->ipif_lcl_addr == INADDR_ANY)
15330 15325 continue;
15331 15326 mutex_exit(&uill->ill_lock);
15332 15327 ill_refrele(uill);
15333 15328 return (B_TRUE);
15334 15329 }
15335 15330 mutex_exit(&uill->ill_lock);
15336 15331 ill_refrele(uill);
15337 15332 return (B_FALSE);
15338 15333 }
15339 15334
15340 15335 /*
15341 15336 * Find an ipif with a good local address on the ill+zoneid.
15342 15337 */
15343 15338 ipif_t *
15344 15339 ipif_good_addr(ill_t *ill, zoneid_t zoneid)
15345 15340 {
15346 15341 ipif_t *ipif;
15347 15342
15348 15343 mutex_enter(&ill->ill_lock);
15349 15344 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
15350 15345 if (IPIF_IS_CONDEMNED(ipif))
15351 15346 continue;
15352 15347 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
15353 15348 continue;
15354 15349 if (!(ipif->ipif_flags & IPIF_UP))
15355 15350 continue;
15356 15351 if (ipif->ipif_zoneid != zoneid &&
15357 15352 ipif->ipif_zoneid != ALL_ZONES && zoneid != ALL_ZONES)
15358 15353 continue;
15359 15354 if (ill->ill_isv6 ?
15360 15355 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) :
15361 15356 ipif->ipif_lcl_addr == INADDR_ANY)
15362 15357 continue;
15363 15358 ipif_refhold_locked(ipif);
15364 15359 mutex_exit(&ill->ill_lock);
15365 15360 return (ipif);
15366 15361 }
15367 15362 mutex_exit(&ill->ill_lock);
15368 15363 return (NULL);
15369 15364 }
15370 15365
15371 15366 /*
15372 15367 * IP source address type, sorted from worst to best. For a given type,
15373 15368 * always prefer IP addresses on the same subnet. All-zones addresses are
15374 15369 * suboptimal because they pose problems with unlabeled destinations.
15375 15370 */
typedef enum {
	IPIF_NONE,			/* no candidate selected yet */
	IPIF_DIFFNET_DEPRECATED,	/* deprecated and different subnet */
	IPIF_SAMENET_DEPRECATED,	/* deprecated and same subnet */
	IPIF_DIFFNET_ALLZONES,		/* allzones and different subnet */
	IPIF_SAMENET_ALLZONES,		/* allzones and same subnet */
	IPIF_DIFFNET,			/* normal and different subnet */
	IPIF_SAMENET,			/* normal and same subnet */
	IPIF_LOCALADDR			/* local loopback */
} ipif_type_t;
15386 15381
15387 15382 /*
15388 15383 * Pick the optimal ipif on `ill' for sending to destination `dst' from zone
15389 15384 * `zoneid'. We rate usable ipifs from low -> high as per the ipif_type_t
15390 15385 * enumeration, and return the highest-rated ipif. If there's a tie, we pick
15391 15386 * the first one, unless IPMP is used in which case we round-robin among them;
15392 15387 * see below for more.
15393 15388 *
15394 15389 * Returns NULL if there is no suitable source address for the ill.
15395 15390 * This only occurs when there is no valid source address for the ill.
15396 15391 */
ipif_t *
ipif_select_source_v4(ill_t *ill, ipaddr_t dst, zoneid_t zoneid,
    boolean_t allow_usesrc, boolean_t *notreadyp)
{
	ill_t	*usill = NULL;
	ill_t	*ipmp_ill = NULL;
	ipif_t	*start_ipif, *next_ipif, *ipif, *best_ipif;
	ipif_type_t type, best_type;
	tsol_tpc_t *src_rhtp, *dst_rhtp;
	ip_stack_t *ipst = ill->ill_ipst;
	boolean_t samenet;

	/* If a usesrc ILL is configured (and allowed), select from it. */
	if (ill->ill_usesrc_ifindex != 0 && allow_usesrc) {
		usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex,
		    B_FALSE, ipst);
		if (usill != NULL)
			ill = usill;	/* Select source from usesrc ILL */
		else
			return (NULL);
	}

	/*
	 * Test addresses should never be used for source address selection,
	 * so if we were passed one, switch to the IPMP meta-interface.
	 */
	if (IS_UNDER_IPMP(ill)) {
		if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL)
			ill = ipmp_ill;	/* Select source from IPMP ill */
		else
			return (NULL);
	}

	/*
	 * If we're dealing with an unlabeled destination on a labeled system,
	 * make sure that we ignore source addresses that are incompatible with
	 * the destination's default label. That destination's default label
	 * must dominate the minimum label on the source address.
	 */
	dst_rhtp = NULL;
	if (is_system_labeled()) {
		dst_rhtp = find_tpc(&dst, IPV4_VERSION, B_FALSE);
		if (dst_rhtp == NULL)
			return (NULL);
		if (dst_rhtp->tpc_tp.host_type != UNLABELED) {
			TPC_RELE(dst_rhtp);
			dst_rhtp = NULL;
		}
	}

	/*
	 * Hold the ill_g_lock as reader. This makes sure that no ipif/ill
	 * can be deleted. But an ipif/ill can get CONDEMNED any time.
	 * After selecting the right ipif, under ill_lock make sure ipif is
	 * not condemned, and increment refcnt. If ipif is CONDEMNED,
	 * we retry. Inside the loop we still need to check for CONDEMNED,
	 * but not under a lock.
	 */
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
retry:
	/*
	 * For source address selection, we treat the ipif list as circular
	 * and continue until we get back to where we started. This allows
	 * IPMP to vary source address selection (which improves inbound load
	 * spreading) by caching its last ending point and starting from
	 * there. NOTE: we don't have to worry about ill_src_ipif changing
	 * ills since that can't happen on the IPMP ill.
	 */
	start_ipif = ill->ill_ipif;
	if (IS_IPMP(ill) && ill->ill_src_ipif != NULL)
		start_ipif = ill->ill_src_ipif;

	ipif = start_ipif;
	best_ipif = NULL;
	best_type = IPIF_NONE;
	do {
		if ((next_ipif = ipif->ipif_next) == NULL)
			next_ipif = ill->ill_ipif;

		if (IPIF_IS_CONDEMNED(ipif))
			continue;
		/* Always skip NOLOCAL and ANYCAST interfaces */
		if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
			continue;
		/* Always skip NOACCEPT interfaces */
		if (ipif->ipif_ill->ill_flags & ILLF_NOACCEPT)
			continue;
		if (!(ipif->ipif_flags & IPIF_UP))
			continue;

		if (!ipif->ipif_addr_ready) {
			/* Tell the caller an unverified address was skipped */
			if (notreadyp != NULL)
				*notreadyp = B_TRUE;
			continue;
		}

		if (zoneid != ALL_ZONES &&
		    ipif->ipif_zoneid != zoneid &&
		    ipif->ipif_zoneid != ALL_ZONES)
			continue;

		/*
		 * Interfaces with 0.0.0.0 address are allowed to be UP, but
		 * are not valid as source addresses.
		 */
		if (ipif->ipif_lcl_addr == INADDR_ANY)
			continue;

		/*
		 * Check compatibility of local address for destination's
		 * default label if we're on a labeled system. Incompatible
		 * addresses can't be used at all.
		 */
		if (dst_rhtp != NULL) {
			boolean_t incompat;

			src_rhtp = find_tpc(&ipif->ipif_lcl_addr,
			    IPV4_VERSION, B_FALSE);
			if (src_rhtp == NULL)
				continue;
			incompat = src_rhtp->tpc_tp.host_type != SUN_CIPSO ||
			    src_rhtp->tpc_tp.tp_doi !=
			    dst_rhtp->tpc_tp.tp_doi ||
			    (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label,
			    &src_rhtp->tpc_tp.tp_sl_range_cipso) &&
			    !blinlset(&dst_rhtp->tpc_tp.tp_def_label,
			    src_rhtp->tpc_tp.tp_sl_set_cipso));
			TPC_RELE(src_rhtp);
			if (incompat)
				continue;
		}

		/* Is the candidate on the same subnet as the destination? */
		samenet = ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet);

		/* Rate the candidate per the ipif_type_t ranking. */
		if (ipif->ipif_lcl_addr == dst) {
			type = IPIF_LOCALADDR;
		} else if (ipif->ipif_flags & IPIF_DEPRECATED) {
			type = samenet ? IPIF_SAMENET_DEPRECATED :
			    IPIF_DIFFNET_DEPRECATED;
		} else if (ipif->ipif_zoneid == ALL_ZONES) {
			type = samenet ? IPIF_SAMENET_ALLZONES :
			    IPIF_DIFFNET_ALLZONES;
		} else {
			type = samenet ? IPIF_SAMENET : IPIF_DIFFNET;
		}

		if (type > best_type) {
			best_type = type;
			best_ipif = ipif;
			if (best_type == IPIF_LOCALADDR)
				break; /* can't get better */
		}
	} while ((ipif = next_ipif) != start_ipif);

	if ((ipif = best_ipif) != NULL) {
		mutex_enter(&ipif->ipif_ill->ill_lock);
		if (IPIF_IS_CONDEMNED(ipif)) {
			mutex_exit(&ipif->ipif_ill->ill_lock);
			goto retry;
		}
		ipif_refhold_locked(ipif);

		/*
		 * For IPMP, update the source ipif rotor to the next ipif,
		 * provided we can look it up. (We must not use it if it's
		 * IPIF_CONDEMNED since we may have grabbed ill_g_lock after
		 * ipif_free() checked ill_src_ipif.)
		 */
		if (IS_IPMP(ill) && ipif != NULL) {
			next_ipif = ipif->ipif_next;
			if (next_ipif != NULL && !IPIF_IS_CONDEMNED(next_ipif))
				ill->ill_src_ipif = next_ipif;
			else
				ill->ill_src_ipif = NULL;
		}
		mutex_exit(&ipif->ipif_ill->ill_lock);
	}

	rw_exit(&ipst->ips_ill_g_lock);
	if (usill != NULL)
		ill_refrele(usill);
	if (ipmp_ill != NULL)
		ill_refrele(ipmp_ill);
	if (dst_rhtp != NULL)
		TPC_RELE(dst_rhtp);

#ifdef DEBUG
	if (ipif == NULL) {
		char buf1[INET6_ADDRSTRLEN];

		ip1dbg(("ipif_select_source_v4(%s, %s) -> NULL\n",
		    ill->ill_name,
		    inet_ntop(AF_INET, &dst, buf1, sizeof (buf1))));
	} else {
		char buf1[INET6_ADDRSTRLEN];
		char buf2[INET6_ADDRSTRLEN];

		ip1dbg(("ipif_select_source_v4(%s, %s) -> %s\n",
		    ipif->ipif_ill->ill_name,
		    inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)),
		    inet_ntop(AF_INET, &ipif->ipif_lcl_addr,
		    buf2, sizeof (buf2))));
	}
#endif /* DEBUG */
	return (ipif);
}
15602 15597
15603 15598 /*
15604 15599 * Pick a source address based on the destination ill and an optional setsrc
15605 15600 * address.
15606 15601 * The result is stored in srcp. If generation is set, then put the source
15607 15602 * generation number there before we look for the source address (to avoid
 * missing changes in the set of source addresses).
 * If flagsp is set, then use it to pass back ipif_flags.
15610 15605 *
15611 15606 * If the caller wants to cache the returned source address and detect when
15612 15607 * that might be stale, the caller should pass in a generation argument,
15613 15608 * which the caller can later compare against ips_src_generation
15614 15609 *
15615 15610 * The precedence order for selecting an IPv4 source address is:
15616 15611 * - RTF_SETSRC on the offlink ire always wins.
 * - If usesrc is set, swap the ill to be the usesrc one.
15618 15613 * - If IPMP is used on the ill, select a random address from the most
15619 15614 * preferred ones below:
15620 15615 * 1. If onlink destination, same subnet and not deprecated, not ALL_ZONES
15621 15616 * 2. Not deprecated, not ALL_ZONES
15622 15617 * 3. If onlink destination, same subnet and not deprecated, ALL_ZONES
15623 15618 * 4. Not deprecated, ALL_ZONES
15624 15619 * 5. If onlink destination, same subnet and deprecated
15625 15620 * 6. Deprecated.
15626 15621 *
15627 15622 * We have lower preference for ALL_ZONES IP addresses,
15628 15623 * as they pose problems with unlabeled destinations.
15629 15624 *
15630 15625 * Note that when multiple IP addresses match e.g., #1 we pick
15631 15626 * the first one if IPMP is not in use. With IPMP we randomize.
15632 15627 */
15633 15628 int
15634 15629 ip_select_source_v4(ill_t *ill, ipaddr_t setsrc, ipaddr_t dst,
15635 15630 ipaddr_t multicast_ifaddr,
15636 15631 zoneid_t zoneid, ip_stack_t *ipst, ipaddr_t *srcp,
15637 15632 uint32_t *generation, uint64_t *flagsp)
15638 15633 {
15639 15634 ipif_t *ipif;
15640 15635 boolean_t notready = B_FALSE; /* Set if !ipif_addr_ready found */
15641 15636
15642 15637 if (flagsp != NULL)
15643 15638 *flagsp = 0;
15644 15639
15645 15640 /*
15646 15641 * Need to grab the generation number before we check to
15647 15642 * avoid a race with a change to the set of local addresses.
15648 15643 * No lock needed since the thread which updates the set of local
15649 15644 * addresses use ipif/ill locks and exit those (hence a store memory
15650 15645 * barrier) before doing the atomic increase of ips_src_generation.
15651 15646 */
15652 15647 if (generation != NULL) {
15653 15648 *generation = ipst->ips_src_generation;
15654 15649 }
15655 15650
15656 15651 if (CLASSD(dst) && multicast_ifaddr != INADDR_ANY) {
15657 15652 *srcp = multicast_ifaddr;
15658 15653 return (0);
15659 15654 }
15660 15655
15661 15656 /* Was RTF_SETSRC set on the first IRE in the recursive lookup? */
15662 15657 if (setsrc != INADDR_ANY) {
15663 15658 *srcp = setsrc;
15664 15659 return (0);
15665 15660 }
15666 15661 ipif = ipif_select_source_v4(ill, dst, zoneid, B_TRUE, ¬ready);
15667 15662 if (ipif == NULL) {
15668 15663 if (notready)
15669 15664 return (ENETDOWN);
15670 15665 else
15671 15666 return (EADDRNOTAVAIL);
15672 15667 }
15673 15668 *srcp = ipif->ipif_lcl_addr;
15674 15669 if (flagsp != NULL)
15675 15670 *flagsp = ipif->ipif_flags;
15676 15671 ipif_refrele(ipif);
15677 15672 return (0);
15678 15673 }
15679 15674
15680 15675 /* ARGSUSED */
15681 15676 int
15682 15677 if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
15683 15678 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
15684 15679 {
15685 15680 /*
15686 15681 * ill_phyint_reinit merged the v4 and v6 into a single
15687 15682 * ipsq. We might not have been able to complete the
15688 15683 * operation in ipif_set_values, if we could not become
15689 15684 * exclusive. If so restart it here.
15690 15685 */
15691 15686 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q));
15692 15687 }
15693 15688
15694 15689 /*
15695 15690 * Can operate on either a module or a driver queue.
15696 15691 * Returns an error if not a module queue.
15697 15692 */
15698 15693 /* ARGSUSED */
15699 15694 int
15700 15695 if_unitsel(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
15701 15696 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
15702 15697 {
15703 15698 queue_t *q1 = q;
15704 15699 char *cp;
15705 15700 char interf_name[LIFNAMSIZ];
15706 15701 uint_t ppa = *(uint_t *)mp->b_cont->b_cont->b_rptr;
15707 15702
15708 15703 if (q->q_next == NULL) {
15709 15704 ip1dbg((
15710 15705 "if_unitsel: IF_UNITSEL: no q_next\n"));
15711 15706 return (EINVAL);
15712 15707 }
15713 15708
15714 15709 if (((ill_t *)(q->q_ptr))->ill_name[0] != '\0')
15715 15710 return (EALREADY);
15716 15711
15717 15712 do {
15718 15713 q1 = q1->q_next;
15719 15714 } while (q1->q_next);
15720 15715 cp = q1->q_qinfo->qi_minfo->mi_idname;
15721 15716 (void) sprintf(interf_name, "%s%d", cp, ppa);
15722 15717
15723 15718 /*
15724 15719 * Here we are not going to delay the ioack until after
15725 15720 * ACKs from DL_ATTACH_REQ/DL_BIND_REQ. So no need to save the
15726 15721 * original ioctl message before sending the requests.
15727 15722 */
15728 15723 return (ipif_set_values(q, mp, interf_name, &ppa));
15729 15724 }
15730 15725
15731 15726 /* ARGSUSED */
int
ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
{
	/* This ioctl is not supported here; always fail with ENXIO. */
	return (ENXIO);
}
15738 15733
15739 15734 /*
15740 15735 * Create any IRE_BROADCAST entries for `ipif', and store those entries in
15741 15736 * `irep'. Returns a pointer to the next free `irep' entry
15742 15737 * A mirror exists in ipif_delete_bcast_ires().
15743 15738 *
15744 15739 * The management of any "extra" or seemingly duplicate IRE_BROADCASTs is
15745 15740 * done in ire_add.
15746 15741 */
15747 15742 static ire_t **
15748 15743 ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep)
15749 15744 {
15750 15745 ipaddr_t addr;
15751 15746 ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr);
15752 15747 ipaddr_t subnetmask = ipif->ipif_net_mask;
15753 15748 ill_t *ill = ipif->ipif_ill;
15754 15749 zoneid_t zoneid = ipif->ipif_zoneid;
15755 15750
15756 15751 ip1dbg(("ipif_create_bcast_ires: creating broadcast IREs\n"));
15757 15752
15758 15753 ASSERT(ipif->ipif_flags & IPIF_BROADCAST);
15759 15754 ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT));
15760 15755
15761 15756 if (ipif->ipif_lcl_addr == INADDR_ANY ||
15762 15757 (ipif->ipif_flags & IPIF_NOLOCAL))
15763 15758 netmask = htonl(IN_CLASSA_NET); /* fallback */
15764 15759
15765 15760 irep = ire_create_bcast(ill, 0, zoneid, irep);
15766 15761 irep = ire_create_bcast(ill, INADDR_BROADCAST, zoneid, irep);
15767 15762
15768 15763 /*
15769 15764 * For backward compatibility, we create net broadcast IREs based on
15770 15765 * the old "IP address class system", since some old machines only
15771 15766 * respond to these class derived net broadcast. However, we must not
15772 15767 * create these net broadcast IREs if the subnetmask is shorter than
15773 15768 * the IP address class based derived netmask. Otherwise, we may
15774 15769 * create a net broadcast address which is the same as an IP address
15775 15770 * on the subnet -- and then TCP will refuse to talk to that address.
15776 15771 */
15777 15772 if (netmask < subnetmask) {
15778 15773 addr = netmask & ipif->ipif_subnet;
15779 15774 irep = ire_create_bcast(ill, addr, zoneid, irep);
15780 15775 irep = ire_create_bcast(ill, ~netmask | addr, zoneid, irep);
15781 15776 }
15782 15777
15783 15778 /*
15784 15779 * Don't create IRE_BROADCAST IREs for the interface if the subnetmask
15785 15780 * is 0xFFFFFFFF, as an IRE_LOCAL for that interface is already
15786 15781 * created. Creating these broadcast IREs will only create confusion
15787 15782 * as `addr' will be the same as the IP address.
15788 15783 */
15789 15784 if (subnetmask != 0xFFFFFFFF) {
15790 15785 addr = ipif->ipif_subnet;
15791 15786 irep = ire_create_bcast(ill, addr, zoneid, irep);
15792 15787 irep = ire_create_bcast(ill, ~subnetmask | addr, zoneid, irep);
15793 15788 }
15794 15789
15795 15790 return (irep);
15796 15791 }
15797 15792
/*
 * Mirror of ipif_create_bcast_ires(): look up and delete the set of
 * IRE_BROADCAST entries that function created for `ipif'.  The address
 * computations below must stay in lockstep with the create side.
 */
static void
ipif_delete_bcast_ires(ipif_t *ipif)
{
	ipaddr_t	addr;
	ipaddr_t	netmask = ip_net_mask(ipif->ipif_lcl_addr);
	ipaddr_t	subnetmask = ipif->ipif_net_mask;
	ill_t		*ill = ipif->ipif_ill;
	zoneid_t	zoneid = ipif->ipif_zoneid;
	ire_t		*ire;

	ASSERT(ipif->ipif_flags & IPIF_BROADCAST);
	ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT));

	/* Same fallback netmask selection as ipif_create_bcast_ires(). */
	if (ipif->ipif_lcl_addr == INADDR_ANY ||
	    (ipif->ipif_flags & IPIF_NOLOCAL))
		netmask = htonl(IN_CLASSA_NET); /* fallback */

	/* The all-zeroes and all-ones broadcast IREs always exist. */
	ire = ire_lookup_bcast(ill, 0, zoneid);
	ASSERT(ire != NULL);
	ire_delete(ire); ire_refrele(ire);
	ire = ire_lookup_bcast(ill, INADDR_BROADCAST, zoneid);
	ASSERT(ire != NULL);
	ire_delete(ire); ire_refrele(ire);

	/*
	 * Delete the backward-compatibility net broadcast IREs that
	 * ipif_create_bcast_ires() added for the old "IP address class
	 * system".  They were only created when netmask < subnetmask
	 * (see the create side), so only delete them under the same
	 * condition.
	 */
	if (netmask < subnetmask) {
		addr = netmask & ipif->ipif_subnet;
		ire = ire_lookup_bcast(ill, addr, zoneid);
		ASSERT(ire != NULL);
		ire_delete(ire); ire_refrele(ire);
		ire = ire_lookup_bcast(ill, ~netmask | addr, zoneid);
		ASSERT(ire != NULL);
		ire_delete(ire); ire_refrele(ire);
	}

	/*
	 * A 0xFFFFFFFF subnetmask means no subnet broadcast IREs were
	 * created (an IRE_LOCAL covers that address), so there is
	 * nothing to delete in that case.
	 */
	if (subnetmask != 0xFFFFFFFF) {
		addr = ipif->ipif_subnet;
		ire = ire_lookup_bcast(ill, addr, zoneid);
		ASSERT(ire != NULL);
		ire_delete(ire); ire_refrele(ire);
		ire = ire_lookup_bcast(ill, ~subnetmask | addr, zoneid);
		ASSERT(ire != NULL);
		ire_delete(ire); ire_refrele(ire);
	}
}
15860 15855
15861 15856 /*
15862 15857 * Extract both the flags (including IFF_CANTCHANGE) such as IFF_IPV*
15863 15858 * from lifr_flags and the name from lifr_name.
15864 15859 * Set IFF_IPV* and ill_isv6 prior to doing the lookup
15865 15860 * since ipif_lookup_on_name uses the _isv6 flags when matching.
15866 15861 * Returns EINPROGRESS when mp has been consumed by queueing it on
15867 15862 * ipx_pending_mp and the ioctl will complete in ip_rput.
15868 15863 *
15869 15864 * Can operate on either a module or a driver queue.
15870 15865 * Returns an error if not a module queue.
15871 15866 */
/* ARGSUSED */
int
ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	ill_t		*ill = q->q_ptr;
	phyint_t	*phyi;
	ip_stack_t	*ipst;
	struct lifreq	*lifr = if_req;
	uint64_t	new_flags;

	ASSERT(ipif != NULL);
	ip1dbg(("ip_sioctl_slifname %s\n", lifr->lifr_name));

	/* This ioctl is only valid when IP is pushed as a module. */
	if (q->q_next == NULL) {
		ip1dbg(("if_sioctl_slifname: SIOCSLIFNAME: no q_next\n"));
		return (EINVAL);
	}

	/*
	 * If we are not writer on 'q' then this interface exists already
	 * and previous lookups (ip_extract_lifreq()) found this ipif --
	 * so return EALREADY.
	 */
	if (ill != ipif->ipif_ill)
		return (EALREADY);

	/* The ill has already been named; a rename is not allowed. */
	if (ill->ill_name[0] != '\0')
		return (EALREADY);

	/*
	 * If there's another ill already with the requested name, ensure
	 * that it's of the same type.  Otherwise, ill_phyint_reinit() will
	 * fuse together two unrelated ills, which will cause chaos.
	 */
	ipst = ill->ill_ipst;
	phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
	    lifr->lifr_name, NULL);
	if (phyi != NULL) {
		ill_t *ill_mate = phyi->phyint_illv4;

		/* The phyint must have at least one of v4/v6 ills. */
		if (ill_mate == NULL)
			ill_mate = phyi->phyint_illv6;
		ASSERT(ill_mate != NULL);

		if (ill_mate->ill_media->ip_m_mac_type !=
		    ill->ill_media->ip_m_mac_type) {
			ip1dbg(("if_sioctl_slifname: SIOCSLIFNAME: attempt to "
			    "use the same ill name on differing media\n"));
			return (EINVAL);
		}
	}

	/*
	 * We start off as IFF_IPV4 in ipif_allocate and become
	 * IFF_IPV4 or IFF_IPV6 here depending on lifr_flags value.
	 * The only flags that we read from user space are IFF_IPV4,
	 * IFF_IPV6, and IFF_BROADCAST.
	 *
	 * This ill has not been inserted into the global list.
	 * So we are still single threaded and don't need any lock
	 *
	 * Sanity check the flags.
	 */

	/*
	 * IFF_BROADCAST is rejected for IPv6, and for links with no
	 * broadcast address (once attached, ill_bcast_addr_length is 0
	 * for non-broadcast media).
	 */
	if ((lifr->lifr_flags & IFF_BROADCAST) &&
	    ((lifr->lifr_flags & IFF_IPV6) ||
	    (!ill->ill_needs_attach && ill->ill_bcast_addr_length == 0))) {
		ip1dbg(("ip_sioctl_slifname: link not broadcast capable "
		    "or IPv6 i.e., no broadcast \n"));
		return (EINVAL);
	}

	new_flags =
	    lifr->lifr_flags & (IFF_IPV6|IFF_IPV4|IFF_BROADCAST);

	/* Both IFF_IPV4 and IFF_IPV6 at once is rejected. */
	if ((new_flags ^ (IFF_IPV6|IFF_IPV4)) == 0) {
		ip1dbg(("ip_sioctl_slifname: flags must be exactly one of "
		    "IFF_IPV4 or IFF_IPV6\n"));
		return (EINVAL);
	}

	/*
	 * We always start off as IPv4, so only need to check for IPv6.
	 */
	if ((new_flags & IFF_IPV6) != 0) {
		ill->ill_flags |= ILLF_IPV6;
		ill->ill_flags &= ~ILLF_IPV4;

		if (lifr->lifr_flags & IFF_NOLINKLOCAL)
			ill->ill_flags |= ILLF_NOLINKLOCAL;
	}

	if ((new_flags & IFF_BROADCAST) != 0)
		ipif->ipif_flags |= IPIF_BROADCAST;
	else
		ipif->ipif_flags &= ~IPIF_BROADCAST;

	/* We started off as V4. */
	if (ill->ill_flags & ILLF_IPV6) {
		ill->ill_phyint->phyint_illv6 = ill;
		ill->ill_phyint->phyint_illv4 = NULL;
	}

	return (ipif_set_values(q, mp, lifr->lifr_name, &lifr->lifr_ppa));
}
15978 15973
/* ARGSUSED */
int
ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	/*
	 * ill_phyint_reinit merged the v4 and v6 into a single
	 * ipsq.  We might not have been able to complete the
	 * slifname in ipif_set_values, if we could not become
	 * exclusive.  If so restart it here.
	 */
	return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q));
}
15992 15987
15993 15988 /*
15994 15989 * Return a pointer to the ipif which matches the index, IP version type and
15995 15990 * zoneid.
15996 15991 */
15997 15992 ipif_t *
15998 15993 ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid,
15999 15994 ip_stack_t *ipst)
16000 15995 {
16001 15996 ill_t *ill;
16002 15997 ipif_t *ipif = NULL;
16003 15998
16004 15999 ill = ill_lookup_on_ifindex(index, isv6, ipst);
16005 16000 if (ill != NULL) {
16006 16001 mutex_enter(&ill->ill_lock);
16007 16002 for (ipif = ill->ill_ipif; ipif != NULL;
16008 16003 ipif = ipif->ipif_next) {
16009 16004 if (!IPIF_IS_CONDEMNED(ipif) && (zoneid == ALL_ZONES ||
16010 16005 zoneid == ipif->ipif_zoneid ||
16011 16006 ipif->ipif_zoneid == ALL_ZONES)) {
16012 16007 ipif_refhold_locked(ipif);
16013 16008 break;
16014 16009 }
16015 16010 }
16016 16011 mutex_exit(&ill->ill_lock);
16017 16012 ill_refrele(ill);
16018 16013 }
16019 16014 return (ipif);
16020 16015 }
16021 16016
16022 16017 /*
16023 16018 * Change an existing physical interface's index. If the new index
16024 16019 * is acceptable we update the index and the phyint_list_avl_by_index tree.
16025 16020 * Finally, we update other systems which may have a dependence on the
16026 16021 * index value.
16027 16022 */
/* ARGSUSED */
int
ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	ill_t		*ill;
	phyint_t	*phyi;
	struct ifreq	*ifr = (struct ifreq *)ifreq;
	struct lifreq	*lifr = (struct lifreq *)ifreq;
	uint_t	old_index, index;
	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
	avl_index_t	where;

	/* The same buffer holds either an ifreq or a lifreq. */
	if (ipip->ipi_cmd_type == IF_CMD)
		index = ifr->ifr_index;
	else
		index = lifr->lifr_index;

	/*
	 * Only allow on physical interface. Also, index zero is illegal.
	 */
	ill = ipif->ipif_ill;
	phyi = ill->ill_phyint;
	if (ipif->ipif_id != 0 || index == 0 || index > IF_INDEX_MAX) {
		return (EINVAL);
	}

	/* If the index is not changing, no work to do */
	if (phyi->phyint_ifindex == index)
		return (0);

	/*
	 * Use phyint_exists() to determine if the new interface index
	 * is already in use. If the index is unused then we need to
	 * change the phyint's position in the phyint_list_avl_by_index
	 * tree. If we do not do this, subsequent lookups (using the new
	 * index value) will not find the phyint.
	 */
	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
	if (phyint_exists(index, ipst)) {
		rw_exit(&ipst->ips_ill_g_lock);
		return (EEXIST);
	}

	/*
	 * The new index is unused. Set it in the phyint. However we must not
	 * forget to trigger NE_IFINDEX_CHANGE event before the ifindex
	 * changes. The event must be bound to old ifindex value.
	 */
	ill_nic_event_dispatch(ill, 0, NE_IFINDEX_CHANGE,
	    &index, sizeof (index));

	old_index = phyi->phyint_ifindex;
	phyi->phyint_ifindex = index;

	/* Re-insert the phyint at its new position in the by-index AVL. */
	avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, phyi);
	(void) avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
	    &index, &where);
	avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
	    phyi, where);
	rw_exit(&ipst->ips_ill_g_lock);

	/* Update SCTP's ILL list */
	sctp_ill_reindex(ill, old_index);

	/* Send the routing sockets message */
	ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
	if (ILL_OTHER(ill))
		ip_rts_ifmsg(ILL_OTHER(ill)->ill_ipif, RTSQ_DEFAULT);

	/* Perhaps ilgs should use this ill */
	update_conn_ill(NULL, ill->ill_ipst);
	return (0);
}
16102 16097
16103 16098 /* ARGSUSED */
16104 16099 int
16105 16100 ip_sioctl_get_lifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16106 16101 ip_ioctl_cmd_t *ipip, void *ifreq)
16107 16102 {
16108 16103 struct ifreq *ifr = (struct ifreq *)ifreq;
16109 16104 struct lifreq *lifr = (struct lifreq *)ifreq;
16110 16105
16111 16106 ip1dbg(("ip_sioctl_get_lifindex(%s:%u %p)\n",
16112 16107 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
16113 16108 /* Get the interface index */
16114 16109 if (ipip->ipi_cmd_type == IF_CMD) {
16115 16110 ifr->ifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex;
16116 16111 } else {
16117 16112 lifr->lifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex;
16118 16113 }
16119 16114 return (0);
16120 16115 }
16121 16116
16122 16117 /* ARGSUSED */
16123 16118 int
16124 16119 ip_sioctl_get_lifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16125 16120 ip_ioctl_cmd_t *ipip, void *ifreq)
16126 16121 {
16127 16122 struct lifreq *lifr = (struct lifreq *)ifreq;
16128 16123
16129 16124 ip1dbg(("ip_sioctl_get_lifzone(%s:%u %p)\n",
16130 16125 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
16131 16126 /* Get the interface zone */
16132 16127 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
16133 16128 lifr->lifr_zoneid = ipif->ipif_zoneid;
16134 16129 return (0);
16135 16130 }
16136 16131
16137 16132 /*
16138 16133 * Set the zoneid of an interface.
16139 16134 */
/* ARGSUSED */
int
ip_sioctl_slifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	struct lifreq *lifr = (struct lifreq *)ifreq;
	int err = 0;
	boolean_t need_up = B_FALSE;
	zone_t *zptr;
	zone_status_t status;
	zoneid_t zoneid;

	ASSERT(ipip->ipi_cmd_type == LIF_CMD);
	if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) {
		/* ALL_ZONES is only accepted on labeled systems. */
		if (!is_system_labeled())
			return (ENOTSUP);
		/* Validate ALL_ZONES as if it were the global zone. */
		zoneid = GLOBAL_ZONEID;
	}

	/* cannot assign instance zero to a non-global zone */
	if (ipif->ipif_id == 0 && zoneid != GLOBAL_ZONEID)
		return (ENOTSUP);

	/*
	 * Cannot assign to a zone that doesn't exist or is shutting down. In
	 * the event of a race with the zone shutdown processing, since IP
	 * serializes this ioctl and SIOCGLIFCONF/SIOCLIFREMOVEIF, we know the
	 * interface will be cleaned up even if the zone is shut down
	 * immediately after the status check. If the interface can't be brought
	 * down right away, and the zone is shut down before the restart
	 * function is called, we resolve the possible races by rechecking the
	 * zone status in the restart function.
	 */
	if ((zptr = zone_find_by_id(zoneid)) == NULL)
		return (EINVAL);
	status = zone_status_get(zptr);
	zone_rele(zptr);

	if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING)
		return (EINVAL);

	if (ipif->ipif_flags & IPIF_UP) {
		/*
		 * If the interface is already marked up,
		 * we call ipif_down which will take care
		 * of ditching any IREs that have been set
		 * up based on the old interface address.
		 */
		err = ipif_logical_down(ipif, q, mp);
		if (err == EINPROGRESS)
			return (err);
		(void) ipif_down_tail(ipif);
		need_up = B_TRUE;
	}

	/*
	 * Note: the original lifr_zoneid (possibly ALL_ZONES), not the
	 * rewritten local `zoneid', is what gets committed by the tail.
	 */
	err = ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, need_up);
	return (err);
}
16198 16193
/*
 * Finish a zoneid change: commit the new zoneid on `ipif', notify SCTP
 * and multicast state, and bring the interface back up if `need_up'.
 */
static int
ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
    queue_t *q, mblk_t *mp, boolean_t need_up)
{
	int err = 0;
	ip_stack_t *ipst;

	ip1dbg(("ip_sioctl_zoneid_tail(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

	/* Find the IP stack from whichever kind of queue we were given. */
	if (CONN_Q(q))
		ipst = CONNQ_TO_IPST(q);
	else
		ipst = ILLQ_TO_IPST(q);

	/*
	 * For exclusive stacks we don't allow a different zoneid than
	 * global.
	 */
	if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID &&
	    zoneid != GLOBAL_ZONEID)
		return (EINVAL);

	/* Set the new zone id. */
	ipif->ipif_zoneid = zoneid;

	/* Update sctp list */
	sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);

	/* The default multicast interface might have changed */
	ire_increment_multicast_generation(ipst, ipif->ipif_ill->ill_isv6);

	if (need_up) {
		/*
		 * Now bring the interface back up. If this
		 * is the only IPIF for the ILL, ipif_up
		 * will have to re-bind to the device, so
		 * we may get back EINPROGRESS, in which
		 * case, this IOCTL will get completed in
		 * ip_rput_dlpi when we see the DL_BIND_ACK.
		 */
		err = ipif_up(ipif, q, mp);
	}
	return (err);
}
16244 16239
/* ARGSUSED */
int
ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	struct lifreq *lifr = (struct lifreq *)if_req;
	zoneid_t zoneid;
	zone_t *zptr;
	zone_status_t status;

	ASSERT(ipip->ipi_cmd_type == LIF_CMD);
	/* As in ip_sioctl_slifzone(), validate ALL_ZONES as the global zone */
	if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES)
		zoneid = GLOBAL_ZONEID;

	ip1dbg(("ip_sioctl_slifzone_restart(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

	/*
	 * We recheck the zone status to resolve the following race condition:
	 * 1) process sends SIOCSLIFZONE to put hme0:1 in zone "myzone";
	 * 2) hme0:1 is up and can't be brought down right away;
	 * ip_sioctl_slifzone() returns EINPROGRESS and the request is queued;
	 * 3) zone "myzone" is halted; the zone status switches to
	 * 'shutting_down' and the zones framework sends SIOCGLIFCONF to list
	 * the interfaces to remove - hme0:1 is not returned because it's not
	 * yet in "myzone", so it won't be removed;
	 * 4) the restart function for SIOCSLIFZONE is called; without the
	 * status check here, we would have hme0:1 in "myzone" after it's been
	 * destroyed.
	 * Note that if the status check fails, we need to bring the interface
	 * back to its state prior to ip_sioctl_slifzone(), hence the call to
	 * ipif_up_done[_v6]().
	 */
	status = ZONE_IS_UNINITIALIZED;
	if ((zptr = zone_find_by_id(zoneid)) != NULL) {
		status = zone_status_get(zptr);
		zone_rele(zptr);
	}
	if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) {
		/* Zone went away: undo by completing the interrupted up. */
		if (ipif->ipif_isv6) {
			(void) ipif_up_done_v6(ipif);
		} else {
			(void) ipif_up_done(ipif);
		}
		return (EINVAL);
	}

	(void) ipif_down_tail(ipif);

	return (ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp,
	    B_TRUE));
}
16297 16292
16298 16293 /*
16299 16294 * Return the number of addresses on `ill' with one or more of the values
16300 16295 * in `set' set and all of the values in `clear' clear.
16301 16296 */
16302 16297 static uint_t
16303 16298 ill_flagaddr_cnt(const ill_t *ill, uint64_t set, uint64_t clear)
16304 16299 {
16305 16300 ipif_t *ipif;
16306 16301 uint_t cnt = 0;
16307 16302
16308 16303 ASSERT(IAM_WRITER_ILL(ill));
16309 16304
16310 16305 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
16311 16306 if ((ipif->ipif_flags & set) && !(ipif->ipif_flags & clear))
16312 16307 cnt++;
16313 16308
16314 16309 return (cnt);
16315 16310 }
16316 16311
/*
 * Return the number of migratable addresses on `ill' that are under
 * application control: those with IPIF_DHCPRUNNING or IPIF_ADDRCONF
 * set and IPIF_NOFAILOVER clear.
 */
uint_t
ill_appaddr_cnt(const ill_t *ill)
{
	return (ill_flagaddr_cnt(ill, IPIF_DHCPRUNNING | IPIF_ADDRCONF,
	    IPIF_NOFAILOVER));
}
16327 16322
/*
 * Return the number of point-to-point addresses on `ill' (those with
 * IPIF_POINTOPOINT set; no flags are required to be clear).
 */
uint_t
ill_ptpaddr_cnt(const ill_t *ill)
{
	return (ill_flagaddr_cnt(ill, IPIF_POINTOPOINT, 0));
}
16336 16331
16337 16332 /* ARGSUSED */
16338 16333 int
16339 16334 ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16340 16335 ip_ioctl_cmd_t *ipip, void *ifreq)
16341 16336 {
16342 16337 struct lifreq *lifr = ifreq;
16343 16338
16344 16339 ASSERT(q->q_next == NULL);
16345 16340 ASSERT(CONN_Q(q));
16346 16341
16347 16342 ip1dbg(("ip_sioctl_get_lifusesrc(%s:%u %p)\n",
16348 16343 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
16349 16344 lifr->lifr_index = ipif->ipif_ill->ill_usesrc_ifindex;
16350 16345 ip1dbg(("ip_sioctl_get_lifusesrc:lifr_index = %d\n", lifr->lifr_index));
16351 16346
16352 16347 return (0);
16353 16348 }
16354 16349
16355 16350 /* Find the previous ILL in this usesrc group */
16356 16351 static ill_t *
16357 16352 ill_prev_usesrc(ill_t *uill)
16358 16353 {
16359 16354 ill_t *ill;
16360 16355
16361 16356 for (ill = uill->ill_usesrc_grp_next;
16362 16357 ASSERT(ill), ill->ill_usesrc_grp_next != uill;
16363 16358 ill = ill->ill_usesrc_grp_next)
16364 16359 /* do nothing */;
16365 16360 return (ill);
16366 16361 }
16367 16362
16368 16363 /*
16369 16364 * Release all members of the usesrc group. This routine is called
16370 16365 * from ill_delete when the interface being unplumbed is the
16371 16366 * group head.
16372 16367 *
16373 16368 * This silently clears the usesrc that ifconfig setup.
16374 16369 * An alternative would be to keep that ifindex, and drop packets on the floor
16375 16370 * since no source address can be selected.
16376 16371 * Even if we keep the current semantics, don't need a lock and a linked list.
16377 16372 * Can walk all the ills checking if they have a ill_usesrc_ifindex matching
16378 16373 * the one that is being removed. Issue is how we return the usesrc users
16379 16374 * (SIOCGLIFSRCOF). We want to be able to find the ills which have an
16380 16375 * ill_usesrc_ifindex matching a target ill. We could also do that with an
16381 16376 * ill walk, but the walker would need to insert in the ioctl response.
16382 16377 */
16383 16378 static void
16384 16379 ill_disband_usesrc_group(ill_t *uill)
16385 16380 {
16386 16381 ill_t *next_ill, *tmp_ill;
16387 16382 ip_stack_t *ipst = uill->ill_ipst;
16388 16383
16389 16384 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock));
16390 16385 next_ill = uill->ill_usesrc_grp_next;
16391 16386
16392 16387 do {
16393 16388 ASSERT(next_ill != NULL);
16394 16389 tmp_ill = next_ill->ill_usesrc_grp_next;
16395 16390 ASSERT(tmp_ill != NULL);
16396 16391 next_ill->ill_usesrc_grp_next = NULL;
16397 16392 next_ill->ill_usesrc_ifindex = 0;
16398 16393 next_ill = tmp_ill;
16399 16394 } while (next_ill->ill_usesrc_ifindex != 0);
16400 16395 uill->ill_usesrc_grp_next = NULL;
16401 16396 }
16402 16397
/*
 * Remove the client usesrc ILL `ucill' from its current group list and,
 * when `ifindex' is non-zero, relink it into the group headed by `uill'.
 * An `ifindex' of zero is a pure removal.  Returns 0 on success, -1 when
 * the ILLs are not in a relinkable state.
 */
int
ill_relink_usesrc_ills(ill_t *ucill, ill_t *uill, uint_t ifindex)
{
	ill_t *ill, *tmp_ill;
	ip_stack_t *ipst = ucill->ill_ipst;

	ASSERT((ucill != NULL) && (ucill->ill_usesrc_grp_next != NULL) &&
	    (uill != NULL) && RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock));

	/*
	 * Check if the usesrc client ILL passed in is not already
	 * in use as a usesrc ILL i.e one whose source address is
	 * in use OR a usesrc ILL is not already in use as a usesrc
	 * client ILL
	 */
	if ((ucill->ill_usesrc_ifindex == 0) ||
	    (uill->ill_usesrc_ifindex != 0)) {
		return (-1);
	}

	ill = ill_prev_usesrc(ucill);
	ASSERT(ill->ill_usesrc_grp_next != NULL);

	/* Remove from the current list */
	if (ill->ill_usesrc_grp_next->ill_usesrc_grp_next == ill) {
		/* Only two elements in the list */
		ASSERT(ill->ill_usesrc_ifindex == 0);
		ill->ill_usesrc_grp_next = NULL;
	} else {
		/* Bypass ucill: link its predecessor to its successor. */
		ill->ill_usesrc_grp_next = ucill->ill_usesrc_grp_next;
	}

	/* ifindex zero means pure removal: leave ucill ungrouped. */
	if (ifindex == 0) {
		ucill->ill_usesrc_ifindex = 0;
		ucill->ill_usesrc_grp_next = NULL;
		return (0);
	}

	/* Insert ucill right after the head `uill' in the circular list. */
	ucill->ill_usesrc_ifindex = ifindex;
	tmp_ill = uill->ill_usesrc_grp_next;
	uill->ill_usesrc_grp_next = ucill;
	ucill->ill_usesrc_grp_next =
	    (tmp_ill != NULL) ? tmp_ill : uill;
	return (0);
}
16451 16446
16452 16447 /*
16453 16448 * Set the ill_usesrc and ill_usesrc_head fields. See synchronization notes in
16454 16449 * ip.c for locking details.
16455 16450 */
/* ARGSUSED */
int
ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	struct lifreq *lifr = (struct lifreq *)ifreq;
	boolean_t isv6 = B_FALSE, reset_flg = B_FALSE;
	ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill;
	int err = 0, ret;
	uint_t ifindex;
	ipsq_t *ipsq = NULL;
	ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;

	ASSERT(IAM_WRITER_IPIF(ipif));
	ASSERT(q->q_next == NULL);
	ASSERT(CONN_Q(q));

	isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6;

	/* An ifindex of zero requests a reset of any existing grouping. */
	ifindex = lifr->lifr_index;
	if (ifindex == 0) {
		if (usesrc_cli_ill->ill_usesrc_grp_next == NULL) {
			/* non usesrc group interface, nothing to reset */
			return (0);
		}
		ifindex = usesrc_cli_ill->ill_usesrc_ifindex;
		/* valid reset request */
		reset_flg = B_TRUE;
	}

	usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
	if (usesrc_ill == NULL)
		return (ENXIO);
	/* An interface cannot use itself as its usesrc ill. */
	if (usesrc_ill == ipif->ipif_ill) {
		ill_refrele(usesrc_ill);
		return (EINVAL);
	}

	/* Serialize against other exclusive operations on the usesrc ill. */
	ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl,
	    NEW_OP, B_TRUE);
	if (ipsq == NULL) {
		err = EINPROGRESS;
		/* Operation enqueued on the ipsq of the usesrc ILL */
		goto done;
	}

	/* USESRC isn't currently supported with IPMP */
	if (IS_IPMP(usesrc_ill) || IS_UNDER_IPMP(usesrc_ill)) {
		err = ENOTSUP;
		goto done;
	}

	/*
	 * USESRC isn't compatible with the STANDBY flag.  (STANDBY is only
	 * used by IPMP underlying interfaces, but someone might think it's
	 * more general and try to use it independently with VNI.)
	 */
	if (usesrc_ill->ill_phyint->phyint_flags & PHYI_STANDBY) {
		err = ENOTSUP;
		goto done;
	}

	/*
	 * If the client is already in use as a usesrc_ill or a usesrc_ill is
	 * already a client then return EINVAL
	 */
	if (IS_USESRC_ILL(usesrc_cli_ill) || IS_USESRC_CLI_ILL(usesrc_ill)) {
		err = EINVAL;
		goto done;
	}

	/*
	 * If the ill_usesrc_ifindex field is already set to what it needs to
	 * be then this is a duplicate operation.
	 */
	if (!reset_flg && usesrc_cli_ill->ill_usesrc_ifindex == ifindex) {
		err = 0;
		goto done;
	}

	ip1dbg(("ip_sioctl_slifusesrc: usesrc_cli_ill %s, usesrc_ill %s,"
	    " v6 = %d", usesrc_cli_ill->ill_name, usesrc_ill->ill_name,
	    usesrc_ill->ill_isv6));

	/*
	 * ill_g_usesrc_lock global lock protects the ill_usesrc_grp_next
	 * and the ill_usesrc_ifindex fields
	 */
	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);

	if (reset_flg) {
		ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 0);
		if (ret != 0) {
			err = EINVAL;
		}
		rw_exit(&ipst->ips_ill_g_usesrc_lock);
		goto done;
	}

	/*
	 * Four possibilities to consider:
	 * 1. Both usesrc_ill and usesrc_cli_ill are not part of any usesrc grp
	 * 2. usesrc_ill is part of a group but usesrc_cli_ill isn't
	 * 3. usesrc_cli_ill is part of a group but usesrc_ill isn't
	 * 4. Both are part of their respective usesrc groups
	 */
	if ((usesrc_ill->ill_usesrc_grp_next == NULL) &&
	    (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) {
		/* Case 1: form a new two-element circular list. */
		ASSERT(usesrc_ill->ill_usesrc_ifindex == 0);
		usesrc_cli_ill->ill_usesrc_ifindex = ifindex;
		usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill;
		usesrc_cli_ill->ill_usesrc_grp_next = usesrc_ill;
	} else if ((usesrc_ill->ill_usesrc_grp_next != NULL) &&
	    (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) {
		/* Case 2: add the client to the existing group. */
		usesrc_cli_ill->ill_usesrc_ifindex = ifindex;
		/* Insert at head of list */
		usesrc_cli_ill->ill_usesrc_grp_next =
		    usesrc_ill->ill_usesrc_grp_next;
		usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill;
	} else {
		/* Cases 3 and 4: let ill_relink_usesrc_ills() do the work. */
		ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill,
		    ifindex);
		if (ret != 0)
			err = EINVAL;
	}
	rw_exit(&ipst->ips_ill_g_usesrc_lock);

done:
	if (ipsq != NULL)
		ipsq_exit(ipsq);
	/* The refrele on the lifr_name ipif is done by ip_process_ioctl */
	ill_refrele(usesrc_ill);

	/* Let conn_ixa caching know that source address selection changed */
	ip_update_source_selection(ipst);

	return (err);
}
16594 16589
16595 16590 /* ARGSUSED */
16596 16591 int
16597 16592 ip_sioctl_get_dadstate(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16598 16593 ip_ioctl_cmd_t *ipip, void *if_req)
16599 16594 {
16600 16595 struct lifreq *lifr = (struct lifreq *)if_req;
16601 16596 ill_t *ill = ipif->ipif_ill;
16602 16597
16603 16598 /*
16604 16599 * Need a lock since IFF_UP can be set even when there are
16605 16600 * references to the ipif.
16606 16601 */
16607 16602 mutex_enter(&ill->ill_lock);
16608 16603 if ((ipif->ipif_flags & IPIF_UP) && ipif->ipif_addr_ready == 0)
16609 16604 lifr->lifr_dadstate = DAD_IN_PROGRESS;
16610 16605 else
16611 16606 lifr->lifr_dadstate = DAD_DONE;
16612 16607 mutex_exit(&ill->ill_lock);
16613 16608 return (0);
16614 16609 }
16615 16610
16616 16611 /*
16617 16612 * comparison function used by avl.
16618 16613 */
16619 16614 static int
16620 16615 ill_phyint_compare_index(const void *index_ptr, const void *phyip)
16621 16616 {
16622 16617
16623 16618 uint_t index;
16624 16619
16625 16620 ASSERT(phyip != NULL && index_ptr != NULL);
16626 16621
16627 16622 index = *((uint_t *)index_ptr);
16628 16623 /*
16629 16624 * let the phyint with the lowest index be on top.
16630 16625 */
16631 16626 if (((phyint_t *)phyip)->phyint_ifindex < index)
16632 16627 return (1);
16633 16628 if (((phyint_t *)phyip)->phyint_ifindex > index)
16634 16629 return (-1);
16635 16630 return (0);
16636 16631 }
16637 16632
16638 16633 /*
16639 16634 * comparison function used by avl.
16640 16635 */
16641 16636 static int
16642 16637 ill_phyint_compare_name(const void *name_ptr, const void *phyip)
16643 16638 {
16644 16639 ill_t *ill;
16645 16640 int res = 0;
16646 16641
16647 16642 ASSERT(phyip != NULL && name_ptr != NULL);
16648 16643
16649 16644 if (((phyint_t *)phyip)->phyint_illv4)
16650 16645 ill = ((phyint_t *)phyip)->phyint_illv4;
16651 16646 else
16652 16647 ill = ((phyint_t *)phyip)->phyint_illv6;
16653 16648 ASSERT(ill != NULL);
16654 16649
16655 16650 res = strcmp(ill->ill_name, (char *)name_ptr);
16656 16651 if (res > 0)
16657 16652 return (1);
16658 16653 else if (res < 0)
16659 16654 return (-1);
16660 16655 return (0);
16661 16656 }
16662 16657
/*
 * This function is called on the unplumb path via ill_glist_delete() when
 * there are no ills left on the phyint and thus the phyint can be freed.
 */
static void
phyint_free(phyint_t *phyi)
{
	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);

	/* Both ills must already have been detached from this phyint. */
	ASSERT(phyi->phyint_illv4 == NULL && phyi->phyint_illv6 == NULL);

	/*
	 * If this phyint was an IPMP meta-interface, blow away the group.
	 * This is safe to do because all of the illgrps have already been
	 * removed by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find us.
	 * If we're cleaning up as a result of failed initialization,
	 * phyint_grp may be NULL.
	 */
	if ((phyi->phyint_flags & PHYI_IPMP) && (phyi->phyint_grp != NULL)) {
		rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
		ipmp_grp_destroy(phyi->phyint_grp);
		phyi->phyint_grp = NULL;
		rw_exit(&ipst->ips_ipmp_lock);
	}

	/*
	 * If this interface was under IPMP, take it out of the group.
	 */
	if (phyi->phyint_grp != NULL)
		ipmp_phyint_leave_grp(phyi);

	/*
	 * Delete the phyint and disassociate its ipsq. The ipsq itself
	 * will be freed in ipsq_exit().
	 */
	phyi->phyint_ipsq->ipsq_phyint = NULL;
	/* NOTE(review): name cleared before free, presumably defensive. */
	phyi->phyint_name[0] = '\0';

	mi_free(phyi);
}
16703 16698
/*
 * Attach the ill to the phyint structure which can be shared by both
 * IPv4 and IPv6 ill. ill_init allocates a phyint to just hold flags. This
 * function is called from ipif_set_values and ill_lookup_on_name (for
 * loopback) where we know the name of the ill. We lookup the ill and if
 * there is one present already with the name use that phyint. Otherwise
 * reuse the one allocated by ill_init.
 *
 * Caller must hold ips_ill_g_lock as writer (asserted below).
 */
static void
ill_phyint_reinit(ill_t *ill)
{
	boolean_t isv6 = ill->ill_isv6;
	phyint_t *phyi_old;
	phyint_t *phyi;
	avl_index_t where = 0;
	ill_t *ill_other = NULL;
	ip_stack_t *ipst = ill->ill_ipst;

	ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));

	/* The interim phyint from ill_init holds only this one ill. */
	phyi_old = ill->ill_phyint;
	ASSERT(isv6 || (phyi_old->phyint_illv4 == ill &&
	    phyi_old->phyint_illv6 == NULL));
	ASSERT(!isv6 || (phyi_old->phyint_illv6 == ill &&
	    phyi_old->phyint_illv4 == NULL));
	ASSERT(phyi_old->phyint_ifindex == 0);

	/*
	 * Now that our ill has a name, set it in the phyint.
	 */
	(void) strlcpy(ill->ill_phyint->phyint_name, ill->ill_name, LIFNAMSIZ);

	phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
	    ill->ill_name, &where);

	/*
	 * 1. We grabbed the ill_g_lock before inserting this ill into
	 * the global list of ills. So no other thread could have located
	 * this ill and hence the ipsq of this ill is guaranteed to be empty.
	 * 2. Now locate the other protocol instance of this ill.
	 * 3. Now grab both ill locks in the right order, and the phyint lock of
	 * the new ipsq. Holding ill locks + ill_g_lock ensures that the ipsq
	 * of neither ill can change.
	 * 4. Merge the phyint and thus the ipsq as well of this ill onto the
	 * other ill.
	 * 5. Release all locks.
	 */

	/*
	 * Look for IPv4 if we are initializing IPv6 or look for IPv6 if
	 * we are initializing IPv4.
	 */
	if (phyi != NULL) {
		/* A phyint with this name exists: merge onto it. */
		ill_other = (isv6) ? phyi->phyint_illv4 : phyi->phyint_illv6;
		ASSERT(ill_other->ill_phyint != NULL);
		ASSERT((isv6 && !ill_other->ill_isv6) ||
		    (!isv6 && ill_other->ill_isv6));
		GRAB_ILL_LOCKS(ill, ill_other);
		/*
		 * We are potentially throwing away phyint_flags which
		 * could be different from the one that we obtain from
		 * ill_other->ill_phyint. But it is okay as we are assuming
		 * that the state maintained within IP is correct.
		 */
		mutex_enter(&phyi->phyint_lock);
		if (isv6) {
			ASSERT(phyi->phyint_illv6 == NULL);
			phyi->phyint_illv6 = ill;
		} else {
			ASSERT(phyi->phyint_illv4 == NULL);
			phyi->phyint_illv4 = ill;
		}

		/*
		 * Delete the old phyint and make its ipsq eligible
		 * to be freed in ipsq_exit().
		 */
		phyi_old->phyint_illv4 = NULL;
		phyi_old->phyint_illv6 = NULL;
		phyi_old->phyint_ipsq->ipsq_phyint = NULL;
		phyi_old->phyint_name[0] = '\0';
		mi_free(phyi_old);
	} else {
		/*
		 * No existing phyint with this name: keep the one from
		 * ill_init and insert it into the global AVL trees.
		 * ill_lock is taken here so that RELEASE_ILL_LOCKS at the
		 * end of the function can drop it (ill_other is NULL on
		 * this path).
		 */
		mutex_enter(&ill->ill_lock);
		/*
		 * We don't need to acquire any lock, since
		 * the ill is not yet visible globally and we
		 * have not yet released the ill_g_lock.
		 */
		phyi = phyi_old;
		mutex_enter(&phyi->phyint_lock);
		/* XXX We need a recovery strategy here. */
		if (!phyint_assign_ifindex(phyi, ipst))
			cmn_err(CE_PANIC, "phyint_assign_ifindex() failed");

		avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
		    (void *)phyi, where);

		(void) avl_find(&ipst->ips_phyint_g_list->
		    phyint_list_avl_by_index,
		    &phyi->phyint_ifindex, &where);
		avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
		    (void *)phyi, where);
	}

	/*
	 * Reassigning ill_phyint automatically reassigns the ipsq also.
	 * pending mp is not affected because that is per ill basis.
	 */
	ill->ill_phyint = phyi;

	/*
	 * Now that the phyint's ifindex has been assigned, complete the
	 * remaining
	 */
	ill->ill_ip_mib->ipIfStatsIfIndex = ill->ill_phyint->phyint_ifindex;
	if (ill->ill_isv6) {
		ill->ill_icmp6_mib->ipv6IfIcmpIfIndex =
		    ill->ill_phyint->phyint_ifindex;
		ill->ill_mcast_type = ipst->ips_mld_max_version;
	} else {
		ill->ill_mcast_type = ipst->ips_igmp_max_version;
	}

	/*
	 * Generate an event within the hooks framework to indicate that
	 * a new interface has just been added to IP. For this event to
	 * be generated, the network interface must, at least, have an
	 * ifindex assigned to it. (We don't generate the event for
	 * loopback since ill_lookup_on_name() has its own NE_PLUMB event.)
	 *
	 * This needs to be run inside the ill_g_lock perimeter to ensure
	 * that the ordering of delivered events to listeners matches the
	 * order of them in the kernel.
	 */
	if (!IS_LOOPBACK(ill)) {
		ill_nic_event_dispatch(ill, 0, NE_PLUMB, ill->ill_name,
		    ill->ill_name_length);
	}
	RELEASE_ILL_LOCKS(ill, ill_other);
	mutex_exit(&phyi->phyint_lock);
}
16846 16841
/*
 * Notify any downstream modules of the name of this interface.
 * An M_IOCTL is used even though we don't expect a successful reply.
 * Any reply message from the driver (presumably an M_IOCNAK) will
 * eventually get discarded somewhere upstream. The message format is
 * simply an SIOCSLIFNAME ioctl just as might be sent from ifconfig
 * to IP.
 */
static void
ip_ifname_notify(ill_t *ill, queue_t *q)
{
	mblk_t *mp1, *mp2;
	struct iocblk *iocp;
	struct lifreq *lifr;

	/* M_IOCTL header block; on allocation failure give up silently. */
	mp1 = mkiocb(SIOCSLIFNAME);
	if (mp1 == NULL)
		return;
	/* Payload block carrying the lifreq. */
	mp2 = allocb(sizeof (struct lifreq), BPRI_HI);
	if (mp2 == NULL) {
		freeb(mp1);
		return;
	}

	mp1->b_cont = mp2;
	iocp = (struct iocblk *)mp1->b_rptr;
	iocp->ioc_count = sizeof (struct lifreq);

	/* bzero guarantees lifr_name is NUL-terminated after strncpy. */
	lifr = (struct lifreq *)mp2->b_rptr;
	mp2->b_wptr += sizeof (struct lifreq);
	bzero(lifr, sizeof (struct lifreq));

	(void) strncpy(lifr->lifr_name, ill->ill_name, LIFNAMSIZ);
	lifr->lifr_ppa = ill->ill_ppa;
	lifr->lifr_flags = (ill->ill_flags & (ILLF_IPV4|ILLF_IPV6));

	DTRACE_PROBE3(ill__dlpi, char *, "ip_ifname_notify",
	    char *, "SIOCSLIFNAME", ill_t *, ill);
	putnext(q, mp1);
}
16887 16882
/*
 * Finish SIOCSLIFNAME processing once the ill has its name: perform the
 * IPMP meta-interface group setup, notify downstream modules of the name,
 * start the DLPI bind via ill_dl_phys(), and arm the multicast slow
 * timeout for the ill's address family.
 *
 * Returns EINPROGRESS in the usual case (ill_dl_phys consumed mp), or an
 * error such as ENOMEM/EEXIST.
 */
static int
ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
{
	int err;
	ip_stack_t *ipst = ill->ill_ipst;
	phyint_t *phyi = ill->ill_phyint;

	/*
	 * Now that ill_name is set, the configuration for the IPMP
	 * meta-interface can be performed.
	 */
	if (IS_IPMP(ill)) {
		rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
		/*
		 * If phyi->phyint_grp is NULL, then this is the first IPMP
		 * meta-interface and we need to create the IPMP group.
		 */
		if (phyi->phyint_grp == NULL) {
			/*
			 * If someone has renamed another IPMP group to have
			 * the same name as our interface, bail.
			 */
			if (ipmp_grp_lookup(ill->ill_name, ipst) != NULL) {
				rw_exit(&ipst->ips_ipmp_lock);
				return (EEXIST);
			}
			phyi->phyint_grp = ipmp_grp_create(ill->ill_name, phyi);
			if (phyi->phyint_grp == NULL) {
				rw_exit(&ipst->ips_ipmp_lock);
				return (ENOMEM);
			}
		}
		rw_exit(&ipst->ips_ipmp_lock);
	}

	/* Tell downstream modules where they are. */
	ip_ifname_notify(ill, q);

	/*
	 * ill_dl_phys returns EINPROGRESS in the usual case.
	 * Error cases are ENOMEM ...
	 */
	err = ill_dl_phys(ill, ipif, mp, q);

	/*
	 * Arm the MLD (v6) or IGMP (v4) slow timeout for this stack if it
	 * is not already running; the id check is done under the
	 * corresponding slowtimeout lock.
	 */
	if (ill->ill_isv6) {
		mutex_enter(&ipst->ips_mld_slowtimeout_lock);
		if (ipst->ips_mld_slowtimeout_id == 0) {
			ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo,
			    (void *)ipst,
			    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
		}
		mutex_exit(&ipst->ips_mld_slowtimeout_lock);
	} else {
		mutex_enter(&ipst->ips_igmp_slowtimeout_lock);
		if (ipst->ips_igmp_slowtimeout_id == 0) {
			ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo,
			    (void *)ipst,
			    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
		}
		mutex_exit(&ipst->ips_igmp_slowtimeout_lock);
	}

	return (err);
}
16952 16947
/*
 * Common routine for ppa and ifname setting. Should be called exclusive.
 *
 * Returns EINPROGRESS when mp has been consumed by queueing it on
 * ipx_pending_mp and the ioctl will complete in ip_rput.
 *
 * NOTE : If ppa is UNIT_MAX, we assign the next valid ppa and return
 * the new name and new ppa in lifr_name and lifr_ppa respectively.
 * For SLIFNAME, we pass these values back to the userland.
 */
static int
ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr)
{
	ill_t *ill;
	ipif_t *ipif;
	ipsq_t *ipsq;
	char *ppa_ptr;
	char *old_ptr;
	char old_char;
	int error;
	ip_stack_t *ipst;

	ip1dbg(("ipif_set_values: interface %s\n", interf_name));
	ASSERT(q->q_next != NULL);
	ASSERT(interf_name != NULL);

	ill = (ill_t *)q->q_ptr;
	ipst = ill->ill_ipst;

	/* The ill must be freshly allocated: unnamed and with no ppa yet. */
	ASSERT(ill->ill_ipst != NULL);
	ASSERT(ill->ill_name[0] == '\0');
	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT((mi_strlen(interf_name) + 1) <= LIFNAMSIZ);
	ASSERT(ill->ill_ppa == UINT_MAX);

	ill->ill_defend_start = ill->ill_defend_count = 0;
	/* The ppa is sent down by ifconfig or is chosen */
	if ((ppa_ptr = ill_get_ppa_ptr(interf_name)) == NULL) {
		return (EINVAL);
	}

	/*
	 * make sure ppa passed in is same as ppa in the name.
	 * This check is not made when ppa == UINT_MAX in that case ppa
	 * in the name could be anything. System will choose a ppa and
	 * update new_ppa_ptr and inter_name to contain the choosen ppa.
	 */
	if (*new_ppa_ptr != UINT_MAX) {
		/* stoi changes the pointer */
		old_ptr = ppa_ptr;
		/*
		 * ifconfig passed in 0 for the ppa for DLPI 1 style devices
		 * (they don't have an externally visible ppa). We assign one
		 * here so that we can manage the interface. Note that in
		 * the past this value was always 0 for DLPI 1 drivers.
		 */
		if (*new_ppa_ptr == 0)
			*new_ppa_ptr = stoi(&old_ptr);
		else if (*new_ppa_ptr != (uint_t)stoi(&old_ptr))
			return (EINVAL);
	}
	/*
	 * terminate string before ppa
	 * save char at that location.
	 */
	old_char = ppa_ptr[0];
	ppa_ptr[0] = '\0';

	ill->ill_ppa = *new_ppa_ptr;
	/*
	 * Finish as much work now as possible before calling ill_glist_insert
	 * which makes the ill globally visible and also merges it with the
	 * other protocol instance of this phyint. The remaining work is
	 * done after entering the ipsq which may happen sometime later.
	 */
	ipif = ill->ill_ipif;

	/* We didn't do this when we allocated ipif in ip_ll_subnet_defaults */
	ipif_assign_seqid(ipif);

	/* Default to IPv4 if neither address family flag was supplied. */
	if (!(ill->ill_flags & (ILLF_IPV4|ILLF_IPV6)))
		ill->ill_flags |= ILLF_IPV4;

	ASSERT(ipif->ipif_next == NULL);	/* Only one ipif on ill */
	ASSERT((ipif->ipif_flags & IPIF_UP) == 0);

	if (ill->ill_flags & ILLF_IPV6) {

		ill->ill_isv6 = B_TRUE;
		ill_set_inputfn(ill);
		if (ill->ill_rq != NULL) {
			ill->ill_rq->q_qinfo = &iprinitv6;
		}

		/* Keep the !IN6_IS_ADDR_V4MAPPED assertions happy */
		ipif->ipif_v6lcl_addr = ipv6_all_zeros;
		ipif->ipif_v6subnet = ipv6_all_zeros;
		ipif->ipif_v6net_mask = ipv6_all_zeros;
		ipif->ipif_v6brd_addr = ipv6_all_zeros;
		ipif->ipif_v6pp_dst_addr = ipv6_all_zeros;
		ill->ill_reachable_retrans_time = ND_RETRANS_TIMER;
		/*
		 * point-to-point or Non-mulicast capable
		 * interfaces won't do NUD unless explicitly
		 * configured to do so.
		 */
		if (ipif->ipif_flags & IPIF_POINTOPOINT ||
		    !(ill->ill_flags & ILLF_MULTICAST)) {
			ill->ill_flags |= ILLF_NONUD;
		}
		/* Make sure IPv4 specific flag is not set on IPv6 if */
		if (ill->ill_flags & ILLF_NOARP) {
			/*
			 * Note: xresolv interfaces will eventually need
			 * NOARP set here as well, but that will require
			 * those external resolvers to have some
			 * knowledge of that flag and act appropriately.
			 * Not to be changed at present.
			 */
			ill->ill_flags &= ~ILLF_NOARP;
		}
		/*
		 * Set the ILLF_ROUTER flag according to the global
		 * IPv6 forwarding policy.
		 */
		if (ipst->ips_ipv6_forwarding != 0)
			ill->ill_flags |= ILLF_ROUTER;
	} else if (ill->ill_flags & ILLF_IPV4) {
		ill->ill_isv6 = B_FALSE;
		ill_set_inputfn(ill);
		ill->ill_reachable_retrans_time = ARP_RETRANS_TIMER;
		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6lcl_addr);
		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6subnet);
		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6net_mask);
		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6brd_addr);
		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6pp_dst_addr);
		/*
		 * Set the ILLF_ROUTER flag according to the global
		 * IPv4 forwarding policy.
		 */
		if (ipst->ips_ip_forwarding != 0)
			ill->ill_flags |= ILLF_ROUTER;
	}

	ASSERT(ill->ill_phyint != NULL);

	/*
	 * The ipIfStatsIfindex and ipv6IfIcmpIfIndex assignments will
	 * be completed in ill_glist_insert -> ill_phyint_reinit
	 */
	if (!ill_allocate_mibs(ill))
		return (ENOMEM);

	/*
	 * Pick a default sap until we get the DL_INFO_ACK back from
	 * the driver.
	 */
	ill->ill_sap = (ill->ill_isv6) ? ill->ill_media->ip_m_ipv6sap :
	    ill->ill_media->ip_m_ipv4sap;

	ill->ill_ifname_pending = 1;
	ill->ill_ifname_pending_err = 0;

	/*
	 * When the first ipif comes up in ipif_up_done(), multicast groups
	 * that were joined while this ill was not bound to the DLPI link need
	 * to be recovered by ill_recover_multicast().
	 */
	ill->ill_need_recover_multicast = 1;

	ill_refhold(ill);
	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
	/*
	 * NOTE(review): only strictly positive returns are treated as
	 * failure here — presumably ill_glist_insert returns 0 or a
	 * positive errno; confirm.
	 */
	if ((error = ill_glist_insert(ill, interf_name,
	    (ill->ill_flags & ILLF_IPV6) == ILLF_IPV6)) > 0) {
		ill->ill_ppa = UINT_MAX;
		ill->ill_name[0] = '\0';
		/*
		 * undo null termination done above.
		 */
		ppa_ptr[0] = old_char;
		rw_exit(&ipst->ips_ill_g_lock);
		ill_refrele(ill);
		return (error);
	}

	ASSERT(ill->ill_name_length <= LIFNAMSIZ);

	/*
	 * When we return the buffer pointed to by interf_name should contain
	 * the same name as in ill_name.
	 * If a ppa was choosen by the system (ppa passed in was UINT_MAX)
	 * the buffer pointed to by new_ppa_ptr would not contain the right ppa
	 * so copy full name and update the ppa ptr.
	 * When ppa passed in != UINT_MAX all values are correct just undo
	 * null termination, this saves a bcopy.
	 */
	if (*new_ppa_ptr == UINT_MAX) {
		bcopy(ill->ill_name, interf_name, ill->ill_name_length);
		*new_ppa_ptr = ill->ill_ppa;
	} else {
		/*
		 * undo null termination done above.
		 */
		ppa_ptr[0] = old_char;
	}

	/* Let SCTP know about this ILL */
	sctp_update_ill(ill, SCTP_ILL_INSERT);

	/*
	 * ill_glist_insert has made the ill visible globally, and
	 * ill_phyint_reinit could have changed the ipsq. At this point,
	 * we need to hold the ips_ill_g_lock across the call to enter the
	 * ipsq to enforce atomicity and prevent reordering. In the event
	 * the ipsq has changed, and if the new ipsq is currently busy,
	 * we need to make sure that this half-completed ioctl is ahead of
	 * any subsequent ioctl. We achieve this by not dropping the
	 * ips_ill_g_lock which prevents any ill lookup itself thereby
	 * ensuring that new ioctls can't start.
	 */
	ipsq = ipsq_try_enter_internal(ill, q, mp, ip_reprocess_ioctl, NEW_OP,
	    B_TRUE);

	rw_exit(&ipst->ips_ill_g_lock);
	ill_refrele(ill);
	if (ipsq == NULL)
		return (EINPROGRESS);

	/*
	 * If ill_phyint_reinit() changed our ipsq, then start on the new ipsq.
	 */
	if (ipsq->ipsq_xop->ipx_current_ipif == NULL)
		ipsq_current_start(ipsq, ipif, SIOCSLIFNAME);
	else
		ASSERT(ipsq->ipsq_xop->ipx_current_ipif == ipif);

	error = ipif_set_values_tail(ill, ipif, mp, q);
	ipsq_exit(ipsq);
	if (error != 0 && error != EINPROGRESS) {
		/*
		 * restore previous values
		 * NOTE(review): this unconditionally resets to IPv4 input
		 * handling regardless of the original family; confirm that
		 * is the intended rollback.
		 */
		ill->ill_isv6 = B_FALSE;
		ill_set_inputfn(ill);
	}
	return (error);
}
17200 17195
17201 17196 void
17202 17197 ipif_init(ip_stack_t *ipst)
17203 17198 {
17204 17199 int i;
17205 17200
17206 17201 for (i = 0; i < MAX_G_HEADS; i++) {
17207 17202 ipst->ips_ill_g_heads[i].ill_g_list_head =
17208 17203 (ill_if_t *)&ipst->ips_ill_g_heads[i];
17209 17204 ipst->ips_ill_g_heads[i].ill_g_list_tail =
17210 17205 (ill_if_t *)&ipst->ips_ill_g_heads[i];
17211 17206 }
17212 17207
17213 17208 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
17214 17209 ill_phyint_compare_index,
17215 17210 sizeof (phyint_t),
17216 17211 offsetof(struct phyint, phyint_avl_by_index));
17217 17212 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
17218 17213 ill_phyint_compare_name,
17219 17214 sizeof (phyint_t),
17220 17215 offsetof(struct phyint, phyint_avl_by_name));
17221 17216 }
17222 17217
/*
 * Save enough information so that we can recreate the IRE if
 * the interface goes down and then up.
 *
 * This is best-effort: if the mblk allocation fails the IRE is simply
 * not saved.
 */
void
ill_save_ire(ill_t *ill, ire_t *ire)
{
	mblk_t *save_mp;

	save_mp = allocb(sizeof (ifrt_t), BPRI_MED);
	if (save_mp != NULL) {
		ifrt_t *ifrt;

		save_mp->b_wptr += sizeof (ifrt_t);
		ifrt = (ifrt_t *)save_mp->b_rptr;
		bzero(ifrt, sizeof (ifrt_t));
		ifrt->ifrt_type = ire->ire_type;
		if (ire->ire_ipversion == IPV4_VERSION) {
			ASSERT(!ill->ill_isv6);
			ifrt->ifrt_addr = ire->ire_addr;
			ifrt->ifrt_gateway_addr = ire->ire_gateway_addr;
			ifrt->ifrt_setsrc_addr = ire->ire_setsrc_addr;
			ifrt->ifrt_mask = ire->ire_mask;
		} else {
			ASSERT(ill->ill_isv6);
			ifrt->ifrt_v6addr = ire->ire_addr_v6;
			/* ire_gateway_addr_v6 can change due to RTM_CHANGE */
			mutex_enter(&ire->ire_lock);
			ifrt->ifrt_v6gateway_addr = ire->ire_gateway_addr_v6;
			mutex_exit(&ire->ire_lock);
			ifrt->ifrt_v6setsrc_addr = ire->ire_setsrc_addr_v6;
			ifrt->ifrt_v6mask = ire->ire_mask_v6;
		}
		ifrt->ifrt_flags = ire->ire_flags;
		ifrt->ifrt_zoneid = ire->ire_zoneid;
		/* Prepend to the ill's saved-IRE list under its lock. */
		mutex_enter(&ill->ill_saved_ire_lock);
		save_mp->b_cont = ill->ill_saved_ire_mp;
		ill->ill_saved_ire_mp = save_mp;
		ill->ill_saved_ire_cnt++;
		mutex_exit(&ill->ill_saved_ire_lock);
	}
}
17265 17260
/*
 * Remove one entry from ill_saved_ire_mp.
 *
 * Walks the saved-IRE mblk chain looking for the entry matching ire
 * and unlinks/frees it; a no-op if no entry matches.
 */
void
ill_remove_saved_ire(ill_t *ill, ire_t *ire)
{
	mblk_t **mpp;
	mblk_t *mp;
	ifrt_t *ifrt;

	/* Remove from ill_saved_ire_mp list if it is there */
	mutex_enter(&ill->ill_saved_ire_lock);
	for (mpp = &ill->ill_saved_ire_mp; *mpp != NULL;
	    mpp = &(*mpp)->b_cont) {
		in6_addr_t gw_addr_v6;

		/*
		 * On a given ill, the tuple of address, gateway, mask,
		 * ire_type, and zoneid is unique for each saved IRE.
		 */
		mp = *mpp;
		ifrt = (ifrt_t *)mp->b_rptr;
		/* ire_gateway_addr_v6 can change - need lock */
		mutex_enter(&ire->ire_lock);
		gw_addr_v6 = ire->ire_gateway_addr_v6;
		mutex_exit(&ire->ire_lock);

		if (ifrt->ifrt_zoneid != ire->ire_zoneid ||
		    ifrt->ifrt_type != ire->ire_type)
			continue;

		/* Address/gateway/mask comparison per address family. */
		if (ill->ill_isv6 ?
		    (IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6addr,
		    &ire->ire_addr_v6) &&
		    IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6gateway_addr,
		    &gw_addr_v6) &&
		    IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6mask,
		    &ire->ire_mask_v6)) :
		    (ifrt->ifrt_addr == ire->ire_addr &&
		    ifrt->ifrt_gateway_addr == ire->ire_gateway_addr &&
		    ifrt->ifrt_mask == ire->ire_mask)) {
			/* Unlink the matching mblk and free it. */
			*mpp = mp->b_cont;
			ill->ill_saved_ire_cnt--;
			freeb(mp);
			break;
		}
	}
	mutex_exit(&ill->ill_saved_ire_lock);
}
17315 17310
/*
 * IP multirouting broadcast routes handling
 * Append CGTP broadcast IREs to regular ones created
 * at ifconfig time.
 * The usage is a route add <cgtp_bc> <nic_bc> -multirt i.e., both
 * the destination and the gateway are broadcast addresses.
 * The caller has verified that the destination is an IRE_BROADCAST and that
 * RTF_MULTIRT was set. Here if the gateway is a broadcast address, then
 * we create a MULTIRT IRE_BROADCAST.
 * Note that the IRE_HOST created by ire_rt_add doesn't get found by anything
 * since the IRE_BROADCAST takes precedence; ire_add_v4 does head insertion.
 */
static void
ip_cgtp_bcast_add(ire_t *ire, ip_stack_t *ipst)
{
	ire_t *ire_prim;

	ASSERT(ire != NULL);

	/* Look up the broadcast IRE for the gateway address. */
	ire_prim = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0,
	    IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst,
	    NULL);
	if (ire_prim != NULL) {
		/*
		 * We are in the special case of broadcasts for
		 * CGTP. We add an IRE_BROADCAST that holds
		 * the RTF_MULTIRT flag, the destination
		 * address and the low level
		 * info of ire_prim. In other words, CGTP
		 * broadcast is added to the redundant ipif.
		 */
		ill_t *ill_prim;
		ire_t *bcast_ire;

		ill_prim = ire_prim->ire_ill;

		ip2dbg(("ip_cgtp_filter_bcast_add: ire_prim %p, ill_prim %p\n",
		    (void *)ire_prim, (void *)ill_prim));

		/* ire_create may return NULL; handled below. */
		bcast_ire = ire_create(
		    (uchar_t *)&ire->ire_addr,
		    (uchar_t *)&ip_g_all_ones,
		    (uchar_t *)&ire->ire_gateway_addr,
		    IRE_BROADCAST,
		    ill_prim,
		    GLOBAL_ZONEID,	/* CGTP is only for the global zone */
		    ire->ire_flags | RTF_KERNEL,
		    NULL,
		    ipst);

		/*
		 * Here we assume that ire_add does head insertion so that
		 * the added IRE_BROADCAST comes before the existing IRE_HOST.
		 */
		if (bcast_ire != NULL) {
			/* Propagate any RTF_SETSRC source address. */
			if (ire->ire_flags & RTF_SETSRC) {
				bcast_ire->ire_setsrc_addr =
				    ire->ire_setsrc_addr;
			}
			bcast_ire = ire_add(bcast_ire);
			if (bcast_ire != NULL) {
				ip2dbg(("ip_cgtp_filter_bcast_add: "
				    "added bcast_ire %p\n",
				    (void *)bcast_ire));

				/* Save so it can be recreated on if up/down */
				ill_save_ire(ill_prim, bcast_ire);
				ire_refrele(bcast_ire);
			}
		}
		ire_refrele(ire_prim);
	}
}
17388 17383
/*
 * IP multirouting broadcast routes handling
 * Remove the broadcast ire.
 * The usage is a route delete <cgtp_bc> <nic_bc> -multirt i.e., both
 * the destination and the gateway are broadcast addresses.
 * The caller has only verified that RTF_MULTIRT was set. We check
 * that the destination is broadcast and that the gateway is a broadcast
 * address, and if so delete the IRE added by ip_cgtp_bcast_add().
 */
static void
ip_cgtp_bcast_delete(ire_t *ire, ip_stack_t *ipst)
{
	ASSERT(ire != NULL);

	if (ip_type_v4(ire->ire_addr, ipst) == IRE_BROADCAST) {
		ire_t *ire_prim;

		/* The gateway must itself resolve to a broadcast IRE. */
		ire_prim = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0,
		    IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0,
		    ipst, NULL);
		if (ire_prim != NULL) {
			ill_t *ill_prim;
			ire_t *bcast_ire;

			ill_prim = ire_prim->ire_ill;

			ip2dbg(("ip_cgtp_filter_bcast_delete: "
			    "ire_prim %p, ill_prim %p\n",
			    (void *)ire_prim, (void *)ill_prim));

			/*
			 * Find the CGTP IRE_BROADCAST that
			 * ip_cgtp_bcast_add() created: same destination and
			 * gateway, bound to the primary ill.
			 */
			bcast_ire = ire_ftable_lookup_v4(ire->ire_addr, 0,
			    ire->ire_gateway_addr, IRE_BROADCAST,
			    ill_prim, ALL_ZONES, NULL,
			    MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_ILL |
			    MATCH_IRE_MASK, 0, ipst, NULL);

			if (bcast_ire != NULL) {
				ip2dbg(("ip_cgtp_filter_bcast_delete: "
				    "looked up bcast_ire %p\n",
				    (void *)bcast_ire));
				/* Drop the saved reference, then the IRE. */
				ill_remove_saved_ire(bcast_ire->ire_ill,
				    bcast_ire);
				ire_delete(bcast_ire);
				ire_refrele(bcast_ire);
			}
			ire_refrele(ire_prim);
		}
	}
}
17438 17433
17439 17434 /*
17440 17435 * Derive an interface id from the link layer address.
17441 17436 * Knows about IEEE 802 and IEEE EUI-64 mappings.
17442 17437 */
17443 17438 static void
17444 17439 ip_ether_v6intfid(ill_t *ill, in6_addr_t *v6addr)
17445 17440 {
17446 17441 char *addr;
17447 17442
17448 17443 /*
17449 17444 * Note that some IPv6 interfaces get plumbed over links that claim to
17450 17445 * be DL_ETHER, but don't actually have Ethernet MAC addresses (e.g.
17451 17446 * PPP links). The ETHERADDRL check here ensures that we only set the
17452 17447 * interface ID on IPv6 interfaces above links that actually have real
17453 17448 * Ethernet addresses.
17454 17449 */
17455 17450 if (ill->ill_phys_addr_length == ETHERADDRL) {
17456 17451 /* Form EUI-64 like address */
17457 17452 addr = (char *)&v6addr->s6_addr32[2];
17458 17453 bcopy(ill->ill_phys_addr, addr, 3);
17459 17454 addr[0] ^= 0x2; /* Toggle Universal/Local bit */
17460 17455 addr[3] = (char)0xff;
17461 17456 addr[4] = (char)0xfe;
17462 17457 bcopy(ill->ill_phys_addr + 3, addr + 5, 3);
17463 17458 }
17464 17459 }
17465 17460
/*
 * Placeholder interface-id routine for links with no default derivation;
 * intentionally leaves `v6addr' unchanged.
 */
/* ARGSUSED */
static void
ip_nodef_v6intfid(ill_t *ill, in6_addr_t *v6addr)
{
}
17471 17466
/*
 * Cookie hashed by ip_ipmp_v6intfid() to build the IPMP interface id;
 * combines values that are stable across boots yet distinguish interfaces.
 */
typedef struct ipmp_ifcookie {
	uint32_t	ic_hostid;	/* host id, in network byte order */
	char		ic_ifname[LIFNAMSIZ];	/* IPMP interface name */
	char		ic_zonename[ZONENAME_MAX];	/* owning zone name */
} ipmp_ifcookie_t;
17477 17472
/*
 * Construct a pseudo-random interface ID for the IPMP interface that's both
 * predictable and (almost) guaranteed to be unique.
 */
static void
ip_ipmp_v6intfid(ill_t *ill, in6_addr_t *v6addr)
{
	zone_t *zp;
	uint8_t *addr;
	uchar_t hash[16];
	ulong_t hostid;
	MD5_CTX ctx;
	ipmp_ifcookie_t ic = { 0 };

	ASSERT(IS_IPMP(ill));

	/* Seed the cookie with the host id ... */
	(void) ddi_strtoul(hw_serial, NULL, 10, &hostid);
	ic.ic_hostid = htonl((uint32_t)hostid);

	/* ... the IPMP interface name ... */
	(void) strlcpy(ic.ic_ifname, ill->ill_name, LIFNAMSIZ);

	/* ... and the owning zone's name, when it can be looked up. */
	if ((zp = zone_find_by_id(ill->ill_zoneid)) != NULL) {
		(void) strlcpy(ic.ic_zonename, zp->zone_name, ZONENAME_MAX);
		zone_rele(zp);
	}

	MD5Init(&ctx);
	MD5Update(&ctx, &ic, sizeof (ic));
	MD5Final(hash, &ctx);

	/*
	 * Map the hash to an interface ID per the basic approach in RFC3041.
	 */
	addr = &v6addr->s6_addr8[8];
	bcopy(hash + 8, addr, sizeof (uint64_t));
	addr[0] &= ~0x2;	/* clear u/l bit: locally administered id */
}
17515 17510
17516 17511 /*
17517 17512 * Map the multicast in6_addr_t in m_ip6addr to the physaddr for ethernet.
17518 17513 */
17519 17514 static void
17520 17515 ip_ether_v6_mapping(ill_t *ill, uchar_t *m_ip6addr, uchar_t *m_physaddr)
17521 17516 {
17522 17517 phyint_t *phyi = ill->ill_phyint;
17523 17518
17524 17519 /*
17525 17520 * Check PHYI_MULTI_BCAST and length of physical
17526 17521 * address to determine if we use the mapping or the
17527 17522 * broadcast address.
17528 17523 */
17529 17524 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 ||
17530 17525 ill->ill_phys_addr_length != ETHERADDRL) {
17531 17526 ip_mbcast_mapping(ill, m_ip6addr, m_physaddr);
17532 17527 return;
17533 17528 }
17534 17529 m_physaddr[0] = 0x33;
17535 17530 m_physaddr[1] = 0x33;
17536 17531 m_physaddr[2] = m_ip6addr[12];
17537 17532 m_physaddr[3] = m_ip6addr[13];
17538 17533 m_physaddr[4] = m_ip6addr[14];
17539 17534 m_physaddr[5] = m_ip6addr[15];
17540 17535 }
17541 17536
17542 17537 /*
17543 17538 * Map the multicast ipaddr_t in m_ipaddr to the physaddr for ethernet.
17544 17539 */
17545 17540 static void
17546 17541 ip_ether_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr)
17547 17542 {
17548 17543 phyint_t *phyi = ill->ill_phyint;
17549 17544
17550 17545 /*
17551 17546 * Check PHYI_MULTI_BCAST and length of physical
17552 17547 * address to determine if we use the mapping or the
17553 17548 * broadcast address.
17554 17549 */
17555 17550 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 ||
17556 17551 ill->ill_phys_addr_length != ETHERADDRL) {
17557 17552 ip_mbcast_mapping(ill, m_ipaddr, m_physaddr);
17558 17553 return;
17559 17554 }
17560 17555 m_physaddr[0] = 0x01;
17561 17556 m_physaddr[1] = 0x00;
17562 17557 m_physaddr[2] = 0x5e;
17563 17558 m_physaddr[3] = m_ipaddr[1] & 0x7f;
17564 17559 m_physaddr[4] = m_ipaddr[2];
17565 17560 m_physaddr[5] = m_ipaddr[3];
17566 17561 }
17567 17562
17568 17563 /* ARGSUSED */
17569 17564 static void
17570 17565 ip_mbcast_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr)
17571 17566 {
17572 17567 /*
17573 17568 * for the MULTI_BCAST case and other cases when we want to
17574 17569 * use the link-layer broadcast address for multicast.
17575 17570 */
17576 17571 uint8_t *bphys_addr;
17577 17572 dl_unitdata_req_t *dlur;
17578 17573
17579 17574 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
17580 17575 if (ill->ill_sap_length < 0) {
17581 17576 bphys_addr = (uchar_t *)dlur +
17582 17577 dlur->dl_dest_addr_offset;
17583 17578 } else {
17584 17579 bphys_addr = (uchar_t *)dlur +
17585 17580 dlur->dl_dest_addr_offset + ill->ill_sap_length;
17586 17581 }
17587 17582
17588 17583 bcopy(bphys_addr, m_physaddr, ill->ill_phys_addr_length);
17589 17584 }
17590 17585
/*
 * Derive IPoIB interface id from the link layer address.
 */
static void
ip_ib_v6intfid(ill_t *ill, in6_addr_t *v6addr)
{
	char *addr;

	ASSERT(ill->ill_phys_addr_length == 20);
	/* The GUID occupies the final 8 bytes of the 20-byte address. */
	addr = (char *)&v6addr->s6_addr32[2];
	bcopy(ill->ill_phys_addr + 12, addr, 8);
	/*
	 * In IBA 1.1 timeframe, some vendors erroneously set the u/l bit
	 * in the globally assigned EUI-64 GUID to 1, in violation of IEEE
	 * rules. In these cases, the IBA considers these GUIDs to be in
	 * "Modified EUI-64" format, and thus toggling the u/l bit is not
	 * required; vendors are required not to assign global EUI-64's
	 * that differ only in u/l bit values, thus guaranteeing uniqueness
	 * of the interface identifier. Whether the GUID is in modified
	 * or proper EUI-64 format, the ipv6 identifier must have the u/l
	 * bit set to 1.
	 */
	addr[0] |= 2; /* Set Universal/Local bit to 1 */
}
17615 17610
/*
 * Map the multicast ipaddr_t in m_ipaddr to the physaddr for InfiniBand.
 * Note on mapping from multicast IP addresses to IPoIB multicast link
 * addresses. IPoIB multicast link addresses are based on IBA link addresses.
 * The format of an IPoIB multicast address is:
 *
 * 4 byte QPN Scope Sign. Pkey
 * +--------------------------------------------+
 * | 00FFFFFF | FF | 1X | X01B | Pkey | GroupID |
 * +--------------------------------------------+
 *
 * The Scope and Pkey components are properties of the IBA port and
 * network interface. They can be ascertained from the broadcast address.
 * The Sign. part is the signature, and is 401B for IPv4 and 601B for IPv6.
 */
static void
ip_ib_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr)
{
	/* Template carrying the IPv4 signature (0x401b); rest zeroed. */
	static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
	    0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
	uint8_t *bphys_addr;
	dl_unitdata_req_t *dlur;

	bcopy(ipv4_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length);

	/*
	 * RFC 4391: IPv4 MGID is 28-bit long.
	 */
	m_physaddr[16] = m_ipaddr[0] & 0x0f;	/* low nibble only */
	m_physaddr[17] = m_ipaddr[1];
	m_physaddr[18] = m_ipaddr[2];
	m_physaddr[19] = m_ipaddr[3];


	/* Locate the broadcast address saved in ill_bcast_mp. */
	dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
	if (ill->ill_sap_length < 0) {
		bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset;
	} else {
		bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset +
		    ill->ill_sap_length;
	}
	/*
	 * Now fill in the IBA scope/Pkey values from the broadcast address.
	 */
	m_physaddr[5] = bphys_addr[5];
	m_physaddr[8] = bphys_addr[8];
	m_physaddr[9] = bphys_addr[9];
}
17665 17660
17666 17661 static void
17667 17662 ip_ib_v6_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr)
17668 17663 {
17669 17664 static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
17670 17665 0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00,
17671 17666 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
17672 17667 uint8_t *bphys_addr;
17673 17668 dl_unitdata_req_t *dlur;
17674 17669
17675 17670 bcopy(ipv4_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length);
17676 17671
17677 17672 /*
17678 17673 * RFC 4391: IPv4 MGID is 80-bit long.
17679 17674 */
17680 17675 bcopy(&m_ipaddr[6], &m_physaddr[10], 10);
17681 17676
17682 17677 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
17683 17678 if (ill->ill_sap_length < 0) {
17684 17679 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset;
17685 17680 } else {
17686 17681 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset +
17687 17682 ill->ill_sap_length;
17688 17683 }
17689 17684 /*
17690 17685 * Now fill in the IBA scope/Pkey values from the broadcast address.
17691 17686 */
17692 17687 m_physaddr[5] = bphys_addr[5];
17693 17688 m_physaddr[8] = bphys_addr[8];
17694 17689 m_physaddr[9] = bphys_addr[9];
17695 17690 }
17696 17691
/*
 * Derive IPv6 interface id from an IPv4 link-layer address (e.g. from an IPv4
 * tunnel). The IPv4 address simply get placed in the lower 4 bytes of the
 * IPv6 interface id. This is a suggested mechanism described in section 3.7
 * of RFC4213.
 */
static void
ip_ipv4_genv6intfid(ill_t *ill, uint8_t *physaddr, in6_addr_t *v6addr)
{
	ASSERT(ill->ill_phys_addr_length == sizeof (ipaddr_t));
	/* Interface id is 0:0 followed by the IPv4 address. */
	v6addr->s6_addr32[2] = 0;
	bcopy(physaddr, &v6addr->s6_addr32[3], sizeof (ipaddr_t));
}
17710 17705
/*
 * Derive IPv6 interface id from an IPv6 link-layer address (e.g. from an IPv6
 * tunnel). The lower 8 bytes of the IPv6 address simply become the interface
 * id.
 */
static void
ip_ipv6_genv6intfid(ill_t *ill, uint8_t *physaddr, in6_addr_t *v6addr)
{
	in6_addr_t *v6lladdr = (in6_addr_t *)physaddr;

	ASSERT(ill->ill_phys_addr_length == sizeof (in6_addr_t));
	bcopy(&v6lladdr->s6_addr32[2], &v6addr->s6_addr32[2], 8);
}
17724 17719
/* Derive the interface id from the ill's own IPv6 link-layer address. */
static void
ip_ipv6_v6intfid(ill_t *ill, in6_addr_t *v6addr)
{
	ip_ipv6_genv6intfid(ill, ill->ill_phys_addr, v6addr);
}
17730 17725
/* Derive the interface id from the ill's IPv6 destination (tunnel) address. */
static void
ip_ipv6_v6destintfid(ill_t *ill, in6_addr_t *v6addr)
{
	ip_ipv6_genv6intfid(ill, ill->ill_dest_addr, v6addr);
}
17736 17731
/* Derive the interface id from the ill's own IPv4 link-layer address. */
static void
ip_ipv4_v6intfid(ill_t *ill, in6_addr_t *v6addr)
{
	ip_ipv4_genv6intfid(ill, ill->ill_phys_addr, v6addr);
}
17742 17737
/* Derive the interface id from the ill's IPv4 destination (tunnel) address. */
static void
ip_ipv4_v6destintfid(ill_t *ill, in6_addr_t *v6addr)
{
	ip_ipv4_genv6intfid(ill, ill->ill_dest_addr, v6addr);
}
17748 17743
/*
 * Lookup an ill and verify that the zoneid has an ipif on that ill.
 * Returns an held ill, or NULL.
 */
ill_t *
ill_lookup_on_ifindex_zoneid(uint_t index, zoneid_t zoneid, boolean_t isv6,
    ip_stack_t *ipst)
{
	ill_t *ill;
	ipif_t *ipif;

	ill = ill_lookup_on_ifindex(index, isv6, ipst);
	if (ill == NULL)
		return (NULL);

	/*
	 * Scan the ill's ipifs, under ill_lock, for one visible to `zoneid'
	 * (or to all zones); condemned ipifs are being torn down and are
	 * skipped.
	 */
	mutex_enter(&ill->ill_lock);
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if (IPIF_IS_CONDEMNED(ipif))
			continue;
		if (zoneid != ALL_ZONES && ipif->ipif_zoneid != zoneid &&
		    ipif->ipif_zoneid != ALL_ZONES)
			continue;

		/* Matched: return with the lookup's hold still in place. */
		mutex_exit(&ill->ill_lock);
		return (ill);
	}
	mutex_exit(&ill->ill_lock);
	ill_refrele(ill);
	return (NULL);
}
17779 17774
/*
 * Return a pointer to an ipif_t given a combination of (ill_idx,ipif_id)
 * If a pointer to an ipif_t is returned then the caller will need to do
 * an ill_refrele().
 */
ipif_t *
ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6,
    ip_stack_t *ipst)
{
	ipif_t *ipif;
	ill_t *ill;

	ill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
	if (ill == NULL)
		return (NULL);

	/* An ill that is being torn down yields no ipifs. */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_state_flags & ILL_CONDEMNED) {
		mutex_exit(&ill->ill_lock);
		ill_refrele(ill);
		return (NULL);
	}

	/*
	 * Find the ipif with the requested logical-interface id and take
	 * a hold on it before dropping ill_lock.
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if (!IPIF_CAN_LOOKUP(ipif))
			continue;
		if (lifidx == ipif->ipif_id) {
			ipif_refhold_locked(ipif);
			break;
		}
	}

	mutex_exit(&ill->ill_lock);
	ill_refrele(ill);
	return (ipif);
}
17816 17811
/*
 * Set ill_inputfn based on the current know state.
 * This needs to be called when any of the factors taken into
 * account changes.
 */
void
ill_set_inputfn(ill_t *ill)
{
	ip_stack_t *ipst = ill->ill_ipst;

	if (ill->ill_isv6) {
		/* Labeled (Trusted Extensions) systems use the full path. */
		if (is_system_labeled())
			ill->ill_inputfn = ill_input_full_v6;
		else
			ill->ill_inputfn = ill_input_short_v6;
	} else {
		/*
		 * The "full" v4 input path is needed whenever a feature
		 * that must see every inbound packet is active: labels, a
		 * pending DHCP init, RSVP listeners, or CGTP filtering.
		 */
		if (is_system_labeled())
			ill->ill_inputfn = ill_input_full_v4;
		else if (ill->ill_dhcpinit != 0)
			ill->ill_inputfn = ill_input_full_v4;
		else if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head
		    != NULL)
			ill->ill_inputfn = ill_input_full_v4;
		else if (ipst->ips_ip_cgtp_filter &&
		    ipst->ips_ip_cgtp_filter_ops != NULL)
			ill->ill_inputfn = ill_input_full_v4;
		else
			ill->ill_inputfn = ill_input_short_v4;
	}
}
17847 17842
17848 17843 /*
17849 17844 * Re-evaluate ill_inputfn for all the IPv4 ills.
17850 17845 * Used when RSVP and CGTP comes and goes.
17851 17846 */
17852 17847 void
17853 17848 ill_set_inputfn_all(ip_stack_t *ipst)
17854 17849 {
17855 17850 ill_walk_context_t ctx;
17856 17851 ill_t *ill;
17857 17852
17858 17853 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
17859 17854 ill = ILL_START_WALK_V4(&ctx, ipst);
17860 17855 for (; ill != NULL; ill = ill_next(&ctx, ill))
17861 17856 ill_set_inputfn(ill);
17862 17857
17863 17858 rw_exit(&ipst->ips_ill_g_lock);
17864 17859 }
17865 17860
/*
 * Set the physical address information for `ill' to the contents of the
 * dl_notify_ind_t pointed to by `mp'. Must be called as writer, and will be
 * asynchronous if `ill' cannot immediately be quiesced -- in which case
 * EINPROGRESS will be returned.
 *
 * Returns 0 when the address was set (or the notification is unsupported
 * and ignored), ENOMEM if the message copies cannot be allocated, and
 * EINPROGRESS as described above.
 */
int
ill_set_phys_addr(ill_t *ill, mblk_t *mp)
{
	ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
	dl_notify_ind_t *dlindp = (dl_notify_ind_t *)mp->b_rptr;

	ASSERT(IAM_WRITER_IPSQ(ipsq));

	if (dlindp->dl_data != DL_IPV6_LINK_LAYER_ADDR &&
	    dlindp->dl_data != DL_CURR_DEST_ADDR &&
	    dlindp->dl_data != DL_CURR_PHYS_ADDR) {
		/* Changing DL_IPV6_TOKEN is not yet supported */
		return (0);
	}

	/*
	 * We need to store up to two copies of `mp' in `ill'. Due to the
	 * design of ipsq_pending_mp_add(), we can't pass them as separate
	 * arguments to ill_set_phys_addr_tail(). Instead, chain them
	 * together here, then pull 'em apart in ill_set_phys_addr_tail().
	 */
	if ((mp = copyb(mp)) == NULL || (mp->b_cont = copyb(mp)) == NULL) {
		freemsg(mp);	/* freemsg(NULL) is a no-op */
		return (ENOMEM);
	}

	ipsq_current_start(ipsq, ill->ill_ipif, 0);

	/*
	 * Since we'll only do a logical down, we can't rely on ipif_down
	 * to turn on ILL_DOWN_IN_PROGRESS, or for the DL_BIND_ACK to reset
	 * ILL_DOWN_IN_PROGRESS. We instead manage this separately for this
	 * case, to quiesce ire's and nce's for ill_is_quiescent.
	 */
	mutex_enter(&ill->ill_lock);
	ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS;
	/* no more ire/nce addition allowed */
	mutex_exit(&ill->ill_lock);

	/*
	 * If we can quiesce the ill, then set the address. If not, then
	 * ill_set_phys_addr_tail() will be called from ipif_ill_refrele_tail().
	 */
	ill_down_ipifs(ill, B_TRUE);
	mutex_enter(&ill->ill_lock);
	if (!ill_is_quiescent(ill)) {
		/* call cannot fail since `conn_t *' argument is NULL */
		(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
		    mp, ILL_DOWN);
		mutex_exit(&ill->ill_lock);
		return (EINPROGRESS);
	}
	mutex_exit(&ill->ill_lock);

	ill_set_phys_addr_tail(ipsq, ill->ill_rq, mp, NULL);
	return (0);
}
17929 17924
/*
 * When the allowed-ips link property is set on the datalink, IP receives a
 * DL_NOTE_ALLOWED_IPS notification that is processed in ill_set_allowed_ips()
 * to initialize the ill_allowed_ips[] array in the ill_t. This array is then
 * used to vet addresses passed to ip_sioctl_addr() and to ensure that the
 * only IP addresses configured on the ill_t are those in the ill_allowed_ips[]
 * array.
 */
void
ill_set_allowed_ips(ill_t *ill, mblk_t *mp)
{
	ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
	dl_notify_ind_t *dlip = (dl_notify_ind_t *)mp->b_rptr;
	mac_protect_t *mrp;
	int i;

	ASSERT(IAM_WRITER_IPSQ(ipsq));
	/* The mac_protect_t payload immediately follows the notification. */
	mrp = (mac_protect_t *)&dlip[1];

	if (mrp->mp_ipaddrcnt == 0) { /* reset allowed-ips */
		/*
		 * NOTE(review): if no list was previously configured this
		 * is kmem_free(NULL, 0) -- assumed legal here; confirm.
		 */
		kmem_free(ill->ill_allowed_ips,
		    ill->ill_allowed_ips_cnt * sizeof (in6_addr_t));
		ill->ill_allowed_ips_cnt = 0;
		ill->ill_allowed_ips = NULL;
		mutex_enter(&ill->ill_phyint->phyint_lock);
		ill->ill_phyint->phyint_flags &= ~PHYI_L3PROTECT;
		mutex_exit(&ill->ill_phyint->phyint_lock);
		return;
	}

	/* Replace any existing list with the newly supplied addresses. */
	if (ill->ill_allowed_ips != NULL) {
		kmem_free(ill->ill_allowed_ips,
		    ill->ill_allowed_ips_cnt * sizeof (in6_addr_t));
	}
	ill->ill_allowed_ips_cnt = mrp->mp_ipaddrcnt;
	ill->ill_allowed_ips = kmem_alloc(
	    ill->ill_allowed_ips_cnt * sizeof (in6_addr_t), KM_SLEEP);
	for (i = 0; i < mrp->mp_ipaddrcnt; i++)
		ill->ill_allowed_ips[i] = mrp->mp_ipaddrs[i].ip_addr;

	mutex_enter(&ill->ill_phyint->phyint_lock);
	ill->ill_phyint->phyint_flags |= PHYI_L3PROTECT;
	mutex_exit(&ill->ill_phyint->phyint_lock);
}
17974 17969
/*
 * Once the ill associated with `q' has quiesced, set its physical address
 * information to the values in `addrmp'. Note that two copies of `addrmp'
 * are passed (linked by b_cont), since we sometimes need to save two distinct
 * copies in the ill_t, and our context doesn't permit sleeping or allocation
 * failure (we'll free the other copy if it's not needed). Since the ill_t
 * is quiesced, we know any stale nce's with the old address information have
 * already been removed, so we don't need to call nce_flush().
 */
/* ARGSUSED */
static void
ill_set_phys_addr_tail(ipsq_t *ipsq, queue_t *q, mblk_t *addrmp, void *dummy)
{
	ill_t *ill = q->q_ptr;
	mblk_t *addrmp2 = unlinkb(addrmp);	/* split the chained copies */
	dl_notify_ind_t *dlindp = (dl_notify_ind_t *)addrmp->b_rptr;
	uint_t addrlen, addroff;
	int status;

	ASSERT(IAM_WRITER_IPSQ(ipsq));

	/* The address excludes the SAP, which may lead or trail it. */
	addroff = dlindp->dl_addr_offset;
	addrlen = dlindp->dl_addr_length - ABS(ill->ill_sap_length);

	switch (dlindp->dl_data) {
	case DL_IPV6_LINK_LAYER_ADDR:
		/* Only the ND link-layer address changes. */
		ill_set_ndmp(ill, addrmp, addroff, addrlen);
		freemsg(addrmp2);
		break;

	case DL_CURR_DEST_ADDR:
		/* Point-to-point destination address changed. */
		freemsg(ill->ill_dest_addr_mp);
		ill->ill_dest_addr = addrmp->b_rptr + addroff;
		ill->ill_dest_addr_mp = addrmp;
		if (ill->ill_isv6) {
			ill_setdesttoken(ill);
			ipif_setdestlinklocal(ill->ill_ipif);
		}
		freemsg(addrmp2);
		break;

	case DL_CURR_PHYS_ADDR:
		/*
		 * Physical address changed; for v6 the second copy becomes
		 * the new ND mblk, otherwise it is unneeded.
		 */
		freemsg(ill->ill_phys_addr_mp);
		ill->ill_phys_addr = addrmp->b_rptr + addroff;
		ill->ill_phys_addr_mp = addrmp;
		ill->ill_phys_addr_length = addrlen;
		if (ill->ill_isv6)
			ill_set_ndmp(ill, addrmp2, addroff, addrlen);
		else
			freemsg(addrmp2);
		if (ill->ill_isv6) {
			ill_setdefaulttoken(ill);
			ipif_setlinklocal(ill->ill_ipif);
		}
		break;
	default:
		/* ill_set_phys_addr() only queues the three cases above. */
		ASSERT(0);
	}

	/*
	 * reset ILL_DOWN_IN_PROGRESS so that we can successfully add ires
	 * as we bring the ipifs up again.
	 */
	mutex_enter(&ill->ill_lock);
	ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS;
	mutex_exit(&ill->ill_lock);
	/*
	 * If there are ipifs to bring up, ill_up_ipifs() will return
	 * EINPROGRESS, and ipsq_current_finish() will be called by
	 * ip_rput_dlpi_writer() or arp_bringup_done() when the last ipif is
	 * brought up.
	 */
	status = ill_up_ipifs(ill, q, addrmp);
	if (status != EINPROGRESS)
		ipsq_current_finish(ipsq);
}
18051 18046
/*
 * Helper routine for setting the ill_nd_lla fields.
 * Frees any previous ND mblk and points ill_nd_lla at the link-layer
 * address stored `addroff' bytes into `ndmp' (which is `addrlen' long);
 * ownership of `ndmp' passes to the ill.
 */
void
ill_set_ndmp(ill_t *ill, mblk_t *ndmp, uint_t addroff, uint_t addrlen)
{
	freemsg(ill->ill_nd_lla_mp);
	ill->ill_nd_lla = ndmp->b_rptr + addroff;
	ill->ill_nd_lla_mp = ndmp;
	ill->ill_nd_lla_len = addrlen;
}
18063 18058
/*
 * Replumb the ill.
 * Must be called as writer; returns EINPROGRESS if the ill cannot be
 * quiesced immediately (ill_replumb_tail() then runs later from
 * ipif_ill_refrele_tail()), or 0 once the replumb has been started.
 */
int
ill_replumb(ill_t *ill, mblk_t *mp)
{
	ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;

	ASSERT(IAM_WRITER_IPSQ(ipsq));

	ipsq_current_start(ipsq, ill->ill_ipif, 0);

	/*
	 * If we can quiesce the ill, then continue. If not, then
	 * ill_replumb_tail() will be called from ipif_ill_refrele_tail().
	 */
	ill_down_ipifs(ill, B_FALSE);

	mutex_enter(&ill->ill_lock);
	if (!ill_is_quiescent(ill)) {
		/* call cannot fail since `conn_t *' argument is NULL */
		(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
		    mp, ILL_DOWN);
		mutex_exit(&ill->ill_lock);
		return (EINPROGRESS);
	}
	mutex_exit(&ill->ill_lock);

	ill_replumb_tail(ipsq, ill->ill_rq, mp, NULL);
	return (0);
}
18095 18090
/*
 * Finish the replumb once the ill has quiesced: bring the ipifs fully
 * down, notify the driver with DL_NOTE_REPLUMB_DONE, and (possibly
 * asynchronously) bring the ipifs back up.
 */
/* ARGSUSED */
static void
ill_replumb_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy)
{
	ill_t *ill = q->q_ptr;
	int err;
	conn_t *connp = NULL;

	ASSERT(IAM_WRITER_IPSQ(ipsq));
	/* Keep a private copy of `mp' for the later bringup. */
	freemsg(ill->ill_replumb_mp);
	ill->ill_replumb_mp = copyb(mp);

	if (ill->ill_replumb_mp == NULL) {
		/* out of memory */
		ipsq_current_finish(ipsq);
		return;
	}

	/* Park the copy as the ipsq's pending mp for the bringup phase. */
	mutex_enter(&ill->ill_lock);
	ill->ill_up_ipifs = ipsq_pending_mp_add(NULL, ill->ill_ipif,
	    ill->ill_rq, ill->ill_replumb_mp, 0);
	mutex_exit(&ill->ill_lock);

	if (!ill->ill_up_ipifs) {
		/* already closing */
		ipsq_current_finish(ipsq);
		return;
	}
	ill->ill_replumbing = 1;
	err = ill_down_ipifs_tail(ill);

	/*
	 * Successfully quiesced and brought down the interface, now we send
	 * the DL_NOTE_REPLUMB_DONE message down to the driver. Reuse the
	 * DL_NOTE_REPLUMB message.
	 */
	mp = mexchange(NULL, mp, sizeof (dl_notify_conf_t), M_PROTO,
	    DL_NOTIFY_CONF);
	ASSERT(mp != NULL);
	((dl_notify_conf_t *)mp->b_rptr)->dl_notification =
	    DL_NOTE_REPLUMB_DONE;
	ill_dlpi_send(ill, mp);

	/*
	 * For IPv4, we would usually get EINPROGRESS because the ETHERTYPE_ARP
	 * streams have to be unbound. When all the DLPI exchanges are done,
	 * ipsq_current_finish() will be called by arp_bringup_done(). The
	 * remainder of ipif bringup via ill_up_ipifs() will also be done in
	 * arp_bringup_done().
	 */
	ASSERT(ill->ill_replumb_mp != NULL);
	if (err == EINPROGRESS)
		return;
	else
		ill->ill_replumb_mp = ipsq_pending_mp_get(ipsq, &connp);
	ASSERT(connp == NULL);
	if (err == 0 && ill->ill_replumb_mp != NULL &&
	    ill_up_ipifs(ill, q, ill->ill_replumb_mp) == EINPROGRESS) {
		return;
	}
	ipsq_current_finish(ipsq);
}
18158 18153
18159 18154 /*
18160 18155 * Issue ioctl `cmd' on `lh'; caller provides the initial payload in `buf'
18161 18156 * which is `bufsize' bytes. On success, zero is returned and `buf' updated
18162 18157 * as per the ioctl. On failure, an errno is returned.
18163 18158 */
18164 18159 static int
18165 18160 ip_ioctl(ldi_handle_t lh, int cmd, void *buf, uint_t bufsize, cred_t *cr)
18166 18161 {
18167 18162 int rval;
18168 18163 struct strioctl iocb;
18169 18164
18170 18165 iocb.ic_cmd = cmd;
18171 18166 iocb.ic_timout = 15;
18172 18167 iocb.ic_len = bufsize;
18173 18168 iocb.ic_dp = buf;
18174 18169
18175 18170 return (ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, cr, &rval));
18176 18171 }
18177 18172
/*
 * Issue an SIOCGLIFCONF for address family `af' and store the result into a
 * dynamically-allocated `lifcp' that will be `bufsizep' bytes on success.
 * On success the caller owns lifcp->lifc_buf (of *bufsizep bytes) and must
 * kmem_free() it; on failure an errno is returned and nothing is held.
 */
static int
ip_lifconf_ioctl(ldi_handle_t lh, int af, struct lifconf *lifcp,
    uint_t *bufsizep, cred_t *cr)
{
	int err;
	struct lifnum lifn;

	/* First ask how many interfaces exist for this family. */
	bzero(&lifn, sizeof (lifn));
	lifn.lifn_family = af;
	lifn.lifn_flags = LIFC_UNDER_IPMP;

	if ((err = ip_ioctl(lh, SIOCGLIFNUM, &lifn, sizeof (lifn), cr)) != 0)
		return (err);

	/*
	 * Pad the interface count to account for additional interfaces that
	 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
	 */
	lifn.lifn_count += 4;
	bzero(lifcp, sizeof (*lifcp));
	lifcp->lifc_flags = LIFC_UNDER_IPMP;
	lifcp->lifc_family = af;
	lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
	lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);

	err = ip_ioctl(lh, SIOCGLIFCONF, lifcp, sizeof (*lifcp), cr);
	if (err != 0) {
		kmem_free(lifcp->lifc_buf, *bufsizep);
		return (err);
	}

	return (0);
}
18215 18210
18216 18211 /*
18217 18212 * Helper for ip_interface_cleanup() that removes the loopback interface.
18218 18213 */
18219 18214 static void
18220 18215 ip_loopback_removeif(ldi_handle_t lh, boolean_t isv6, cred_t *cr)
18221 18216 {
18222 18217 int err;
18223 18218 struct lifreq lifr;
18224 18219
18225 18220 bzero(&lifr, sizeof (lifr));
18226 18221 (void) strcpy(lifr.lifr_name, ipif_loopback_name);
18227 18222
18228 18223 /*
18229 18224 * Attempt to remove the interface. It may legitimately not exist
18230 18225 * (e.g. the zone administrator unplumbed it), so ignore ENXIO.
18231 18226 */
18232 18227 err = ip_ioctl(lh, SIOCLIFREMOVEIF, &lifr, sizeof (lifr), cr);
18233 18228 if (err != 0 && err != ENXIO) {
18234 18229 ip0dbg(("ip_loopback_removeif: IP%s SIOCLIFREMOVEIF failed: "
18235 18230 "error %d\n", isv6 ? "v6" : "v4", err));
18236 18231 }
18237 18232 }
18238 18233
18239 18234 /*
18240 18235 * Helper for ip_interface_cleanup() that ensures no IP interfaces are in IPMP
18241 18236 * groups and that IPMP data addresses are down. These conditions must be met
18242 18237 * so that IPMP interfaces can be I_PUNLINK'd, as per ip_sioctl_plink_ipmp().
18243 18238 */
18244 18239 static void
18245 18240 ip_ipmp_cleanup(ldi_handle_t lh, boolean_t isv6, cred_t *cr)
18246 18241 {
18247 18242 int af = isv6 ? AF_INET6 : AF_INET;
18248 18243 int i, nifs;
18249 18244 int err;
18250 18245 uint_t bufsize;
18251 18246 uint_t lifrsize = sizeof (struct lifreq);
18252 18247 struct lifconf lifc;
18253 18248 struct lifreq *lifrp;
18254 18249
18255 18250 if ((err = ip_lifconf_ioctl(lh, af, &lifc, &bufsize, cr)) != 0) {
18256 18251 cmn_err(CE_WARN, "ip_ipmp_cleanup: cannot get interface list "
18257 18252 "(error %d); any IPMP interfaces cannot be shutdown", err);
18258 18253 return;
18259 18254 }
18260 18255
18261 18256 nifs = lifc.lifc_len / lifrsize;
18262 18257 for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
18263 18258 err = ip_ioctl(lh, SIOCGLIFFLAGS, lifrp, lifrsize, cr);
18264 18259 if (err != 0) {
18265 18260 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot get "
18266 18261 "flags: error %d", lifrp->lifr_name, err);
18267 18262 continue;
18268 18263 }
18269 18264
18270 18265 if (lifrp->lifr_flags & IFF_IPMP) {
18271 18266 if ((lifrp->lifr_flags & (IFF_UP|IFF_DUPLICATE)) == 0)
18272 18267 continue;
18273 18268
18274 18269 lifrp->lifr_flags &= ~IFF_UP;
18275 18270 err = ip_ioctl(lh, SIOCSLIFFLAGS, lifrp, lifrsize, cr);
18276 18271 if (err != 0) {
18277 18272 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot "
18278 18273 "bring down (error %d); IPMP interface may "
18279 18274 "not be shutdown", lifrp->lifr_name, err);
18280 18275 }
18281 18276
18282 18277 /*
18283 18278 * Check if IFF_DUPLICATE is still set -- and if so,
18284 18279 * reset the address to clear it.
18285 18280 */
18286 18281 err = ip_ioctl(lh, SIOCGLIFFLAGS, lifrp, lifrsize, cr);
18287 18282 if (err != 0 || !(lifrp->lifr_flags & IFF_DUPLICATE))
18288 18283 continue;
18289 18284
18290 18285 err = ip_ioctl(lh, SIOCGLIFADDR, lifrp, lifrsize, cr);
18291 18286 if (err != 0 || (err = ip_ioctl(lh, SIOCGLIFADDR,
18292 18287 lifrp, lifrsize, cr)) != 0) {
18293 18288 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot "
18294 18289 "reset DAD (error %d); IPMP interface may "
18295 18290 "not be shutdown", lifrp->lifr_name, err);
18296 18291 }
18297 18292 continue;
18298 18293 }
18299 18294
18300 18295 if (strchr(lifrp->lifr_name, IPIF_SEPARATOR_CHAR) == 0) {
18301 18296 lifrp->lifr_groupname[0] = '\0';
18302 18297 if ((err = ip_ioctl(lh, SIOCSLIFGROUPNAME, lifrp,
18303 18298 lifrsize, cr)) != 0) {
18304 18299 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot "
18305 18300 "leave IPMP group (error %d); associated "
18306 18301 "IPMP interface may not be shutdown",
18307 18302 lifrp->lifr_name, err);
18308 18303 continue;
18309 18304 }
18310 18305 }
18311 18306 }
18312 18307
18313 18308 kmem_free(lifc.lifc_buf, bufsize);
18314 18309 }
18315 18310
18316 18311 #define UDPDEV "/devices/pseudo/udp@0:udp"
18317 18312 #define UDP6DEV "/devices/pseudo/udp6@0:udp6"
18318 18313
18319 18314 /*
18320 18315 * Remove the loopback interfaces and prep the IPMP interfaces to be torn down.
18321 18316 * Non-loopback interfaces are either I_LINK'd or I_PLINK'd; the former go away
18322 18317 * when the user-level processes in the zone are killed and the latter are
18323 18318 * cleaned up by str_stack_shutdown().
18324 18319 */
18325 18320 void
18326 18321 ip_interface_cleanup(ip_stack_t *ipst)
18327 18322 {
18328 18323 ldi_handle_t lh;
18329 18324 ldi_ident_t li;
18330 18325 cred_t *cr;
18331 18326 int err;
18332 18327 int i;
18333 18328 char *devs[] = { UDP6DEV, UDPDEV };
18334 18329 netstackid_t stackid = ipst->ips_netstack->netstack_stackid;
18335 18330
18336 18331 if ((err = ldi_ident_from_major(ddi_name_to_major("ip"), &li)) != 0) {
18337 18332 cmn_err(CE_WARN, "ip_interface_cleanup: cannot get ldi ident:"
18338 18333 " error %d", err);
18339 18334 return;
18340 18335 }
18341 18336
18342 18337 cr = zone_get_kcred(netstackid_to_zoneid(stackid));
18343 18338 ASSERT(cr != NULL);
18344 18339
18345 18340 /*
18346 18341 * NOTE: loop executes exactly twice and is hardcoded to know that the
18347 18342 * first iteration is IPv6. (Unrolling yields repetitious code, hence
18348 18343 * the loop.)
18349 18344 */
18350 18345 for (i = 0; i < 2; i++) {
18351 18346 err = ldi_open_by_name(devs[i], FREAD|FWRITE, cr, &lh, li);
18352 18347 if (err != 0) {
18353 18348 cmn_err(CE_WARN, "ip_interface_cleanup: cannot open %s:"
18354 18349 " error %d", devs[i], err);
18355 18350 continue;
18356 18351 }
18357 18352
18358 18353 ip_loopback_removeif(lh, i == 0, cr);
18359 18354 ip_ipmp_cleanup(lh, i == 0, cr);
18360 18355
18361 18356 (void) ldi_close(lh, FREAD|FWRITE, cr);
18362 18357 }
18363 18358
18364 18359 ldi_ident_release(li);
18365 18360 crfree(cr);
18366 18361 }
18367 18362
18368 18363 /*
18369 18364 * This needs to be in-sync with nic_event_t definition
18370 18365 */
18371 18366 static const char *
18372 18367 ill_hook_event2str(nic_event_t event)
18373 18368 {
18374 18369 switch (event) {
18375 18370 case NE_PLUMB:
18376 18371 return ("PLUMB");
18377 18372 case NE_UNPLUMB:
18378 18373 return ("UNPLUMB");
18379 18374 case NE_UP:
18380 18375 return ("UP");
18381 18376 case NE_DOWN:
18382 18377 return ("DOWN");
18383 18378 case NE_ADDRESS_CHANGE:
18384 18379 return ("ADDRESS_CHANGE");
18385 18380 case NE_LIF_UP:
18386 18381 return ("LIF_UP");
18387 18382 case NE_LIF_DOWN:
18388 18383 return ("LIF_DOWN");
18389 18384 case NE_IFINDEX_CHANGE:
18390 18385 return ("IFINDEX_CHANGE");
18391 18386 default:
18392 18387 return ("UNKNOWN");
18393 18388 }
18394 18389 }
18395 18390
18396 18391 void
18397 18392 ill_nic_event_dispatch(ill_t *ill, lif_if_t lif, nic_event_t event,
18398 18393 nic_event_data_t data, size_t datalen)
18399 18394 {
18400 18395 ip_stack_t *ipst = ill->ill_ipst;
18401 18396 hook_nic_event_int_t *info;
18402 18397 const char *str = NULL;
18403 18398
18404 18399 /* create a new nic event info */
18405 18400 if ((info = kmem_alloc(sizeof (*info), KM_NOSLEEP)) == NULL)
18406 18401 goto fail;
18407 18402
18408 18403 info->hnei_event.hne_nic = ill->ill_phyint->phyint_ifindex;
18409 18404 info->hnei_event.hne_lif = lif;
18410 18405 info->hnei_event.hne_event = event;
18411 18406 info->hnei_event.hne_protocol = ill->ill_isv6 ?
18412 18407 ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data;
18413 18408 info->hnei_event.hne_data = NULL;
18414 18409 info->hnei_event.hne_datalen = 0;
18415 18410 info->hnei_stackid = ipst->ips_netstack->netstack_stackid;
18416 18411
18417 18412 if (data != NULL && datalen != 0) {
18418 18413 info->hnei_event.hne_data = kmem_alloc(datalen, KM_NOSLEEP);
18419 18414 if (info->hnei_event.hne_data == NULL)
18420 18415 goto fail;
18421 18416 bcopy(data, info->hnei_event.hne_data, datalen);
18422 18417 info->hnei_event.hne_datalen = datalen;
18423 18418 }
18424 18419
18425 18420 if (ddi_taskq_dispatch(eventq_queue_nic, ip_ne_queue_func, info,
18426 18421 DDI_NOSLEEP) == DDI_SUCCESS)
18427 18422 return;
18428 18423
18429 18424 fail:
18430 18425 if (info != NULL) {
18431 18426 if (info->hnei_event.hne_data != NULL) {
18432 18427 kmem_free(info->hnei_event.hne_data,
18433 18428 info->hnei_event.hne_datalen);
18434 18429 }
18435 18430 kmem_free(info, sizeof (hook_nic_event_t));
18436 18431 }
18437 18432 str = ill_hook_event2str(event);
18438 18433 ip2dbg(("ill_nic_event_dispatch: could not dispatch %s nic event "
18439 18434 "information for %s (ENOMEM)\n", str, ill->ill_name));
18440 18435 }
18441 18436
/*
 * Tail of the IPv4 ARP bringup for `ipif': create the publishing NCE for the
 * local address and arrange for duplicate address detection as dictated by
 * `res_act'.  For an ipif on an IPMP meta-interface this may first bind the
 * ipif to a usable underlying ill.  Returns zero or an errno.
 */
static int
ipif_arp_up_done_tail(ipif_t *ipif, enum ip_resolver_action res_act)
{
	int err = 0;
	const in_addr_t *addr = NULL;
	nce_t *nce = NULL;
	ill_t *ill = ipif->ipif_ill;
	ill_t *bound_ill;
	boolean_t added_ipif = B_FALSE;
	uint16_t state;
	uint16_t flags;

	DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up_done_tail",
	    ill_t *, ill, ipif_t *, ipif);
	if (ipif->ipif_lcl_addr != INADDR_ANY) {
		addr = &ipif->ipif_lcl_addr;
	}

	/* Unnumbered or zero address: only valid for an initial bringup. */
	if ((ipif->ipif_flags & IPIF_UNNUMBERED) || addr == NULL) {
		if (res_act != Res_act_initial)
			return (EINVAL);
	}

	if (addr != NULL) {
		ipmp_illgrp_t *illg = ill->ill_grp;

		/* add unicast nce for the local addr */

		if (IS_IPMP(ill)) {
			/*
			 * If we're here via ipif_up(), then the ipif
			 * won't be bound yet -- add it to the group,
			 * which will bind it if possible. (We would
			 * add it in ipif_up(), but deleting on failure
			 * there is gruesome.) If we're here via
			 * ipmp_ill_bind_ipif(), then the ipif has
			 * already been added to the group and we
			 * just need to use the binding.
			 */
			if ((bound_ill = ipmp_ipif_bound_ill(ipif)) == NULL) {
				bound_ill = ipmp_illgrp_add_ipif(illg, ipif);
				if (bound_ill == NULL) {
					/*
					 * We couldn't bind the ipif to an ill
					 * yet, so we have nothing to publish.
					 * Mark the address as ready and return.
					 */
					ipif->ipif_addr_ready = 1;
					return (0);
				}
				added_ipif = B_TRUE;
			}
		} else {
			bound_ill = ill;
		}

		flags = (NCE_F_MYADDR | NCE_F_PUBLISH | NCE_F_AUTHORITY |
		    NCE_F_NONUD);
		/*
		 * If this is an initial bring-up (or the ipif was never
		 * completely brought up), do DAD. Otherwise, we're here
		 * because IPMP has rebound an address to this ill: send
		 * unsolicited advertisements (ARP announcements) to
		 * inform others.
		 */
		if (res_act == Res_act_initial || !ipif->ipif_addr_ready) {
			state = ND_UNCHANGED; /* compute in nce_add_common() */
		} else {
			state = ND_REACHABLE;
			flags |= NCE_F_UNSOL_ADV;
		}

	retry:
		/* Publish using the bound ill's hardware address. */
		err = nce_lookup_then_add_v4(ill,
		    bound_ill->ill_phys_addr, bound_ill->ill_phys_addr_length,
		    addr, flags, state, &nce);

		/*
		 * note that we may encounter EEXIST if we are moving
		 * the nce as a result of a rebind operation.
		 */
		switch (err) {
		case 0:
			ipif->ipif_added_nce = 1;
			nce->nce_ipif_cnt++;
			break;
		case EEXIST:
			ip1dbg(("ipif_arp_up: NCE already exists for %s\n",
			    ill->ill_name));
			if (!NCE_MYADDR(nce->nce_common)) {
				/*
				 * A leftover nce from before this address
				 * existed
				 */
				ncec_delete(nce->nce_common);
				nce_refrele(nce);
				nce = NULL;
				goto retry;
			}
			if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
				nce_refrele(nce);
				nce = NULL;
				ip1dbg(("ipif_arp_up: NCE already exists "
				    "for %s:%u\n", ill->ill_name,
				    ipif->ipif_id));
				goto arp_up_done;
			}
			/*
			 * Duplicate local addresses are permissible for
			 * IPIF_POINTOPOINT interfaces which will get marked
			 * IPIF_UNNUMBERED later in
			 * ip_addr_availability_check().
			 *
			 * The nce_ipif_cnt field tracks the number of
			 * ipifs that have nce_addr as their local address.
			 */
			ipif->ipif_addr_ready = 1;
			ipif->ipif_added_nce = 1;
			nce->nce_ipif_cnt++;
			err = 0;
			break;
		default:
			ASSERT(nce == NULL);
			goto arp_up_done;
		}
		if (arp_no_defense) {
			/* DAD disabled: the address is usable immediately. */
			if ((ipif->ipif_flags & IPIF_UP) &&
			    !ipif->ipif_addr_ready)
				ipif_up_notify(ipif);
			ipif->ipif_addr_ready = 1;
		}
	} else {
		/* zero address. nothing to publish */
		ipif->ipif_addr_ready = 1;
	}
	if (nce != NULL)
		nce_refrele(nce);
arp_up_done:
	/* Undo the group addition from above if the bringup failed. */
	if (added_ipif && err != 0)
		ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
	return (err);
}
18584 18579
18585 18580 int
18586 18581 ipif_arp_up(ipif_t *ipif, enum ip_resolver_action res_act, boolean_t was_dup)
18587 18582 {
18588 18583 int err = 0;
18589 18584 ill_t *ill = ipif->ipif_ill;
18590 18585 boolean_t first_interface, wait_for_dlpi = B_FALSE;
18591 18586
18592 18587 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up",
18593 18588 ill_t *, ill, ipif_t *, ipif);
18594 18589
18595 18590 /*
18596 18591 * need to bring up ARP or setup mcast mapping only
18597 18592 * when the first interface is coming UP.
18598 18593 */
18599 18594 first_interface = (ill->ill_ipif_up_count == 0 &&
18600 18595 ill->ill_ipif_dup_count == 0 && !was_dup);
18601 18596
18602 18597 if (res_act == Res_act_initial && first_interface) {
18603 18598 /*
18604 18599 * Send ATTACH + BIND
18605 18600 */
18606 18601 err = arp_ll_up(ill);
18607 18602 if (err != EINPROGRESS && err != 0)
18608 18603 return (err);
18609 18604
18610 18605 /*
18611 18606 * Add NCE for local address. Start DAD.
18612 18607 * we'll wait to hear that DAD has finished
18613 18608 * before using the interface.
18614 18609 */
18615 18610 if (err == EINPROGRESS)
18616 18611 wait_for_dlpi = B_TRUE;
18617 18612 }
18618 18613
18619 18614 if (!wait_for_dlpi)
18620 18615 (void) ipif_arp_up_done_tail(ipif, res_act);
18621 18616
18622 18617 return (!wait_for_dlpi ? 0 : EINPROGRESS);
18623 18618 }
18624 18619
18625 18620 /*
18626 18621 * Finish processing of "arp_up" after all the DLPI message
18627 18622 * exchanges have completed between arp and the driver.
18628 18623 */
18629 18624 void
18630 18625 arp_bringup_done(ill_t *ill, int err)
18631 18626 {
18632 18627 mblk_t *mp1;
18633 18628 ipif_t *ipif;
18634 18629 conn_t *connp = NULL;
18635 18630 ipsq_t *ipsq;
18636 18631 queue_t *q;
18637 18632
18638 18633 ip1dbg(("arp_bringup_done(%s)\n", ill->ill_name));
18639 18634
18640 18635 ASSERT(IAM_WRITER_ILL(ill));
18641 18636
18642 18637 ipsq = ill->ill_phyint->phyint_ipsq;
18643 18638 ipif = ipsq->ipsq_xop->ipx_pending_ipif;
18644 18639 mp1 = ipsq_pending_mp_get(ipsq, &connp);
18645 18640 ASSERT(!((mp1 != NULL) ^ (ipif != NULL)));
18646 18641 if (mp1 == NULL) /* bringup was aborted by the user */
18647 18642 return;
18648 18643
18649 18644 /*
18650 18645 * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we
18651 18646 * must have an associated conn_t. Otherwise, we're bringing this
18652 18647 * interface back up as part of handling an asynchronous event (e.g.,
18653 18648 * physical address change).
18654 18649 */
18655 18650 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18656 18651 ASSERT(connp != NULL);
18657 18652 q = CONNP_TO_WQ(connp);
18658 18653 } else {
18659 18654 ASSERT(connp == NULL);
18660 18655 q = ill->ill_rq;
18661 18656 }
18662 18657 if (err == 0) {
18663 18658 if (ipif->ipif_isv6) {
18664 18659 if ((err = ipif_up_done_v6(ipif)) != 0)
18665 18660 ip0dbg(("arp_bringup_done: init failed\n"));
18666 18661 } else {
18667 18662 err = ipif_arp_up_done_tail(ipif, Res_act_initial);
18668 18663 if (err != 0 ||
18669 18664 (err = ipif_up_done(ipif)) != 0) {
18670 18665 ip0dbg(("arp_bringup_done: "
18671 18666 "init failed err %x\n", err));
18672 18667 (void) ipif_arp_down(ipif);
18673 18668 }
18674 18669
18675 18670 }
18676 18671 } else {
18677 18672 ip0dbg(("arp_bringup_done: DL_BIND_REQ failed\n"));
18678 18673 }
18679 18674
18680 18675 if ((err == 0) && (ill->ill_up_ipifs)) {
18681 18676 err = ill_up_ipifs(ill, q, mp1);
18682 18677 if (err == EINPROGRESS)
18683 18678 return;
18684 18679 }
18685 18680
18686 18681 /*
18687 18682 * If we have a moved ipif to bring up, and everything has succeeded
18688 18683 * to this point, bring it up on the IPMP ill. Otherwise, leave it
18689 18684 * down -- the admin can try to bring it up by hand if need be.
18690 18685 */
18691 18686 if (ill->ill_move_ipif != NULL) {
18692 18687 ipif = ill->ill_move_ipif;
18693 18688 ip1dbg(("bringing up ipif %p on ill %s\n", (void *)ipif,
18694 18689 ipif->ipif_ill->ill_name));
18695 18690 ill->ill_move_ipif = NULL;
18696 18691 if (err == 0) {
18697 18692 err = ipif_up(ipif, q, mp1);
18698 18693 if (err == EINPROGRESS)
18699 18694 return;
18700 18695 }
18701 18696 }
18702 18697
18703 18698 /*
18704 18699 * The operation must complete without EINPROGRESS since
18705 18700 * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp.
18706 18701 * Otherwise, the operation will be stuck forever in the ipsq.
18707 18702 */
18708 18703 ASSERT(err != EINPROGRESS);
18709 18704 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18710 18705 DTRACE_PROBE4(ipif__ioctl, char *, "arp_bringup_done finish",
18711 18706 int, ipsq->ipsq_xop->ipx_current_ioctl,
18712 18707 ill_t *, ill, ipif_t *, ipif);
18713 18708 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
18714 18709 } else {
18715 18710 ipsq_current_finish(ipsq);
18716 18711 }
18717 18712 }
18718 18713
18719 18714 /*
18720 18715 * Finish processing of arp replumb after all the DLPI message
18721 18716 * exchanges have completed between arp and the driver.
18722 18717 */
18723 18718 void
18724 18719 arp_replumb_done(ill_t *ill, int err)
18725 18720 {
18726 18721 mblk_t *mp1;
18727 18722 ipif_t *ipif;
18728 18723 conn_t *connp = NULL;
18729 18724 ipsq_t *ipsq;
18730 18725 queue_t *q;
18731 18726
18732 18727 ASSERT(IAM_WRITER_ILL(ill));
18733 18728
18734 18729 ipsq = ill->ill_phyint->phyint_ipsq;
18735 18730 ipif = ipsq->ipsq_xop->ipx_pending_ipif;
18736 18731 mp1 = ipsq_pending_mp_get(ipsq, &connp);
18737 18732 ASSERT(!((mp1 != NULL) ^ (ipif != NULL)));
18738 18733 if (mp1 == NULL) {
18739 18734 ip0dbg(("arp_replumb_done: bringup aborted ioctl %x\n",
18740 18735 ipsq->ipsq_xop->ipx_current_ioctl));
18741 18736 /* bringup was aborted by the user */
18742 18737 return;
18743 18738 }
18744 18739 /*
18745 18740 * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we
18746 18741 * must have an associated conn_t. Otherwise, we're bringing this
18747 18742 * interface back up as part of handling an asynchronous event (e.g.,
18748 18743 * physical address change).
18749 18744 */
18750 18745 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18751 18746 ASSERT(connp != NULL);
18752 18747 q = CONNP_TO_WQ(connp);
18753 18748 } else {
18754 18749 ASSERT(connp == NULL);
18755 18750 q = ill->ill_rq;
18756 18751 }
18757 18752 if ((err == 0) && (ill->ill_up_ipifs)) {
18758 18753 err = ill_up_ipifs(ill, q, mp1);
18759 18754 if (err == EINPROGRESS)
18760 18755 return;
18761 18756 }
18762 18757 /*
18763 18758 * The operation must complete without EINPROGRESS since
18764 18759 * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp.
18765 18760 * Otherwise, the operation will be stuck forever in the ipsq.
18766 18761 */
18767 18762 ASSERT(err != EINPROGRESS);
18768 18763 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18769 18764 DTRACE_PROBE4(ipif__ioctl, char *,
18770 18765 "arp_replumb_done finish",
18771 18766 int, ipsq->ipsq_xop->ipx_current_ioctl,
18772 18767 ill_t *, ill, ipif_t *, ipif);
18773 18768 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
18774 18769 } else {
18775 18770 ipsq_current_finish(ipsq);
18776 18771 }
18777 18772 }
18778 18773
/*
 * Notify interested parties that `ipif' has come up: the routing sockets
 * (interface and new-address messages), SCTP, and the nic event hooks.
 */
void
ipif_up_notify(ipif_t *ipif)
{
	ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
	ip_rts_newaddrmsg(RTM_ADD, 0, ipif, RTSQ_DEFAULT);
	sctp_update_ipif(ipif, SCTP_IPIF_UP);
	ill_nic_event_dispatch(ipif->ipif_ill, MAP_IPIF_ID(ipif->ipif_id),
	    NE_LIF_UP, NULL, 0);
}
18788 18783
18789 18784 /*
18790 18785 * ILB ioctl uses cv_wait (such as deleting a rule or adding a server) and
18791 18786 * this assumes the context is cv_wait'able. Hence it shouldnt' be used on
18792 18787 * TPI end points with STREAMS modules pushed above. This is assured by not
18793 18788 * having the IPI_MODOK flag for the ioctl. And IP ensures the ILB ioctl
18794 18789 * never ends up on an ipsq, otherwise we may end up processing the ioctl
18795 18790 * while unwinding from the ispq and that could be a thread from the bottom.
18796 18791 */
18797 18792 /* ARGSUSED */
18798 18793 int
18799 18794 ip_sioctl_ilb_cmd(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
18800 18795 ip_ioctl_cmd_t *ipip, void *arg)
18801 18796 {
18802 18797 mblk_t *cmd_mp = mp->b_cont->b_cont;
18803 18798 ilb_cmd_t command = *((ilb_cmd_t *)cmd_mp->b_rptr);
18804 18799 int ret = 0;
18805 18800 int i;
18806 18801 size_t size;
18807 18802 ip_stack_t *ipst;
18808 18803 zoneid_t zoneid;
18809 18804 ilb_stack_t *ilbs;
18810 18805
18811 18806 ipst = CONNQ_TO_IPST(q);
18812 18807 ilbs = ipst->ips_netstack->netstack_ilb;
18813 18808 zoneid = Q_TO_CONN(q)->conn_zoneid;
18814 18809
18815 18810 switch (command) {
18816 18811 case ILB_CREATE_RULE: {
18817 18812 ilb_rule_cmd_t *cmd = (ilb_rule_cmd_t *)cmd_mp->b_rptr;
18818 18813
18819 18814 if (MBLKL(cmd_mp) != sizeof (ilb_rule_cmd_t)) {
18820 18815 ret = EINVAL;
18821 18816 break;
18822 18817 }
18823 18818
18824 18819 ret = ilb_rule_add(ilbs, zoneid, cmd);
18825 18820 break;
18826 18821 }
18827 18822 case ILB_DESTROY_RULE:
18828 18823 case ILB_ENABLE_RULE:
18829 18824 case ILB_DISABLE_RULE: {
18830 18825 ilb_name_cmd_t *cmd = (ilb_name_cmd_t *)cmd_mp->b_rptr;
18831 18826
18832 18827 if (MBLKL(cmd_mp) != sizeof (ilb_name_cmd_t)) {
18833 18828 ret = EINVAL;
18834 18829 break;
18835 18830 }
18836 18831
18837 18832 if (cmd->flags & ILB_RULE_ALLRULES) {
18838 18833 if (command == ILB_DESTROY_RULE) {
18839 18834 ilb_rule_del_all(ilbs, zoneid);
18840 18835 break;
18841 18836 } else if (command == ILB_ENABLE_RULE) {
18842 18837 ilb_rule_enable_all(ilbs, zoneid);
18843 18838 break;
18844 18839 } else if (command == ILB_DISABLE_RULE) {
18845 18840 ilb_rule_disable_all(ilbs, zoneid);
18846 18841 break;
18847 18842 }
18848 18843 } else {
18849 18844 if (command == ILB_DESTROY_RULE) {
18850 18845 ret = ilb_rule_del(ilbs, zoneid, cmd->name);
18851 18846 } else if (command == ILB_ENABLE_RULE) {
18852 18847 ret = ilb_rule_enable(ilbs, zoneid, cmd->name,
18853 18848 NULL);
18854 18849 } else if (command == ILB_DISABLE_RULE) {
18855 18850 ret = ilb_rule_disable(ilbs, zoneid, cmd->name,
18856 18851 NULL);
18857 18852 }
18858 18853 }
18859 18854 break;
18860 18855 }
18861 18856 case ILB_NUM_RULES: {
18862 18857 ilb_num_rules_cmd_t *cmd;
18863 18858
18864 18859 if (MBLKL(cmd_mp) != sizeof (ilb_num_rules_cmd_t)) {
18865 18860 ret = EINVAL;
18866 18861 break;
18867 18862 }
18868 18863 cmd = (ilb_num_rules_cmd_t *)cmd_mp->b_rptr;
18869 18864 ilb_get_num_rules(ilbs, zoneid, &(cmd->num));
18870 18865 break;
18871 18866 }
18872 18867 case ILB_RULE_NAMES: {
18873 18868 ilb_rule_names_cmd_t *cmd;
18874 18869
18875 18870 cmd = (ilb_rule_names_cmd_t *)cmd_mp->b_rptr;
18876 18871 if (MBLKL(cmd_mp) < sizeof (ilb_rule_names_cmd_t) ||
18877 18872 cmd->num_names == 0) {
18878 18873 ret = EINVAL;
18879 18874 break;
18880 18875 }
18881 18876 size = cmd->num_names * ILB_RULE_NAMESZ;
18882 18877 if (cmd_mp->b_rptr + offsetof(ilb_rule_names_cmd_t, buf) +
18883 18878 size != cmd_mp->b_wptr) {
18884 18879 ret = EINVAL;
18885 18880 break;
18886 18881 }
18887 18882 ilb_get_rulenames(ilbs, zoneid, &cmd->num_names, cmd->buf);
18888 18883 break;
18889 18884 }
18890 18885 case ILB_NUM_SERVERS: {
18891 18886 ilb_num_servers_cmd_t *cmd;
18892 18887
18893 18888 if (MBLKL(cmd_mp) != sizeof (ilb_num_servers_cmd_t)) {
18894 18889 ret = EINVAL;
18895 18890 break;
18896 18891 }
18897 18892 cmd = (ilb_num_servers_cmd_t *)cmd_mp->b_rptr;
18898 18893 ret = ilb_get_num_servers(ilbs, zoneid, cmd->name,
18899 18894 &(cmd->num));
18900 18895 break;
18901 18896 }
18902 18897 case ILB_LIST_RULE: {
18903 18898 ilb_rule_cmd_t *cmd = (ilb_rule_cmd_t *)cmd_mp->b_rptr;
18904 18899
18905 18900 if (MBLKL(cmd_mp) != sizeof (ilb_rule_cmd_t)) {
18906 18901 ret = EINVAL;
18907 18902 break;
18908 18903 }
18909 18904 ret = ilb_rule_list(ilbs, zoneid, cmd);
18910 18905 break;
18911 18906 }
18912 18907 case ILB_LIST_SERVERS: {
18913 18908 ilb_servers_info_cmd_t *cmd;
18914 18909
18915 18910 cmd = (ilb_servers_info_cmd_t *)cmd_mp->b_rptr;
18916 18911 if (MBLKL(cmd_mp) < sizeof (ilb_servers_info_cmd_t) ||
18917 18912 cmd->num_servers == 0) {
18918 18913 ret = EINVAL;
18919 18914 break;
18920 18915 }
18921 18916 size = cmd->num_servers * sizeof (ilb_server_info_t);
18922 18917 if (cmd_mp->b_rptr + offsetof(ilb_servers_info_cmd_t, servers) +
18923 18918 size != cmd_mp->b_wptr) {
18924 18919 ret = EINVAL;
18925 18920 break;
18926 18921 }
18927 18922
18928 18923 ret = ilb_get_servers(ilbs, zoneid, cmd->name, cmd->servers,
18929 18924 &cmd->num_servers);
18930 18925 break;
18931 18926 }
18932 18927 case ILB_ADD_SERVERS: {
18933 18928 ilb_servers_info_cmd_t *cmd;
18934 18929 ilb_rule_t *rule;
18935 18930
18936 18931 cmd = (ilb_servers_info_cmd_t *)cmd_mp->b_rptr;
18937 18932 if (MBLKL(cmd_mp) < sizeof (ilb_servers_info_cmd_t)) {
18938 18933 ret = EINVAL;
18939 18934 break;
18940 18935 }
18941 18936 size = cmd->num_servers * sizeof (ilb_server_info_t);
18942 18937 if (cmd_mp->b_rptr + offsetof(ilb_servers_info_cmd_t, servers) +
18943 18938 size != cmd_mp->b_wptr) {
18944 18939 ret = EINVAL;
18945 18940 break;
18946 18941 }
18947 18942 rule = ilb_find_rule(ilbs, zoneid, cmd->name, &ret);
18948 18943 if (rule == NULL) {
18949 18944 ASSERT(ret != 0);
18950 18945 break;
18951 18946 }
18952 18947 for (i = 0; i < cmd->num_servers; i++) {
18953 18948 ilb_server_info_t *s;
18954 18949
18955 18950 s = &cmd->servers[i];
18956 18951 s->err = ilb_server_add(ilbs, rule, s);
18957 18952 }
18958 18953 ILB_RULE_REFRELE(rule);
18959 18954 break;
18960 18955 }
18961 18956 case ILB_DEL_SERVERS:
18962 18957 case ILB_ENABLE_SERVERS:
18963 18958 case ILB_DISABLE_SERVERS: {
18964 18959 ilb_servers_cmd_t *cmd;
18965 18960 ilb_rule_t *rule;
18966 18961 int (*f)();
18967 18962
18968 18963 cmd = (ilb_servers_cmd_t *)cmd_mp->b_rptr;
18969 18964 if (MBLKL(cmd_mp) < sizeof (ilb_servers_cmd_t)) {
18970 18965 ret = EINVAL;
18971 18966 break;
18972 18967 }
18973 18968 size = cmd->num_servers * sizeof (ilb_server_arg_t);
18974 18969 if (cmd_mp->b_rptr + offsetof(ilb_servers_cmd_t, servers) +
18975 18970 size != cmd_mp->b_wptr) {
18976 18971 ret = EINVAL;
18977 18972 break;
18978 18973 }
18979 18974
18980 18975 if (command == ILB_DEL_SERVERS)
18981 18976 f = ilb_server_del;
18982 18977 else if (command == ILB_ENABLE_SERVERS)
18983 18978 f = ilb_server_enable;
18984 18979 else if (command == ILB_DISABLE_SERVERS)
18985 18980 f = ilb_server_disable;
18986 18981
18987 18982 rule = ilb_find_rule(ilbs, zoneid, cmd->name, &ret);
18988 18983 if (rule == NULL) {
18989 18984 ASSERT(ret != 0);
18990 18985 break;
18991 18986 }
18992 18987
18993 18988 for (i = 0; i < cmd->num_servers; i++) {
18994 18989 ilb_server_arg_t *s;
18995 18990
18996 18991 s = &cmd->servers[i];
18997 18992 s->err = f(ilbs, zoneid, NULL, rule, &s->addr);
18998 18993 }
18999 18994 ILB_RULE_REFRELE(rule);
19000 18995 break;
19001 18996 }
19002 18997 case ILB_LIST_NAT_TABLE: {
19003 18998 ilb_list_nat_cmd_t *cmd;
19004 18999
19005 19000 cmd = (ilb_list_nat_cmd_t *)cmd_mp->b_rptr;
19006 19001 if (MBLKL(cmd_mp) < sizeof (ilb_list_nat_cmd_t)) {
19007 19002 ret = EINVAL;
19008 19003 break;
19009 19004 }
19010 19005 size = cmd->num_nat * sizeof (ilb_nat_entry_t);
19011 19006 if (cmd_mp->b_rptr + offsetof(ilb_list_nat_cmd_t, entries) +
19012 19007 size != cmd_mp->b_wptr) {
19013 19008 ret = EINVAL;
19014 19009 break;
19015 19010 }
19016 19011
19017 19012 ret = ilb_list_nat(ilbs, zoneid, cmd->entries, &cmd->num_nat,
19018 19013 &cmd->flags);
19019 19014 break;
19020 19015 }
19021 19016 case ILB_LIST_STICKY_TABLE: {
19022 19017 ilb_list_sticky_cmd_t *cmd;
19023 19018
19024 19019 cmd = (ilb_list_sticky_cmd_t *)cmd_mp->b_rptr;
19025 19020 if (MBLKL(cmd_mp) < sizeof (ilb_list_sticky_cmd_t)) {
19026 19021 ret = EINVAL;
19027 19022 break;
19028 19023 }
19029 19024 size = cmd->num_sticky * sizeof (ilb_sticky_entry_t);
19030 19025 if (cmd_mp->b_rptr + offsetof(ilb_list_sticky_cmd_t, entries) +
19031 19026 size != cmd_mp->b_wptr) {
19032 19027 ret = EINVAL;
19033 19028 break;
19034 19029 }
19035 19030
19036 19031 ret = ilb_list_sticky(ilbs, zoneid, cmd->entries,
19037 19032 &cmd->num_sticky, &cmd->flags);
19038 19033 break;
19039 19034 }
19040 19035 default:
19041 19036 ret = EINVAL;
19042 19037 break;
19043 19038 }
19044 19039 done:
19045 19040 return (ret);
19046 19041 }
19047 19042
/*
 * Remove all cache entries for this logical interface: drop the reference on
 * (and possibly delete) the local-address nce, inform IPMP, and, when this is
 * the ill's last up ipif, flush the remaining nces that depend on the ill.
 */
void
ipif_nce_down(ipif_t *ipif)
{
	ill_t *ill = ipif->ipif_ill;
	nce_t *nce;

	DTRACE_PROBE3(ipif__downup, char *, "ipif_nce_down",
	    ill_t *, ill, ipif_t *, ipif);
	if (ipif->ipif_added_nce) {
		/* Find the nce we published for the local address. */
		if (ipif->ipif_isv6)
			nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr);
		else
			nce = nce_lookup_v4(ill, &ipif->ipif_lcl_addr);
		if (nce != NULL) {
			/* Delete the ncec once no ipif uses this address. */
			if (--nce->nce_ipif_cnt == 0)
				ncec_delete(nce->nce_common);
			ipif->ipif_added_nce = 0;
			nce_refrele(nce);
		} else {
			/*
			 * nce may already be NULL because it was already
			 * flushed, e.g., due to a call to nce_flush
			 */
			ipif->ipif_added_nce = 0;
		}
	}
	/*
	 * Make IPMP aware of the deleted data address.
	 */
	if (IS_IPMP(ill))
		ipmp_illgrp_del_ipif(ill->ill_grp, ipif);

	/*
	 * Remove all other nces dependent on this ill when the last ipif
	 * is going away.
	 */
	if (ill->ill_ipif_up_count == 0) {
		ncec_walk(ill, ncec_delete_per_ill, ill, ill->ill_ipst);
		if (IS_UNDER_IPMP(ill))
			nce_flush(ill, B_TRUE);
	}
}
19091 19086
19092 19087 /*
19093 19088 * find the first interface that uses usill for its source address.
19094 19089 */
19095 19090 ill_t *
19096 19091 ill_lookup_usesrc(ill_t *usill)
19097 19092 {
19098 19093 ip_stack_t *ipst = usill->ill_ipst;
19099 19094 ill_t *ill;
19100 19095
19101 19096 ASSERT(usill != NULL);
19102 19097
19103 19098 /* ill_g_usesrc_lock protects ill_usesrc_grp_next */
19104 19099 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
19105 19100 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
19106 19101 for (ill = usill->ill_usesrc_grp_next; ill != NULL && ill != usill;
19107 19102 ill = ill->ill_usesrc_grp_next) {
19108 19103 if (!IS_UNDER_IPMP(ill) && (ill->ill_flags & ILLF_MULTICAST) &&
19109 19104 !ILL_IS_CONDEMNED(ill)) {
19110 19105 ill_refhold(ill);
19111 19106 break;
19112 19107 }
19113 19108 }
19114 19109 rw_exit(&ipst->ips_ill_g_lock);
19115 19110 rw_exit(&ipst->ips_ill_g_usesrc_lock);
19116 19111 return (ill);
19117 19112 }
19118 19113
19119 19114 /*
19120 19115 * This comment applies to both ip_sioctl_get_ifhwaddr and
19121 19116 * ip_sioctl_get_lifhwaddr as the basic function of these two functions
19122 19117 * is the same.
19123 19118 *
19124 19119 * The goal here is to find an IP interface that corresponds to the name
19125 19120 * provided by the caller in the ifreq/lifreq structure held in the mblk_t
19126 19121 * chain and to fill out a sockaddr/sockaddr_storage structure with the
19127 19122 * mac address.
19128 19123 *
19129 19124 * The SIOCGIFHWADDR/SIOCGLIFHWADDR ioctl may return an error for a number
19130 19125 * of different reasons:
19131 19126 * ENXIO - the device name is not known to IP.
19132 19127 * EADDRNOTAVAIL - the device has no hardware address. This is indicated
19133 19128 * by ill_phys_addr not pointing to an actual address.
 * EPFNOSUPPORT - this will indicate that a request is being made for a
 * mac address that will not fit in the data structure supplied (struct
 * sockaddr).
19137 19132 *
19138 19133 */
19139 19134 /* ARGSUSED */
19140 19135 int
19141 19136 ip_sioctl_get_ifhwaddr(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
19142 19137 ip_ioctl_cmd_t *ipip, void *if_req)
19143 19138 {
19144 19139 struct sockaddr *sock;
19145 19140 struct ifreq *ifr;
19146 19141 mblk_t *mp1;
19147 19142 ill_t *ill;
19148 19143
19149 19144 ASSERT(ipif != NULL);
19150 19145 ill = ipif->ipif_ill;
19151 19146
19152 19147 if (ill->ill_phys_addr == NULL) {
19153 19148 return (EADDRNOTAVAIL);
19154 19149 }
19155 19150 if (ill->ill_phys_addr_length > sizeof (sock->sa_data)) {
19156 19151 return (EPFNOSUPPORT);
19157 19152 }
19158 19153
19159 19154 ip1dbg(("ip_sioctl_get_hwaddr(%s)\n", ill->ill_name));
19160 19155
19161 19156 /* Existence of mp1 has been checked in ip_wput_nondata */
19162 19157 mp1 = mp->b_cont->b_cont;
19163 19158 ifr = (struct ifreq *)mp1->b_rptr;
19164 19159
19165 19160 sock = &ifr->ifr_addr;
19166 19161 /*
19167 19162 * The "family" field in the returned structure is set to a value
19168 19163 * that represents the type of device to which the address belongs.
19169 19164 * The value returned may differ to that on Linux but it will still
19170 19165 * represent the correct symbol on Solaris.
19171 19166 */
19172 19167 sock->sa_family = arp_hw_type(ill->ill_mactype);
19173 19168 bcopy(ill->ill_phys_addr, &sock->sa_data, ill->ill_phys_addr_length);
19174 19169
19175 19170 return (0);
19176 19171 }
19177 19172
19178 19173 /*
 * The expectation of applications using SIOCGIFHWADDR is that data will
19180 19175 * be returned in the sa_data field of the sockaddr structure. With
19181 19176 * SIOCGLIFHWADDR, we're breaking new ground as there is no Linux
19182 19177 * equivalent. In light of this, struct sockaddr_dl is used as it
19183 19178 * offers more space for address storage in sll_data.
19184 19179 */
/* ARGSUSED */
int
ip_sioctl_get_lifhwaddr(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	struct sockaddr_dl *sock;
	struct lifreq *lifr;
	mblk_t *mp1;
	ill_t *ill;

	ASSERT(ipif != NULL);
	ill = ipif->ipif_ill;

	/* The ill has no hardware address to report. */
	if (ill->ill_phys_addr == NULL) {
		return (EADDRNOTAVAIL);
	}
	/* Address exists but cannot fit in sockaddr_dl.sdl_data. */
	if (ill->ill_phys_addr_length > sizeof (sock->sdl_data)) {
		return (EPFNOSUPPORT);
	}

	ip1dbg(("ip_sioctl_get_lifhwaddr(%s)\n", ill->ill_name));

	/* Existence of mp1 has been checked in ip_wput_nondata */
	mp1 = mp->b_cont->b_cont;
	lifr = (struct lifreq *)mp1->b_rptr;

	/*
	 * struct sockaddr_dl is used here (overlaid on lifr_addr) because
	 * its sdl_data field offers more room for the hardware address than
	 * sockaddr.sa_data does; only the fields filled in below are
	 * meaningful to callers of this ioctl.
	 * NOTE(review): an earlier version of this comment referred to
	 * sockaddr_ll (as used by sockpfp); the code has used sockaddr_dl
	 * all along.
	 */
	lifr->lifr_type = ill->ill_type;
	sock = (struct sockaddr_dl *)&lifr->lifr_addr;
	sock->sdl_family = AF_LINK;
	sock->sdl_index = ill->ill_phyint->phyint_ifindex;
	sock->sdl_type = ill->ill_mactype;
	/* No link name or selector bytes are returned, only the address. */
	sock->sdl_nlen = 0;
	sock->sdl_slen = 0;
	sock->sdl_alen = ill->ill_phys_addr_length;
	bcopy(ill->ill_phys_addr, sock->sdl_data, ill->ill_phys_addr_length);

	return (0);
}
|
↓ open down ↓ |
6270 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX