Print this page
    
14025 ipnet sniffing leaks promisc mode
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/inet/ipnet/ipnet.c
          +++ new/usr/src/uts/common/inet/ipnet/ipnet.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  
    | 
      ↓ open down ↓ | 
    18 lines elided | 
    
      ↑ open up ↑ | 
  
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24   24   * Use is subject to license terms.
  25   25   * Copyright (c) 2016 by Delphix. All rights reserved.
  26   26   */
  27   27  
  28   28  /*
  29      - * Copyright (c) 2016, Joyent, Inc. All rights reserved.
       29 + * Copyright 2021 Joyent, Inc.
  30   30   */
  31   31  
  32   32  /*
  33   33   * The ipnet device defined here provides access to packets at the IP layer. To
  34   34   * provide access to packets at this layer it registers a callback function in
  35   35   * the ip module and when there are open instances of the device ip will pass
  36   36   * packets into the device. Packets from ip are passed on the input, output and
  37   37   * loopback paths. Internally the module returns to ip as soon as possible by
  38   38   * deferring processing using a taskq.
  39   39   *
  40   40   * Management of the devices in /dev/ipnet/ is handled by the devname
  41   41   * filesystem and use of the neti interfaces.  This module registers for NIC
  42   42   * events using the neti framework so that when IP interfaces are brought up,
  43   43   * taken down etc. the ipnet module is notified and its view of the interfaces
  44   44   * configured on the system adjusted.  On attach, the module gets an initial
  45   45   * view of the system again using the neti framework but as it has already
  46   46   * registered for IP interface events, it is still up-to-date with any changes.
  47   47   */
  48   48  
  49   49  #include <sys/types.h>
  50   50  #include <sys/conf.h>
  51   51  #include <sys/cred.h>
  52   52  #include <sys/stat.h>
  53   53  #include <sys/ddi.h>
  54   54  #include <sys/sunddi.h>
  55   55  #include <sys/modctl.h>
  56   56  #include <sys/dlpi.h>
  57   57  #include <sys/strsun.h>
  58   58  #include <sys/id_space.h>
  59   59  #include <sys/kmem.h>
  60   60  #include <sys/mkdev.h>
  61   61  #include <sys/neti.h>
  62   62  #include <net/if.h>
  63   63  #include <sys/errno.h>
  64   64  #include <sys/list.h>
  65   65  #include <sys/ksynch.h>
  66   66  #include <sys/hook_event.h>
  67   67  #include <sys/sdt.h>
  68   68  #include <sys/stropts.h>
  69   69  #include <sys/sysmacros.h>
  70   70  #include <inet/ip.h>
  71   71  #include <inet/ip_if.h>
  72   72  #include <inet/ip_multi.h>
  73   73  #include <inet/ip6.h>
  74   74  #include <inet/ipnet.h>
  75   75  #include <net/bpf.h>
  76   76  #include <net/bpfdesc.h>
  77   77  #include <net/dlt.h>
  78   78  
           /*
            * STREAMS module information for the ipnet driver.  The read-side
            * (ipnet_rinit) and write-side (ipnet_winit) qinit structures below both
            * point at this single instance.
            */
  79   79  static struct module_info ipnet_minfo = {
  80   80          1,              /* mi_idnum */
  81   81          "ipnet",        /* mi_idname */
  82   82          0,              /* mi_minpsz */
  83   83          INFPSZ,         /* mi_maxpsz */
  84   84          2048,           /* mi_hiwat */
  85   85          0               /* mi_lowat */
  86   86  };
  87   87  
  88   88  /*
  89   89   * List to hold static view of ipnetif_t's on the system. This is needed to
  90   90   * avoid holding the lock protecting the avl tree of ipnetif's over the
  91   91   * callback into the dev filesystem.
            *
            * Each element carries an interface name, a dev_t and the list linkage.
            * NOTE(review): ic_dev is presumably the device number of the interface's
            * /dev/ipnet node; the code that fills it in is not visible here -- confirm.
  92   92   */
  93   93  typedef struct ipnetif_cbdata {
  94   94          char            ic_ifname[LIFNAMSIZ];
  95   95          dev_t           ic_dev;
  96   96          list_node_t     ic_next;        /* linkage in the snapshot list */
  97   97  } ipnetif_cbdata_t;
  98   98  
  99   99  /*
 100  100   * Convenience enumerated type for ipnet_accept().  It describes the
 101  101   * properties of a given ipnet_addrp_t relative to a single ipnet_t
 102  102   * client stream.  The values represent whether the address is ...
            *
            * ipnet_accept() is one of the two accept callbacks declared in this file
            * (the other is ipnet_loaccept()); ipnet_open() installs one of them on
            * each stream.
 103  103   */
 104  104  typedef enum {
 105  105          IPNETADDR_MYADDR,       /* an address on my ipnetif_t. */
 106  106          IPNETADDR_MBCAST,       /* a multicast or broadcast address. */
 107  107          IPNETADDR_UNKNOWN       /* none of the above. */
 108  108  } ipnet_addrtype_t;
 109  109  
 110  110  /* Argument used for the ipnet_nicevent_taskq callback. */
           /*
            * NOTE(review): presumably filled in by ipnet_nicevent_cb() and consumed by
            * ipnet_nicevent_task(); neither body is visible in this part of the file,
            * so the per-field meanings below are inferred from names and types only.
            */
 111  111  typedef struct ipnet_nicevent_s {
 112  112          nic_event_t             ipne_event;     /* which NIC event */
 113  113          net_handle_t            ipne_protocol;  /* protocol handle */
 114  114          netstackid_t            ipne_stackid;   /* netstack ID */
 115  115          uint64_t                ipne_ifindex;   /* interface index */
 116  116          uint64_t                ipne_lifindex;  /* logical if. index */
 117  117          char                    ipne_ifname[LIFNAMSIZ];
 118  118  } ipnet_nicevent_t;
 119  119  
           /*
            * Module-wide state.  ipnet_major, ipnet_minor_space and the two taskqs are
            * set up in _init(); ipnet_dip is recorded by ipnet_attach() and cleared by
            * ipnet_detach().  ipnet_accept and ipnet_loaccept are the accept callbacks
            * that ipnet_open() installs in a stream's ipnet_acceptfn.
            */
 120  120  static dev_info_t       *ipnet_dip;
 121  121  static major_t          ipnet_major;
 122  122  static ddi_taskq_t      *ipnet_taskq;           /* taskq for packets */
 123  123  static ddi_taskq_t      *ipnet_nicevent_taskq;  /* taskq for NIC events */
 124  124  static id_space_t       *ipnet_minor_space;
 125  125  static const int        IPNET_MINOR_LO = 1;     /* minor number for /dev/lo0 */
 126  126  static const int        IPNET_MINOR_MIN = 2;    /* start of dynamic minors */
 127  127  static dl_info_ack_t    ipnet_infoack = IPNET_INFO_ACK_INIT;
 128  128  static ipnet_acceptfn_t ipnet_accept, ipnet_loaccept;
 129  129  static bpf_itap_fn_t    ipnet_itap;
 130  130  
           /*
            * Forward declarations for the file-local functions, so that the qinit,
            * streamtab and dev_ops tables below can refer to them before they are
            * defined.
            */
 131  131  static void     ipnet_input(mblk_t *);
 132  132  static int      ipnet_wput(queue_t *, mblk_t *);
 133  133  static int      ipnet_rsrv(queue_t *);
 134  134  static int      ipnet_open(queue_t *, dev_t *, int, int, cred_t *);
 135  135  static int      ipnet_close(queue_t *, int, cred_t *);
 136  136  static void     ipnet_ioctl(queue_t *, mblk_t *);
 137  137  static void     ipnet_iocdata(queue_t *, mblk_t *);
 138  138  static void     ipnet_wputnondata(queue_t *, mblk_t *);
 139  139  static int      ipnet_attach(dev_info_t *, ddi_attach_cmd_t);
 140  140  static int      ipnet_detach(dev_info_t *, ddi_detach_cmd_t);
 141  141  static int      ipnet_devinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
 142  142  static void     ipnet_inforeq(queue_t *q, mblk_t *mp);
 143  143  static void     ipnet_bindreq(queue_t *q, mblk_t *mp);
 144  144  static void     ipnet_unbindreq(queue_t *q, mblk_t *mp);
 145  145  static void     ipnet_dlpromisconreq(queue_t *q, mblk_t *mp);
 146  146  static void     ipnet_dlpromiscoffreq(queue_t *q, mblk_t *mp);
 147  147  static int      ipnet_join_allmulti(ipnetif_t *, ipnet_stack_t *);
 148  148  static void     ipnet_leave_allmulti(ipnetif_t *, ipnet_stack_t *);
 149  149  static int      ipnet_nicevent_cb(hook_event_token_t, hook_data_t, void *);
 150  150  static void     ipnet_nicevent_task(void *);
 151  151  static ipnetif_t *ipnetif_create(const char *, uint64_t, ipnet_stack_t *,
 152  152      uint64_t);
 153  153  static void     ipnetif_remove(ipnetif_t *, ipnet_stack_t *);
 154  154  static ipnetif_addr_t *ipnet_match_lif(ipnetif_t *, lif_if_t, boolean_t);
 155  155  static ipnetif_t *ipnetif_getby_index(uint64_t, ipnet_stack_t *);
 156  156  static ipnetif_t *ipnetif_getby_dev(dev_t, ipnet_stack_t *);
 157  157  static boolean_t ipnetif_in_zone(ipnetif_t *, zoneid_t, ipnet_stack_t *);
 158  158  static void     ipnetif_zonecheck(ipnetif_t *, ipnet_stack_t *);
 159  159  static int      ipnet_populate_if(net_handle_t, ipnet_stack_t *, boolean_t);
 160  160  static int      ipnetif_compare_name(const void *, const void *);
 161  161  static int      ipnetif_compare_name_zone(const void *, const void *);
 162  162  static int      ipnetif_compare_index(const void *, const void *);
 163  163  static void     ipnet_add_ifaddr(uint64_t, ipnetif_t *, net_handle_t);
 164  164  static void     ipnet_delete_ifaddr(ipnetif_addr_t *, ipnetif_t *, boolean_t);
 165  165  static void     ipnetif_refhold(ipnetif_t *);
 166  166  static void     ipnetif_refrele(ipnetif_t *);
 167  167  static void     ipnet_walkers_inc(ipnet_stack_t *);
 168  168  static void     ipnet_walkers_dec(ipnet_stack_t *);
 169  169  static void     ipnet_register_netihook(ipnet_stack_t *);
 170  170  static void     *ipnet_stack_init(netstackid_t, netstack_t *);
 171  171  static void     ipnet_stack_fini(netstackid_t, void *);
 172  172  static void     ipnet_dispatch(void *);
 173  173  static int      ipobs_bounce_func(hook_event_token_t, hook_data_t, void *);
 174  174  static int      ipnet_bpf_bounce(hook_event_token_t, hook_data_t, void *);
 175  175  static ipnetif_t *ipnetif_clone_create(ipnetif_t *, zoneid_t);
 176  176  static void     ipnetif_clone_release(ipnetif_t *);
 177  177  
           /*
            * Read-side queue initialization.  There is no put procedure; ipnet_rsrv()
            * is the service procedure, and the open and close entry points for the
            * stream are ipnet_open() and ipnet_close().
            */
 178  178  static struct qinit ipnet_rinit = {
 179  179          NULL,           /* qi_putp */
 180  180          ipnet_rsrv,     /* qi_srvp */
 181  181          ipnet_open,     /* qi_qopen */
 182  182          ipnet_close,    /* qi_qclose */
 183  183          NULL,           /* qi_qadmin */
 184  184          &ipnet_minfo,   /* qi_minfo */
 185  185  };
 186  186  
           /*
            * Write-side queue initialization.  ipnet_wput() is the put procedure and
            * there is no service procedure.
            */
 187  187  static struct qinit ipnet_winit = {
 188  188          ipnet_wput,     /* qi_putp */
 189  189          NULL,           /* qi_srvp */
 190  190          NULL,           /* qi_qopen */
 191  191          NULL,           /* qi_qclose */
 192  192          NULL,           /* qi_qadmin */
 193  193          &ipnet_minfo,   /* qi_minfo */
 194  194  };
 195  195  
           /* Stream table pairing the read-side and write-side qinit structures. */
 196  196  static struct streamtab ipnet_info = {
 197  197          &ipnet_rinit, &ipnet_winit
 198  198  };
 199  199  
           /*
            * Defines ipnet_ops, which modldrv below refers to.  The driver supplies
            * ipnet_attach(), ipnet_detach() and ipnet_devinfo() as entry points and
            * ipnet_info as its streamtab; quiesce is declared as not supported.
            */
 200  200  DDI_DEFINE_STREAM_OPS(ipnet_ops, nulldev, nulldev, ipnet_attach,
 201  201      ipnet_detach, nodev, ipnet_devinfo, D_MP | D_MTPERMOD, &ipnet_info,
 202  202      ddi_quiesce_not_supported);
 203  203  
           /* Driver linkage: module type, description string and the dev_ops above. */
 204  204  static struct modldrv modldrv = {
 205  205          &mod_driverops,
 206  206          "STREAMS ipnet driver",
 207  207          &ipnet_ops
 208  208  };
 209  209  
           /*
            * Module linkage handed to mod_install(), mod_remove() and mod_info() by
            * _init(), _fini() and _info() respectively.
            */
 210  210  static struct modlinkage modlinkage = {
 211  211          MODREV_1, &modldrv, NULL
 212  212  };
 213  213  
 214  214  /*
 215  215   * This structure contains the template data (names and type) that is
 216  216   * copied, in bulk, into the new kstats structure created by net_kstat_create.
 217  217   * No actual statistical information is stored in this instance of the
 218  218   * ipnet_kstats_t structure.
            *
            * The copy itself is a bcopy() into the per-stack ips_stats, done by
            * ipnet_register_netihook(), which also sizes the kstat as
            * sizeof (ipnet_kstats_t) / sizeof (kstat_named_t).  The entries here are
            * therefore expected to line up one-for-one with the members of
            * ipnet_kstats_t (declared outside this file).
 219  219   */
 220  220  static ipnet_kstats_t stats_template = {
 221  221          { "duplicationFail",    KSTAT_DATA_UINT64 },
 222  222          { "dispatchOk",         KSTAT_DATA_UINT64 },
 223  223          { "dispatchFail",       KSTAT_DATA_UINT64 },
 224  224          { "dispatchHeaderDrop", KSTAT_DATA_UINT64 },
 225  225          { "dispatchDupDrop",    KSTAT_DATA_UINT64 },
 226  226          { "dispatchDeliver",    KSTAT_DATA_UINT64 },
 227  227          { "acceptOk",           KSTAT_DATA_UINT64 },
 228  228          { "acceptFail",         KSTAT_DATA_UINT64 }
 229  229  };
 230  230  
 231  231  /*
 232  232   * Walk the list of physical interfaces on the machine, for each
 233  233   * interface create a new ipnetif_t and add any addresses to it. We
 234  234   * need to do the walk twice, once for IPv4 and once for IPv6.
 235  235   *
 236  236   * The interfaces are destroyed as part of ipnet_stack_fini() for each
 237  237   * stack.  Note that we cannot do this initialization in
 238  238   * ipnet_stack_init(), since ipnet_stack_init() cannot fail.
            *
            * Returns 0 on success.  Otherwise returns the first non-zero value that
            * ipnet_populate_if() produced; the walk over the netstacks stops there.
 239  239   */
 240  240  static int
 241  241  ipnetif_init(void)
 242  242  {
 243  243          netstack_handle_t       nh;
 244  244          netstack_t              *ns;
 245  245          ipnet_stack_t           *ips;
 246  246          int                     ret = 0;
 247  247  
 248  248          netstack_next_init(&nh);
 249  249          while ((ns = netstack_next(&nh)) != NULL) {
 250  250                  ips = ns->netstack_ipnet;
                           /* IPv6 is only attempted once IPv4 has succeeded. */
 251  251                  if ((ret = ipnet_populate_if(ips->ips_ndv4, ips, B_FALSE)) == 0)
 252  252                          ret = ipnet_populate_if(ips->ips_ndv6, ips, B_TRUE);
                           /* Released before the error check so both paths do it. */
 253  253                  netstack_rele(ns);
 254  254                  if (ret != 0)
 255  255                          break;
 256  256          }
 257  257          netstack_next_fini(&nh);
 258  258          return (ret);
 259  259  }
 260  260  
 261  261  /*
 262  262   * Standard module entry points.
 263  263   */
           /*
            * Load-time initialization.  Looks up the driver's major number, creates
            * the minor-number id space and the two single-threaded taskqs, registers
            * with the netstack framework, populates the per-stack interface lists
            * and finally installs the module.
            *
            * Returns 0 on success.  On failure returns ENODEV (no "ipnet" major),
            * ENOMEM (a taskq could not be created) or whatever ipnetif_init() or
            * mod_install() returned, after undoing everything set up so far.
            */
 264  264  int
 265  265  _init(void)
 266  266  {
 267  267          int             ret;
 268  268          boolean_t       netstack_registered = B_FALSE;
 269  269  
 270  270          if ((ipnet_major = ddi_name_to_major("ipnet")) == (major_t)-1)
 271  271                  return (ENODEV);
 272  272          ipnet_minor_space = id_space_create("ipnet_minor_space",
 273  273              IPNET_MINOR_MIN, MAXMIN32);
 274  274  
 275  275          /*
 276  276           * We call ddi_taskq_create() with nthread == 1 to ensure in-order
 277  277           * delivery of packets to clients.  Note that we need to create the
 278  278           * taskqs before calling netstack_register() since ipnet_stack_init()
 279  279           * registers callbacks that use 'em.
 280  280           */
 281  281          ipnet_taskq = ddi_taskq_create(NULL, "ipnet", 1, TASKQ_DEFAULTPRI, 0);
 282  282          ipnet_nicevent_taskq = ddi_taskq_create(NULL, "ipnet_nic_event_queue",
 283  283              1, TASKQ_DEFAULTPRI, 0);
 284  284          if (ipnet_taskq == NULL || ipnet_nicevent_taskq == NULL) {
 285  285                  ret = ENOMEM;
 286  286                  goto done;
 287  287          }
 288  288  
 289  289          netstack_register(NS_IPNET, ipnet_stack_init, NULL, ipnet_stack_fini);
 290  290          netstack_registered = B_TRUE;
 291  291  
 292  292          if ((ret = ipnetif_init()) == 0)
 293  293                  ret = mod_install(&modlinkage);
 294  294  done:
 295  295          if (ret != 0) {
                           /*
                            * Tear down in the reverse order of setup, exactly as
                            * _fini() does.  The callbacks registered through
                            * ipnet_stack_init() use the taskqs, so the netstack
                            * registration must be removed while the taskqs still
                            * exist; destroying the taskqs first would leave those
                            * callbacks pointing at freed taskqs.
                            */
                           if (netstack_registered)
                                   netstack_unregister(NS_IPNET);
                           if (ipnet_nicevent_taskq != NULL)
                                   ddi_taskq_destroy(ipnet_nicevent_taskq);
                           if (ipnet_taskq != NULL)
                                   ddi_taskq_destroy(ipnet_taskq);
 302  302                  id_space_destroy(ipnet_minor_space);
 303  303          }
 304  304          return (ret);
 305  305  }
 306  306  
           /*
            * Unload-time teardown.  If mod_remove() refuses, its error is returned and
            * nothing else is touched.  Otherwise the netstack registration is removed
            * first, then both taskqs and the minor-number id space are destroyed, and
            * 0 is returned.
            */
 307  307  int
 308  308  _fini(void)
 309  309  {
 310  310          int     err;
 311  311  
 312  312          if ((err = mod_remove(&modlinkage)) != 0)
 313  313                  return (err);
 314  314  
 315  315          netstack_unregister(NS_IPNET);
 316  316          ddi_taskq_destroy(ipnet_nicevent_taskq);
 317  317          ddi_taskq_destroy(ipnet_taskq);
 318  318          id_space_destroy(ipnet_minor_space);
 319  319          return (0);
 320  320  }
 321  321  
           /* Module information query; simply defers to mod_info(). */
 322  322  int
 323  323  _info(struct modinfo *modinfop)
 324  324  {
 325  325          return (mod_info(&modlinkage, modinfop));
 326  326  }
 327  327  
           /*
            * Per-stack registration of the NIC event hook and creation of the
            * per-stack kstats.
            *
            * ips_nicevents is initialized to call ipnet_nicevent_cb() with `ips' as
            * its argument.  The IPv4 and IPv6 protocol handles for the stack's zone
            * are then looked up and the hook is registered for NH_NIC_EVENTS on each.
            * If a registration fails, that protocol handle is released and set to
            * NULL and a warning is logged; the other protocol and the kstats are
            * still set up.  Nothing is returned: callers see a failure only as a NULL
            * ips_ndv4/ips_ndv6 (ipnet_populate_if() checks for this).
            *
            * If the zone has no netid the function returns right after HOOK_INIT,
            * so no handles are looked up and no kstat is created.
            */
 328  328  static void
 329  329  ipnet_register_netihook(ipnet_stack_t *ips)
 330  330  {
 331  331          int             ret;
 332  332          zoneid_t        zoneid;
 333  333          netid_t         netid;
 334  334  
 335  335          HOOK_INIT(ips->ips_nicevents, ipnet_nicevent_cb, "ipnet_nicevents",
 336  336              ips);
 337  337  
 338  338          /*
 339  339           * It is possible for an exclusive stack to be in the process of
 340  340           * shutting down here, and the netid and protocol lookups could fail
 341  341           * in that case.
 342  342           */
 343  343          zoneid = netstackid_to_zoneid(ips->ips_netstack->netstack_stackid);
 344  344          if ((netid = net_zoneidtonetid(zoneid)) == -1)
 345  345                  return;
 346  346  
 347  347          if ((ips->ips_ndv4 = net_protocol_lookup(netid, NHF_INET)) != NULL) {
 348  348                  if ((ret = net_hook_register(ips->ips_ndv4, NH_NIC_EVENTS,
 349  349                      ips->ips_nicevents)) != 0) {
 350  350                          VERIFY(net_protocol_release(ips->ips_ndv4) == 0);
 351  351                          ips->ips_ndv4 = NULL;
 352  352                          cmn_err(CE_WARN, "unable to register IPv4 netinfo hooks"
 353  353                              " in zone %d: %d", zoneid, ret);
 354  354                  }
 355  355          }
 356  356          if ((ips->ips_ndv6 = net_protocol_lookup(netid, NHF_INET6)) != NULL) {
 357  357                  if ((ret = net_hook_register(ips->ips_ndv6, NH_NIC_EVENTS,
 358  358                      ips->ips_nicevents)) != 0) {
 359  359                          VERIFY(net_protocol_release(ips->ips_ndv6) == 0);
 360  360                          ips->ips_ndv6 = NULL;
 361  361                          cmn_err(CE_WARN, "unable to register IPv6 netinfo hooks"
 362  362                              " in zone %d: %d", zoneid, ret);
 363  363                  }
 364  364          }
 365  365  
 366  366          /*
 367  367           * Create a local set of kstats for each zone.
 368  368           */
 369  369          ips->ips_kstatp = net_kstat_create(netid, "ipnet", 0, "ipnet_stats",
 370  370              "misc", KSTAT_TYPE_NAMED,
 371  371              sizeof (ipnet_kstats_t) / sizeof (kstat_named_t), 0);
 372  372          if (ips->ips_kstatp != NULL) {
                           /* Seed names/types from the template, then publish. */
 373  373                  bcopy(&stats_template, &ips->ips_stats,
 374  374                      sizeof (ips->ips_stats));
 375  375                  ips->ips_kstatp->ks_data = &ips->ips_stats;
 376  376                  ips->ips_kstatp->ks_private =
 377  377                      (void *)(uintptr_t)ips->ips_netstack->netstack_stackid;
 378  378                  kstat_install(ips->ips_kstatp);
 379  379          } else {
 380  380                  cmn_err(CE_WARN, "net_kstat_create(%s,%s,%s) failed",
 381  381                      "ipnet", "ipnet_stats", "misc");
 382  382          }
 383  383  }
 384  384  
 385  385  /*
 386  386   * This function is called on attach to build an initial view of the
 387  387   * interfaces on the system. It will be called once for IPv4 and once
 388  388   * for IPv6, although there is only one ipnet interface for both IPv4
 389  389   * and IPv6 there are separate address lists.
            *
            * nd is the protocol handle to walk (NULL is accepted and treated as
            * "nothing to do"), ips the stack being populated, and isv6 selects which
            * plumbed flag and address list are updated.
            *
            * Returns 0 on success or ENOMEM if an ipnetif_t could not be created.
            * ips_event_lock is held for the whole walk.
 390  390   */
 391  391  static int
 392  392  ipnet_populate_if(net_handle_t nd, ipnet_stack_t *ips, boolean_t isv6)
 393  393  {
 394  394          phy_if_t        phyif;
 395  395          lif_if_t        lif;
 396  396          ipnetif_t       *ipnetif;
 397  397          char            name[LIFNAMSIZ];
 398  398          boolean_t       new_if = B_FALSE;
 399  399          uint64_t        ifflags;
 400  400          int             ret = 0;
 401  401  
 402  402          /*
 403  403           * If ipnet_register_netihook() was unable to initialize this
 404  404           * stack's net_handle_t, then we cannot populate any interface
 405  405           * information.  This usually happens when we attempted to
 406  406           * grab a net_handle_t as a stack was shutting down.  We don't
 407  407           * want to fail the entire _init() operation because of a
 408  408           * stack shutdown (other stacks will continue to work just
 409  409           * fine), so we silently return success here.
 410  410           */
 411  411          if (nd == NULL)
 412  412                  return (0);
 413  413  
 414  414          /*
 415  415           * Make sure we're not processing NIC events during the
 416  416           * population of our interfaces and address lists.
 417  417           */
 418  418          mutex_enter(&ips->ips_event_lock);
 419  419  
 420  420          for (phyif = net_phygetnext(nd, 0); phyif != 0;
 421  421              phyif = net_phygetnext(nd, phyif)) {
                           /*
                            * new_if must be re-evaluated for every interface.  If it
                            * were left set from an earlier iteration, an interface
                            * found by ipnetif_getby_index() below would skip the
                            * ipnetif_refrele() at the bottom of the loop and the
                            * reference taken by the lookup would be leaked.
                            */
                           new_if = B_FALSE;
 422  422                  if (net_getifname(nd, phyif, name, LIFNAMSIZ) != 0)
 423  423                          continue;
 424  424                  ifflags =  0;
 425  425                  (void) net_getlifflags(nd, phyif, 0, &ifflags);
 426  426                  if ((ipnetif = ipnetif_getby_index(phyif, ips)) == NULL) {
 427  427                          ipnetif = ipnetif_create(name, phyif, ips, ifflags);
 428  428                          if (ipnetif == NULL) {
 429  429                                  ret = ENOMEM;
 430  430                                  goto done;
 431  431                          }
 432  432                          new_if = B_TRUE;
 433  433                  }
 434  434                  ipnetif->if_flags |=
 435  435                      isv6 ? IPNETIF_IPV6PLUMBED : IPNETIF_IPV4PLUMBED;
 436  436  
 437  437                  for (lif = net_lifgetnext(nd, phyif, 0); lif != 0;
 438  438                      lif = net_lifgetnext(nd, phyif, lif)) {
 439  439                          /*
 440  440                           * Skip addresses that aren't up.  We'll add
 441  441                           * them when we receive an NE_LIF_UP event.
 442  442                           */
 443  443                          if (net_getlifflags(nd, phyif, lif, &ifflags) != 0 ||
 444  444                              !(ifflags & IFF_UP))
 445  445                                  continue;
 446  446                          /* Don't add it if we already have it. */
 447  447                          if (ipnet_match_lif(ipnetif, lif, isv6) != NULL)
 448  448                                  continue;
 449  449                          ipnet_add_ifaddr(lif, ipnetif, nd);
 450  450                  }
                           /* Only a looked-up (not freshly created) ipnetif is released. */
 451  451                  if (!new_if)
 452  452                          ipnetif_refrele(ipnetif);
 453  453          }
 454  454  
 455  455  done:
 456  456          mutex_exit(&ips->ips_event_lock);
 457  457          return (ret);
 458  458  }
 459  459  
           /*
            * attach(9E) entry point.  Only DDI_ATTACH is handled.  Creates the "lo0"
            * character minor node with minor number IPNET_MINOR_LO and records the
            * dev_info pointer in ipnet_dip.
            *
            * Returns DDI_SUCCESS, or DDI_FAILURE for any other command or if the
            * minor node cannot be created.
            */
 460  460  static int
 461  461  ipnet_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 462  462  {
 463  463          if (cmd != DDI_ATTACH)
 464  464                  return (DDI_FAILURE);
 465  465  
 466  466          if (ddi_create_minor_node(dip, "lo0", S_IFCHR, IPNET_MINOR_LO,
 467  467              DDI_PSEUDO, 0) == DDI_FAILURE)
 468  468                  return (DDI_FAILURE);
 469  469  
 470  470          ipnet_dip = dip;
 471  471          return (DDI_SUCCESS);
 472  472  }
 473  473  
           /*
            * detach(9E) entry point.  Only DDI_DETACH is handled.  Removes the
            * driver's minor nodes (ddi_remove_minor_node() is called with a NULL
            * name) and forgets the dev_info pointer.
            *
            * Returns DDI_SUCCESS, or DDI_FAILURE for any other command.
            */
 474  474  static int
 475  475  ipnet_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
 476  476  {
 477  477          if (cmd != DDI_DETACH)
 478  478                  return (DDI_FAILURE);
 479  479  
 480  480          ASSERT(dip == ipnet_dip);
 481  481          ddi_remove_minor_node(ipnet_dip, NULL);
 482  482          ipnet_dip = NULL;
 483  483          return (DDI_SUCCESS);
 484  484  }
 485  485  
           /*
            * getinfo(9E) entry point.
            *
            * DDI_INFO_DEVT2INSTANCE always answers instance 0.
            * DDI_INFO_DEVT2DEVINFO answers ipnet_dip, but only while the driver is
            * attached (ipnet_dip != NULL).
            *
            * Returns DDI_SUCCESS when *result was filled in, DDI_FAILURE otherwise
            * (including for any other infocmd).  dip and arg are not used.
            */
 486  486  /* ARGSUSED */
 487  487  static int
 488  488  ipnet_devinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
 489  489  {
 490  490          int     error = DDI_FAILURE;
 491  491  
 492  492          switch (infocmd) {
 493  493          case DDI_INFO_DEVT2INSTANCE:
 494  494                  *result = (void *)0;
 495  495                  error = DDI_SUCCESS;
 496  496                  break;
 497  497          case DDI_INFO_DEVT2DEVINFO:
 498  498                  if (ipnet_dip != NULL) {
 499  499                          *result = ipnet_dip;
 500  500                          error = DDI_SUCCESS;
 501  501                  }
 502  502                  break;
 503  503          }
 504  504          return (error);
 505  505  }
 506  506  
           /*
            * STREAMS open entry point.  Allocates an ipnet_t for the new stream,
            * gives it a freshly allocated minor number, binds it either to loopback
            * mode (minor IPNET_MINOR_LO) or to the ipnetif_t matching *dev, and links
            * it onto the stack's ips_str_list.  On success *dev is rewritten to carry
            * the new minor number and the queue is enabled with qprocson().  The first
            * stream opened on a stack also registers ipnet_input() as the ipobs hook.
            *
            * Returns 0 on success, or:
            *   EACCES   labeled system and the caller is not in the global zone
            *   ENOTSUP  opened as a module (MODOPEN)
            *   EBUSY    the queue already has private data (re-open)
            *   ENOMEM   the ipnet_t could not be allocated
            *   ENODEV   no ipnetif_t for *dev, or it is not visible in the zone
            *
            * oflag is not used.
            */
 507  507  /* ARGSUSED */
 508  508  static int
 509  509  ipnet_open(queue_t *rq, dev_t *dev, int oflag, int sflag, cred_t *crp)
 510  510  {
 511  511          ipnet_t         *ipnet;
 512  512          netstack_t      *ns = NULL;
 513  513          ipnet_stack_t   *ips;
 514  514          int             err = 0;
 515  515          zoneid_t        zoneid = crgetzoneid(crp);
 516  516  
 517  517          /*
 518  518           * If the system is labeled, only the global zone is allowed to open
 519  519           * IP observability nodes.
 520  520           */
 521  521          if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
 522  522                  return (EACCES);
 523  523  
 524  524          /* We don't support open as a module */
 525  525          if (sflag & MODOPEN)
 526  526                  return (ENOTSUP);
 527  527  
 528  528          /* This driver is self-cloning, we don't support re-open. */
 529  529          if (rq->q_ptr != NULL)
 530  530                  return (EBUSY);
 531  531  
 532  532          if ((ipnet = kmem_zalloc(sizeof (*ipnet), KM_NOSLEEP)) == NULL)
 533  533                  return (ENOMEM);
 534  534  
 535  535          VERIFY((ns = netstack_find_by_cred(crp)) != NULL);
 536  536          ips = ns->netstack_ipnet;
 537  537  
 538  538          rq->q_ptr = WR(rq)->q_ptr = ipnet;
 539  539          ipnet->ipnet_rq = rq;
 540  540          ipnet->ipnet_minor = (minor_t)id_alloc(ipnet_minor_space);
 541  541          ipnet->ipnet_zoneid = zoneid;
 542  542          ipnet->ipnet_dlstate = DL_UNBOUND;
 543  543          ipnet->ipnet_ns = ns;
 544  544  
 545  545          /*
 546  546           * We need to hold ips_event_lock here as any NE_LIF_DOWN events need
 547  547           * to be processed after ipnet_if is set and the ipnet_t has been
 548  548           * inserted in the ips_str_list.
 549  549           */
 550  550          mutex_enter(&ips->ips_event_lock);
 551  551          if (getminor(*dev) == IPNET_MINOR_LO) {
 552  552                  ipnet->ipnet_flags |= IPNET_LOMODE;
 553  553                  ipnet->ipnet_acceptfn = ipnet_loaccept;
 554  554          } else {
 555  555                  ipnet->ipnet_acceptfn = ipnet_accept;
 556  556                  ipnet->ipnet_if = ipnetif_getby_dev(*dev, ips);
 557  557                  if (ipnet->ipnet_if == NULL ||
 558  558                      !ipnetif_in_zone(ipnet->ipnet_if, zoneid, ips)) {
 559  559                          err = ENODEV;
 560  560                          goto done;
 561  561                  }
 562  562          }
 563  563  
                   /* The stream list may only change while nobody is walking it. */
 564  564          mutex_enter(&ips->ips_walkers_lock);
 565  565          while (ips->ips_walkers_cnt != 0)
 566  566                  cv_wait(&ips->ips_walkers_cv, &ips->ips_walkers_lock);
 567  567          list_insert_head(&ips->ips_str_list, ipnet);
 568  568          *dev = makedevice(getmajor(*dev), ipnet->ipnet_minor);
 569  569          qprocson(rq);
 570  570  
 571  571          /*
 572  572           * Only register our callback if we're the first open client; we call
 573  573           * unregister in close() for the last open client.
 574  574           */
 575  575          if (list_head(&ips->ips_str_list) == list_tail(&ips->ips_str_list))
 576  576                  ips->ips_hook = ipobs_register_hook(ns, ipnet_input);
 577  577          mutex_exit(&ips->ips_walkers_lock);
 578  578  
 579  579  done:
 580  580          mutex_exit(&ips->ips_event_lock);
                   /*
                    * Failure: undo the netstack lookup, the minor allocation and any
                    * interface reference, then free the ipnet_t.
                    * NOTE(review): rq->q_ptr and WR(rq)->q_ptr were set above and are
                    * not cleared here, so they still point at the freed ipnet_t when
                    * this returns -- presumably harmless because the queue pair is
                    * discarded after a failed open; confirm.
                    */
 581  581          if (err != 0) {
 582  582                  netstack_rele(ns);
 583  583                  id_free(ipnet_minor_space, ipnet->ipnet_minor);
 584  584                  if (ipnet->ipnet_if != NULL)
 585  585                          ipnetif_refrele(ipnet->ipnet_if);
 586  586                  kmem_free(ipnet, sizeof (*ipnet));
 587  587          }
 588  588          return (err);
 589  589  }
 590  590  
           /*
            * STREAMS close entry point; undoes ipnet_open().
            *
            * ipnet_leave_allmulti() is called once for each of IPNET_PROMISC_PHYS and
            * IPNET_PROMISC_MULTI that is set on the stream.  Then, once no walker is
            * active on the stack's stream list, the queue is disabled, the ipnet_t is
            * unlinked from ips_str_list, its interface reference and minor number are
            * given back, and the ipobs hook is unregistered if this was the last
            * stream.  The ipnet_t is freed and the netstack is released last.
            *
            * Always returns 0.
            */
 591  591  /* ARGSUSED */
 592  592  static int
 593  593  ipnet_close(queue_t *rq, int flags __unused, cred_t *credp __unused)
 594  594  {
 595  595          ipnet_t         *ipnet = rq->q_ptr;
 596  596          ipnet_stack_t   *ips = ipnet->ipnet_ns->netstack_ipnet;
 597  597  
 598  598          if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS)
 599  599                  ipnet_leave_allmulti(ipnet->ipnet_if, ips);
 600  600          if (ipnet->ipnet_flags & IPNET_PROMISC_MULTI)
 601  601                  ipnet_leave_allmulti(ipnet->ipnet_if, ips);
 602  602  
                   /* The stream list may only change while nobody is walking it. */
 603  603          mutex_enter(&ips->ips_walkers_lock);
 604  604          while (ips->ips_walkers_cnt != 0)
 605  605                  cv_wait(&ips->ips_walkers_cv, &ips->ips_walkers_lock);
 606  606  
 607  607          qprocsoff(rq);
 608  608  
 609  609          list_remove(&ips->ips_str_list, ipnet);
 610  610          if (ipnet->ipnet_if != NULL)
 611  611                  ipnetif_refrele(ipnet->ipnet_if);
 612  612          id_free(ipnet_minor_space, ipnet->ipnet_minor);
 613  613  
                   /* Last stream on this stack: stop receiving packets from ip. */
 614  614          if (list_is_empty(&ips->ips_str_list)) {
 615  615                  ipobs_unregister_hook(ips->ips_netstack, ips->ips_hook);
 616  616                  ips->ips_hook = NULL;
 617  617          }
 618  618  
 619  619          kmem_free(ipnet, sizeof (*ipnet));
 620  620  
 621  621          mutex_exit(&ips->ips_walkers_lock);
                   /* ipnet is gone by now, so the netstack is reached through ips. */
 622  622          netstack_rele(ips->ips_netstack);
 623  623          return (0);
 624  624  }
 625  625  
           /*
            * Write-side put procedure.  Dispatches on the message type:
            *
            *   M_FLUSH             flush the write queue if FLUSHW is set and clear
            *                       that bit; send the message back upstream if
            *                       FLUSHR is set, otherwise free it
            *   M_PROTO, M_PCPROTO  ipnet_wputnondata()
            *   M_IOCTL             ipnet_ioctl()
            *   M_IOCDATA           ipnet_iocdata()
            *   anything else       freed
            *
            * Always returns 0.
            */
 626  626  static int
 627  627  ipnet_wput(queue_t *q, mblk_t *mp)
 628  628  {
 629  629          switch (mp->b_datap->db_type) {
 630  630          case M_FLUSH:
 631  631                  if (*mp->b_rptr & FLUSHW) {
 632  632                          flushq(q, FLUSHDATA);
 633  633                          *mp->b_rptr &= ~FLUSHW;
 634  634                  }
 635  635                  if (*mp->b_rptr & FLUSHR)
 636  636                          qreply(q, mp);
 637  637                  else
 638  638                          freemsg(mp);
 639  639                  break;
 640  640          case M_PROTO:
 641  641          case M_PCPROTO:
 642  642                  ipnet_wputnondata(q, mp);
 643  643                  break;
 644  644          case M_IOCTL:
 645  645                  ipnet_ioctl(q, mp);
 646  646                  break;
 647  647          case M_IOCDATA:
 648  648                  ipnet_iocdata(q, mp);
 649  649                  break;
 650  650          default:
 651  651                  freemsg(mp);
 652  652                  break;
 653  653          }
 654  654          return (0);
 655  655  }
 656  656  
           /*
            * Read-side service procedure.  Drains the queue, passing each message to
            * the next queue for as long as canputnext() allows.  When it does not,
            * the message is put back at the head of the queue and draining stops.
            * Only M_DATA messages are expected here (asserted).
            *
            * Always returns 0.
            */
 657  657  static int
 658  658  ipnet_rsrv(queue_t *q)
 659  659  {
 660  660          mblk_t  *mp;
 661  661  
 662  662          while ((mp = getq(q)) != NULL) {
 663  663                  ASSERT(DB_TYPE(mp) == M_DATA);
 664  664                  if (canputnext(q)) {
 665  665                          putnext(q, mp);
 666  666                  } else {
 667  667                          (void) putbq(q, mp);
 668  668                          break;
 669  669                  }
 670  670          }
 671  671          return (0);
 672  672  }
 673  673  
           /*
            * M_IOCTL handler.
            *
            *   DLIOCRAW        acknowledged, with no further action taken here
            *   DLIOCIPNETINFO  for a transparent ioctl, requests a copyin of
            *                   sizeof (uint_t) bytes; the result is handled in
            *                   ipnet_iocdata().  An I_STR ioctl is rejected.
            *   anything else   rejected
            *
            * Rejections are negative acknowledgements carrying EINVAL.
            */
 674  674  static void
 675  675  ipnet_ioctl(queue_t *q, mblk_t *mp)
 676  676  {
 677  677          struct iocblk   *iocp = (struct iocblk *)mp->b_rptr;
 678  678  
 679  679          switch (iocp->ioc_cmd) {
 680  680          case DLIOCRAW:
 681  681                  miocack(q, mp, 0, 0);
 682  682                  break;
 683  683          case DLIOCIPNETINFO:
 684  684                  if (iocp->ioc_count == TRANSPARENT) {
 685  685                          mcopyin(mp, NULL, sizeof (uint_t), NULL);
 686  686                          qreply(q, mp);
 687  687                          break;
 688  688                  }
 689  689                  /* We don't support I_STR with DLIOCIPNETINFO. */
 690  690                  /* FALLTHROUGH */
 691  691          default:
 692  692                  miocnak(q, mp, 0, EINVAL);
 693  693                  break;
 694  694          }
 695  695  }
 696  696  
 697  697  static void
 698  698  ipnet_iocdata(queue_t *q, mblk_t *mp)
 699  699  {
 700  700          struct iocblk   *iocp = (struct iocblk *)mp->b_rptr;
 701  701          ipnet_t *ipnet = q->q_ptr;
 702  702  
 703  703          switch (iocp->ioc_cmd) {
 704  704          case DLIOCIPNETINFO:
 705  705                  if (*(int *)mp->b_cont->b_rptr == 1)
 706  706                          ipnet->ipnet_flags |= IPNET_INFO;
 707  707                  else if (*(int *)mp->b_cont->b_rptr == 0)
 708  708                          ipnet->ipnet_flags &= ~IPNET_INFO;
 709  709                  else
 710  710                          goto iocnak;
 711  711                  miocack(q, mp, 0, DL_IPNETINFO_VERSION);
 712  712                  break;
 713  713          default:
 714  714  iocnak:
 715  715                  miocnak(q, mp, 0, EINVAL);
 716  716                  break;
 717  717          }
 718  718  }
 719  719  
 720  720  static void
 721  721  ipnet_wputnondata(queue_t *q, mblk_t *mp)
 722  722  {
 723  723          union DL_primitives     *dlp = (union DL_primitives *)mp->b_rptr;
 724  724          t_uscalar_t             prim = dlp->dl_primitive;
 725  725  
 726  726          switch (prim) {
 727  727          case DL_INFO_REQ:
 728  728                  ipnet_inforeq(q, mp);
 729  729                  break;
 730  730          case DL_UNBIND_REQ:
 731  731                  ipnet_unbindreq(q, mp);
 732  732                  break;
 733  733          case DL_BIND_REQ:
 734  734                  ipnet_bindreq(q, mp);
 735  735                  break;
 736  736          case DL_PROMISCON_REQ:
 737  737                  ipnet_dlpromisconreq(q, mp);
 738  738                  break;
 739  739          case DL_PROMISCOFF_REQ:
 740  740                  ipnet_dlpromiscoffreq(q, mp);
 741  741                  break;
 742  742          case DL_UNITDATA_REQ:
 743  743          case DL_DETACH_REQ:
 744  744          case DL_PHYS_ADDR_REQ:
 745  745          case DL_SET_PHYS_ADDR_REQ:
 746  746          case DL_ENABMULTI_REQ:
 747  747          case DL_DISABMULTI_REQ:
 748  748          case DL_ATTACH_REQ:
 749  749                  dlerrorack(q, mp, prim, DL_UNSUPPORTED, 0);
 750  750                  break;
 751  751          default:
 752  752                  dlerrorack(q, mp, prim, DL_BADPRIM, 0);
 753  753                  break;
 754  754          }
 755  755  }
 756  756  
 757  757  static void
 758  758  ipnet_inforeq(queue_t *q, mblk_t *mp)
 759  759  {
 760  760          dl_info_ack_t   *dlip;
 761  761          size_t          size = sizeof (dl_info_ack_t) + sizeof (ushort_t);
 762  762  
 763  763          if (MBLKL(mp) < DL_INFO_REQ_SIZE) {
 764  764                  dlerrorack(q, mp, DL_INFO_REQ, DL_BADPRIM, 0);
 765  765                  return;
 766  766          }
 767  767  
 768  768          if ((mp = mexchange(q, mp, size, M_PCPROTO, DL_INFO_ACK)) == NULL)
 769  769                  return;
 770  770  
 771  771          dlip = (dl_info_ack_t *)mp->b_rptr;
 772  772          *dlip = ipnet_infoack;
 773  773          qreply(q, mp);
 774  774  }
 775  775  
 776  776  static void
 777  777  ipnet_bindreq(queue_t *q, mblk_t *mp)
 778  778  {
 779  779          union DL_primitives     *dlp = (union DL_primitives *)mp->b_rptr;
 780  780          ipnet_t                 *ipnet = q->q_ptr;
 781  781  
 782  782          if (MBLKL(mp) < DL_BIND_REQ_SIZE) {
 783  783                  dlerrorack(q, mp, DL_BIND_REQ, DL_BADPRIM, 0);
 784  784                  return;
 785  785          }
 786  786  
 787  787          switch (dlp->bind_req.dl_sap) {
 788  788          case 0 :
 789  789                  ipnet->ipnet_family = AF_UNSPEC;
 790  790                  break;
 791  791          case IPV4_VERSION :
 792  792                  ipnet->ipnet_family = AF_INET;
 793  793                  break;
 794  794          case IPV6_VERSION :
 795  795                  ipnet->ipnet_family = AF_INET6;
 796  796                  break;
 797  797          default :
 798  798                  dlerrorack(q, mp, DL_BIND_REQ, DL_BADSAP, 0);
 799  799                  return;
 800  800                  /*NOTREACHED*/
 801  801          }
 802  802  
 803  803          ipnet->ipnet_dlstate = DL_IDLE;
 804  804          dlbindack(q, mp, dlp->bind_req.dl_sap, 0, 0, 0, 0);
 805  805  }
 806  806  
 807  807  static void
 808  808  ipnet_unbindreq(queue_t *q, mblk_t *mp)
 809  809  {
 810  810          ipnet_t *ipnet = q->q_ptr;
 811  811  
 812  812          if (MBLKL(mp) < DL_UNBIND_REQ_SIZE) {
 813  813                  dlerrorack(q, mp, DL_UNBIND_REQ, DL_BADPRIM, 0);
 814  814                  return;
 815  815          }
 816  816  
 817  817          if (ipnet->ipnet_dlstate != DL_IDLE) {
 818  818                  dlerrorack(q, mp, DL_UNBIND_REQ, DL_OUTSTATE, 0);
 819  819          } else {
 820  820                  ipnet->ipnet_dlstate = DL_UNBOUND;
 821  821                  ipnet->ipnet_family = AF_UNSPEC;
 822  822                  dlokack(q, mp, DL_UNBIND_REQ);
 823  823          }
 824  824  }
 825  825  
 826  826  static void
 827  827  ipnet_dlpromisconreq(queue_t *q, mblk_t *mp)
 828  828  {
 829  829          ipnet_t         *ipnet = q->q_ptr;
 830  830          t_uscalar_t     level;
 831  831          int             err;
 832  832  
 833  833          if (MBLKL(mp) < DL_PROMISCON_REQ_SIZE) {
 834  834                  dlerrorack(q, mp, DL_PROMISCON_REQ, DL_BADPRIM, 0);
 835  835                  return;
 836  836          }
 837  837  
 838  838          if (ipnet->ipnet_flags & IPNET_LOMODE) {
 839  839                  dlokack(q, mp, DL_PROMISCON_REQ);
 840  840                  return;
 841  841          }
 842  842  
 843  843          level = ((dl_promiscon_req_t *)mp->b_rptr)->dl_level;
 844  844          if (level == DL_PROMISC_PHYS || level == DL_PROMISC_MULTI) {
 845  845                  if ((err = ipnet_join_allmulti(ipnet->ipnet_if,
 846  846                      ipnet->ipnet_ns->netstack_ipnet)) != 0) {
 847  847                          dlerrorack(q, mp, DL_PROMISCON_REQ, DL_SYSERR, err);
 848  848                          return;
 849  849                  }
 850  850          }
 851  851  
 852  852          switch (level) {
 853  853          case DL_PROMISC_PHYS:
 854  854                  ipnet->ipnet_flags |= IPNET_PROMISC_PHYS;
 855  855                  break;
 856  856          case DL_PROMISC_SAP:
 857  857                  ipnet->ipnet_flags |= IPNET_PROMISC_SAP;
 858  858                  break;
 859  859          case DL_PROMISC_MULTI:
 860  860                  ipnet->ipnet_flags |= IPNET_PROMISC_MULTI;
 861  861                  break;
 862  862          default:
 863  863                  dlerrorack(q, mp, DL_PROMISCON_REQ, DL_BADPRIM, 0);
 864  864                  return;
 865  865          }
 866  866  
 867  867          dlokack(q, mp, DL_PROMISCON_REQ);
 868  868  }
 869  869  
 870  870  static void
 871  871  ipnet_dlpromiscoffreq(queue_t *q, mblk_t *mp)
 872  872  {
 873  873          ipnet_t         *ipnet = q->q_ptr;
 874  874          t_uscalar_t     level;
 875  875          uint16_t        orig_ipnet_flags = ipnet->ipnet_flags;
 876  876  
 877  877          if (MBLKL(mp) < DL_PROMISCOFF_REQ_SIZE) {
 878  878                  dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_BADPRIM, 0);
 879  879                  return;
 880  880          }
 881  881  
 882  882          if (ipnet->ipnet_flags & IPNET_LOMODE) {
 883  883                  dlokack(q, mp, DL_PROMISCOFF_REQ);
 884  884                  return;
 885  885          }
 886  886  
 887  887          level = ((dl_promiscon_req_t *)mp->b_rptr)->dl_level;
 888  888          switch (level) {
 889  889          case DL_PROMISC_PHYS:
 890  890                  if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS)
 891  891                          ipnet->ipnet_flags &= ~IPNET_PROMISC_PHYS;
 892  892                  break;
 893  893          case DL_PROMISC_SAP:
 894  894                  if (ipnet->ipnet_flags & IPNET_PROMISC_SAP)
 895  895                          ipnet->ipnet_flags &= ~IPNET_PROMISC_SAP;
 896  896                  break;
 897  897          case DL_PROMISC_MULTI:
 898  898                  if (ipnet->ipnet_flags & IPNET_PROMISC_MULTI)
 899  899                          ipnet->ipnet_flags &= ~IPNET_PROMISC_MULTI;
 900  900                  break;
 901  901          default:
 902  902                  dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_BADPRIM, 0);
 903  903                  return;
 904  904          }
 905  905  
 906  906          if (orig_ipnet_flags == ipnet->ipnet_flags) {
 907  907                  dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_NOTENAB, 0);
 908  908                  return;
 909  909          }
 910  910  
 911  911          if (level == DL_PROMISC_PHYS || level == DL_PROMISC_MULTI) {
 912  912                  ipnet_leave_allmulti(ipnet->ipnet_if,
 913  913                      ipnet->ipnet_ns->netstack_ipnet);
 914  914          }
 915  915  
 916  916          dlokack(q, mp, DL_PROMISCOFF_REQ);
 917  917  }
 918  918  
 919  919  static int
 920  920  ipnet_join_allmulti(ipnetif_t *ipnetif, ipnet_stack_t *ips)
 921  921  {
 922  922          int             err = 0;
 923  923          ip_stack_t      *ipst = ips->ips_netstack->netstack_ip;
 924  924          uint64_t        index = ipnetif->if_index;
 925  925  
 926  926          mutex_enter(&ips->ips_event_lock);
 927  927          if (ipnetif->if_multicnt == 0) {
 928  928                  ASSERT((ipnetif->if_flags &
 929  929                      (IPNETIF_IPV4ALLMULTI | IPNETIF_IPV6ALLMULTI)) == 0);
 930  930                  if (ipnetif->if_flags & IPNETIF_IPV4PLUMBED) {
 931  931                          err = ip_join_allmulti(index, B_FALSE, ipst);
 932  932                          if (err != 0)
 933  933                                  goto done;
 934  934                          ipnetif->if_flags |= IPNETIF_IPV4ALLMULTI;
 935  935                  }
 936  936                  if (ipnetif->if_flags & IPNETIF_IPV6PLUMBED) {
 937  937                          err = ip_join_allmulti(index, B_TRUE, ipst);
 938  938                          if (err != 0 &&
 939  939                              (ipnetif->if_flags & IPNETIF_IPV4ALLMULTI)) {
 940  940                                  (void) ip_leave_allmulti(index, B_FALSE, ipst);
 941  941                                  ipnetif->if_flags &= ~IPNETIF_IPV4ALLMULTI;
 942  942                                  goto done;
 943  943                          }
 944  944                          ipnetif->if_flags |= IPNETIF_IPV6ALLMULTI;
 945  945                  }
 946  946          }
 947  947          ipnetif->if_multicnt++;
 948  948  
 949  949  done:
 950  950          mutex_exit(&ips->ips_event_lock);
 951  951          return (err);
 952  952  }
 953  953  
 954  954  static void
 955  955  ipnet_leave_allmulti(ipnetif_t *ipnetif, ipnet_stack_t *ips)
 956  956  {
 957  957          int             err;
 958  958          ip_stack_t      *ipst = ips->ips_netstack->netstack_ip;
 959  959          uint64_t        index = ipnetif->if_index;
 960  960  
 961  961          mutex_enter(&ips->ips_event_lock);
 962  962          ASSERT(ipnetif->if_multicnt != 0);
 963  963          if (--ipnetif->if_multicnt == 0) {
 964  964                  if (ipnetif->if_flags & IPNETIF_IPV4ALLMULTI) {
 965  965                          err = ip_leave_allmulti(index, B_FALSE, ipst);
 966  966                          ASSERT(err == 0 || err == ENODEV);
 967  967                          ipnetif->if_flags &= ~IPNETIF_IPV4ALLMULTI;
 968  968                  }
 969  969                  if (ipnetif->if_flags & IPNETIF_IPV6ALLMULTI) {
 970  970                          err = ip_leave_allmulti(index, B_TRUE, ipst);
 971  971                          ASSERT(err == 0 || err == ENODEV);
 972  972                          ipnetif->if_flags &= ~IPNETIF_IPV6ALLMULTI;
 973  973                  }
 974  974          }
 975  975          mutex_exit(&ips->ips_event_lock);
 976  976  }
 977  977  
 978  978  /*
 979  979   * Allocate a new mblk_t and put a dl_ipnetinfo_t in it.
 980  980   * The structure it copies the header information from,
 981  981   * hook_pkt_observe_t, is constructed using network byte
 982  982   * order in ipobs_hook(), so there is no conversion here.
 983  983   */
 984  984  static mblk_t *
 985  985  ipnet_addheader(hook_pkt_observe_t *hdr, mblk_t *mp)
 986  986  {
 987  987          mblk_t          *dlhdr;
 988  988          dl_ipnetinfo_t  *dl;
 989  989  
 990  990          if ((dlhdr = allocb(sizeof (dl_ipnetinfo_t), BPRI_HI)) == NULL) {
 991  991                  freemsg(mp);
 992  992                  return (NULL);
 993  993          }
 994  994          dl = (dl_ipnetinfo_t *)dlhdr->b_rptr;
 995  995          dl->dli_version = DL_IPNETINFO_VERSION;
 996  996          dl->dli_family = hdr->hpo_family;
 997  997          dl->dli_htype = hdr->hpo_htype;
 998  998          dl->dli_pktlen = hdr->hpo_pktlen;
 999  999          dl->dli_ifindex = hdr->hpo_ifindex;
1000 1000          dl->dli_grifindex = hdr->hpo_grifindex;
1001 1001          dl->dli_zsrc = hdr->hpo_zsrc;
1002 1002          dl->dli_zdst = hdr->hpo_zdst;
1003 1003          dlhdr->b_wptr += sizeof (*dl);
1004 1004          dlhdr->b_cont = mp;
1005 1005  
1006 1006          return (dlhdr);
1007 1007  }
1008 1008  
1009 1009  static ipnet_addrtype_t
1010 1010  ipnet_get_addrtype(ipnet_t *ipnet, ipnet_addrp_t *addr)
1011 1011  {
1012 1012          list_t                  *list;
1013 1013          ipnetif_t               *ipnetif = ipnet->ipnet_if;
1014 1014          ipnetif_addr_t          *ifaddr;
1015 1015          ipnet_addrtype_t        addrtype = IPNETADDR_UNKNOWN;
1016 1016  
1017 1017          /* First check if the address is multicast or limited broadcast. */
1018 1018          switch (addr->iap_family) {
1019 1019          case AF_INET:
1020 1020                  if (CLASSD(*(addr->iap_addr4)) ||
1021 1021                      *(addr->iap_addr4) == INADDR_BROADCAST)
1022 1022                          return (IPNETADDR_MBCAST);
1023 1023                  break;
1024 1024          case AF_INET6:
1025 1025                  if (IN6_IS_ADDR_MULTICAST(addr->iap_addr6))
1026 1026                          return (IPNETADDR_MBCAST);
1027 1027                  break;
1028 1028          }
1029 1029  
1030 1030          /*
1031 1031           * Walk the address list to see if the address belongs to our
1032 1032           * interface or is one of our subnet broadcast addresses.
1033 1033           */
1034 1034          mutex_enter(&ipnetif->if_addr_lock);
1035 1035          list = (addr->iap_family == AF_INET) ?
1036 1036              &ipnetif->if_ip4addr_list : &ipnetif->if_ip6addr_list;
1037 1037          for (ifaddr = list_head(list);
1038 1038              ifaddr != NULL && addrtype == IPNETADDR_UNKNOWN;
1039 1039              ifaddr = list_next(list, ifaddr)) {
1040 1040                  /*
1041 1041                   * If we're not in the global zone, then only look at
1042 1042                   * addresses in our zone.
1043 1043                   */
1044 1044                  if (ipnet->ipnet_zoneid != GLOBAL_ZONEID &&
1045 1045                      ipnet->ipnet_zoneid != ifaddr->ifa_zone)
1046 1046                          continue;
1047 1047                  switch (addr->iap_family) {
1048 1048                  case AF_INET:
1049 1049                          if (ifaddr->ifa_ip4addr != INADDR_ANY &&
1050 1050                              *(addr->iap_addr4) == ifaddr->ifa_ip4addr)
1051 1051                                  addrtype = IPNETADDR_MYADDR;
1052 1052                          else if (ifaddr->ifa_brdaddr != INADDR_ANY &&
1053 1053                              *(addr->iap_addr4) == ifaddr->ifa_brdaddr)
1054 1054                                  addrtype = IPNETADDR_MBCAST;
1055 1055                          break;
1056 1056                  case AF_INET6:
1057 1057                          if (IN6_ARE_ADDR_EQUAL(addr->iap_addr6,
1058 1058                              &ifaddr->ifa_ip6addr))
1059 1059                                  addrtype = IPNETADDR_MYADDR;
1060 1060                          break;
1061 1061                  }
1062 1062          }
1063 1063          mutex_exit(&ipnetif->if_addr_lock);
1064 1064  
1065 1065          return (addrtype);
1066 1066  }
1067 1067  
1068 1068  /*
1069 1069   * Verify if the packet contained in hdr should be passed up to the
1070 1070   * ipnet client stream.
1071 1071   */
1072 1072  static boolean_t
1073 1073  ipnet_accept(ipnet_t *ipnet, hook_pkt_observe_t *hdr, ipnet_addrp_t *src,
1074 1074      ipnet_addrp_t *dst)
1075 1075  {
1076 1076          boolean_t               obsif;
1077 1077          uint64_t                ifindex = ipnet->ipnet_if->if_index;
1078 1078          ipnet_addrtype_t        srctype;
1079 1079          ipnet_addrtype_t        dsttype;
1080 1080  
1081 1081          srctype = ipnet_get_addrtype(ipnet, src);
1082 1082          dsttype = ipnet_get_addrtype(ipnet, dst);
1083 1083  
1084 1084          /*
1085 1085           * If the packet's ifindex matches ours, or the packet's group ifindex
1086 1086           * matches ours, it's on the interface we're observing.  (Thus,
1087 1087           * observing on the group ifindex matches all ifindexes in the group.)
1088 1088           */
1089 1089          obsif = (ntohl(hdr->hpo_ifindex) == ifindex ||
1090 1090              ntohl(hdr->hpo_grifindex) == ifindex);
1091 1091  
1092 1092          DTRACE_PROBE5(ipnet_accept__addr,
1093 1093              ipnet_addrtype_t, srctype, ipnet_addrp_t *, src,
1094 1094              ipnet_addrtype_t, dsttype, ipnet_addrp_t *, dst,
1095 1095              boolean_t, obsif);
1096 1096  
1097 1097          /*
1098 1098           * Do not allow an ipnet stream to see packets that are not from or to
1099 1099           * its zone.  The exception is when zones are using the shared stack
1100 1100           * model.  In this case, streams in the global zone have visibility
1101 1101           * into other shared-stack zones, and broadcast and multicast traffic
1102 1102           * is visible by all zones in the stack.
1103 1103           */
1104 1104          if (ipnet->ipnet_zoneid != GLOBAL_ZONEID &&
1105 1105              dsttype != IPNETADDR_MBCAST) {
1106 1106                  if (ipnet->ipnet_zoneid != ntohl(hdr->hpo_zsrc) &&
1107 1107                      ipnet->ipnet_zoneid != ntohl(hdr->hpo_zdst))
1108 1108                          return (B_FALSE);
1109 1109          }
1110 1110  
1111 1111          /*
1112 1112           * If DL_PROMISC_SAP isn't enabled, then the bound SAP must match the
1113 1113           * packet's IP version.
1114 1114           */
1115 1115          if (!(ipnet->ipnet_flags & IPNET_PROMISC_SAP) &&
1116 1116              ipnet->ipnet_family != hdr->hpo_family)
1117 1117                  return (B_FALSE);
1118 1118  
1119 1119          /* If the destination address is ours, then accept the packet. */
1120 1120          if (dsttype == IPNETADDR_MYADDR)
1121 1121                  return (B_TRUE);
1122 1122  
1123 1123          /*
1124 1124           * If DL_PROMISC_PHYS is enabled, then we can see all packets that are
1125 1125           * sent or received on the interface we're observing, or packets that
1126 1126           * have our source address (this allows us to see packets we send).
1127 1127           */
1128 1128          if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS) {
1129 1129                  if (srctype == IPNETADDR_MYADDR || obsif)
1130 1130                          return (B_TRUE);
1131 1131          }
1132 1132  
1133 1133          /*
1134 1134           * We accept multicast and broadcast packets transmitted or received
1135 1135           * on the interface we're observing.
1136 1136           */
1137 1137          if (dsttype == IPNETADDR_MBCAST && obsif)
1138 1138                  return (B_TRUE);
1139 1139  
1140 1140          return (B_FALSE);
1141 1141  }
1142 1142  
1143 1143  /*
1144 1144   * Verify if the packet contained in hdr should be passed up to the ipnet
1145 1145   * client stream that's in IPNET_LOMODE.
1146 1146   */
1147 1147  /* ARGSUSED */
1148 1148  static boolean_t
1149 1149  ipnet_loaccept(ipnet_t *ipnet, hook_pkt_observe_t *hdr, ipnet_addrp_t *src,
1150 1150      ipnet_addrp_t *dst)
1151 1151  {
1152 1152          if (hdr->hpo_htype != htons(IPOBS_HOOK_LOCAL)) {
1153 1153                  /*
1154 1154                   * ipnet_if is only NULL for IPNET_MINOR_LO devices.
1155 1155                   */
1156 1156                  if (ipnet->ipnet_if == NULL)
1157 1157                          return (B_FALSE);
1158 1158          }
1159 1159  
1160 1160          /*
1161 1161           * An ipnet stream must not see packets that are not from/to its zone.
1162 1162           */
1163 1163          if (ipnet->ipnet_zoneid != GLOBAL_ZONEID) {
1164 1164                  if (ipnet->ipnet_zoneid != ntohl(hdr->hpo_zsrc) &&
1165 1165                      ipnet->ipnet_zoneid != ntohl(hdr->hpo_zdst))
1166 1166                          return (B_FALSE);
1167 1167          }
1168 1168  
1169 1169          return (ipnet->ipnet_family == AF_UNSPEC ||
1170 1170              ipnet->ipnet_family == hdr->hpo_family);
1171 1171  }
1172 1172  
1173 1173  static void
1174 1174  ipnet_dispatch(void *arg)
1175 1175  {
1176 1176          mblk_t                  *mp = arg;
1177 1177          hook_pkt_observe_t      *hdr = (hook_pkt_observe_t *)mp->b_rptr;
1178 1178          ipnet_t                 *ipnet;
1179 1179          mblk_t                  *netmp;
1180 1180          list_t                  *list;
1181 1181          ipnet_stack_t           *ips;
1182 1182          ipnet_addrp_t           src;
1183 1183          ipnet_addrp_t           dst;
1184 1184  
1185 1185          ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;
1186 1186  
1187 1187          netmp = hdr->hpo_pkt->b_cont;
1188 1188          src.iap_family = hdr->hpo_family;
1189 1189          dst.iap_family = hdr->hpo_family;
1190 1190  
1191 1191          if (hdr->hpo_family == AF_INET) {
1192 1192                  src.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_src;
1193 1193                  dst.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_dst;
1194 1194          } else {
1195 1195                  src.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_src;
1196 1196                  dst.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_dst;
1197 1197          }
1198 1198  
1199 1199          ipnet_walkers_inc(ips);
1200 1200  
1201 1201          list = &ips->ips_str_list;
1202 1202          for (ipnet = list_head(list); ipnet != NULL;
1203 1203              ipnet = list_next(list, ipnet)) {
1204 1204                  if (!(*ipnet->ipnet_acceptfn)(ipnet, hdr, &src, &dst)) {
1205 1205                          IPSK_BUMP(ips, ik_acceptFail);
1206 1206                          continue;
1207 1207                  }
1208 1208                  IPSK_BUMP(ips, ik_acceptOk);
1209 1209  
1210 1210                  if (list_next(list, ipnet) == NULL) {
1211 1211                          netmp = hdr->hpo_pkt->b_cont;
1212 1212                          hdr->hpo_pkt->b_cont = NULL;
1213 1213                  } else {
1214 1214                          if ((netmp = dupmsg(hdr->hpo_pkt->b_cont)) == NULL &&
1215 1215                              (netmp = copymsg(hdr->hpo_pkt->b_cont)) == NULL) {
1216 1216                                  IPSK_BUMP(ips, ik_duplicationFail);
1217 1217                                  continue;
1218 1218                          }
1219 1219                  }
1220 1220  
1221 1221                  if (ipnet->ipnet_flags & IPNET_INFO) {
1222 1222                          if ((netmp = ipnet_addheader(hdr, netmp)) == NULL) {
1223 1223                                  IPSK_BUMP(ips, ik_dispatchHeaderDrop);
1224 1224                                  continue;
1225 1225                          }
1226 1226                  }
1227 1227  
1228 1228                  if (ipnet->ipnet_rq->q_first == NULL &&
1229 1229                      canputnext(ipnet->ipnet_rq)) {
1230 1230                          putnext(ipnet->ipnet_rq, netmp);
1231 1231                          IPSK_BUMP(ips, ik_dispatchDeliver);
1232 1232                  } else if (canput(ipnet->ipnet_rq)) {
1233 1233                          (void) putq(ipnet->ipnet_rq, netmp);
1234 1234                          IPSK_BUMP(ips, ik_dispatchDeliver);
1235 1235                  } else {
1236 1236                          freemsg(netmp);
1237 1237                          IPSK_BUMP(ips, ik_dispatchPutDrop);
1238 1238                  }
1239 1239          }
1240 1240  
1241 1241          ipnet_walkers_dec(ips);
1242 1242  
1243 1243          freemsg(mp);
1244 1244  }
1245 1245  
1246 1246  static void
1247 1247  ipnet_input(mblk_t *mp)
1248 1248  {
1249 1249          hook_pkt_observe_t      *hdr = (hook_pkt_observe_t *)mp->b_rptr;
1250 1250          ipnet_stack_t           *ips;
1251 1251  
1252 1252          ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;
1253 1253  
1254 1254          if (ddi_taskq_dispatch(ipnet_taskq, ipnet_dispatch, mp, DDI_NOSLEEP) !=
1255 1255              DDI_SUCCESS) {
1256 1256                  IPSK_BUMP(ips, ik_dispatchFail);
1257 1257                  freemsg(mp);
1258 1258          } else {
1259 1259                  IPSK_BUMP(ips, ik_dispatchOk);
1260 1260          }
1261 1261  }
1262 1262  
1263 1263  static ipnetif_t *
1264 1264  ipnet_alloc_if(ipnet_stack_t *ips)
1265 1265  {
1266 1266          ipnetif_t       *ipnetif;
1267 1267  
1268 1268          if ((ipnetif = kmem_zalloc(sizeof (*ipnetif), KM_NOSLEEP)) == NULL)
1269 1269                  return (NULL);
1270 1270  
1271 1271          mutex_init(&ipnetif->if_addr_lock, NULL, MUTEX_DEFAULT, 0);
1272 1272          list_create(&ipnetif->if_ip4addr_list, sizeof (ipnetif_addr_t),
1273 1273              offsetof(ipnetif_addr_t, ifa_link));
1274 1274          list_create(&ipnetif->if_ip6addr_list, sizeof (ipnetif_addr_t),
1275 1275              offsetof(ipnetif_addr_t, ifa_link));
1276 1276          mutex_init(&ipnetif->if_reflock, NULL, MUTEX_DEFAULT, 0);
1277 1277  
1278 1278          ipnetif->if_stackp = ips;
1279 1279  
1280 1280          return (ipnetif);
1281 1281  }
1282 1282  
1283 1283  /*
1284 1284   * Create a new ipnetif_t and new minor node for it.  If creation is
1285 1285   * successful the new ipnetif_t is inserted into an avl_tree
1286 1286   * containing ipnetif's for this stack instance.
1287 1287   */
1288 1288  static ipnetif_t *
1289 1289  ipnetif_create(const char *name, uint64_t index, ipnet_stack_t *ips,
1290 1290      uint64_t ifflags)
1291 1291  {
1292 1292          ipnetif_t       *ipnetif;
1293 1293          avl_index_t     where = 0;
1294 1294          minor_t         ifminor;
1295 1295  
1296 1296          /*
1297 1297           * Because ipnetif_create() can be called from a NIC event
1298 1298           * callback, it should not block.
1299 1299           */
1300 1300          ifminor = (minor_t)id_alloc_nosleep(ipnet_minor_space);
1301 1301          if (ifminor == (minor_t)-1)
1302 1302                  return (NULL);
1303 1303          if ((ipnetif = ipnet_alloc_if(ips)) == NULL) {
1304 1304                  id_free(ipnet_minor_space, ifminor);
1305 1305                  return (NULL);
1306 1306          }
1307 1307  
1308 1308          (void) strlcpy(ipnetif->if_name, name, LIFNAMSIZ);
1309 1309          ipnetif->if_index = (uint_t)index;
1310 1310          ipnetif->if_zoneid = netstack_get_zoneid(ips->ips_netstack);
1311 1311          ipnetif->if_dev = makedevice(ipnet_major, ifminor);
1312 1312  
1313 1313          ipnetif->if_refcnt = 1;
1314 1314          if ((ifflags & IFF_LOOPBACK) != 0)
1315 1315                  ipnetif->if_flags = IPNETIF_LOOPBACK;
1316 1316  
1317 1317          mutex_enter(&ips->ips_avl_lock);
1318 1318          VERIFY(avl_find(&ips->ips_avl_by_index, &index, &where) == NULL);
1319 1319          avl_insert(&ips->ips_avl_by_index, ipnetif, where);
1320 1320          VERIFY(avl_find(&ips->ips_avl_by_name, (void *)name, &where) == NULL);
1321 1321          avl_insert(&ips->ips_avl_by_name, ipnetif, where);
1322 1322          mutex_exit(&ips->ips_avl_lock);
1323 1323  
1324 1324          return (ipnetif);
1325 1325  }
1326 1326  
1327 1327  static void
1328 1328  ipnetif_remove(ipnetif_t *ipnetif, ipnet_stack_t *ips)
1329 1329  {
1330 1330          ipnet_t *ipnet;
1331 1331  
1332 1332          ipnet_walkers_inc(ips);
1333 1333          /* Send a SIGHUP to all open streams associated with this ipnetif. */
1334 1334          for (ipnet = list_head(&ips->ips_str_list); ipnet != NULL;
1335 1335              ipnet = list_next(&ips->ips_str_list, ipnet)) {
1336 1336                  if (ipnet->ipnet_if == ipnetif)
1337 1337                          (void) putnextctl(ipnet->ipnet_rq, M_HANGUP);
1338 1338          }
1339 1339          ipnet_walkers_dec(ips);
1340 1340          mutex_enter(&ips->ips_avl_lock);
1341 1341          avl_remove(&ips->ips_avl_by_index, ipnetif);
1342 1342          avl_remove(&ips->ips_avl_by_name, ipnetif);
1343 1343          mutex_exit(&ips->ips_avl_lock);
1344 1344          /*
1345 1345           * Release the reference we implicitly held in ipnetif_create().
1346 1346           */
1347 1347          ipnetif_refrele(ipnetif);
1348 1348  }
1349 1349  
1350 1350  static void
1351 1351  ipnet_purge_addrlist(list_t *addrlist)
1352 1352  {
1353 1353          ipnetif_addr_t  *ifa;
1354 1354  
1355 1355          while ((ifa = list_head(addrlist)) != NULL) {
1356 1356                  list_remove(addrlist, ifa);
1357 1357                  if (ifa->ifa_shared != NULL)
1358 1358                          ipnetif_clone_release(ifa->ifa_shared);
1359 1359                  kmem_free(ifa, sizeof (*ifa));
1360 1360          }
1361 1361  }
1362 1362  
1363 1363  static void
1364 1364  ipnetif_free(ipnetif_t *ipnetif)
1365 1365  {
1366 1366          ASSERT(ipnetif->if_refcnt == 0);
1367 1367          ASSERT(ipnetif->if_sharecnt == 0);
1368 1368  
1369 1369          /* Remove IPv4/v6 address lists from the ipnetif */
1370 1370          ipnet_purge_addrlist(&ipnetif->if_ip4addr_list);
1371 1371          list_destroy(&ipnetif->if_ip4addr_list);
1372 1372          ipnet_purge_addrlist(&ipnetif->if_ip6addr_list);
1373 1373          list_destroy(&ipnetif->if_ip6addr_list);
1374 1374          mutex_destroy(&ipnetif->if_addr_lock);
1375 1375          mutex_destroy(&ipnetif->if_reflock);
1376 1376          if (ipnetif->if_dev != 0)
1377 1377                  id_free(ipnet_minor_space, getminor(ipnetif->if_dev));
1378 1378          kmem_free(ipnetif, sizeof (*ipnetif));
1379 1379  }
1380 1380  
1381 1381  /*
1382 1382   * Create an ipnetif_addr_t with the given logical interface id (lif)
1383 1383   * and add it to the supplied ipnetif.  The lif is the netinfo
1384 1384   * representation of logical interface id, and we use this id to match
1385 1385   * incoming netinfo events against our lists of addresses.
1386 1386   */
1387 1387  static void
1388 1388  ipnet_add_ifaddr(uint64_t lif, ipnetif_t *ipnetif, net_handle_t nd)
1389 1389  {
1390 1390          ipnetif_addr_t          *ifaddr;
1391 1391          zoneid_t                zoneid;
1392 1392          struct sockaddr_in      bcast;
1393 1393          struct sockaddr_storage addr;
1394 1394          net_ifaddr_t            type = NA_ADDRESS;
1395 1395          uint64_t                phyif = ipnetif->if_index;
1396 1396  
1397 1397          if (net_getlifaddr(nd, phyif, lif, 1, &type, &addr) != 0 ||
1398 1398              net_getlifzone(nd, phyif, lif, &zoneid) != 0)
1399 1399                  return;
1400 1400  
1401 1401          if ((ifaddr = kmem_alloc(sizeof (*ifaddr), KM_NOSLEEP)) == NULL)
1402 1402                  return;
1403 1403          ifaddr->ifa_zone = zoneid;
1404 1404          ifaddr->ifa_id = lif;
1405 1405          ifaddr->ifa_shared = NULL;
1406 1406  
1407 1407          switch (addr.ss_family) {
1408 1408          case AF_INET:
1409 1409                  ifaddr->ifa_ip4addr =
1410 1410                      ((struct sockaddr_in *)&addr)->sin_addr.s_addr;
1411 1411                  /*
1412 1412                   * Try and get the broadcast address.  Note that it's okay for
1413 1413                   * an interface to not have a broadcast address, so we don't
1414 1414                   * fail the entire operation if net_getlifaddr() fails here.
1415 1415                   */
1416 1416                  type = NA_BROADCAST;
1417 1417                  if (net_getlifaddr(nd, phyif, lif, 1, &type, &bcast) == 0)
1418 1418                          ifaddr->ifa_brdaddr = bcast.sin_addr.s_addr;
1419 1419                  break;
1420 1420          case AF_INET6:
1421 1421                  ifaddr->ifa_ip6addr = ((struct sockaddr_in6 *)&addr)->sin6_addr;
1422 1422                  break;
1423 1423          }
1424 1424  
1425 1425          /*
1426 1426           * The zoneid stored in ipnetif_t needs to correspond to the actual
1427 1427           * zone the address is being used in. This facilitates finding the
1428 1428           * correct netstack_t pointer, amongst other things, later.
1429 1429           */
1430 1430          if (zoneid == ALL_ZONES)
1431 1431                  zoneid = GLOBAL_ZONEID;
1432 1432  
1433 1433          mutex_enter(&ipnetif->if_addr_lock);
1434 1434          if (zoneid != ipnetif->if_zoneid) {
1435 1435                  ipnetif_t *ifp2;
1436 1436  
1437 1437                  ifp2 = ipnetif_clone_create(ipnetif, zoneid);
1438 1438                  ifaddr->ifa_shared = ifp2;
1439 1439          }
1440 1440          list_insert_tail(addr.ss_family == AF_INET ?
1441 1441              &ipnetif->if_ip4addr_list : &ipnetif->if_ip6addr_list, ifaddr);
1442 1442          mutex_exit(&ipnetif->if_addr_lock);
1443 1443  }
1444 1444  
1445 1445  static void
1446 1446  ipnet_delete_ifaddr(ipnetif_addr_t *ifaddr, ipnetif_t *ipnetif, boolean_t isv6)
1447 1447  {
1448 1448          mutex_enter(&ipnetif->if_addr_lock);
1449 1449          if (ifaddr->ifa_shared != NULL)
1450 1450                  ipnetif_clone_release(ifaddr->ifa_shared);
1451 1451  
1452 1452          list_remove(isv6 ?
1453 1453              &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list, ifaddr);
1454 1454          mutex_exit(&ipnetif->if_addr_lock);
1455 1455          kmem_free(ifaddr, sizeof (*ifaddr));
1456 1456  }
1457 1457  
1458 1458  static void
1459 1459  ipnet_plumb_ev(ipnet_nicevent_t *ipne, ipnet_stack_t *ips, boolean_t isv6)
1460 1460  {
1461 1461          ipnetif_t       *ipnetif;
1462 1462          boolean_t       refrele_needed = B_TRUE;
1463 1463          uint64_t        ifflags;
1464 1464          uint64_t        ifindex;
1465 1465          char            *ifname;
1466 1466  
1467 1467          ifflags = 0;
1468 1468          ifname = ipne->ipne_ifname;
1469 1469          ifindex = ipne->ipne_ifindex;
1470 1470  
1471 1471          (void) net_getlifflags(ipne->ipne_protocol, ifindex, 0, &ifflags);
1472 1472  
1473 1473          if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL) {
1474 1474                  ipnetif = ipnetif_create(ifname, ifindex, ips, ifflags);
1475 1475                  refrele_needed = B_FALSE;
1476 1476          }
1477 1477          if (ipnetif != NULL) {
1478 1478                  ipnetif->if_flags |=
1479 1479                      isv6 ? IPNETIF_IPV6PLUMBED : IPNETIF_IPV4PLUMBED;
1480 1480          }
1481 1481  
1482 1482          if (ipnetif->if_multicnt != 0) {
1483 1483                  if (ip_join_allmulti(ifindex, isv6,
1484 1484                      ips->ips_netstack->netstack_ip) == 0) {
1485 1485                          ipnetif->if_flags |=
1486 1486                              isv6 ? IPNETIF_IPV6ALLMULTI : IPNETIF_IPV4ALLMULTI;
1487 1487                  }
1488 1488          }
1489 1489  
1490 1490          if (refrele_needed)
1491 1491                  ipnetif_refrele(ipnetif);
1492 1492  }
1493 1493  
1494 1494  static void
1495 1495  ipnet_unplumb_ev(uint64_t ifindex, ipnet_stack_t *ips, boolean_t isv6)
1496 1496  {
1497 1497          ipnetif_t       *ipnetif;
1498 1498  
1499 1499          if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1500 1500                  return;
1501 1501  
1502 1502          mutex_enter(&ipnetif->if_addr_lock);
1503 1503          ipnet_purge_addrlist(isv6 ?
1504 1504              &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list);
1505 1505          mutex_exit(&ipnetif->if_addr_lock);
1506 1506  
1507 1507          /*
1508 1508           * Note that we have one ipnetif for both IPv4 and IPv6, but we receive
1509 1509           * separate NE_UNPLUMB events for IPv4 and IPv6.  We remove the ipnetif
1510 1510           * if both IPv4 and IPv6 interfaces have been unplumbed.
1511 1511           */
1512 1512          ipnetif->if_flags &= isv6 ? ~IPNETIF_IPV6PLUMBED : ~IPNETIF_IPV4PLUMBED;
1513 1513          if (!(ipnetif->if_flags & (IPNETIF_IPV4PLUMBED | IPNETIF_IPV6PLUMBED)))
1514 1514                  ipnetif_remove(ipnetif, ips);
1515 1515          ipnetif_refrele(ipnetif);
1516 1516  }
1517 1517  
1518 1518  static void
1519 1519  ipnet_lifup_ev(uint64_t ifindex, uint64_t lifindex, net_handle_t nd,
1520 1520      ipnet_stack_t *ips, boolean_t isv6)
1521 1521  {
1522 1522          ipnetif_t       *ipnetif;
1523 1523          ipnetif_addr_t  *ifaddr;
1524 1524  
1525 1525          if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1526 1526                  return;
1527 1527          if ((ifaddr = ipnet_match_lif(ipnetif, lifindex, isv6)) != NULL) {
1528 1528                  /*
1529 1529                   * We must have missed a NE_LIF_DOWN event.  Delete this
1530 1530                   * ifaddr and re-create it.
1531 1531                   */
1532 1532                  ipnet_delete_ifaddr(ifaddr, ipnetif, isv6);
1533 1533          }
1534 1534  
1535 1535          ipnet_add_ifaddr(lifindex, ipnetif, nd);
1536 1536          ipnetif_refrele(ipnetif);
1537 1537  }
1538 1538  
1539 1539  static void
1540 1540  ipnet_lifdown_ev(uint64_t ifindex, uint64_t lifindex, ipnet_stack_t *ips,
1541 1541      boolean_t isv6)
1542 1542  {
1543 1543          ipnetif_t       *ipnetif;
1544 1544          ipnetif_addr_t  *ifaddr;
1545 1545  
1546 1546          if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1547 1547                  return;
1548 1548          if ((ifaddr = ipnet_match_lif(ipnetif, lifindex, isv6)) != NULL)
1549 1549                  ipnet_delete_ifaddr(ifaddr, ipnetif, isv6);
1550 1550          ipnetif_refrele(ipnetif);
1551 1551          /*
1552 1552           * Make sure that open streams on this ipnetif are still allowed to
1553 1553           * have it open.
1554 1554           */
1555 1555          ipnetif_zonecheck(ipnetif, ips);
1556 1556  }
1557 1557  
1558 1558  /*
1559 1559   * This callback from the NIC event framework dispatches a taskq as the event
1560 1560   * handlers may block.
1561 1561   */
1562 1562  /* ARGSUSED */
1563 1563  static int
1564 1564  ipnet_nicevent_cb(hook_event_token_t token, hook_data_t info, void *arg)
1565 1565  {
1566 1566          ipnet_stack_t           *ips = arg;
1567 1567          hook_nic_event_t        *hn = (hook_nic_event_t *)info;
1568 1568          ipnet_nicevent_t        *ipne;
1569 1569  
1570 1570          if ((ipne = kmem_alloc(sizeof (ipnet_nicevent_t), KM_NOSLEEP)) == NULL)
1571 1571                  return (0);
1572 1572          ipne->ipne_event = hn->hne_event;
1573 1573          ipne->ipne_protocol = hn->hne_protocol;
1574 1574          ipne->ipne_stackid = ips->ips_netstack->netstack_stackid;
1575 1575          ipne->ipne_ifindex = hn->hne_nic;
1576 1576          ipne->ipne_lifindex = hn->hne_lif;
1577 1577          if (hn->hne_datalen != 0) {
1578 1578                  (void) strlcpy(ipne->ipne_ifname, hn->hne_data,
1579 1579                      sizeof (ipne->ipne_ifname));
1580 1580          }
1581 1581          (void) ddi_taskq_dispatch(ipnet_nicevent_taskq, ipnet_nicevent_task,
1582 1582              ipne, DDI_NOSLEEP);
1583 1583          return (0);
1584 1584  }
1585 1585  
1586 1586  static void
1587 1587  ipnet_nicevent_task(void *arg)
1588 1588  {
1589 1589          ipnet_nicevent_t        *ipne = arg;
1590 1590          netstack_t              *ns;
1591 1591          ipnet_stack_t           *ips;
1592 1592          boolean_t               isv6;
1593 1593  
1594 1594          if ((ns = netstack_find_by_stackid(ipne->ipne_stackid)) == NULL)
1595 1595                  goto done;
1596 1596          ips = ns->netstack_ipnet;
1597 1597          isv6 = (ipne->ipne_protocol == ips->ips_ndv6);
1598 1598  
1599 1599          mutex_enter(&ips->ips_event_lock);
1600 1600          switch (ipne->ipne_event) {
1601 1601          case NE_PLUMB:
1602 1602                  ipnet_plumb_ev(ipne, ips, isv6);
1603 1603                  break;
1604 1604          case NE_UNPLUMB:
1605 1605                  ipnet_unplumb_ev(ipne->ipne_ifindex, ips, isv6);
1606 1606                  break;
1607 1607          case NE_LIF_UP:
1608 1608                  ipnet_lifup_ev(ipne->ipne_ifindex, ipne->ipne_lifindex,
1609 1609                      ipne->ipne_protocol, ips, isv6);
1610 1610                  break;
1611 1611          case NE_LIF_DOWN:
1612 1612                  ipnet_lifdown_ev(ipne->ipne_ifindex, ipne->ipne_lifindex, ips,
1613 1613                      isv6);
1614 1614                  break;
1615 1615          default:
1616 1616                  break;
1617 1617          }
1618 1618          mutex_exit(&ips->ips_event_lock);
1619 1619  done:
1620 1620          if (ns != NULL)
1621 1621                  netstack_rele(ns);
1622 1622          kmem_free(ipne, sizeof (ipnet_nicevent_t));
1623 1623  }
1624 1624  
1625 1625  dev_t
1626 1626  ipnet_if_getdev(char *name, zoneid_t zoneid)
1627 1627  {
1628 1628          netstack_t      *ns;
1629 1629          ipnet_stack_t   *ips;
1630 1630          ipnetif_t       *ipnetif;
1631 1631          dev_t           dev = (dev_t)-1;
1632 1632  
1633 1633          if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
1634 1634                  return (dev);
1635 1635          if ((ns = netstack_find_by_zoneid(zoneid)) == NULL)
1636 1636                  return (dev);
1637 1637  
1638 1638          ips = ns->netstack_ipnet;
1639 1639          mutex_enter(&ips->ips_avl_lock);
1640 1640          if ((ipnetif = avl_find(&ips->ips_avl_by_name, name, NULL)) != NULL) {
1641 1641                  if (ipnetif_in_zone(ipnetif, zoneid, ips))
1642 1642                          dev = ipnetif->if_dev;
1643 1643          }
1644 1644          mutex_exit(&ips->ips_avl_lock);
1645 1645          netstack_rele(ns);
1646 1646  
1647 1647          return (dev);
1648 1648  }
1649 1649  
1650 1650  static ipnetif_t *
1651 1651  ipnetif_getby_index(uint64_t id, ipnet_stack_t *ips)
1652 1652  {
1653 1653          ipnetif_t       *ipnetif;
1654 1654  
1655 1655          mutex_enter(&ips->ips_avl_lock);
1656 1656          if ((ipnetif = avl_find(&ips->ips_avl_by_index, &id, NULL)) != NULL)
1657 1657                  ipnetif_refhold(ipnetif);
1658 1658          mutex_exit(&ips->ips_avl_lock);
1659 1659          return (ipnetif);
1660 1660  }
1661 1661  
1662 1662  static ipnetif_t *
1663 1663  ipnetif_getby_dev(dev_t dev, ipnet_stack_t *ips)
1664 1664  {
1665 1665          ipnetif_t       *ipnetif;
1666 1666          avl_tree_t      *tree;
1667 1667  
1668 1668          mutex_enter(&ips->ips_avl_lock);
1669 1669          tree = &ips->ips_avl_by_index;
1670 1670          for (ipnetif = avl_first(tree); ipnetif != NULL;
1671 1671              ipnetif = avl_walk(tree, ipnetif, AVL_AFTER)) {
1672 1672                  if (ipnetif->if_dev == dev) {
1673 1673                          ipnetif_refhold(ipnetif);
1674 1674                          break;
1675 1675                  }
1676 1676          }
1677 1677          mutex_exit(&ips->ips_avl_lock);
1678 1678          return (ipnetif);
1679 1679  }
1680 1680  
1681 1681  static ipnetif_addr_t *
1682 1682  ipnet_match_lif(ipnetif_t *ipnetif, lif_if_t lid, boolean_t isv6)
1683 1683  {
1684 1684          ipnetif_addr_t  *ifaddr;
1685 1685          list_t  *list;
1686 1686  
1687 1687          mutex_enter(&ipnetif->if_addr_lock);
1688 1688          list = isv6 ? &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list;
1689 1689          for (ifaddr = list_head(list); ifaddr != NULL;
1690 1690              ifaddr = list_next(list, ifaddr)) {
1691 1691                  if (lid == ifaddr->ifa_id)
1692 1692                          break;
1693 1693          }
1694 1694          mutex_exit(&ipnetif->if_addr_lock);
1695 1695          return (ifaddr);
1696 1696  }
1697 1697  
1698 1698  /* ARGSUSED */
1699 1699  static void *
1700 1700  ipnet_stack_init(netstackid_t stackid, netstack_t *ns)
1701 1701  {
1702 1702          ipnet_stack_t   *ips;
1703 1703  
1704 1704          ips = kmem_zalloc(sizeof (*ips), KM_SLEEP);
1705 1705          ips->ips_netstack = ns;
1706 1706          mutex_init(&ips->ips_avl_lock, NULL, MUTEX_DEFAULT, 0);
1707 1707          avl_create(&ips->ips_avl_by_index, ipnetif_compare_index,
1708 1708              sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_index));
1709 1709          avl_create(&ips->ips_avl_by_name, ipnetif_compare_name,
1710 1710              sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_name));
1711 1711          avl_create(&ips->ips_avl_by_shared, ipnetif_compare_name_zone,
1712 1712              sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_shared));
1713 1713          mutex_init(&ips->ips_walkers_lock, NULL, MUTEX_DEFAULT, NULL);
1714 1714          cv_init(&ips->ips_walkers_cv, NULL, CV_DRIVER, NULL);
1715 1715          list_create(&ips->ips_str_list, sizeof (ipnet_t),
1716 1716              offsetof(ipnet_t, ipnet_next));
1717 1717          ipnet_register_netihook(ips);
1718 1718          return (ips);
1719 1719  }
1720 1720  
1721 1721  /* ARGSUSED */
1722 1722  static void
1723 1723  ipnet_stack_fini(netstackid_t stackid, void *arg)
1724 1724  {
1725 1725          ipnet_stack_t   *ips = arg;
1726 1726          ipnetif_t       *ipnetif, *nipnetif;
1727 1727  
1728 1728          if (ips->ips_kstatp != NULL) {
1729 1729                  zoneid_t zoneid;
1730 1730  
1731 1731                  zoneid = netstackid_to_zoneid(stackid);
1732 1732                  net_kstat_delete(net_zoneidtonetid(zoneid), ips->ips_kstatp);
1733 1733          }
1734 1734          if (ips->ips_ndv4 != NULL) {
1735 1735                  VERIFY(net_hook_unregister(ips->ips_ndv4, NH_NIC_EVENTS,
1736 1736                      ips->ips_nicevents) == 0);
1737 1737                  VERIFY(net_protocol_release(ips->ips_ndv4) == 0);
1738 1738          }
1739 1739          if (ips->ips_ndv6 != NULL) {
1740 1740                  VERIFY(net_hook_unregister(ips->ips_ndv6, NH_NIC_EVENTS,
1741 1741                      ips->ips_nicevents) == 0);
1742 1742                  VERIFY(net_protocol_release(ips->ips_ndv6) == 0);
1743 1743          }
1744 1744          hook_free(ips->ips_nicevents);
1745 1745  
1746 1746          for (ipnetif = avl_first(&ips->ips_avl_by_index); ipnetif != NULL;
1747 1747              ipnetif = nipnetif) {
1748 1748                  nipnetif = AVL_NEXT(&ips->ips_avl_by_index, ipnetif);
1749 1749                  ipnetif_remove(ipnetif, ips);
1750 1750          }
1751 1751          avl_destroy(&ips->ips_avl_by_shared);
1752 1752          avl_destroy(&ips->ips_avl_by_index);
1753 1753          avl_destroy(&ips->ips_avl_by_name);
1754 1754          mutex_destroy(&ips->ips_avl_lock);
1755 1755          mutex_destroy(&ips->ips_walkers_lock);
1756 1756          cv_destroy(&ips->ips_walkers_cv);
1757 1757          list_destroy(&ips->ips_str_list);
1758 1758          kmem_free(ips, sizeof (*ips));
1759 1759  }
1760 1760  
1761 1761  /* Do any of the addresses in addrlist belong the supplied zoneid? */
1762 1762  static boolean_t
1763 1763  ipnet_addrs_in_zone(list_t *addrlist, zoneid_t zoneid)
1764 1764  {
1765 1765          ipnetif_addr_t  *ifa;
1766 1766  
1767 1767          for (ifa = list_head(addrlist); ifa != NULL;
1768 1768              ifa = list_next(addrlist, ifa)) {
1769 1769                  if (ifa->ifa_zone == zoneid)
1770 1770                          return (B_TRUE);
1771 1771          }
1772 1772          return (B_FALSE);
1773 1773  }
1774 1774  
1775 1775  /* Should the supplied ipnetif be visible from the supplied zoneid? */
1776 1776  static boolean_t
1777 1777  ipnetif_in_zone(ipnetif_t *ipnetif, zoneid_t zoneid, ipnet_stack_t *ips)
1778 1778  {
1779 1779          int     ret;
1780 1780  
1781 1781          /*
1782 1782           * The global zone has visibility into all interfaces in the global
1783 1783           * stack, and exclusive stack zones have visibility into all
1784 1784           * interfaces in their stack.
1785 1785           */
1786 1786          if (zoneid == GLOBAL_ZONEID ||
1787 1787              ips->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
1788 1788                  return (B_TRUE);
1789 1789  
1790 1790          /*
1791 1791           * Shared-stack zones only have visibility for interfaces that have
1792 1792           * addresses in their zone.
1793 1793           */
1794 1794          mutex_enter(&ipnetif->if_addr_lock);
1795 1795          ret = ipnet_addrs_in_zone(&ipnetif->if_ip4addr_list, zoneid) ||
1796 1796              ipnet_addrs_in_zone(&ipnetif->if_ip6addr_list, zoneid);
1797 1797          mutex_exit(&ipnetif->if_addr_lock);
1798 1798          return (ret);
1799 1799  }
1800 1800  
1801 1801  /*
1802 1802   * Verify that any ipnet_t that has a reference to the supplied ipnetif should
1803 1803   * still be allowed to have it open.  A given ipnet_t may no longer be allowed
1804 1804   * to have an ipnetif open if there are no longer any addresses that belong to
1805 1805   * the ipnetif in the ipnet_t's non-global shared-stack zoneid.  If that's the
1806 1806   * case, send the ipnet_t an M_HANGUP.
1807 1807   */
1808 1808  static void
1809 1809  ipnetif_zonecheck(ipnetif_t *ipnetif, ipnet_stack_t *ips)
1810 1810  {
1811 1811          list_t  *strlist = &ips->ips_str_list;
1812 1812          ipnet_t *ipnet;
1813 1813  
1814 1814          ipnet_walkers_inc(ips);
1815 1815          for (ipnet = list_head(strlist); ipnet != NULL;
1816 1816              ipnet = list_next(strlist, ipnet)) {
1817 1817                  if (ipnet->ipnet_if != ipnetif)
1818 1818                          continue;
1819 1819                  if (!ipnetif_in_zone(ipnetif, ipnet->ipnet_zoneid, ips))
1820 1820                          (void) putnextctl(ipnet->ipnet_rq, M_HANGUP);
1821 1821          }
1822 1822          ipnet_walkers_dec(ips);
1823 1823  }
1824 1824  
1825 1825  void
1826 1826  ipnet_walk_if(ipnet_walkfunc_t *cb, void *arg, zoneid_t zoneid)
1827 1827  {
1828 1828          ipnetif_t               *ipnetif;
1829 1829          list_t                  cbdata;
1830 1830          ipnetif_cbdata_t        *cbnode;
1831 1831          netstack_t              *ns;
1832 1832          ipnet_stack_t           *ips;
1833 1833  
1834 1834          /*
1835 1835           * On labeled systems, non-global zones shouldn't see anything
1836 1836           * in /dev/ipnet.
1837 1837           */
1838 1838          if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
1839 1839                  return;
1840 1840  
1841 1841          if ((ns = netstack_find_by_zoneid(zoneid)) == NULL)
1842 1842                  return;
1843 1843  
1844 1844          ips = ns->netstack_ipnet;
1845 1845          list_create(&cbdata, sizeof (ipnetif_cbdata_t),
1846 1846              offsetof(ipnetif_cbdata_t, ic_next));
1847 1847  
1848 1848          mutex_enter(&ips->ips_avl_lock);
1849 1849          for (ipnetif = avl_first(&ips->ips_avl_by_index); ipnetif != NULL;
1850 1850              ipnetif = avl_walk(&ips->ips_avl_by_index, ipnetif, AVL_AFTER)) {
1851 1851                  if (!ipnetif_in_zone(ipnetif, zoneid, ips))
1852 1852                          continue;
1853 1853                  cbnode = kmem_zalloc(sizeof (ipnetif_cbdata_t), KM_SLEEP);
1854 1854                  (void) strlcpy(cbnode->ic_ifname, ipnetif->if_name, LIFNAMSIZ);
1855 1855                  cbnode->ic_dev = ipnetif->if_dev;
1856 1856                  list_insert_head(&cbdata, cbnode);
1857 1857          }
1858 1858          mutex_exit(&ips->ips_avl_lock);
1859 1859  
1860 1860          while ((cbnode = list_head(&cbdata)) != NULL) {
1861 1861                  cb(cbnode->ic_ifname, arg, cbnode->ic_dev);
1862 1862                  list_remove(&cbdata, cbnode);
1863 1863                  kmem_free(cbnode, sizeof (ipnetif_cbdata_t));
1864 1864          }
1865 1865          list_destroy(&cbdata);
1866 1866          netstack_rele(ns);
1867 1867  }
1868 1868  
1869 1869  static int
1870 1870  ipnetif_compare_index(const void *index_ptr, const void *ipnetifp)
1871 1871  {
1872 1872          int64_t index1 = *((int64_t *)index_ptr);
1873 1873          int64_t index2 = (int64_t)((ipnetif_t *)ipnetifp)->if_index;
1874 1874  
1875 1875          return (SIGNOF(index2 - index1));
1876 1876  }
1877 1877  
1878 1878  static int
1879 1879  ipnetif_compare_name(const void *name_ptr, const void *ipnetifp)
1880 1880  {
1881 1881          int     res;
1882 1882  
1883 1883          res = strcmp(((ipnetif_t *)ipnetifp)->if_name, name_ptr);
1884 1884          return (SIGNOF(res));
1885 1885  }
1886 1886  
1887 1887  static int
1888 1888  ipnetif_compare_name_zone(const void *key_ptr, const void *ipnetifp)
1889 1889  {
1890 1890          const uintptr_t *ptr = key_ptr;
1891 1891          const ipnetif_t *ifp;
1892 1892          int             res;
1893 1893  
1894 1894          ifp = ipnetifp;
1895 1895          res = ifp->if_zoneid - ptr[0];
1896 1896          if (res != 0)
1897 1897                  return (SIGNOF(res));
1898 1898          res = strcmp(ifp->if_name, (char *)ptr[1]);
1899 1899          return (SIGNOF(res));
1900 1900  }
1901 1901  
1902 1902  static void
1903 1903  ipnetif_refhold(ipnetif_t *ipnetif)
1904 1904  {
1905 1905          mutex_enter(&ipnetif->if_reflock);
1906 1906          ipnetif->if_refcnt++;
1907 1907          mutex_exit(&ipnetif->if_reflock);
1908 1908  }
1909 1909  
1910 1910  static void
1911 1911  ipnetif_refrele(ipnetif_t *ipnetif)
1912 1912  {
1913 1913          mutex_enter(&ipnetif->if_reflock);
1914 1914          ASSERT(ipnetif->if_refcnt > 0);
1915 1915          if (--ipnetif->if_refcnt == 0)
1916 1916                  ipnetif_free(ipnetif);
1917 1917          else
1918 1918                  mutex_exit(&ipnetif->if_reflock);
1919 1919  }
1920 1920  
1921 1921  static void
1922 1922  ipnet_walkers_inc(ipnet_stack_t *ips)
1923 1923  {
1924 1924          mutex_enter(&ips->ips_walkers_lock);
1925 1925          ips->ips_walkers_cnt++;
1926 1926          mutex_exit(&ips->ips_walkers_lock);
1927 1927  }
1928 1928  
1929 1929  static void
1930 1930  ipnet_walkers_dec(ipnet_stack_t *ips)
1931 1931  {
1932 1932          mutex_enter(&ips->ips_walkers_lock);
1933 1933          ASSERT(ips->ips_walkers_cnt != 0);
1934 1934          if (--ips->ips_walkers_cnt == 0)
1935 1935                  cv_broadcast(&ips->ips_walkers_cv);
1936 1936          mutex_exit(&ips->ips_walkers_lock);
1937 1937  }
1938 1938  
1939 1939  /*ARGSUSED*/
1940 1940  static int
1941 1941  ipobs_bounce_func(hook_event_token_t token, hook_data_t info, void *arg)
1942 1942  {
1943 1943          hook_pkt_observe_t      *hdr;
1944 1944          pfv_t                   func = (pfv_t)arg;
1945 1945          mblk_t                  *mp;
1946 1946  
1947 1947          hdr = (hook_pkt_observe_t *)info;
1948 1948          /*
1949 1949           * Code in ip_input() expects that it is the only one accessing the
1950 1950           * packet.
1951 1951           */
1952 1952          mp = copymsg(hdr->hpo_pkt);
1953 1953          if (mp == NULL)  {
1954 1954                  netstack_t *ns = hdr->hpo_ctx;
1955 1955                  ipnet_stack_t *ips = ns->netstack_ipnet;
1956 1956  
1957 1957                  IPSK_BUMP(ips, ik_dispatchDupDrop);
1958 1958                  return (0);
1959 1959          }
1960 1960  
1961 1961          hdr = (hook_pkt_observe_t *)mp->b_rptr;
1962 1962          hdr->hpo_pkt = mp;
1963 1963  
1964 1964          func(mp);
1965 1965  
1966 1966          return (0);
1967 1967  }
1968 1968  
1969 1969  hook_t *
1970 1970  ipobs_register_hook(netstack_t *ns, pfv_t func)
1971 1971  {
1972 1972          ip_stack_t      *ipst = ns->netstack_ip;
1973 1973          char            name[32];
1974 1974          hook_t          *hook;
1975 1975  
1976 1976          HOOK_INIT(hook, ipobs_bounce_func, "", (void *)func);
1977 1977          VERIFY(hook != NULL);
1978 1978  
1979 1979          /*
1980 1980           * To register multiple hooks with the same callback function,
1981 1981           * a unique name is needed.
1982 1982           */
1983 1983          (void) snprintf(name, sizeof (name), "ipobserve_%p", (void *)hook);
1984 1984          hook->h_name = strdup(name);
1985 1985  
1986 1986          (void) net_hook_register(ipst->ips_ip4_observe_pr, NH_OBSERVE, hook);
1987 1987          (void) net_hook_register(ipst->ips_ip6_observe_pr, NH_OBSERVE, hook);
1988 1988  
1989 1989          return (hook);
1990 1990  }
1991 1991  
1992 1992  void
1993 1993  ipobs_unregister_hook(netstack_t *ns, hook_t *hook)
1994 1994  {
1995 1995          ip_stack_t      *ipst = ns->netstack_ip;
1996 1996  
1997 1997          (void) net_hook_unregister(ipst->ips_ip4_observe_pr, NH_OBSERVE, hook);
1998 1998  
1999 1999          (void) net_hook_unregister(ipst->ips_ip6_observe_pr, NH_OBSERVE, hook);
2000 2000  
2001 2001          strfree(hook->h_name);
2002 2002  
2003 2003          hook_free(hook);
2004 2004  }
2005 2005  
2006 2006  /* ******************************************************************** */
2007 2007  /* BPF Functions below                                                  */
2008 2008  /* ******************************************************************** */
2009 2009  
2010 2010  /*
2011 2011   * Convenience function to make mapping a zoneid to an ipnet_stack_t easy.
2012 2012   */
2013 2013  ipnet_stack_t *
2014 2014  ipnet_find_by_zoneid(zoneid_t zoneid)
2015 2015  {
2016 2016          netstack_t      *ns;
2017 2017  
2018 2018          VERIFY((ns = netstack_find_by_zoneid(zoneid)) != NULL);
2019 2019          return (ns->netstack_ipnet);
2020 2020  }
2021 2021  
2022 2022  /*
2023 2023   * Functions, such as the above ipnet_find_by_zoneid(), will return a
2024 2024   * pointer to ipnet_stack_t by calling a netstack lookup function.
2025 2025   * The netstack_find_*() functions return a pointer after doing a "hold"
2026 2026   * on the data structure and thereby require a "release" when the caller
2027 2027   * is finished with it. We need to mirror that API here and thus a caller
2028 2028   * of ipnet_find_by_zoneid() is required to call ipnet_rele().
2029 2029   */
2030 2030  void
2031 2031  ipnet_rele(ipnet_stack_t *ips)
2032 2032  {
2033 2033          netstack_rele(ips->ips_netstack);
2034 2034  }
2035 2035  
/*
 * Record tapfunc in ipnet_itap, replacing whatever value was stored there
 * before.
 */
void
ipnet_set_itap(bpf_itap_fn_t tapfunc)
{
        ipnet_itap = tapfunc;
}
2043 2043  
2044 2044  /*
2045 2045   * The list of interfaces available via ipnet is private for each zone,
2046 2046   * so the AVL tree of each zone must be searched for a given name, even
2047 2047   * if all names are unique.
2048 2048   */
2049 2049  int
2050 2050  ipnet_open_byname(const char *name, ipnetif_t **ptr, zoneid_t zoneid)
2051 2051  {
2052 2052          ipnet_stack_t   *ips;
2053 2053          ipnetif_t       *ipnetif;
2054 2054  
2055 2055          ASSERT(ptr != NULL);
2056 2056          VERIFY((ips = ipnet_find_by_zoneid(zoneid)) != NULL);
2057 2057  
2058 2058          mutex_enter(&ips->ips_avl_lock);
2059 2059  
2060 2060          /*
2061 2061           * Shared instance zone?
2062 2062           */
2063 2063          if (netstackid_to_zoneid(zoneid_to_netstackid(zoneid)) != zoneid) {
2064 2064                  uintptr_t key[2] = { zoneid, (uintptr_t)name };
2065 2065  
2066 2066                  ipnetif = avl_find(&ips->ips_avl_by_shared, (void *)key, NULL);
2067 2067          } else {
2068 2068                  ipnetif = avl_find(&ips->ips_avl_by_name, (void *)name, NULL);
2069 2069          }
2070 2070          if (ipnetif != NULL)
2071 2071                  ipnetif_refhold(ipnetif);
2072 2072          mutex_exit(&ips->ips_avl_lock);
2073 2073  
2074 2074          *ptr = ipnetif;
2075 2075          ipnet_rele(ips);
2076 2076  
2077 2077          if (ipnetif == NULL)
2078 2078                  return (ESRCH);
2079 2079          return (0);
2080 2080  }
2081 2081  
2082 2082  void
2083 2083  ipnet_close_byhandle(ipnetif_t *ifp)
2084 2084  {
2085 2085          ASSERT(ifp != NULL);
2086 2086          ipnetif_refrele(ifp);
2087 2087  }
2088 2088  
2089 2089  const char *
2090 2090  ipnet_name(ipnetif_t *ifp)
2091 2091  {
2092 2092          ASSERT(ifp != NULL);
2093 2093          return (ifp->if_name);
2094 2094  }
2095 2095  
2096 2096  /*
2097 2097   * To find the linkid for a given name, it is necessary to know which zone
2098 2098   * the interface name belongs to and to search the avl tree for that zone
2099 2099   * as there is no master list of all interfaces and which zone they belong
2100 2100   * to. It is assumed that the caller of this function is somehow already
2101 2101   * working with the ipnet interfaces and hence the ips_event_lock is held.
2102 2102   * When BPF calls into this function, it is doing so because of an event
2103 2103   * in ipnet, and thus ipnet holds the ips_event_lock. Thus the datalink id
2104 2104   * value returned has meaning without the need for grabbing a hold on the
2105 2105   * owning structure.
2106 2106   */
2107 2107  int
2108 2108  ipnet_get_linkid_byname(const char *name, uint_t *idp, zoneid_t zoneid)
2109 2109  {
2110 2110          ipnet_stack_t   *ips;
2111 2111          ipnetif_t       *ifp;
2112 2112  
2113 2113          VERIFY((ips = ipnet_find_by_zoneid(zoneid)) != NULL);
2114 2114          ASSERT(mutex_owned(&ips->ips_event_lock));
2115 2115  
2116 2116          mutex_enter(&ips->ips_avl_lock);
2117 2117          ifp = avl_find(&ips->ips_avl_by_name, (void *)name, NULL);
2118 2118          if (ifp != NULL)
2119 2119                  *idp = (uint_t)ifp->if_index;
2120 2120  
2121 2121          /*
2122 2122           * Shared instance zone?
2123 2123           */
2124 2124          if (netstackid_to_zoneid(zoneid_to_netstackid(zoneid)) != zoneid) {
2125 2125                  uintptr_t key[2] = { zoneid, (uintptr_t)name };
2126 2126  
2127 2127                  ifp = avl_find(&ips->ips_avl_by_shared, (void *)key, NULL);
2128 2128                  if (ifp != NULL)
2129 2129                          *idp = (uint_t)ifp->if_index;
2130 2130          }
2131 2131  
2132 2132          mutex_exit(&ips->ips_avl_lock);
2133 2133          ipnet_rele(ips);
2134 2134  
2135 2135          if (ifp == NULL)
2136 2136                  return (ESRCH);
2137 2137          return (0);
2138 2138  }
2139 2139  
2140 2140  /*
2141 2141   * Strictly speaking, there is no such thing as a "client" in ipnet, like
2142 2142   * there is in mac. BPF only needs to have this because it is required as
2143 2143   * part of interfacing correctly with mac. The reuse of the original
2144 2144   * ipnetif_t as a client poses no danger, so long as it is done with its
2145 2145   * own ref-count'd hold that is given up on close.
2146 2146   */
2147 2147  int
2148 2148  ipnet_client_open(ipnetif_t *ptr, ipnetif_t **result)
2149 2149  {
2150 2150          ASSERT(ptr != NULL);
2151 2151          ASSERT(result != NULL);
2152 2152          ipnetif_refhold(ptr);
2153 2153          *result = ptr;
2154 2154  
2155 2155          return (0);
2156 2156  }
2157 2157  
2158 2158  void
2159 2159  ipnet_client_close(ipnetif_t *ptr)
2160 2160  {
2161 2161          ASSERT(ptr != NULL);
2162 2162          ipnetif_refrele(ptr);
2163 2163  }
2164 2164  
2165 2165  /*
2166 2166   * This is called from BPF when it needs to start receiving packets
2167 2167   * from ipnet.
2168 2168   *
2169 2169   * The use of the ipnet_t structure here is somewhat lightweight when
2170 2170   * compared to how it is used elsewhere but it already has all of the
2171 2171   * right fields in it, so reuse here doesn't seem out of order. Its
2172 2172   * primary purpose here is to provide the means to store pointers for
2173 2173   * use when ipnet_promisc_remove() needs to be called.
2174 2174   *
2175 2175   * This should never be called for the IPNET_MINOR_LO device as it is
2176 2176   * never created via ipnetif_create.
2177 2177   */
2178 2178  /*ARGSUSED*/
2179 2179  int
2180 2180  ipnet_promisc_add(void *handle, uint_t how, void *data, uintptr_t *mhandle,
2181 2181      int flags)
2182 2182  {
2183 2183          ip_stack_t      *ipst;
2184 2184          netstack_t      *ns;
2185 2185          ipnetif_t       *ifp;
2186 2186          ipnet_t         *ipnet;
2187 2187          char            name[32];
2188 2188          int             error;
2189 2189  
2190 2190          ifp = (ipnetif_t *)handle;
2191 2191  
2192 2192          if (how != DL_PROMISC_PHYS && how != DL_PROMISC_MULTI)
2193 2193                  return (EINVAL);
2194 2194  
2195 2195          ns = netstack_find_by_zoneid(ifp->if_zoneid);
2196 2196  
2197 2197          if ((error = ipnet_join_allmulti(ifp, ns->netstack_ipnet)) != 0) {
2198 2198                  netstack_rele(ns);
2199 2199                  return (error);
2200 2200          }
2201 2201  
2202 2202          ipnet = kmem_zalloc(sizeof (*ipnet), KM_SLEEP);
2203 2203          ipnet->ipnet_if = ifp;
2204 2204          ipnet->ipnet_ns = ns;
2205 2205          ipnet->ipnet_flags = flags;
2206 2206  
2207 2207          if ((ifp->if_flags & IPNETIF_LOOPBACK) != 0) {
2208 2208                  ipnet->ipnet_acceptfn = ipnet_loaccept;
2209 2209          } else {
2210 2210                  ipnet->ipnet_acceptfn = ipnet_accept;
2211 2211          }
2212 2212  
2213 2213          /*
2214 2214           * To register multiple hooks with the same callback function,
2215 2215           * a unique name is needed.
2216 2216           */
2217 2217          HOOK_INIT(ipnet->ipnet_hook, ipnet_bpf_bounce, "", ipnet);
2218 2218          (void) snprintf(name, sizeof (name), "ipnet_promisc_%p",
2219 2219              (void *)ipnet->ipnet_hook);
2220 2220          ipnet->ipnet_hook->h_name = strdup(name);
2221 2221          ipnet->ipnet_data = data;
2222 2222          ipnet->ipnet_zoneid = ifp->if_zoneid;
2223 2223  
2224 2224          ipst = ns->netstack_ip;
2225 2225  
2226 2226          error = net_hook_register(ipst->ips_ip4_observe_pr, NH_OBSERVE,
2227 2227              ipnet->ipnet_hook);
2228 2228          if (error != 0)
2229 2229                  goto regfail;
2230 2230  
2231 2231          error = net_hook_register(ipst->ips_ip6_observe_pr, NH_OBSERVE,
2232 2232              ipnet->ipnet_hook);
2233 2233          if (error != 0) {
2234 2234                  (void) net_hook_unregister(ipst->ips_ip4_observe_pr,
2235 2235                      NH_OBSERVE, ipnet->ipnet_hook);
2236 2236                  goto regfail;
2237 2237          }
  
    | 
      ↓ open down ↓ | 
    2198 lines elided | 
    
      ↑ open up ↑ | 
  
2238 2238  
2239 2239          *mhandle = (uintptr_t)ipnet;
2240 2240          netstack_rele(ns);
2241 2241  
2242 2242          return (0);
2243 2243  
2244 2244  regfail:
2245 2245          cmn_err(CE_WARN, "net_hook_register failed: %d", error);
2246 2246          strfree(ipnet->ipnet_hook->h_name);
2247 2247          hook_free(ipnet->ipnet_hook);
     2248 +        ipnet_leave_allmulti(ifp, ns->netstack_ipnet);
2248 2249          netstack_rele(ns);
2249 2250          return (error);
2250 2251  }
2251 2252  
2252 2253  void
2253 2254  ipnet_promisc_remove(void *data)
2254 2255  {
2255 2256          ip_stack_t      *ipst;
2256 2257          ipnet_t         *ipnet;
2257 2258          hook_t          *hook;
2258 2259  
2259 2260          ipnet = data;
2260 2261          ipst = ipnet->ipnet_ns->netstack_ip;
2261 2262          hook = ipnet->ipnet_hook;
2262 2263  
  
    | 
      ↓ open down ↓ | 
    5 lines elided | 
    
      ↑ open up ↑ | 
  
2263 2264          VERIFY(net_hook_unregister(ipst->ips_ip4_observe_pr, NH_OBSERVE,
2264 2265              hook) == 0);
2265 2266  
2266 2267          VERIFY(net_hook_unregister(ipst->ips_ip6_observe_pr, NH_OBSERVE,
2267 2268              hook) == 0);
2268 2269  
2269 2270          strfree(hook->h_name);
2270 2271  
2271 2272          hook_free(hook);
2272 2273  
     2274 +        ipnet_leave_allmulti(ipnet->ipnet_if, ipnet->ipnet_ns->netstack_ipnet);
     2275 +
2273 2276          kmem_free(ipnet, sizeof (*ipnet));
2274 2277  }
2275 2278  
2276 2279  /*
2277 2280   * arg here comes from the ipnet_t allocated in ipnet_promisc_add.
2278 2281   * An important field from that structure is "ipnet_data" that
2279 2282   * contains the "data" pointer passed into ipnet_promisc_add: it needs
2280 2283   * to be passed back to bpf when we call into ipnet_itap.
2281 2284   *
2282 2285   * ipnet_itap is set by ipnet_set_bpfattach, which in turn is called
2283 2286   * from BPF.
2284 2287   */
2285 2288  /*ARGSUSED*/
2286 2289  static int
2287 2290  ipnet_bpf_bounce(hook_event_token_t token, hook_data_t info, void *arg)
2288 2291  {
2289 2292          hook_pkt_observe_t      *hdr;
2290 2293          ipnet_addrp_t           src;
2291 2294          ipnet_addrp_t           dst;
2292 2295          ipnet_stack_t           *ips;
2293 2296          ipnet_t                 *ipnet;
2294 2297          mblk_t                  *netmp;
2295 2298          mblk_t                  *mp;
2296 2299  
2297 2300          hdr = (hook_pkt_observe_t *)info;
2298 2301          mp = hdr->hpo_pkt;
2299 2302          ipnet = (ipnet_t *)arg;
2300 2303          ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;
2301 2304  
2302 2305          netmp = hdr->hpo_pkt->b_cont;
2303 2306          src.iap_family = hdr->hpo_family;
2304 2307          dst.iap_family = hdr->hpo_family;
2305 2308  
2306 2309          if (hdr->hpo_family == AF_INET) {
2307 2310                  src.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_src;
2308 2311                  dst.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_dst;
2309 2312          } else {
2310 2313                  src.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_src;
2311 2314                  dst.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_dst;
2312 2315          }
2313 2316  
2314 2317          if (!(*ipnet->ipnet_acceptfn)(ipnet, hdr, &src, &dst)) {
2315 2318                  IPSK_BUMP(ips, ik_acceptFail);
2316 2319                  return (0);
2317 2320          }
2318 2321          IPSK_BUMP(ips, ik_acceptOk);
2319 2322  
2320 2323          ipnet_itap(ipnet->ipnet_data, mp,
2321 2324              hdr->hpo_htype == htons(IPOBS_HOOK_OUTBOUND),
2322 2325              ntohl(hdr->hpo_pktlen) + MBLKL(mp));
2323 2326  
2324 2327          return (0);
2325 2328  }
2326 2329  
2327 2330  /*
2328 2331   * clone'd ipnetif_t's are created when a shared IP instance zone comes
2329 2332   * to life and configures an IP address. The model that BPF uses is that
2330 2333   * each interface must have a unique pointer and each interface must be
2331 2334   * representative of what it can capture. They are limited to one DLT
2332 2335   * per interface and one zone per interface. Thus every interface that
2333 2336   * can be seen in a zone must be announced via an attach to bpf. For
2334 2337   * shared instance zones, this means the ipnet driver needs to detect
2335 2338   * when an address is added to an interface in a zone for the first
2336 2339   * time (and also when the last address is removed.)
2337 2340   */
2338 2341  static ipnetif_t *
2339 2342  ipnetif_clone_create(ipnetif_t *ifp, zoneid_t zoneid)
2340 2343  {
2341 2344          uintptr_t       key[2] = { zoneid, (uintptr_t)ifp->if_name };
2342 2345          ipnet_stack_t   *ips = ifp->if_stackp;
2343 2346          avl_index_t     where = 0;
2344 2347          ipnetif_t       *newif;
2345 2348  
2346 2349          mutex_enter(&ips->ips_avl_lock);
2347 2350          newif = avl_find(&ips->ips_avl_by_shared, (void *)key, &where);
2348 2351          if (newif != NULL) {
2349 2352                  ipnetif_refhold(newif);
2350 2353                  newif->if_sharecnt++;
2351 2354                  mutex_exit(&ips->ips_avl_lock);
2352 2355                  return (newif);
2353 2356          }
2354 2357  
2355 2358          newif = ipnet_alloc_if(ips);
2356 2359          if (newif == NULL) {
2357 2360                  mutex_exit(&ips->ips_avl_lock);
2358 2361                  return (NULL);
2359 2362          }
2360 2363  
2361 2364          newif->if_refcnt = 1;
2362 2365          newif->if_sharecnt = 1;
2363 2366          newif->if_zoneid = zoneid;
2364 2367          (void) strlcpy(newif->if_name, ifp->if_name, LIFNAMSIZ);
2365 2368          newif->if_flags = ifp->if_flags & IPNETIF_LOOPBACK;
2366 2369          newif->if_index = ifp->if_index;
2367 2370  
2368 2371          avl_insert(&ips->ips_avl_by_shared, newif, where);
2369 2372          mutex_exit(&ips->ips_avl_lock);
2370 2373  
2371 2374          return (newif);
2372 2375  }
2373 2376  
2374 2377  static void
2375 2378  ipnetif_clone_release(ipnetif_t *ipnetif)
2376 2379  {
2377 2380          boolean_t       dofree = B_FALSE;
2378 2381          boolean_t       doremove = B_FALSE;
2379 2382          ipnet_stack_t   *ips = ipnetif->if_stackp;
2380 2383  
2381 2384          mutex_enter(&ipnetif->if_reflock);
2382 2385          ASSERT(ipnetif->if_refcnt > 0);
2383 2386          if (--ipnetif->if_refcnt == 0)
2384 2387                  dofree = B_TRUE;
2385 2388          ASSERT(ipnetif->if_sharecnt > 0);
2386 2389          if (--ipnetif->if_sharecnt == 0)
2387 2390                  doremove = B_TRUE;
2388 2391          mutex_exit(&ipnetif->if_reflock);
2389 2392          if (doremove) {
2390 2393                  mutex_enter(&ips->ips_avl_lock);
2391 2394                  avl_remove(&ips->ips_avl_by_shared, ipnetif);
2392 2395                  mutex_exit(&ips->ips_avl_lock);
2393 2396          }
2394 2397          if (dofree) {
2395 2398                  ASSERT(ipnetif->if_sharecnt == 0);
2396 2399                  ipnetif_free(ipnetif);
2397 2400          }
2398 2401  }
  
    | 
      ↓ open down ↓ | 
    116 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX