1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  * Copyright (c) 2016 by Delphix. All rights reserved.
  26  */
  27 
  28 /*
  29  * Copyright (c) 2016, Joyent, Inc. All rights reserved.
  30  */
  31 
  32 /*
  33  * The ipnet device defined here provides access to packets at the IP layer. To
  34  * provide access to packets at this layer it registers a callback function in
  35  * the ip module and when there are open instances of the device ip will pass
  36  * packets into the device. Packets from ip are passed on the input, output and
  37  * loopback paths. Internally the module returns to ip as soon as possible by
  38  * deferring processing using a taskq.
  39  *
  40  * Management of the devices in /dev/ipnet/ is handled by the devname
  41  * filesystem and use of the neti interfaces.  This module registers for NIC
 * events using the neti framework so that when IP interfaces are brought up,
  43  * taken down etc. the ipnet module is notified and its view of the interfaces
  44  * configured on the system adjusted.  On attach, the module gets an initial
  45  * view of the system again using the neti framework but as it has already
  46  * registered for IP interface events, it is still up-to-date with any changes.
  47  */
  48 
  49 #include <sys/types.h>
  50 #include <sys/conf.h>
  51 #include <sys/cred.h>
  52 #include <sys/stat.h>
  53 #include <sys/ddi.h>
  54 #include <sys/sunddi.h>
  55 #include <sys/modctl.h>
  56 #include <sys/dlpi.h>
  57 #include <sys/strsun.h>
  58 #include <sys/id_space.h>
  59 #include <sys/kmem.h>
  60 #include <sys/mkdev.h>
  61 #include <sys/neti.h>
  62 #include <net/if.h>
  63 #include <sys/errno.h>
  64 #include <sys/list.h>
  65 #include <sys/ksynch.h>
  66 #include <sys/hook_event.h>
  67 #include <sys/sdt.h>
  68 #include <sys/stropts.h>
  69 #include <sys/sysmacros.h>
  70 #include <inet/ip.h>
  71 #include <inet/ip_if.h>
  72 #include <inet/ip_multi.h>
  73 #include <inet/ip6.h>
  74 #include <inet/ipnet.h>
  75 #include <net/bpf.h>
  76 #include <net/bpfdesc.h>
  77 #include <net/dlt.h>
  78 
  79 static struct module_info ipnet_minfo = {
  80         1,              /* mi_idnum */
  81         "ipnet",        /* mi_idname */
  82         0,              /* mi_minpsz */
  83         INFPSZ,         /* mi_maxpsz */
  84         2048,           /* mi_hiwat */
  85         0               /* mi_lowat */
  86 };
  87 
  88 /*
  89  * List to hold static view of ipnetif_t's on the system. This is needed to
  90  * avoid holding the lock protecting the avl tree of ipnetif's over the
  91  * callback into the dev filesystem.
  92  */
  93 typedef struct ipnetif_cbdata {
  94         char            ic_ifname[LIFNAMSIZ];
  95         dev_t           ic_dev;
  96         list_node_t     ic_next;
  97 } ipnetif_cbdata_t;
  98 
  99 /*
 100  * Convenience enumerated type for ipnet_accept().  It describes the
 101  * properties of a given ipnet_addrp_t relative to a single ipnet_t
 102  * client stream.  The values represent whether the address is ...
 103  */
 104 typedef enum {
 105         IPNETADDR_MYADDR,       /* an address on my ipnetif_t. */
 106         IPNETADDR_MBCAST,       /* a multicast or broadcast address. */
 107         IPNETADDR_UNKNOWN       /* none of the above. */
 108 } ipnet_addrtype_t;
 109 
 110 /* Argument used for the ipnet_nicevent_taskq callback. */
 111 typedef struct ipnet_nicevent_s {
 112         nic_event_t             ipne_event;
 113         net_handle_t            ipne_protocol;
 114         netstackid_t            ipne_stackid;
 115         uint64_t                ipne_ifindex;
 116         uint64_t                ipne_lifindex;
 117         char                    ipne_ifname[LIFNAMSIZ];
 118 } ipnet_nicevent_t;
 119 
 120 static dev_info_t       *ipnet_dip;
 121 static major_t          ipnet_major;
 122 static ddi_taskq_t      *ipnet_taskq;           /* taskq for packets */
 123 static ddi_taskq_t      *ipnet_nicevent_taskq;  /* taskq for NIC events */
 124 static id_space_t       *ipnet_minor_space;
 125 static const int        IPNET_MINOR_LO = 1;     /* minor number for /dev/lo0 */
 126 static const int        IPNET_MINOR_MIN = 2;    /* start of dynamic minors */
 127 static dl_info_ack_t    ipnet_infoack = IPNET_INFO_ACK_INIT;
 128 static ipnet_acceptfn_t ipnet_accept, ipnet_loaccept;
 129 static bpf_itap_fn_t    ipnet_itap;
 130 
 131 static void     ipnet_input(mblk_t *);
 132 static int      ipnet_wput(queue_t *, mblk_t *);
 133 static int      ipnet_rsrv(queue_t *);
 134 static int      ipnet_open(queue_t *, dev_t *, int, int, cred_t *);
 135 static int      ipnet_close(queue_t *, int, cred_t *);
 136 static void     ipnet_ioctl(queue_t *, mblk_t *);
 137 static void     ipnet_iocdata(queue_t *, mblk_t *);
 138 static void     ipnet_wputnondata(queue_t *, mblk_t *);
 139 static int      ipnet_attach(dev_info_t *, ddi_attach_cmd_t);
 140 static int      ipnet_detach(dev_info_t *, ddi_detach_cmd_t);
 141 static int      ipnet_devinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
 142 static void     ipnet_inforeq(queue_t *q, mblk_t *mp);
 143 static void     ipnet_bindreq(queue_t *q, mblk_t *mp);
 144 static void     ipnet_unbindreq(queue_t *q, mblk_t *mp);
 145 static void     ipnet_dlpromisconreq(queue_t *q, mblk_t *mp);
 146 static void     ipnet_dlpromiscoffreq(queue_t *q, mblk_t *mp);
 147 static int      ipnet_join_allmulti(ipnetif_t *, ipnet_stack_t *);
 148 static void     ipnet_leave_allmulti(ipnetif_t *, ipnet_stack_t *);
 149 static int      ipnet_nicevent_cb(hook_event_token_t, hook_data_t, void *);
 150 static void     ipnet_nicevent_task(void *);
 151 static ipnetif_t *ipnetif_create(const char *, uint64_t, ipnet_stack_t *,
 152     uint64_t);
 153 static void     ipnetif_remove(ipnetif_t *, ipnet_stack_t *);
 154 static ipnetif_addr_t *ipnet_match_lif(ipnetif_t *, lif_if_t, boolean_t);
 155 static ipnetif_t *ipnetif_getby_index(uint64_t, ipnet_stack_t *);
 156 static ipnetif_t *ipnetif_getby_dev(dev_t, ipnet_stack_t *);
 157 static boolean_t ipnetif_in_zone(ipnetif_t *, zoneid_t, ipnet_stack_t *);
 158 static void     ipnetif_zonecheck(ipnetif_t *, ipnet_stack_t *);
 159 static int      ipnet_populate_if(net_handle_t, ipnet_stack_t *, boolean_t);
 160 static int      ipnetif_compare_name(const void *, const void *);
 161 static int      ipnetif_compare_name_zone(const void *, const void *);
 162 static int      ipnetif_compare_index(const void *, const void *);
 163 static void     ipnet_add_ifaddr(uint64_t, ipnetif_t *, net_handle_t);
 164 static void     ipnet_delete_ifaddr(ipnetif_addr_t *, ipnetif_t *, boolean_t);
 165 static void     ipnetif_refhold(ipnetif_t *);
 166 static void     ipnetif_refrele(ipnetif_t *);
 167 static void     ipnet_walkers_inc(ipnet_stack_t *);
 168 static void     ipnet_walkers_dec(ipnet_stack_t *);
 169 static void     ipnet_register_netihook(ipnet_stack_t *);
 170 static void     *ipnet_stack_init(netstackid_t, netstack_t *);
 171 static void     ipnet_stack_fini(netstackid_t, void *);
 172 static void     ipnet_dispatch(void *);
 173 static int      ipobs_bounce_func(hook_event_token_t, hook_data_t, void *);
 174 static int      ipnet_bpf_bounce(hook_event_token_t, hook_data_t, void *);
 175 static ipnetif_t *ipnetif_clone_create(ipnetif_t *, zoneid_t);
 176 static void     ipnetif_clone_release(ipnetif_t *);
 177 
 178 static struct qinit ipnet_rinit = {
 179         NULL,           /* qi_putp */
 180         ipnet_rsrv,     /* qi_srvp */
 181         ipnet_open,     /* qi_qopen */
 182         ipnet_close,    /* qi_qclose */
 183         NULL,           /* qi_qadmin */
 184         &ipnet_minfo,       /* qi_minfo */
 185 };
 186 
 187 static struct qinit ipnet_winit = {
 188         ipnet_wput,     /* qi_putp */
 189         NULL,           /* qi_srvp */
 190         NULL,           /* qi_qopen */
 191         NULL,           /* qi_qclose */
 192         NULL,           /* qi_qadmin */
 193         &ipnet_minfo,       /* qi_minfo */
 194 };
 195 
 196 static struct streamtab ipnet_info = {
 197         &ipnet_rinit, &ipnet_winit
 198 };
 199 
 200 DDI_DEFINE_STREAM_OPS(ipnet_ops, nulldev, nulldev, ipnet_attach,
 201     ipnet_detach, nodev, ipnet_devinfo, D_MP | D_MTPERMOD, &ipnet_info,
 202     ddi_quiesce_not_supported);
 203 
 204 static struct modldrv modldrv = {
 205         &mod_driverops,
 206         "STREAMS ipnet driver",
 207         &ipnet_ops
 208 };
 209 
 210 static struct modlinkage modlinkage = {
 211         MODREV_1, &modldrv, NULL
 212 };
 213 
/*
 * Template (names and data types only) for the per-stack ipnet kstats.
 * ipnet_register_netihook() copies it in bulk into the ips_stats member of
 * each ipnet_stack_t after net_kstat_create() succeeds.  No actual
 * statistical information is ever stored in this instance of the
 * ipnet_kstats_t structure.
 */
static ipnet_kstats_t stats_template = {
        { "duplicationFail",    KSTAT_DATA_UINT64 },
        { "dispatchOk",         KSTAT_DATA_UINT64 },
        { "dispatchFail",       KSTAT_DATA_UINT64 },
        { "dispatchHeaderDrop", KSTAT_DATA_UINT64 },
        { "dispatchDupDrop",    KSTAT_DATA_UINT64 },
        { "dispatchDeliver",    KSTAT_DATA_UINT64 },
        { "acceptOk",           KSTAT_DATA_UINT64 },
        { "acceptFail",         KSTAT_DATA_UINT64 }
};
 230 
 231 /*
 232  * Walk the list of physical interfaces on the machine, for each
 233  * interface create a new ipnetif_t and add any addresses to it. We
 234  * need to do the walk twice, once for IPv4 and once for IPv6.
 235  *
 236  * The interfaces are destroyed as part of ipnet_stack_fini() for each
 237  * stack.  Note that we cannot do this initialization in
 238  * ipnet_stack_init(), since ipnet_stack_init() cannot fail.
 239  */
 240 static int
 241 ipnetif_init(void)
 242 {
 243         netstack_handle_t       nh;
 244         netstack_t              *ns;
 245         ipnet_stack_t           *ips;
 246         int                     ret = 0;
 247 
 248         netstack_next_init(&nh);
 249         while ((ns = netstack_next(&nh)) != NULL) {
 250                 ips = ns->netstack_ipnet;
 251                 if ((ret = ipnet_populate_if(ips->ips_ndv4, ips, B_FALSE)) == 0)
 252                         ret = ipnet_populate_if(ips->ips_ndv6, ips, B_TRUE);
 253                 netstack_rele(ns);
 254                 if (ret != 0)
 255                         break;
 256         }
 257         netstack_next_fini(&nh);
 258         return (ret);
 259 }
 260 
 261 /*
 262  * Standard module entry points.
 263  */
 264 int
 265 _init(void)
 266 {
 267         int             ret;
 268         boolean_t       netstack_registered = B_FALSE;
 269 
 270         if ((ipnet_major = ddi_name_to_major("ipnet")) == (major_t)-1)
 271                 return (ENODEV);
 272         ipnet_minor_space = id_space_create("ipnet_minor_space",
 273             IPNET_MINOR_MIN, MAXMIN32);
 274 
 275         /*
 276          * We call ddi_taskq_create() with nthread == 1 to ensure in-order
 277          * delivery of packets to clients.  Note that we need to create the
 278          * taskqs before calling netstack_register() since ipnet_stack_init()
 279          * registers callbacks that use 'em.
 280          */
 281         ipnet_taskq = ddi_taskq_create(NULL, "ipnet", 1, TASKQ_DEFAULTPRI, 0);
 282         ipnet_nicevent_taskq = ddi_taskq_create(NULL, "ipnet_nic_event_queue",
 283             1, TASKQ_DEFAULTPRI, 0);
 284         if (ipnet_taskq == NULL || ipnet_nicevent_taskq == NULL) {
 285                 ret = ENOMEM;
 286                 goto done;
 287         }
 288 
 289         netstack_register(NS_IPNET, ipnet_stack_init, NULL, ipnet_stack_fini);
 290         netstack_registered = B_TRUE;
 291 
 292         if ((ret = ipnetif_init()) == 0)
 293                 ret = mod_install(&modlinkage);
 294 done:
 295         if (ret != 0) {
 296                 if (ipnet_taskq != NULL)
 297                         ddi_taskq_destroy(ipnet_taskq);
 298                 if (ipnet_nicevent_taskq != NULL)
 299                         ddi_taskq_destroy(ipnet_nicevent_taskq);
 300                 if (netstack_registered)
 301                         netstack_unregister(NS_IPNET);
 302                 id_space_destroy(ipnet_minor_space);
 303         }
 304         return (ret);
 305 }
 306 
 307 int
 308 _fini(void)
 309 {
 310         int     err;
 311 
 312         if ((err = mod_remove(&modlinkage)) != 0)
 313                 return (err);
 314 
 315         netstack_unregister(NS_IPNET);
 316         ddi_taskq_destroy(ipnet_nicevent_taskq);
 317         ddi_taskq_destroy(ipnet_taskq);
 318         id_space_destroy(ipnet_minor_space);
 319         return (0);
 320 }
 321 
 322 int
 323 _info(struct modinfo *modinfop)
 324 {
 325         return (mod_info(&modlinkage, modinfop));
 326 }
 327 
 328 static void
 329 ipnet_register_netihook(ipnet_stack_t *ips)
 330 {
 331         int             ret;
 332         zoneid_t        zoneid;
 333         netid_t         netid;
 334 
 335         HOOK_INIT(ips->ips_nicevents, ipnet_nicevent_cb, "ipnet_nicevents",
 336             ips);
 337 
 338         /*
 339          * It is possible for an exclusive stack to be in the process of
 340          * shutting down here, and the netid and protocol lookups could fail
 341          * in that case.
 342          */
 343         zoneid = netstackid_to_zoneid(ips->ips_netstack->netstack_stackid);
 344         if ((netid = net_zoneidtonetid(zoneid)) == -1)
 345                 return;
 346 
 347         if ((ips->ips_ndv4 = net_protocol_lookup(netid, NHF_INET)) != NULL) {
 348                 if ((ret = net_hook_register(ips->ips_ndv4, NH_NIC_EVENTS,
 349                     ips->ips_nicevents)) != 0) {
 350                         VERIFY(net_protocol_release(ips->ips_ndv4) == 0);
 351                         ips->ips_ndv4 = NULL;
 352                         cmn_err(CE_WARN, "unable to register IPv4 netinfo hooks"
 353                             " in zone %d: %d", zoneid, ret);
 354                 }
 355         }
 356         if ((ips->ips_ndv6 = net_protocol_lookup(netid, NHF_INET6)) != NULL) {
 357                 if ((ret = net_hook_register(ips->ips_ndv6, NH_NIC_EVENTS,
 358                     ips->ips_nicevents)) != 0) {
 359                         VERIFY(net_protocol_release(ips->ips_ndv6) == 0);
 360                         ips->ips_ndv6 = NULL;
 361                         cmn_err(CE_WARN, "unable to register IPv6 netinfo hooks"
 362                             " in zone %d: %d", zoneid, ret);
 363                 }
 364         }
 365 
 366         /*
 367          * Create a local set of kstats for each zone.
 368          */
 369         ips->ips_kstatp = net_kstat_create(netid, "ipnet", 0, "ipnet_stats",
 370             "misc", KSTAT_TYPE_NAMED,
 371             sizeof (ipnet_kstats_t) / sizeof (kstat_named_t), 0);
 372         if (ips->ips_kstatp != NULL) {
 373                 bcopy(&stats_template, &ips->ips_stats,
 374                     sizeof (ips->ips_stats));
 375                 ips->ips_kstatp->ks_data = &ips->ips_stats;
 376                 ips->ips_kstatp->ks_private =
 377                     (void *)(uintptr_t)ips->ips_netstack->netstack_stackid;
 378                 kstat_install(ips->ips_kstatp);
 379         } else {
 380                 cmn_err(CE_WARN, "net_kstat_create(%s,%s,%s) failed",
 381                     "ipnet", "ipnet_stats", "misc");
 382         }
 383 }
 384 
 385 /*
 386  * This function is called on attach to build an initial view of the
 387  * interfaces on the system. It will be called once for IPv4 and once
 388  * for IPv6, although there is only one ipnet interface for both IPv4
 389  * and IPv6 there are separate address lists.
 390  */
 391 static int
 392 ipnet_populate_if(net_handle_t nd, ipnet_stack_t *ips, boolean_t isv6)
 393 {
 394         phy_if_t        phyif;
 395         lif_if_t        lif;
 396         ipnetif_t       *ipnetif;
 397         char            name[LIFNAMSIZ];
 398         boolean_t       new_if = B_FALSE;
 399         uint64_t        ifflags;
 400         int             ret = 0;
 401 
 402         /*
 403          * If ipnet_register_netihook() was unable to initialize this
 404          * stack's net_handle_t, then we cannot populate any interface
 405          * information.  This usually happens when we attempted to
 406          * grab a net_handle_t as a stack was shutting down.  We don't
 407          * want to fail the entire _init() operation because of a
 408          * stack shutdown (other stacks will continue to work just
 409          * fine), so we silently return success here.
 410          */
 411         if (nd == NULL)
 412                 return (0);
 413 
 414         /*
 415          * Make sure we're not processing NIC events during the
 416          * population of our interfaces and address lists.
 417          */
 418         mutex_enter(&ips->ips_event_lock);
 419 
 420         for (phyif = net_phygetnext(nd, 0); phyif != 0;
 421             phyif = net_phygetnext(nd, phyif)) {
 422                 if (net_getifname(nd, phyif, name, LIFNAMSIZ) != 0)
 423                         continue;
 424                 ifflags =  0;
 425                 (void) net_getlifflags(nd, phyif, 0, &ifflags);
 426                 if ((ipnetif = ipnetif_getby_index(phyif, ips)) == NULL) {
 427                         ipnetif = ipnetif_create(name, phyif, ips, ifflags);
 428                         if (ipnetif == NULL) {
 429                                 ret = ENOMEM;
 430                                 goto done;
 431                         }
 432                         new_if = B_TRUE;
 433                 }
 434                 ipnetif->if_flags |=
 435                     isv6 ? IPNETIF_IPV6PLUMBED : IPNETIF_IPV4PLUMBED;
 436 
 437                 for (lif = net_lifgetnext(nd, phyif, 0); lif != 0;
 438                     lif = net_lifgetnext(nd, phyif, lif)) {
 439                         /*
 440                          * Skip addresses that aren't up.  We'll add
 441                          * them when we receive an NE_LIF_UP event.
 442                          */
 443                         if (net_getlifflags(nd, phyif, lif, &ifflags) != 0 ||
 444                             !(ifflags & IFF_UP))
 445                                 continue;
 446                         /* Don't add it if we already have it. */
 447                         if (ipnet_match_lif(ipnetif, lif, isv6) != NULL)
 448                                 continue;
 449                         ipnet_add_ifaddr(lif, ipnetif, nd);
 450                 }
 451                 if (!new_if)
 452                         ipnetif_refrele(ipnetif);
 453         }
 454 
 455 done:
 456         mutex_exit(&ips->ips_event_lock);
 457         return (ret);
 458 }
 459 
 460 static int
 461 ipnet_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 462 {
 463         if (cmd != DDI_ATTACH)
 464                 return (DDI_FAILURE);
 465 
 466         if (ddi_create_minor_node(dip, "lo0", S_IFCHR, IPNET_MINOR_LO,
 467             DDI_PSEUDO, 0) == DDI_FAILURE)
 468                 return (DDI_FAILURE);
 469 
 470         ipnet_dip = dip;
 471         return (DDI_SUCCESS);
 472 }
 473 
 474 static int
 475 ipnet_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
 476 {
 477         if (cmd != DDI_DETACH)
 478                 return (DDI_FAILURE);
 479 
 480         ASSERT(dip == ipnet_dip);
 481         ddi_remove_minor_node(ipnet_dip, NULL);
 482         ipnet_dip = NULL;
 483         return (DDI_SUCCESS);
 484 }
 485 
 486 /* ARGSUSED */
 487 static int
 488 ipnet_devinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
 489 {
 490         int     error = DDI_FAILURE;
 491 
 492         switch (infocmd) {
 493         case DDI_INFO_DEVT2INSTANCE:
 494                 *result = (void *)0;
 495                 error = DDI_SUCCESS;
 496                 break;
 497         case DDI_INFO_DEVT2DEVINFO:
 498                 if (ipnet_dip != NULL) {
 499                         *result = ipnet_dip;
 500                         error = DDI_SUCCESS;
 501                 }
 502                 break;
 503         }
 504         return (error);
 505 }
 506 
 507 /* ARGSUSED */
 508 static int
 509 ipnet_open(queue_t *rq, dev_t *dev, int oflag, int sflag, cred_t *crp)
 510 {
 511         ipnet_t         *ipnet;
 512         netstack_t      *ns = NULL;
 513         ipnet_stack_t   *ips;
 514         int             err = 0;
 515         zoneid_t        zoneid = crgetzoneid(crp);
 516 
 517         /*
 518          * If the system is labeled, only the global zone is allowed to open
 519          * IP observability nodes.
 520          */
 521         if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
 522                 return (EACCES);
 523 
 524         /* We don't support open as a module */
 525         if (sflag & MODOPEN)
 526                 return (ENOTSUP);
 527 
 528         /* This driver is self-cloning, we don't support re-open. */
 529         if (rq->q_ptr != NULL)
 530                 return (EBUSY);
 531 
 532         if ((ipnet = kmem_zalloc(sizeof (*ipnet), KM_NOSLEEP)) == NULL)
 533                 return (ENOMEM);
 534 
 535         VERIFY((ns = netstack_find_by_cred(crp)) != NULL);
 536         ips = ns->netstack_ipnet;
 537 
 538         rq->q_ptr = WR(rq)->q_ptr = ipnet;
 539         ipnet->ipnet_rq = rq;
 540         ipnet->ipnet_minor = (minor_t)id_alloc(ipnet_minor_space);
 541         ipnet->ipnet_zoneid = zoneid;
 542         ipnet->ipnet_dlstate = DL_UNBOUND;
 543         ipnet->ipnet_ns = ns;
 544 
 545         /*
 546          * We need to hold ips_event_lock here as any NE_LIF_DOWN events need
 547          * to be processed after ipnet_if is set and the ipnet_t has been
 548          * inserted in the ips_str_list.
 549          */
 550         mutex_enter(&ips->ips_event_lock);
 551         if (getminor(*dev) == IPNET_MINOR_LO) {
 552                 ipnet->ipnet_flags |= IPNET_LOMODE;
 553                 ipnet->ipnet_acceptfn = ipnet_loaccept;
 554         } else {
 555                 ipnet->ipnet_acceptfn = ipnet_accept;
 556                 ipnet->ipnet_if = ipnetif_getby_dev(*dev, ips);
 557                 if (ipnet->ipnet_if == NULL ||
 558                     !ipnetif_in_zone(ipnet->ipnet_if, zoneid, ips)) {
 559                         err = ENODEV;
 560                         goto done;
 561                 }
 562         }
 563 
 564         mutex_enter(&ips->ips_walkers_lock);
 565         while (ips->ips_walkers_cnt != 0)
 566                 cv_wait(&ips->ips_walkers_cv, &ips->ips_walkers_lock);
 567         list_insert_head(&ips->ips_str_list, ipnet);
 568         *dev = makedevice(getmajor(*dev), ipnet->ipnet_minor);
 569         qprocson(rq);
 570 
 571         /*
 572          * Only register our callback if we're the first open client; we call
 573          * unregister in close() for the last open client.
 574          */
 575         if (list_head(&ips->ips_str_list) == list_tail(&ips->ips_str_list))
 576                 ips->ips_hook = ipobs_register_hook(ns, ipnet_input);
 577         mutex_exit(&ips->ips_walkers_lock);
 578 
 579 done:
 580         mutex_exit(&ips->ips_event_lock);
 581         if (err != 0) {
 582                 netstack_rele(ns);
 583                 id_free(ipnet_minor_space, ipnet->ipnet_minor);
 584                 if (ipnet->ipnet_if != NULL)
 585                         ipnetif_refrele(ipnet->ipnet_if);
 586                 kmem_free(ipnet, sizeof (*ipnet));
 587         }
 588         return (err);
 589 }
 590 
 591 /* ARGSUSED */
 592 static int
 593 ipnet_close(queue_t *rq, int flags __unused, cred_t *credp __unused)
 594 {
 595         ipnet_t         *ipnet = rq->q_ptr;
 596         ipnet_stack_t   *ips = ipnet->ipnet_ns->netstack_ipnet;
 597 
 598         if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS)
 599                 ipnet_leave_allmulti(ipnet->ipnet_if, ips);
 600         if (ipnet->ipnet_flags & IPNET_PROMISC_MULTI)
 601                 ipnet_leave_allmulti(ipnet->ipnet_if, ips);
 602 
 603         mutex_enter(&ips->ips_walkers_lock);
 604         while (ips->ips_walkers_cnt != 0)
 605                 cv_wait(&ips->ips_walkers_cv, &ips->ips_walkers_lock);
 606 
 607         qprocsoff(rq);
 608 
 609         list_remove(&ips->ips_str_list, ipnet);
 610         if (ipnet->ipnet_if != NULL)
 611                 ipnetif_refrele(ipnet->ipnet_if);
 612         id_free(ipnet_minor_space, ipnet->ipnet_minor);
 613 
 614         if (list_is_empty(&ips->ips_str_list)) {
 615                 ipobs_unregister_hook(ips->ips_netstack, ips->ips_hook);
 616                 ips->ips_hook = NULL;
 617         }
 618 
 619         kmem_free(ipnet, sizeof (*ipnet));
 620 
 621         mutex_exit(&ips->ips_walkers_lock);
 622         netstack_rele(ips->ips_netstack);
 623         return (0);
 624 }
 625 
 626 static int
 627 ipnet_wput(queue_t *q, mblk_t *mp)
 628 {
 629         switch (mp->b_datap->db_type) {
 630         case M_FLUSH:
 631                 if (*mp->b_rptr & FLUSHW) {
 632                         flushq(q, FLUSHDATA);
 633                         *mp->b_rptr &= ~FLUSHW;
 634                 }
 635                 if (*mp->b_rptr & FLUSHR)
 636                         qreply(q, mp);
 637                 else
 638                         freemsg(mp);
 639                 break;
 640         case M_PROTO:
 641         case M_PCPROTO:
 642                 ipnet_wputnondata(q, mp);
 643                 break;
 644         case M_IOCTL:
 645                 ipnet_ioctl(q, mp);
 646                 break;
 647         case M_IOCDATA:
 648                 ipnet_iocdata(q, mp);
 649                 break;
 650         default:
 651                 freemsg(mp);
 652                 break;
 653         }
 654         return (0);
 655 }
 656 
 657 static int
 658 ipnet_rsrv(queue_t *q)
 659 {
 660         mblk_t  *mp;
 661 
 662         while ((mp = getq(q)) != NULL) {
 663                 ASSERT(DB_TYPE(mp) == M_DATA);
 664                 if (canputnext(q)) {
 665                         putnext(q, mp);
 666                 } else {
 667                         (void) putbq(q, mp);
 668                         break;
 669                 }
 670         }
 671         return (0);
 672 }
 673 
 674 static void
 675 ipnet_ioctl(queue_t *q, mblk_t *mp)
 676 {
 677         struct iocblk   *iocp = (struct iocblk *)mp->b_rptr;
 678 
 679         switch (iocp->ioc_cmd) {
 680         case DLIOCRAW:
 681                 miocack(q, mp, 0, 0);
 682                 break;
 683         case DLIOCIPNETINFO:
 684                 if (iocp->ioc_count == TRANSPARENT) {
 685                         mcopyin(mp, NULL, sizeof (uint_t), NULL);
 686                         qreply(q, mp);
 687                         break;
 688                 }
 689                 /* We don't support I_STR with DLIOCIPNETINFO. */
 690                 /* FALLTHROUGH */
 691         default:
 692                 miocnak(q, mp, 0, EINVAL);
 693                 break;
 694         }
 695 }
 696 
 697 static void
 698 ipnet_iocdata(queue_t *q, mblk_t *mp)
 699 {
 700         struct iocblk   *iocp = (struct iocblk *)mp->b_rptr;
 701         ipnet_t *ipnet = q->q_ptr;
 702 
 703         switch (iocp->ioc_cmd) {
 704         case DLIOCIPNETINFO:
 705                 if (*(int *)mp->b_cont->b_rptr == 1)
 706                         ipnet->ipnet_flags |= IPNET_INFO;
 707                 else if (*(int *)mp->b_cont->b_rptr == 0)
 708                         ipnet->ipnet_flags &= ~IPNET_INFO;
 709                 else
 710                         goto iocnak;
 711                 miocack(q, mp, 0, DL_IPNETINFO_VERSION);
 712                 break;
 713         default:
 714 iocnak:
 715                 miocnak(q, mp, 0, EINVAL);
 716                 break;
 717         }
 718 }
 719 
 720 static void
 721 ipnet_wputnondata(queue_t *q, mblk_t *mp)
 722 {
 723         union DL_primitives     *dlp = (union DL_primitives *)mp->b_rptr;
 724         t_uscalar_t             prim = dlp->dl_primitive;
 725 
 726         switch (prim) {
 727         case DL_INFO_REQ:
 728                 ipnet_inforeq(q, mp);
 729                 break;
 730         case DL_UNBIND_REQ:
 731                 ipnet_unbindreq(q, mp);
 732                 break;
 733         case DL_BIND_REQ:
 734                 ipnet_bindreq(q, mp);
 735                 break;
 736         case DL_PROMISCON_REQ:
 737                 ipnet_dlpromisconreq(q, mp);
 738                 break;
 739         case DL_PROMISCOFF_REQ:
 740                 ipnet_dlpromiscoffreq(q, mp);
 741                 break;
 742         case DL_UNITDATA_REQ:
 743         case DL_DETACH_REQ:
 744         case DL_PHYS_ADDR_REQ:
 745         case DL_SET_PHYS_ADDR_REQ:
 746         case DL_ENABMULTI_REQ:
 747         case DL_DISABMULTI_REQ:
 748         case DL_ATTACH_REQ:
 749                 dlerrorack(q, mp, prim, DL_UNSUPPORTED, 0);
 750                 break;
 751         default:
 752                 dlerrorack(q, mp, prim, DL_BADPRIM, 0);
 753                 break;
 754         }
 755 }
 756 
 757 static void
 758 ipnet_inforeq(queue_t *q, mblk_t *mp)
 759 {
 760         dl_info_ack_t   *dlip;
 761         size_t          size = sizeof (dl_info_ack_t) + sizeof (ushort_t);
 762 
 763         if (MBLKL(mp) < DL_INFO_REQ_SIZE) {
 764                 dlerrorack(q, mp, DL_INFO_REQ, DL_BADPRIM, 0);
 765                 return;
 766         }
 767 
 768         if ((mp = mexchange(q, mp, size, M_PCPROTO, DL_INFO_ACK)) == NULL)
 769                 return;
 770 
 771         dlip = (dl_info_ack_t *)mp->b_rptr;
 772         *dlip = ipnet_infoack;
 773         qreply(q, mp);
 774 }
 775 
 776 static void
 777 ipnet_bindreq(queue_t *q, mblk_t *mp)
 778 {
 779         union DL_primitives     *dlp = (union DL_primitives *)mp->b_rptr;
 780         ipnet_t                 *ipnet = q->q_ptr;
 781 
 782         if (MBLKL(mp) < DL_BIND_REQ_SIZE) {
 783                 dlerrorack(q, mp, DL_BIND_REQ, DL_BADPRIM, 0);
 784                 return;
 785         }
 786 
 787         switch (dlp->bind_req.dl_sap) {
 788         case 0 :
 789                 ipnet->ipnet_family = AF_UNSPEC;
 790                 break;
 791         case IPV4_VERSION :
 792                 ipnet->ipnet_family = AF_INET;
 793                 break;
 794         case IPV6_VERSION :
 795                 ipnet->ipnet_family = AF_INET6;
 796                 break;
 797         default :
 798                 dlerrorack(q, mp, DL_BIND_REQ, DL_BADSAP, 0);
 799                 return;
 800                 /*NOTREACHED*/
 801         }
 802 
 803         ipnet->ipnet_dlstate = DL_IDLE;
 804         dlbindack(q, mp, dlp->bind_req.dl_sap, 0, 0, 0, 0);
 805 }
 806 
 807 static void
 808 ipnet_unbindreq(queue_t *q, mblk_t *mp)
 809 {
 810         ipnet_t *ipnet = q->q_ptr;
 811 
 812         if (MBLKL(mp) < DL_UNBIND_REQ_SIZE) {
 813                 dlerrorack(q, mp, DL_UNBIND_REQ, DL_BADPRIM, 0);
 814                 return;
 815         }
 816 
 817         if (ipnet->ipnet_dlstate != DL_IDLE) {
 818                 dlerrorack(q, mp, DL_UNBIND_REQ, DL_OUTSTATE, 0);
 819         } else {
 820                 ipnet->ipnet_dlstate = DL_UNBOUND;
 821                 ipnet->ipnet_family = AF_UNSPEC;
 822                 dlokack(q, mp, DL_UNBIND_REQ);
 823         }
 824 }
 825 
 826 static void
 827 ipnet_dlpromisconreq(queue_t *q, mblk_t *mp)
 828 {
 829         ipnet_t         *ipnet = q->q_ptr;
 830         t_uscalar_t     level;
 831         int             err;
 832 
 833         if (MBLKL(mp) < DL_PROMISCON_REQ_SIZE) {
 834                 dlerrorack(q, mp, DL_PROMISCON_REQ, DL_BADPRIM, 0);
 835                 return;
 836         }
 837 
 838         if (ipnet->ipnet_flags & IPNET_LOMODE) {
 839                 dlokack(q, mp, DL_PROMISCON_REQ);
 840                 return;
 841         }
 842 
 843         level = ((dl_promiscon_req_t *)mp->b_rptr)->dl_level;
 844         if (level == DL_PROMISC_PHYS || level == DL_PROMISC_MULTI) {
 845                 if ((err = ipnet_join_allmulti(ipnet->ipnet_if,
 846                     ipnet->ipnet_ns->netstack_ipnet)) != 0) {
 847                         dlerrorack(q, mp, DL_PROMISCON_REQ, DL_SYSERR, err);
 848                         return;
 849                 }
 850         }
 851 
 852         switch (level) {
 853         case DL_PROMISC_PHYS:
 854                 ipnet->ipnet_flags |= IPNET_PROMISC_PHYS;
 855                 break;
 856         case DL_PROMISC_SAP:
 857                 ipnet->ipnet_flags |= IPNET_PROMISC_SAP;
 858                 break;
 859         case DL_PROMISC_MULTI:
 860                 ipnet->ipnet_flags |= IPNET_PROMISC_MULTI;
 861                 break;
 862         default:
 863                 dlerrorack(q, mp, DL_PROMISCON_REQ, DL_BADPRIM, 0);
 864                 return;
 865         }
 866 
 867         dlokack(q, mp, DL_PROMISCON_REQ);
 868 }
 869 
 870 static void
 871 ipnet_dlpromiscoffreq(queue_t *q, mblk_t *mp)
 872 {
 873         ipnet_t         *ipnet = q->q_ptr;
 874         t_uscalar_t     level;
 875         uint16_t        orig_ipnet_flags = ipnet->ipnet_flags;
 876 
 877         if (MBLKL(mp) < DL_PROMISCOFF_REQ_SIZE) {
 878                 dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_BADPRIM, 0);
 879                 return;
 880         }
 881 
 882         if (ipnet->ipnet_flags & IPNET_LOMODE) {
 883                 dlokack(q, mp, DL_PROMISCOFF_REQ);
 884                 return;
 885         }
 886 
 887         level = ((dl_promiscon_req_t *)mp->b_rptr)->dl_level;
 888         switch (level) {
 889         case DL_PROMISC_PHYS:
 890                 if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS)
 891                         ipnet->ipnet_flags &= ~IPNET_PROMISC_PHYS;
 892                 break;
 893         case DL_PROMISC_SAP:
 894                 if (ipnet->ipnet_flags & IPNET_PROMISC_SAP)
 895                         ipnet->ipnet_flags &= ~IPNET_PROMISC_SAP;
 896                 break;
 897         case DL_PROMISC_MULTI:
 898                 if (ipnet->ipnet_flags & IPNET_PROMISC_MULTI)
 899                         ipnet->ipnet_flags &= ~IPNET_PROMISC_MULTI;
 900                 break;
 901         default:
 902                 dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_BADPRIM, 0);
 903                 return;
 904         }
 905 
 906         if (orig_ipnet_flags == ipnet->ipnet_flags) {
 907                 dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_NOTENAB, 0);
 908                 return;
 909         }
 910 
 911         if (level == DL_PROMISC_PHYS || level == DL_PROMISC_MULTI) {
 912                 ipnet_leave_allmulti(ipnet->ipnet_if,
 913                     ipnet->ipnet_ns->netstack_ipnet);
 914         }
 915 
 916         dlokack(q, mp, DL_PROMISCOFF_REQ);
 917 }
 918 
 919 static int
 920 ipnet_join_allmulti(ipnetif_t *ipnetif, ipnet_stack_t *ips)
 921 {
 922         int             err = 0;
 923         ip_stack_t      *ipst = ips->ips_netstack->netstack_ip;
 924         uint64_t        index = ipnetif->if_index;
 925 
 926         mutex_enter(&ips->ips_event_lock);
 927         if (ipnetif->if_multicnt == 0) {
 928                 ASSERT((ipnetif->if_flags &
 929                     (IPNETIF_IPV4ALLMULTI | IPNETIF_IPV6ALLMULTI)) == 0);
 930                 if (ipnetif->if_flags & IPNETIF_IPV4PLUMBED) {
 931                         err = ip_join_allmulti(index, B_FALSE, ipst);
 932                         if (err != 0)
 933                                 goto done;
 934                         ipnetif->if_flags |= IPNETIF_IPV4ALLMULTI;
 935                 }
 936                 if (ipnetif->if_flags & IPNETIF_IPV6PLUMBED) {
 937                         err = ip_join_allmulti(index, B_TRUE, ipst);
 938                         if (err != 0 &&
 939                             (ipnetif->if_flags & IPNETIF_IPV4ALLMULTI)) {
 940                                 (void) ip_leave_allmulti(index, B_FALSE, ipst);
 941                                 ipnetif->if_flags &= ~IPNETIF_IPV4ALLMULTI;
 942                                 goto done;
 943                         }
 944                         ipnetif->if_flags |= IPNETIF_IPV6ALLMULTI;
 945                 }
 946         }
 947         ipnetif->if_multicnt++;
 948 
 949 done:
 950         mutex_exit(&ips->ips_event_lock);
 951         return (err);
 952 }
 953 
 954 static void
 955 ipnet_leave_allmulti(ipnetif_t *ipnetif, ipnet_stack_t *ips)
 956 {
 957         int             err;
 958         ip_stack_t      *ipst = ips->ips_netstack->netstack_ip;
 959         uint64_t        index = ipnetif->if_index;
 960 
 961         mutex_enter(&ips->ips_event_lock);
 962         ASSERT(ipnetif->if_multicnt != 0);
 963         if (--ipnetif->if_multicnt == 0) {
 964                 if (ipnetif->if_flags & IPNETIF_IPV4ALLMULTI) {
 965                         err = ip_leave_allmulti(index, B_FALSE, ipst);
 966                         ASSERT(err == 0 || err == ENODEV);
 967                         ipnetif->if_flags &= ~IPNETIF_IPV4ALLMULTI;
 968                 }
 969                 if (ipnetif->if_flags & IPNETIF_IPV6ALLMULTI) {
 970                         err = ip_leave_allmulti(index, B_TRUE, ipst);
 971                         ASSERT(err == 0 || err == ENODEV);
 972                         ipnetif->if_flags &= ~IPNETIF_IPV6ALLMULTI;
 973                 }
 974         }
 975         mutex_exit(&ips->ips_event_lock);
 976 }
 977 
 978 /*
 979  * Allocate a new mblk_t and put a dl_ipnetinfo_t in it.
 980  * The structure it copies the header information from,
 981  * hook_pkt_observe_t, is constructed using network byte
 982  * order in ipobs_hook(), so there is no conversion here.
 983  */
 984 static mblk_t *
 985 ipnet_addheader(hook_pkt_observe_t *hdr, mblk_t *mp)
 986 {
 987         mblk_t          *dlhdr;
 988         dl_ipnetinfo_t  *dl;
 989 
 990         if ((dlhdr = allocb(sizeof (dl_ipnetinfo_t), BPRI_HI)) == NULL) {
 991                 freemsg(mp);
 992                 return (NULL);
 993         }
 994         dl = (dl_ipnetinfo_t *)dlhdr->b_rptr;
 995         dl->dli_version = DL_IPNETINFO_VERSION;
 996         dl->dli_family = hdr->hpo_family;
 997         dl->dli_htype = hdr->hpo_htype;
 998         dl->dli_pktlen = hdr->hpo_pktlen;
 999         dl->dli_ifindex = hdr->hpo_ifindex;
1000         dl->dli_grifindex = hdr->hpo_grifindex;
1001         dl->dli_zsrc = hdr->hpo_zsrc;
1002         dl->dli_zdst = hdr->hpo_zdst;
1003         dlhdr->b_wptr += sizeof (*dl);
1004         dlhdr->b_cont = mp;
1005 
1006         return (dlhdr);
1007 }
1008 
1009 static ipnet_addrtype_t
1010 ipnet_get_addrtype(ipnet_t *ipnet, ipnet_addrp_t *addr)
1011 {
1012         list_t                  *list;
1013         ipnetif_t               *ipnetif = ipnet->ipnet_if;
1014         ipnetif_addr_t          *ifaddr;
1015         ipnet_addrtype_t        addrtype = IPNETADDR_UNKNOWN;
1016 
1017         /* First check if the address is multicast or limited broadcast. */
1018         switch (addr->iap_family) {
1019         case AF_INET:
1020                 if (CLASSD(*(addr->iap_addr4)) ||
1021                     *(addr->iap_addr4) == INADDR_BROADCAST)
1022                         return (IPNETADDR_MBCAST);
1023                 break;
1024         case AF_INET6:
1025                 if (IN6_IS_ADDR_MULTICAST(addr->iap_addr6))
1026                         return (IPNETADDR_MBCAST);
1027                 break;
1028         }
1029 
1030         /*
1031          * Walk the address list to see if the address belongs to our
1032          * interface or is one of our subnet broadcast addresses.
1033          */
1034         mutex_enter(&ipnetif->if_addr_lock);
1035         list = (addr->iap_family == AF_INET) ?
1036             &ipnetif->if_ip4addr_list : &ipnetif->if_ip6addr_list;
1037         for (ifaddr = list_head(list);
1038             ifaddr != NULL && addrtype == IPNETADDR_UNKNOWN;
1039             ifaddr = list_next(list, ifaddr)) {
1040                 /*
1041                  * If we're not in the global zone, then only look at
1042                  * addresses in our zone.
1043                  */
1044                 if (ipnet->ipnet_zoneid != GLOBAL_ZONEID &&
1045                     ipnet->ipnet_zoneid != ifaddr->ifa_zone)
1046                         continue;
1047                 switch (addr->iap_family) {
1048                 case AF_INET:
1049                         if (ifaddr->ifa_ip4addr != INADDR_ANY &&
1050                             *(addr->iap_addr4) == ifaddr->ifa_ip4addr)
1051                                 addrtype = IPNETADDR_MYADDR;
1052                         else if (ifaddr->ifa_brdaddr != INADDR_ANY &&
1053                             *(addr->iap_addr4) == ifaddr->ifa_brdaddr)
1054                                 addrtype = IPNETADDR_MBCAST;
1055                         break;
1056                 case AF_INET6:
1057                         if (IN6_ARE_ADDR_EQUAL(addr->iap_addr6,
1058                             &ifaddr->ifa_ip6addr))
1059                                 addrtype = IPNETADDR_MYADDR;
1060                         break;
1061                 }
1062         }
1063         mutex_exit(&ipnetif->if_addr_lock);
1064 
1065         return (addrtype);
1066 }
1067 
1068 /*
1069  * Verify if the packet contained in hdr should be passed up to the
1070  * ipnet client stream.
1071  */
1072 static boolean_t
1073 ipnet_accept(ipnet_t *ipnet, hook_pkt_observe_t *hdr, ipnet_addrp_t *src,
1074     ipnet_addrp_t *dst)
1075 {
1076         boolean_t               obsif;
1077         uint64_t                ifindex = ipnet->ipnet_if->if_index;
1078         ipnet_addrtype_t        srctype;
1079         ipnet_addrtype_t        dsttype;
1080 
1081         srctype = ipnet_get_addrtype(ipnet, src);
1082         dsttype = ipnet_get_addrtype(ipnet, dst);
1083 
1084         /*
1085          * If the packet's ifindex matches ours, or the packet's group ifindex
1086          * matches ours, it's on the interface we're observing.  (Thus,
1087          * observing on the group ifindex matches all ifindexes in the group.)
1088          */
1089         obsif = (ntohl(hdr->hpo_ifindex) == ifindex ||
1090             ntohl(hdr->hpo_grifindex) == ifindex);
1091 
1092         DTRACE_PROBE5(ipnet_accept__addr,
1093             ipnet_addrtype_t, srctype, ipnet_addrp_t *, src,
1094             ipnet_addrtype_t, dsttype, ipnet_addrp_t *, dst,
1095             boolean_t, obsif);
1096 
1097         /*
1098          * Do not allow an ipnet stream to see packets that are not from or to
1099          * its zone.  The exception is when zones are using the shared stack
1100          * model.  In this case, streams in the global zone have visibility
1101          * into other shared-stack zones, and broadcast and multicast traffic
1102          * is visible by all zones in the stack.
1103          */
1104         if (ipnet->ipnet_zoneid != GLOBAL_ZONEID &&
1105             dsttype != IPNETADDR_MBCAST) {
1106                 if (ipnet->ipnet_zoneid != ntohl(hdr->hpo_zsrc) &&
1107                     ipnet->ipnet_zoneid != ntohl(hdr->hpo_zdst))
1108                         return (B_FALSE);
1109         }
1110 
1111         /*
1112          * If DL_PROMISC_SAP isn't enabled, then the bound SAP must match the
1113          * packet's IP version.
1114          */
1115         if (!(ipnet->ipnet_flags & IPNET_PROMISC_SAP) &&
1116             ipnet->ipnet_family != hdr->hpo_family)
1117                 return (B_FALSE);
1118 
1119         /* If the destination address is ours, then accept the packet. */
1120         if (dsttype == IPNETADDR_MYADDR)
1121                 return (B_TRUE);
1122 
1123         /*
1124          * If DL_PROMISC_PHYS is enabled, then we can see all packets that are
1125          * sent or received on the interface we're observing, or packets that
1126          * have our source address (this allows us to see packets we send).
1127          */
1128         if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS) {
1129                 if (srctype == IPNETADDR_MYADDR || obsif)
1130                         return (B_TRUE);
1131         }
1132 
1133         /*
1134          * We accept multicast and broadcast packets transmitted or received
1135          * on the interface we're observing.
1136          */
1137         if (dsttype == IPNETADDR_MBCAST && obsif)
1138                 return (B_TRUE);
1139 
1140         return (B_FALSE);
1141 }
1142 
1143 /*
1144  * Verify if the packet contained in hdr should be passed up to the ipnet
1145  * client stream that's in IPNET_LOMODE.
1146  */
1147 /* ARGSUSED */
1148 static boolean_t
1149 ipnet_loaccept(ipnet_t *ipnet, hook_pkt_observe_t *hdr, ipnet_addrp_t *src,
1150     ipnet_addrp_t *dst)
1151 {
1152         if (hdr->hpo_htype != htons(IPOBS_HOOK_LOCAL)) {
1153                 /*
1154                  * ipnet_if is only NULL for IPNET_MINOR_LO devices.
1155                  */
1156                 if (ipnet->ipnet_if == NULL)
1157                         return (B_FALSE);
1158         }
1159 
1160         /*
1161          * An ipnet stream must not see packets that are not from/to its zone.
1162          */
1163         if (ipnet->ipnet_zoneid != GLOBAL_ZONEID) {
1164                 if (ipnet->ipnet_zoneid != ntohl(hdr->hpo_zsrc) &&
1165                     ipnet->ipnet_zoneid != ntohl(hdr->hpo_zdst))
1166                         return (B_FALSE);
1167         }
1168 
1169         return (ipnet->ipnet_family == AF_UNSPEC ||
1170             ipnet->ipnet_family == hdr->hpo_family);
1171 }
1172 
1173 static void
1174 ipnet_dispatch(void *arg)
1175 {
1176         mblk_t                  *mp = arg;
1177         hook_pkt_observe_t      *hdr = (hook_pkt_observe_t *)mp->b_rptr;
1178         ipnet_t                 *ipnet;
1179         mblk_t                  *netmp;
1180         list_t                  *list;
1181         ipnet_stack_t           *ips;
1182         ipnet_addrp_t           src;
1183         ipnet_addrp_t           dst;
1184 
1185         ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;
1186 
1187         netmp = hdr->hpo_pkt->b_cont;
1188         src.iap_family = hdr->hpo_family;
1189         dst.iap_family = hdr->hpo_family;
1190 
1191         if (hdr->hpo_family == AF_INET) {
1192                 src.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_src;
1193                 dst.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_dst;
1194         } else {
1195                 src.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_src;
1196                 dst.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_dst;
1197         }
1198 
1199         ipnet_walkers_inc(ips);
1200 
1201         list = &ips->ips_str_list;
1202         for (ipnet = list_head(list); ipnet != NULL;
1203             ipnet = list_next(list, ipnet)) {
1204                 if (!(*ipnet->ipnet_acceptfn)(ipnet, hdr, &src, &dst)) {
1205                         IPSK_BUMP(ips, ik_acceptFail);
1206                         continue;
1207                 }
1208                 IPSK_BUMP(ips, ik_acceptOk);
1209 
1210                 if (list_next(list, ipnet) == NULL) {
1211                         netmp = hdr->hpo_pkt->b_cont;
1212                         hdr->hpo_pkt->b_cont = NULL;
1213                 } else {
1214                         if ((netmp = dupmsg(hdr->hpo_pkt->b_cont)) == NULL &&
1215                             (netmp = copymsg(hdr->hpo_pkt->b_cont)) == NULL) {
1216                                 IPSK_BUMP(ips, ik_duplicationFail);
1217                                 continue;
1218                         }
1219                 }
1220 
1221                 if (ipnet->ipnet_flags & IPNET_INFO) {
1222                         if ((netmp = ipnet_addheader(hdr, netmp)) == NULL) {
1223                                 IPSK_BUMP(ips, ik_dispatchHeaderDrop);
1224                                 continue;
1225                         }
1226                 }
1227 
1228                 if (ipnet->ipnet_rq->q_first == NULL &&
1229                     canputnext(ipnet->ipnet_rq)) {
1230                         putnext(ipnet->ipnet_rq, netmp);
1231                         IPSK_BUMP(ips, ik_dispatchDeliver);
1232                 } else if (canput(ipnet->ipnet_rq)) {
1233                         (void) putq(ipnet->ipnet_rq, netmp);
1234                         IPSK_BUMP(ips, ik_dispatchDeliver);
1235                 } else {
1236                         freemsg(netmp);
1237                         IPSK_BUMP(ips, ik_dispatchPutDrop);
1238                 }
1239         }
1240 
1241         ipnet_walkers_dec(ips);
1242 
1243         freemsg(mp);
1244 }
1245 
1246 static void
1247 ipnet_input(mblk_t *mp)
1248 {
1249         hook_pkt_observe_t      *hdr = (hook_pkt_observe_t *)mp->b_rptr;
1250         ipnet_stack_t           *ips;
1251 
1252         ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;
1253 
1254         if (ddi_taskq_dispatch(ipnet_taskq, ipnet_dispatch, mp, DDI_NOSLEEP) !=
1255             DDI_SUCCESS) {
1256                 IPSK_BUMP(ips, ik_dispatchFail);
1257                 freemsg(mp);
1258         } else {
1259                 IPSK_BUMP(ips, ik_dispatchOk);
1260         }
1261 }
1262 
1263 static ipnetif_t *
1264 ipnet_alloc_if(ipnet_stack_t *ips)
1265 {
1266         ipnetif_t       *ipnetif;
1267 
1268         if ((ipnetif = kmem_zalloc(sizeof (*ipnetif), KM_NOSLEEP)) == NULL)
1269                 return (NULL);
1270 
1271         mutex_init(&ipnetif->if_addr_lock, NULL, MUTEX_DEFAULT, 0);
1272         list_create(&ipnetif->if_ip4addr_list, sizeof (ipnetif_addr_t),
1273             offsetof(ipnetif_addr_t, ifa_link));
1274         list_create(&ipnetif->if_ip6addr_list, sizeof (ipnetif_addr_t),
1275             offsetof(ipnetif_addr_t, ifa_link));
1276         mutex_init(&ipnetif->if_reflock, NULL, MUTEX_DEFAULT, 0);
1277 
1278         ipnetif->if_stackp = ips;
1279 
1280         return (ipnetif);
1281 }
1282 
1283 /*
1284  * Create a new ipnetif_t and new minor node for it.  If creation is
1285  * successful the new ipnetif_t is inserted into an avl_tree
1286  * containing ipnetif's for this stack instance.
1287  */
1288 static ipnetif_t *
1289 ipnetif_create(const char *name, uint64_t index, ipnet_stack_t *ips,
1290     uint64_t ifflags)
1291 {
1292         ipnetif_t       *ipnetif;
1293         avl_index_t     where = 0;
1294         minor_t         ifminor;
1295 
1296         /*
1297          * Because ipnetif_create() can be called from a NIC event
1298          * callback, it should not block.
1299          */
1300         ifminor = (minor_t)id_alloc_nosleep(ipnet_minor_space);
1301         if (ifminor == (minor_t)-1)
1302                 return (NULL);
1303         if ((ipnetif = ipnet_alloc_if(ips)) == NULL) {
1304                 id_free(ipnet_minor_space, ifminor);
1305                 return (NULL);
1306         }
1307 
1308         (void) strlcpy(ipnetif->if_name, name, LIFNAMSIZ);
1309         ipnetif->if_index = (uint_t)index;
1310         ipnetif->if_zoneid = netstack_get_zoneid(ips->ips_netstack);
1311         ipnetif->if_dev = makedevice(ipnet_major, ifminor);
1312 
1313         ipnetif->if_refcnt = 1;
1314         if ((ifflags & IFF_LOOPBACK) != 0)
1315                 ipnetif->if_flags = IPNETIF_LOOPBACK;
1316 
1317         mutex_enter(&ips->ips_avl_lock);
1318         VERIFY(avl_find(&ips->ips_avl_by_index, &index, &where) == NULL);
1319         avl_insert(&ips->ips_avl_by_index, ipnetif, where);
1320         VERIFY(avl_find(&ips->ips_avl_by_name, (void *)name, &where) == NULL);
1321         avl_insert(&ips->ips_avl_by_name, ipnetif, where);
1322         mutex_exit(&ips->ips_avl_lock);
1323 
1324         return (ipnetif);
1325 }
1326 
1327 static void
1328 ipnetif_remove(ipnetif_t *ipnetif, ipnet_stack_t *ips)
1329 {
1330         ipnet_t *ipnet;
1331 
1332         ipnet_walkers_inc(ips);
1333         /* Send a SIGHUP to all open streams associated with this ipnetif. */
1334         for (ipnet = list_head(&ips->ips_str_list); ipnet != NULL;
1335             ipnet = list_next(&ips->ips_str_list, ipnet)) {
1336                 if (ipnet->ipnet_if == ipnetif)
1337                         (void) putnextctl(ipnet->ipnet_rq, M_HANGUP);
1338         }
1339         ipnet_walkers_dec(ips);
1340         mutex_enter(&ips->ips_avl_lock);
1341         avl_remove(&ips->ips_avl_by_index, ipnetif);
1342         avl_remove(&ips->ips_avl_by_name, ipnetif);
1343         mutex_exit(&ips->ips_avl_lock);
1344         /*
1345          * Release the reference we implicitly held in ipnetif_create().
1346          */
1347         ipnetif_refrele(ipnetif);
1348 }
1349 
1350 static void
1351 ipnet_purge_addrlist(list_t *addrlist)
1352 {
1353         ipnetif_addr_t  *ifa;
1354 
1355         while ((ifa = list_head(addrlist)) != NULL) {
1356                 list_remove(addrlist, ifa);
1357                 if (ifa->ifa_shared != NULL)
1358                         ipnetif_clone_release(ifa->ifa_shared);
1359                 kmem_free(ifa, sizeof (*ifa));
1360         }
1361 }
1362 
1363 static void
1364 ipnetif_free(ipnetif_t *ipnetif)
1365 {
1366         ASSERT(ipnetif->if_refcnt == 0);
1367         ASSERT(ipnetif->if_sharecnt == 0);
1368 
1369         /* Remove IPv4/v6 address lists from the ipnetif */
1370         ipnet_purge_addrlist(&ipnetif->if_ip4addr_list);
1371         list_destroy(&ipnetif->if_ip4addr_list);
1372         ipnet_purge_addrlist(&ipnetif->if_ip6addr_list);
1373         list_destroy(&ipnetif->if_ip6addr_list);
1374         mutex_destroy(&ipnetif->if_addr_lock);
1375         mutex_destroy(&ipnetif->if_reflock);
1376         if (ipnetif->if_dev != 0)
1377                 id_free(ipnet_minor_space, getminor(ipnetif->if_dev));
1378         kmem_free(ipnetif, sizeof (*ipnetif));
1379 }
1380 
1381 /*
1382  * Create an ipnetif_addr_t with the given logical interface id (lif)
1383  * and add it to the supplied ipnetif.  The lif is the netinfo
1384  * representation of logical interface id, and we use this id to match
1385  * incoming netinfo events against our lists of addresses.
1386  */
1387 static void
1388 ipnet_add_ifaddr(uint64_t lif, ipnetif_t *ipnetif, net_handle_t nd)
1389 {
1390         ipnetif_addr_t          *ifaddr;
1391         zoneid_t                zoneid;
1392         struct sockaddr_in      bcast;
1393         struct sockaddr_storage addr;
1394         net_ifaddr_t            type = NA_ADDRESS;
1395         uint64_t                phyif = ipnetif->if_index;
1396 
1397         if (net_getlifaddr(nd, phyif, lif, 1, &type, &addr) != 0 ||
1398             net_getlifzone(nd, phyif, lif, &zoneid) != 0)
1399                 return;
1400 
1401         if ((ifaddr = kmem_alloc(sizeof (*ifaddr), KM_NOSLEEP)) == NULL)
1402                 return;
1403         ifaddr->ifa_zone = zoneid;
1404         ifaddr->ifa_id = lif;
1405         ifaddr->ifa_shared = NULL;
1406 
1407         switch (addr.ss_family) {
1408         case AF_INET:
1409                 ifaddr->ifa_ip4addr =
1410                     ((struct sockaddr_in *)&addr)->sin_addr.s_addr;
1411                 /*
1412                  * Try and get the broadcast address.  Note that it's okay for
1413                  * an interface to not have a broadcast address, so we don't
1414                  * fail the entire operation if net_getlifaddr() fails here.
1415                  */
1416                 type = NA_BROADCAST;
1417                 if (net_getlifaddr(nd, phyif, lif, 1, &type, &bcast) == 0)
1418                         ifaddr->ifa_brdaddr = bcast.sin_addr.s_addr;
1419                 break;
1420         case AF_INET6:
1421                 ifaddr->ifa_ip6addr = ((struct sockaddr_in6 *)&addr)->sin6_addr;
1422                 break;
1423         }
1424 
1425         /*
1426          * The zoneid stored in ipnetif_t needs to correspond to the actual
1427          * zone the address is being used in. This facilitates finding the
1428          * correct netstack_t pointer, amongst other things, later.
1429          */
1430         if (zoneid == ALL_ZONES)
1431                 zoneid = GLOBAL_ZONEID;
1432 
1433         mutex_enter(&ipnetif->if_addr_lock);
1434         if (zoneid != ipnetif->if_zoneid) {
1435                 ipnetif_t *ifp2;
1436 
1437                 ifp2 = ipnetif_clone_create(ipnetif, zoneid);
1438                 ifaddr->ifa_shared = ifp2;
1439         }
1440         list_insert_tail(addr.ss_family == AF_INET ?
1441             &ipnetif->if_ip4addr_list : &ipnetif->if_ip6addr_list, ifaddr);
1442         mutex_exit(&ipnetif->if_addr_lock);
1443 }
1444 
1445 static void
1446 ipnet_delete_ifaddr(ipnetif_addr_t *ifaddr, ipnetif_t *ipnetif, boolean_t isv6)
1447 {
1448         mutex_enter(&ipnetif->if_addr_lock);
1449         if (ifaddr->ifa_shared != NULL)
1450                 ipnetif_clone_release(ifaddr->ifa_shared);
1451 
1452         list_remove(isv6 ?
1453             &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list, ifaddr);
1454         mutex_exit(&ipnetif->if_addr_lock);
1455         kmem_free(ifaddr, sizeof (*ifaddr));
1456 }
1457 
1458 static void
1459 ipnet_plumb_ev(ipnet_nicevent_t *ipne, ipnet_stack_t *ips, boolean_t isv6)
1460 {
1461         ipnetif_t       *ipnetif;
1462         boolean_t       refrele_needed = B_TRUE;
1463         uint64_t        ifflags;
1464         uint64_t        ifindex;
1465         char            *ifname;
1466 
1467         ifflags = 0;
1468         ifname = ipne->ipne_ifname;
1469         ifindex = ipne->ipne_ifindex;
1470 
1471         (void) net_getlifflags(ipne->ipne_protocol, ifindex, 0, &ifflags);
1472 
1473         if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL) {
1474                 ipnetif = ipnetif_create(ifname, ifindex, ips, ifflags);
1475                 refrele_needed = B_FALSE;
1476         }
1477         if (ipnetif != NULL) {
1478                 ipnetif->if_flags |=
1479                     isv6 ? IPNETIF_IPV6PLUMBED : IPNETIF_IPV4PLUMBED;
1480         }
1481 
1482         if (ipnetif->if_multicnt != 0) {
1483                 if (ip_join_allmulti(ifindex, isv6,
1484                     ips->ips_netstack->netstack_ip) == 0) {
1485                         ipnetif->if_flags |=
1486                             isv6 ? IPNETIF_IPV6ALLMULTI : IPNETIF_IPV4ALLMULTI;
1487                 }
1488         }
1489 
1490         if (refrele_needed)
1491                 ipnetif_refrele(ipnetif);
1492 }
1493 
1494 static void
1495 ipnet_unplumb_ev(uint64_t ifindex, ipnet_stack_t *ips, boolean_t isv6)
1496 {
1497         ipnetif_t       *ipnetif;
1498 
1499         if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1500                 return;
1501 
1502         mutex_enter(&ipnetif->if_addr_lock);
1503         ipnet_purge_addrlist(isv6 ?
1504             &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list);
1505         mutex_exit(&ipnetif->if_addr_lock);
1506 
1507         /*
1508          * Note that we have one ipnetif for both IPv4 and IPv6, but we receive
1509          * separate NE_UNPLUMB events for IPv4 and IPv6.  We remove the ipnetif
1510          * if both IPv4 and IPv6 interfaces have been unplumbed.
1511          */
1512         ipnetif->if_flags &= isv6 ? ~IPNETIF_IPV6PLUMBED : ~IPNETIF_IPV4PLUMBED;
1513         if (!(ipnetif->if_flags & (IPNETIF_IPV4PLUMBED | IPNETIF_IPV6PLUMBED)))
1514                 ipnetif_remove(ipnetif, ips);
1515         ipnetif_refrele(ipnetif);
1516 }
1517 
1518 static void
1519 ipnet_lifup_ev(uint64_t ifindex, uint64_t lifindex, net_handle_t nd,
1520     ipnet_stack_t *ips, boolean_t isv6)
1521 {
1522         ipnetif_t       *ipnetif;
1523         ipnetif_addr_t  *ifaddr;
1524 
1525         if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1526                 return;
1527         if ((ifaddr = ipnet_match_lif(ipnetif, lifindex, isv6)) != NULL) {
1528                 /*
1529                  * We must have missed a NE_LIF_DOWN event.  Delete this
1530                  * ifaddr and re-create it.
1531                  */
1532                 ipnet_delete_ifaddr(ifaddr, ipnetif, isv6);
1533         }
1534 
1535         ipnet_add_ifaddr(lifindex, ipnetif, nd);
1536         ipnetif_refrele(ipnetif);
1537 }
1538 
1539 static void
1540 ipnet_lifdown_ev(uint64_t ifindex, uint64_t lifindex, ipnet_stack_t *ips,
1541     boolean_t isv6)
1542 {
1543         ipnetif_t       *ipnetif;
1544         ipnetif_addr_t  *ifaddr;
1545 
1546         if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1547                 return;
1548         if ((ifaddr = ipnet_match_lif(ipnetif, lifindex, isv6)) != NULL)
1549                 ipnet_delete_ifaddr(ifaddr, ipnetif, isv6);
1550         ipnetif_refrele(ipnetif);
1551         /*
1552          * Make sure that open streams on this ipnetif are still allowed to
1553          * have it open.
1554          */
1555         ipnetif_zonecheck(ipnetif, ips);
1556 }
1557 
1558 /*
1559  * This callback from the NIC event framework dispatches a taskq as the event
1560  * handlers may block.
1561  */
1562 /* ARGSUSED */
1563 static int
1564 ipnet_nicevent_cb(hook_event_token_t token, hook_data_t info, void *arg)
1565 {
1566         ipnet_stack_t           *ips = arg;
1567         hook_nic_event_t        *hn = (hook_nic_event_t *)info;
1568         ipnet_nicevent_t        *ipne;
1569 
1570         if ((ipne = kmem_alloc(sizeof (ipnet_nicevent_t), KM_NOSLEEP)) == NULL)
1571                 return (0);
1572         ipne->ipne_event = hn->hne_event;
1573         ipne->ipne_protocol = hn->hne_protocol;
1574         ipne->ipne_stackid = ips->ips_netstack->netstack_stackid;
1575         ipne->ipne_ifindex = hn->hne_nic;
1576         ipne->ipne_lifindex = hn->hne_lif;
1577         if (hn->hne_datalen != 0) {
1578                 (void) strlcpy(ipne->ipne_ifname, hn->hne_data,
1579                     sizeof (ipne->ipne_ifname));
1580         }
1581         (void) ddi_taskq_dispatch(ipnet_nicevent_taskq, ipnet_nicevent_task,
1582             ipne, DDI_NOSLEEP);
1583         return (0);
1584 }
1585 
1586 static void
1587 ipnet_nicevent_task(void *arg)
1588 {
1589         ipnet_nicevent_t        *ipne = arg;
1590         netstack_t              *ns;
1591         ipnet_stack_t           *ips;
1592         boolean_t               isv6;
1593 
1594         if ((ns = netstack_find_by_stackid(ipne->ipne_stackid)) == NULL)
1595                 goto done;
1596         ips = ns->netstack_ipnet;
1597         isv6 = (ipne->ipne_protocol == ips->ips_ndv6);
1598 
1599         mutex_enter(&ips->ips_event_lock);
1600         switch (ipne->ipne_event) {
1601         case NE_PLUMB:
1602                 ipnet_plumb_ev(ipne, ips, isv6);
1603                 break;
1604         case NE_UNPLUMB:
1605                 ipnet_unplumb_ev(ipne->ipne_ifindex, ips, isv6);
1606                 break;
1607         case NE_LIF_UP:
1608                 ipnet_lifup_ev(ipne->ipne_ifindex, ipne->ipne_lifindex,
1609                     ipne->ipne_protocol, ips, isv6);
1610                 break;
1611         case NE_LIF_DOWN:
1612                 ipnet_lifdown_ev(ipne->ipne_ifindex, ipne->ipne_lifindex, ips,
1613                     isv6);
1614                 break;
1615         default:
1616                 break;
1617         }
1618         mutex_exit(&ips->ips_event_lock);
1619 done:
1620         if (ns != NULL)
1621                 netstack_rele(ns);
1622         kmem_free(ipne, sizeof (ipnet_nicevent_t));
1623 }
1624 
1625 dev_t
1626 ipnet_if_getdev(char *name, zoneid_t zoneid)
1627 {
1628         netstack_t      *ns;
1629         ipnet_stack_t   *ips;
1630         ipnetif_t       *ipnetif;
1631         dev_t           dev = (dev_t)-1;
1632 
1633         if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
1634                 return (dev);
1635         if ((ns = netstack_find_by_zoneid(zoneid)) == NULL)
1636                 return (dev);
1637 
1638         ips = ns->netstack_ipnet;
1639         mutex_enter(&ips->ips_avl_lock);
1640         if ((ipnetif = avl_find(&ips->ips_avl_by_name, name, NULL)) != NULL) {
1641                 if (ipnetif_in_zone(ipnetif, zoneid, ips))
1642                         dev = ipnetif->if_dev;
1643         }
1644         mutex_exit(&ips->ips_avl_lock);
1645         netstack_rele(ns);
1646 
1647         return (dev);
1648 }
1649 
1650 static ipnetif_t *
1651 ipnetif_getby_index(uint64_t id, ipnet_stack_t *ips)
1652 {
1653         ipnetif_t       *ipnetif;
1654 
1655         mutex_enter(&ips->ips_avl_lock);
1656         if ((ipnetif = avl_find(&ips->ips_avl_by_index, &id, NULL)) != NULL)
1657                 ipnetif_refhold(ipnetif);
1658         mutex_exit(&ips->ips_avl_lock);
1659         return (ipnetif);
1660 }
1661 
1662 static ipnetif_t *
1663 ipnetif_getby_dev(dev_t dev, ipnet_stack_t *ips)
1664 {
1665         ipnetif_t       *ipnetif;
1666         avl_tree_t      *tree;
1667 
1668         mutex_enter(&ips->ips_avl_lock);
1669         tree = &ips->ips_avl_by_index;
1670         for (ipnetif = avl_first(tree); ipnetif != NULL;
1671             ipnetif = avl_walk(tree, ipnetif, AVL_AFTER)) {
1672                 if (ipnetif->if_dev == dev) {
1673                         ipnetif_refhold(ipnetif);
1674                         break;
1675                 }
1676         }
1677         mutex_exit(&ips->ips_avl_lock);
1678         return (ipnetif);
1679 }
1680 
1681 static ipnetif_addr_t *
1682 ipnet_match_lif(ipnetif_t *ipnetif, lif_if_t lid, boolean_t isv6)
1683 {
1684         ipnetif_addr_t  *ifaddr;
1685         list_t  *list;
1686 
1687         mutex_enter(&ipnetif->if_addr_lock);
1688         list = isv6 ? &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list;
1689         for (ifaddr = list_head(list); ifaddr != NULL;
1690             ifaddr = list_next(list, ifaddr)) {
1691                 if (lid == ifaddr->ifa_id)
1692                         break;
1693         }
1694         mutex_exit(&ipnetif->if_addr_lock);
1695         return (ifaddr);
1696 }
1697 
1698 /* ARGSUSED */
1699 static void *
1700 ipnet_stack_init(netstackid_t stackid, netstack_t *ns)
1701 {
1702         ipnet_stack_t   *ips;
1703 
1704         ips = kmem_zalloc(sizeof (*ips), KM_SLEEP);
1705         ips->ips_netstack = ns;
1706         mutex_init(&ips->ips_avl_lock, NULL, MUTEX_DEFAULT, 0);
1707         avl_create(&ips->ips_avl_by_index, ipnetif_compare_index,
1708             sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_index));
1709         avl_create(&ips->ips_avl_by_name, ipnetif_compare_name,
1710             sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_name));
1711         avl_create(&ips->ips_avl_by_shared, ipnetif_compare_name_zone,
1712             sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_shared));
1713         mutex_init(&ips->ips_walkers_lock, NULL, MUTEX_DEFAULT, NULL);
1714         cv_init(&ips->ips_walkers_cv, NULL, CV_DRIVER, NULL);
1715         list_create(&ips->ips_str_list, sizeof (ipnet_t),
1716             offsetof(ipnet_t, ipnet_next));
1717         ipnet_register_netihook(ips);
1718         return (ips);
1719 }
1720 
1721 /* ARGSUSED */
1722 static void
1723 ipnet_stack_fini(netstackid_t stackid, void *arg)
1724 {
1725         ipnet_stack_t   *ips = arg;
1726         ipnetif_t       *ipnetif, *nipnetif;
1727 
1728         if (ips->ips_kstatp != NULL) {
1729                 zoneid_t zoneid;
1730 
1731                 zoneid = netstackid_to_zoneid(stackid);
1732                 net_kstat_delete(net_zoneidtonetid(zoneid), ips->ips_kstatp);
1733         }
1734         if (ips->ips_ndv4 != NULL) {
1735                 VERIFY(net_hook_unregister(ips->ips_ndv4, NH_NIC_EVENTS,
1736                     ips->ips_nicevents) == 0);
1737                 VERIFY(net_protocol_release(ips->ips_ndv4) == 0);
1738         }
1739         if (ips->ips_ndv6 != NULL) {
1740                 VERIFY(net_hook_unregister(ips->ips_ndv6, NH_NIC_EVENTS,
1741                     ips->ips_nicevents) == 0);
1742                 VERIFY(net_protocol_release(ips->ips_ndv6) == 0);
1743         }
1744         hook_free(ips->ips_nicevents);
1745 
1746         for (ipnetif = avl_first(&ips->ips_avl_by_index); ipnetif != NULL;
1747             ipnetif = nipnetif) {
1748                 nipnetif = AVL_NEXT(&ips->ips_avl_by_index, ipnetif);
1749                 ipnetif_remove(ipnetif, ips);
1750         }
1751         avl_destroy(&ips->ips_avl_by_shared);
1752         avl_destroy(&ips->ips_avl_by_index);
1753         avl_destroy(&ips->ips_avl_by_name);
1754         mutex_destroy(&ips->ips_avl_lock);
1755         mutex_destroy(&ips->ips_walkers_lock);
1756         cv_destroy(&ips->ips_walkers_cv);
1757         list_destroy(&ips->ips_str_list);
1758         kmem_free(ips, sizeof (*ips));
1759 }
1760 
1761 /* Do any of the addresses in addrlist belong the supplied zoneid? */
1762 static boolean_t
1763 ipnet_addrs_in_zone(list_t *addrlist, zoneid_t zoneid)
1764 {
1765         ipnetif_addr_t  *ifa;
1766 
1767         for (ifa = list_head(addrlist); ifa != NULL;
1768             ifa = list_next(addrlist, ifa)) {
1769                 if (ifa->ifa_zone == zoneid)
1770                         return (B_TRUE);
1771         }
1772         return (B_FALSE);
1773 }
1774 
1775 /* Should the supplied ipnetif be visible from the supplied zoneid? */
1776 static boolean_t
1777 ipnetif_in_zone(ipnetif_t *ipnetif, zoneid_t zoneid, ipnet_stack_t *ips)
1778 {
1779         int     ret;
1780 
1781         /*
1782          * The global zone has visibility into all interfaces in the global
1783          * stack, and exclusive stack zones have visibility into all
1784          * interfaces in their stack.
1785          */
1786         if (zoneid == GLOBAL_ZONEID ||
1787             ips->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
1788                 return (B_TRUE);
1789 
1790         /*
1791          * Shared-stack zones only have visibility for interfaces that have
1792          * addresses in their zone.
1793          */
1794         mutex_enter(&ipnetif->if_addr_lock);
1795         ret = ipnet_addrs_in_zone(&ipnetif->if_ip4addr_list, zoneid) ||
1796             ipnet_addrs_in_zone(&ipnetif->if_ip6addr_list, zoneid);
1797         mutex_exit(&ipnetif->if_addr_lock);
1798         return (ret);
1799 }
1800 
1801 /*
1802  * Verify that any ipnet_t that has a reference to the supplied ipnetif should
1803  * still be allowed to have it open.  A given ipnet_t may no longer be allowed
1804  * to have an ipnetif open if there are no longer any addresses that belong to
1805  * the ipnetif in the ipnet_t's non-global shared-stack zoneid.  If that's the
1806  * case, send the ipnet_t an M_HANGUP.
1807  */
1808 static void
1809 ipnetif_zonecheck(ipnetif_t *ipnetif, ipnet_stack_t *ips)
1810 {
1811         list_t  *strlist = &ips->ips_str_list;
1812         ipnet_t *ipnet;
1813 
1814         ipnet_walkers_inc(ips);
1815         for (ipnet = list_head(strlist); ipnet != NULL;
1816             ipnet = list_next(strlist, ipnet)) {
1817                 if (ipnet->ipnet_if != ipnetif)
1818                         continue;
1819                 if (!ipnetif_in_zone(ipnetif, ipnet->ipnet_zoneid, ips))
1820                         (void) putnextctl(ipnet->ipnet_rq, M_HANGUP);
1821         }
1822         ipnet_walkers_dec(ips);
1823 }
1824 
1825 void
1826 ipnet_walk_if(ipnet_walkfunc_t *cb, void *arg, zoneid_t zoneid)
1827 {
1828         ipnetif_t               *ipnetif;
1829         list_t                  cbdata;
1830         ipnetif_cbdata_t        *cbnode;
1831         netstack_t              *ns;
1832         ipnet_stack_t           *ips;
1833 
1834         /*
1835          * On labeled systems, non-global zones shouldn't see anything
1836          * in /dev/ipnet.
1837          */
1838         if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
1839                 return;
1840 
1841         if ((ns = netstack_find_by_zoneid(zoneid)) == NULL)
1842                 return;
1843 
1844         ips = ns->netstack_ipnet;
1845         list_create(&cbdata, sizeof (ipnetif_cbdata_t),
1846             offsetof(ipnetif_cbdata_t, ic_next));
1847 
1848         mutex_enter(&ips->ips_avl_lock);
1849         for (ipnetif = avl_first(&ips->ips_avl_by_index); ipnetif != NULL;
1850             ipnetif = avl_walk(&ips->ips_avl_by_index, ipnetif, AVL_AFTER)) {
1851                 if (!ipnetif_in_zone(ipnetif, zoneid, ips))
1852                         continue;
1853                 cbnode = kmem_zalloc(sizeof (ipnetif_cbdata_t), KM_SLEEP);
1854                 (void) strlcpy(cbnode->ic_ifname, ipnetif->if_name, LIFNAMSIZ);
1855                 cbnode->ic_dev = ipnetif->if_dev;
1856                 list_insert_head(&cbdata, cbnode);
1857         }
1858         mutex_exit(&ips->ips_avl_lock);
1859 
1860         while ((cbnode = list_head(&cbdata)) != NULL) {
1861                 cb(cbnode->ic_ifname, arg, cbnode->ic_dev);
1862                 list_remove(&cbdata, cbnode);
1863                 kmem_free(cbnode, sizeof (ipnetif_cbdata_t));
1864         }
1865         list_destroy(&cbdata);
1866         netstack_rele(ns);
1867 }
1868 
1869 static int
1870 ipnetif_compare_index(const void *index_ptr, const void *ipnetifp)
1871 {
1872         int64_t index1 = *((int64_t *)index_ptr);
1873         int64_t index2 = (int64_t)((ipnetif_t *)ipnetifp)->if_index;
1874 
1875         return (SIGNOF(index2 - index1));
1876 }
1877 
1878 static int
1879 ipnetif_compare_name(const void *name_ptr, const void *ipnetifp)
1880 {
1881         int     res;
1882 
1883         res = strcmp(((ipnetif_t *)ipnetifp)->if_name, name_ptr);
1884         return (SIGNOF(res));
1885 }
1886 
1887 static int
1888 ipnetif_compare_name_zone(const void *key_ptr, const void *ipnetifp)
1889 {
1890         const uintptr_t *ptr = key_ptr;
1891         const ipnetif_t *ifp;
1892         int             res;
1893 
1894         ifp = ipnetifp;
1895         res = ifp->if_zoneid - ptr[0];
1896         if (res != 0)
1897                 return (SIGNOF(res));
1898         res = strcmp(ifp->if_name, (char *)ptr[1]);
1899         return (SIGNOF(res));
1900 }
1901 
1902 static void
1903 ipnetif_refhold(ipnetif_t *ipnetif)
1904 {
1905         mutex_enter(&ipnetif->if_reflock);
1906         ipnetif->if_refcnt++;
1907         mutex_exit(&ipnetif->if_reflock);
1908 }
1909 
1910 static void
1911 ipnetif_refrele(ipnetif_t *ipnetif)
1912 {
1913         mutex_enter(&ipnetif->if_reflock);
1914         ASSERT(ipnetif->if_refcnt > 0);
1915         if (--ipnetif->if_refcnt == 0)
1916                 ipnetif_free(ipnetif);
1917         else
1918                 mutex_exit(&ipnetif->if_reflock);
1919 }
1920 
1921 static void
1922 ipnet_walkers_inc(ipnet_stack_t *ips)
1923 {
1924         mutex_enter(&ips->ips_walkers_lock);
1925         ips->ips_walkers_cnt++;
1926         mutex_exit(&ips->ips_walkers_lock);
1927 }
1928 
1929 static void
1930 ipnet_walkers_dec(ipnet_stack_t *ips)
1931 {
1932         mutex_enter(&ips->ips_walkers_lock);
1933         ASSERT(ips->ips_walkers_cnt != 0);
1934         if (--ips->ips_walkers_cnt == 0)
1935                 cv_broadcast(&ips->ips_walkers_cv);
1936         mutex_exit(&ips->ips_walkers_lock);
1937 }
1938 
1939 /*ARGSUSED*/
1940 static int
1941 ipobs_bounce_func(hook_event_token_t token, hook_data_t info, void *arg)
1942 {
1943         hook_pkt_observe_t      *hdr;
1944         pfv_t                   func = (pfv_t)arg;
1945         mblk_t                  *mp;
1946 
1947         hdr = (hook_pkt_observe_t *)info;
1948         /*
1949          * Code in ip_input() expects that it is the only one accessing the
1950          * packet.
1951          */
1952         mp = copymsg(hdr->hpo_pkt);
1953         if (mp == NULL)  {
1954                 netstack_t *ns = hdr->hpo_ctx;
1955                 ipnet_stack_t *ips = ns->netstack_ipnet;
1956 
1957                 IPSK_BUMP(ips, ik_dispatchDupDrop);
1958                 return (0);
1959         }
1960 
1961         hdr = (hook_pkt_observe_t *)mp->b_rptr;
1962         hdr->hpo_pkt = mp;
1963 
1964         func(mp);
1965 
1966         return (0);
1967 }
1968 
1969 hook_t *
1970 ipobs_register_hook(netstack_t *ns, pfv_t func)
1971 {
1972         ip_stack_t      *ipst = ns->netstack_ip;
1973         char            name[32];
1974         hook_t          *hook;
1975 
1976         HOOK_INIT(hook, ipobs_bounce_func, "", (void *)func);
1977         VERIFY(hook != NULL);
1978 
1979         /*
1980          * To register multiple hooks with the same callback function,
1981          * a unique name is needed.
1982          */
1983         (void) snprintf(name, sizeof (name), "ipobserve_%p", (void *)hook);
1984         hook->h_name = strdup(name);
1985 
1986         (void) net_hook_register(ipst->ips_ip4_observe_pr, NH_OBSERVE, hook);
1987         (void) net_hook_register(ipst->ips_ip6_observe_pr, NH_OBSERVE, hook);
1988 
1989         return (hook);
1990 }
1991 
1992 void
1993 ipobs_unregister_hook(netstack_t *ns, hook_t *hook)
1994 {
1995         ip_stack_t      *ipst = ns->netstack_ip;
1996 
1997         (void) net_hook_unregister(ipst->ips_ip4_observe_pr, NH_OBSERVE, hook);
1998 
1999         (void) net_hook_unregister(ipst->ips_ip6_observe_pr, NH_OBSERVE, hook);
2000 
2001         strfree(hook->h_name);
2002 
2003         hook_free(hook);
2004 }
2005 
2006 /* ******************************************************************** */
2007 /* BPF Functions below                                                  */
2008 /* ******************************************************************** */
2009 
2010 /*
2011  * Convenience function to make mapping a zoneid to an ipnet_stack_t easy.
2012  */
2013 ipnet_stack_t *
2014 ipnet_find_by_zoneid(zoneid_t zoneid)
2015 {
2016         netstack_t      *ns;
2017 
2018         VERIFY((ns = netstack_find_by_zoneid(zoneid)) != NULL);
2019         return (ns->netstack_ipnet);
2020 }
2021 
2022 /*
2023  * Functions, such as the above ipnet_find_by_zoneid(), will return a
2024  * pointer to ipnet_stack_t by calling a netstack lookup function.
2025  * The netstack_find_*() functions return a pointer after doing a "hold"
2026  * on the data structure and thereby require a "release" when the caller
2027  * is finished with it. We need to mirror that API here and thus a caller
2028  * of ipnet_find_by_zoneid() is required to call ipnet_rele().
2029  */
2030 void
2031 ipnet_rele(ipnet_stack_t *ips)
2032 {
2033         netstack_rele(ips->ips_netstack);
2034 }
2035 
2036 /*
2037  */
2038 void
2039 ipnet_set_itap(bpf_itap_fn_t tapfunc)
2040 {
2041         ipnet_itap = tapfunc;
2042 }
2043 
2044 /*
2045  * The list of interfaces available via ipnet is private for each zone,
2046  * so the AVL tree of each zone must be searched for a given name, even
2047  * if all names are unique.
2048  */
2049 int
2050 ipnet_open_byname(const char *name, ipnetif_t **ptr, zoneid_t zoneid)
2051 {
2052         ipnet_stack_t   *ips;
2053         ipnetif_t       *ipnetif;
2054 
2055         ASSERT(ptr != NULL);
2056         VERIFY((ips = ipnet_find_by_zoneid(zoneid)) != NULL);
2057 
2058         mutex_enter(&ips->ips_avl_lock);
2059 
2060         /*
2061          * Shared instance zone?
2062          */
2063         if (netstackid_to_zoneid(zoneid_to_netstackid(zoneid)) != zoneid) {
2064                 uintptr_t key[2] = { zoneid, (uintptr_t)name };
2065 
2066                 ipnetif = avl_find(&ips->ips_avl_by_shared, (void *)key, NULL);
2067         } else {
2068                 ipnetif = avl_find(&ips->ips_avl_by_name, (void *)name, NULL);
2069         }
2070         if (ipnetif != NULL)
2071                 ipnetif_refhold(ipnetif);
2072         mutex_exit(&ips->ips_avl_lock);
2073 
2074         *ptr = ipnetif;
2075         ipnet_rele(ips);
2076 
2077         if (ipnetif == NULL)
2078                 return (ESRCH);
2079         return (0);
2080 }
2081 
2082 void
2083 ipnet_close_byhandle(ipnetif_t *ifp)
2084 {
2085         ASSERT(ifp != NULL);
2086         ipnetif_refrele(ifp);
2087 }
2088 
2089 const char *
2090 ipnet_name(ipnetif_t *ifp)
2091 {
2092         ASSERT(ifp != NULL);
2093         return (ifp->if_name);
2094 }
2095 
2096 /*
2097  * To find the linkid for a given name, it is necessary to know which zone
2098  * the interface name belongs to and to search the avl tree for that zone
2099  * as there is no master list of all interfaces and which zone they belong
2100  * to. It is assumed that the caller of this function is somehow already
2101  * working with the ipnet interfaces and hence the ips_event_lock is held.
2102  * When BPF calls into this function, it is doing so because of an event
2103  * in ipnet, and thus ipnet holds the ips_event_lock. Thus the datalink id
2104  * value returned has meaning without the need for grabbing a hold on the
2105  * owning structure.
2106  */
2107 int
2108 ipnet_get_linkid_byname(const char *name, uint_t *idp, zoneid_t zoneid)
2109 {
2110         ipnet_stack_t   *ips;
2111         ipnetif_t       *ifp;
2112 
2113         VERIFY((ips = ipnet_find_by_zoneid(zoneid)) != NULL);
2114         ASSERT(mutex_owned(&ips->ips_event_lock));
2115 
2116         mutex_enter(&ips->ips_avl_lock);
2117         ifp = avl_find(&ips->ips_avl_by_name, (void *)name, NULL);
2118         if (ifp != NULL)
2119                 *idp = (uint_t)ifp->if_index;
2120 
2121         /*
2122          * Shared instance zone?
2123          */
2124         if (netstackid_to_zoneid(zoneid_to_netstackid(zoneid)) != zoneid) {
2125                 uintptr_t key[2] = { zoneid, (uintptr_t)name };
2126 
2127                 ifp = avl_find(&ips->ips_avl_by_shared, (void *)key, NULL);
2128                 if (ifp != NULL)
2129                         *idp = (uint_t)ifp->if_index;
2130         }
2131 
2132         mutex_exit(&ips->ips_avl_lock);
2133         ipnet_rele(ips);
2134 
2135         if (ifp == NULL)
2136                 return (ESRCH);
2137         return (0);
2138 }
2139 
2140 /*
2141  * Strictly speaking, there is no such thing as a "client" in ipnet, like
2142  * there is in mac. BPF only needs to have this because it is required as
2143  * part of interfacing correctly with mac. The reuse of the original
2144  * ipnetif_t as a client poses no danger, so long as it is done with its
2145  * own ref-count'd hold that is given up on close.
2146  */
2147 int
2148 ipnet_client_open(ipnetif_t *ptr, ipnetif_t **result)
2149 {
2150         ASSERT(ptr != NULL);
2151         ASSERT(result != NULL);
2152         ipnetif_refhold(ptr);
2153         *result = ptr;
2154 
2155         return (0);
2156 }
2157 
2158 void
2159 ipnet_client_close(ipnetif_t *ptr)
2160 {
2161         ASSERT(ptr != NULL);
2162         ipnetif_refrele(ptr);
2163 }
2164 
2165 /*
2166  * This is called from BPF when it needs to start receiving packets
2167  * from ipnet.
2168  *
2169  * The use of the ipnet_t structure here is somewhat lightweight when
2170  * compared to how it is used elsewhere but it already has all of the
2171  * right fields in it, so reuse here doesn't seem out of order. Its
2172  * primary purpose here is to provide the means to store pointers for
2173  * use when ipnet_promisc_remove() needs to be called.
2174  *
2175  * This should never be called for the IPNET_MINOR_LO device as it is
2176  * never created via ipnetif_create.
2177  */
2178 /*ARGSUSED*/
2179 int
2180 ipnet_promisc_add(void *handle, uint_t how, void *data, uintptr_t *mhandle,
2181     int flags)
2182 {
2183         ip_stack_t      *ipst;
2184         netstack_t      *ns;
2185         ipnetif_t       *ifp;
2186         ipnet_t         *ipnet;
2187         char            name[32];
2188         int             error;
2189 
2190         ifp = (ipnetif_t *)handle;
2191 
2192         if (how != DL_PROMISC_PHYS && how != DL_PROMISC_MULTI)
2193                 return (EINVAL);
2194 
2195         ns = netstack_find_by_zoneid(ifp->if_zoneid);
2196 
2197         if ((error = ipnet_join_allmulti(ifp, ns->netstack_ipnet)) != 0) {
2198                 netstack_rele(ns);
2199                 return (error);
2200         }
2201 
2202         ipnet = kmem_zalloc(sizeof (*ipnet), KM_SLEEP);
2203         ipnet->ipnet_if = ifp;
2204         ipnet->ipnet_ns = ns;
2205         ipnet->ipnet_flags = flags;
2206 
2207         if ((ifp->if_flags & IPNETIF_LOOPBACK) != 0) {
2208                 ipnet->ipnet_acceptfn = ipnet_loaccept;
2209         } else {
2210                 ipnet->ipnet_acceptfn = ipnet_accept;
2211         }
2212 
2213         /*
2214          * To register multiple hooks with the same callback function,
2215          * a unique name is needed.
2216          */
2217         HOOK_INIT(ipnet->ipnet_hook, ipnet_bpf_bounce, "", ipnet);
2218         (void) snprintf(name, sizeof (name), "ipnet_promisc_%p",
2219             (void *)ipnet->ipnet_hook);
2220         ipnet->ipnet_hook->h_name = strdup(name);
2221         ipnet->ipnet_data = data;
2222         ipnet->ipnet_zoneid = ifp->if_zoneid;
2223 
2224         ipst = ns->netstack_ip;
2225 
2226         error = net_hook_register(ipst->ips_ip4_observe_pr, NH_OBSERVE,
2227             ipnet->ipnet_hook);
2228         if (error != 0)
2229                 goto regfail;
2230 
2231         error = net_hook_register(ipst->ips_ip6_observe_pr, NH_OBSERVE,
2232             ipnet->ipnet_hook);
2233         if (error != 0) {
2234                 (void) net_hook_unregister(ipst->ips_ip4_observe_pr,
2235                     NH_OBSERVE, ipnet->ipnet_hook);
2236                 goto regfail;
2237         }
2238 
2239         *mhandle = (uintptr_t)ipnet;
2240         netstack_rele(ns);
2241 
2242         return (0);
2243 
2244 regfail:
2245         cmn_err(CE_WARN, "net_hook_register failed: %d", error);
2246         strfree(ipnet->ipnet_hook->h_name);
2247         hook_free(ipnet->ipnet_hook);
2248         netstack_rele(ns);
2249         return (error);
2250 }
2251 
2252 void
2253 ipnet_promisc_remove(void *data)
2254 {
2255         ip_stack_t      *ipst;
2256         ipnet_t         *ipnet;
2257         hook_t          *hook;
2258 
2259         ipnet = data;
2260         ipst = ipnet->ipnet_ns->netstack_ip;
2261         hook = ipnet->ipnet_hook;
2262 
2263         VERIFY(net_hook_unregister(ipst->ips_ip4_observe_pr, NH_OBSERVE,
2264             hook) == 0);
2265 
2266         VERIFY(net_hook_unregister(ipst->ips_ip6_observe_pr, NH_OBSERVE,
2267             hook) == 0);
2268 
2269         strfree(hook->h_name);
2270 
2271         hook_free(hook);
2272 
2273         kmem_free(ipnet, sizeof (*ipnet));
2274 }
2275 
2276 /*
2277  * arg here comes from the ipnet_t allocated in ipnet_promisc_add.
2278  * An important field from that structure is "ipnet_data" that
2279  * contains the "data" pointer passed into ipnet_promisc_add: it needs
2280  * to be passed back to bpf when we call into ipnet_itap.
2281  *
2282  * ipnet_itap is set by ipnet_set_bpfattach, which in turn is called
2283  * from BPF.
2284  */
2285 /*ARGSUSED*/
2286 static int
2287 ipnet_bpf_bounce(hook_event_token_t token, hook_data_t info, void *arg)
2288 {
2289         hook_pkt_observe_t      *hdr;
2290         ipnet_addrp_t           src;
2291         ipnet_addrp_t           dst;
2292         ipnet_stack_t           *ips;
2293         ipnet_t                 *ipnet;
2294         mblk_t                  *netmp;
2295         mblk_t                  *mp;
2296 
2297         hdr = (hook_pkt_observe_t *)info;
2298         mp = hdr->hpo_pkt;
2299         ipnet = (ipnet_t *)arg;
2300         ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;
2301 
2302         netmp = hdr->hpo_pkt->b_cont;
2303         src.iap_family = hdr->hpo_family;
2304         dst.iap_family = hdr->hpo_family;
2305 
2306         if (hdr->hpo_family == AF_INET) {
2307                 src.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_src;
2308                 dst.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_dst;
2309         } else {
2310                 src.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_src;
2311                 dst.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_dst;
2312         }
2313 
2314         if (!(*ipnet->ipnet_acceptfn)(ipnet, hdr, &src, &dst)) {
2315                 IPSK_BUMP(ips, ik_acceptFail);
2316                 return (0);
2317         }
2318         IPSK_BUMP(ips, ik_acceptOk);
2319 
2320         ipnet_itap(ipnet->ipnet_data, mp,
2321             hdr->hpo_htype == htons(IPOBS_HOOK_OUTBOUND),
2322             ntohl(hdr->hpo_pktlen) + MBLKL(mp));
2323 
2324         return (0);
2325 }
2326 
2327 /*
2328  * clone'd ipnetif_t's are created when a shared IP instance zone comes
2329  * to life and configures an IP address. The model that BPF uses is that
2330  * each interface must have a unique pointer and each interface must be
2331  * representative of what it can capture. They are limited to one DLT
2332  * per interface and one zone per interface. Thus every interface that
2333  * can be seen in a zone must be announced via an attach to bpf. For
2334  * shared instance zones, this means the ipnet driver needs to detect
2335  * when an address is added to an interface in a zone for the first
2336  * time (and also when the last address is removed.)
2337  */
2338 static ipnetif_t *
2339 ipnetif_clone_create(ipnetif_t *ifp, zoneid_t zoneid)
2340 {
2341         uintptr_t       key[2] = { zoneid, (uintptr_t)ifp->if_name };
2342         ipnet_stack_t   *ips = ifp->if_stackp;
2343         avl_index_t     where = 0;
2344         ipnetif_t       *newif;
2345 
2346         mutex_enter(&ips->ips_avl_lock);
2347         newif = avl_find(&ips->ips_avl_by_shared, (void *)key, &where);
2348         if (newif != NULL) {
2349                 ipnetif_refhold(newif);
2350                 newif->if_sharecnt++;
2351                 mutex_exit(&ips->ips_avl_lock);
2352                 return (newif);
2353         }
2354 
2355         newif = ipnet_alloc_if(ips);
2356         if (newif == NULL) {
2357                 mutex_exit(&ips->ips_avl_lock);
2358                 return (NULL);
2359         }
2360 
2361         newif->if_refcnt = 1;
2362         newif->if_sharecnt = 1;
2363         newif->if_zoneid = zoneid;
2364         (void) strlcpy(newif->if_name, ifp->if_name, LIFNAMSIZ);
2365         newif->if_flags = ifp->if_flags & IPNETIF_LOOPBACK;
2366         newif->if_index = ifp->if_index;
2367 
2368         avl_insert(&ips->ips_avl_by_shared, newif, where);
2369         mutex_exit(&ips->ips_avl_lock);
2370 
2371         return (newif);
2372 }
2373 
2374 static void
2375 ipnetif_clone_release(ipnetif_t *ipnetif)
2376 {
2377         boolean_t       dofree = B_FALSE;
2378         boolean_t       doremove = B_FALSE;
2379         ipnet_stack_t   *ips = ipnetif->if_stackp;
2380 
2381         mutex_enter(&ipnetif->if_reflock);
2382         ASSERT(ipnetif->if_refcnt > 0);
2383         if (--ipnetif->if_refcnt == 0)
2384                 dofree = B_TRUE;
2385         ASSERT(ipnetif->if_sharecnt > 0);
2386         if (--ipnetif->if_sharecnt == 0)
2387                 doremove = B_TRUE;
2388         mutex_exit(&ipnetif->if_reflock);
2389         if (doremove) {
2390                 mutex_enter(&ips->ips_avl_lock);
2391                 avl_remove(&ips->ips_avl_by_shared, ipnetif);
2392                 mutex_exit(&ips->ips_avl_lock);
2393         }
2394         if (dofree) {
2395                 ASSERT(ipnetif->if_sharecnt == 0);
2396                 ipnetif_free(ipnetif);
2397         }
2398 }