1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright 2016 Joyent, Inc.
  14  */
  15 
  16 /*
  17  * Overlay Devices
  18  *
  19  * Overlay devices provide a means for creating overlay networks, a means of
  20  * multiplexing multiple logical, isolated, and discrete layer two and layer
  21  * three networks on top of one physical network.
  22  *
  23  * In general, these overlay devices encapsulate the logic to answer two
  24  * different questions:
  25  *
  26  *   1) How should I transform a packet to put it on the wire?
  27  *   2) Where should I send a transformed packet?
  28  *
  29  * Each overlay device is presented to the user as a GLDv3 device. While the
  30  * link itself cannot have an IP interface created on top of it, it allows for
  31  * additional GLDv3 devices, such as a VNIC, to be created on top of it which
  32  * can be plumbed up with IP interfaces.
  33  *
  34  *
  35  * --------------------
  36  * General Architecture
  37  * --------------------
  38  *
  39  * The logical overlay device that a user sees in dladm(1M) is a combination of
  40  * two different components that work together. The first component is this
  41  * kernel module, which is responsible for answering question one -- how should
  42  * I transform a packet to put it on the wire.
  43  *
  44  * The second component is what we call the virtual ARP daemon, or varpd. It is
  45  * a userland component that is responsible for answering the second question --
  46  * Where should I send a transformed packet. Instances of the kernel overlay
  47  * GLDv3 device ask varpd the question of where should a packet go.
  48  *
  49  * The split was done for a few reasons. Importantly, we wanted to keep the act
  50  * of generating encapsulated packets in the kernel so as to ensure that the
  51  * general data path was fast and also kept simple. On the flip side, while the
  52  * question of where should something go may be simple, it may often be
  53  * complicated and need to interface with several different external or
  54  * distributed systems. In those cases, it's simpler to allow for the full
  55  * flexibility of userland to be brought to bear to solve that problem and in
  56  * general, the path isn't very common.
  57  *
  58  * The following is what makes up the logical overlay device that a user would
  59  * create with dladm(1M).
  60  *
  61  *       Kernel                                     Userland
  62  *   . . . . . . . . . . . . . . . . . . . . .   . . . . . . . . . . . . .
  63  *   . +--------+   +--------+  +--------+   .   .                       .
  64  *   . | VNIC 0 |   | VNIC 1 |  | VNIC 2 |   .   .                       .
  65  *   . +--------+   +--------+  +--------+   .   .                       .
  66  *   .     |            |           |        .   .                       .
  67  *   .     |            |           |        .   .                       .
  68  *   .     +------------+-----------+        .   .                       .
  69  *   .                  |              . . /dev/overlay                  .
  70  *   .           +--------------+      .     .   .       +------------+  .
  71  *   .           |              |      .     .   .       |            |  .
  72  *   .           |    Overlay   |======*=================|   Virtual  |  .
  73  *   .           | GLDv3 Device |========================| ARP Daemon |  .
  74  *   .           |              |            .   .       |            |  .
  75  *   .           +--------------+            .   .       +------------+  .
  76  *   .                  |                    .   .              |        .
  77  *   .                  |                    .   .              |        .
  78  *   .           +----------------+          .   .         +--------+    .
  79  *   .           |  Overlay       |          .   .         | varpd  |    .
  80  *   .           |  Encapsulation |          .   .         | Lookup |    .
  81  *   .           |  Plugin        |          .   .         | Plugin |    .
  82  *   .           +----------------+          .   .         +--------+    .
  83  *   . . . . . . . . . . . . . . . . . . . . .   . . . . . . . . . . . . .
  84  *
  85  *
  86  * This image shows the two different components and where they live.
  87  * Importantly, it also shows that both the kernel overlay device and the
  88  * userland varpd both support plugins. The plugins actually implement the
  89  * things that users care about and the APIs have been designed to try to
  90  * minimize the amount of things that a module writer needs to worry about.
  91  *
  92  * IDENTIFIERS
  93  *
  94  * Every overlay device is defined by a unique identifier which is the overlay
  95  * identifier. Its purpose is similar to that of a VLAN identifier, it's a
  96  * unique number that is used to differentiate between different entries on the
  97  * wire.
  98  *
  99  * ENCAPSULATION
 100  *
 101  * An overlay encapsulation plugin is a kernel miscellaneous module whose
 102  * purpose is to contain knowledge about how to transform packets to put them
 103  * onto the wire and to take them off. An example of an encapsulation plugin is
 104  * vxlan. It's also how support for things like nvgre or geneve would be brought
 105  * into the system.
 106  *
 107  * Each encapsulation plugin defines a series of operation vectors and
 108  * properties. For the full details on everything they should provide, please
 109  * read uts/common/sys/overlay_plugin.h. The encapsulation plugin is responsible
 110  * for telling the system what information is required to send a packet. For
 111  * example, vxlan is defined to send everything over a UDP packet and therefore
 112  * requires a port and an IP address, while nvgre on the other hand is its own
 113  * IP type and therefore just requires an IP address. In addition, it also
 114  * provides information about the kind of socket that should be created. This is
 115  * used by the kernel multiplexor, more of that in the Kernel Components
 116  * section.
 117  *
 118  * LOOKUPS
 119  *
 120  * The kernel communicates requests for lookups over the character device
 121  * /dev/overlay. varpd is responsible for listening for requests on that device
 122  * and answering them. The character device is specific to the target path and
 123  * varpd.
 124  *
 125  * Much as the kernel overlay module handles the bulk of the scaffolding but
 126  * leaves the important work to the encapsulation plugin, varpd provides a
 127  * similar role and leaves the full brunt of lookups to a userland dynamic
 128  * shared object which implements the logic of lookups.
 129  *
 130  * Each lookup plugin defines a series of operation vectors and properties. For
 131  * the full details on everything that they should provide, please read
 132  * lib/varpd/libvarpd/libvarpd_provider.h. Essentially, they are given a MAC
 133  * address and asked to give an address on the physical network that it should
 134  * be sent to. In addition, they handle questions related to how to handle
 135  * things like broadcast and multicast traffic, etc.
 136  *
 137  * ----------
 138  * Properties
 139  * ----------
 140  *
 141  * A device from a dladm perspective has a unique set of properties that are
 142  * combined from three different sources:
 143  *
 144  *   1) Generic properties that every overlay device has
 145  *   2) Properties that are specific to the encapsulation plugin
 146  *   3) Properties that are specific to the lookup plugin
 147  *
 148  * All of these are exposed in a single set of properties in dladm. Note that
 149  * these are not necessarily traditional link properties. However, if something
 150  * is both a traditional GLDv3 link property, say the MTU of a device, and a
 151  * specific property here, then the driver ensures that all existing GLDv3
 152  * specific means of manipulating it are used and wraps up its private property
 153  * interfaces to ensure that works.
 154  *
 155  * Properties in the second and third category are prefixed with the name of
 156  * their module. For example, the vxlan encapsulation module has a property
 157  * called the 'listen_ip'. This property would show up in dladm as
 158  * 'vxlan/listen_ip'. This allows different plugins to both use similar names
 159  * for similar properties and to also have independent name spaces so that
 160  * overlapping names do not conflict with anything else.
 161  *
 162  * While the kernel combines both sets one and two into a single coherent view,
 163  * it does not do anything with respect to the properties that are owned by the
 164  * lookup plugin -- those are owned wholly by varpd. Instead, libdladm is in
 165  * charge of bridging these two worlds into one magical experience for the user.
 166  * It carries the burden of knowing about both overlay specific and varpd
 167  * specific properties. Importantly, we want to maintain this distinction. We
 168  * don't want to treat the kernel as an arbitrary key/value store for varpd and
 169  * we want the kernel to own its own data and not have to ask userland for
 170  * information that it owns.
 171  *
 172  * Every property in the system has the following attributes:
 173  *
 174  *   o A name
 175  *   o A type
 176  *   o A size
 177  *   o Permissions
 178  *   o Default value
 179  *   o Valid value ranges
 180  *   o A value
 181  *
 182  * Everything except for the value is obtained by callers through the propinfo
 183  * callbacks and a property has a maximum size of OVERLAY_PROP_SIZEMAX,
 184  * currently 256 bytes.
 185  *
 186  * The following are the supported types of properties:
 187  *
 188  *      OVERLAY_PROP_T_INT
 189  *
 190  *              A signed integer, its length is 8 bytes, corresponding to a
 191  *              int64_t.
 192  *
 193  *      OVERLAY_PROP_T_UINT
 194  *
 195  *              An unsigned integer, its length is 8 bytes, corresponding to a
 196  *              uint64_t.
 197  *
 198  *      OVERLAY_PROP_T_IP
 199  *
 200  *              A struct in6_addr, it has a fixed size.
 201  *
 202  *      OVERLAY_PROP_T_STRING
 203  *
 204  *              A null-terminated character string encoded in either ASCII or
 205  *              UTF-8. Note that the size of the string includes the null
 206  *              terminator.
 207  *
 208  *      OVERLAY_PROP_T_ETHER
 209  *
 210  *              An ether_addr_t, which has a fixed size.
 211  *
 212  * The next thing that we apply to a property is its permission. The permissions
 213  * are put together by the bitwise or of the following flags and values.
 214  *
 215  *      OVERLAY_PROP_PERM_REQ
 216  *
 217  *              This indicates a required property. A property that is required
 218  *              must be set by a consumer before the device can be created. If a
 219  *              required property has a default property, this constraint is
 220  *              loosened because the default property defines the value.
 221  *
 222  *      OVERLAY_PROP_PERM_READ
 223  *
 224  *              This indicates that a property can be read. All properties will
 225  *              have this value set.
 226  *
 227  *      OVERLAY_PROP_PERM_WRITE
 228  *
 229  *              This indicates that a property can be written to and thus
 230  *              updated by userland. Properties that are only intended to
 231  *              display information, will not have OVERLAY_PROP_PERM_WRITE set.
 232  *
 233  * In addition, a few additional values are defined as a convenience to
 234  * consumers. The first, OVERLAY_PROP_PERM_RW, is a combination of
 235  * OVERLAY_PROP_PERM_READ and OVERLAY_PROP_PERM_WRITE. The second,
 236  * OVERLAY_PROP_PERM_RRW, is a combination of OVERLAY_PROP_PERM_REQ,
 237  * OVERLAY_PROP_PERM_READ, and OVERLAY_PROP_PERM_WRITE. The protection mode of a
 238  * property should generally be a constant across its lifetime.
 239  *
 240  * A property may optionally have a default value. If it does have a default
 241  * value, and that property is not set to be a different value, then the default
 242  * value is inherited automatically. It also means that if the default value is
 243  * acceptable, there is no need to set the value for a required property. For
 244  * example, the vxlan module has the vxlan/listen_port property which is
 245  * required, but has a default value of 4789 (the IANA assigned port). Because
 246  * of that default value, there is no need for it to be set.
 247  *
 248  * Finally, a property may declare a list of valid values. These valid values
 249  * are used for display purposes, they are not enforced by the broader system,
 250  * but merely allow a means for the information to be communicated to the user
 251  * through dladm(1M). Like a default value, this is optional.
 252  *
 253  * The general scaffolding does not do very much with respect to the getting and
 254  * setting of properties. That is really owned by the individual plugins
 255  * themselves.
 256  *
 257  * -----------------------------
 258  * Destinations and Plugin Types
 259  * -----------------------------
 260  *
 261  * Both encapsulation and lookup plugins define the kinds of destinations that
 262  * they know how to support. There are three different pieces of information
 263  * that can be used to address a destination currently, all of which are
 264  * summarized in the type overlay_point_t. Any combination of these is
 265  * supported.
 266  *
 267  *      OVERLAY_PLUGIN_D_ETHERNET
 268  *
 269  *              An Ethernet MAC address is required.
 270  *
 271  *      OVERLAY_PLUGIN_D_IP
 272  *
 273  *              An IP address is required. All IP addresses used by the overlay
 274  *              system are transmitted as IPv6 addresses. IPv4 addresses can be
 275  *              represented by using IPv4-mapped IPv6 addresses.
 276  *
 277  *      OVERLAY_PLUGIN_D_PORT
 278  *
 279  *              A TCP/UDP port is required.
 280  *
 281  * A kernel encapsulation plugin declares which of these that it requires, it's
 282  * a static set. On the other hand, a userland lookup plugin can be built to
 283  * support all of these or any combination thereof. It gets passed the required
 284  * destination type, based on the kernel encapsulation method, and then it makes
 285  * the determination as to whether or not it supports it. For example, the
 286  * direct plugin can support either an IP or both an IP and a port, it simply
 287  * doesn't display the direct/dest_port property in the cases where a port is
 288  * not required to support this.
 289  *
 290  * The user lookup plugins have two different modes of operation which
 291  * determines how they interact with the broader system and how look ups are
 292  * performed. These types are:
 293  *
 294  *      OVERLAY_TARGET_POINT
 295  *
 296  *              A point to point plugin has a single static definition for where
 297  *              to send all traffic. Every packet in the system always gets sent
 298  *              to the exact same destination which is programmed into the
 299  *              kernel when the general device is activated.
 300  *
 301  *      OVERLAY_TARGET_DYNAMIC
 302  *
 303  *              A dynamic plugin does not have a single static definition.
 304  *              Instead, for each destination, the kernel makes an asynchronous
 305  *              request to varpd to determine where the packet should be routed,
 306  *              and if a specific destination is found, then that destination is
 307  *              cached in the overlay device's target cache.
 308  *
 309  * This distinction, while important for the general overlay device's operation,
 310  * is not important to the encapsulation plugins. They don't need to know about
 311  * any of these pieces. It's just a concern for varpd, the userland plugin, and
 312  * the general overlay scaffolding.
 313  *
 314  * When an overlay device is set to OVERLAY_TARGET_POINT, then it does not
 315  * maintain a target cache, and instead just keeps track of the destination and
 316  * always sends encapsulated packets to that address. When the target type is of
 317  * OVERLAY_TARGET_DYNAMIC, then the kernel maintains a cache of all such
 318  * destinations. These destinations are kept around in an instance of a
 319  * reference hash that is specific to the given overlay device. Entries in the
 320  * cache can be invalidated and replaced by varpd and its lookup plugins.
 321  *
 322  * ----------------------------------
 323  * Kernel Components and Architecture
 324  * ----------------------------------
 325  *
 326  * There are multiple pieces inside the kernel that work together, there is the
 327  * general overlay_dev_t structure, which is the logical GLDv3 device, but it
 328  * itself has references to things like an instance of an encapsulation plugin,
 329  * a pointer to a mux and a target cache. It can roughly be summarized in the
 330  * following image:
 331  *
 332  *     +------------------+
 333  *     | global           |
 334  *     | overlay list     |
 335  *     | overlay_dev_list |
 336  *     +------------------+
 337  *        |
 338  *        |  +-----------------------+            +---------------+
 339  *        +->| GLDv3 Device          |----------->| GLDv3 Device  | -> ...
 340  *           | overlay_dev_t         |            | overlay_dev_t |
 341  *           |                       |            +---------------+
 342  *           |                       |
 343  *           | mac_handle_t     -----+---> GLDv3 handle to MAC
 344  *           | datalink_id_t    -----+---> Datalink ID used by DLS
 345  *           | overlay_dev_flag_t ---+---> Device state
 346  *           | uint_t           -----+---> Current device MTU
 347  *           | uint_t           -----+---> In-progress RX operations
 348  *           | uint_t           -----+---> In-progress TX operations
 349  *           | char[]           -----+---> FMA degraded message
 350  *           | void *           -----+---> plugin private data
 351  *           | overlay_target_t * ---+---------------------+
 352  *           | overlay_plugin_t * ---+---------+           |
 353  *           +-----------------------+         |           |
 354  *                           ^                 |           |
 355  *   +--------------------+  |                 |           |
 356  *   | Kernel Socket      |  |                 |           |
 357  *   | Multiplexor        |  |                 |           |
 358  *   | overlay_mux_t      |  |                 |           |
 359  *   |                    |  |                 |           |
 360  *   | avl_tree_t        -+--+                 |           |
 361  *   | uint_t            -+--> socket family   |           |
 362  *   | uint_t            -+--> socket type     |           |
 363  *   | uint_t            -+--> socket protocol |           |
 364  *   | ksocket_t         -+--> I/O socket      |           |
 365  *   | struct sockaddr * -+--> ksocket address |           |
 366  *   | overlay_plugin_t --+--------+           |           |
 367  *   +--------------------+        |           |           |
 368  *                                 |           |           |
 369  *   +-------------------------+   |           |           |
 370  *   | Encap Plugin            |<--+-----------+           |
 371  *   | overlay_plugin_t        |                           |
 372  *   |                         |                           |
 373  *   | char *               ---+--> plugin name            |
 374  *   | overlay_plugin_ops_t * -+--> plugin downcalls       |
 375  *   | char ** (props)      ---+--> property list          |
 376  *   | uint_t               ---+--> id length              |
 377  *   | overlay_plugin_flags_t -+--> plugin flags           |
 378  *   | overlay_plugin_dest_t --+--> destination type       v
 379  *   +-------------------------+                    +-------------------------+
 380  *                                                  |   Target Cache          |
 381  *                                                  |   overlay_target_t      |
 382  *                                                  |                         |
 383  *                                    cache mode <--+- overlay_target_mode_t  |
 384  *                                     dest type <--+- overlay_plugin_dest_t  |
 385  *                                   cache flags <--+- overlay_target_flag_t  |
 386  *                                     varpd id  <--+- uint64_t               |
 387  *                       outstanding varpd reqs. <--+- uint_t                 |
 388  *                   OVERLAY_TARGET_POINT state  <--+- overlay_target_point_t |
 389  *               OVERLAY_TARGET_DYNAMIC state <-+---+- overlay_target_dyn_t   |
 390  *                                              |   +-------------------------+
 391  *                      +-----------------------+
 392  *                      |
 393  *                      v
 394  *   +-------------------------------+   +------------------------+
 395  *   | Target Entry                  |-->| Target Entry           |--> ...
 396  *   | overlay_target_entry_t        |   | overlay_target_entry_t |
 397  *   |                               |   +------------------------+
 398  *   |                               |
 399  *   | overlay_target_entry_flags_t -+--> Entry flags
 400  *   | uint8_t[ETHERADDRL]        ---+--> Target MAC address
 401  *   | overlay_target_point_t     ---+--> Target underlay address
 402  *   | mblk_t *                   ---+--> outstanding mblk head
 403  *   | mblk_t *                   ---+--> outstanding mblk tail
 404  *   | size_t                     ---+--> outstanding mblk size
 405  *   +-------------------------------+
 406  *
 407  * The primary entries that we care about are the overlay_dev_t, which
 408  * correspond to each overlay device that is created with dladm(1M). Globally,
 409  * these devices are maintained in a simple list_t which is protected with a
 410  * lock.  Hence, these include important information such as the mac_handle_t
 411  * and a datalink_id_t which is used to interact with the broader MAC and DLS
 412  * ecosystem. We also maintain additional information such as the current state,
 413  * outstanding operations, the mtu, and importantly, the plugin's private data.
 414  * This is the instance of an encapsulation plugin that gets created as part of
 415  * creating an overlay device. Another aspect of this is that the overlay_dev_t
 416  * also includes information with respect to FMA. For more information, see the
 417  * FMA section.
 418  *
 419  * Each overlay_dev_t has a pointer to a plugin, a mux, and a target. The plugin
 420  * is the encapsulation plugin. This allows the device to make downcalls into it
 421  * based on doing things like getting and setting properties. Otherwise, the
 422  * plugin itself is a fairly straightforward entity. They are maintained in an
 423  * (not pictured above) list. The plugins themselves mostly maintain things like
 424  * the static list of properties, what kind of destination they require, and the
 425  * operations vector. A given module may contain more if necessary.
 426  *
 427  * The next piece of the puzzle is the mux, or a multiplexor. The mux itself
 428  * maintains a ksocket and it is through the mux that we send and receive
 429  * message blocks. The mux represents a socket type and address, as well as a
 430  * plugin. Multiple overlay_dev_t devices may then share the same mux. For
 431  * example, consider the case where you have different instances of vxlan all on
 432  * the same underlay network. These would all logically share the same IP
 433  * address and port that packets are sent and received on; however, what differs
 434  * is the decapsulation ID.
 435  *
 436  * Each mux maintains a ksocket_t which is similar to a socket(3SOCKET). Unlike
 437  * a socket, we enable a direct callback on the ksocket. This means that
 438  * whenever a message block chain is received, rather than sitting there and
 439  * getting a callback in a context and kicking that back out to a taskq, the
 440  * data instead comes directly into the callback function overlay_mux_recv().
 441  *
 442  * The mux is given encapsulated packets (via overlay_m_tx, the GLDv3 tx
 443  * function) to transmit. It receives encapsulated packets, decapsulates them to
 444  * determine the overlay identifier, looks up the given device that matches that
 445  * identifier, and then causes the broader MAC world to receive the packet with
 446  * a call to mac_rx().
 447  *
 448  * Today, we don't do too much that's special with the ksocket; however, as
 449  * hardware is gaining understanding for these encapsulation protocols, we'll
 450  * probably want to think of better ways to get those capabilities passed down
 451  * and potentially better ways to program receive filters so they get directly
 452  * to us. Though, that's all fantasy future land.
 453  *
 454  * The next part of the puzzle is the target cache. The purpose of the target
 455  * cache is to cache where we should send a packet on the underlay network,
 456  * given its mac address. The target cache operates in two modes depending on
 457  * whether the lookup module was declared to be OVERLAY_TARGET_POINT or
 458  * OVERLAY_TARGET_DYNAMIC.
 459  *
 460  * In the case where the target cache has been programmed to be
 461  * OVERLAY_TARGET_POINT, then we only maintain a single overlay_target_point_t
 462  * which has the destination that we send everything, no matter the destination
 463  * mac address.
 464  *
 465  * On the other hand, when we have an instance of OVERLAY_TARGET_DYNAMIC, things
 466  * are much more interesting and as a result, more complicated. We primarily
 467  * store lists of overlay_target_entry_t's which are stored in both an avl tree
 468  * and a refhash_t. The primary look up path uses the refhash_t and the avl tree
 469  * is only used for a few of the target ioctls used to dump data such that we
 470  * can get a consistent iteration order for things like dladm show-overlay -t.
 471  * The key that we use for the reference hashtable is based on the mac address
 472  * in the cache and currently we just do a simple CRC32 to transform it into a
 473  * hash.
 474  *
 475  * Each entry maintains a set of flags to indicate the current status of the
 476  * request. The flags may indicate one of three states: that current cache entry
 477  * is valid, that the current cache entry has been directed to drop all output,
 478  * and that the current cache entry is invalid and may be being looked up. In
 479  * the case where it's valid, we just take the destination address and run with
 480  * it.
 481  *
 482  * If it's invalid and a lookup has not been made, then we start the process
 483  * that prepares a query that will make its way up to varpd. The cache entry
 484  * maintains a message block chain of outstanding message blocks and a
 485  * size. These lists are populated only when we don't know the answer as to
 486  * where should these be sent. The size entry is used to cap the amount of
 487  * outstanding data that we don't know the answer to. If we exceed a cap on the
 488  * amount of outstanding data (currently 1 Mb), then we'll drop any additional
 489  * packets. Once we get an answer indicating a valid destination, we transmit
 490  * any outstanding data to that place. The full story on how we look that up
 491  * is discussed in the section on the Target Cache Life Cycle.
 492  *
 493  * ------------------------
 494  * FMA and Degraded Devices
 495  * ------------------------
 496  *
 497  * Every kernel overlay device keeps track of its FMA state. Today in FMA we
 498  * cannot represent partitions between resources nor can we represent that a
 499  * given minor node of a pseudo device has failed -- if we degrade the overlay
 500  * device, then the entire dev_info_t is degraded. However, we still want to be
 501  * able to indicate to administrators that things may go wrong.
 502  *
 503  * To this end, we've added a notion of a degraded state to every overlay
 504  * device. This state is primarily dictated by userland and it can happen for
 505  * various reasons. Generally, because a userland lookup plugin has been
 506  * partitioned, or something has gone wrong such that there is no longer any
 507  * userland lookup module for a device, then we'll mark it degraded.
 508  *
 509  * As long as any of our minor instances is degraded, then we'll fire off the
 510  * FMA event to note that. Once the last degraded instance is no longer
 511  * degraded, then we'll end up telling FMA that we're all clean.
 512  *
 513  * To help administrators get a better sense of which of the various minor
 514  * devices is wrong, we store the odd_fmamsg[] character array. This character
 515  * array can be fetched with doing a dladm show-overlay -f.
 516  *
 517  * Note, that it's important that we do not update the link status of the
 518  * devices. We want to remain up as much as possible. By changing the link in a
 519  * degraded state, this may end up making things worse. We may still actually
 520  * have information in the target cache and if we mark the link down, that'll
 521  * result in not being able to use it. The reason being that this'll mark all
 522  * the downstream VNICs down which will go to IP and from there we end up
 523  * dealing with sadness.
 524  *
 525  * -----------------------
 526  * Target Cache Life Cycle
 527  * -----------------------
 528  *
 529  * This section only applies when we have a lookup plugin of
 530  * OVERLAY_TARGET_DYNAMIC. None of this applies to those of type
 531  * OVERLAY_TARGET_POINT.
 532  *
 533  * While we got into the target cache in the general architecture section, it's
 534  * worth going into more details as to how this actually works and showing some
 535  * examples and state machines. Recall that a target cache entry basically has
 536  * the following state transition diagram:
 537  *
 538  * Initial state
 539  *    . . .           . . . first access       . . . varpd lookup enqueued
 540  *        .           .                        .
 541  *        .           .                        .
 542  *     +-------+      .     +----------+       .
 543  *     |  No   |------*---->| Invalid  |-------*----+
 544  *     | Entry |            |  Entry   |            |
 545  *     +-------+            +----------+            |
 546  *                 varpd      ^      ^   varpd      |
 547  *                 invalidate |      |   drop       |
 548  *                      . . . *      * . .          v
 549  *          +-------+         |      |         +---------+
 550  *          | Entry |--->-----+      +----<----| Entry   |
 551  *          | Valid |<----------*---------<----| Pending |->-+     varpd
 552  *          +-------+           .              +---------+   * . . drop, but
 553  *                              . varpd                ^     |     other queued
 554  *                              . success              |     |     entries
 555  *                                                     +-----+
 556  *
 557  * When the table is first created, it is empty. As we attempt to lookup entries
 558  * and we find there is no entry at all, we'll create a new table entry for it.
 559  * At that point the entry is technically in an invalid state, that means that
 560  * we have no valid data from varpd. In that case, we'll go ahead and queue the
 561  * packet into the entry's pending chain, and queue a varpd lookup, setting the
 562  * OVERLAY_ENTRY_F_PENDING flag in the process.
 563  *
 564  * If additional mblk_t's come in for this entry, we end up appending them to
 565  * the tail of the chain, if and only if, we don't exceed the threshold for the
 566  * amount of space they can take up. An entry remains pending until we get a
 567  * varpd reply. If varpd replies with a valid result, we move to the valid
 568  * entry state, and remove the OVERLAY_ENTRY_F_PENDING flag and set it with one
 569  * of OVERLAY_ENTRY_F_VALID or OVERLAY_ENTRY_F_DROP as appropriate.
 570  *
 571  * Once an entry is valid, it stays valid until user land tells us to invalidate
 572  * it with an ioctl or replace it, OVERLAY_TARG_CACHE_REMOVE and
 573  * OVERLAY_TARG_CACHE_SET respectively.
 574  *
 575  * If the lookup fails with a call to drop the packet, then the next state is
 576  * determined by the state of the queue. If the set of outstanding entries is
 577  * empty, then we just transition back to the invalid state. If instead, the
 578  * set of outstanding entries is not empty, then we'll queue another entry and
 579  * stay in the same state, repeating this until the number of requests is
 580  * drained.
 581  *
 582  * The following images describe the flow of a given lookup and where the
 583  * overlay_target_entry_t is at any given time.
 584  *
 585  *     +-------------------+
 586  *     | Invalid Entry     |            An entry starts off as an invalid entry
 587  *     | de:ad:be:ef:00:00 |            and only exists in the target cache.
 588  *     +-------------------+
 589  *
 590  *      ~~~~
 591  *
 592  *     +---------------------+
 593  *     | Global list_t       |          A mblk_t comes in for an entry. We
 594  *     | overlay_target_list |          append it to the overlay_target_list.
 595  *     +---------------------+
 596  *                   |
 597  *                   v
 598  *             +-------------------+      +-------------------+
 599  *             | Pending Entry     |----->| Pending Entry     |--->...
 600  *             | 42:5e:1a:10:d6:2d |      | de:ad:be:ef:00:00 |
 601  *             +-------------------+      +-------------------+
 602  *
 603  *      ~~~~
 604  *
 605  *     +--------------------------+
 606  *     | /dev/overlay minor state |     User land said that it would look up an
 607  *     | overlay_target_hdl_t     |     entry for us. We remove it from the
 608  *     +--------------------------+     global list and add it to the handle's
 609  *                  |                   outstanding list.
 610  *                  |
 611  *                  v
 612  *            +-------------------+      +-------------------+
 613  *            | Pending Entry     |----->| Pending Entry     |
 614  *            | 90:b8:d0:79:02:dd |      | de:ad:be:ef:00:00 |
 615  *            +-------------------+      +-------------------+
 616  *
 617  *      ~~~~
 618  *
 619  *     +-------------------+
 620  *     | Valid Entry       |            varpd returned an answer with
 621  *     | de:ad:be:ef:00:00 |            OVERLAY_IOC_RESPOND and the target cache
 622  *     | 10.169.23.42:4789 |            entry is now populated with a
 623  *     +-------------------+            destination and marked as valid
 624  *
 625  *
 626  * The lookup mechanism is performed via a series of operations on the character
 627  * pseudo-device /dev/overlay. The only thing that uses this device is the
 628  * userland daemon varpd. /dev/overlay is a cloneable device, each open of it
 629  * granting a new minor number which maintains its own state. We maintain this
 630  * state so that way if an outstanding lookup was queued to something that
 631  * crashed or closed its handle without responding, we can know about this and
 632  * thus handle it appropriately.
 633  *
 634  * When a lookup is first created it's added to our global list of outstanding
 635  * lookups. To service requests, userland is required to perform an ioctl to ask
 636  * for a request. We will block it in the kernel a set amount of time waiting
 637  * for a request. When we give a request to a given minor instance of the
 638  * device, we remove it from the global list and append the request to the
 639  * device's list of outstanding entries, for the reasons we discussed above.
 640  * When a lookup comes in, we give user land a smaller amount of information
 641  * specific to that packet, the overlay_targ_lookup_t. It includes a request id
 642  * to identify this, and then the overlay id, the varpd id, the header and
 643  * packet size, the source and destination mac address, the SAP, and any
 644  * potential VLAN header.
 645  *
 646  * At that point, it stays in that outstanding list until one of two ioctls are
 647  * returned: OVERLAY_TARG_RESPOND or OVERLAY_TARG_DROP. During this time,
 648  * userland may also perform other operations. For example, it may use
 649  * OVERLAY_TARG_PKT to get a copy of this packet so it can perform more in-depth
 650  * analysis of what to do beyond what we gave it initially. This is useful for
 651  * providing proxy arp and the like. Finally, there are two other ioctls that
 652  * varpd can then do. The first is OVERLAY_TARG_INJECT which injects the
 653  * non-jumbo frame packet up into that mac device and OVERLAY_TARG_RESEND which
 654  * causes us to encapsulate and send out the packet they've given us.
 655  *
 656  *
 657  * Finally, through the target cache, several ioctls are provided to allow for
 658  * interrogation and management of the cache. They allow for individual entries
 659  * to be retrieved, set, or have the entire table flushed. For the full set of
 660  * ioctls here and what they do, take a look at uts/common/sys/overlay_target.h.
 661  *
 662  * ------------------
 663  * Sample Packet Flow
 664  * ------------------
 665  *
 666  * There's a lot of pieces here, hopefully an example of how this all fits
 667  * together will help clarify and elucidate what's going on. We're going to
 668  * first track an outgoing packet, eg. one that is sent from an IP interface on
 669  * a VNIC on top of an overlay device, and then we'll look at what it means to
 670  * respond to that.
 671  *
 672  *
 673  *    +----------------+        +--------------+            +------------------+
 674  *    | IP/DLS send    |------->| MAC sends it |----------->| mblk_t reaches   |
 675  *    | packet to MAC  |        | to the GLDv3 |            | overlay GLDv3 tx |
 676  *    +----------------+        | VNIC device  |            | overlay_m_tx()   |
 677  *                              +--------------+            +------------------+
 678  *                                                                   |
 679  *                             . lookup              . cache         |
 680  *                             . drop                . miss          v
 681  *            +---------+      .       +--------+    .      +------------------+
 682  *            | freemsg |<-----*-------| varpd  |<---*------| Lookup each mblk |
 683  *            | mblk_t  |              | lookup |           | in the target    |
 684  *            +---------+              | queued |           | cache            |
 685  *                ^                    +--------+           +------------------+
 686  *      on send   |                        |                         |     cache
 687  *      error . . *                        *. . lookup               * . . hit
 688  *                |                        |    success              v
 689  *                |                        |                +------------------+
 690  *    +-----------------+                  +--------------->| call plugin      |
 691  *    | Send out        |                                   | ovpo_encap() to  |
 692  *    | overlay_mux_t's |<----------------------------------| get encap mblk_t |
 693  *    | ksocket         |                                   +------------------+
 694  *    +-----------------+
 695  *
 696  * The receive end point looks a little different and looks more like:
 697  *
 698  *  +------------------+     +----------------+    +-----------+
 699  *  | mblk_t comes off |---->| enter netstack |--->| delivered |---+
 700  *  | the physical     |     | IP stack       |    |     to    |   * . . direct
 701  *  | device           |     +----------------+    |  ksocket  |   |   callback
 702  *  +------------------+                           +-----------+   |
 703  *                       . overlay id                              |
 704  *                       . not found                               v
 705  *       +-----------+   .      +-----------------+       +--------------------+
 706  *       | freemsg   |<--*------| call plugin     |<------| overlay_mux_recv() |
 707  *       | mblk_t    |          | ovpo_decap() to |       +--------------------+
 708  *       +-----------+          | decap mblk_t    |
 709  *                              +-----------------+
 710  *                                     |
 711  *                                     * . . overlay id
 712  *                                     v     found
 713  *                                 +--------+      +----------------+
 714  *                                 | adjust |----->| call mac_rx    |
 715  *                                 | mblk_t |      | on original    |
 716  *                                 +--------+      | decaped packet |
 717  *                                                 +----------------+
 718  *
 719  * ------------------
 720  * Netstack Awareness
 721  * ------------------
 722  *
 723  * In the above image we note that this enters a netstack. Today the only
 724  * netstack that can be is the global zone as the overlay driver itself is not
 725  * exactly netstack aware. What this really means is that varpd cannot run in a
 726  * non-global zone and an overlay device cannot belong to a non-global zone.
 727  * Non-global zones can still have a VNIC assigned to them that's been created
 728  * over the overlay device the same way they would if it had been created over
 729  * an etherstub or a physical device.
 730  *
 731  * The majority of the work to make it netstack aware is straightforward and the
 732  * biggest thing is to create a netstack module that allows us to hook into
 733  * netstack (and thus zone) creation and destruction.  From there, we need to
 734  * amend the target cache lookup routines that we discussed earlier to not have
 735  * a global outstanding list and a global list of handles, but rather, one per
 736  * netstack.
 737  *
 738  * For the mux, we'll need to open the ksocket in the context of the zone, we
 739  * can likely do this with a properly composed credential, but we'll need to do
 740  * some more work on that path. Finally, we'll want to make sure the dld ioctls
 741  * are aware of the zoneid of the caller and we use that appropriately and store
 742  * it in the overlay_dev_t.
 743  *
 744  * -----------
 745  * GLDv3 Notes
 746  * -----------
 747  *
 748  * The overlay driver implements a GLDv3 device. Parts of GLDv3 are more
 749  * relevant and other parts are much less relevant for us. For example, the
 750  * GLDv3 is used to toggle the device being put into and out of promiscuous
 751  * mode, to program MAC addresses for unicast and multicast hardware filters.
 752  * Today, an overlay device doesn't have a notion of promiscuous mode nor does
 753  * it have a notion of unicast and multicast addresses programmed into the
 754  * device. Instead, for the purposes of the hardware filter, we don't do
 755  * anything and just always accept new addresses being added and removed.
 756  *
 757  * If the GLDv3 start function has not been called, then we will not use this
 758  * device for I/O purposes. Any calls to transmit or receive should be dropped,
 759  * though the GLDv3 guarantees us that transmit will not be called without
 760  * calling start. Similarly, once stop is called, then no packets can be dealt
 761  * with.
 762  *
 763  * Today we don't support the stat interfaces, though there's no good reason
 764  * that we shouldn't assemble some of the stats based on what we have in the
 765  * future.
 766  *
 767  * When it comes to link properties, many of the traditional link properties do
 768  * not apply and many others MAC handles for us. For example, we don't need to
 769  * implement anything for overlay_m_getprop() to deal with returning the MTU, as
 770  * MAC never calls into us for that. As such, there isn't much of anything to
 771  * support in terms of properties.
 772  *
 773  * Today, we don't support any notion of hardware capabilities. However, if
 774  * future NIC hardware or other changes to the system cause it to make sense for
 775  * us to emulate logical groups, then we should do that. However, we still do
 776  * implement a capab function so that we can identify ourselves as an overlay
 777  * device to the broader MAC framework. This is done mostly so that a device
 778  * created on top of us can have fanout rings as we don't try to lie about a
 779  * speed for our device.
 780  *
 781  * The other question is what should be done for a device's MTU and margin. We
 782  * set our minimum supported MTU to be the minimum value that an IP network may
 783  * be set to, 576, which mimics what an etherstub does. On the flip side, we
 784  * have our upper bound set to 8900. This value comes from the fact that a lot
 785  * of jumbo networks use their maximum as 9000. As such, we want to reserve 100
 786  * bytes, which isn't exactly the most accurate number, but it'll be good enough
 787  * for now. Because of that, our default MTU off of these devices is 1400, as
 788  * the default MTU for everything is usually 1500 or whatever the underlying
 789  * device is at; however, this is a bit simpler than asking the netstack what
 790  * are all the IP interfaces at. It also calls into question how PMTU and PMTU
 791  * discovery should work here. The challenge, especially for
 792  * OVERLAY_TARGET_DYNAMIC, is that the MTU to any of the places will vary and
 793  * it's not clear that if you have a single bad entry the overall MTU should be
 794  * lowered. Instead, we should figure out a better way of determining these
 795  * kinds of PMTU errors and appropriately alerting the administrator via FMA.
 796  *
 797  * Regarding margin, we allow a margin of up to VLAN_TAGSZ depending on whether
 798  * or not the underlying encapsulation device supports VLAN tags. If it does,
 799  * then we'll set the margin to allow for it, otherwise, we will not.
 800  */
 801 
 802 #include <sys/conf.h>
 803 #include <sys/errno.h>
 804 #include <sys/stat.h>
 805 #include <sys/ddi.h>
 806 #include <sys/sunddi.h>
 807 #include <sys/modctl.h>
 808 #include <sys/policy.h>
 809 #include <sys/stream.h>
 810 #include <sys/strsubr.h>
 811 #include <sys/strsun.h>
 812 #include <sys/types.h>
 813 #include <sys/kmem.h>
 814 #include <sys/param.h>
 815 #include <sys/sysmacros.h>
 816 #include <sys/ddifm.h>
 817 
 818 #include <sys/dls.h>
 819 #include <sys/dld_ioc.h>
 820 #include <sys/mac_provider.h>
 821 #include <sys/mac_client_priv.h>
 822 #include <sys/mac_ether.h>
 823 #include <sys/vlan.h>
 824 
 825 #include <sys/overlay_impl.h>
 826 
 827 dev_info_t *overlay_dip;
 828 static kmutex_t overlay_dev_lock;
 829 static list_t overlay_dev_list;
 830 static uint8_t overlay_macaddr[ETHERADDRL] =
 831         { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
 832 
 833 typedef enum overlay_dev_prop {
 834         OVERLAY_DEV_P_MTU = 0,
 835         OVERLAY_DEV_P_VNETID,
 836         OVERLAY_DEV_P_ENCAP,
 837         OVERLAY_DEV_P_VARPDID,
 838         OVERLAY_DEV_P_DCID
 839 } overlay_dev_prop_t;
 840 
 841 #define OVERLAY_DEV_NPROPS      5
 842 static const char *overlay_dev_props[] = {
 843         "mtu",
 844         "vnetid",
 845         "encap",
 846         "varpd/id",
 847         "dcid"
 848 };
 849 
 850 #define OVERLAY_MTU_MIN 576
 851 #define OVERLAY_MTU_DEF 1400
 852 #define OVERLAY_MTU_MAX 8900
 853 
 854 overlay_dev_t *
 855 overlay_hold_by_dlid(datalink_id_t id)
 856 {
 857         overlay_dev_t *o;
 858 
 859         mutex_enter(&overlay_dev_lock);
 860         for (o = list_head(&overlay_dev_list); o != NULL;
 861             o = list_next(&overlay_dev_list, o)) {
 862                 if (id == o->odd_linkid) {
 863                         mutex_enter(&o->odd_lock);
 864                         o->odd_ref++;
 865                         mutex_exit(&o->odd_lock);
 866                         mutex_exit(&overlay_dev_lock);
 867                         return (o);
 868                 }
 869         }
 870 
 871         mutex_exit(&overlay_dev_lock);
 872         return (NULL);
 873 }
 874 
 875 void
 876 overlay_hold_rele(overlay_dev_t *odd)
 877 {
 878         mutex_enter(&odd->odd_lock);
 879         ASSERT(odd->odd_ref > 0);
 880         odd->odd_ref--;
 881         mutex_exit(&odd->odd_lock);
 882 }
 883 
 884 void
 885 overlay_io_start(overlay_dev_t *odd, overlay_dev_flag_t flag)
 886 {
 887         ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX);
 888         ASSERT(MUTEX_HELD(&odd->odd_lock));
 889 
 890         if (flag & OVERLAY_F_IN_RX)
 891                 odd->odd_rxcount++;
 892         if (flag & OVERLAY_F_IN_TX)
 893                 odd->odd_txcount++;
 894         odd->odd_flags |= flag;
 895 }
 896 
 897 void
 898 overlay_io_done(overlay_dev_t *odd, overlay_dev_flag_t flag)
 899 {
 900         boolean_t signal = B_FALSE;
 901 
 902         ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX);
 903         ASSERT(MUTEX_HELD(&odd->odd_lock));
 904 
 905         if (flag & OVERLAY_F_IN_RX) {
 906                 ASSERT(odd->odd_rxcount > 0);
 907                 odd->odd_rxcount--;
 908                 if (odd->odd_rxcount == 0) {
 909                         signal = B_TRUE;
 910                         odd->odd_flags &= ~OVERLAY_F_IN_RX;
 911                 }
 912         }
 913         if (flag & OVERLAY_F_IN_TX) {
 914                 ASSERT(odd->odd_txcount > 0);
 915                 odd->odd_txcount--;
 916                 if (odd->odd_txcount == 0) {
 917                         signal = B_TRUE;
 918                         odd->odd_flags &= ~OVERLAY_F_IN_TX;
 919                 }
 920         }
 921 
 922         if (signal == B_TRUE)
 923                 cv_broadcast(&odd->odd_iowait);
 924 }
 925 
 926 static void
 927 overlay_io_wait(overlay_dev_t *odd, overlay_dev_flag_t flag)
 928 {
 929         ASSERT((flag & ~OVERLAY_F_IOMASK) == 0);
 930         ASSERT(MUTEX_HELD(&odd->odd_lock));
 931 
 932         while (odd->odd_flags & flag) {
 933                 cv_wait(&odd->odd_iowait, &odd->odd_lock);
 934         }
 935 }
 936 
 937 void
 938 overlay_dev_iter(overlay_dev_iter_f func, void *arg)
 939 {
 940         overlay_dev_t *odd;
 941 
 942         mutex_enter(&overlay_dev_lock);
 943         for (odd = list_head(&overlay_dev_list); odd != NULL;
 944             odd = list_next(&overlay_dev_list, odd)) {
 945                 if (func(odd, arg) != 0) {
 946                         mutex_exit(&overlay_dev_lock);
 947                         return;
 948                 }
 949         }
 950         mutex_exit(&overlay_dev_lock);
 951 }
 952 
 953 /* ARGSUSED */
 954 static int
 955 overlay_m_stat(void *arg, uint_t stat, uint64_t *val)
 956 {
 957         return (ENOTSUP);
 958 }
 959 
 960 static int
 961 overlay_m_start(void *arg)
 962 {
 963         overlay_dev_t *odd = arg;
 964         overlay_mux_t *mux;
 965         int ret, domain, family, prot;
 966         struct sockaddr_storage storage;
 967         socklen_t slen;
 968 
 969         mutex_enter(&odd->odd_lock);
 970         if ((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0) {
 971                 mutex_exit(&odd->odd_lock);
 972                 return (EAGAIN);
 973         }
 974         mutex_exit(&odd->odd_lock);
 975 
 976         ret = odd->odd_plugin->ovp_ops->ovpo_socket(odd->odd_pvoid, &domain,
 977             &family, &prot, (struct sockaddr *)&storage, &slen);
 978         if (ret != 0)
 979                 return (ret);
 980 
 981         mux = overlay_mux_open(odd->odd_plugin, domain, family, prot,
 982             (struct sockaddr *)&storage, slen, &ret);
 983         if (mux == NULL)
 984                 return (ret);
 985 
 986         overlay_mux_add_dev(mux, odd);
 987         odd->odd_mux = mux;
 988         mutex_enter(&odd->odd_lock);
 989         ASSERT(!(odd->odd_flags & OVERLAY_F_IN_MUX));
 990         odd->odd_flags |= OVERLAY_F_IN_MUX;
 991         mutex_exit(&odd->odd_lock);
 992 
 993         return (0);
 994 }
 995 
 996 static void
 997 overlay_m_stop(void *arg)
 998 {
 999         overlay_dev_t *odd = arg;
1000 
1001         /*
1002          * The MAC Perimeter is held here, so we don't have to worry about
1003          * synchornizing this with respect to metadata operations.
1004          */
1005         mutex_enter(&odd->odd_lock);
1006         VERIFY(odd->odd_flags & OVERLAY_F_IN_MUX);
1007         VERIFY(!(odd->odd_flags & OVERLAY_F_MDDROP));
1008         odd->odd_flags |= OVERLAY_F_MDDROP;
1009         overlay_io_wait(odd, OVERLAY_F_IOMASK);
1010         mutex_exit(&odd->odd_lock);
1011 
1012         overlay_mux_remove_dev(odd->odd_mux, odd);
1013         overlay_mux_close(odd->odd_mux);
1014         odd->odd_mux = NULL;
1015 
1016         mutex_enter(&odd->odd_lock);
1017         odd->odd_flags &= ~OVERLAY_F_IN_MUX;
1018         odd->odd_flags &= ~OVERLAY_F_MDDROP;
1019         VERIFY((odd->odd_flags & OVERLAY_F_STOPMASK) == 0);
1020         mutex_exit(&odd->odd_lock);
1021 }
1022 
1023 /*
1024  * For more info on this, see the big theory statement.
1025  */
1026 /* ARGSUSED */
1027 static int
1028 overlay_m_promisc(void *arg, boolean_t on)
1029 {
1030         return (0);
1031 }
1032 
1033 /*
1034  * For more info on this, see the big theory statement.
1035  */
1036 /* ARGSUSED */
1037 static int
1038 overlay_m_multicast(void *arg, boolean_t add, const uint8_t *addrp)
1039 {
1040         return (0);
1041 }
1042 
/*
 * GLDv3 unicast address entry point. An overlay device has no unicast address
 * programmed into it, so any address is accepted and nothing is done. See the
 * GLDv3 Notes section of the big theory statement.
 */
/* ARGSUSED */
static int
overlay_m_unicast(void *arg, const uint8_t *macaddr)
{
        return (0);
}
1052 
1053 mblk_t *
1054 overlay_m_tx(void *arg, mblk_t *mp_chain)
1055 {
1056         overlay_dev_t *odd = arg;
1057         mblk_t *mp, *ep;
1058         int ret;
1059         ovep_encap_info_t einfo;
1060         struct msghdr hdr;
1061 
1062         mutex_enter(&odd->odd_lock);
1063         if ((odd->odd_flags & OVERLAY_F_MDDROP) ||
1064             !(odd->odd_flags & OVERLAY_F_IN_MUX)) {
1065                 mutex_exit(&odd->odd_lock);
1066                 freemsgchain(mp_chain);
1067                 return (NULL);
1068         }
1069         overlay_io_start(odd, OVERLAY_F_IN_TX);
1070         mutex_exit(&odd->odd_lock);
1071 
1072         bzero(&hdr, sizeof (struct msghdr));
1073 
1074         bzero(&einfo, sizeof (ovep_encap_info_t));
1075 
1076         mp = mp_chain;
1077         while (mp != NULL) {
1078                 socklen_t slen;
1079                 struct sockaddr_storage storage;
1080 
1081                 mp_chain = mp->b_next;
1082                 mp->b_next = NULL;
1083                 ep = NULL;
1084 
1085                 ret = overlay_target_lookup(odd, mp,
1086                     (struct sockaddr *)&storage, &slen, &einfo.ovdi_id);
1087                 if (ret != OVERLAY_TARGET_OK) {
1088                         if (ret == OVERLAY_TARGET_DROP)
1089                                 freemsg(mp);
1090                         mp = mp_chain;
1091                         continue;
1092                 }
1093 
1094                 hdr.msg_name = &storage;
1095                 hdr.msg_namelen = slen;
1096 
1097                 ret = odd->odd_plugin->ovp_ops->ovpo_encap(odd->odd_mh, mp,
1098                     &einfo, &ep);
1099                 if (ret != 0 || ep == NULL) {
1100                         freemsg(mp);
1101                         goto out;
1102                 }
1103 
1104                 ep->b_cont = mp;
1105                 ret = overlay_mux_tx(odd->odd_mux, &hdr, ep);
1106                 if (ret != 0)
1107                         goto out;
1108 
1109                 mp = mp_chain;
1110         }
1111 
1112 out:
1113         mutex_enter(&odd->odd_lock);
1114         overlay_io_done(odd, OVERLAY_F_IN_TX);
1115         mutex_exit(&odd->odd_lock);
1116         return (mp_chain);
1117 }
1118 
1119 /* ARGSUSED */
1120 static void
1121 overlay_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
1122 {
1123         miocnak(q, mp, 0, ENOTSUP);
1124 }
1125 
1126 /* ARGSUSED */
1127 static boolean_t
1128 overlay_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
1129 {
1130         /*
1131          * Tell MAC we're an overlay.
1132          */
1133         if (cap == MAC_CAPAB_OVERLAY)
1134                 return (B_TRUE);
1135         return (B_FALSE);
1136 }
1137 
1138 /* ARGSUSED */
1139 static int
1140 overlay_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1141     uint_t pr_valsize, const void *pr_val)
1142 {
1143         uint32_t mtu, old;
1144         int err;
1145         overlay_dev_t *odd = arg;
1146 
1147         if (pr_num != MAC_PROP_MTU)
1148                 return (ENOTSUP);
1149 
1150         bcopy(pr_val, &mtu, sizeof (mtu));
1151         if (mtu < OVERLAY_MTU_MIN || mtu > OVERLAY_MTU_MAX)
1152                 return (EINVAL);
1153 
1154         mutex_enter(&odd->odd_lock);
1155         old = odd->odd_mtu;
1156         odd->odd_mtu = mtu;
1157         err = mac_maxsdu_update(odd->odd_mh, mtu);
1158         if (err != 0)
1159                 odd->odd_mtu = old;
1160         mutex_exit(&odd->odd_lock);
1161 
1162         return (err);
1163 }
1164 
1165 /* ARGSUSED */
1166 static int
1167 overlay_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1168     uint_t pr_valsize, void *pr_val)
1169 {
1170         return (ENOTSUP);
1171 }
1172 
1173 /* ARGSUSED */
1174 static void
1175 overlay_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1176     mac_prop_info_handle_t prh)
1177 {
1178         if (pr_num != MAC_PROP_MTU)
1179                 return;
1180 
1181         mac_prop_info_set_default_uint32(prh, OVERLAY_MTU_DEF);
1182         mac_prop_info_set_range_uint32(prh, OVERLAY_MTU_MIN, OVERLAY_MTU_MAX);
1183 }
1184 
1185 static mac_callbacks_t overlay_m_callbacks = {
1186         .mc_callbacks = (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP |
1187             MC_PROPINFO),
1188         .mc_getstat = overlay_m_stat,
1189         .mc_start = overlay_m_start,
1190         .mc_stop = overlay_m_stop,
1191         .mc_setpromisc = overlay_m_promisc,
1192         .mc_multicst = overlay_m_multicast,
1193         .mc_unicst = overlay_m_unicast,
1194         .mc_tx = overlay_m_tx,
1195         .mc_ioctl = overlay_m_ioctl,
1196         .mc_getcapab = overlay_m_getcapab,
1197         .mc_getprop = overlay_m_getprop,
1198         .mc_setprop = overlay_m_setprop,
1199         .mc_propinfo = overlay_m_propinfo
1200 };
1201 
1202 static boolean_t
1203 overlay_valid_name(const char *name, size_t buflen)
1204 {
1205         size_t actlen;
1206         int err, i;
1207 
1208         for (i = 0; i < buflen; i++) {
1209                 if (name[i] == '\0')
1210                         break;
1211         }
1212 
1213         if (i == 0 || i == buflen)
1214                 return (B_FALSE);
1215         actlen = i;
1216         if (strchr(name, '/') != NULL)
1217                 return (B_FALSE);
1218         if (u8_validate((char *)name, actlen, NULL,
1219             U8_VALIDATE_ENTIRE, &err) < 0)
1220                 return (B_FALSE);
1221         return (B_TRUE);
1222 }
1223 
1224 /* ARGSUSED */
1225 static int
1226 overlay_i_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
1227 {
1228         int err;
1229         uint64_t maxid;
1230         overlay_dev_t *odd, *o;
1231         mac_register_t *mac;
1232         overlay_ioc_create_t *oicp = karg;
1233 
1234         if (overlay_valid_name(oicp->oic_encap, MAXLINKNAMELEN) == B_FALSE)
1235                 return (EINVAL);
1236 
1237         odd = kmem_zalloc(sizeof (overlay_dev_t), KM_SLEEP);
1238         odd->odd_linkid = oicp->oic_linkid;
1239         odd->odd_plugin = overlay_plugin_lookup(oicp->oic_encap);
1240         if (odd->odd_plugin == NULL) {
1241                 kmem_free(odd, sizeof (overlay_dev_t));
1242                 return (ENOENT);
1243         }
1244         err = odd->odd_plugin->ovp_ops->ovpo_init((overlay_handle_t)odd,
1245             &odd->odd_pvoid);
1246         if (err != 0) {
1247                 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1248                 overlay_plugin_rele(odd->odd_plugin);
1249                 kmem_free(odd, sizeof (overlay_dev_t));
1250                 return (EINVAL);
1251         }
1252 
1253         /*
1254          * Make sure that our virtual network id is valid for the given plugin
1255          * that we're working with.
1256          */
1257         ASSERT(odd->odd_plugin->ovp_id_size <= 8);
1258         maxid = UINT64_MAX;
1259         if (odd->odd_plugin->ovp_id_size != 8)
1260                 maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) - 1ULL;
1261         if (oicp->oic_vnetid > maxid) {
1262                 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1263                 overlay_plugin_rele(odd->odd_plugin);
1264                 kmem_free(odd, sizeof (overlay_dev_t));
1265                 return (EINVAL);
1266         }
1267         odd->odd_vid = oicp->oic_vnetid;
1268 
1269         if (oicp->oic_dcid > UINT32_MAX) {
1270                 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1271                 overlay_plugin_rele(odd->odd_plugin);
1272                 kmem_free(odd, sizeof (overlay_dev_t));
1273                 return (EINVAL);
1274         }
1275         odd->odd_dcid = oicp->oic_dcid;
1276 
1277         mac = mac_alloc(MAC_VERSION);
1278         if (mac == NULL) {
1279                 mutex_exit(&overlay_dev_lock);
1280                 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1281                 overlay_plugin_rele(odd->odd_plugin);
1282                 kmem_free(odd, sizeof (overlay_dev_t));
1283                 return (EINVAL);
1284         }
1285 
1286         mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1287         mac->m_driver = odd;
1288         mac->m_dip = overlay_dip;
1289         mac->m_dst_addr = NULL;
1290         mac->m_callbacks = &overlay_m_callbacks;
1291         mac->m_pdata = NULL;
1292         mac->m_pdata_size = 0;
1293 
1294         mac->m_priv_props = NULL;
1295 
1296         /* Let mac handle this itself. */
1297         mac->m_instance = (uint_t)-1;
1298 
1299         /*
1300          * There is no real source address that should be used here, but saying
1301          * that we're not ethernet is going to cause its own problems. At the
1302          * end of the day, this is fine.
1303          */
1304         mac->m_src_addr = overlay_macaddr;
1305 
1306         /*
1307          * Start with the default MTU as the max SDU. If the MTU is changed, the
1308          * SDU will be changed to reflect that.
1309          */
1310         mac->m_min_sdu = 1;
1311         mac->m_max_sdu = OVERLAY_MTU_DEF;
1312         mac->m_multicast_sdu = 0;
1313 
1314         /*
1315          * The underlying device doesn't matter, instead this comes from the
1316          * encapsulation protocol and whether or not they allow VLAN tags.
1317          */
1318         if (odd->odd_plugin->ovp_flags & OVEP_F_VLAN_TAG) {
1319                 mac->m_margin = VLAN_TAGSZ;
1320         } else {
1321                 mac->m_margin = 0;
1322         }
1323 
1324         /*
1325          * Today, we have no MAC virtualization, it may make sense in the future
1326          * to go ahead and emulate some subset of this, but it doesn't today.
1327          */
1328         mac->m_v12n = MAC_VIRT_NONE;
1329 
1330         mutex_enter(&overlay_dev_lock);
1331         for (o = list_head(&overlay_dev_list); o != NULL;
1332             o = list_next(&overlay_dev_list, o)) {
1333                 if (o->odd_linkid == oicp->oic_linkid) {
1334                         mutex_exit(&overlay_dev_lock);
1335                         odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1336                         overlay_plugin_rele(odd->odd_plugin);
1337                         kmem_free(odd, sizeof (overlay_dev_t));
1338                         return (EEXIST);
1339                 }
1340 
1341                 if (o->odd_vid == oicp->oic_vnetid &&
1342                     o->odd_plugin == odd->odd_plugin) {
1343                         mutex_exit(&overlay_dev_lock);
1344                         odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1345                         overlay_plugin_rele(odd->odd_plugin);
1346                         kmem_free(odd, sizeof (overlay_dev_t));
1347                         return (EEXIST);
1348                 }
1349         }
1350 
1351         err = mac_register(mac, &odd->odd_mh);
1352         mac_free(mac);
1353         if (err != 0) {
1354                 mutex_exit(&overlay_dev_lock);
1355                 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1356                 overlay_plugin_rele(odd->odd_plugin);
1357                 kmem_free(odd, sizeof (overlay_dev_t));
1358                 return (err);
1359         }
1360 
1361         err = dls_devnet_create(odd->odd_mh, odd->odd_linkid,
1362             crgetzoneid(cred));
1363         if (err != 0) {
1364                 mutex_exit(&overlay_dev_lock);
1365                 (void) mac_unregister(odd->odd_mh);
1366                 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1367                 overlay_plugin_rele(odd->odd_plugin);
1368                 kmem_free(odd, sizeof (overlay_dev_t));
1369                 return (err);
1370         }
1371 
1372         mutex_init(&odd->odd_lock, NULL, MUTEX_DRIVER, NULL);
1373         cv_init(&odd->odd_iowait, NULL, CV_DRIVER, NULL);
1374         odd->odd_ref = 0;
1375         odd->odd_flags = 0;
1376         list_insert_tail(&overlay_dev_list, odd);
1377         mutex_exit(&overlay_dev_lock);
1378 
1379         return (0);
1380 }
1381 
1382 /* ARGSUSED */
1383 static int
1384 overlay_i_activate(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
1385 {
1386         int i, ret;
1387         overlay_dev_t *odd;
1388         mac_perim_handle_t mph;
1389         overlay_ioc_activate_t *oiap = karg;
1390         overlay_ioc_propinfo_t *infop;
1391         overlay_ioc_prop_t *oip;
1392         overlay_prop_handle_t phdl;
1393 
1394         odd = overlay_hold_by_dlid(oiap->oia_linkid);
1395         if (odd == NULL)
1396                 return (ENOENT);
1397 
1398         infop = kmem_alloc(sizeof (overlay_ioc_propinfo_t), KM_SLEEP);
1399         oip = kmem_alloc(sizeof (overlay_ioc_prop_t), KM_SLEEP);
1400         phdl = (overlay_prop_handle_t)infop;
1401 
1402         mac_perim_enter_by_mh(odd->odd_mh, &mph);
1403         mutex_enter(&odd->odd_lock);
1404         if (odd->odd_flags & OVERLAY_F_ACTIVATED) {
1405                 mutex_exit(&odd->odd_lock);
1406                 mac_perim_exit(mph);
1407                 overlay_hold_rele(odd);
1408                 kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1409                 kmem_free(oip, sizeof (overlay_ioc_prop_t));
1410                 return (EEXIST);
1411         }
1412         mutex_exit(&odd->odd_lock);
1413 
1414         for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) {
1415                 const char *pname = odd->odd_plugin->ovp_props[i];
1416                 bzero(infop, sizeof (overlay_ioc_propinfo_t));
1417                 overlay_prop_init(phdl);
1418                 ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(pname, phdl);
1419                 if (ret != 0) {
1420                         mac_perim_exit(mph);
1421                         overlay_hold_rele(odd);
1422                         kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1423                         kmem_free(oip, sizeof (overlay_ioc_prop_t));
1424                         return (ret);
1425                 }
1426 
1427                 if ((infop->oipi_prot & OVERLAY_PROP_PERM_REQ) == 0)
1428                         continue;
1429                 bzero(oip, sizeof (overlay_ioc_prop_t));
1430                 oip->oip_size = sizeof (oip->oip_value);
1431                 ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid,
1432                     pname, oip->oip_value, &oip->oip_size);
1433                 if (ret != 0) {
1434                         mac_perim_exit(mph);
1435                         overlay_hold_rele(odd);
1436                         kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1437                         kmem_free(oip, sizeof (overlay_ioc_prop_t));
1438                         return (ret);
1439                 }
1440                 if (oip->oip_size == 0) {
1441                         mac_perim_exit(mph);
1442                         overlay_hold_rele(odd);
1443                         kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1444                         kmem_free(oip, sizeof (overlay_ioc_prop_t));
1445                         return (EINVAL);
1446                 }
1447         }
1448 
1449         mutex_enter(&odd->odd_lock);
1450         if ((odd->odd_flags & OVERLAY_F_VARPD) == 0) {
1451                 mutex_exit(&odd->odd_lock);
1452                 mac_perim_exit(mph);
1453                 overlay_hold_rele(odd);
1454                 kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1455                 kmem_free(oip, sizeof (overlay_ioc_prop_t));
1456                 return (ENXIO);
1457         }
1458 
1459         ASSERT((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0);
1460         odd->odd_flags |= OVERLAY_F_ACTIVATED;
1461 
1462         /*
1463          * Now that we've activated ourselves, we should indicate to the world
1464          * that we're up. Note that we may not be able to perform lookups at
1465          * this time, but our notion of being 'up' isn't dependent on that
1466          * ability.
1467          */
1468         mac_link_update(odd->odd_mh, LINK_STATE_UP);
1469         mutex_exit(&odd->odd_lock);
1470 
1471         mac_perim_exit(mph);
1472         overlay_hold_rele(odd);
1473         kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1474         kmem_free(oip, sizeof (overlay_ioc_prop_t));
1475 
1476         return (0);
1477 }
1478 
1479 /* ARGSUSED */
1480 static int
1481 overlay_i_delete(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
1482 {
1483         overlay_ioc_delete_t *oidp = karg;
1484         overlay_dev_t *odd;
1485         datalink_id_t tid;
1486         int ret;
1487 
1488         odd = overlay_hold_by_dlid(oidp->oid_linkid);
1489         if (odd == NULL) {
1490                 return (ENOENT);
1491         }
1492 
1493         mutex_enter(&odd->odd_lock);
1494         /* If we're not the only hold, we're busy */
1495         if (odd->odd_ref != 1) {
1496                 mutex_exit(&odd->odd_lock);
1497                 overlay_hold_rele(odd);
1498                 return (EBUSY);
1499         }
1500 
1501         if (odd->odd_flags & OVERLAY_F_IN_MUX) {
1502                 mutex_exit(&odd->odd_lock);
1503                 overlay_hold_rele(odd);
1504                 return (EBUSY);
1505         }
1506 
1507         /*
1508          * To remove this, we need to first remove it from dls and then remove
1509          * it from mac. The act of removing it from mac will check if there are
1510          * devices on top of this, eg. vnics. If there are, then that will fail
1511          * and we'll have to go through and recreate the dls entry. Only after
1512          * mac_unregister has succeeded, then we'll go through and actually free
1513          * everything and drop the dev lock.
1514          */
1515         ret = dls_devnet_destroy(odd->odd_mh, &tid, B_TRUE);
1516         if (ret != 0) {
1517                 overlay_hold_rele(odd);
1518                 return (ret);
1519         }
1520 
1521         ASSERT(oidp->oid_linkid == tid);
1522         ret = mac_disable(odd->odd_mh);
1523         if (ret != 0) {
1524                 (void) dls_devnet_create(odd->odd_mh, odd->odd_linkid,
1525                     crgetzoneid(cred));
1526                 overlay_hold_rele(odd);
1527                 return (ret);
1528         }
1529 
1530         overlay_target_quiesce(odd->odd_target);
1531 
1532         mutex_enter(&overlay_dev_lock);
1533         list_remove(&overlay_dev_list, odd);
1534         mutex_exit(&overlay_dev_lock);
1535 
1536         cv_destroy(&odd->odd_iowait);
1537         mutex_destroy(&odd->odd_lock);
1538         overlay_target_free(odd);
1539         odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1540         overlay_plugin_rele(odd->odd_plugin);
1541         kmem_free(odd, sizeof (overlay_dev_t));
1542 
1543         return (0);
1544 }
1545 
1546 /* ARGSUSED */
1547 static int
1548 overlay_i_nprops(void *karg, intptr_t arg, int mode, cred_t *cred,
1549     int *rvalp)
1550 {
1551         overlay_dev_t *odd;
1552         overlay_ioc_nprops_t *on = karg;
1553 
1554         odd = overlay_hold_by_dlid(on->oipn_linkid);
1555         if (odd == NULL)
1556                 return (ENOENT);
1557         on->oipn_nprops = odd->odd_plugin->ovp_nprops + OVERLAY_DEV_NPROPS;
1558         overlay_hold_rele(odd);
1559 
1560         return (0);
1561 }
1562 
1563 static int
1564 overlay_propinfo_plugin_cb(overlay_plugin_t *opp, void *arg)
1565 {
1566         overlay_prop_handle_t phdl = arg;
1567         overlay_prop_set_range_str(phdl, opp->ovp_name);
1568         return (0);
1569 }
1570 
1571 static int
1572 overlay_i_name_to_propid(overlay_dev_t *odd, const char *name, uint_t *id)
1573 {
1574         int i;
1575 
1576         for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
1577                 if (strcmp(overlay_dev_props[i], name) == 0) {
1578                         *id = i;
1579                         return (0);
1580                 }
1581         }
1582 
1583         for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) {
1584                 if (strcmp(odd->odd_plugin->ovp_props[i], name) == 0) {
1585                         *id = i + OVERLAY_DEV_NPROPS;
1586                         return (0);
1587                 }
1588         }
1589 
1590         return (ENOENT);
1591 }
1592 
1593 static void
1594 overlay_i_propinfo_mtu(overlay_dev_t *odd, overlay_prop_handle_t phdl)
1595 {
1596         uint32_t def;
1597         mac_propval_range_t range;
1598         uint_t perm;
1599 
1600         ASSERT(MAC_PERIM_HELD(odd->odd_mh));
1601 
1602         bzero(&range, sizeof (mac_propval_range_t));
1603         range.mpr_count = 1;
1604         if (mac_prop_info(odd->odd_mh, MAC_PROP_MTU, "mtu", &def,
1605             sizeof (def), &range, &perm) != 0)
1606                 return;
1607 
1608         if (perm == MAC_PROP_PERM_READ)
1609                 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
1610         else if (perm == MAC_PROP_PERM_WRITE)
1611                 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_WRITE);
1612         else if (perm == MAC_PROP_PERM_RW)
1613                 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
1614 
1615         overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
1616         overlay_prop_set_default(phdl, &def, sizeof (def));
1617         overlay_prop_set_range_uint32(phdl, range.mpr_range_uint32[0].mpur_min,
1618             range.mpr_range_uint32[0].mpur_max);
1619 }
1620 
1621 /* ARGSUSED */
1622 static int
1623 overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred,
1624     int *rvalp)
1625 {
1626         overlay_dev_t *odd;
1627         int ret;
1628         mac_perim_handle_t mph;
1629         uint_t propid = UINT_MAX;
1630         overlay_ioc_propinfo_t *oip = karg;
1631         overlay_prop_handle_t phdl = (overlay_prop_handle_t)oip;
1632 
1633         odd = overlay_hold_by_dlid(oip->oipi_linkid);
1634         if (odd == NULL)
1635                 return (ENOENT);
1636 
1637         overlay_prop_init(phdl);
1638         mac_perim_enter_by_mh(odd->odd_mh, &mph);
1639 
1640         /*
1641          * If the id is -1, then the property that we're looking for is named in
1642          * oipi_name and we should fill in its id. Otherwise, we've been given
1643          * an id and we need to turn that into a name for our plugin's sake. The
1644          * id is our own fabrication for property discovery.
1645          */
1646         if (oip->oipi_id == -1) {
1647                 /*
1648                  * Determine if it's a known generic property or it belongs to a
1649                  * module by checking against the list of known names.
1650                  */
1651                 oip->oipi_name[OVERLAY_PROP_NAMELEN-1] = '\0';
1652                 if ((ret = overlay_i_name_to_propid(odd, oip->oipi_name,
1653                     &propid)) != 0) {
1654                         overlay_hold_rele(odd);
1655                         mac_perim_exit(mph);
1656                         return (ret);
1657                 }
1658                 oip->oipi_id = propid;
1659                 if (propid >= OVERLAY_DEV_NPROPS) {
1660                         ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(
1661                             oip->oipi_name, phdl);
1662                         overlay_hold_rele(odd);
1663                         mac_perim_exit(mph);
1664                         return (ret);
1665 
1666                 }
1667         } else if (oip->oipi_id >= OVERLAY_DEV_NPROPS) {
1668                 uint_t id = oip->oipi_id - OVERLAY_DEV_NPROPS;
1669 
1670                 if (id >= odd->odd_plugin->ovp_nprops) {
1671                         overlay_hold_rele(odd);
1672                         mac_perim_exit(mph);
1673                         return (EINVAL);
1674                 }
1675                 ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(
1676                     odd->odd_plugin->ovp_props[id], phdl);
1677                 overlay_hold_rele(odd);
1678                 mac_perim_exit(mph);
1679                 return (ret);
1680         } else if (oip->oipi_id < -1) {
1681                 overlay_hold_rele(odd);
1682                 mac_perim_exit(mph);
1683                 return (EINVAL);
1684         } else {
1685                 ASSERT(oip->oipi_id < OVERLAY_DEV_NPROPS);
1686                 ASSERT(oip->oipi_id >= 0);
1687                 propid = oip->oipi_id;
1688                 (void) strlcpy(oip->oipi_name, overlay_dev_props[propid],
1689                     sizeof (oip->oipi_name));
1690         }
1691 
1692         switch (propid) {
1693         case OVERLAY_DEV_P_MTU:
1694                 overlay_i_propinfo_mtu(odd, phdl);
1695                 break;
1696         case OVERLAY_DEV_P_VNETID:
1697                 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
1698                 overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
1699                 overlay_prop_set_nodefault(phdl);
1700                 break;
1701         case OVERLAY_DEV_P_ENCAP:
1702                 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
1703                 overlay_prop_set_type(phdl, OVERLAY_PROP_T_STRING);
1704                 overlay_prop_set_nodefault(phdl);
1705                 overlay_plugin_walk(overlay_propinfo_plugin_cb, phdl);
1706                 break;
1707         case OVERLAY_DEV_P_VARPDID:
1708                 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
1709                 overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
1710                 overlay_prop_set_nodefault(phdl);
1711                 break;
1712         case OVERLAY_DEV_P_DCID:
1713                 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
1714                 overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
1715                 overlay_prop_set_nodefault(phdl);
1716                 overlay_prop_set_range_uint32(phdl, 0, UINT32_MAX);
1717                 break;
1718         default:
1719                 overlay_hold_rele(odd);
1720                 mac_perim_exit(mph);
1721                 return (ENOENT);
1722         }
1723 
1724         overlay_hold_rele(odd);
1725         mac_perim_exit(mph);
1726         return (0);
1727 }
1728 
1729 /* ARGSUSED */
1730 static int
1731 overlay_i_getprop(void *karg, intptr_t arg, int mode, cred_t *cred,
1732     int *rvalp)
1733 {
1734         int ret;
1735         overlay_dev_t *odd;
1736         mac_perim_handle_t mph;
1737         overlay_ioc_prop_t *oip = karg;
1738         uint_t propid, mtu;
1739 
1740         odd = overlay_hold_by_dlid(oip->oip_linkid);
1741         if (odd == NULL)
1742                 return (ENOENT);
1743 
1744         mac_perim_enter_by_mh(odd->odd_mh, &mph);
1745         oip->oip_size = OVERLAY_PROP_SIZEMAX;
1746         oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0';
1747         if (oip->oip_id == -1) {
1748                 int i;
1749 
1750                 for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
1751                         if (strcmp(overlay_dev_props[i], oip->oip_name) == 0)
1752                                 break;
1753                         if (i == OVERLAY_DEV_NPROPS) {
1754                                 ret = odd->odd_plugin->ovp_ops->ovpo_getprop(
1755                                     odd->odd_pvoid, oip->oip_name,
1756                                     oip->oip_value, &oip->oip_size);
1757                                 overlay_hold_rele(odd);
1758                                 mac_perim_exit(mph);
1759                                 return (ret);
1760                         }
1761                 }
1762 
1763                 propid = i;
1764         } else if (oip->oip_id >= OVERLAY_DEV_NPROPS) {
1765                 uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS;
1766 
1767                 if (id > odd->odd_plugin->ovp_nprops) {
1768                         overlay_hold_rele(odd);
1769                         mac_perim_exit(mph);
1770                         return (EINVAL);
1771                 }
1772                 ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid,
1773                     odd->odd_plugin->ovp_props[id], oip->oip_value,
1774                     &oip->oip_size);
1775                 overlay_hold_rele(odd);
1776                 mac_perim_exit(mph);
1777                 return (ret);
1778         } else if (oip->oip_id < -1) {
1779                 overlay_hold_rele(odd);
1780                 mac_perim_exit(mph);
1781                 return (EINVAL);
1782         } else {
1783                 ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS);
1784                 ASSERT(oip->oip_id >= 0);
1785                 propid = oip->oip_id;
1786         }
1787 
1788         ret = 0;
1789         switch (propid) {
1790         case OVERLAY_DEV_P_MTU:
1791                 /*
1792                  * The MTU is always set and retrieved through MAC, to allow for
1793                  * MAC to do whatever it wants, as really that property belongs
1794                  * to MAC. This is important for things where vnics have hold on
1795                  * the MTU.
1796                  */
1797                 mac_sdu_get(odd->odd_mh, NULL, &mtu);
1798                 bcopy(&mtu, oip->oip_value, sizeof (uint_t));
1799                 oip->oip_size = sizeof (uint_t);
1800                 break;
1801         case OVERLAY_DEV_P_VNETID:
1802                 /*
1803                  * While it's read-only while inside of a mux, we're not in a
1804                  * context that can guarantee that. Therefore we always grab the
1805                  * overlay_dev_t's odd_lock.
1806                  */
1807                 mutex_enter(&odd->odd_lock);
1808                 bcopy(&odd->odd_vid, oip->oip_value, sizeof (uint64_t));
1809                 mutex_exit(&odd->odd_lock);
1810                 oip->oip_size = sizeof (uint64_t);
1811                 break;
1812         case OVERLAY_DEV_P_ENCAP:
1813                 oip->oip_size = strlcpy((char *)oip->oip_value,
1814                     odd->odd_plugin->ovp_name, oip->oip_size);
1815                 break;
1816         case OVERLAY_DEV_P_VARPDID:
1817                 mutex_enter(&odd->odd_lock);
1818                 if (odd->odd_flags & OVERLAY_F_VARPD) {
1819                         const uint64_t val = odd->odd_target->ott_id;
1820                         bcopy(&val, oip->oip_value, sizeof (uint64_t));
1821                         oip->oip_size = sizeof (uint64_t);
1822                 } else {
1823                         oip->oip_size = 0;
1824                 }
1825                 mutex_exit(&odd->odd_lock);
1826                 break;
1827         case OVERLAY_DEV_P_DCID:
1828                 /*
1829                  * While it's read-only while inside of a mux, we're not in a
1830                  * context that can guarantee that. Therefore we always grab the
1831                  * overlay_dev_t's odd_lock.
1832                  */
1833                 mutex_enter(&odd->odd_lock);
1834                 bcopy(&odd->odd_dcid, oip->oip_value, sizeof (uint32_t));
1835                 mutex_exit(&odd->odd_lock);
1836                 oip->oip_size = sizeof (uint32_t);
1837                 break;
1838 
1839         default:
1840                 ret = ENOENT;
1841         }
1842 
1843         overlay_hold_rele(odd);
1844         mac_perim_exit(mph);
1845         return (ret);
1846 }
1847 
1848 static void
1849 overlay_setprop_vnetid(overlay_dev_t *odd, uint64_t vnetid)
1850 {
1851         mutex_enter(&odd->odd_lock);
1852 
1853         /* Simple case, not active */
1854         if (!(odd->odd_flags & OVERLAY_F_IN_MUX)) {
1855                 odd->odd_vid = vnetid;
1856                 mutex_exit(&odd->odd_lock);
1857                 return;
1858         }
1859 
1860         /*
1861          * In the hard case, we need to set the drop flag, quiesce I/O and then
1862          * we can go ahead and do everything.
1863          */
1864         odd->odd_flags |= OVERLAY_F_MDDROP;
1865         overlay_io_wait(odd, OVERLAY_F_IOMASK);
1866         mutex_exit(&odd->odd_lock);
1867 
1868         overlay_mux_remove_dev(odd->odd_mux, odd);
1869         mutex_enter(&odd->odd_lock);
1870         odd->odd_vid = vnetid;
1871         mutex_exit(&odd->odd_lock);
1872         overlay_mux_add_dev(odd->odd_mux, odd);
1873 
1874         mutex_enter(&odd->odd_lock);
1875         ASSERT(odd->odd_flags & OVERLAY_F_IN_MUX);
1876         odd->odd_flags &= ~OVERLAY_F_IN_MUX;
1877         mutex_exit(&odd->odd_lock);
1878 }
1879 
1880 static void
1881 overlay_setprop_dcid(overlay_dev_t *odd, uint32_t dcid)
1882 {
1883         mutex_enter(&odd->odd_lock);
1884 
1885         /* Simple case, not active */
1886         if (!(odd->odd_flags & OVERLAY_F_IN_MUX)) {
1887                 odd->odd_dcid = dcid;
1888                 mutex_exit(&odd->odd_lock);
1889                 return;
1890         }
1891 
1892         /*
1893          * In the hard case, we need to set the drop flag, quiesce I/O and then
1894          * we can go ahead and do everything.
1895          */
1896         odd->odd_flags |= OVERLAY_F_MDDROP;
1897         overlay_io_wait(odd, OVERLAY_F_IOMASK);
1898         mutex_exit(&odd->odd_lock);
1899 
1900         overlay_mux_remove_dev(odd->odd_mux, odd);
1901         mutex_enter(&odd->odd_lock);
1902         odd->odd_dcid = dcid;
1903         mutex_exit(&odd->odd_lock);
1904         overlay_mux_add_dev(odd->odd_mux, odd);
1905 
1906         mutex_enter(&odd->odd_lock);
1907         ASSERT(odd->odd_flags & OVERLAY_F_IN_MUX);
1908         odd->odd_flags &= ~OVERLAY_F_IN_MUX;
1909         mutex_exit(&odd->odd_lock);
1910 }
1911 
1912 /* ARGSUSED */
1913 static int
1914 overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred,
1915     int *rvalp)
1916 {
1917         int ret;
1918         overlay_dev_t *odd;
1919         overlay_ioc_prop_t *oip = karg;
1920         uint_t propid = UINT_MAX;
1921         mac_perim_handle_t mph;
1922         uint64_t maxid, *vidp, *dcidp;
1923 
1924         if (oip->oip_size > OVERLAY_PROP_SIZEMAX)
1925                 return (EINVAL);
1926 
1927         odd = overlay_hold_by_dlid(oip->oip_linkid);
1928         if (odd == NULL)
1929                 return (ENOENT);
1930 
1931         oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0';
1932         mac_perim_enter_by_mh(odd->odd_mh, &mph);
1933         mutex_enter(&odd->odd_lock);
1934         if (odd->odd_flags & OVERLAY_F_ACTIVATED) {
1935                 mac_perim_exit(mph);
1936                 mutex_exit(&odd->odd_lock);
1937                 return (ENOTSUP);
1938         }
1939         mutex_exit(&odd->odd_lock);
1940         if (oip->oip_id == -1) {
1941                 int i;
1942 
1943                 for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
1944                         if (strcmp(overlay_dev_props[i], oip->oip_name) == 0)
1945                                 break;
1946                         if (i == OVERLAY_DEV_NPROPS) {
1947                                 ret = odd->odd_plugin->ovp_ops->ovpo_setprop(
1948                                     odd->odd_pvoid, oip->oip_name,
1949                                     oip->oip_value, oip->oip_size);
1950                                 overlay_hold_rele(odd);
1951                                 mac_perim_exit(mph);
1952                                 return (ret);
1953                         }
1954                 }
1955 
1956                 propid = i;
1957         } else if (oip->oip_id >= OVERLAY_DEV_NPROPS) {
1958                 uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS;
1959 
1960                 if (id > odd->odd_plugin->ovp_nprops) {
1961                         mac_perim_exit(mph);
1962                         overlay_hold_rele(odd);
1963                         return (EINVAL);
1964                 }
1965                 ret = odd->odd_plugin->ovp_ops->ovpo_setprop(odd->odd_pvoid,
1966                     odd->odd_plugin->ovp_props[id], oip->oip_value,
1967                     oip->oip_size);
1968                 mac_perim_exit(mph);
1969                 overlay_hold_rele(odd);
1970                 return (ret);
1971         } else if (oip->oip_id < -1) {
1972                 mac_perim_exit(mph);
1973                 overlay_hold_rele(odd);
1974                 return (EINVAL);
1975         } else {
1976                 ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS);
1977                 ASSERT(oip->oip_id >= 0);
1978                 propid = oip->oip_id;
1979         }
1980 
1981         ret = 0;
1982         switch (propid) {
1983         case OVERLAY_DEV_P_MTU:
1984                 ret = mac_set_prop(odd->odd_mh, MAC_PROP_MTU, "mtu",
1985                     oip->oip_value, oip->oip_size);
1986                 break;
1987         case OVERLAY_DEV_P_VNETID:
1988                 if (oip->oip_size != sizeof (uint64_t)) {
1989                         ret = EINVAL;
1990                         break;
1991                 }
1992                 vidp = (uint64_t *)oip->oip_value;
1993                 ASSERT(odd->odd_plugin->ovp_id_size <= 8);
1994                 maxid = UINT64_MAX;
1995                 if (odd->odd_plugin->ovp_id_size != 8)
1996                         maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) -
1997                             1ULL;
1998                 if (*vidp >= maxid) {
1999                         ret = EINVAL;
2000                         break;
2001                 }
2002                 overlay_setprop_vnetid(odd, *vidp);
2003                 break;
2004         case OVERLAY_DEV_P_ENCAP:
2005         case OVERLAY_DEV_P_VARPDID:
2006                 ret = EPERM;
2007                 break;
2008         case OVERLAY_DEV_P_DCID:
2009                 if (oip->oip_size != sizeof (uint64_t)) {
2010                         ret = EINVAL;
2011                         break;
2012                 }
2013                 dcidp = (uint64_t *)oip->oip_value;
2014                 if (*dcidp > UINT32_MAX) {
2015                         ret = EINVAL;
2016                         break;
2017                 }
2018                 overlay_setprop_dcid(odd, *dcidp);
2019                 break;
2020 
2021         default:
2022                 ret = ENOENT;
2023         }
2024 
2025         mac_perim_exit(mph);
2026         overlay_hold_rele(odd);
2027         return (ret);
2028 }
2029 
2030 /* ARGSUSED */
2031 static int
2032 overlay_i_status(void *karg, intptr_t arg, int mode, cred_t *cred,
2033     int *rvalp)
2034 {
2035         overlay_dev_t *odd;
2036         overlay_ioc_status_t *os = karg;
2037 
2038         odd = overlay_hold_by_dlid(os->ois_linkid);
2039         if (odd == NULL)
2040                 return (ENOENT);
2041 
2042         mutex_enter(&odd->odd_lock);
2043         if ((odd->odd_flags & OVERLAY_F_DEGRADED) != 0) {
2044                 os->ois_status = OVERLAY_I_DEGRADED;
2045                 if (odd->odd_fmamsg != NULL) {
2046                         (void) strlcpy(os->ois_message, odd->odd_fmamsg,
2047                             OVERLAY_STATUS_BUFLEN);
2048                 } else {
2049                         os->ois_message[0] = '\0';
2050                 }
2051 
2052         } else {
2053                 os->ois_status = OVERLAY_I_OK;
2054                 os->ois_message[0] = '\0';
2055         }
2056         mutex_exit(&odd->odd_lock);
2057         overlay_hold_rele(odd);
2058 
2059         return (0);
2060 }
2061 
2062 static dld_ioc_info_t overlay_ioc_list[] = {
2063         { OVERLAY_IOC_CREATE, DLDCOPYIN, sizeof (overlay_ioc_create_t),
2064                 overlay_i_create, secpolicy_dl_config },
2065         { OVERLAY_IOC_ACTIVATE, DLDCOPYIN, sizeof (overlay_ioc_activate_t),
2066                 overlay_i_activate, secpolicy_dl_config },
2067         { OVERLAY_IOC_DELETE, DLDCOPYIN, sizeof (overlay_ioc_delete_t),
2068                 overlay_i_delete, secpolicy_dl_config },
2069         { OVERLAY_IOC_PROPINFO, DLDCOPYIN | DLDCOPYOUT,
2070                 sizeof (overlay_ioc_propinfo_t), overlay_i_propinfo,
2071                 secpolicy_dl_config },
2072         { OVERLAY_IOC_GETPROP, DLDCOPYIN | DLDCOPYOUT,
2073                 sizeof (overlay_ioc_prop_t), overlay_i_getprop,
2074                 secpolicy_dl_config },
2075         { OVERLAY_IOC_SETPROP, DLDCOPYIN,
2076                 sizeof (overlay_ioc_prop_t), overlay_i_setprop,
2077                 secpolicy_dl_config },
2078         { OVERLAY_IOC_NPROPS, DLDCOPYIN | DLDCOPYOUT,
2079                 sizeof (overlay_ioc_nprops_t), overlay_i_nprops,
2080                 secpolicy_dl_config },
2081         { OVERLAY_IOC_STATUS, DLDCOPYIN | DLDCOPYOUT,
2082                 sizeof (overlay_ioc_status_t), overlay_i_status,
2083                 NULL }
2084 };
2085 
2086 static int
2087 overlay_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2088 {
2089         int fmcap = DDI_FM_EREPORT_CAPABLE;
2090         if (cmd != DDI_ATTACH)
2091                 return (DDI_FAILURE);
2092 
2093         if (overlay_dip != NULL || ddi_get_instance(dip) != 0)
2094                 return (DDI_FAILURE);
2095 
2096         ddi_fm_init(dip, &fmcap, NULL);
2097 
2098         if (ddi_create_minor_node(dip, OVERLAY_CTL, S_IFCHR,
2099             ddi_get_instance(dip), DDI_PSEUDO, 0) == DDI_FAILURE)
2100                 return (DDI_FAILURE);
2101 
2102         if (dld_ioc_register(OVERLAY_IOC, overlay_ioc_list,
2103             DLDIOCCNT(overlay_ioc_list)) != 0) {
2104                 ddi_remove_minor_node(dip, OVERLAY_CTL);
2105                 return (DDI_FAILURE);
2106         }
2107 
2108         overlay_dip = dip;
2109         return (DDI_SUCCESS);
2110 }
2111 
2112 /* ARGSUSED */
2113 static int
2114 overlay_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
2115 {
2116         int error;
2117 
2118         switch (cmd) {
2119         case DDI_INFO_DEVT2DEVINFO:
2120                 *resp = (void *)overlay_dip;
2121                 error = DDI_SUCCESS;
2122                 break;
2123         case DDI_INFO_DEVT2INSTANCE:
2124                 *resp = (void *)0;
2125                 error = DDI_SUCCESS;
2126                 break;
2127         default:
2128                 error = DDI_FAILURE;
2129                 break;
2130         }
2131 
2132         return (error);
2133 }
2134 
2135 static int
2136 overlay_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2137 {
2138         if (cmd != DDI_DETACH)
2139                 return (DDI_FAILURE);
2140 
2141         mutex_enter(&overlay_dev_lock);
2142         if (!list_is_empty(&overlay_dev_list) || overlay_target_busy()) {
2143                 mutex_exit(&overlay_dev_lock);
2144                 return (EBUSY);
2145         }
2146         mutex_exit(&overlay_dev_lock);
2147 
2148 
2149         dld_ioc_unregister(OVERLAY_IOC);
2150         ddi_remove_minor_node(dip, OVERLAY_CTL);
2151         ddi_fm_fini(dip);
2152         overlay_dip = NULL;
2153         return (DDI_SUCCESS);
2154 }
2155 
2156 #define OVERLAY_IOCTL_MASK      0xffffff00
2157 /* ARGSUSED */
2158 static int
2159 overlay_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2160     int *rvalp)
2161 {
2162         switch (cmd & OVERLAY_IOCTL_MASK) {
2163         case OVERLAY_TARG_IOCTL:
2164                 return (overlay_target_ioctl(dev, cmd, arg, mode, credp,
2165                     rvalp));
2166         case OVERLAY_ROUTER_IOCTL:
2167                 return (overlay_router_ioctl(dev, cmd, arg, mode, credp,
2168                     rvalp));
2169         default:
2170                 return (ENOTTY);
2171         }
2172 }
2173 
2174 static struct cb_ops overlay_cbops = {
2175         overlay_target_open,    /* cb_open */
2176         overlay_target_close,   /* cb_close */
2177         nodev,                  /* cb_strategy */
2178         nodev,                  /* cb_print */
2179         nodev,                  /* cb_dump */
2180         nodev,                  /* cb_read */
2181         nodev,                  /* cb_write */
2182         overlay_ioctl,          /* cb_ioctl */
2183         nodev,                  /* cb_devmap */
2184         nodev,                  /* cb_mmap */
2185         nodev,                  /* cb_segmap */
2186         nochpoll,               /* cb_chpoll */
2187         ddi_prop_op,            /* cb_prop_op */
2188         NULL,                   /* cb_stream */
2189         D_MP,                   /* cb_flag */
2190         CB_REV,                 /* cb_rev */
2191         nodev,                  /* cb_aread */
2192         nodev,                  /* cb_awrite */
2193 };
2194 
2195 static struct dev_ops overlay_dev_ops = {
2196         DEVO_REV,               /* devo_rev */
2197         0,                      /* devo_refcnt */
2198         overlay_getinfo,        /* devo_getinfo */
2199         nulldev,                /* devo_identify */
2200         nulldev,                /* devo_probe */
2201         overlay_attach,         /* devo_attach */
2202         overlay_detach,         /* devo_detach */
2203         nulldev,                /* devo_reset */
2204         &overlay_cbops,             /* devo_cb_ops */
2205         NULL,                   /* devo_bus_ops */
2206         NULL,                   /* devo_power */
2207         ddi_quiesce_not_supported       /* devo_quiesce */
2208 };
2209 
2210 static struct modldrv overlay_modldrv = {
2211         &mod_driverops,
2212         "Overlay Network Driver",
2213         &overlay_dev_ops
2214 };
2215 
2216 static struct modlinkage overlay_linkage = {
2217         MODREV_1,
2218         &overlay_modldrv
2219 };
2220 
2221 static int
2222 overlay_init(void)
2223 {
2224         mutex_init(&overlay_dev_lock, NULL, MUTEX_DRIVER, NULL);
2225         list_create(&overlay_dev_list, sizeof (overlay_dev_t),
2226             offsetof(overlay_dev_t, odd_link));
2227         overlay_mux_init();
2228         overlay_plugin_init();
2229         overlay_target_init();
2230 
2231         return (DDI_SUCCESS);
2232 }
2233 
2234 static void
2235 overlay_fini(void)
2236 {
2237         overlay_target_fini();
2238         overlay_plugin_fini();
2239         overlay_mux_fini();
2240         mutex_destroy(&overlay_dev_lock);
2241         list_destroy(&overlay_dev_list);
2242 }
2243 
2244 int
2245 _init(void)
2246 {
2247         int err;
2248 
2249         if ((err = overlay_init()) != DDI_SUCCESS)
2250                 return (err);
2251 
2252         mac_init_ops(NULL, "overlay");
2253         err = mod_install(&overlay_linkage);
2254         if (err != DDI_SUCCESS) {
2255                 overlay_fini();
2256                 return (err);
2257         }
2258 
2259         return (0);
2260 }
2261 
2262 int
2263 _info(struct modinfo *modinfop)
2264 {
2265         return (mod_info(&overlay_linkage, modinfop));
2266 }
2267 
2268 int
2269 _fini(void)
2270 {
2271         int err;
2272 
2273         err = mod_remove(&overlay_linkage);
2274         if (err != 0)
2275                 return (err);
2276 
2277         overlay_fini();
2278         return (0);
2279 }