Print this page
    
Overlay fabric router
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/io/overlay/overlay.c
          +++ new/usr/src/uts/common/io/overlay/overlay.c
   1    1  /*
   2    2   * This file and its contents are supplied under the terms of the
   3    3   * Common Development and Distribution License ("CDDL"), version 1.0.
   4    4   * You may only use this file in accordance with the terms of version
   5    5   * 1.0 of the CDDL.
   6    6   *
   7    7   * A full copy of the text of the CDDL should have accompanied this
   8    8   * source.  A copy of the CDDL is also available via the Internet at
   9    9   * http://www.illumos.org/license/CDDL.
  10   10   */
  11   11  
  12   12  /*
  13   13   * Copyright 2016 Joyent, Inc.
  14   14   */
  15   15  
  16   16  /*
  17   17   * Overlay Devices
  18   18   *
  19   19   * Overlay devices provide a means for creating overlay networks, a means of
  20   20   * multiplexing multiple logical, isolated, and discrete layer two and layer
  21   21   * three networks on top of one physical network.
  22   22   *
  23   23   * In general, these overlay devices encapsulate the logic to answer two
  24   24   * different questions:
  25   25   *
  26   26   *   1) How should I transform a packet to put it on the wire?
  27   27   *   2) Where should I send a transformed packet?
  28   28   *
  29   29   * Each overlay device is presented to the user as a GLDv3 device. While the
  30   30   * link itself cannot have an IP interface created on top of it, it allows for
  31   31   * additional GLDv3 devices, such as a VNIC, to be created on top of it which
  32   32   * can be plumbed up with IP interfaces.
  33   33   *
  34   34   *
  35   35   * --------------------
  36   36   * General Architecture
  37   37   * --------------------
  38   38   *
  39   39   * The logical overlay device that a user sees in dladm(1M) is a combination of
  40   40   * two different components that work together. The first component is this
  41   41   * kernel module, which is responsible for answering question one -- how should
  42   42   * I transform a packet to put it on the wire.
  43   43   *
  44   44   * The second component is what we call the virtual ARP daemon, or varpd. It is
  45   45   * a userland component that is responsible for answering the second question --
  46   46   * Where should I send a transformed packet. Instances of the kernel overlay
  47   47   * GLDv3 device ask varpd the question of where should a packet go.
  48   48   *
  49   49   * The split was done for a few reasons. Importantly, we wanted to keep the act
  50   50   * of generating encapsulated packets in the kernel so as to ensure that the
  51   51   * general data path was fast and also kept simple. On the flip side, while the
  52   52   * question of where should something go may be simple, it may often be
  53   53   * complicated and need to interface with several different external or
  54   54   * distributed systems. In those cases, it's simpler to allow for the full
  55   55   * flexibility of userland to be brought to bear to solve that problem and in
  56   56   * general, the path isn't very common.
  57   57   *
  58   58   * The following is what makes up the logical overlay device that a user would
  59   59   * create with dladm(1M).
  60   60   *
  61   61   *       Kernel                                     Userland
  62   62   *   . . . . . . . . . . . . . . . . . . . . .   . . . . . . . . . . . . .
  63   63   *   . +--------+   +--------+  +--------+   .   .                       .
  64   64   *   . | VNIC 0 |   | VNIC 1 |  | VNIC 2 |   .   .                       .
  65   65   *   . +--------+   +--------+  +--------+   .   .                       .
  66   66   *   .     |            |           |        .   .                       .
  67   67   *   .     |            |           |        .   .                       .
  68   68   *   .     +------------+-----------+        .   .                       .
  69   69   *   .                  |              . . /dev/overlay                  .
  70   70   *   .           +--------------+      .     .   .       +------------+  .
  71   71   *   .           |              |      .     .   .       |            |  .
  72   72   *   .           |    Overlay   |======*=================|   Virtual  |  .
  73   73   *   .           | GLDv3 Device |========================| ARP Daemon |  .
  74   74   *   .           |              |            .   .       |            |  .
  75   75   *   .           +--------------+            .   .       +------------+  .
  76   76   *   .                  |                    .   .              |        .
  77   77   *   .                  |                    .   .              |        .
  78   78   *   .           +----------------+          .   .         +--------+    .
  79   79   *   .           |  Overlay       |          .   .         | varpd  |    .
  80   80   *   .           |  Encapsulation |          .   .         | Lookup |    .
  81   81   *   .           |  Plugin        |          .   .         | Plugin |    .
  82   82   *   .           +----------------+          .   .         +--------+    .
  83   83   *   . . . . . . . . . . . . . . . . . . . . .   . . . . . . . . . . . . .
  84   84   *
  85   85   *
  86   86   * This image shows the two different components and where they live.
  87   87   * Importantly, it also shows that both the kernel overlay device and the
  88   88   * userland varpd both support plugins. The plugins actually implement the
  89   89   * things that users care about and the APIs have been designed to try to
  90   90   * minimize the amount of things that a module writer needs to worry about.
  91   91   *
  92   92   * IDENTIFIERS
  93   93   *
  94   94   * Every overlay device is defined by a unique identifier which is the overlay
  95   95   * identifier. Its purpose is similar to that of a VLAN identifier, it's a
  96   96   * unique number that is used to differentiate between different entries on the
  97   97   * wire.
  98   98   *
  99   99   * ENCAPSULATION
 100  100   *
 101  101   * An overlay encapsulation plugin is a kernel miscellaneous module whose
 102  102   * purpose is to contain knowledge about how to transform packets to put them
 103  103   * onto the wire and to take them off. An example of an encapsulation plugin is
 104  104   * vxlan. It's also how support for things like nvgre or geneve would be brought
 105  105   * into the system.
 106  106   *
 107  107   * Each encapsulation plugin defines a series of operation vectors and
 108  108   * properties. For the full details on everything they should provide, please
 109  109   * read uts/common/sys/overlay_plugin.h. The encapsulation plugin is responsible
 110  110   * for telling the system what information is required to send a packet. For
 111  111   * example, vxlan is defined to send everything over a UDP packet and therefore
 112  112   * requires a port and an IP address, while nvgre on the other hand is its own
 113  113   * IP type and therefore just requires an IP address. In addition, it also
 114  114   * provides information about the kind of socket that should be created. This is
 115  115   * used by the kernel multiplexor, more of that in the Kernel Components
 116  116   * section.
 117  117   *
 118  118   * LOOKUPS
 119  119   *
 120  120   * The kernel communicates requests for lookups over the character device
 121  121   * /dev/overlay. varpd is responsible for listening for requests on that device
 122  122   * and answering them. The character device is specific to the target path and
 123  123   * varpd.
 124  124   *
 125  125   * Much as the kernel overlay module handles the bulk of the scaffolding but
 126  126   * leaves the important work to the encapsulation plugin, varpd provides a
 127  127   * similar role and leaves the full brunt of lookups to a userland dynamic
 128  128   * shared object which implements the logic of lookups.
 129  129   *
 130  130   * Each lookup plugin defines a series of operation vectors and properties. For
 131  131   * the full details on everything that they should provide, please read
 132  132   * lib/varpd/libvarpd/libvarpd_provider.h. Essentially, they are given a MAC
 133  133   * address and asked to give an address on the physical network that it should
 134  134   * be sent to. In addition, they handle questions related to how to handle
 135  135   * things like broadcast and multicast traffic, etc.
 136  136   *
 137  137   * ----------
 138  138   * Properties
 139  139   * ----------
 140  140   *
 141  141   * A device from a dladm perspective has a unique set of properties that are
 142  142   * combined from three different sources:
 143  143   *
 144  144   *   1) Generic properties that every overlay device has
 145  145   *   2) Properties that are specific to the encapsulation plugin
 146  146   *   3) Properties that are specific to the lookup plugin
 147  147   *
 148  148   * All of these are exposed in a single set of properties in dladm. Note that
 149  149   * these are not necessarily traditional link properties. However, if something
 150  150   * is both a traditional GLDv3 link property, say the MTU of a device, and a
 151  151   * specific property here, then the driver ensures that all existing GLDv3
 152  152   * specific means of manipulating it are used and wraps up its private property
 153  153   * interfaces to ensure that works.
 154  154   *
 155  155   * Properties in the second and third category are prefixed with the name of
 156  156   * their module. For example, the vxlan encapsulation module has a property
 157  157   * called the 'listen_ip'. This property would show up in dladm as
 158  158   * 'vxlan/listen_ip'. This allows different plugins to both use similar names
 159  159   * for similar properties and to also have independent name spaces so that
 160  160   * overlapping names do not conflict with anything else.
 161  161   *
 162  162   * While the kernel combines both sets one and two into a single coherent view,
 163  163   * it does not do anything with respect to the properties that are owned by the
 164  164   * lookup plugin -- those are owned wholly by varpd. Instead, libdladm is in
 165  165   * charge of bridging these two worlds into one magical experience for the user.
 166  166   * It carries the burden of knowing about both overlay specific and varpd
 167  167   * specific properties. Importantly, we want to maintain this distinction. We
 168  168   * don't want to treat the kernel as an arbitrary key/value store for varpd and
 169  169   * we want the kernel to own its own data and not have to ask userland for
 170  170   * information that it owns.
 171  171   *
 172  172   * Every property in the system has the following attributes:
 173  173   *
 174  174   *   o A name
 175  175   *   o A type
 176  176   *   o A size
 177  177   *   o Permissions
 178  178   *   o Default value
 179  179   *   o Valid value ranges
 180  180   *   o A value
 181  181   *
 182  182   * Everything except for the value is obtained by callers through the propinfo
 183  183   * callbacks and a property has a maximum size of OVERLAY_PROP_SIZEMAX,
 184  184   * currently 256 bytes.
 185  185   *
 186  186   * The following are the supported types of properties:
 187  187   *
 188  188   *      OVERLAY_PROP_T_INT
 189  189   *
 190  190   *              A signed integer, its length is 8 bytes, corresponding to a
 191  191   *              int64_t.
 192  192   *
 193  193   *      OVERLAY_PROP_T_UINT
 194  194   *
 195  195   *              An unsigned integer, its length is 8 bytes, corresponding to a
 196  196   *              uint64_t.
 197  197   *
 198  198   *      OVERLAY_PROP_T_IP
 199  199   *
 200  200   *              A struct in6_addr, it has a fixed size.
 201  201   *
 202  202   *      OVERLAY_PROP_T_STRING
 203  203   *
 204  204   *              A null-terminated character string encoded in either ASCII or
 205  205   *              UTF-8. Note that the size of the string includes the null
 206  206   *              terminator.
 207  207   *
 208  208   *      OVERLAY_PROP_T_ETHER
 209  209   *
 210  210   *              An ether_addr_t, which has a fixed size.
 211  211   *
 212  212   * The next thing that we apply to a property is its permission. The permissions
 213  213   * are put together by the bitwise or of the following flags and values.
 214  214   *
 215  215   *      OVERLAY_PROP_PERM_REQ
 216  216   *
 217  217   *              This indicates a required property. A property that is required
 218  218   *              must be set by a consumer before the device can be created. If a
 219  219   *              required property has a default property, this constraint is
 220  220   *              loosened because the default property defines the value.
 221  221   *
 222  222   *      OVERLAY_PROP_PERM_READ
 223  223   *
 224  224   *              This indicates that a property can be read. All properties will
 225  225   *              have this value set.
 226  226   *
 227  227   *      OVERLAY_PROP_PERM_WRITE
 228  228   *
 229  229   *              This indicates that a property can be written to and thus
 230  230   *              updated by userland. Properties that are only intended to
 231  231   *              display information, will not have OVERLAY_PROP_PERM_WRITE set.
 232  232   *
 233  233   * In addition, a few additional values are defined as a convenience to
 234  234   * consumers. The first, OVERLAY_PROP_PERM_RW, is a combination of
 235  235   * OVERLAY_PROP_PERM_READ and OVERLAY_PROP_PERM_WRITE. The second,
 236  236   * OVERLAY_PROP_PERM_RRW, is a combination of OVERLAY_PROP_PERM_REQ,
 237  237   * OVERLAY_PROP_PERM_READ, and OVERLAY_PROP_PERM_WRITE. The protection mode of a
 238  238   * property should generally be a constant across its lifetime.
 239  239   *
 240  240   * A property may optionally have a default value. If it does have a default
 241  241   * value, and that property is not set to be a different value, then the default
 242  242   * value is inherited automatically. It also means that if the default value is
 243  243   * acceptable, there is no need to set the value for a required property. For
 244  244   * example, the vxlan module has the vxlan/listen_port property which is
 245  245   * required, but has a default value of 4789 (the IANA assigned port). Because
 246  246   * of that default value, there is no need for it to be set.
 247  247   *
 248  248   * Finally, a property may declare a list of valid values. These valid values
 249  249   * are used for display purposes, they are not enforced by the broader system,
 250  250   * but merely allow a means for the information to be communicated to the user
 251  251   * through dladm(1M). Like a default value, this is optional.
 252  252   *
 253  253   * The general scaffolding does not do very much with respect to the getting and
 254  254   * setting of properties. That is really owned by the individual plugins
 255  255   * themselves.
 256  256   *
 257  257   * -----------------------------
 258  258   * Destinations and Plugin Types
 259  259   * -----------------------------
 260  260   *
 261  261   * Both encapsulation and lookup plugins define the kinds of destinations that
 262  262   * they know how to support. There are three different pieces of information
 263  263   * that can be used to address to a destination currently, all of which is
 264  264   * summarized in the type overlay_point_t. Any combination of these is
 265  265   * supported.
 266  266   *
 267  267   *      OVERLAY_PLUGIN_D_ETHERNET
 268  268   *
 269  269   *              An Ethernet MAC address is required.
 270  270   *
 271  271   *      OVERLAY_PLUGIN_D_IP
 272  272   *
 273  273   *              An IP address is required. All IP addresses used by the overlay
 274  274   *              system are transmitted as IPv6 addresses. IPv4 addresses can be
 275  275   *              represented by using IPv4-mapped IPv6 addresses.
 276  276   *
 277  277   *      OVERLAY_PLUGIN_D_PORT
 278  278   *
 279  279   *              A TCP/UDP port is required.
 280  280   *
 281  281   * A kernel encapsulation plugin declares which of these that it requires, it's
 282  282   * a static set. On the other hand, a userland lookup plugin can be built to
 283  283   * support all of these or any combination thereof. It gets passed the required
 284  284   * destination type, based on the kernel encapsulation method, and then it makes
 285  285   * the determination as to whether or not it supports it. For example, the
 286  286   * direct plugin can support either an IP or both an IP and a port, it simply
 287  287   * doesn't display the direct/dest_port property in the cases where a port is
 288  288   * not required to support this.
 289  289   *
 290  290   * The user lookup plugins have two different modes of operation which
 291  291   * determines how they interact with the broader system and how look ups are
 292  292   * performed. These types are:
 293  293   *
 294  294   *      OVERLAY_TARGET_POINT
 295  295   *
 296  296   *              A point to point plugin has a single static definition for where
 297  297   *              to send all traffic. Every packet in the system always gets sent
 298  298   *              to the exact same destination which is programmed into the
 299  299   *              kernel when the general device is activated.
 300  300   *
 301  301   *      OVERLAY_TARGET_DYNAMIC
 302  302   *
 303  303   *              A dynamic plugin does not have a single static definition.
 304  304   *              Instead, for each destination, the kernel makes an asynchronous
 305  305   *              request to varpd to determine where the packet should be routed,
 306  306   *              and if a specific destination is found, then that destination is
 307  307   *              cached in the overlay device's target cache.
 308  308   *
 309  309   * This distinction, while important for the general overlay device's operation,
 310  310   * is not important to the encapsulation plugins. They don't need to know about
 311  311   * any of these pieces. It's just a concern for varpd, the userland plugin, and
 312  312   * the general overlay scaffolding.
 313  313   *
 314  314   * When an overlay device is set to OVERLAY_TARGET_POINT, then it does not
 315  315   * maintain a target cache, and instead just keeps track of the destination and
 316  316   * always sends encapsulated packets to that address. When the target type is of
 317  317   * OVERLAY_TARGET_DYNAMIC, then the kernel maintains a cache of all such
 318  318   * destinations. These destinations are kept around in an instance of a
 319  319   * reference hash that is specific to the given overlay device. Entries in the
 320  320   * cache can be invalidated and replaced by varpd and its lookup plugins.
 321  321   *
 322  322   * ----------------------------------
 323  323   * Kernel Components and Architecture
 324  324   * ----------------------------------
 325  325   *
 326  326   * There are multiple pieces inside the kernel that work together, there is the
 327  327   * general overlay_dev_t structure, which is the logical GLDv3 device, but it
 328  328   * itself has references to things like an instance of an encapsulation plugin,
 329  329   * a pointer to a mux and a target cache. It can roughly be summarized in the
 330  330   * following image:
 331  331   *
 332  332   *     +------------------+
 333  333   *     | global           |
 334  334   *     | overlay list     |
 335  335   *     | overlay_dev_list |
 336  336   *     +------------------+
 337  337   *        |
 338  338   *        |  +-----------------------+            +---------------+
 339  339   *        +->| GLDv3 Device          |----------->| GLDv3 Device  | -> ...
 340  340   *           | overlay_dev_t         |            | overlay_dev_t |
 341  341   *           |                       |            +---------------+
 342  342   *           |                       |
 343  343   *           | mac_handle_t     -----+---> GLDv3 handle to MAC
 344  344   *           | datalink_id_t    -----+---> Datalink ID used by DLS
 345  345   *           | overlay_dev_flag_t ---+---> Device state
 346  346   *           | uint_t           -----+---> Current device MTU
 347  347   *           | uint_t           -----+---> In-progress RX operations
 348  348   *           | uint_t           -----+---> In-progress TX operations
 349  349   *           | char[]           -----+---> FMA degraded message
 350  350   *           | void *           -----+---> plugin private data
 351  351   *           | overlay_target_t * ---+---------------------+
 352  352   *           | overlay_plugin_t * ---+---------+           |
 353  353   *           +-----------------------+         |           |
 354  354   *                           ^                 |           |
 355  355   *   +--------------------+  |                 |           |
 356  356   *   | Kernel Socket      |  |                 |           |
 357  357   *   | Multiplexor        |  |                 |           |
 358  358   *   | overlay_mux_t      |  |                 |           |
 359  359   *   |                    |  |                 |           |
 360  360   *   | avl_tree_t        -+--+                 |           |
 361  361   *   | uint_t            -+--> socket family   |           |
 362  362   *   | uint_t            -+--> socket type     |           |
 363  363   *   | uint_t            -+--> socket protocol |           |
 364  364   *   | ksocket_t         -+--> I/O socket      |           |
 365  365   *   | struct sockaddr * -+--> ksocket address |           |
 366  366   *   | overlay_plugin_t --+--------+           |           |
 367  367   *   +--------------------+        |           |           |
 368  368   *                                 |           |           |
 369  369   *   +-------------------------+   |           |           |
 370  370   *   | Encap Plugin            |<--+-----------+           |
 371  371   *   | overlay_plugin_t        |                           |
 372  372   *   |                         |                           |
 373  373   *   | char *               ---+--> plugin name            |
 374  374   *   | overlay_plugin_ops_t * -+--> plugin downcalls       |
 375  375   *   | char ** (props)      ---+--> property list          |
 376  376   *   | uint_t               ---+--> id length              |
 377  377   *   | overlay_plugin_flags_t -+--> plugin flags           |
 378  378   *   | overlay_plugin_dest_t --+--> destination type       v
 379  379   *   +-------------------------+                    +-------------------------+
 380  380   *                                                  |   Target Cache          |
 381  381   *                                                  |   overlay_target_t      |
 382  382   *                                                  |                         |
 383  383   *                                    cache mode <--+- overlay_target_mode_t  |
 384  384   *                                     dest type <--+- overlay_plugin_dest_t  |
 385  385   *                                   cache flags <--+- overlay_target_flag_t  |
 386  386   *                                     varpd id  <--+- uint64_t               |
 387  387   *                       outstanding varpd reqs. <--+- uint_t                 |
 388  388   *                   OVERLAY_TARGET_POINT state  <--+- overlay_target_point_t |
 389  389   *               OVERLAY_TARGET_DYNAMIC state <-+---+- overlay_target_dyn_t   |
 390  390   *                                              |   +-------------------------+
 391  391   *                      +-----------------------+
 392  392   *                      |
 393  393   *                      v
 394  394   *   +-------------------------------+   +------------------------+
 395  395   *   | Target Entry                  |-->| Target Entry           |--> ...
 396  396   *   | overlay_target_entry_t        |   | overlay_target_entry_t |
 397  397   *   |                               |   +------------------------+
 398  398   *   |                               |
 399  399   *   | overlay_target_entry_flags_t -+--> Entry flags
 400  400   *   | uint8_t[ETHERADDRL]        ---+--> Target MAC address
 401  401   *   | overlay_target_point_t     ---+--> Target underlay address
 402  402   *   | mblk_t *                   ---+--> outstanding mblk head
 403  403   *   | mblk_t *                   ---+--> outstanding mblk tail
 404  404   *   | size_t                     ---+--> outstanding mblk size
 405  405   *   +-------------------------------+
 406  406   *
 407  407   * The primary entries that we care about are the overlay_dev_t, which
 408  408   * correspond to each overlay device that is created with dladm(1M). Globally,
 409  409   * these devices are maintained in a simple list_t which is protected with a
 410  410   * lock.  Hence, these include important information such as the mac_handle_t
 411  411   * and a datalink_id_t which is used to interact with the broader MAC and DLS
 412  412   * ecosystem. We also maintain additional information such as the current state,
 413  413   * outstanding operations, the mtu, and importantly, the plugin's private data.
 414  414   * This is the instance of an encapsulation plugin that gets created as part of
 415  415   * creating an overlay device. Another aspect of this is that the overlay_dev_t
 416  416   * also includes information with respect to FMA. For more information, see the
 417  417   * FMA section.
 418  418   *
 419  419   * Each overlay_dev_t has a pointer to a plugin, a mux, and a target. The plugin
 420  420   * is the encapsulation plugin. This allows the device to make downcalls into it
 421  421   * based on doing things like getting and setting properties. Otherwise, the
 422  422   * plugin itself is a fairly straightforward entity. They are maintained in an
 423  423   * (not pictured above) list. The plugins themselves mostly maintain things like
 424  424   * the static list of properties, what kind of destination they require, and the
 425  425   * operations vector. A given module may contain more if necessary.
 426  426   *
 427  427   * The next piece of the puzzle is the mux, or a multiplexor. The mux itself
 428  428   * maintains a ksocket and it is through the mux that we send and receive
 429  429   * message blocks. The mux represents a socket type and address, as well as a
 430  430   * plugin. Multiple overlay_dev_t devices may then share the same mux. For
 431  431   * example, consider the case where you have different instances of vxlan all on
 432  432   * the same underlay network. These would all logically share the same IP
 433  433   * address and port that packets are sent and received on; however, what differs
 434  434   * is the decapsulation ID.
 435  435   *
 436  436   * Each mux maintains a ksocket_t which is similar to a socket(3SOCKET). Unlike
 437  437   * a socket, we enable a direct callback on the ksocket. This means that
 438  438   * whenever a message block chain is received, rather than sitting there and
 439  439   * getting a callback in a context and kicking that back out to a taskq, the
 440  440   * data instead comes into the callback function overlay_mux_recv().
 441  441   *
 442  442   * The mux is given encapsulated packets (via overlay_m_tx, the GLDv3 tx
 443  443   * function) to transmit. It receives encapsulated packets, decapsulates them to
 444  444   * determine the overlay identifier, looks up the given device that matches that
 445  445   * identifier, and then causes the broader MAC world to receive the packet with
 446  446   * a call to mac_rx().
 447  447   *
 448  448   * Today, we don't do too much that's special with the ksocket; however, as
 449  449   * hardware is gaining understanding for these encapsulation protocols, we'll
 450  450   * probably want to think of better ways to get those capabilities passed down
 451  451   * and potentially better ways to program receive filters so they get directly
 452  452   * to us. Though, that's all fantasy future land.
 453  453   *
 454  454   * The next part of the puzzle is the target cache. The purpose of the target
 455  455   * cache is to cache where we should send a packet on the underlay network,
 456  456   * given its mac address. The target cache operates in two modes depending on
 457  457   * whether the lookup module was declared to be OVERLAY_TARGET_POINT or
 458  458   * OVERLAY_TARGET_DYNAMIC.
 459  459   *
 460  460   * In the case where the target cache has been programmed to be
 461  461   * OVERLAY_TARGET_POINT, then we only maintain a single overlay_target_point_t
 462  462   * which has the destination that we send everything, no matter the destination
 463  463   * mac address.
 464  464   *
 465  465   * On the other hand, when we have an instance of OVERLAY_TARGET_DYNAMIC, things
 466  466   * are much more interesting and as a result, more complicated. We primarily
 467  467   * store lists of overlay_target_entry_t's which are stored in both an avl tree
 468  468   * and a refhash_t. The primary look up path uses the refhash_t and the avl tree
 469  469   * is only used for a few of the target ioctls used to dump data such that we
 470  470   * can get a consistent iteration order for things like dladm show-overlay -t.
 471  471   * The key that we use for the reference hashtable is based on the mac address
 472  472   * in the cache and currently we just do a simple CRC32 to transform it into a
 473  473   * hash.
 474  474   *
 475  475   * Each entry maintains a set of flags to indicate the current status of the
 476  476   * request. The flags may indicate one of three states: that current cache entry
 477  477   * is valid, that the current cache entry has been directed to drop all output,
 478  478   * and that the current cache entry is invalid and may be being looked up. In
 479  479   * the case where it's valid, we just take the destination address and run with
 480  480   * it.
 481  481   *
 482  482   * If it's invalid and a lookup has not been made, then we start the process
 483  483   * that prepares a query that will make its way up to varpd. The cache entry
 484  484   * maintains a message block chain of outstanding message blocks and a
 485  485   * size. These lists are populated only when we don't know the answer as to
 486  486   * where should these be sent. The size entry is used to cap the amount of
 487  487   * outstanding data that we don't know the answer to. If we exceed a cap on the
 488  488   * amount of outstanding data (currently 1 Mb), then we'll drop any additional
 489  489   * packets. Once we get an answer indicating a valid destination, we transmit
 490  490   * any outstanding data to that place. The full story on how we look that up
 491  491   * is discussed in the section on the Target Cache Life Cycle.
 492  492   *
 493  493   * ------------------------
 494  494   * FMA and Degraded Devices
 495  495   * ------------------------
 496  496   *
 497  497   * Every kernel overlay device keeps track of its FMA state. Today in FMA we
 498  498   * cannot represent partitions between resources nor can we represent that a
 499  499   * given minor node of a pseudo device has failed -- if we degrade the overlay
 500  500   * device, then the entire dev_info_t is degraded. However, we still want to be
 501  501   * able to indicate to administrators that things may go wrong.
 502  502   *
 503  503   * To this end, we've added a notion of a degraded state to every overlay
 504  504   * device. This state is primarily dictated by userland and it can happen for
 505  505   * various reasons. Generally, because a userland lookup plugin has been
 506  506   * partitioned, or something has gone wrong such that there is no longer any
 507  507   * userland lookup module for a device, then we'll mark it degraded.
 508  508   *
 509  509   * As long as any of our minor instances is degraded, then we'll fire off the
 510  510   * FMA event to note that. Once the last degraded instance is no longer
 511  511   * degraded, then we'll end up telling FMA that we're all clean.
 512  512   *
 513  513   * To help administrators get a better sense of which of the various minor
 514  514   * devices is wrong, we store the odd_fmamsg[] character array. This character
 515  515   * array can be fetched by running dladm show-overlay -f.
 516  516   *
 517  517   * Note that it's important that we do not update the link status of the
 518  518   * devices. We want to remain up as much as possible. By changing the link in a
 519  519   * degraded state, this may end up making things worse. We may still actually
 520  520   * have information in the target cache and if we mark the link down, that'll
 521  521   * result in not being able to use it. The reason being that this'll mark all
 522  522   * the downstream VNICs down which will go to IP and from there we end up
 523  523   * dealing with sadness.
 524  524   *
 525  525   * -----------------------
 526  526   * Target Cache Life Cycle
 527  527   * -----------------------
 528  528   *
 529  529   * This section only applies when we have a lookup plugin of
 530  530   * OVERLAY_TARGET_DYNAMIC. None of this applies to those of type
 531  531   * OVERLAY_TARGET_POINT.
 532  532   *
 533  533   * While we got into the target cache in the general architecture section, it's
 534  534   * worth going into more details as to how this actually works and showing some
 535  535   * examples and state machines. Recall that a target cache entry basically has
 536  536   * the following state transition diagram:
 537  537   *
 538  538   * Initial state
 539  539   *    . . .           . . . first access       . . . varpd lookup enqueued
 540  540   *        .           .                        .
 541  541   *        .           .                        .
 542  542   *     +-------+      .     +----------+       .
 543  543   *     |  No   |------*---->| Invalid  |-------*----+
 544  544   *     | Entry |            |  Entry   |            |
 545  545   *     +-------+            +----------+            |
 546  546   *                 varpd      ^      ^   varpd      |
 547  547   *                 invalidate |      |   drop       |
 548  548   *                      . . . *      * . .          v
 549  549   *          +-------+         |      |         +---------+
 550  550   *          | Entry |--->-----+      +----<----| Entry   |
 551  551   *          | Valid |<----------*---------<----| Pending |->-+     varpd
 552  552   *          +-------+           .              +---------+   * . . drop, but
 553  553   *                              . varpd                ^     |     other queued
 554  554   *                              . success              |     |     entries
 555  555   *                                                     +-----+
 556  556   *
 557  557   * When the table is first created, it is empty. As we attempt to lookup entries
 558  558   * and we find there is no entry at all, we'll create a new table entry for it.
 559  559   * At that point the entry is technically in an invalid state, that means that
 560  560   * we have no valid data from varpd. In that case, we'll go ahead and queue the
 561  561   * packet into the entry's pending chain, and queue a varpd lookup, setting the
 562  562   * OVERLAY_ENTRY_F_PENDING flag in the process.
 563  563   *
 564  564   * If additional mblk_t's come in for this entry, we end up appending them to
 565  565   * the tail of the chain, if and only if, we don't exceed the threshold for the
 566  566   * amount of space they can take up. An entry remains pending until we get a
 567  567   * varpd reply. If varpd replies with a valid result, we move to the valid
 568  568   * entry state, and remove the OVERLAY_ENTRY_F_PENDING flag and set it with one
 569  569   * of OVERLAY_ENTRY_F_VALID or OVERLAY_ENTRY_F_DROP as appropriate.
 570  570   *
 571  571   * Once an entry is valid, it stays valid until user land tells us to invalidate
 572  572   * it with an ioctl or replace it, OVERLAY_TARG_CACHE_REMOVE and
 573  573   * OVERLAY_TARG_CACHE_SET respectively.
 574  574   *
 575  575   * If the lookup fails with a call to drop the packet, then the next state is
 576  576   * determined by the state of the queue. If the set of outstanding entries is
 577  577   * empty, then we just transition back to the invalid state. If instead, the
 578  578   * set of outstanding entries is not empty, then we'll queue another entry and
 579  579   * stay in the same state, repeating this until the number of requests is
 580  580   * drained.
 581  581   *
 582  582   * The following images describe the flow of a given lookup and where the
 583  583   * overlay_target_entry_t is at any given time.
 584  584   *
 585  585   *     +-------------------+
 586  586   *     | Invalid Entry     |            An entry starts off as an invalid entry
 587  587   *     | de:ad:be:ef:00:00 |            and only exists in the target cache.
 588  588   *     +-------------------+
 589  589   *
 590  590   *      ~~~~
 591  591   *
 592  592   *     +---------------------+
 593  593   *     | Global list_t       |          A mblk_t comes in for an entry. We
 594  594   *     | overlay_target_list |          append it to the overlay_target_list.
 595  595   *     +---------------------+
 596  596   *                   |
 597  597   *                   v
 598  598   *             +-------------------+      +-------------------+
 599  599   *             | Pending Entry     |----->| Pending Entry     |--->...
 600  600   *             | 42:5e:1a:10:d6:2d |      | de:ad:be:ef:00:00 |
 601  601   *             +-------------------+      +-------------------+
 602  602   *
 603  603   *      ~~~~
 604  604   *
 605  605   *     +--------------------------+
 606  606   *     | /dev/overlay minor state |     User land said that it would look up an
 607  607   *     | overlay_target_hdl_t     |     entry for us. We remove it from the
 608  608   *     +--------------------------+     global list and add it to the handle's
 609  609   *                  |                   outstanding list.
 610  610   *                  |
 611  611   *                  v
 612  612   *            +-------------------+      +-------------------+
 613  613   *            | Pending Entry     |----->| Pending Entry     |
 614  614   *            | 90:b8:d0:79:02:dd |      | de:ad:be:ef:00:00 |
 615  615   *            +-------------------+      +-------------------+
 616  616   *
 617  617   *      ~~~~
 618  618   *
 619  619   *     +-------------------+
 620  620   *     | Valid Entry       |            varpd returned an answer with
 621  621   *     | de:ad:be:ef:00:00 |            OVERLAY_TARG_RESPOND and the target
 622  622   *     | 10.169.23.42:4789 |            cache entry is now populated with a
 623  623   *     +-------------------+            destination and marked as valid
 624  624   *
 625  625   *
 626  626   * The lookup mechanism is performed via a series of operations on the character
 627  627   * pseudo-device /dev/overlay. The only thing that uses this device is the
 628  628   * userland daemon varpd. /dev/overlay is a cloneable device, each open of it
 629  629   * granting a new minor number which maintains its own state. We maintain this
 630  630   * state so that way if an outstanding lookup was queued to something that
 631  631   * crashed or closed its handle without responding, we can know about this and
 632  632   * thus handle it appropriately.
 633  633   *
 634  634   * When a lookup is first created it's added to our global list of outstanding
 635  635   * lookups. To service requests, userland is required to perform an ioctl to ask
 636  636   * for a request. We will block it in the kernel a set amount of time waiting
 637  637   * for a request. When we give a request to a given minor instance of the
 638  638   * device, we remove it from the global list and append the request to the
 639  639   * device's list of outstanding entries, for the reasons we discussed above.
 640  640   * When a lookup comes in, we give user land a smaller amount of information
 641  641   * specific to that packet, the overlay_targ_lookup_t. It includes a request id
 642  642   * to identify this, and then the overlay id, the varpd id, the header and
 643  643   * packet size, the source and destination mac address, the SAP, and any
 644  644   * potential VLAN header.
 645  645   *
 646  646   * At that point, it stays in that outstanding list until one of two ioctls are
 647  647   * returned: OVERLAY_TARG_RESPOND or OVERLAY_TARG_DROP. During this time,
 648  648   * userland may also perform other operations. For example, it may use
 649  649   * OVERLAY_TARG_PKT to get a copy of this packet so it can perform more in-depth
 650  650   * analysis of what to do beyond what we gave it initially. This is useful for
 651  651   * providing proxy arp and the like. Finally, there are two other ioctls that
 652  652   * varpd can then do. The first is OVERLAY_TARG_INJECT which injects the
 653  653   * non-jumbo frame packet up into that mac device and OVERLAY_TARG_RESEND which
 654  654   * causes us to encapsulate and send out the packet they've given us.
 655  655   *
 656  656   *
 657  657   * Finally, through the target cache, several ioctls are provided to allow for
 658  658   * interrogation and management of the cache. They allow for individual entries
 659  659   * to be retrieved, set, or have the entire table flushed. For the full set of
 660  660   * ioctls here and what they do, take a look at uts/common/sys/overlay_target.h.
 661  661   *
 662  662   * ------------------
 663  663   * Sample Packet Flow
 664  664   * ------------------
 665  665   *
 666  666   * There are a lot of pieces here; hopefully an example of how this all fits
 667  667   * together will help clarify and elucidate what's going on. We're going to
 668  668   * first track an outgoing packet, e.g. one that is sent from an IP interface on
 669  669   * a VNIC on top of an overlay device, and then we'll look at what it means to
 670  670   * respond to that.
 671  671   *
 672  672   *
 673  673   *    +----------------+        +--------------+            +------------------+
 674  674   *    | IP/DLS send    |------->| MAC sends it |----------->| mblk_t reaches   |
 675  675   *    | packet to MAC  |        | to the GLDv3 |            | overlay GLDv3 tx |
 676  676   *    +----------------+        | VNIC device  |            | overlay_m_tx()   |
 677  677   *                              +--------------+            +------------------+
 678  678   *                                                                   |
 679  679   *                             . lookup              . cache         |
 680  680   *                             . drop                . miss          v
 681  681   *            +---------+      .       +--------+    .      +------------------+
 682  682   *            | freemsg |<-----*-------| varpd  |<---*------| Lookup each mblk |
 683  683   *            | mblk_t  |              | lookup |           | in the target    |
 684  684   *            +---------+              | queued |           | cache            |
 685  685   *                ^                    +--------+           +------------------+
 686  686   *      on send   |                        |                         |     cache
 687  687   *      error . . *                        *. . lookup               * . . hit
 688  688   *                |                        |    success              v
 689  689   *                |                        |                +------------------+
 690  690   *    +-----------------+                  +--------------->| call plugin      |
 691  691   *    | Send out        |                                   | ovpo_encap() to  |
 692  692   *    | overlay_mux_t's |<----------------------------------| get encap mblk_t |
 693  693   *    | ksocket         |                                   +------------------+
 694  694   *    +-----------------+
 695  695   *
 696  696   * The receive end point looks a little different and looks more like:
 697  697   *
 698  698   *  +------------------+     +----------------+    +-----------+
 699  699   *  | mblk_t comes off |---->| enter netstack |--->| delivered |---+
 700  700   *  | the physical     |     | IP stack       |    |     to    |   * . . direct
 701  701   *  | device           |     +----------------+    |  ksocket  |   |   callback
 702  702   *  +------------------+                           +-----------+   |
 703  703   *                       . overlay id                              |
 704  704   *                       . not found                               v
 705  705   *       +-----------+   .      +-----------------+       +--------------------+
 706  706   *       | freemsg   |<--*------| call plugin     |<------| overlay_mux_recv() |
 707  707   *       | mblk_t    |          | ovpo_decap() to |       +--------------------+
 708  708   *       +-----------+          | decap mblk_t    |
 709  709   *                              +-----------------+
 710  710   *                                     |
 711  711   *                                     * . . overlay id
 712  712   *                                     v     found
 713  713   *                                 +--------+      +----------------+
 714  714   *                                 | adjust |----->| call mac_rx    |
 715  715   *                                 | mblk_t |      | on original    |
 716  716   *                                 +--------+      | decaped packet |
 717  717   *                                                 +----------------+
 718  718   *
 719  719   * ------------------
 720  720   * Netstack Awareness
 721  721   * ------------------
 722  722   *
 723  723   * In the above image we note that this enters a netstack. Today the only
 724  724   * netstack that it can be is the global zone's, as the overlay driver itself
 725  725   * is not exactly netstack aware. What this really means is that varpd cannot
 726  726   * run in a non-global zone and an overlay device cannot belong to a non-global
 727  727   * zone. Non-global zones can still have a VNIC assigned to them that's been
 728  728   * created over the overlay device the same way they would if it had been
 729  729   * created over an etherstub or a physical device.
 730  730   *
 731  731   * The majority of the work to make it netstack aware is straightforward and the
 732  732   * biggest thing is to create a netstack module that allows us to hook into
 733  733   * netstack (and thus zone) creation and destruction.  From there, we need to
 734  734   * amend the target cache lookup routines that we discussed earlier to not have
 735  735   * a global outstanding list and a global list of handles, but rather, one per
 736  736   * netstack.
 737  737   *
 738  738   * For the mux, we'll need to open the ksocket in the context of the zone, we
 739  739   * can likely do this with a properly composed credential, but we'll need to do
 740  740   * some more work on that path. Finally, we'll want to make sure the dld ioctls
 741  741   * are aware of the zoneid of the caller and we use that appropriately and store
 742  742   * it in the overlay_dev_t.
 743  743   *
 744  744   * -----------
 745  745   * GLDv3 Notes
 746  746   * -----------
 747  747   *
 748  748   * The overlay driver implements a GLDv3 device. Parts of GLDv3 are more
 749  749   * relevant and other parts are much less relevant for us. For example, the
 750  750   * GLDv3 is used to toggle the device being put into and out of promiscuous
 751  751   * mode, to program MAC addresses for unicast and multicast hardware filters.
 752  752   * Today, an overlay device doesn't have a notion of promiscuous mode nor does
 753  753   * it have a notion of unicast and multicast addresses programmed into the
 754  754   * device. Instead, for the purposes of the hardware filter, we don't do
 755  755   * anything and just always accept new addresses being added and removed.
 756  756   *
 757  757   * If the GLDv3 start function has not been called, then we will not use this
 758  758   * device for I/O purposes. Any calls to transmit or receive should be dropped,
 759  759   * though the GLDv3 guarantees us that transmit will not be called without
 760  760   * calling start. Similarly, once stop is called, then no packets can be dealt
 761  761   * with.
 762  762   *
 763  763   * Today we don't support the stat interfaces, though there's no good reason
 764  764   * that we shouldn't assemble some of the stats based on what we have in the
 765  765   * future.
 766  766   *
 767  767   * When it comes to link properties, many of the traditional link properties do
 768  768   * not apply and many others MAC handles for us. For example, we don't need to
 769  769   * implement anything for overlay_m_getprop() to deal with returning the MTU, as
 770  770   * MAC never calls into us for that. As such, there isn't much of anything to
 771  771   * support in terms of properties.
 772  772   *
 773  773   * Today, we don't support any notion of hardware capabilities. However, if
 774  774   * future NIC hardware or other changes to the system cause it to make sense for
 775  775   * us to emulate logical groups, then we should do that. However, we still do
 776  776   * implement a capab function so that we can identify ourselves as an overlay
 777  777   * device to the broader MAC framework. This is done mostly so that a device
 778  778   * created on top of us can have fanout rings as we don't try to lie about a
 779  779   * speed for our device.
 780  780   *
 781  781   * The other question is what should be done for a device's MTU and margin. We
 782  782   * set our minimum supported MTU to be the minimum value that an IP network may
 783  783   * be set to, 576 -- which mimics what an etherstub does. On the flip side, we
 784  784   * have our upper bound set to 8900. This value comes from the fact that a lot
 785  785   * of jumbo networks use their maximum as 9000. As such, we want to reserve 100
 786  786   * bytes, which isn't exactly the most accurate number, but it'll be good enough
 787  787   * for now. Because of that, our default MTU off of these devices is 1400, as
 788  788   * the default MTU for everything is usually 1500 or whatever the underlying
 789  789   * device is at; however, this is a bit simpler than asking the netstack what
 790  790   * are all the IP interfaces at. It also calls into question how PMTU and PMTU
 791  791   * discovery should work here. The challenge, especially for
 792  792   * OVERLAY_TARGET_DYNAMIC, is that the MTU to any of the places will vary and
 793  793   * it's not clear that a single bad entry should cause the overall MTU to be
 794  794   * lowered. Instead, we should figure out a better way of determining these
 795  795   * kinds of PMTU errors and appropriately alerting the administrator via FMA.
 796  796   *
 797  797   * Regarding margin, we allow a margin of up to VLAN_TAGSZ depending on whether
 798  798   * or not the underlying encapsulation device supports VLAN tags. If it does,
 799  799   * then we'll set the margin to allow for it, otherwise, we will not.
 800  800   */
 801  801  
 802  802  #include <sys/conf.h>
 803  803  #include <sys/errno.h>
 804  804  #include <sys/stat.h>
 805  805  #include <sys/ddi.h>
 806  806  #include <sys/sunddi.h>
 807  807  #include <sys/modctl.h>
 808  808  #include <sys/policy.h>
 809  809  #include <sys/stream.h>
 810  810  #include <sys/strsubr.h>
 811  811  #include <sys/strsun.h>
 812  812  #include <sys/types.h>
 813  813  #include <sys/kmem.h>
 814  814  #include <sys/param.h>
 815  815  #include <sys/sysmacros.h>
 816  816  #include <sys/ddifm.h>
 817  817  
 818  818  #include <sys/dls.h>
 819  819  #include <sys/dld_ioc.h>
 820  820  #include <sys/mac_provider.h>
 821  821  #include <sys/mac_client_priv.h>
 822  822  #include <sys/mac_ether.h>
 823  823  #include <sys/vlan.h>
 824  824  
 825  825  #include <sys/overlay_impl.h>
 826  826  
 827  827  dev_info_t *overlay_dip;
 828  828  static kmutex_t overlay_dev_lock;
 829  829  static list_t overlay_dev_list;
 830  830  static uint8_t overlay_macaddr[ETHERADDRL] =
 831  831          { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
 832  832  
 833  833  typedef enum overlay_dev_prop {
 834  834          OVERLAY_DEV_P_MTU = 0,
 835  835          OVERLAY_DEV_P_VNETID,
 836  836          OVERLAY_DEV_P_ENCAP,
 837  837          OVERLAY_DEV_P_VARPDID,
 838  838          OVERLAY_DEV_P_DCID
 839  839  } overlay_dev_prop_t;
 840  840  
 841  841  #define OVERLAY_DEV_NPROPS      5
 842  842  static const char *overlay_dev_props[] = {
 843  843          "mtu",
 844  844          "vnetid",
 845  845          "encap",
 846  846          "varpd/id",
 847  847          "dcid"
 848  848  };
 849  849  
 850  850  #define OVERLAY_MTU_MIN 576
 851  851  #define OVERLAY_MTU_DEF 1400
 852  852  #define OVERLAY_MTU_MAX 8900
 853  853  
 854  854  overlay_dev_t *
 855  855  overlay_hold_by_dlid(datalink_id_t id)
 856  856  {
 857  857          overlay_dev_t *o;
 858  858  
 859  859          mutex_enter(&overlay_dev_lock);
 860  860          for (o = list_head(&overlay_dev_list); o != NULL;
 861  861              o = list_next(&overlay_dev_list, o)) {
 862  862                  if (id == o->odd_linkid) {
 863  863                          mutex_enter(&o->odd_lock);
 864  864                          o->odd_ref++;
 865  865                          mutex_exit(&o->odd_lock);
 866  866                          mutex_exit(&overlay_dev_lock);
 867  867                          return (o);
 868  868                  }
 869  869          }
 870  870  
 871  871          mutex_exit(&overlay_dev_lock);
 872  872          return (NULL);
 873  873  }
 874  874  
 875  875  void
 876  876  overlay_hold_rele(overlay_dev_t *odd)
 877  877  {
 878  878          mutex_enter(&odd->odd_lock);
 879  879          ASSERT(odd->odd_ref > 0);
 880  880          odd->odd_ref--;
 881  881          mutex_exit(&odd->odd_lock);
 882  882  }
 883  883  
 884  884  void
 885  885  overlay_io_start(overlay_dev_t *odd, overlay_dev_flag_t flag)
 886  886  {
 887  887          ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX);
 888  888          ASSERT(MUTEX_HELD(&odd->odd_lock));
 889  889  
 890  890          if (flag & OVERLAY_F_IN_RX)
 891  891                  odd->odd_rxcount++;
 892  892          if (flag & OVERLAY_F_IN_TX)
 893  893                  odd->odd_txcount++;
 894  894          odd->odd_flags |= flag;
 895  895  }
 896  896  
 897  897  void
 898  898  overlay_io_done(overlay_dev_t *odd, overlay_dev_flag_t flag)
 899  899  {
 900  900          boolean_t signal = B_FALSE;
 901  901  
 902  902          ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX);
 903  903          ASSERT(MUTEX_HELD(&odd->odd_lock));
 904  904  
 905  905          if (flag & OVERLAY_F_IN_RX) {
 906  906                  ASSERT(odd->odd_rxcount > 0);
 907  907                  odd->odd_rxcount--;
 908  908                  if (odd->odd_rxcount == 0) {
 909  909                          signal = B_TRUE;
 910  910                          odd->odd_flags &= ~OVERLAY_F_IN_RX;
 911  911                  }
 912  912          }
 913  913          if (flag & OVERLAY_F_IN_TX) {
 914  914                  ASSERT(odd->odd_txcount > 0);
 915  915                  odd->odd_txcount--;
 916  916                  if (odd->odd_txcount == 0) {
 917  917                          signal = B_TRUE;
 918  918                          odd->odd_flags &= ~OVERLAY_F_IN_TX;
 919  919                  }
 920  920          }
 921  921  
 922  922          if (signal == B_TRUE)
 923  923                  cv_broadcast(&odd->odd_iowait);
 924  924  }
 925  925  
 926  926  static void
 927  927  overlay_io_wait(overlay_dev_t *odd, overlay_dev_flag_t flag)
 928  928  {
 929  929          ASSERT((flag & ~OVERLAY_F_IOMASK) == 0);
 930  930          ASSERT(MUTEX_HELD(&odd->odd_lock));
 931  931  
 932  932          while (odd->odd_flags & flag) {
 933  933                  cv_wait(&odd->odd_iowait, &odd->odd_lock);
 934  934          }
 935  935  }
 936  936  
 937  937  void
 938  938  overlay_dev_iter(overlay_dev_iter_f func, void *arg)
 939  939  {
 940  940          overlay_dev_t *odd;
 941  941  
 942  942          mutex_enter(&overlay_dev_lock);
 943  943          for (odd = list_head(&overlay_dev_list); odd != NULL;
 944  944              odd = list_next(&overlay_dev_list, odd)) {
 945  945                  if (func(odd, arg) != 0) {
 946  946                          mutex_exit(&overlay_dev_lock);
 947  947                          return;
 948  948                  }
 949  949          }
 950  950          mutex_exit(&overlay_dev_lock);
 951  951  }
 952  952  
 953  953  /* ARGSUSED */
 954  954  static int
 955  955  overlay_m_stat(void *arg, uint_t stat, uint64_t *val)
 956  956  {
 957  957          return (ENOTSUP);
 958  958  }
 959  959  
 960  960  static int
 961  961  overlay_m_start(void *arg)
 962  962  {
 963  963          overlay_dev_t *odd = arg;
 964  964          overlay_mux_t *mux;
 965  965          int ret, domain, family, prot;
 966  966          struct sockaddr_storage storage;
 967  967          socklen_t slen;
 968  968  
 969  969          mutex_enter(&odd->odd_lock);
 970  970          if ((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0) {
 971  971                  mutex_exit(&odd->odd_lock);
 972  972                  return (EAGAIN);
 973  973          }
 974  974          mutex_exit(&odd->odd_lock);
 975  975  
 976  976          ret = odd->odd_plugin->ovp_ops->ovpo_socket(odd->odd_pvoid, &domain,
 977  977              &family, &prot, (struct sockaddr *)&storage, &slen);
 978  978          if (ret != 0)
 979  979                  return (ret);
 980  980  
 981  981          mux = overlay_mux_open(odd->odd_plugin, domain, family, prot,
 982  982              (struct sockaddr *)&storage, slen, &ret);
 983  983          if (mux == NULL)
 984  984                  return (ret);
 985  985  
 986  986          overlay_mux_add_dev(mux, odd);
 987  987          odd->odd_mux = mux;
 988  988          mutex_enter(&odd->odd_lock);
 989  989          ASSERT(!(odd->odd_flags & OVERLAY_F_IN_MUX));
 990  990          odd->odd_flags |= OVERLAY_F_IN_MUX;
 991  991          mutex_exit(&odd->odd_lock);
 992  992  
 993  993          return (0);
 994  994  }
 995  995  
 996  996  static void
 997  997  overlay_m_stop(void *arg)
 998  998  {
 999  999          overlay_dev_t *odd = arg;
1000 1000  
1001 1001          /*
1002 1002           * The MAC Perimeter is held here, so we don't have to worry about
1003 1003           * synchronizing this with respect to metadata operations.
1004 1004           */
1005 1005          mutex_enter(&odd->odd_lock);
1006 1006          VERIFY(odd->odd_flags & OVERLAY_F_IN_MUX);
1007 1007          VERIFY(!(odd->odd_flags & OVERLAY_F_MDDROP));
1008 1008          odd->odd_flags |= OVERLAY_F_MDDROP;
1009 1009          overlay_io_wait(odd, OVERLAY_F_IOMASK);
1010 1010          mutex_exit(&odd->odd_lock);
1011 1011  
1012 1012          overlay_mux_remove_dev(odd->odd_mux, odd);
1013 1013          overlay_mux_close(odd->odd_mux);
1014 1014          odd->odd_mux = NULL;
1015 1015  
1016 1016          mutex_enter(&odd->odd_lock);
1017 1017          odd->odd_flags &= ~OVERLAY_F_IN_MUX;
1018 1018          odd->odd_flags &= ~OVERLAY_F_MDDROP;
1019 1019          VERIFY((odd->odd_flags & OVERLAY_F_STOPMASK) == 0);
1020 1020          mutex_exit(&odd->odd_lock);
1021 1021  }
1022 1022  
1023 1023  /*
1024 1024   * For more info on this, see the big theory statement.
1025 1025   */
1026 1026  /* ARGSUSED */
1027 1027  static int
1028 1028  overlay_m_promisc(void *arg, boolean_t on)
1029 1029  {
1030 1030          return (0);
1031 1031  }
1032 1032  
1033 1033  /*
1034 1034   * For more info on this, see the big theory statement.
1035 1035   */
1036 1036  /* ARGSUSED */
1037 1037  static int
1038 1038  overlay_m_multicast(void *arg, boolean_t add, const uint8_t *addrp)
1039 1039  {
1040 1040          return (0);
1041 1041  }
1042 1042  
1043 1043  /*
1044 1044   * For more info on this, see the big theory statement.
1045 1045   */
1046 1046  /* ARGSUSED */
1047 1047  static int
1048 1048  overlay_m_unicast(void *arg, const uint8_t *macaddr)
1049 1049  {
1050 1050          return (0);
1051 1051  }
1052 1052  
1053 1053  mblk_t *
1054 1054  overlay_m_tx(void *arg, mblk_t *mp_chain)
1055 1055  {
1056 1056          overlay_dev_t *odd = arg;
1057 1057          mblk_t *mp, *ep;
1058 1058          int ret;
1059 1059          ovep_encap_info_t einfo;
1060 1060          struct msghdr hdr;
1061 1061  
1062 1062          mutex_enter(&odd->odd_lock);
1063 1063          if ((odd->odd_flags & OVERLAY_F_MDDROP) ||
1064 1064              !(odd->odd_flags & OVERLAY_F_IN_MUX)) {
  
    | 
      ↓ open down ↓ | 
    1064 lines elided | 
    
      ↑ open up ↑ | 
  
1065 1065                  mutex_exit(&odd->odd_lock);
1066 1066                  freemsgchain(mp_chain);
1067 1067                  return (NULL);
1068 1068          }
1069 1069          overlay_io_start(odd, OVERLAY_F_IN_TX);
1070 1070          mutex_exit(&odd->odd_lock);
1071 1071  
1072 1072          bzero(&hdr, sizeof (struct msghdr));
1073 1073  
1074 1074          bzero(&einfo, sizeof (ovep_encap_info_t));
1075      -        einfo.ovdi_id = odd->odd_vid;
     1075 +
1076 1076          mp = mp_chain;
1077 1077          while (mp != NULL) {
1078 1078                  socklen_t slen;
1079 1079                  struct sockaddr_storage storage;
1080 1080  
1081 1081                  mp_chain = mp->b_next;
1082 1082                  mp->b_next = NULL;
1083 1083                  ep = NULL;
1084 1084  
1085      -                /*
1086      -                 * TODO: we probably need to change 'storage' to a
1087      -                 * refheld overlay_target_entry_t and also maybe set
1088      -                 * local vlan from packet header for check below
1089      -                 */
1090 1085                  ret = overlay_target_lookup(odd, mp,
1091      -                    (struct sockaddr *)&storage, &slen);
     1086 +                    (struct sockaddr *)&storage, &slen, &einfo.ovdi_id);
1092 1087                  if (ret != OVERLAY_TARGET_OK) {
1093 1088                          if (ret == OVERLAY_TARGET_DROP)
1094 1089                                  freemsg(mp);
1095 1090                          mp = mp_chain;
1096 1091                          continue;
1097 1092                  }
1098 1093  
1099      -                /*
1100      -                 * TODO:
1101      -                 *      set hdr.msg_name from target_entry
1102      -                 *
1103      -                 *      if !local:
1104      -                 *              check fabric attachment
1105      -                 *              modify vlan tag, VL2 mac addresses
1106      -                 *
1107      -                 *      set einfo.ovdi_id to vnet id (move into loop since
1108      -                 *      things cannot assume to all have same vnet id anymore)
1109      -                 */
1110 1094                  hdr.msg_name = &storage;
1111 1095                  hdr.msg_namelen = slen;
1112 1096  
1113 1097                  ret = odd->odd_plugin->ovp_ops->ovpo_encap(odd->odd_mh, mp,
1114 1098                      &einfo, &ep);
1115 1099                  if (ret != 0 || ep == NULL) {
1116 1100                          freemsg(mp);
1117 1101                          goto out;
1118 1102                  }
1119 1103  
1120 1104                  ep->b_cont = mp;
1121 1105                  ret = overlay_mux_tx(odd->odd_mux, &hdr, ep);
1122 1106                  if (ret != 0)
1123 1107                          goto out;
1124 1108  
1125 1109                  mp = mp_chain;
1126 1110          }
1127 1111  
1128 1112  out:
1129 1113          mutex_enter(&odd->odd_lock);
1130 1114          overlay_io_done(odd, OVERLAY_F_IN_TX);
1131 1115          mutex_exit(&odd->odd_lock);
1132 1116          return (mp_chain);
1133 1117  }
1134 1118  
1135 1119  /* ARGSUSED */
1136 1120  static void
1137 1121  overlay_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
1138 1122  {
1139 1123          miocnak(q, mp, 0, ENOTSUP);
1140 1124  }
1141 1125  
1142 1126  /* ARGSUSED */
1143 1127  static boolean_t
1144 1128  overlay_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
1145 1129  {
1146 1130          /*
1147 1131           * Tell MAC we're an overlay.
1148 1132           */
1149 1133          if (cap == MAC_CAPAB_OVERLAY)
1150 1134                  return (B_TRUE);
1151 1135          return (B_FALSE);
1152 1136  }
1153 1137  
1154 1138  /* ARGSUSED */
1155 1139  static int
1156 1140  overlay_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1157 1141      uint_t pr_valsize, const void *pr_val)
1158 1142  {
1159 1143          uint32_t mtu, old;
1160 1144          int err;
1161 1145          overlay_dev_t *odd = arg;
1162 1146  
1163 1147          if (pr_num != MAC_PROP_MTU)
1164 1148                  return (ENOTSUP);
1165 1149  
1166 1150          bcopy(pr_val, &mtu, sizeof (mtu));
1167 1151          if (mtu < OVERLAY_MTU_MIN || mtu > OVERLAY_MTU_MAX)
1168 1152                  return (EINVAL);
1169 1153  
1170 1154          mutex_enter(&odd->odd_lock);
1171 1155          old = odd->odd_mtu;
1172 1156          odd->odd_mtu = mtu;
1173 1157          err = mac_maxsdu_update(odd->odd_mh, mtu);
1174 1158          if (err != 0)
1175 1159                  odd->odd_mtu = old;
1176 1160          mutex_exit(&odd->odd_lock);
1177 1161  
1178 1162          return (err);
1179 1163  }
1180 1164  
1181 1165  /* ARGSUSED */
1182 1166  static int
1183 1167  overlay_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1184 1168      uint_t pr_valsize, void *pr_val)
1185 1169  {
1186 1170          return (ENOTSUP);
1187 1171  }
1188 1172  
1189 1173  /* ARGSUSED */
1190 1174  static void
1191 1175  overlay_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1192 1176      mac_prop_info_handle_t prh)
1193 1177  {
1194 1178          if (pr_num != MAC_PROP_MTU)
1195 1179                  return;
1196 1180  
1197 1181          mac_prop_info_set_default_uint32(prh, OVERLAY_MTU_DEF);
1198 1182          mac_prop_info_set_range_uint32(prh, OVERLAY_MTU_MIN, OVERLAY_MTU_MAX);
1199 1183  }
1200 1184  
1201 1185  static mac_callbacks_t overlay_m_callbacks = {
1202 1186          .mc_callbacks = (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP |
1203 1187              MC_PROPINFO),
1204 1188          .mc_getstat = overlay_m_stat,
1205 1189          .mc_start = overlay_m_start,
1206 1190          .mc_stop = overlay_m_stop,
1207 1191          .mc_setpromisc = overlay_m_promisc,
1208 1192          .mc_multicst = overlay_m_multicast,
1209 1193          .mc_unicst = overlay_m_unicast,
1210 1194          .mc_tx = overlay_m_tx,
1211 1195          .mc_ioctl = overlay_m_ioctl,
1212 1196          .mc_getcapab = overlay_m_getcapab,
1213 1197          .mc_getprop = overlay_m_getprop,
1214 1198          .mc_setprop = overlay_m_setprop,
1215 1199          .mc_propinfo = overlay_m_propinfo
1216 1200  };
1217 1201  
1218 1202  static boolean_t
1219 1203  overlay_valid_name(const char *name, size_t buflen)
1220 1204  {
1221 1205          size_t actlen;
1222 1206          int err, i;
1223 1207  
1224 1208          for (i = 0; i < buflen; i++) {
1225 1209                  if (name[i] == '\0')
1226 1210                          break;
1227 1211          }
1228 1212  
1229 1213          if (i == 0 || i == buflen)
1230 1214                  return (B_FALSE);
1231 1215          actlen = i;
1232 1216          if (strchr(name, '/') != NULL)
1233 1217                  return (B_FALSE);
1234 1218          if (u8_validate((char *)name, actlen, NULL,
1235 1219              U8_VALIDATE_ENTIRE, &err) < 0)
1236 1220                  return (B_FALSE);
1237 1221          return (B_TRUE);
1238 1222  }
1239 1223  
1240 1224  /* ARGSUSED */
1241 1225  static int
1242 1226  overlay_i_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
1243 1227  {
1244 1228          int err;
1245 1229          uint64_t maxid;
1246 1230          overlay_dev_t *odd, *o;
1247 1231          mac_register_t *mac;
1248 1232          overlay_ioc_create_t *oicp = karg;
1249 1233  
1250 1234          if (overlay_valid_name(oicp->oic_encap, MAXLINKNAMELEN) == B_FALSE)
1251 1235                  return (EINVAL);
1252 1236  
1253 1237          odd = kmem_zalloc(sizeof (overlay_dev_t), KM_SLEEP);
1254 1238          odd->odd_linkid = oicp->oic_linkid;
1255 1239          odd->odd_plugin = overlay_plugin_lookup(oicp->oic_encap);
1256 1240          if (odd->odd_plugin == NULL) {
1257 1241                  kmem_free(odd, sizeof (overlay_dev_t));
1258 1242                  return (ENOENT);
1259 1243          }
1260 1244          err = odd->odd_plugin->ovp_ops->ovpo_init((overlay_handle_t)odd,
1261 1245              &odd->odd_pvoid);
1262 1246          if (err != 0) {
1263 1247                  odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1264 1248                  overlay_plugin_rele(odd->odd_plugin);
1265 1249                  kmem_free(odd, sizeof (overlay_dev_t));
1266 1250                  return (EINVAL);
1267 1251          }
1268 1252  
1269 1253          /*
1270 1254           * Make sure that our virtual network id is valid for the given plugin
1271 1255           * that we're working with.
1272 1256           */
1273 1257          ASSERT(odd->odd_plugin->ovp_id_size <= 8);
1274 1258          maxid = UINT64_MAX;
1275 1259          if (odd->odd_plugin->ovp_id_size != 8)
1276 1260                  maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) - 1ULL;
1277 1261          if (oicp->oic_vnetid > maxid) {
1278 1262                  odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1279 1263                  overlay_plugin_rele(odd->odd_plugin);
1280 1264                  kmem_free(odd, sizeof (overlay_dev_t));
1281 1265                  return (EINVAL);
1282 1266          }
1283 1267          odd->odd_vid = oicp->oic_vnetid;
1284 1268  
1285 1269          if (oicp->oic_dcid > UINT32_MAX) {
1286 1270                  odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1287 1271                  overlay_plugin_rele(odd->odd_plugin);
1288 1272                  kmem_free(odd, sizeof (overlay_dev_t));
1289 1273                  return (EINVAL);
1290 1274          }
1291 1275          odd->odd_dcid = oicp->oic_dcid;
1292 1276  
1293 1277          mac = mac_alloc(MAC_VERSION);
1294 1278          if (mac == NULL) {
1295 1279                  mutex_exit(&overlay_dev_lock);
1296 1280                  odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1297 1281                  overlay_plugin_rele(odd->odd_plugin);
1298 1282                  kmem_free(odd, sizeof (overlay_dev_t));
1299 1283                  return (EINVAL);
1300 1284          }
1301 1285  
1302 1286          mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1303 1287          mac->m_driver = odd;
1304 1288          mac->m_dip = overlay_dip;
1305 1289          mac->m_dst_addr = NULL;
1306 1290          mac->m_callbacks = &overlay_m_callbacks;
1307 1291          mac->m_pdata = NULL;
1308 1292          mac->m_pdata_size = 0;
1309 1293  
1310 1294          mac->m_priv_props = NULL;
1311 1295  
1312 1296          /* Let mac handle this itself. */
1313 1297          mac->m_instance = (uint_t)-1;
1314 1298  
1315 1299          /*
1316 1300           * There is no real source address that should be used here, but saying
1317 1301           * that we're not ethernet is going to cause its own problems. At the
1318 1302           * end of the say, this is fine.
1319 1303           */
1320 1304          mac->m_src_addr = overlay_macaddr;
1321 1305  
1322 1306          /*
1323 1307           * Start with the default MTU as the max SDU. If the MTU is changed, the
1324 1308           * SDU will be changed to reflect that.
1325 1309           */
1326 1310          mac->m_min_sdu = 1;
1327 1311          mac->m_max_sdu = OVERLAY_MTU_DEF;
1328 1312          mac->m_multicast_sdu = 0;
1329 1313  
1330 1314          /*
1331 1315           * The underlying device doesn't matter, instead this comes from the
1332 1316           * encapsulation protocol and whether or not they allow VLAN tags.
1333 1317           */
1334 1318          if (odd->odd_plugin->ovp_flags & OVEP_F_VLAN_TAG) {
1335 1319                  mac->m_margin = VLAN_TAGSZ;
1336 1320          } else {
1337 1321                  mac->m_margin = 0;
1338 1322          }
1339 1323  
1340 1324          /*
1341 1325           * Today, we have no MAC virtualization, it may make sense in the future
1342 1326           * to go ahead and emulate some subset of this, but it doesn't today.
1343 1327           */
1344 1328          mac->m_v12n = MAC_VIRT_NONE;
1345 1329  
1346 1330          mutex_enter(&overlay_dev_lock);
1347 1331          for (o = list_head(&overlay_dev_list); o != NULL;
1348 1332              o = list_next(&overlay_dev_list, o)) {
1349 1333                  if (o->odd_linkid == oicp->oic_linkid) {
1350 1334                          mutex_exit(&overlay_dev_lock);
1351 1335                          odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1352 1336                          overlay_plugin_rele(odd->odd_plugin);
1353 1337                          kmem_free(odd, sizeof (overlay_dev_t));
1354 1338                          return (EEXIST);
1355 1339                  }
1356 1340  
1357 1341                  if (o->odd_vid == oicp->oic_vnetid &&
1358 1342                      o->odd_plugin == odd->odd_plugin) {
1359 1343                          mutex_exit(&overlay_dev_lock);
1360 1344                          odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1361 1345                          overlay_plugin_rele(odd->odd_plugin);
1362 1346                          kmem_free(odd, sizeof (overlay_dev_t));
1363 1347                          return (EEXIST);
1364 1348                  }
1365 1349          }
1366 1350  
1367 1351          err = mac_register(mac, &odd->odd_mh);
1368 1352          mac_free(mac);
1369 1353          if (err != 0) {
1370 1354                  mutex_exit(&overlay_dev_lock);
1371 1355                  odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1372 1356                  overlay_plugin_rele(odd->odd_plugin);
1373 1357                  kmem_free(odd, sizeof (overlay_dev_t));
1374 1358                  return (err);
1375 1359          }
1376 1360  
1377 1361          err = dls_devnet_create(odd->odd_mh, odd->odd_linkid,
1378 1362              crgetzoneid(cred));
1379 1363          if (err != 0) {
1380 1364                  mutex_exit(&overlay_dev_lock);
1381 1365                  (void) mac_unregister(odd->odd_mh);
1382 1366                  odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1383 1367                  overlay_plugin_rele(odd->odd_plugin);
1384 1368                  kmem_free(odd, sizeof (overlay_dev_t));
1385 1369                  return (err);
1386 1370          }
1387 1371  
1388 1372          mutex_init(&odd->odd_lock, NULL, MUTEX_DRIVER, NULL);
1389 1373          cv_init(&odd->odd_iowait, NULL, CV_DRIVER, NULL);
1390 1374          odd->odd_ref = 0;
1391 1375          odd->odd_flags = 0;
1392 1376          list_insert_tail(&overlay_dev_list, odd);
1393 1377          mutex_exit(&overlay_dev_lock);
1394 1378  
1395 1379          return (0);
1396 1380  }
1397 1381  
1398 1382  /* ARGSUSED */
1399 1383  static int
1400 1384  overlay_i_activate(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
1401 1385  {
1402 1386          int i, ret;
1403 1387          overlay_dev_t *odd;
1404 1388          mac_perim_handle_t mph;
1405 1389          overlay_ioc_activate_t *oiap = karg;
1406 1390          overlay_ioc_propinfo_t *infop;
1407 1391          overlay_ioc_prop_t *oip;
1408 1392          overlay_prop_handle_t phdl;
1409 1393  
1410 1394          odd = overlay_hold_by_dlid(oiap->oia_linkid);
1411 1395          if (odd == NULL)
1412 1396                  return (ENOENT);
1413 1397  
1414 1398          infop = kmem_alloc(sizeof (overlay_ioc_propinfo_t), KM_SLEEP);
1415 1399          oip = kmem_alloc(sizeof (overlay_ioc_prop_t), KM_SLEEP);
1416 1400          phdl = (overlay_prop_handle_t)infop;
1417 1401  
1418 1402          mac_perim_enter_by_mh(odd->odd_mh, &mph);
1419 1403          mutex_enter(&odd->odd_lock);
1420 1404          if (odd->odd_flags & OVERLAY_F_ACTIVATED) {
1421 1405                  mutex_exit(&odd->odd_lock);
1422 1406                  mac_perim_exit(mph);
1423 1407                  overlay_hold_rele(odd);
1424 1408                  kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1425 1409                  kmem_free(oip, sizeof (overlay_ioc_prop_t));
1426 1410                  return (EEXIST);
1427 1411          }
1428 1412          mutex_exit(&odd->odd_lock);
1429 1413  
1430 1414          for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) {
1431 1415                  const char *pname = odd->odd_plugin->ovp_props[i];
1432 1416                  bzero(infop, sizeof (overlay_ioc_propinfo_t));
1433 1417                  overlay_prop_init(phdl);
1434 1418                  ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(pname, phdl);
1435 1419                  if (ret != 0) {
1436 1420                          mac_perim_exit(mph);
1437 1421                          overlay_hold_rele(odd);
1438 1422                          kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1439 1423                          kmem_free(oip, sizeof (overlay_ioc_prop_t));
1440 1424                          return (ret);
1441 1425                  }
1442 1426  
1443 1427                  if ((infop->oipi_prot & OVERLAY_PROP_PERM_REQ) == 0)
1444 1428                          continue;
1445 1429                  bzero(oip, sizeof (overlay_ioc_prop_t));
1446 1430                  oip->oip_size = sizeof (oip->oip_value);
1447 1431                  ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid,
1448 1432                      pname, oip->oip_value, &oip->oip_size);
1449 1433                  if (ret != 0) {
1450 1434                          mac_perim_exit(mph);
1451 1435                          overlay_hold_rele(odd);
1452 1436                          kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1453 1437                          kmem_free(oip, sizeof (overlay_ioc_prop_t));
1454 1438                          return (ret);
1455 1439                  }
1456 1440                  if (oip->oip_size == 0) {
1457 1441                          mac_perim_exit(mph);
1458 1442                          overlay_hold_rele(odd);
1459 1443                          kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1460 1444                          kmem_free(oip, sizeof (overlay_ioc_prop_t));
1461 1445                          return (EINVAL);
1462 1446                  }
1463 1447          }
1464 1448  
1465 1449          mutex_enter(&odd->odd_lock);
1466 1450          if ((odd->odd_flags & OVERLAY_F_VARPD) == 0) {
1467 1451                  mutex_exit(&odd->odd_lock);
1468 1452                  mac_perim_exit(mph);
1469 1453                  overlay_hold_rele(odd);
1470 1454                  kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1471 1455                  kmem_free(oip, sizeof (overlay_ioc_prop_t));
1472 1456                  return (ENXIO);
1473 1457          }
1474 1458  
1475 1459          ASSERT((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0);
1476 1460          odd->odd_flags |= OVERLAY_F_ACTIVATED;
1477 1461  
1478 1462          /*
1479 1463           * Now that we've activated ourselves, we should indicate to the world
1480 1464           * that we're up. Note that we may not be able to perform lookups at
1481 1465           * this time, but our notion of being 'up' isn't dependent on that
1482 1466           * ability.
1483 1467           */
1484 1468          mac_link_update(odd->odd_mh, LINK_STATE_UP);
1485 1469          mutex_exit(&odd->odd_lock);
1486 1470  
1487 1471          mac_perim_exit(mph);
1488 1472          overlay_hold_rele(odd);
1489 1473          kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1490 1474          kmem_free(oip, sizeof (overlay_ioc_prop_t));
1491 1475  
1492 1476          return (0);
1493 1477  }
1494 1478  
1495 1479  /* ARGSUSED */
1496 1480  static int
1497 1481  overlay_i_delete(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
1498 1482  {
1499 1483          overlay_ioc_delete_t *oidp = karg;
1500 1484          overlay_dev_t *odd;
1501 1485          datalink_id_t tid;
1502 1486          int ret;
1503 1487  
1504 1488          odd = overlay_hold_by_dlid(oidp->oid_linkid);
1505 1489          if (odd == NULL) {
1506 1490                  return (ENOENT);
1507 1491          }
1508 1492  
1509 1493          mutex_enter(&odd->odd_lock);
1510 1494          /* If we're not the only hold, we're busy */
1511 1495          if (odd->odd_ref != 1) {
1512 1496                  mutex_exit(&odd->odd_lock);
1513 1497                  overlay_hold_rele(odd);
1514 1498                  return (EBUSY);
1515 1499          }
1516 1500  
1517 1501          if (odd->odd_flags & OVERLAY_F_IN_MUX) {
1518 1502                  mutex_exit(&odd->odd_lock);
1519 1503                  overlay_hold_rele(odd);
1520 1504                  return (EBUSY);
1521 1505          }
1522 1506  
1523 1507          /*
1524 1508           * To remove this, we need to first remove it from dls and then remove
1525 1509           * it from mac. The act of removing it from mac will check if there are
1526 1510           * devices on top of this, eg. vnics. If there are, then that will fail
1527 1511           * and we'll have to go through and recreate the dls entry. Only after
1528 1512           * mac_unregister has succeeded, then we'll go through and actually free
1529 1513           * everything and drop the dev lock.
1530 1514           */
1531 1515          ret = dls_devnet_destroy(odd->odd_mh, &tid, B_TRUE);
1532 1516          if (ret != 0) {
1533 1517                  overlay_hold_rele(odd);
1534 1518                  return (ret);
1535 1519          }
1536 1520  
1537 1521          ASSERT(oidp->oid_linkid == tid);
1538 1522          ret = mac_disable(odd->odd_mh);
1539 1523          if (ret != 0) {
1540 1524                  (void) dls_devnet_create(odd->odd_mh, odd->odd_linkid,
1541 1525                      crgetzoneid(cred));
1542 1526                  overlay_hold_rele(odd);
1543 1527                  return (ret);
1544 1528          }
1545 1529  
1546 1530          overlay_target_quiesce(odd->odd_target);
1547 1531  
1548 1532          mutex_enter(&overlay_dev_lock);
1549 1533          list_remove(&overlay_dev_list, odd);
1550 1534          mutex_exit(&overlay_dev_lock);
1551 1535  
1552 1536          cv_destroy(&odd->odd_iowait);
1553 1537          mutex_destroy(&odd->odd_lock);
1554 1538          overlay_target_free(odd);
1555 1539          odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1556 1540          overlay_plugin_rele(odd->odd_plugin);
1557 1541          kmem_free(odd, sizeof (overlay_dev_t));
1558 1542  
1559 1543          return (0);
1560 1544  }
1561 1545  
1562 1546  /* ARGSUSED */
1563 1547  static int
1564 1548  overlay_i_nprops(void *karg, intptr_t arg, int mode, cred_t *cred,
1565 1549      int *rvalp)
1566 1550  {
1567 1551          overlay_dev_t *odd;
1568 1552          overlay_ioc_nprops_t *on = karg;
1569 1553  
1570 1554          odd = overlay_hold_by_dlid(on->oipn_linkid);
1571 1555          if (odd == NULL)
1572 1556                  return (ENOENT);
1573 1557          on->oipn_nprops = odd->odd_plugin->ovp_nprops + OVERLAY_DEV_NPROPS;
1574 1558          overlay_hold_rele(odd);
1575 1559  
1576 1560          return (0);
1577 1561  }
1578 1562  
1579 1563  static int
1580 1564  overlay_propinfo_plugin_cb(overlay_plugin_t *opp, void *arg)
1581 1565  {
1582 1566          overlay_prop_handle_t phdl = arg;
1583 1567          overlay_prop_set_range_str(phdl, opp->ovp_name);
1584 1568          return (0);
1585 1569  }
1586 1570  
1587 1571  static int
1588 1572  overlay_i_name_to_propid(overlay_dev_t *odd, const char *name, uint_t *id)
1589 1573  {
1590 1574          int i;
1591 1575  
1592 1576          for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
1593 1577                  if (strcmp(overlay_dev_props[i], name) == 0) {
1594 1578                          *id = i;
1595 1579                          return (0);
1596 1580                  }
1597 1581          }
1598 1582  
1599 1583          for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) {
1600 1584                  if (strcmp(odd->odd_plugin->ovp_props[i], name) == 0) {
1601 1585                          *id = i + OVERLAY_DEV_NPROPS;
1602 1586                          return (0);
1603 1587                  }
1604 1588          }
1605 1589  
1606 1590          return (ENOENT);
1607 1591  }
1608 1592  
1609 1593  static void
1610 1594  overlay_i_propinfo_mtu(overlay_dev_t *odd, overlay_prop_handle_t phdl)
1611 1595  {
1612 1596          uint32_t def;
1613 1597          mac_propval_range_t range;
1614 1598          uint_t perm;
1615 1599  
1616 1600          ASSERT(MAC_PERIM_HELD(odd->odd_mh));
1617 1601  
1618 1602          bzero(&range, sizeof (mac_propval_range_t));
1619 1603          range.mpr_count = 1;
1620 1604          if (mac_prop_info(odd->odd_mh, MAC_PROP_MTU, "mtu", &def,
1621 1605              sizeof (def), &range, &perm) != 0)
1622 1606                  return;
1623 1607  
1624 1608          if (perm == MAC_PROP_PERM_READ)
1625 1609                  overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
1626 1610          else if (perm == MAC_PROP_PERM_WRITE)
1627 1611                  overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_WRITE);
1628 1612          else if (perm == MAC_PROP_PERM_RW)
1629 1613                  overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
1630 1614  
1631 1615          overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
1632 1616          overlay_prop_set_default(phdl, &def, sizeof (def));
1633 1617          overlay_prop_set_range_uint32(phdl, range.mpr_range_uint32[0].mpur_min,
1634 1618              range.mpr_range_uint32[0].mpur_max);
1635 1619  }
1636 1620  
1637 1621  /* ARGSUSED */
1638 1622  static int
1639 1623  overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred,
1640 1624      int *rvalp)
1641 1625  {
1642 1626          overlay_dev_t *odd;
1643 1627          int ret;
1644 1628          mac_perim_handle_t mph;
1645 1629          uint_t propid = UINT_MAX;
1646 1630          overlay_ioc_propinfo_t *oip = karg;
1647 1631          overlay_prop_handle_t phdl = (overlay_prop_handle_t)oip;
1648 1632  
1649 1633          odd = overlay_hold_by_dlid(oip->oipi_linkid);
1650 1634          if (odd == NULL)
1651 1635                  return (ENOENT);
1652 1636  
1653 1637          overlay_prop_init(phdl);
1654 1638          mac_perim_enter_by_mh(odd->odd_mh, &mph);
1655 1639  
1656 1640          /*
1657 1641           * If the id is -1, then the property that we're looking for is named in
1658 1642           * oipi_name and we should fill in its id. Otherwise, we've been given
1659 1643           * an id and we need to turn that into a name for our plugin's sake. The
1660 1644           * id is our own fabrication for property discovery.
1661 1645           */
1662 1646          if (oip->oipi_id == -1) {
1663 1647                  /*
1664 1648                   * Determine if it's a known generic property or it belongs to a
1665 1649                   * module by checking against the list of known names.
1666 1650                   */
1667 1651                  oip->oipi_name[OVERLAY_PROP_NAMELEN-1] = '\0';
1668 1652                  if ((ret = overlay_i_name_to_propid(odd, oip->oipi_name,
1669 1653                      &propid)) != 0) {
1670 1654                          overlay_hold_rele(odd);
1671 1655                          mac_perim_exit(mph);
1672 1656                          return (ret);
1673 1657                  }
1674 1658                  oip->oipi_id = propid;
1675 1659                  if (propid >= OVERLAY_DEV_NPROPS) {
1676 1660                          ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(
1677 1661                              oip->oipi_name, phdl);
1678 1662                          overlay_hold_rele(odd);
1679 1663                          mac_perim_exit(mph);
1680 1664                          return (ret);
1681 1665  
1682 1666                  }
1683 1667          } else if (oip->oipi_id >= OVERLAY_DEV_NPROPS) {
1684 1668                  uint_t id = oip->oipi_id - OVERLAY_DEV_NPROPS;
1685 1669  
1686 1670                  if (id >= odd->odd_plugin->ovp_nprops) {
1687 1671                          overlay_hold_rele(odd);
1688 1672                          mac_perim_exit(mph);
1689 1673                          return (EINVAL);
1690 1674                  }
1691 1675                  ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(
1692 1676                      odd->odd_plugin->ovp_props[id], phdl);
1693 1677                  overlay_hold_rele(odd);
1694 1678                  mac_perim_exit(mph);
1695 1679                  return (ret);
1696 1680          } else if (oip->oipi_id < -1) {
1697 1681                  overlay_hold_rele(odd);
1698 1682                  mac_perim_exit(mph);
1699 1683                  return (EINVAL);
1700 1684          } else {
1701 1685                  ASSERT(oip->oipi_id < OVERLAY_DEV_NPROPS);
1702 1686                  ASSERT(oip->oipi_id >= 0);
1703 1687                  propid = oip->oipi_id;
1704 1688                  (void) strlcpy(oip->oipi_name, overlay_dev_props[propid],
1705 1689                      sizeof (oip->oipi_name));
1706 1690          }
1707 1691  
1708 1692          switch (propid) {
1709 1693          case OVERLAY_DEV_P_MTU:
1710 1694                  overlay_i_propinfo_mtu(odd, phdl);
1711 1695                  break;
1712 1696          case OVERLAY_DEV_P_VNETID:
1713 1697                  overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
1714 1698                  overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
1715 1699                  overlay_prop_set_nodefault(phdl);
1716 1700                  break;
1717 1701          case OVERLAY_DEV_P_ENCAP:
1718 1702                  overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
1719 1703                  overlay_prop_set_type(phdl, OVERLAY_PROP_T_STRING);
1720 1704                  overlay_prop_set_nodefault(phdl);
1721 1705                  overlay_plugin_walk(overlay_propinfo_plugin_cb, phdl);
1722 1706                  break;
1723 1707          case OVERLAY_DEV_P_VARPDID:
1724 1708                  overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
1725 1709                  overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
1726 1710                  overlay_prop_set_nodefault(phdl);
1727 1711                  break;
1728 1712          case OVERLAY_DEV_P_DCID:
1729 1713                  overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
1730 1714                  overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
1731 1715                  overlay_prop_set_nodefault(phdl);
1732 1716                  overlay_prop_set_range_uint32(phdl, 0, UINT32_MAX);
1733 1717                  break;
1734 1718          default:
1735 1719                  overlay_hold_rele(odd);
1736 1720                  mac_perim_exit(mph);
1737 1721                  return (ENOENT);
1738 1722          }
1739 1723  
1740 1724          overlay_hold_rele(odd);
1741 1725          mac_perim_exit(mph);
1742 1726          return (0);
1743 1727  }
1744 1728  
1745 1729  /* ARGSUSED */
1746 1730  static int
1747 1731  overlay_i_getprop(void *karg, intptr_t arg, int mode, cred_t *cred,
1748 1732      int *rvalp)
1749 1733  {
1750 1734          int ret;
1751 1735          overlay_dev_t *odd;
1752 1736          mac_perim_handle_t mph;
1753 1737          overlay_ioc_prop_t *oip = karg;
1754 1738          uint_t propid, mtu;
1755 1739  
1756 1740          odd = overlay_hold_by_dlid(oip->oip_linkid);
1757 1741          if (odd == NULL)
1758 1742                  return (ENOENT);
1759 1743  
1760 1744          mac_perim_enter_by_mh(odd->odd_mh, &mph);
1761 1745          oip->oip_size = OVERLAY_PROP_SIZEMAX;
1762 1746          oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0';
1763 1747          if (oip->oip_id == -1) {
1764 1748                  int i;
1765 1749  
1766 1750                  for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
1767 1751                          if (strcmp(overlay_dev_props[i], oip->oip_name) == 0)
1768 1752                                  break;
1769 1753                          if (i == OVERLAY_DEV_NPROPS) {
1770 1754                                  ret = odd->odd_plugin->ovp_ops->ovpo_getprop(
1771 1755                                      odd->odd_pvoid, oip->oip_name,
1772 1756                                      oip->oip_value, &oip->oip_size);
1773 1757                                  overlay_hold_rele(odd);
1774 1758                                  mac_perim_exit(mph);
1775 1759                                  return (ret);
1776 1760                          }
1777 1761                  }
1778 1762  
1779 1763                  propid = i;
1780 1764          } else if (oip->oip_id >= OVERLAY_DEV_NPROPS) {
1781 1765                  uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS;
1782 1766  
1783 1767                  if (id > odd->odd_plugin->ovp_nprops) {
1784 1768                          overlay_hold_rele(odd);
1785 1769                          mac_perim_exit(mph);
1786 1770                          return (EINVAL);
1787 1771                  }
1788 1772                  ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid,
1789 1773                      odd->odd_plugin->ovp_props[id], oip->oip_value,
1790 1774                      &oip->oip_size);
1791 1775                  overlay_hold_rele(odd);
1792 1776                  mac_perim_exit(mph);
1793 1777                  return (ret);
1794 1778          } else if (oip->oip_id < -1) {
1795 1779                  overlay_hold_rele(odd);
1796 1780                  mac_perim_exit(mph);
1797 1781                  return (EINVAL);
1798 1782          } else {
1799 1783                  ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS);
1800 1784                  ASSERT(oip->oip_id >= 0);
1801 1785                  propid = oip->oip_id;
1802 1786          }
1803 1787  
1804 1788          ret = 0;
1805 1789          switch (propid) {
1806 1790          case OVERLAY_DEV_P_MTU:
1807 1791                  /*
1808 1792                   * The MTU is always set and retrieved through MAC, to allow for
1809 1793                   * MAC to do whatever it wants, as really that property belongs
1810 1794                   * to MAC. This is important for things where vnics have hold on
1811 1795                   * the MTU.
1812 1796                   */
1813 1797                  mac_sdu_get(odd->odd_mh, NULL, &mtu);
1814 1798                  bcopy(&mtu, oip->oip_value, sizeof (uint_t));
1815 1799                  oip->oip_size = sizeof (uint_t);
1816 1800                  break;
1817 1801          case OVERLAY_DEV_P_VNETID:
1818 1802                  /*
1819 1803                   * While it's read-only while inside of a mux, we're not in a
1820 1804                   * context that can guarantee that. Therefore we always grab the
1821 1805                   * overlay_dev_t's odd_lock.
1822 1806                   */
1823 1807                  mutex_enter(&odd->odd_lock);
1824 1808                  bcopy(&odd->odd_vid, oip->oip_value, sizeof (uint64_t));
1825 1809                  mutex_exit(&odd->odd_lock);
1826 1810                  oip->oip_size = sizeof (uint64_t);
1827 1811                  break;
1828 1812          case OVERLAY_DEV_P_ENCAP:
1829 1813                  oip->oip_size = strlcpy((char *)oip->oip_value,
1830 1814                      odd->odd_plugin->ovp_name, oip->oip_size);
1831 1815                  break;
1832 1816          case OVERLAY_DEV_P_VARPDID:
1833 1817                  mutex_enter(&odd->odd_lock);
1834 1818                  if (odd->odd_flags & OVERLAY_F_VARPD) {
1835 1819                          const uint64_t val = odd->odd_target->ott_id;
1836 1820                          bcopy(&val, oip->oip_value, sizeof (uint64_t));
1837 1821                          oip->oip_size = sizeof (uint64_t);
1838 1822                  } else {
1839 1823                          oip->oip_size = 0;
1840 1824                  }
1841 1825                  mutex_exit(&odd->odd_lock);
1842 1826                  break;
1843 1827          case OVERLAY_DEV_P_DCID:
1844 1828                  /*
1845 1829                   * While it's read-only while inside of a mux, we're not in a
1846 1830                   * context that can guarantee that. Therefore we always grab the
1847 1831                   * overlay_dev_t's odd_lock.
1848 1832                   */
1849 1833                  mutex_enter(&odd->odd_lock);
1850 1834                  bcopy(&odd->odd_dcid, oip->oip_value, sizeof (uint32_t));
1851 1835                  mutex_exit(&odd->odd_lock);
1852 1836                  oip->oip_size = sizeof (uint32_t);
1853 1837                  break;
1854 1838  
1855 1839          default:
1856 1840                  ret = ENOENT;
1857 1841          }
1858 1842  
1859 1843          overlay_hold_rele(odd);
1860 1844          mac_perim_exit(mph);
1861 1845          return (ret);
1862 1846  }
1863 1847  
1864 1848  static void
1865 1849  overlay_setprop_vnetid(overlay_dev_t *odd, uint64_t vnetid)
1866 1850  {
1867 1851          mutex_enter(&odd->odd_lock);
1868 1852  
1869 1853          /* Simple case, not active */
1870 1854          if (!(odd->odd_flags & OVERLAY_F_IN_MUX)) {
1871 1855                  odd->odd_vid = vnetid;
1872 1856                  mutex_exit(&odd->odd_lock);
1873 1857                  return;
1874 1858          }
1875 1859  
1876 1860          /*
1877 1861           * In the hard case, we need to set the drop flag, quiesce I/O and then
1878 1862           * we can go ahead and do everything.
1879 1863           */
1880 1864          odd->odd_flags |= OVERLAY_F_MDDROP;
1881 1865          overlay_io_wait(odd, OVERLAY_F_IOMASK);
1882 1866          mutex_exit(&odd->odd_lock);
1883 1867  
1884 1868          overlay_mux_remove_dev(odd->odd_mux, odd);
1885 1869          mutex_enter(&odd->odd_lock);
1886 1870          odd->odd_vid = vnetid;
1887 1871          mutex_exit(&odd->odd_lock);
1888 1872          overlay_mux_add_dev(odd->odd_mux, odd);
1889 1873  
1890 1874          mutex_enter(&odd->odd_lock);
1891 1875          ASSERT(odd->odd_flags & OVERLAY_F_IN_MUX);
1892 1876          odd->odd_flags &= ~OVERLAY_F_IN_MUX;
1893 1877          mutex_exit(&odd->odd_lock);
1894 1878  }
1895 1879  
1896 1880  static void
1897 1881  overlay_setprop_dcid(overlay_dev_t *odd, uint32_t dcid)
1898 1882  {
1899 1883          mutex_enter(&odd->odd_lock);
1900 1884  
1901 1885          /* Simple case, not active */
1902 1886          if (!(odd->odd_flags & OVERLAY_F_IN_MUX)) {
1903 1887                  odd->odd_dcid = dcid;
1904 1888                  mutex_exit(&odd->odd_lock);
1905 1889                  return;
1906 1890          }
1907 1891  
1908 1892          /*
1909 1893           * In the hard case, we need to set the drop flag, quiesce I/O and then
1910 1894           * we can go ahead and do everything.
1911 1895           */
1912 1896          odd->odd_flags |= OVERLAY_F_MDDROP;
1913 1897          overlay_io_wait(odd, OVERLAY_F_IOMASK);
1914 1898          mutex_exit(&odd->odd_lock);
1915 1899  
1916 1900          overlay_mux_remove_dev(odd->odd_mux, odd);
1917 1901          mutex_enter(&odd->odd_lock);
1918 1902          odd->odd_dcid = dcid;
1919 1903          mutex_exit(&odd->odd_lock);
1920 1904          overlay_mux_add_dev(odd->odd_mux, odd);
1921 1905  
1922 1906          mutex_enter(&odd->odd_lock);
1923 1907          ASSERT(odd->odd_flags & OVERLAY_F_IN_MUX);
1924 1908          odd->odd_flags &= ~OVERLAY_F_IN_MUX;
1925 1909          mutex_exit(&odd->odd_lock);
1926 1910  }
1927 1911  
1928 1912  /* ARGSUSED */
1929 1913  static int
1930 1914  overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred,
1931 1915      int *rvalp)
1932 1916  {
1933 1917          int ret;
1934 1918          overlay_dev_t *odd;
1935 1919          overlay_ioc_prop_t *oip = karg;
1936 1920          uint_t propid = UINT_MAX;
1937 1921          mac_perim_handle_t mph;
1938 1922          uint64_t maxid, *vidp, *dcidp;
1939 1923  
1940 1924          if (oip->oip_size > OVERLAY_PROP_SIZEMAX)
1941 1925                  return (EINVAL);
1942 1926  
1943 1927          odd = overlay_hold_by_dlid(oip->oip_linkid);
1944 1928          if (odd == NULL)
1945 1929                  return (ENOENT);
1946 1930  
1947 1931          oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0';
1948 1932          mac_perim_enter_by_mh(odd->odd_mh, &mph);
1949 1933          mutex_enter(&odd->odd_lock);
1950 1934          if (odd->odd_flags & OVERLAY_F_ACTIVATED) {
1951 1935                  mac_perim_exit(mph);
1952 1936                  mutex_exit(&odd->odd_lock);
1953 1937                  return (ENOTSUP);
1954 1938          }
1955 1939          mutex_exit(&odd->odd_lock);
1956 1940          if (oip->oip_id == -1) {
1957 1941                  int i;
1958 1942  
1959 1943                  for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
1960 1944                          if (strcmp(overlay_dev_props[i], oip->oip_name) == 0)
1961 1945                                  break;
1962 1946                          if (i == OVERLAY_DEV_NPROPS) {
1963 1947                                  ret = odd->odd_plugin->ovp_ops->ovpo_setprop(
1964 1948                                      odd->odd_pvoid, oip->oip_name,
1965 1949                                      oip->oip_value, oip->oip_size);
1966 1950                                  overlay_hold_rele(odd);
1967 1951                                  mac_perim_exit(mph);
1968 1952                                  return (ret);
1969 1953                          }
1970 1954                  }
1971 1955  
1972 1956                  propid = i;
1973 1957          } else if (oip->oip_id >= OVERLAY_DEV_NPROPS) {
1974 1958                  uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS;
1975 1959  
1976 1960                  if (id > odd->odd_plugin->ovp_nprops) {
1977 1961                          mac_perim_exit(mph);
1978 1962                          overlay_hold_rele(odd);
1979 1963                          return (EINVAL);
1980 1964                  }
1981 1965                  ret = odd->odd_plugin->ovp_ops->ovpo_setprop(odd->odd_pvoid,
1982 1966                      odd->odd_plugin->ovp_props[id], oip->oip_value,
1983 1967                      oip->oip_size);
1984 1968                  mac_perim_exit(mph);
1985 1969                  overlay_hold_rele(odd);
1986 1970                  return (ret);
1987 1971          } else if (oip->oip_id < -1) {
1988 1972                  mac_perim_exit(mph);
1989 1973                  overlay_hold_rele(odd);
1990 1974                  return (EINVAL);
1991 1975          } else {
1992 1976                  ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS);
1993 1977                  ASSERT(oip->oip_id >= 0);
1994 1978                  propid = oip->oip_id;
1995 1979          }
1996 1980  
1997 1981          ret = 0;
1998 1982          switch (propid) {
1999 1983          case OVERLAY_DEV_P_MTU:
2000 1984                  ret = mac_set_prop(odd->odd_mh, MAC_PROP_MTU, "mtu",
2001 1985                      oip->oip_value, oip->oip_size);
2002 1986                  break;
2003 1987          case OVERLAY_DEV_P_VNETID:
2004 1988                  if (oip->oip_size != sizeof (uint64_t)) {
2005 1989                          ret = EINVAL;
2006 1990                          break;
2007 1991                  }
2008 1992                  vidp = (uint64_t *)oip->oip_value;
2009 1993                  ASSERT(odd->odd_plugin->ovp_id_size <= 8);
2010 1994                  maxid = UINT64_MAX;
2011 1995                  if (odd->odd_plugin->ovp_id_size != 8)
2012 1996                          maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) -
2013 1997                              1ULL;
2014 1998                  if (*vidp >= maxid) {
2015 1999                          ret = EINVAL;
2016 2000                          break;
2017 2001                  }
2018 2002                  overlay_setprop_vnetid(odd, *vidp);
2019 2003                  break;
2020 2004          case OVERLAY_DEV_P_ENCAP:
2021 2005          case OVERLAY_DEV_P_VARPDID:
2022 2006                  ret = EPERM;
2023 2007                  break;
2024 2008          case OVERLAY_DEV_P_DCID:
2025 2009                  if (oip->oip_size != sizeof (uint64_t)) {
2026 2010                          ret = EINVAL;
2027 2011                          break;
2028 2012                  }
2029 2013                  dcidp = (uint64_t *)oip->oip_value;
2030 2014                  if (*dcidp > UINT32_MAX) {
2031 2015                          ret = EINVAL;
2032 2016                          break;
2033 2017                  }
2034 2018                  overlay_setprop_dcid(odd, *dcidp);
2035 2019                  break;
2036 2020  
2037 2021          default:
2038 2022                  ret = ENOENT;
2039 2023          }
2040 2024  
2041 2025          mac_perim_exit(mph);
2042 2026          overlay_hold_rele(odd);
2043 2027          return (ret);
2044 2028  }
2045 2029  
2046 2030  /* ARGSUSED */
2047 2031  static int
2048 2032  overlay_i_status(void *karg, intptr_t arg, int mode, cred_t *cred,
2049 2033      int *rvalp)
2050 2034  {
2051 2035          overlay_dev_t *odd;
2052 2036          overlay_ioc_status_t *os = karg;
2053 2037  
2054 2038          odd = overlay_hold_by_dlid(os->ois_linkid);
2055 2039          if (odd == NULL)
2056 2040                  return (ENOENT);
2057 2041  
2058 2042          mutex_enter(&odd->odd_lock);
2059 2043          if ((odd->odd_flags & OVERLAY_F_DEGRADED) != 0) {
2060 2044                  os->ois_status = OVERLAY_I_DEGRADED;
2061 2045                  if (odd->odd_fmamsg != NULL) {
2062 2046                          (void) strlcpy(os->ois_message, odd->odd_fmamsg,
2063 2047                              OVERLAY_STATUS_BUFLEN);
2064 2048                  } else {
2065 2049                          os->ois_message[0] = '\0';
2066 2050                  }
2067 2051  
2068 2052          } else {
2069 2053                  os->ois_status = OVERLAY_I_OK;
2070 2054                  os->ois_message[0] = '\0';
2071 2055          }
2072 2056          mutex_exit(&odd->odd_lock);
2073 2057          overlay_hold_rele(odd);
2074 2058  
2075 2059          return (0);
2076 2060  }
2077 2061  
/*
 * Table of overlay ioctls handed to dld_ioc_register() by overlay_attach().
 * Each entry lists, in order: the ioctl command, the DLDCOPYIN/DLDCOPYOUT
 * flags, the size of the argument structure, the handler function, and a
 * final function pointer that is secpolicy_dl_config for every command except
 * OVERLAY_IOC_STATUS, where it is NULL (presumably the privilege check applied
 * before the handler runs -- confirm against dld_ioc_info_t).
 */
2078 2062  static dld_ioc_info_t overlay_ioc_list[] = {
2079 2063          { OVERLAY_IOC_CREATE, DLDCOPYIN, sizeof (overlay_ioc_create_t),
2080 2064                  overlay_i_create, secpolicy_dl_config },
2081 2065          { OVERLAY_IOC_ACTIVATE, DLDCOPYIN, sizeof (overlay_ioc_activate_t),
2082 2066                  overlay_i_activate, secpolicy_dl_config },
2083 2067          { OVERLAY_IOC_DELETE, DLDCOPYIN, sizeof (overlay_ioc_delete_t),
2084 2068                  overlay_i_delete, secpolicy_dl_config },
2085 2069          { OVERLAY_IOC_PROPINFO, DLDCOPYIN | DLDCOPYOUT,
2086 2070                  sizeof (overlay_ioc_propinfo_t), overlay_i_propinfo,
2087 2071                  secpolicy_dl_config },
2088 2072          { OVERLAY_IOC_GETPROP, DLDCOPYIN | DLDCOPYOUT,
2089 2073                  sizeof (overlay_ioc_prop_t), overlay_i_getprop,
2090 2074                  secpolicy_dl_config },
2091 2075          { OVERLAY_IOC_SETPROP, DLDCOPYIN,
2092 2076                  sizeof (overlay_ioc_prop_t), overlay_i_setprop,
2093 2077                  secpolicy_dl_config },
2094 2078          { OVERLAY_IOC_NPROPS, DLDCOPYIN | DLDCOPYOUT,
2095 2079                  sizeof (overlay_ioc_nprops_t), overlay_i_nprops,
2096 2080                  secpolicy_dl_config },
2097 2081          { OVERLAY_IOC_STATUS, DLDCOPYIN | DLDCOPYOUT,
2098 2082                  sizeof (overlay_ioc_status_t), overlay_i_status,
2099 2083                  NULL }
2100 2084  };
2101 2085  
2102 2086  static int
2103 2087  overlay_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2104 2088  {
2105 2089          int fmcap = DDI_FM_EREPORT_CAPABLE;
2106 2090          if (cmd != DDI_ATTACH)
2107 2091                  return (DDI_FAILURE);
2108 2092  
2109 2093          if (overlay_dip != NULL || ddi_get_instance(dip) != 0)
2110 2094                  return (DDI_FAILURE);
2111 2095  
2112 2096          ddi_fm_init(dip, &fmcap, NULL);
2113 2097  
2114 2098          if (ddi_create_minor_node(dip, OVERLAY_CTL, S_IFCHR,
2115 2099              ddi_get_instance(dip), DDI_PSEUDO, 0) == DDI_FAILURE)
2116 2100                  return (DDI_FAILURE);
2117 2101  
2118 2102          if (dld_ioc_register(OVERLAY_IOC, overlay_ioc_list,
2119 2103              DLDIOCCNT(overlay_ioc_list)) != 0) {
2120 2104                  ddi_remove_minor_node(dip, OVERLAY_CTL);
2121 2105                  return (DDI_FAILURE);
2122 2106          }
2123 2107  
2124 2108          overlay_dip = dip;
2125 2109          return (DDI_SUCCESS);
2126 2110  }
2127 2111  
2128 2112  /* ARGSUSED */
2129 2113  static int
2130 2114  overlay_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
2131 2115  {
2132 2116          int error;
2133 2117  
2134 2118          switch (cmd) {
2135 2119          case DDI_INFO_DEVT2DEVINFO:
2136 2120                  *resp = (void *)overlay_dip;
2137 2121                  error = DDI_SUCCESS;
2138 2122                  break;
2139 2123          case DDI_INFO_DEVT2INSTANCE:
2140 2124                  *resp = (void *)0;
2141 2125                  error = DDI_SUCCESS;
2142 2126                  break;
2143 2127          default:
2144 2128                  error = DDI_FAILURE;
2145 2129                  break;
2146 2130          }
2147 2131  
2148 2132          return (error);
2149 2133  }
2150 2134  
2151 2135  static int
2152 2136  overlay_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2153 2137  {
2154 2138          if (cmd != DDI_DETACH)
2155 2139                  return (DDI_FAILURE);
2156 2140  
2157 2141          mutex_enter(&overlay_dev_lock);
2158 2142          if (!list_is_empty(&overlay_dev_list) || overlay_target_busy()) {
2159 2143                  mutex_exit(&overlay_dev_lock);
2160 2144                  return (EBUSY);
2161 2145          }
  
    | 
      ↓ open down ↓ | 
    1042 lines elided | 
    
      ↑ open up ↑ | 
  
2162 2146          mutex_exit(&overlay_dev_lock);
2163 2147  
2164 2148  
2165 2149          dld_ioc_unregister(OVERLAY_IOC);
2166 2150          ddi_remove_minor_node(dip, OVERLAY_CTL);
2167 2151          ddi_fm_fini(dip);
2168 2152          overlay_dip = NULL;
2169 2153          return (DDI_SUCCESS);
2170 2154  }
2171 2155  
     2156 +#define OVERLAY_IOCTL_MASK      0xffffff00
     2157 +/* ARGSUSED */
     2158 +static int
     2159 +overlay_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
     2160 +    int *rvalp)
     2161 +{
     2162 +        switch (cmd & OVERLAY_IOCTL_MASK) {
     2163 +        case OVERLAY_TARG_IOCTL:
     2164 +                return (overlay_target_ioctl(dev, cmd, arg, mode, credp,
     2165 +                    rvalp));
     2166 +        case OVERLAY_ROUTER_IOCTL:
     2167 +                return (overlay_router_ioctl(dev, cmd, arg, mode, credp,
     2168 +                    rvalp));
     2169 +        default:
     2170 +                return (ENOTTY);
     2171 +        }
     2172 +}
     2173 +
/*
 * Character device entry points for the overlay control node, referenced from
 * overlay_dev_ops. Only open, close and ioctl are implemented
 * (overlay_target_open, overlay_target_close and overlay_ioctl); every other
 * entry point is nodev, with nochpoll for chpoll and ddi_prop_op for prop_op.
 */
2172 2174  static struct cb_ops overlay_cbops = {
2173 2175          overlay_target_open,    /* cb_open */
2174 2176          overlay_target_close,   /* cb_close */
2175 2177          nodev,                  /* cb_strategy */
2176 2178          nodev,                  /* cb_print */
2177 2179          nodev,                  /* cb_dump */
2178 2180          nodev,                  /* cb_read */
2179 2181          nodev,                  /* cb_write */
2180      -        overlay_target_ioctl,   /* cb_ioctl */
     2182 +        overlay_ioctl,          /* cb_ioctl */
2181 2183          nodev,                  /* cb_devmap */
2182 2184          nodev,                  /* cb_mmap */
2183 2185          nodev,                  /* cb_segmap */
2184 2186          nochpoll,               /* cb_chpoll */
2185 2187          ddi_prop_op,            /* cb_prop_op */
2186 2188          NULL,                   /* cb_stream */
2187 2189          D_MP,                   /* cb_flag */
2188 2190          CB_REV,                 /* cb_rev */
2189 2191          nodev,                  /* cb_aread */
2190 2192          nodev,                  /* cb_awrite */
2191 2193  };
2192 2194  
/*
 * Driver operations vector, referenced from overlay_modldrv. It supplies
 * overlay_getinfo, overlay_attach and overlay_detach, points at overlay_cbops
 * for the character device entry points, has no bus or power operations, and
 * declares quiesce as unsupported.
 */
2193 2195  static struct dev_ops overlay_dev_ops = {
2194 2196          DEVO_REV,               /* devo_rev */
2195 2197          0,                      /* devo_refcnt */
2196 2198          overlay_getinfo,        /* devo_getinfo */
2197 2199          nulldev,                /* devo_identify */
2198 2200          nulldev,                /* devo_probe */
2199 2201          overlay_attach,         /* devo_attach */
2200 2202          overlay_detach,         /* devo_detach */
2201 2203          nulldev,                /* devo_reset */
2202 2204          &overlay_cbops,         /* devo_cb_ops */
2203 2205          NULL,                   /* devo_bus_ops */
2204 2206          NULL,                   /* devo_power */
2205 2207          ddi_quiesce_not_supported       /* devo_quiesce */
2206 2208  };
2207 2209  
/*
 * Loadable-driver description: ties the mod_driverops module type and the
 * human-readable name "Overlay Network Driver" to overlay_dev_ops. Referenced
 * from overlay_linkage.
 */
2208 2210  static struct modldrv overlay_modldrv = {
2209 2211          &mod_driverops,
2210 2212          "Overlay Network Driver",
2211 2213          &overlay_dev_ops
2212 2214  };
2213 2215  
/*
 * Module linkage passed to mod_install(), mod_info() and mod_remove() by
 * _init(), _info() and _fini(). It contains the single overlay_modldrv entry.
 */
2214 2216  static struct modlinkage overlay_linkage = {
2215 2217          MODREV_1,
2216 2218          &overlay_modldrv
2217 2219  };
2218 2220  
2219 2221  static int
2220 2222  overlay_init(void)
2221 2223  {
2222 2224          mutex_init(&overlay_dev_lock, NULL, MUTEX_DRIVER, NULL);
2223 2225          list_create(&overlay_dev_list, sizeof (overlay_dev_t),
2224 2226              offsetof(overlay_dev_t, odd_link));
2225 2227          overlay_mux_init();
2226 2228          overlay_plugin_init();
2227 2229          overlay_target_init();
2228 2230  
2229 2231          return (DDI_SUCCESS);
2230 2232  }
2231 2233  
2232 2234  static void
2233 2235  overlay_fini(void)
2234 2236  {
2235 2237          overlay_target_fini();
2236 2238          overlay_plugin_fini();
2237 2239          overlay_mux_fini();
2238 2240          mutex_destroy(&overlay_dev_lock);
2239 2241          list_destroy(&overlay_dev_list);
2240 2242  }
2241 2243  
2242 2244  int
2243 2245  _init(void)
2244 2246  {
2245 2247          int err;
2246 2248  
2247 2249          if ((err = overlay_init()) != DDI_SUCCESS)
2248 2250                  return (err);
2249 2251  
2250 2252          mac_init_ops(NULL, "overlay");
2251 2253          err = mod_install(&overlay_linkage);
2252 2254          if (err != DDI_SUCCESS) {
2253 2255                  overlay_fini();
2254 2256                  return (err);
2255 2257          }
2256 2258  
2257 2259          return (0);
2258 2260  }
2259 2261  
2260 2262  int
2261 2263  _info(struct modinfo *modinfop)
2262 2264  {
2263 2265          return (mod_info(&overlay_linkage, modinfop));
2264 2266  }
2265 2267  
2266 2268  int
2267 2269  _fini(void)
2268 2270  {
2269 2271          int err;
2270 2272  
2271 2273          err = mod_remove(&overlay_linkage);
2272 2274          if (err != 0)
2273 2275                  return (err);
2274 2276  
2275 2277          overlay_fini();
2276 2278          return (0);
2277 2279  }
  
    | 
      ↓ open down ↓ | 
    87 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX