Print this page
Overlay fabric router
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/io/overlay/overlay.c
+++ new/usr/src/uts/common/io/overlay/overlay.c
1 1 /*
2 2 * This file and its contents are supplied under the terms of the
3 3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 4 * You may only use this file in accordance with the terms of version
5 5 * 1.0 of the CDDL.
6 6 *
7 7 * A full copy of the text of the CDDL should have accompanied this
8 8 * source. A copy of the CDDL is also available via the Internet at
9 9 * http://www.illumos.org/license/CDDL.
10 10 */
11 11
12 12 /*
13 13 * Copyright 2016 Joyent, Inc.
14 14 */
15 15
16 16 /*
17 17 * Overlay Devices
18 18 *
19 19 * Overlay devices provide a means for creating overlay networks, a means of
20 20 * multiplexing multiple logical, isolated, and discrete layer two and layer
21 21 * three networks on top of one physical network.
22 22 *
23 23 * In general, these overlay devices encapsulate the logic to answer two
24 24 * different questions:
25 25 *
26 26 * 1) How should I transform a packet to put it on the wire?
27 27 * 2) Where should I send a transformed packet?
28 28 *
29 29 * Each overlay device is presented to the user as a GLDv3 device. While the
30 30 * link itself cannot have an IP interface created on top of it, it allows for
31 31 * additional GLDv3 devices, such as a VNIC, to be created on top of it which
32 32 * can be plumbed up with IP interfaces.
33 33 *
34 34 *
35 35 * --------------------
36 36 * General Architecture
37 37 * --------------------
38 38 *
39 39 * The logical overlay device that a user sees in dladm(1M) is a combination of
40 40 * two different components that work together. The first component is this
41 41 * kernel module, which is responsible for answering question one -- how should
42 42 * I transform a packet to put it on the wire.
43 43 *
44 44 * The second component is what we call the virtual ARP daemon, or varpd. It is
45 45 * a userland component that is responsible for answering the second question --
46 46 * Where should I send a transformed packet. Instances of the kernel overlay
47 47 * GLDv3 device ask varpd the question of where should a packet go.
48 48 *
49 49 * The split was done for a few reasons. Importantly, we wanted to keep the act
50 50 * of generating encapsulated packets in the kernel so as to ensure that the
51 51 * general data path was fast and also kept simple. On the flip side, while the
52 52 * question of where should something go may be simple, it may often be
53 53 * complicated and need to interface with several different external or
54 54 * distributed systems. In those cases, it's simpler to allow for the full
55 55 * flexibility of userland to be brought to bear to solve that problem and in
56 56 * general, the path isn't very common.
57 57 *
58 58 * The following is what makes up the logical overlay device that a user would
59 59 * create with dladm(1M).
60 60 *
61 61 * Kernel Userland
62 62 * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
63 63 * . +--------+ +--------+ +--------+ . . .
64 64 * . | VNIC 0 | | VNIC 1 | | VNIC 2 | . . .
65 65 * . +--------+ +--------+ +--------+ . . .
66 66 * . | | | . . .
67 67 * . | | | . . .
68 68 * . +------------+-----------+ . . .
69 69 * . | . . /dev/overlay .
70 70 * . +--------------+ . . . +------------+ .
71 71 * . | | . . . | | .
72 72 * . | Overlay |======*=================| Virtual | .
73 73 * . | GLDv3 Device |========================| ARP Daemon | .
74 74 * . | | . . | | .
75 75 * . +--------------+ . . +------------+ .
76 76 * . | . . | .
77 77 * . | . . | .
78 78 * . +----------------+ . . +--------+ .
79 79 * . | Overlay | . . | varpd | .
80 80 * . | Encapsulation | . . | Lookup | .
81 81 * . | Plugin | . . | Plugin | .
82 82 * . +----------------+ . . +--------+ .
83 83 * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
84 84 *
85 85 *
86 86 * This image shows the two different components and where they live.
87 87 * Importantly, it also shows that both the kernel overlay device and the
88 88 * userland varpd both support plugins. The plugins actually implement the
89 89 * things that users care about and the APIs have been designed to try to
90 90 * minimize the amount of things that a module writer needs to worry about.
91 91 *
92 92 * IDENTIFIERS
93 93 *
94 94 * Every overlay device is defined by a unique identifier which is the overlay
95 95 * identifier. Its purpose is similar to that of a VLAN identifier, it's a
96 96 * unique number that is used to differentiate between different entries on the
97 97 * wire.
98 98 *
99 99 * ENCAPSULATION
100 100 *
101 101 * An overlay encapsulation plugin is a kernel miscellaneous module whose
102 102 * purpose is to contain knowledge about how to transform packets to put them
103 103 * onto the wire and to take them off. An example of an encapsulation plugin is
104 104 * vxlan. It's also how support for things like nvgre or geneve would be brought
105 105 * into the system.
106 106 *
107 107 * Each encapsulation plugin defines a series of operation vectors and
108 108 * properties. For the full details on everything they should provide, please
109 109 * read uts/common/sys/overlay_plugin.h. The encapsulation plugin is responsible
110 110 * for telling the system what information is required to send a packet. For
111 111 * example, vxlan is defined to send everything over a UDP packet and therefore
112 112 * requires a port and an IP address, while nvgre on the other hand is its own
113 113 * IP type and therefore just requires an IP address. In addition, it also
114 114 * provides information about the kind of socket that should be created. This is
115 115 * used by the kernel multiplexor, more of that in the Kernel Components
116 116 * section.
117 117 *
118 118 * LOOKUPS
119 119 *
120 120 * The kernel communicates requests for lookups over the character device
121 121 * /dev/overlay. varpd is responsible for listening for requests on that device
122 122 * and answering them. The character device is specific to the target path and
123 123 * varpd.
124 124 *
125 125 * Much as the kernel overlay module handles the bulk of the scaffolding but
126 126 * leaves the important work to the encapsulation plugin, varpd provides a
127 127 * similar role and leaves the full brunt of lookups to a userland dynamic
128 128 * shared object which implements the logic of lookups.
129 129 *
130 130 * Each lookup plugin defines a series of operation vectors and properties. For
131 131 * the full details on everything that they should provide, please read
132 132 * lib/varpd/libvarpd/libvarpd_provider.h. Essentially, they are given a MAC
133 133 * address and asked to give an address on the physical network that it should
134 134 * be sent to. In addition, they handle questions related to how to handle
135 135 * things like broadcast and multicast traffic, etc.
136 136 *
137 137 * ----------
138 138 * Properties
139 139 * ----------
140 140 *
141 141 * A device from a dladm perspective has a unique set of properties that are
142 142 * combined from three different sources:
143 143 *
144 144 * 1) Generic properties that every overlay device has
145 145 * 2) Properties that are specific to the encapsulation plugin
146 146 * 3) Properties that are specific to the lookup plugin
147 147 *
148 148 * All of these are exposed in a single set of properties in dladm. Note that
149 149 * these are not necessarily traditional link properties. However, if something
150 150 * is both a traditional GLDv3 link property, say the MTU of a device, and a
151 151 * specific property here, then the driver ensures that all existing GLDv3
152 152 * specific means of manipulating it are used and wraps up its private property
153 153 * interfaces to ensure that works.
154 154 *
155 155 * Properties in the second and third category are prefixed with the name of
156 156 * their module. For example, the vxlan encapsulation module has a property
157 157 * called the 'listen_ip'. This property would show up in dladm as
158 158 * 'vxlan/listen_ip'. This allows different plugins to both use similar names
159 159 * for similar properties and to also have independent name spaces so that
160 160 * overlapping names do not conflict with anything else.
161 161 *
162 162 * While the kernel combines both sets one and two into a single coherent view,
163 163 * it does not do anything with respect to the properties that are owned by the
164 164 * lookup plugin -- those are owned wholly by varpd. Instead, libdladm is in
165 165 * charge of bridging these two worlds into one magical experience for the user.
166 166 * It carries the burden of knowing about both overlay specific and varpd
167 167 * specific properties. Importantly, we want to maintain this distinction. We
168 168 * don't want to treat the kernel as an arbitrary key/value store for varpd and
169 169 * we want the kernel to own its own data and not have to ask userland for
170 170 * information that it owns.
171 171 *
172 172 * Every property in the system has the following attributes:
173 173 *
174 174 * o A name
175 175 * o A type
176 176 * o A size
177 177 * o Permissions
178 178 * o Default value
179 179 * o Valid value ranges
180 180 * o A value
181 181 *
182 182 * Everything except for the value is obtained by callers through the propinfo
183 183 * callbacks and a property has a maximum size of OVERLAY_PROP_SIZEMAX,
184 184 * currently 256 bytes.
185 185 *
186 186 * The following are the supported types of properties:
187 187 *
188 188 * OVERLAY_PROP_T_INT
189 189 *
190 190 * A signed integer, its length is 8 bytes, corresponding to a
191 191 * int64_t.
192 192 *
193 193 * OVERLAY_PROP_T_UINT
194 194 *
195 195 * An unsigned integer, its length is 8 bytes, corresponding to a
196 196 * uint64_t.
197 197 *
198 198 * OVERLAY_PROP_T_IP
199 199 *
200 200 * A struct in6_addr, it has a fixed size.
201 201 *
202 202 * OVERLAY_PROP_T_STRING
203 203 *
204 204 * A null-terminated character string encoded in either ASCII or
205 205 * UTF-8. Note that the size of the string includes the null
206 206 * terminator.
207 207 *
208 208 * OVERLAY_PROP_T_ETHER
209 209 *
210 210 * An ether_addr_t, which has a fixed size.
211 211 *
212 212 * The next thing that we apply to a property is its permission. The permissions
213 213 * are put together by the bitwise or of the following flags and values.
214 214 *
215 215 * OVERLAY_PROP_PERM_REQ
216 216 *
217 217 * This indicates a required property. A property that is required
218 218 * must be set by a consumer before the device can be created. If a
219 219 * required property has a default property, this constraint is
220 220 * loosened because the default property defines the value.
221 221 *
222 222 * OVERLAY_PROP_PERM_READ
223 223 *
224 224 * This indicates that a property can be read. All properties will
225 225 * have this value set.
226 226 *
227 227 * OVERLAY_PROP_PERM_WRITE
228 228 *
229 229 * This indicates that a property can be written to and thus
230 230 * updated by userland. Properties that are only intended to
231 231 * display information, will not have OVERLAY_PROP_PERM_WRITE set.
232 232 *
233 233 * In addition, a few additional values are defined as a convenience to
234 234 * consumers. The first, OVERLAY_PROP_PERM_RW, is a combination of
235 235 * OVERLAY_PROP_PERM_READ and OVERLAY_PROP_PERM_WRITE. The second,
236 236 * OVERLAY_PROP_PERM_RRW, is a combination of OVERLAY_PROP_PERM_REQ,
237 237 * OVERLAY_PROP_PERM_READ, and OVERLAY_PROP_PERM_WRITE. The protection mode of a
238 238 * property should generally be a constant across its lifetime.
239 239 *
240 240 * A property may optionally have a default value. If it does have a default
241 241 * value, and that property is not set to be a different value, then the default
242 242 * value is inherited automatically. It also means that if the default value is
243 243 * acceptable, there is no need to set the value for a required property. For
244 244 * example, the vxlan module has the vxlan/listen_port property which is
245 245 * required, but has a default value of 4789 (the IANA assigned port). Because
246 246 * of that default value, there is no need for it to be set.
247 247 *
248 248 * Finally, a property may declare a list of valid values. These valid values
249 249 * are used for display purposes, they are not enforced by the broader system,
250 250 * but merely allow a means for the information to be communicated to the user
251 251 * through dladm(1M). Like a default value, this is optional.
252 252 *
253 253 * The general scaffolding does not do very much with respect to the getting and
254 254 * setting of properties. That is really owned by the individual plugins
255 255 * themselves.
256 256 *
257 257 * -----------------------------
258 258 * Destinations and Plugin Types
259 259 * -----------------------------
260 260 *
261 261 * Both encapsulation and lookup plugins define the kinds of destinations that
262 262 * they know how to support. There are three different pieces of information
263 263 * that can be used to address to a destination currently, all of which is
264 264 * summarized in the type overlay_point_t. Any combination of these is
265 265 * supported.
266 266 *
267 267 * OVERLAY_PLUGIN_D_ETHERNET
268 268 *
269 269 * An Ethernet MAC address is required.
270 270 *
271 271 * OVERLAY_PLUGIN_D_IP
272 272 *
273 273 * An IP address is required. All IP addresses used by the overlay
274 274 * system are transmitted as IPv6 addresses. IPv4 addresses can be
275 275 * represented by using IPv4-mapped IPv6 addresses.
276 276 *
277 277 * OVERLAY_PLUGIN_D_PORT
278 278 *
279 279 * A TCP/UDP port is required.
280 280 *
281 281 * A kernel encapsulation plugin declares which of these that it requires, it's
282 282 * a static set. On the other hand, a userland lookup plugin can be built to
283 283 * support all of these or any combination thereof. It gets passed the required
284 284 * destination type, based on the kernel encapsulation method, and then it makes
285 285 * the determination as to whether or not it supports it. For example, the
286 286 * direct plugin can support either an IP or both an IP and a port, it simply
287 287 * doesn't display the direct/dest_port property in the cases where a port is
288 288 * not required to support this.
289 289 *
290 290 * The user lookup plugins have two different modes of operation which
291 291 * determines how they interact with the broader system and how look ups are
292 292 * performed. These types are:
293 293 *
294 294 * OVERLAY_TARGET_POINT
295 295 *
296 296 * A point to point plugin has a single static definition for where
297 297 * to send all traffic. Every packet in the system always gets sent
298 298 * to the exact same destination which is programmed into the
299 299 * kernel when the general device is activated.
300 300 *
301 301 * OVERLAY_TARGET_DYNAMIC
302 302 *
303 303 * A dynamic plugin does not have a single static definition.
304 304 * Instead, for each destination, the kernel makes an asynchronous
305 305 * request to varpd to determine where the packet should be routed,
306 306 * and if a specific destination is found, then that destination is
307 307 * cached in the overlay device's target cache.
308 308 *
309 309 * This distinction, while important for the general overlay device's operation,
310 310 * is not important to the encapsulation plugins. They don't need to know about
311 311 * any of these pieces. It's just a concern for varpd, the userland plugin, and
312 312 * the general overlay scaffolding.
313 313 *
314 314 * When an overlay device is set to OVERLAY_TARGET_POINT, then it does not
315 315 * maintain a target cache, and instead just keeps track of the destination and
316 316 * always sends encapsulated packets to that address. When the target type is of
317 317 * OVERLAY_TARGET_DYNAMIC, then the kernel maintains a cache of all such
318 318 * destinations. These destinations are kept around in an instance of a
319 319 * reference hash that is specific to the given overlay device. Entries in the
320 320 * cache can be invalidated and replaced by varpd and its lookup plugins.
321 321 *
322 322 * ----------------------------------
323 323 * Kernel Components and Architecture
324 324 * ----------------------------------
325 325 *
326 326 * There are multiple pieces inside the kernel that work together, there is the
327 327 * general overlay_dev_t structure, which is the logical GLDv3 device, but it
328 328 * itself has references to things like an instance of an encapsulation plugin,
329 329 * a pointer to a mux and a target cache. It can roughly be summarized in the
330 330 * following image:
331 331 *
332 332 * +------------------+
333 333 * | global |
334 334 * | overlay list |
335 335 * | overlay_dev_list |
336 336 * +------------------+
337 337 * |
338 338 * | +-----------------------+ +---------------+
339 339 * +->| GLDv3 Device |----------->| GLDv3 Device | -> ...
340 340 * | overlay_dev_t | | overlay_dev_t |
341 341 * | | +---------------+
342 342 * | |
343 343 * | mac_handle_t -----+---> GLDv3 handle to MAC
344 344 * | datalink_id_t -----+---> Datalink ID used by DLS
345 345 * | overlay_dev_flag_t ---+---> Device state
346 346 * | uint_t -----+---> Current device MTU
347 347 * | uint_t -----+---> In-progress RX operations
348 348 * | uint_t -----+---> In-progress TX operations
349 349 * | char[] -----+---> FMA degraded message
350 350 * | void * -----+---> plugin private data
351 351 * | overlay_target_t * ---+---------------------+
352 352 * | overlay_plugin_t * ---+---------+ |
353 353 * +-----------------------+ | |
354 354 * ^ | |
355 355 * +--------------------+ | | |
356 356 * | Kernel Socket | | | |
357 357 * | Multiplexor | | | |
358 358 * | overlay_mux_t | | | |
359 359 * | | | | |
360 360 * | avl_tree_t -+--+ | |
361 361 * | uint_t -+--> socket family | |
362 362 * | uint_t -+--> socket type | |
363 363 * | uint_t -+--> socket protocol | |
364 364 * | ksocket_t -+--> I/O socket | |
365 365 * | struct sockaddr * -+--> ksocket address | |
366 366 * | overlay_plugin_t --+--------+ | |
367 367 * +--------------------+ | | |
368 368 * | | |
369 369 * +-------------------------+ | | |
370 370 * | Encap Plugin |<--+-----------+ |
371 371 * | overlay_plugin_t | |
372 372 * | | |
373 373 * | char * ---+--> plugin name |
374 374 * | overlay_plugin_ops_t * -+--> plugin downcalls |
375 375 * | char ** (props) ---+--> property list |
376 376 * | uint_t ---+--> id length |
377 377 * | overlay_plugin_flags_t -+--> plugin flags |
378 378 * | overlay_plugin_dest_t --+--> destination type v
379 379 * +-------------------------+ +-------------------------+
380 380 * | Target Cache |
381 381 * | overlay_target_t |
382 382 * | |
383 383 * cache mode <--+- overlay_target_mode_t |
384 384 * dest type <--+- overlay_plugin_dest_t |
385 385 * cache flags <--+- overlay_target_flag_t |
386 386 * varpd id <--+- uint64_t |
387 387 * outstanding varpd reqs. <--+- uint_t |
388 388 * OVERLAY_TARGET_POINT state <--+- overlay_target_point_t |
389 389 * OVERLAY_TARGET_DYNAMIC state <-+---+- overlay_target_dyn_t |
390 390 * | +-------------------------+
391 391 * +-----------------------+
392 392 * |
393 393 * v
394 394 * +-------------------------------+ +------------------------+
395 395 * | Target Entry |-->| Target Entry |--> ...
396 396 * | overlay_target_entry_t | | overlay_target_entry_t |
397 397 * | | +------------------------+
398 398 * | |
399 399 * | overlay_target_entry_flags_t -+--> Entry flags
400 400 * | uint8_t[ETHERADDRL] ---+--> Target MAC address
401 401 * | overlay_target_point_t ---+--> Target underlay address
402 402 * | mblk_t * ---+--> outstanding mblk head
403 403 * | mblk_t * ---+--> outstanding mblk tail
404 404 * | size_t ---+--> outstanding mblk size
405 405 * +-------------------------------+
406 406 *
407 407 * The primary entries that we care about are the overlay_dev_t, which
408 408 * correspond to each overlay device that is created with dladm(1M). Globally,
409 409 * these devices are maintained in a simple list_t which is protected with a
410 410 * lock. Hence, these include important information such as the mac_handle_t
411 411 * and a datalink_id_t which is used to interact with the broader MAC and DLS
412 412 * ecosystem. We also maintain additional information such as the current state,
413 413 * outstanding operations, the mtu, and importantly, the plugin's private data.
414 414 * This is the instance of an encapsulation plugin that gets created as part of
415 415 * creating an overlay device. Another aspect of this is that the overlay_dev_t
416 416 * also includes information with respect to FMA. For more information, see the
417 417 * FMA section.
418 418 *
419 419 * Each overlay_dev_t has a pointer to a plugin, a mux, and a target. The plugin
420 420 * is the encapsulation plugin. This allows the device to make downcalls into it
421 421 * based on doing things like getting and setting properties. Otherwise, the
422 422 * plugin itself is a fairly straightforward entity. They are maintained in an
423 423 * (not pictured above) list. The plugins themselves mostly maintain things like
424 424 * the static list of properties, what kind of destination they require, and the
425 425 * operations vector. A given module may contain more if necessary.
426 426 *
427 427 * The next piece of the puzzle is the mux, or a multiplexor. The mux itself
428 428 * maintains a ksocket and it is through the mux that we send and receive
429 429 * message blocks. The mux represents a socket type and address, as well as a
430 430 * plugin. Multiple overlay_dev_t devices may then share the same mux. For
431 431 * example, consider the case where you have different instances of vxlan all on
432 432 * the same underlay network. These would all logically share the same IP
433 433 * address and port that packets are sent and received on; however, what differs
434 434 * is the decapsulation ID.
435 435 *
436 436 * Each mux maintains a ksocket_t which is similar to a socket(3SOCKET). Unlike
437 437 * a socket, we enable a direct callback on the ksocket. This means that
438 438 * whenever a message block chain is received, rather than sitting there and
439 439 * getting a callback in a context and kicking that back out to a taskq, the
440 440 * data instead comes into the callback function overlay_mux_recv().
441 441 *
442 442 * The mux is given encapsulated packets (via overlay_m_tx, the GLDv3 tx
443 443 * function) to transmit. It receives encapsulated packets, decapsulates them to
444 444 * determine the overlay identifier, looks up the given device that matches that
445 445 * identifier, and then causes the broader MAC world to receive the packet with
446 446 * a call to mac_rx().
447 447 *
448 448 * Today, we don't do too much that's special with the ksocket; however, as
449 449 * hardware is gaining understanding for these encapsulation protocols, we'll
450 450 * probably want to think of better ways to get those capabilities passed down
451 451 * and potentially better ways to program receive filters so they get directly
452 452 * to us. Though, that's all fantasy future land.
453 453 *
454 454 * The next part of the puzzle is the target cache. The purpose of the target
455 455 * cache is to cache where we should send a packet on the underlay network,
456 456 * given its mac address. The target cache operates in two modes depending on
457 457 * whether the lookup module was declared to OVERLAY_TARGET_POINT or
458 458 * OVERLAY_TARGET_DYNAMIC.
459 459 *
460 460 * In the case where the target cache has been programmed to be
461 461 * OVERLAY_TARGET_POINT, then we only maintain a single overlay_target_point_t
462 462 * which has the destination that we send everything, no matter the destination
463 463 * mac address.
464 464 *
465 465 * On the other hand, when we have an instance of OVERLAY_TARGET_DYNAMIC, things
466 466 * are much more interesting and as a result, more complicated. We primarily
467 467 * store lists of overlay_target_entry_t's which are stored in both an avl tree
468 468 * and a refhash_t. The primary look up path uses the refhash_t and the avl tree
469 469 * is only used for a few of the target ioctls used to dump data such that we
470 470 * can get a consistent iteration order for things like dladm show-overlay -t.
471 471 * The key that we use for the reference hashtable is based on the mac address
472 472 * in the cache and currently we just do a simple CRC32 to transform it into a
473 473 * hash.
474 474 *
475 475 * Each entry maintains a set of flags to indicate the current status of the
476 476 * request. The flags may indicate one of three states: that current cache entry
477 477 * is valid, that the current cache entry has been directed to drop all output,
478 478 * and that the current cache entry is invalid and may be being looked up. In
479 479 * the case where it's valid, we just take the destination address and run with
480 480 * it.
481 481 *
482 482 * If it's invalid and a lookup has not been made, then we start the process
483 483 * that prepares a query that will make its way up to varpd. The cache entry
484 484 * maintains a message block chain of outstanding message blocks and a
485 485 * size. These lists are populated only when we don't know the answer as to
486 486 * where should these be sent. The size entry is used to cap the amount of
487 487 * outstanding data that we don't know the answer to. If we exceed a cap on the
488 488 * amount of outstanding data (currently 1 Mb), then we'll drop any additional
489 489 * packets. Once we get an answer indicating a valid destination, we transmit
490 490 * any outstanding data to that place. The full story on how we look that up
491 491 * is discussed in the section on the Target Cache Life Cycle.
492 492 *
493 493 * ------------------------
494 494 * FMA and Degraded Devices
495 495 * ------------------------
496 496 *
497 497 * Every kernel overlay device keeps track of its FMA state. Today in FMA we
498 498 * cannot represent partitions between resources nor can we represent that a
499 499 * given minor node of a pseudo device has failed -- if we degrade the overlay
500 500 * device, then the entire dev_info_t is degraded. However, we still want to be
501 501 * able to indicate to administrators that things may go wrong.
502 502 *
503 503 * To this end, we've added a notion of a degraded state to every overlay
504 504 * device. This state is primarily dictated by userland and it can happen for
505 505 * various reasons. Generally, because a userland lookup plugin has been
506 506 * partitioned, or something has gone wrong such that there is no longer any
507 507 * userland lookup module for a device, then we'll mark it degraded.
508 508 *
509 509 * As long as any of our minor instances is degraded, then we'll fire off the
510 510 * FMA event to note that. Once the last degraded instance is no longer
511 511 * degraded, then we'll end up telling FMA that we're all clean.
512 512 *
513 513 * To help administrators get a better sense of which of the various minor
514 514 * devices is wrong, we store the odd_fmamsg[] character array. This character
515 515 * array can be fetched by doing a dladm show-overlay -f.
516 516 *
517 517 * Note, that it's important that we do not update the link status of the
518 518 * devices. We want to remain up as much as possible. By changing the link in a
519 519 * degraded state, this may end up making things worse. We may still actually
520 520 * have information in the target cache and if we mark the link down, that'll
521 521 * result in not being able to use it. The reason being that this'll mark all
522 522 * the downstream VNICs down which will go to IP and from there we end up
523 523 * dealing with sadness.
524 524 *
525 525 * -----------------------
526 526 * Target Cache Life Cycle
527 527 * -----------------------
528 528 *
529 529 * This section only applies when we have a lookup plugin of
530 530 * OVERLAY_TARGET_DYNAMIC. None of this applies to those of type
531 531 * OVERLAY_TARGET_POINT.
532 532 *
533 533 * While we got into the target cache in the general architecture section, it's
534 534 * worth going into more details as to how this actually works and showing some
535 535 * examples and state machines. Recall that a target cache entry basically has
536 536 * the following state transition diagram:
537 537 *
538 538 * Initial state
539 539 * . . . . . . first access . . . varpd lookup enqueued
540 540 * . . .
541 541 * . . .
542 542 * +-------+ . +----------+ .
543 543 * | No |------*---->| Invalid |-------*----+
544 544 * | Entry | | Entry | |
545 545 * +-------+ +----------+ |
546 546 * varpd ^ ^ varpd |
547 547 * invalidate | | drop |
548 548 * . . . * * . . v
549 549 * +-------+ | | +---------+
550 550 * | Entry |--->-----+ +----<----| Entry |
551 551 * | Valid |<----------*---------<----| Pending |->-+ varpd
552 552 * +-------+ . +---------+ * . . drop, but
553 553 * . varpd ^ | other queued
554 554 * . success | | entries
555 555 * +-----+
556 556 *
557 557 * When the table is first created, it is empty. As we attempt to lookup entries
558 558 * and we find there is no entry at all, we'll create a new table entry for it.
559 559 * At that point the entry is technically in an invalid state, that means that
560 560 * we have no valid data from varpd. In that case, we'll go ahead and queue the
561 561 * packet into the entry's pending chain, and queue a varpd lookup, setting the
562 562 * OVERLAY_ENTRY_F_PENDING flag in the process.
563 563 *
564 564 * If additional mblk_t's come in for this entry, we end up appending them to
565 565 * the tail of the chain, if and only if, we don't exceed the threshold for the
566 566 * amount of space they can take up. An entry remains pending until we get a
567 567 * varpd reply. If varpd replies with a valid result, we move to the valid
568 568 * entry state, and remove the OVERLAY_ENTRY_F_PENDING flag and set it with one
569 569 * of OVERLAY_ENTRY_F_VALID or OVERLAY_ENTRY_F_DROP as appropriate.
570 570 *
571 571 * Once an entry is valid, it stays valid until user land tells us to invalidate
572 572 * it with an ioctl or replace it, OVERLAY_TARG_CACHE_REMOVE and
573 573 * OVERLAY_TARG_CACHE_SET respectively.
574 574 *
575 575 * If the lookup fails with a call to drop the packet, then the next state is
576 576 * determined by the state of the queue. If the set of outstanding entries is
577 577 * empty, then we just transition back to the invalid state. If instead, the
578 578 * set of outstanding entries is not empty, then we'll queue another entry and
579 579 * stay in the same state, repeating this until the number of requests is
580 580 * drained.
581 581 *
582 582 * The following image describes the flow of a given lookup and where the
583 583 * overlay_target_entry_t is at any given time.
584 584 *
585 585 * +-------------------+
586 586 * | Invalid Entry | An entry starts off as an invalid entry
587 587 * | de:ad:be:ef:00:00 | and only exists in the target cache.
588 588 * +-------------------+
589 589 *
590 590 * ~~~~
591 591 *
592 592 * +---------------------+
593 593 * | Global list_t | A mblk_t comes in for an entry. We
594 594 * | overlay_target_list | append it to the overlay_target_list.
595 595 * +---------------------+
596 596 * |
597 597 * v
598 598 * +-------------------+ +-------------------+
599 599 * | Pending Entry |----->| Pending Entry |--->...
600 600 * | 42:5e:1a:10:d6:2d | | de:ad:be:ef:00:00 |
601 601 * +-------------------+ +-------------------+
602 602 *
603 603 * ~~~~
604 604 *
605 605 * +--------------------------+
606 606 * | /dev/overlay minor state | User land said that it would look up an
607 607 * | overlay_target_hdl_t | entry for us. We remove it from the
608 608 * +--------------------------+ global list and add it to the handle's
609 609 * | outstanding list.
610 610 * |
611 611 * v
612 612 * +-------------------+ +-------------------+
613 613 * | Pending Entry |----->| Pending Entry |
614 614 * | 90:b8:d0:79:02:dd | | de:ad:be:ef:00:00 |
615 615 * +-------------------+ +-------------------+
616 616 *
617 617 * ~~~~
618 618 *
619 619 * +-------------------+
620 620 * | Valid Entry | varpd returned an answer with
621 621 * | de:ad:be:ef:00:00 | OVERLAY_TARG_RESPOND and the target cache
622 622 * | 10.169.23.42:4789 | entry is now populated with a
623 623 * +-------------------+ destination and marked as valid
624 624 *
625 625 *
626 626 * The lookup mechanism is performed via a series of operations on the character
627 627 * pseudo-device /dev/overlay. The only thing that uses this device is the
628 628 * userland daemon varpd. /dev/overlay is a cloneable device, each open of it
629 629 * granting a new minor number which maintains its own state. We maintain this
630 630 * state so that way if an outstanding lookup was queued to something that
631 631 * crashed or closed its handle without responding, we can know about this and
632 632 * thus handle it appropriately.
633 633 *
634 634 * When a lookup is first created it's added to our global list of outstanding
635 635 * lookups. To service requests, userland is required to perform an ioctl to ask
636 636 * for a request. We will block it in the kernel a set amount of time waiting
637 637 * for a request. When we give a request to a given minor instance of the
638 638 * device, we remove it from the global list and append the request to the
639 639 * device's list of outstanding entries, for the reasons we discussed above.
640 640 * When a lookup comes in, we give user land a smaller amount of information
641 641 * specific to that packet, the overlay_targ_lookup_t. It includes a request id
642 642 * to identify this, and then the overlay id, the varpd id, the header and
643 643 * packet size, the source and destination mac address, the SAP, and any
644 644 * potential VLAN header.
645 645 *
646 646 * At that point, it stays in that outstanding list until one of two ioctls are
647 647 * returned: OVERLAY_TARG_RESPOND or OVERLAY_TARG_DROP. During this time,
648 648 * userland may also perform other operations. For example, it may use
649 649 * OVERLAY_TARG_PKT to get a copy of this packet so it can perform more in-depth
650 650 * analysis of what to do beyond what we gave it initially. This is useful for
651 651 * providing proxy arp and the like. Finally, there are two other ioctls that
652 652 * varpd can then do. The first is OVERLAY_TARG_INJECT which injects the
653 653 * non-jumbo frame packet up into that mac device and OVERLAY_TARG_RESEND which
654 654 * causes us to encapsulate and send out the packet they've given us.
655 655 *
656 656 *
657 657 * Finally, through the target cache, several ioctls are provided to allow for
658 658 * interrogation and management of the cache. They allow for individual entries
659 659 * to be retrieved, set, or have the entire table flushed. For the full set of
660 660 * ioctls here and what they do, take a look at uts/common/sys/overlay_target.h.
661 661 *
662 662 * ------------------
663 663 * Sample Packet Flow
664 664 * ------------------
665 665 *
666 666 * There are a lot of pieces here; hopefully an example of how this all fits
667 667 * together will help clarify and elucidate what's going on. We're going to
668 668 * first track an outgoing packet, e.g. one that is sent from an IP interface on
669 669 * a VNIC on top of an overlay device, and then we'll look at what it means to
670 670 * respond to that.
671 671 *
672 672 *
673 673 * +----------------+ +--------------+ +------------------+
674 674 * | IP/DLS send |------->| MAC sends it |----------->| mblk_t reaches |
675 675 * | packet to MAC | | to the GLDv3 | | overlay GLDv3 tx |
676 676 * +----------------+ | VNIC device | | overlay_m_tx() |
677 677 * +--------------+ +------------------+
678 678 * |
679 679 * . lookup . cache |
680 680 * . drop . miss v
681 681 * +---------+ . +--------+ . +------------------+
682 682 * | freemsg |<-----*-------| varpd |<---*------| Lookup each mblk |
683 683 * | mblk_t | | lookup | | in the target |
684 684 * +---------+ | queued | | cache |
685 685 * ^ +--------+ +------------------+
686 686 * on send | | | cache
687 687 * error . . * *. . lookup * . . hit
688 688 * | | success v
689 689 * | | +------------------+
690 690 * +-----------------+ +--------------->| call plugin |
691 691 * | Send out | | ovpo_encap() to |
692 692 * | overlay_mux_t's |<----------------------------------| get encap mblk_t |
693 693 * | ksocket | +------------------+
694 694 * +-----------------+
695 695 *
696 696 * The receive end point looks a little different and looks more like:
697 697 *
698 698 * +------------------+ +----------------+ +-----------+
699 699 * | mblk_t comes off |---->| enter netstack |--->| delivered |---+
700 700 * | the physical | | IP stack | | to | * . . direct
701 701 * | device | +----------------+ | ksocket | | callback
702 702 * +------------------+ +-----------+ |
703 703 * . overlay id |
704 704 * . not found v
705 705 * +-----------+ . +-----------------+ +--------------------+
706 706 * | freemsg |<--*------| call plugin |<------| overlay_mux_recv() |
707 707 * | mblk_t | | ovpo_decap() to | +--------------------+
708 708 * +-----------+ | decap mblk_t |
709 709 * +-----------------+
710 710 * |
711 711 * * . . overlay id
712 712 * v found
713 713 * +--------+ +----------------+
714 714 * | adjust |----->| call mac_rx |
715 715 * | mblk_t | | on original |
716 716 * +--------+ | decaped packet |
717 717 * +----------------+
718 718 *
719 719 * ------------------
720 720 * Netstack Awareness
721 721 * ------------------
722 722 *
723 723 * In the above image we note that this enters a netstack. Today the only
724 724 * netstack that can be is the global zone as the overlay driver itself is not
725 725 * exactly netstack aware. What this really means is that varpd cannot run in a
726 726 * non-global zone and an overlay device cannot belong to a non-global zone.
727 727 * Non-global zones can still have a VNIC assigned to them that's been created
728 728 * over the overlay device the same way they would if it had been created over
729 729 * an etherstub or a physical device.
730 730 *
731 731 * The majority of the work to make it netstack aware is straightforward and the
732 732 * biggest thing is to create a netstack module that allows us to hook into
733 733 * netstack (and thus zone) creation and destruction. From there, we need to
734 734 * amend the target cache lookup routines that we discussed earlier to not have
735 735 * a global outstanding list and a global list of handles, but rather, one per
736 736 * netstack.
737 737 *
738 738 * For the mux, we'll need to open the ksocket in the context of the zone, we
739 739 * can likely do this with a properly composed credential, but we'll need to do
740 740 * some more work on that path. Finally, we'll want to make sure the dld ioctls
741 741 * are aware of the zoneid of the caller and we use that appropriately and store
742 742 * it in the overlay_dev_t.
743 743 *
744 744 * -----------
745 745 * GLDv3 Notes
746 746 * -----------
747 747 *
748 748 * The overlay driver implements a GLDv3 device. Parts of GLDv3 are more
749 749 * relevant and other parts are much less relevant for us. For example, the
750 750 * GLDv3 is used to toggle the device being put into and out of promiscuous
751 751 * mode and to program MAC addresses for unicast and multicast hardware filters.
752 752 * Today, an overlay device doesn't have a notion of promiscuous mode nor does
753 753 * it have a notion of unicast and multicast addresses programmed into the
754 754 * device. Instead, for the purposes of the hardware filter, we don't do
755 755 * anything and just always accept new addresses being added and removed.
756 756 *
757 757 * If the GLDv3 start function has not been called, then we will not use this
758 758 * device for I/O purposes. Any calls to transmit or receive should be dropped,
759 759 * though the GLDv3 guarantees us that transmit will not be called without
760 760 * calling start. Similarly, once stop is called, then no packets can be dealt
761 761 * with.
762 762 *
763 763 * Today we don't support the stat interfaces, though there's no good reason
764 764 * that we shouldn't assemble some of the stats based on what we have in the
765 765 * future.
766 766 *
767 767 * When it comes to link properties, many of the traditional link properties do
768 768 * not apply and many others MAC handles for us. For example, we don't need to
769 769 * implement anything for overlay_m_getprop() to deal with returning the MTU, as
770 770 * MAC never calls into us for that. As such, there isn't much of anything to
771 771 * support in terms of properties.
772 772 *
773 773 * Today, we don't support any notion of hardware capabilities. However, if
774 774 * future NIC hardware or other changes to the system cause it to make sense for
775 775 * us to emulate logical groups, then we should do that. However, we still do
776 776 * implement a capab function so that we can identify ourselves as an overlay
777 777 * device to the broader MAC framework. This is done mostly so that a device
778 778 * created on top of us can have fanout rings as we don't try to lie about a
779 779 * speed for our device.
780 780 *
781 781 * The other question is what should be done for a device's MTU and margin. We
782 782 * set our minimum supported MTU to be the minimum value that an IP network may
783 783 * be set to, 576, which mimics what an etherstub does. On the flip side, we
784 784 * have our upper bound set to 8900. This value comes from the fact that a lot
785 785 * of jumbo networks use their maximum as 9000. As such, we want to reserve 100
786 786 * bytes, which isn't exactly the most accurate number, but it'll be good enough
787 787 * for now. Because of that, our default MTU off of these devices is 1400, as
788 788 * the default MTU for everything is usually 1500 or whatever the underlying
789 789 * device is at; however, this is a bit simpler than asking the netstack what
790 790 * are all the IP interfaces at. It also calls into question how PMTU and PMTU
791 791 * discovery should work here. The challenge, especially for
792 792 * OVERLAY_TARG_DYNAMIC is that the MTU to any of the places will vary and it's
793 793 * not clear that if you have a single bad entry that the overall MTU should be
794 794 * lowered. Instead, we should figure out a better way of determining these
795 795 * kinds of PMTU errors and appropriately alerting the administrator via FMA.
796 796 *
797 797 * Regarding margin, we allow a margin of up to VLAN_TAGSZ depending on whether
798 798 * or not the underlying encapsulation device supports VLAN tags. If it does,
799 799 * then we'll set the margin to allow for it, otherwise, we will not.
800 800 */
801 801
802 802 #include <sys/conf.h>
803 803 #include <sys/errno.h>
804 804 #include <sys/stat.h>
805 805 #include <sys/ddi.h>
806 806 #include <sys/sunddi.h>
807 807 #include <sys/modctl.h>
808 808 #include <sys/policy.h>
809 809 #include <sys/stream.h>
810 810 #include <sys/strsubr.h>
811 811 #include <sys/strsun.h>
812 812 #include <sys/types.h>
813 813 #include <sys/kmem.h>
814 814 #include <sys/param.h>
815 815 #include <sys/sysmacros.h>
816 816 #include <sys/ddifm.h>
817 817
818 818 #include <sys/dls.h>
819 819 #include <sys/dld_ioc.h>
820 820 #include <sys/mac_provider.h>
821 821 #include <sys/mac_client_priv.h>
822 822 #include <sys/mac_ether.h>
823 823 #include <sys/vlan.h>
824 824
825 825 #include <sys/overlay_impl.h>
826 826
827 827 dev_info_t *overlay_dip;
828 828 static kmutex_t overlay_dev_lock;
829 829 static list_t overlay_dev_list;
830 830 static uint8_t overlay_macaddr[ETHERADDRL] =
831 831 { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
832 832
833 833 typedef enum overlay_dev_prop {
834 834 OVERLAY_DEV_P_MTU = 0,
835 835 OVERLAY_DEV_P_VNETID,
836 836 OVERLAY_DEV_P_ENCAP,
837 837 OVERLAY_DEV_P_VARPDID,
838 838 OVERLAY_DEV_P_DCID
839 839 } overlay_dev_prop_t;
840 840
841 841 #define OVERLAY_DEV_NPROPS 5
842 842 static const char *overlay_dev_props[] = {
843 843 "mtu",
844 844 "vnetid",
845 845 "encap",
846 846 "varpd/id",
847 847 "dcid"
848 848 };
849 849
850 850 #define OVERLAY_MTU_MIN 576
851 851 #define OVERLAY_MTU_DEF 1400
852 852 #define OVERLAY_MTU_MAX 8900
853 853
854 854 overlay_dev_t *
855 855 overlay_hold_by_dlid(datalink_id_t id)
856 856 {
857 857 overlay_dev_t *o;
858 858
859 859 mutex_enter(&overlay_dev_lock);
860 860 for (o = list_head(&overlay_dev_list); o != NULL;
861 861 o = list_next(&overlay_dev_list, o)) {
862 862 if (id == o->odd_linkid) {
863 863 mutex_enter(&o->odd_lock);
864 864 o->odd_ref++;
865 865 mutex_exit(&o->odd_lock);
866 866 mutex_exit(&overlay_dev_lock);
867 867 return (o);
868 868 }
869 869 }
870 870
871 871 mutex_exit(&overlay_dev_lock);
872 872 return (NULL);
873 873 }
874 874
875 875 void
876 876 overlay_hold_rele(overlay_dev_t *odd)
877 877 {
878 878 mutex_enter(&odd->odd_lock);
879 879 ASSERT(odd->odd_ref > 0);
880 880 odd->odd_ref--;
881 881 mutex_exit(&odd->odd_lock);
882 882 }
883 883
884 884 void
885 885 overlay_io_start(overlay_dev_t *odd, overlay_dev_flag_t flag)
886 886 {
887 887 ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX);
888 888 ASSERT(MUTEX_HELD(&odd->odd_lock));
889 889
890 890 if (flag & OVERLAY_F_IN_RX)
891 891 odd->odd_rxcount++;
892 892 if (flag & OVERLAY_F_IN_TX)
893 893 odd->odd_txcount++;
894 894 odd->odd_flags |= flag;
895 895 }
896 896
897 897 void
898 898 overlay_io_done(overlay_dev_t *odd, overlay_dev_flag_t flag)
899 899 {
900 900 boolean_t signal = B_FALSE;
901 901
902 902 ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX);
903 903 ASSERT(MUTEX_HELD(&odd->odd_lock));
904 904
905 905 if (flag & OVERLAY_F_IN_RX) {
906 906 ASSERT(odd->odd_rxcount > 0);
907 907 odd->odd_rxcount--;
908 908 if (odd->odd_rxcount == 0) {
909 909 signal = B_TRUE;
910 910 odd->odd_flags &= ~OVERLAY_F_IN_RX;
911 911 }
912 912 }
913 913 if (flag & OVERLAY_F_IN_TX) {
914 914 ASSERT(odd->odd_txcount > 0);
915 915 odd->odd_txcount--;
916 916 if (odd->odd_txcount == 0) {
917 917 signal = B_TRUE;
918 918 odd->odd_flags &= ~OVERLAY_F_IN_TX;
919 919 }
920 920 }
921 921
922 922 if (signal == B_TRUE)
923 923 cv_broadcast(&odd->odd_iowait);
924 924 }
925 925
926 926 static void
927 927 overlay_io_wait(overlay_dev_t *odd, overlay_dev_flag_t flag)
928 928 {
929 929 ASSERT((flag & ~OVERLAY_F_IOMASK) == 0);
930 930 ASSERT(MUTEX_HELD(&odd->odd_lock));
931 931
932 932 while (odd->odd_flags & flag) {
933 933 cv_wait(&odd->odd_iowait, &odd->odd_lock);
934 934 }
935 935 }
936 936
937 937 void
938 938 overlay_dev_iter(overlay_dev_iter_f func, void *arg)
939 939 {
940 940 overlay_dev_t *odd;
941 941
942 942 mutex_enter(&overlay_dev_lock);
943 943 for (odd = list_head(&overlay_dev_list); odd != NULL;
944 944 odd = list_next(&overlay_dev_list, odd)) {
945 945 if (func(odd, arg) != 0) {
946 946 mutex_exit(&overlay_dev_lock);
947 947 return;
948 948 }
949 949 }
950 950 mutex_exit(&overlay_dev_lock);
951 951 }
952 952
953 953 /* ARGSUSED */
954 954 static int
955 955 overlay_m_stat(void *arg, uint_t stat, uint64_t *val)
956 956 {
957 957 return (ENOTSUP);
958 958 }
959 959
960 960 static int
961 961 overlay_m_start(void *arg)
962 962 {
963 963 overlay_dev_t *odd = arg;
964 964 overlay_mux_t *mux;
965 965 int ret, domain, family, prot;
966 966 struct sockaddr_storage storage;
967 967 socklen_t slen;
968 968
969 969 mutex_enter(&odd->odd_lock);
970 970 if ((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0) {
971 971 mutex_exit(&odd->odd_lock);
972 972 return (EAGAIN);
973 973 }
974 974 mutex_exit(&odd->odd_lock);
975 975
976 976 ret = odd->odd_plugin->ovp_ops->ovpo_socket(odd->odd_pvoid, &domain,
977 977 &family, &prot, (struct sockaddr *)&storage, &slen);
978 978 if (ret != 0)
979 979 return (ret);
980 980
981 981 mux = overlay_mux_open(odd->odd_plugin, domain, family, prot,
982 982 (struct sockaddr *)&storage, slen, &ret);
983 983 if (mux == NULL)
984 984 return (ret);
985 985
986 986 overlay_mux_add_dev(mux, odd);
987 987 odd->odd_mux = mux;
988 988 mutex_enter(&odd->odd_lock);
989 989 ASSERT(!(odd->odd_flags & OVERLAY_F_IN_MUX));
990 990 odd->odd_flags |= OVERLAY_F_IN_MUX;
991 991 mutex_exit(&odd->odd_lock);
992 992
993 993 return (0);
994 994 }
995 995
996 996 static void
997 997 overlay_m_stop(void *arg)
998 998 {
999 999 overlay_dev_t *odd = arg;
1000 1000
1001 1001 /*
1002 1002 * The MAC Perimeter is held here, so we don't have to worry about
1003 1003 * synchronizing this with respect to metadata operations.
1004 1004 */
1005 1005 mutex_enter(&odd->odd_lock);
1006 1006 VERIFY(odd->odd_flags & OVERLAY_F_IN_MUX);
1007 1007 VERIFY(!(odd->odd_flags & OVERLAY_F_MDDROP));
1008 1008 odd->odd_flags |= OVERLAY_F_MDDROP;
1009 1009 overlay_io_wait(odd, OVERLAY_F_IOMASK);
1010 1010 mutex_exit(&odd->odd_lock);
1011 1011
1012 1012 overlay_mux_remove_dev(odd->odd_mux, odd);
1013 1013 overlay_mux_close(odd->odd_mux);
1014 1014 odd->odd_mux = NULL;
1015 1015
1016 1016 mutex_enter(&odd->odd_lock);
1017 1017 odd->odd_flags &= ~OVERLAY_F_IN_MUX;
1018 1018 odd->odd_flags &= ~OVERLAY_F_MDDROP;
1019 1019 VERIFY((odd->odd_flags & OVERLAY_F_STOPMASK) == 0);
1020 1020 mutex_exit(&odd->odd_lock);
1021 1021 }
1022 1022
1023 1023 /*
1024 1024 * For more info on this, see the big theory statement.
1025 1025 */
1026 1026 /* ARGSUSED */
1027 1027 static int
1028 1028 overlay_m_promisc(void *arg, boolean_t on)
1029 1029 {
1030 1030 return (0);
1031 1031 }
1032 1032
1033 1033 /*
1034 1034 * For more info on this, see the big theory statement.
1035 1035 */
1036 1036 /* ARGSUSED */
1037 1037 static int
1038 1038 overlay_m_multicast(void *arg, boolean_t add, const uint8_t *addrp)
1039 1039 {
1040 1040 return (0);
1041 1041 }
1042 1042
1043 1043 /*
1044 1044 * For more info on this, see the big theory statement.
1045 1045 */
1046 1046 /* ARGSUSED */
1047 1047 static int
1048 1048 overlay_m_unicast(void *arg, const uint8_t *macaddr)
1049 1049 {
1050 1050 return (0);
1051 1051 }
1052 1052
1053 1053 mblk_t *
1054 1054 overlay_m_tx(void *arg, mblk_t *mp_chain)
1055 1055 {
1056 1056 overlay_dev_t *odd = arg;
1057 1057 mblk_t *mp, *ep;
1058 1058 int ret;
1059 1059 ovep_encap_info_t einfo;
1060 1060 struct msghdr hdr;
1061 1061
1062 1062 mutex_enter(&odd->odd_lock);
1063 1063 if ((odd->odd_flags & OVERLAY_F_MDDROP) ||
1064 1064 !(odd->odd_flags & OVERLAY_F_IN_MUX)) {
|
↓ open down ↓ |
1064 lines elided |
↑ open up ↑ |
1065 1065 mutex_exit(&odd->odd_lock);
1066 1066 freemsgchain(mp_chain);
1067 1067 return (NULL);
1068 1068 }
1069 1069 overlay_io_start(odd, OVERLAY_F_IN_TX);
1070 1070 mutex_exit(&odd->odd_lock);
1071 1071
1072 1072 bzero(&hdr, sizeof (struct msghdr));
1073 1073
1074 1074 bzero(&einfo, sizeof (ovep_encap_info_t));
1075 - einfo.ovdi_id = odd->odd_vid;
1075 +
1076 1076 mp = mp_chain;
1077 1077 while (mp != NULL) {
1078 1078 socklen_t slen;
1079 1079 struct sockaddr_storage storage;
1080 1080
1081 1081 mp_chain = mp->b_next;
1082 1082 mp->b_next = NULL;
1083 1083 ep = NULL;
1084 1084
1085 - /*
1086 - * TODO: we probably need to change 'storage' to a
1087 - * refheld overlay_target_entry_t and also maybe set
1088 - * local vlan from packet header for check below
1089 - */
1090 1085 ret = overlay_target_lookup(odd, mp,
1091 - (struct sockaddr *)&storage, &slen);
1086 + (struct sockaddr *)&storage, &slen, &einfo.ovdi_id);
1092 1087 if (ret != OVERLAY_TARGET_OK) {
1093 1088 if (ret == OVERLAY_TARGET_DROP)
1094 1089 freemsg(mp);
1095 1090 mp = mp_chain;
1096 1091 continue;
1097 1092 }
1098 1093
1099 - /*
1100 - * TODO:
1101 - * set hdr.msg_name from target_entry
1102 - *
1103 - * if !local:
1104 - * check fabric attachment
1105 - * modify vlan tag, VL2 mac addresses
1106 - *
1107 - * set einfo.ovdi_id to vnet id (move into loop since
1108 - * things cannot assume to all have same vnet id anymore)
1109 - */
1110 1094 hdr.msg_name = &storage;
1111 1095 hdr.msg_namelen = slen;
1112 1096
1113 1097 ret = odd->odd_plugin->ovp_ops->ovpo_encap(odd->odd_mh, mp,
1114 1098 &einfo, &ep);
1115 1099 if (ret != 0 || ep == NULL) {
1116 1100 freemsg(mp);
1117 1101 goto out;
1118 1102 }
1119 1103
1120 1104 ep->b_cont = mp;
1121 1105 ret = overlay_mux_tx(odd->odd_mux, &hdr, ep);
1122 1106 if (ret != 0)
1123 1107 goto out;
1124 1108
1125 1109 mp = mp_chain;
1126 1110 }
1127 1111
1128 1112 out:
1129 1113 mutex_enter(&odd->odd_lock);
1130 1114 overlay_io_done(odd, OVERLAY_F_IN_TX);
1131 1115 mutex_exit(&odd->odd_lock);
1132 1116 return (mp_chain);
1133 1117 }
1134 1118
1135 1119 /* ARGSUSED */
1136 1120 static void
1137 1121 overlay_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
1138 1122 {
1139 1123 miocnak(q, mp, 0, ENOTSUP);
1140 1124 }
1141 1125
1142 1126 /* ARGSUSED */
1143 1127 static boolean_t
1144 1128 overlay_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
1145 1129 {
1146 1130 /*
1147 1131 * Tell MAC we're an overlay.
1148 1132 */
1149 1133 if (cap == MAC_CAPAB_OVERLAY)
1150 1134 return (B_TRUE);
1151 1135 return (B_FALSE);
1152 1136 }
1153 1137
1154 1138 /* ARGSUSED */
1155 1139 static int
1156 1140 overlay_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1157 1141 uint_t pr_valsize, const void *pr_val)
1158 1142 {
1159 1143 uint32_t mtu, old;
1160 1144 int err;
1161 1145 overlay_dev_t *odd = arg;
1162 1146
1163 1147 if (pr_num != MAC_PROP_MTU)
1164 1148 return (ENOTSUP);
1165 1149
1166 1150 bcopy(pr_val, &mtu, sizeof (mtu));
1167 1151 if (mtu < OVERLAY_MTU_MIN || mtu > OVERLAY_MTU_MAX)
1168 1152 return (EINVAL);
1169 1153
1170 1154 mutex_enter(&odd->odd_lock);
1171 1155 old = odd->odd_mtu;
1172 1156 odd->odd_mtu = mtu;
1173 1157 err = mac_maxsdu_update(odd->odd_mh, mtu);
1174 1158 if (err != 0)
1175 1159 odd->odd_mtu = old;
1176 1160 mutex_exit(&odd->odd_lock);
1177 1161
1178 1162 return (err);
1179 1163 }
1180 1164
1181 1165 /* ARGSUSED */
1182 1166 static int
1183 1167 overlay_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1184 1168 uint_t pr_valsize, void *pr_val)
1185 1169 {
1186 1170 return (ENOTSUP);
1187 1171 }
1188 1172
1189 1173 /* ARGSUSED */
1190 1174 static void
1191 1175 overlay_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1192 1176 mac_prop_info_handle_t prh)
1193 1177 {
1194 1178 if (pr_num != MAC_PROP_MTU)
1195 1179 return;
1196 1180
1197 1181 mac_prop_info_set_default_uint32(prh, OVERLAY_MTU_DEF);
1198 1182 mac_prop_info_set_range_uint32(prh, OVERLAY_MTU_MIN, OVERLAY_MTU_MAX);
1199 1183 }
1200 1184
1201 1185 static mac_callbacks_t overlay_m_callbacks = {
1202 1186 .mc_callbacks = (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP |
1203 1187 MC_PROPINFO),
1204 1188 .mc_getstat = overlay_m_stat,
1205 1189 .mc_start = overlay_m_start,
1206 1190 .mc_stop = overlay_m_stop,
1207 1191 .mc_setpromisc = overlay_m_promisc,
1208 1192 .mc_multicst = overlay_m_multicast,
1209 1193 .mc_unicst = overlay_m_unicast,
1210 1194 .mc_tx = overlay_m_tx,
1211 1195 .mc_ioctl = overlay_m_ioctl,
1212 1196 .mc_getcapab = overlay_m_getcapab,
1213 1197 .mc_getprop = overlay_m_getprop,
1214 1198 .mc_setprop = overlay_m_setprop,
1215 1199 .mc_propinfo = overlay_m_propinfo
1216 1200 };
1217 1201
1218 1202 static boolean_t
1219 1203 overlay_valid_name(const char *name, size_t buflen)
1220 1204 {
1221 1205 size_t actlen;
1222 1206 int err, i;
1223 1207
1224 1208 for (i = 0; i < buflen; i++) {
1225 1209 if (name[i] == '\0')
1226 1210 break;
1227 1211 }
1228 1212
1229 1213 if (i == 0 || i == buflen)
1230 1214 return (B_FALSE);
1231 1215 actlen = i;
1232 1216 if (strchr(name, '/') != NULL)
1233 1217 return (B_FALSE);
1234 1218 if (u8_validate((char *)name, actlen, NULL,
1235 1219 U8_VALIDATE_ENTIRE, &err) < 0)
1236 1220 return (B_FALSE);
1237 1221 return (B_TRUE);
1238 1222 }
1239 1223
1240 1224 /* ARGSUSED */
1241 1225 static int
1242 1226 overlay_i_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
1243 1227 {
1244 1228 int err;
1245 1229 uint64_t maxid;
1246 1230 overlay_dev_t *odd, *o;
1247 1231 mac_register_t *mac;
1248 1232 overlay_ioc_create_t *oicp = karg;
1249 1233
1250 1234 if (overlay_valid_name(oicp->oic_encap, MAXLINKNAMELEN) == B_FALSE)
1251 1235 return (EINVAL);
1252 1236
1253 1237 odd = kmem_zalloc(sizeof (overlay_dev_t), KM_SLEEP);
1254 1238 odd->odd_linkid = oicp->oic_linkid;
1255 1239 odd->odd_plugin = overlay_plugin_lookup(oicp->oic_encap);
1256 1240 if (odd->odd_plugin == NULL) {
1257 1241 kmem_free(odd, sizeof (overlay_dev_t));
1258 1242 return (ENOENT);
1259 1243 }
1260 1244 err = odd->odd_plugin->ovp_ops->ovpo_init((overlay_handle_t)odd,
1261 1245 &odd->odd_pvoid);
1262 1246 if (err != 0) {
1263 1247 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1264 1248 overlay_plugin_rele(odd->odd_plugin);
1265 1249 kmem_free(odd, sizeof (overlay_dev_t));
1266 1250 return (EINVAL);
1267 1251 }
1268 1252
1269 1253 /*
1270 1254 * Make sure that our virtual network id is valid for the given plugin
1271 1255 * that we're working with.
1272 1256 */
1273 1257 ASSERT(odd->odd_plugin->ovp_id_size <= 8);
1274 1258 maxid = UINT64_MAX;
1275 1259 if (odd->odd_plugin->ovp_id_size != 8)
1276 1260 maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) - 1ULL;
1277 1261 if (oicp->oic_vnetid > maxid) {
1278 1262 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1279 1263 overlay_plugin_rele(odd->odd_plugin);
1280 1264 kmem_free(odd, sizeof (overlay_dev_t));
1281 1265 return (EINVAL);
1282 1266 }
1283 1267 odd->odd_vid = oicp->oic_vnetid;
1284 1268
1285 1269 if (oicp->oic_dcid > UINT32_MAX) {
1286 1270 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1287 1271 overlay_plugin_rele(odd->odd_plugin);
1288 1272 kmem_free(odd, sizeof (overlay_dev_t));
1289 1273 return (EINVAL);
1290 1274 }
1291 1275 odd->odd_dcid = oicp->oic_dcid;
1292 1276
1293 1277 mac = mac_alloc(MAC_VERSION);
1294 1278 if (mac == NULL) {
1295 1279 mutex_exit(&overlay_dev_lock);
1296 1280 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1297 1281 overlay_plugin_rele(odd->odd_plugin);
1298 1282 kmem_free(odd, sizeof (overlay_dev_t));
1299 1283 return (EINVAL);
1300 1284 }
1301 1285
1302 1286 mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1303 1287 mac->m_driver = odd;
1304 1288 mac->m_dip = overlay_dip;
1305 1289 mac->m_dst_addr = NULL;
1306 1290 mac->m_callbacks = &overlay_m_callbacks;
1307 1291 mac->m_pdata = NULL;
1308 1292 mac->m_pdata_size = 0;
1309 1293
1310 1294 mac->m_priv_props = NULL;
1311 1295
1312 1296 /* Let mac handle this itself. */
1313 1297 mac->m_instance = (uint_t)-1;
1314 1298
1315 1299 /*
1316 1300 * There is no real source address that should be used here, but saying
1317 1301 * that we're not ethernet is going to cause its own problems. At the
1318 1302 * end of the day, this is fine.
1319 1303 */
1320 1304 mac->m_src_addr = overlay_macaddr;
1321 1305
1322 1306 /*
1323 1307 * Start with the default MTU as the max SDU. If the MTU is changed, the
1324 1308 * SDU will be changed to reflect that.
1325 1309 */
1326 1310 mac->m_min_sdu = 1;
1327 1311 mac->m_max_sdu = OVERLAY_MTU_DEF;
1328 1312 mac->m_multicast_sdu = 0;
1329 1313
1330 1314 /*
1331 1315 * The underlying device doesn't matter, instead this comes from the
1332 1316 * encapsulation protocol and whether or not they allow VLAN tags.
1333 1317 */
1334 1318 if (odd->odd_plugin->ovp_flags & OVEP_F_VLAN_TAG) {
1335 1319 mac->m_margin = VLAN_TAGSZ;
1336 1320 } else {
1337 1321 mac->m_margin = 0;
1338 1322 }
1339 1323
1340 1324 /*
1341 1325 * Today, we have no MAC virtualization, it may make sense in the future
1342 1326 * to go ahead and emulate some subset of this, but it doesn't today.
1343 1327 */
1344 1328 mac->m_v12n = MAC_VIRT_NONE;
1345 1329
1346 1330 mutex_enter(&overlay_dev_lock);
1347 1331 for (o = list_head(&overlay_dev_list); o != NULL;
1348 1332 o = list_next(&overlay_dev_list, o)) {
1349 1333 if (o->odd_linkid == oicp->oic_linkid) {
1350 1334 mutex_exit(&overlay_dev_lock);
1351 1335 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1352 1336 overlay_plugin_rele(odd->odd_plugin);
1353 1337 kmem_free(odd, sizeof (overlay_dev_t));
1354 1338 return (EEXIST);
1355 1339 }
1356 1340
1357 1341 if (o->odd_vid == oicp->oic_vnetid &&
1358 1342 o->odd_plugin == odd->odd_plugin) {
1359 1343 mutex_exit(&overlay_dev_lock);
1360 1344 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1361 1345 overlay_plugin_rele(odd->odd_plugin);
1362 1346 kmem_free(odd, sizeof (overlay_dev_t));
1363 1347 return (EEXIST);
1364 1348 }
1365 1349 }
1366 1350
1367 1351 err = mac_register(mac, &odd->odd_mh);
1368 1352 mac_free(mac);
1369 1353 if (err != 0) {
1370 1354 mutex_exit(&overlay_dev_lock);
1371 1355 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1372 1356 overlay_plugin_rele(odd->odd_plugin);
1373 1357 kmem_free(odd, sizeof (overlay_dev_t));
1374 1358 return (err);
1375 1359 }
1376 1360
1377 1361 err = dls_devnet_create(odd->odd_mh, odd->odd_linkid,
1378 1362 crgetzoneid(cred));
1379 1363 if (err != 0) {
1380 1364 mutex_exit(&overlay_dev_lock);
1381 1365 (void) mac_unregister(odd->odd_mh);
1382 1366 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1383 1367 overlay_plugin_rele(odd->odd_plugin);
1384 1368 kmem_free(odd, sizeof (overlay_dev_t));
1385 1369 return (err);
1386 1370 }
1387 1371
1388 1372 mutex_init(&odd->odd_lock, NULL, MUTEX_DRIVER, NULL);
1389 1373 cv_init(&odd->odd_iowait, NULL, CV_DRIVER, NULL);
1390 1374 odd->odd_ref = 0;
1391 1375 odd->odd_flags = 0;
1392 1376 list_insert_tail(&overlay_dev_list, odd);
1393 1377 mutex_exit(&overlay_dev_lock);
1394 1378
1395 1379 return (0);
1396 1380 }
1397 1381
1398 1382 /* ARGSUSED */
1399 1383 static int
1400 1384 overlay_i_activate(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
1401 1385 {
1402 1386 int i, ret;
1403 1387 overlay_dev_t *odd;
1404 1388 mac_perim_handle_t mph;
1405 1389 overlay_ioc_activate_t *oiap = karg;
1406 1390 overlay_ioc_propinfo_t *infop;
1407 1391 overlay_ioc_prop_t *oip;
1408 1392 overlay_prop_handle_t phdl;
1409 1393
1410 1394 odd = overlay_hold_by_dlid(oiap->oia_linkid);
1411 1395 if (odd == NULL)
1412 1396 return (ENOENT);
1413 1397
1414 1398 infop = kmem_alloc(sizeof (overlay_ioc_propinfo_t), KM_SLEEP);
1415 1399 oip = kmem_alloc(sizeof (overlay_ioc_prop_t), KM_SLEEP);
1416 1400 phdl = (overlay_prop_handle_t)infop;
1417 1401
1418 1402 mac_perim_enter_by_mh(odd->odd_mh, &mph);
1419 1403 mutex_enter(&odd->odd_lock);
1420 1404 if (odd->odd_flags & OVERLAY_F_ACTIVATED) {
1421 1405 mutex_exit(&odd->odd_lock);
1422 1406 mac_perim_exit(mph);
1423 1407 overlay_hold_rele(odd);
1424 1408 kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1425 1409 kmem_free(oip, sizeof (overlay_ioc_prop_t));
1426 1410 return (EEXIST);
1427 1411 }
1428 1412 mutex_exit(&odd->odd_lock);
1429 1413
1430 1414 for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) {
1431 1415 const char *pname = odd->odd_plugin->ovp_props[i];
1432 1416 bzero(infop, sizeof (overlay_ioc_propinfo_t));
1433 1417 overlay_prop_init(phdl);
1434 1418 ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(pname, phdl);
1435 1419 if (ret != 0) {
1436 1420 mac_perim_exit(mph);
1437 1421 overlay_hold_rele(odd);
1438 1422 kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1439 1423 kmem_free(oip, sizeof (overlay_ioc_prop_t));
1440 1424 return (ret);
1441 1425 }
1442 1426
1443 1427 if ((infop->oipi_prot & OVERLAY_PROP_PERM_REQ) == 0)
1444 1428 continue;
1445 1429 bzero(oip, sizeof (overlay_ioc_prop_t));
1446 1430 oip->oip_size = sizeof (oip->oip_value);
1447 1431 ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid,
1448 1432 pname, oip->oip_value, &oip->oip_size);
1449 1433 if (ret != 0) {
1450 1434 mac_perim_exit(mph);
1451 1435 overlay_hold_rele(odd);
1452 1436 kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1453 1437 kmem_free(oip, sizeof (overlay_ioc_prop_t));
1454 1438 return (ret);
1455 1439 }
1456 1440 if (oip->oip_size == 0) {
1457 1441 mac_perim_exit(mph);
1458 1442 overlay_hold_rele(odd);
1459 1443 kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1460 1444 kmem_free(oip, sizeof (overlay_ioc_prop_t));
1461 1445 return (EINVAL);
1462 1446 }
1463 1447 }
1464 1448
1465 1449 mutex_enter(&odd->odd_lock);
1466 1450 if ((odd->odd_flags & OVERLAY_F_VARPD) == 0) {
1467 1451 mutex_exit(&odd->odd_lock);
1468 1452 mac_perim_exit(mph);
1469 1453 overlay_hold_rele(odd);
1470 1454 kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1471 1455 kmem_free(oip, sizeof (overlay_ioc_prop_t));
1472 1456 return (ENXIO);
1473 1457 }
1474 1458
1475 1459 ASSERT((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0);
1476 1460 odd->odd_flags |= OVERLAY_F_ACTIVATED;
1477 1461
1478 1462 /*
1479 1463 * Now that we've activated ourselves, we should indicate to the world
1480 1464 * that we're up. Note that we may not be able to perform lookups at
1481 1465 * this time, but our notion of being 'up' isn't dependent on that
1482 1466 * ability.
1483 1467 */
1484 1468 mac_link_update(odd->odd_mh, LINK_STATE_UP);
1485 1469 mutex_exit(&odd->odd_lock);
1486 1470
1487 1471 mac_perim_exit(mph);
1488 1472 overlay_hold_rele(odd);
1489 1473 kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1490 1474 kmem_free(oip, sizeof (overlay_ioc_prop_t));
1491 1475
1492 1476 return (0);
1493 1477 }
1494 1478
1495 1479 /* ARGSUSED */
1496 1480 static int
1497 1481 overlay_i_delete(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
1498 1482 {
1499 1483 overlay_ioc_delete_t *oidp = karg;
1500 1484 overlay_dev_t *odd;
1501 1485 datalink_id_t tid;
1502 1486 int ret;
1503 1487
1504 1488 odd = overlay_hold_by_dlid(oidp->oid_linkid);
1505 1489 if (odd == NULL) {
1506 1490 return (ENOENT);
1507 1491 }
1508 1492
1509 1493 mutex_enter(&odd->odd_lock);
1510 1494 /* If we're not the only hold, we're busy */
1511 1495 if (odd->odd_ref != 1) {
1512 1496 mutex_exit(&odd->odd_lock);
1513 1497 overlay_hold_rele(odd);
1514 1498 return (EBUSY);
1515 1499 }
1516 1500
1517 1501 if (odd->odd_flags & OVERLAY_F_IN_MUX) {
1518 1502 mutex_exit(&odd->odd_lock);
1519 1503 overlay_hold_rele(odd);
1520 1504 return (EBUSY);
1521 1505 }
1522 1506
1523 1507 /*
1524 1508 * To remove this, we need to first remove it from dls and then remove
1525 1509 * it from mac. The act of removing it from mac will check if there are
1526 1510 * devices on top of this, eg. vnics. If there are, then that will fail
1527 1511 * and we'll have to go through and recreate the dls entry. Only after
1528 1512 * mac_unregister has succeeded, then we'll go through and actually free
1529 1513 * everything and drop the dev lock.
1530 1514 */
1531 1515 ret = dls_devnet_destroy(odd->odd_mh, &tid, B_TRUE);
1532 1516 if (ret != 0) {
1533 1517 overlay_hold_rele(odd);
1534 1518 return (ret);
1535 1519 }
1536 1520
1537 1521 ASSERT(oidp->oid_linkid == tid);
1538 1522 ret = mac_disable(odd->odd_mh);
1539 1523 if (ret != 0) {
1540 1524 (void) dls_devnet_create(odd->odd_mh, odd->odd_linkid,
1541 1525 crgetzoneid(cred));
1542 1526 overlay_hold_rele(odd);
1543 1527 return (ret);
1544 1528 }
1545 1529
1546 1530 overlay_target_quiesce(odd->odd_target);
1547 1531
1548 1532 mutex_enter(&overlay_dev_lock);
1549 1533 list_remove(&overlay_dev_list, odd);
1550 1534 mutex_exit(&overlay_dev_lock);
1551 1535
1552 1536 cv_destroy(&odd->odd_iowait);
1553 1537 mutex_destroy(&odd->odd_lock);
1554 1538 overlay_target_free(odd);
1555 1539 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1556 1540 overlay_plugin_rele(odd->odd_plugin);
1557 1541 kmem_free(odd, sizeof (overlay_dev_t));
1558 1542
1559 1543 return (0);
1560 1544 }
1561 1545
1562 1546 /* ARGSUSED */
1563 1547 static int
1564 1548 overlay_i_nprops(void *karg, intptr_t arg, int mode, cred_t *cred,
1565 1549 int *rvalp)
1566 1550 {
1567 1551 overlay_dev_t *odd;
1568 1552 overlay_ioc_nprops_t *on = karg;
1569 1553
1570 1554 odd = overlay_hold_by_dlid(on->oipn_linkid);
1571 1555 if (odd == NULL)
1572 1556 return (ENOENT);
1573 1557 on->oipn_nprops = odd->odd_plugin->ovp_nprops + OVERLAY_DEV_NPROPS;
1574 1558 overlay_hold_rele(odd);
1575 1559
1576 1560 return (0);
1577 1561 }
1578 1562
1579 1563 static int
1580 1564 overlay_propinfo_plugin_cb(overlay_plugin_t *opp, void *arg)
1581 1565 {
1582 1566 overlay_prop_handle_t phdl = arg;
1583 1567 overlay_prop_set_range_str(phdl, opp->ovp_name);
1584 1568 return (0);
1585 1569 }
1586 1570
1587 1571 static int
1588 1572 overlay_i_name_to_propid(overlay_dev_t *odd, const char *name, uint_t *id)
1589 1573 {
1590 1574 int i;
1591 1575
1592 1576 for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
1593 1577 if (strcmp(overlay_dev_props[i], name) == 0) {
1594 1578 *id = i;
1595 1579 return (0);
1596 1580 }
1597 1581 }
1598 1582
1599 1583 for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) {
1600 1584 if (strcmp(odd->odd_plugin->ovp_props[i], name) == 0) {
1601 1585 *id = i + OVERLAY_DEV_NPROPS;
1602 1586 return (0);
1603 1587 }
1604 1588 }
1605 1589
1606 1590 return (ENOENT);
1607 1591 }
1608 1592
1609 1593 static void
1610 1594 overlay_i_propinfo_mtu(overlay_dev_t *odd, overlay_prop_handle_t phdl)
1611 1595 {
1612 1596 uint32_t def;
1613 1597 mac_propval_range_t range;
1614 1598 uint_t perm;
1615 1599
1616 1600 ASSERT(MAC_PERIM_HELD(odd->odd_mh));
1617 1601
1618 1602 bzero(&range, sizeof (mac_propval_range_t));
1619 1603 range.mpr_count = 1;
1620 1604 if (mac_prop_info(odd->odd_mh, MAC_PROP_MTU, "mtu", &def,
1621 1605 sizeof (def), &range, &perm) != 0)
1622 1606 return;
1623 1607
1624 1608 if (perm == MAC_PROP_PERM_READ)
1625 1609 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
1626 1610 else if (perm == MAC_PROP_PERM_WRITE)
1627 1611 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_WRITE);
1628 1612 else if (perm == MAC_PROP_PERM_RW)
1629 1613 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
1630 1614
1631 1615 overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
1632 1616 overlay_prop_set_default(phdl, &def, sizeof (def));
1633 1617 overlay_prop_set_range_uint32(phdl, range.mpr_range_uint32[0].mpur_min,
1634 1618 range.mpr_range_uint32[0].mpur_max);
1635 1619 }
1636 1620
1637 1621 /* ARGSUSED */
1638 1622 static int
1639 1623 overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred,
1640 1624 int *rvalp)
1641 1625 {
1642 1626 overlay_dev_t *odd;
1643 1627 int ret;
1644 1628 mac_perim_handle_t mph;
1645 1629 uint_t propid = UINT_MAX;
1646 1630 overlay_ioc_propinfo_t *oip = karg;
1647 1631 overlay_prop_handle_t phdl = (overlay_prop_handle_t)oip;
1648 1632
1649 1633 odd = overlay_hold_by_dlid(oip->oipi_linkid);
1650 1634 if (odd == NULL)
1651 1635 return (ENOENT);
1652 1636
1653 1637 overlay_prop_init(phdl);
1654 1638 mac_perim_enter_by_mh(odd->odd_mh, &mph);
1655 1639
1656 1640 /*
1657 1641 * If the id is -1, then the property that we're looking for is named in
1658 1642 * oipi_name and we should fill in its id. Otherwise, we've been given
1659 1643 * an id and we need to turn that into a name for our plugin's sake. The
1660 1644 * id is our own fabrication for property discovery.
1661 1645 */
1662 1646 if (oip->oipi_id == -1) {
1663 1647 /*
1664 1648 * Determine if it's a known generic property or it belongs to a
1665 1649 * module by checking against the list of known names.
1666 1650 */
1667 1651 oip->oipi_name[OVERLAY_PROP_NAMELEN-1] = '\0';
1668 1652 if ((ret = overlay_i_name_to_propid(odd, oip->oipi_name,
1669 1653 &propid)) != 0) {
1670 1654 overlay_hold_rele(odd);
1671 1655 mac_perim_exit(mph);
1672 1656 return (ret);
1673 1657 }
1674 1658 oip->oipi_id = propid;
1675 1659 if (propid >= OVERLAY_DEV_NPROPS) {
1676 1660 ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(
1677 1661 oip->oipi_name, phdl);
1678 1662 overlay_hold_rele(odd);
1679 1663 mac_perim_exit(mph);
1680 1664 return (ret);
1681 1665
1682 1666 }
1683 1667 } else if (oip->oipi_id >= OVERLAY_DEV_NPROPS) {
1684 1668 uint_t id = oip->oipi_id - OVERLAY_DEV_NPROPS;
1685 1669
1686 1670 if (id >= odd->odd_plugin->ovp_nprops) {
1687 1671 overlay_hold_rele(odd);
1688 1672 mac_perim_exit(mph);
1689 1673 return (EINVAL);
1690 1674 }
1691 1675 ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(
1692 1676 odd->odd_plugin->ovp_props[id], phdl);
1693 1677 overlay_hold_rele(odd);
1694 1678 mac_perim_exit(mph);
1695 1679 return (ret);
1696 1680 } else if (oip->oipi_id < -1) {
1697 1681 overlay_hold_rele(odd);
1698 1682 mac_perim_exit(mph);
1699 1683 return (EINVAL);
1700 1684 } else {
1701 1685 ASSERT(oip->oipi_id < OVERLAY_DEV_NPROPS);
1702 1686 ASSERT(oip->oipi_id >= 0);
1703 1687 propid = oip->oipi_id;
1704 1688 (void) strlcpy(oip->oipi_name, overlay_dev_props[propid],
1705 1689 sizeof (oip->oipi_name));
1706 1690 }
1707 1691
1708 1692 switch (propid) {
1709 1693 case OVERLAY_DEV_P_MTU:
1710 1694 overlay_i_propinfo_mtu(odd, phdl);
1711 1695 break;
1712 1696 case OVERLAY_DEV_P_VNETID:
1713 1697 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
1714 1698 overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
1715 1699 overlay_prop_set_nodefault(phdl);
1716 1700 break;
1717 1701 case OVERLAY_DEV_P_ENCAP:
1718 1702 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
1719 1703 overlay_prop_set_type(phdl, OVERLAY_PROP_T_STRING);
1720 1704 overlay_prop_set_nodefault(phdl);
1721 1705 overlay_plugin_walk(overlay_propinfo_plugin_cb, phdl);
1722 1706 break;
1723 1707 case OVERLAY_DEV_P_VARPDID:
1724 1708 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
1725 1709 overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
1726 1710 overlay_prop_set_nodefault(phdl);
1727 1711 break;
1728 1712 case OVERLAY_DEV_P_DCID:
1729 1713 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
1730 1714 overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
1731 1715 overlay_prop_set_nodefault(phdl);
1732 1716 overlay_prop_set_range_uint32(phdl, 0, UINT32_MAX);
1733 1717 break;
1734 1718 default:
1735 1719 overlay_hold_rele(odd);
1736 1720 mac_perim_exit(mph);
1737 1721 return (ENOENT);
1738 1722 }
1739 1723
1740 1724 overlay_hold_rele(odd);
1741 1725 mac_perim_exit(mph);
1742 1726 return (0);
1743 1727 }
1744 1728
1745 1729 /* ARGSUSED */
1746 1730 static int
1747 1731 overlay_i_getprop(void *karg, intptr_t arg, int mode, cred_t *cred,
1748 1732 int *rvalp)
1749 1733 {
1750 1734 int ret;
1751 1735 overlay_dev_t *odd;
1752 1736 mac_perim_handle_t mph;
1753 1737 overlay_ioc_prop_t *oip = karg;
1754 1738 uint_t propid, mtu;
1755 1739
1756 1740 odd = overlay_hold_by_dlid(oip->oip_linkid);
1757 1741 if (odd == NULL)
1758 1742 return (ENOENT);
1759 1743
1760 1744 mac_perim_enter_by_mh(odd->odd_mh, &mph);
1761 1745 oip->oip_size = OVERLAY_PROP_SIZEMAX;
1762 1746 oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0';
1763 1747 if (oip->oip_id == -1) {
1764 1748 int i;
1765 1749
1766 1750 for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
1767 1751 if (strcmp(overlay_dev_props[i], oip->oip_name) == 0)
1768 1752 break;
1769 1753 if (i == OVERLAY_DEV_NPROPS) {
1770 1754 ret = odd->odd_plugin->ovp_ops->ovpo_getprop(
1771 1755 odd->odd_pvoid, oip->oip_name,
1772 1756 oip->oip_value, &oip->oip_size);
1773 1757 overlay_hold_rele(odd);
1774 1758 mac_perim_exit(mph);
1775 1759 return (ret);
1776 1760 }
1777 1761 }
1778 1762
1779 1763 propid = i;
1780 1764 } else if (oip->oip_id >= OVERLAY_DEV_NPROPS) {
1781 1765 uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS;
1782 1766
1783 1767 if (id > odd->odd_plugin->ovp_nprops) {
1784 1768 overlay_hold_rele(odd);
1785 1769 mac_perim_exit(mph);
1786 1770 return (EINVAL);
1787 1771 }
1788 1772 ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid,
1789 1773 odd->odd_plugin->ovp_props[id], oip->oip_value,
1790 1774 &oip->oip_size);
1791 1775 overlay_hold_rele(odd);
1792 1776 mac_perim_exit(mph);
1793 1777 return (ret);
1794 1778 } else if (oip->oip_id < -1) {
1795 1779 overlay_hold_rele(odd);
1796 1780 mac_perim_exit(mph);
1797 1781 return (EINVAL);
1798 1782 } else {
1799 1783 ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS);
1800 1784 ASSERT(oip->oip_id >= 0);
1801 1785 propid = oip->oip_id;
1802 1786 }
1803 1787
1804 1788 ret = 0;
1805 1789 switch (propid) {
1806 1790 case OVERLAY_DEV_P_MTU:
1807 1791 /*
1808 1792 * The MTU is always set and retrieved through MAC, to allow for
1809 1793 * MAC to do whatever it wants, as really that property belongs
1810 1794 * to MAC. This is important for things where vnics have hold on
1811 1795 * the MTU.
1812 1796 */
1813 1797 mac_sdu_get(odd->odd_mh, NULL, &mtu);
1814 1798 bcopy(&mtu, oip->oip_value, sizeof (uint_t));
1815 1799 oip->oip_size = sizeof (uint_t);
1816 1800 break;
1817 1801 case OVERLAY_DEV_P_VNETID:
1818 1802 /*
1819 1803 * While it's read-only while inside of a mux, we're not in a
1820 1804 * context that can guarantee that. Therefore we always grab the
1821 1805 * overlay_dev_t's odd_lock.
1822 1806 */
1823 1807 mutex_enter(&odd->odd_lock);
1824 1808 bcopy(&odd->odd_vid, oip->oip_value, sizeof (uint64_t));
1825 1809 mutex_exit(&odd->odd_lock);
1826 1810 oip->oip_size = sizeof (uint64_t);
1827 1811 break;
1828 1812 case OVERLAY_DEV_P_ENCAP:
1829 1813 oip->oip_size = strlcpy((char *)oip->oip_value,
1830 1814 odd->odd_plugin->ovp_name, oip->oip_size);
1831 1815 break;
1832 1816 case OVERLAY_DEV_P_VARPDID:
1833 1817 mutex_enter(&odd->odd_lock);
1834 1818 if (odd->odd_flags & OVERLAY_F_VARPD) {
1835 1819 const uint64_t val = odd->odd_target->ott_id;
1836 1820 bcopy(&val, oip->oip_value, sizeof (uint64_t));
1837 1821 oip->oip_size = sizeof (uint64_t);
1838 1822 } else {
1839 1823 oip->oip_size = 0;
1840 1824 }
1841 1825 mutex_exit(&odd->odd_lock);
1842 1826 break;
1843 1827 case OVERLAY_DEV_P_DCID:
1844 1828 /*
1845 1829 * While it's read-only while inside of a mux, we're not in a
1846 1830 * context that can guarantee that. Therefore we always grab the
1847 1831 * overlay_dev_t's odd_lock.
1848 1832 */
1849 1833 mutex_enter(&odd->odd_lock);
1850 1834 bcopy(&odd->odd_dcid, oip->oip_value, sizeof (uint32_t));
1851 1835 mutex_exit(&odd->odd_lock);
1852 1836 oip->oip_size = sizeof (uint32_t);
1853 1837 break;
1854 1838
1855 1839 default:
1856 1840 ret = ENOENT;
1857 1841 }
1858 1842
1859 1843 overlay_hold_rele(odd);
1860 1844 mac_perim_exit(mph);
1861 1845 return (ret);
1862 1846 }
1863 1847
1864 1848 static void
1865 1849 overlay_setprop_vnetid(overlay_dev_t *odd, uint64_t vnetid)
1866 1850 {
1867 1851 mutex_enter(&odd->odd_lock);
1868 1852
1869 1853 /* Simple case, not active */
1870 1854 if (!(odd->odd_flags & OVERLAY_F_IN_MUX)) {
1871 1855 odd->odd_vid = vnetid;
1872 1856 mutex_exit(&odd->odd_lock);
1873 1857 return;
1874 1858 }
1875 1859
1876 1860 /*
1877 1861 * In the hard case, we need to set the drop flag, quiesce I/O and then
1878 1862 * we can go ahead and do everything.
1879 1863 */
1880 1864 odd->odd_flags |= OVERLAY_F_MDDROP;
1881 1865 overlay_io_wait(odd, OVERLAY_F_IOMASK);
1882 1866 mutex_exit(&odd->odd_lock);
1883 1867
1884 1868 overlay_mux_remove_dev(odd->odd_mux, odd);
1885 1869 mutex_enter(&odd->odd_lock);
1886 1870 odd->odd_vid = vnetid;
1887 1871 mutex_exit(&odd->odd_lock);
1888 1872 overlay_mux_add_dev(odd->odd_mux, odd);
1889 1873
1890 1874 mutex_enter(&odd->odd_lock);
1891 1875 ASSERT(odd->odd_flags & OVERLAY_F_IN_MUX);
1892 1876 odd->odd_flags &= ~OVERLAY_F_IN_MUX;
1893 1877 mutex_exit(&odd->odd_lock);
1894 1878 }
1895 1879
1896 1880 static void
1897 1881 overlay_setprop_dcid(overlay_dev_t *odd, uint32_t dcid)
1898 1882 {
1899 1883 mutex_enter(&odd->odd_lock);
1900 1884
1901 1885 /* Simple case, not active */
1902 1886 if (!(odd->odd_flags & OVERLAY_F_IN_MUX)) {
1903 1887 odd->odd_dcid = dcid;
1904 1888 mutex_exit(&odd->odd_lock);
1905 1889 return;
1906 1890 }
1907 1891
1908 1892 /*
1909 1893 * In the hard case, we need to set the drop flag, quiesce I/O and then
1910 1894 * we can go ahead and do everything.
1911 1895 */
1912 1896 odd->odd_flags |= OVERLAY_F_MDDROP;
1913 1897 overlay_io_wait(odd, OVERLAY_F_IOMASK);
1914 1898 mutex_exit(&odd->odd_lock);
1915 1899
1916 1900 overlay_mux_remove_dev(odd->odd_mux, odd);
1917 1901 mutex_enter(&odd->odd_lock);
1918 1902 odd->odd_dcid = dcid;
1919 1903 mutex_exit(&odd->odd_lock);
1920 1904 overlay_mux_add_dev(odd->odd_mux, odd);
1921 1905
1922 1906 mutex_enter(&odd->odd_lock);
1923 1907 ASSERT(odd->odd_flags & OVERLAY_F_IN_MUX);
1924 1908 odd->odd_flags &= ~OVERLAY_F_IN_MUX;
1925 1909 mutex_exit(&odd->odd_lock);
1926 1910 }
1927 1911
1928 1912 /* ARGSUSED */
1929 1913 static int
1930 1914 overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred,
1931 1915 int *rvalp)
1932 1916 {
1933 1917 int ret;
1934 1918 overlay_dev_t *odd;
1935 1919 overlay_ioc_prop_t *oip = karg;
1936 1920 uint_t propid = UINT_MAX;
1937 1921 mac_perim_handle_t mph;
1938 1922 uint64_t maxid, *vidp, *dcidp;
1939 1923
1940 1924 if (oip->oip_size > OVERLAY_PROP_SIZEMAX)
1941 1925 return (EINVAL);
1942 1926
1943 1927 odd = overlay_hold_by_dlid(oip->oip_linkid);
1944 1928 if (odd == NULL)
1945 1929 return (ENOENT);
1946 1930
1947 1931 oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0';
1948 1932 mac_perim_enter_by_mh(odd->odd_mh, &mph);
1949 1933 mutex_enter(&odd->odd_lock);
1950 1934 if (odd->odd_flags & OVERLAY_F_ACTIVATED) {
1951 1935 mac_perim_exit(mph);
1952 1936 mutex_exit(&odd->odd_lock);
1953 1937 return (ENOTSUP);
1954 1938 }
1955 1939 mutex_exit(&odd->odd_lock);
1956 1940 if (oip->oip_id == -1) {
1957 1941 int i;
1958 1942
1959 1943 for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
1960 1944 if (strcmp(overlay_dev_props[i], oip->oip_name) == 0)
1961 1945 break;
1962 1946 if (i == OVERLAY_DEV_NPROPS) {
1963 1947 ret = odd->odd_plugin->ovp_ops->ovpo_setprop(
1964 1948 odd->odd_pvoid, oip->oip_name,
1965 1949 oip->oip_value, oip->oip_size);
1966 1950 overlay_hold_rele(odd);
1967 1951 mac_perim_exit(mph);
1968 1952 return (ret);
1969 1953 }
1970 1954 }
1971 1955
1972 1956 propid = i;
1973 1957 } else if (oip->oip_id >= OVERLAY_DEV_NPROPS) {
1974 1958 uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS;
1975 1959
1976 1960 if (id > odd->odd_plugin->ovp_nprops) {
1977 1961 mac_perim_exit(mph);
1978 1962 overlay_hold_rele(odd);
1979 1963 return (EINVAL);
1980 1964 }
1981 1965 ret = odd->odd_plugin->ovp_ops->ovpo_setprop(odd->odd_pvoid,
1982 1966 odd->odd_plugin->ovp_props[id], oip->oip_value,
1983 1967 oip->oip_size);
1984 1968 mac_perim_exit(mph);
1985 1969 overlay_hold_rele(odd);
1986 1970 return (ret);
1987 1971 } else if (oip->oip_id < -1) {
1988 1972 mac_perim_exit(mph);
1989 1973 overlay_hold_rele(odd);
1990 1974 return (EINVAL);
1991 1975 } else {
1992 1976 ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS);
1993 1977 ASSERT(oip->oip_id >= 0);
1994 1978 propid = oip->oip_id;
1995 1979 }
1996 1980
1997 1981 ret = 0;
1998 1982 switch (propid) {
1999 1983 case OVERLAY_DEV_P_MTU:
2000 1984 ret = mac_set_prop(odd->odd_mh, MAC_PROP_MTU, "mtu",
2001 1985 oip->oip_value, oip->oip_size);
2002 1986 break;
2003 1987 case OVERLAY_DEV_P_VNETID:
2004 1988 if (oip->oip_size != sizeof (uint64_t)) {
2005 1989 ret = EINVAL;
2006 1990 break;
2007 1991 }
2008 1992 vidp = (uint64_t *)oip->oip_value;
2009 1993 ASSERT(odd->odd_plugin->ovp_id_size <= 8);
2010 1994 maxid = UINT64_MAX;
2011 1995 if (odd->odd_plugin->ovp_id_size != 8)
2012 1996 maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) -
2013 1997 1ULL;
2014 1998 if (*vidp >= maxid) {
2015 1999 ret = EINVAL;
2016 2000 break;
2017 2001 }
2018 2002 overlay_setprop_vnetid(odd, *vidp);
2019 2003 break;
2020 2004 case OVERLAY_DEV_P_ENCAP:
2021 2005 case OVERLAY_DEV_P_VARPDID:
2022 2006 ret = EPERM;
2023 2007 break;
2024 2008 case OVERLAY_DEV_P_DCID:
2025 2009 if (oip->oip_size != sizeof (uint64_t)) {
2026 2010 ret = EINVAL;
2027 2011 break;
2028 2012 }
2029 2013 dcidp = (uint64_t *)oip->oip_value;
2030 2014 if (*dcidp > UINT32_MAX) {
2031 2015 ret = EINVAL;
2032 2016 break;
2033 2017 }
2034 2018 overlay_setprop_dcid(odd, *dcidp);
2035 2019 break;
2036 2020
2037 2021 default:
2038 2022 ret = ENOENT;
2039 2023 }
2040 2024
2041 2025 mac_perim_exit(mph);
2042 2026 overlay_hold_rele(odd);
2043 2027 return (ret);
2044 2028 }
2045 2029
2046 2030 /* ARGSUSED */
2047 2031 static int
2048 2032 overlay_i_status(void *karg, intptr_t arg, int mode, cred_t *cred,
2049 2033 int *rvalp)
2050 2034 {
2051 2035 overlay_dev_t *odd;
2052 2036 overlay_ioc_status_t *os = karg;
2053 2037
2054 2038 odd = overlay_hold_by_dlid(os->ois_linkid);
2055 2039 if (odd == NULL)
2056 2040 return (ENOENT);
2057 2041
2058 2042 mutex_enter(&odd->odd_lock);
2059 2043 if ((odd->odd_flags & OVERLAY_F_DEGRADED) != 0) {
2060 2044 os->ois_status = OVERLAY_I_DEGRADED;
2061 2045 if (odd->odd_fmamsg != NULL) {
2062 2046 (void) strlcpy(os->ois_message, odd->odd_fmamsg,
2063 2047 OVERLAY_STATUS_BUFLEN);
2064 2048 } else {
2065 2049 os->ois_message[0] = '\0';
2066 2050 }
2067 2051
2068 2052 } else {
2069 2053 os->ois_status = OVERLAY_I_OK;
2070 2054 os->ois_message[0] = '\0';
2071 2055 }
2072 2056 mutex_exit(&odd->odd_lock);
2073 2057 overlay_hold_rele(odd);
2074 2058
2075 2059 return (0);
2076 2060 }
2077 2061
2078 2062 static dld_ioc_info_t overlay_ioc_list[] = {
2079 2063 { OVERLAY_IOC_CREATE, DLDCOPYIN, sizeof (overlay_ioc_create_t),
2080 2064 overlay_i_create, secpolicy_dl_config },
2081 2065 { OVERLAY_IOC_ACTIVATE, DLDCOPYIN, sizeof (overlay_ioc_activate_t),
2082 2066 overlay_i_activate, secpolicy_dl_config },
2083 2067 { OVERLAY_IOC_DELETE, DLDCOPYIN, sizeof (overlay_ioc_delete_t),
2084 2068 overlay_i_delete, secpolicy_dl_config },
2085 2069 { OVERLAY_IOC_PROPINFO, DLDCOPYIN | DLDCOPYOUT,
2086 2070 sizeof (overlay_ioc_propinfo_t), overlay_i_propinfo,
2087 2071 secpolicy_dl_config },
2088 2072 { OVERLAY_IOC_GETPROP, DLDCOPYIN | DLDCOPYOUT,
2089 2073 sizeof (overlay_ioc_prop_t), overlay_i_getprop,
2090 2074 secpolicy_dl_config },
2091 2075 { OVERLAY_IOC_SETPROP, DLDCOPYIN,
2092 2076 sizeof (overlay_ioc_prop_t), overlay_i_setprop,
2093 2077 secpolicy_dl_config },
2094 2078 { OVERLAY_IOC_NPROPS, DLDCOPYIN | DLDCOPYOUT,
2095 2079 sizeof (overlay_ioc_nprops_t), overlay_i_nprops,
2096 2080 secpolicy_dl_config },
2097 2081 { OVERLAY_IOC_STATUS, DLDCOPYIN | DLDCOPYOUT,
2098 2082 sizeof (overlay_ioc_status_t), overlay_i_status,
2099 2083 NULL }
2100 2084 };
2101 2085
2102 2086 static int
2103 2087 overlay_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2104 2088 {
2105 2089 int fmcap = DDI_FM_EREPORT_CAPABLE;
2106 2090 if (cmd != DDI_ATTACH)
2107 2091 return (DDI_FAILURE);
2108 2092
2109 2093 if (overlay_dip != NULL || ddi_get_instance(dip) != 0)
2110 2094 return (DDI_FAILURE);
2111 2095
2112 2096 ddi_fm_init(dip, &fmcap, NULL);
2113 2097
2114 2098 if (ddi_create_minor_node(dip, OVERLAY_CTL, S_IFCHR,
2115 2099 ddi_get_instance(dip), DDI_PSEUDO, 0) == DDI_FAILURE)
2116 2100 return (DDI_FAILURE);
2117 2101
2118 2102 if (dld_ioc_register(OVERLAY_IOC, overlay_ioc_list,
2119 2103 DLDIOCCNT(overlay_ioc_list)) != 0) {
2120 2104 ddi_remove_minor_node(dip, OVERLAY_CTL);
2121 2105 return (DDI_FAILURE);
2122 2106 }
2123 2107
2124 2108 overlay_dip = dip;
2125 2109 return (DDI_SUCCESS);
2126 2110 }
2127 2111
2128 2112 /* ARGSUSED */
2129 2113 static int
2130 2114 overlay_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
2131 2115 {
2132 2116 int error;
2133 2117
2134 2118 switch (cmd) {
2135 2119 case DDI_INFO_DEVT2DEVINFO:
2136 2120 *resp = (void *)overlay_dip;
2137 2121 error = DDI_SUCCESS;
2138 2122 break;
2139 2123 case DDI_INFO_DEVT2INSTANCE:
2140 2124 *resp = (void *)0;
2141 2125 error = DDI_SUCCESS;
2142 2126 break;
2143 2127 default:
2144 2128 error = DDI_FAILURE;
2145 2129 break;
2146 2130 }
2147 2131
2148 2132 return (error);
2149 2133 }
2150 2134
2151 2135 static int
2152 2136 overlay_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2153 2137 {
2154 2138 if (cmd != DDI_DETACH)
2155 2139 return (DDI_FAILURE);
2156 2140
2157 2141 mutex_enter(&overlay_dev_lock);
2158 2142 if (!list_is_empty(&overlay_dev_list) || overlay_target_busy()) {
2159 2143 mutex_exit(&overlay_dev_lock);
2160 2144 return (EBUSY);
2161 2145 }
|
↓ open down ↓ |
1042 lines elided |
↑ open up ↑ |
2162 2146 mutex_exit(&overlay_dev_lock);
2163 2147
2164 2148
2165 2149 dld_ioc_unregister(OVERLAY_IOC);
2166 2150 ddi_remove_minor_node(dip, OVERLAY_CTL);
2167 2151 ddi_fm_fini(dip);
2168 2152 overlay_dip = NULL;
2169 2153 return (DDI_SUCCESS);
2170 2154 }
2171 2155
2156 +#define OVERLAY_IOCTL_MASK 0xffffff00
2157 +/* ARGSUSED */
2158 +static int
2159 +overlay_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
2160 + int *rvalp)
2161 +{
2162 + switch (cmd & OVERLAY_IOCTL_MASK) {
2163 + case OVERLAY_TARG_IOCTL:
2164 + return (overlay_target_ioctl(dev, cmd, arg, mode, credp,
2165 + rvalp));
2166 + case OVERLAY_ROUTER_IOCTL:
2167 + return (overlay_router_ioctl(dev, cmd, arg, mode, credp,
2168 + rvalp));
2169 + default:
2170 + return (ENOTTY);
2171 + }
2172 +}
2173 +
2172 2174 static struct cb_ops overlay_cbops = {
2173 2175 overlay_target_open, /* cb_open */
2174 2176 overlay_target_close, /* cb_close */
2175 2177 nodev, /* cb_strategy */
2176 2178 nodev, /* cb_print */
2177 2179 nodev, /* cb_dump */
2178 2180 nodev, /* cb_read */
2179 2181 nodev, /* cb_write */
2180 - overlay_target_ioctl, /* cb_ioctl */
2182 + overlay_ioctl, /* cb_ioctl */
2181 2183 nodev, /* cb_devmap */
2182 2184 nodev, /* cb_mmap */
2183 2185 nodev, /* cb_segmap */
2184 2186 nochpoll, /* cb_chpoll */
2185 2187 ddi_prop_op, /* cb_prop_op */
2186 2188 NULL, /* cb_stream */
2187 2189 D_MP, /* cb_flag */
2188 2190 CB_REV, /* cb_rev */
2189 2191 nodev, /* cb_aread */
2190 2192 nodev, /* cb_awrite */
2191 2193 };
2192 2194
2193 2195 static struct dev_ops overlay_dev_ops = {
2194 2196 DEVO_REV, /* devo_rev */
2195 2197 0, /* devo_refcnt */
2196 2198 overlay_getinfo, /* devo_getinfo */
2197 2199 nulldev, /* devo_identify */
2198 2200 nulldev, /* devo_probe */
2199 2201 overlay_attach, /* devo_attach */
2200 2202 overlay_detach, /* devo_detach */
2201 2203 nulldev, /* devo_reset */
2202 2204 &overlay_cbops, /* devo_cb_ops */
2203 2205 NULL, /* devo_bus_ops */
2204 2206 NULL, /* devo_power */
2205 2207 ddi_quiesce_not_supported /* devo_quiesce */
2206 2208 };
2207 2209
2208 2210 static struct modldrv overlay_modldrv = {
2209 2211 &mod_driverops,
2210 2212 "Overlay Network Driver",
2211 2213 &overlay_dev_ops
2212 2214 };
2213 2215
2214 2216 static struct modlinkage overlay_linkage = {
2215 2217 MODREV_1,
2216 2218 &overlay_modldrv
2217 2219 };
2218 2220
2219 2221 static int
2220 2222 overlay_init(void)
2221 2223 {
2222 2224 mutex_init(&overlay_dev_lock, NULL, MUTEX_DRIVER, NULL);
2223 2225 list_create(&overlay_dev_list, sizeof (overlay_dev_t),
2224 2226 offsetof(overlay_dev_t, odd_link));
2225 2227 overlay_mux_init();
2226 2228 overlay_plugin_init();
2227 2229 overlay_target_init();
2228 2230
2229 2231 return (DDI_SUCCESS);
2230 2232 }
2231 2233
2232 2234 static void
2233 2235 overlay_fini(void)
2234 2236 {
2235 2237 overlay_target_fini();
2236 2238 overlay_plugin_fini();
2237 2239 overlay_mux_fini();
2238 2240 mutex_destroy(&overlay_dev_lock);
2239 2241 list_destroy(&overlay_dev_list);
2240 2242 }
2241 2243
2242 2244 int
2243 2245 _init(void)
2244 2246 {
2245 2247 int err;
2246 2248
2247 2249 if ((err = overlay_init()) != DDI_SUCCESS)
2248 2250 return (err);
2249 2251
2250 2252 mac_init_ops(NULL, "overlay");
2251 2253 err = mod_install(&overlay_linkage);
2252 2254 if (err != DDI_SUCCESS) {
2253 2255 overlay_fini();
2254 2256 return (err);
2255 2257 }
2256 2258
2257 2259 return (0);
2258 2260 }
2259 2261
2260 2262 int
2261 2263 _info(struct modinfo *modinfop)
2262 2264 {
2263 2265 return (mod_info(&overlay_linkage, modinfop));
2264 2266 }
2265 2267
2266 2268 int
2267 2269 _fini(void)
2268 2270 {
2269 2271 int err;
2270 2272
2271 2273 err = mod_remove(&overlay_linkage);
2272 2274 if (err != 0)
2273 2275 return (err);
2274 2276
2275 2277 overlay_fini();
2276 2278 return (0);
2277 2279 }
|
↓ open down ↓ |
87 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX