Print this page
OS-7088 cyclics corked on overlay socket with full queue
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/io/overlay/overlay.c
+++ new/usr/src/uts/common/io/overlay/overlay.c
1 1 /*
2 2 * This file and its contents are supplied under the terms of the
|
↓ open down ↓ |
2 lines elided |
↑ open up ↑ |
3 3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 4 * You may only use this file in accordance with the terms of version
5 5 * 1.0 of the CDDL.
6 6 *
7 7 * A full copy of the text of the CDDL should have accompanied this
8 8 * source. A copy of the CDDL is also available via the Internet at
9 9 * http://www.illumos.org/license/CDDL.
10 10 */
11 11
12 12 /*
13 - * Copyright 2016 Joyent, Inc.
13 + * Copyright 2018 Joyent, Inc.
14 14 */
15 15
16 16 /*
17 17 * Overlay Devices
18 18 *
19 19 * Overlay devices provide a means for creating overlay networks, a means of
20 20 * multiplexing multiple logical, isolated, and discrete layer two and layer
21 21 * three networks on top of one physical network.
22 22 *
23 23 * In general, these overlay devices encapsulate the logic to answer two
24 24 * different questions:
25 25 *
26 26 * 1) How should I transform a packet to put it on the wire?
27 27 * 2) Where should I send a transformed packet?
28 28 *
29 29 * Each overlay device is presented to the user as a GLDv3 device. While the
30 30 * link itself cannot have an IP interface created on top of it, it allows for
31 31 * additional GLDv3 devices, such as a VNIC, to be created on top of it which
32 32 * can be plumbed up with IP interfaces.
33 33 *
34 34 *
35 35 * --------------------
36 36 * General Architecture
37 37 * --------------------
38 38 *
39 39 * The logical overlay device that a user sees in dladm(1M) is a combination of
40 40 * two different components that work together. The first component is this
41 41 * kernel module, which is responsible for answering question one -- how should
42 42 * I transform a packet to put it on the wire.
43 43 *
44 44 * The second component is what we call the virtual ARP daemon, or varpd. It is
45 45 * a userland component that is responsible for answering the second question --
46 46 * Where should I send a transformed packet. Instances of the kernel overlay
47 47 * GLDv3 device ask varpd the question of where should a packet go.
48 48 *
49 49 * The split was done for a few reasons. Importantly, we wanted to keep the act
50 50 * of generating encapsulated packets in the kernel so as to ensure that the
51 51 * general data path was fast and also kept simple. On the flip side, while the
52 52 * question of where should something go may be simple, it may often be
53 53 * complicated and need to interface with several different external or
54 54 * distributed systems. In those cases, it's simpler to allow for the full
55 55 * flexibility of userland to be brought to bear to solve that problem and in
56 56 * general, the path isn't very common.
57 57 *
58 58 * The following is what makes up the logical overlay device that a user would
59 59 * create with dladm(1M).
60 60 *
61 61 * Kernel Userland
62 62 * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
63 63 * . +--------+ +--------+ +--------+ . . .
64 64 * . | VNIC 0 | | VNIC 1 | | VNIC 2 | . . .
65 65 * . +--------+ +--------+ +--------+ . . .
66 66 * . | | | . . .
67 67 * . | | | . . .
68 68 * . +------------+-----------+ . . .
69 69 * . | . . /dev/overlay .
70 70 * . +--------------+ . . . +------------+ .
71 71 * . | | . . . | | .
72 72 * . | Overlay |======*=================| Virtual | .
73 73 * . | GLDv3 Device |========================| ARP Daemon | .
74 74 * . | | . . | | .
75 75 * . +--------------+ . . +------------+ .
76 76 * . | . . | .
77 77 * . | . . | .
78 78 * . +----------------+ . . +--------+ .
79 79 * . | Overlay | . . | varpd | .
80 80 * . | Encapsulation | . . | Lookup | .
81 81 * . | Plugin | . . | Plugin | .
82 82 * . +----------------+ . . +--------+ .
83 83 * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
84 84 *
85 85 *
86 86 * This image shows the two different components and where they live.
87 87 * Importantly, it also shows that both the kernel overlay device and the
88 88 * userland varpd both support plugins. The plugins actually implement the
89 89 * things that users care about and the APIs have been designed to try to
90 90 * minimize the amount of things that a module writer needs to worry about.
91 91 *
92 92 * IDENTIFIERS
93 93 *
94 94 * Every overlay device is defined by a unique identifier which is the overlay
95 95 * identifier. Its purpose is similar to that of a VLAN identifier, it's a
96 96 * unique number that is used to differentiate between different entries on the
97 97 * wire.
98 98 *
99 99 * ENCAPSULATION
100 100 *
101 101 * An overlay encapsulation plugin is a kernel miscellaneous module whose
102 102 * purpose is to contain knowledge about how to transform packets to put them
103 103 * onto the wire and to take them off. An example of an encapsulation plugin is
104 104 * vxlan. It's also how support for things like nvgre or geneve would be brought
105 105 * into the system.
106 106 *
107 107 * Each encapsulation plugin defines a series of operation vectors and
108 108 * properties. For the full details on everything they should provide, please
109 109 * read uts/common/sys/overlay_plugin.h. The encapsulation plugin is responsible
110 110 * for telling the system what information is required to send a packet. For
111 111 * example, vxlan is defined to send everything over a UDP packet and therefore
112 112 * requires a port and an IP address, while nvgre on the other hand is its own
113 113 * IP type and therefore just requires an IP address. In addition, it also
114 114 * provides information about the kind of socket that should be created. This is
115 115 * used by the kernel multiplexor, more of that in the Kernel Components
116 116 * section.
117 117 *
118 118 * LOOKUPS
119 119 *
120 120 * The kernel communicates requests for lookups over the character device
121 121 * /dev/overlay. varpd is responsible for listening for requests on that device
122 122 * and answering them. The character device is specific to the target path and
123 123 * varpd.
124 124 *
125 125 * Much as the kernel overlay module handles the bulk of the scaffolding but
126 126 * leaves the important work to the encapsulation plugin, varpd provides a
127 127 * similar role and leaves the full brunt of lookups to a userland dynamic
128 128 * shared object which implements the logic of lookups.
129 129 *
130 130 * Each lookup plugin defines a series of operation vectors and properties. For
131 131 * the full details on everything that they should provide, please read
132 132 * lib/varpd/libvarpd/libvarpd_provider.h. Essentially, they are given a MAC
133 133 * address and asked to give an address on the physical network that it should
134 134 * be sent to. In addition, they handle questions related to how to handle
135 135 * things like broadcast and multicast traffic, etc.
136 136 *
137 137 * ----------
138 138 * Properties
139 139 * ----------
140 140 *
141 141 * A device from a dladm perspective has a unique set of properties that are
142 142 * combined from three different sources:
143 143 *
144 144 * 1) Generic properties that every overlay device has
145 145 * 2) Properties that are specific to the encapsulation plugin
146 146 * 3) Properties that are specific to the lookup plugin
147 147 *
148 148 * All of these are exposed in a single set of properties in dladm. Note that
149 149 * these are not necessarily traditional link properties. However, if something
150 150 * is both a traditional GLDv3 link property, say the MTU of a device, and a
151 151 * specific property here, then the driver ensures that all existing GLDv3
152 152 * specific means of manipulating it are used and wraps up its private property
153 153 * interfaces to ensure that works.
154 154 *
155 155 * Properties in the second and third category are prefixed with the name of
156 156 * their module. For example, the vxlan encapsulation module has a property
157 157 * called the 'listen_ip'. This property would show up in dladm as
158 158 * 'vxlan/listen_ip'. This allows different plugins to both use similar names
159 159 * for similar properties and to also have independent name spaces so that
160 160 * overlapping names do not conflict with anything else.
161 161 *
162 162 * While the kernel combines both sets one and two into a single coherent view,
163 163 * it does not do anything with respect to the properties that are owned by the
164 164 * lookup plugin -- those are owned wholly by varpd. Instead, libdladm is in
165 165 * charge of bridging these two worlds into one magical experience for the user.
166 166 * It carries the burden of knowing about both overlay specific and varpd
167 167 * specific properties. Importantly, we want to maintain this distinction. We
168 168 * don't want to treat the kernel as an arbitrary key/value store for varpd and
169 169 * we want the kernel to own its own data and not have to ask userland for
170 170 * information that it owns.
171 171 *
172 172 * Every property in the system has the following attributes:
173 173 *
174 174 * o A name
175 175 * o A type
176 176 * o A size
177 177 * o Permissions
178 178 * o Default value
179 179 * o Valid value ranges
180 180 * o A value
181 181 *
182 182 * Everything except for the value is obtained by callers through the propinfo
183 183 * callbacks and a property has a maximum size of OVERLAY_PROP_SIZEMAX,
184 184 * currently 256 bytes.
185 185 *
186 186 * The following are the supported types of properties:
187 187 *
188 188 * OVERLAY_PROP_T_INT
189 189 *
190 190 * A signed integer, its length is 8 bytes, corresponding to a
191 191 * int64_t.
192 192 *
193 193 * OVERLAY_PROP_T_UINT
194 194 *
195 195 * An unsigned integer, its length is 8 bytes, corresponding to a
196 196 * uint64_t.
197 197 *
198 198 * OVERLAY_PROP_T_IP
199 199 *
200 200 * A struct in6_addr, it has a fixed size.
201 201 *
202 202 * OVERLAY_PROP_T_STRING
203 203 *
204 204 * A null-terminated character string encoded in either ASCII or
205 205 * UTF-8. Note that the size of the string includes the null
206 206 * terminator.
207 207 *
208 208 * The next thing that we apply to a property is its permission. The permissions
209 209 * are put together by the bitwise or of the following flags and values.
210 210 *
211 211 * OVERLAY_PROP_PERM_REQ
212 212 *
213 213 * This indicates a required property. A property that is required
214 214 * must be set by a consumer before the device can be created. If a
215 215 * required property has a default property, this constraint is
216 216 * loosened because the default property defines the value.
217 217 *
218 218 * OVERLAY_PROP_PERM_READ
219 219 *
220 220 * This indicates that a property can be read. All properties will
221 221 * have this value set.
222 222 *
223 223 * OVERLAY_PROP_PERM_WRITE
224 224 *
225 225 * This indicates that a property can be written to and thus
226 226 * updated by userland. Properties that are only intended to
227 227 * display information, will not have OVERLAY_PROP_PERM_WRITE set.
228 228 *
229 229 * In addition, a few additional values are defined as a convenience to
230 230 * consumers. The first, OVERLAY_PROP_PERM_RW, is a combination of
231 231 * OVERLAY_PROP_PERM_READ and OVERLAY_PROP_PERM_WRITE. The second,
232 232 * OVERLAY_PROP_PERM_RRW, is a combination of OVERLAY_PROP_PERM_REQ,
233 233 * OVERLAY_PROP_PERM_READ, and OVERLAY_PROP_PERM_WRITE. The protection mode of a
234 234 * property should generally be a constant across its lifetime.
235 235 *
236 236 * A property may optionally have a default value. If it does have a default
237 237 * value, and that property is not set to be a different value, then the default
238 238 * value is inherited automatically. It also means that if the default value is
239 239 * acceptable, there is no need to set the value for a required property. For
240 240 * example, the vxlan module has the vxlan/listen_port property which is
241 241 * required, but has a default value of 4789 (the IANA assigned port). Because
242 242 * of that default value, there is no need for it to be set.
243 243 *
244 244 * Finally, a property may declare a list of valid values. These valid values
245 245 * are used for display purposes, they are not enforced by the broader system,
246 246 * but merely allow a means for the information to be communicated to the user
247 247 * through dladm(1M). Like a default value, this is optional.
248 248 *
249 249 * The general scaffolding does not do very much with respect to the getting and
250 250 * setting of properties. That is really owned by the individual plugins
251 251 * themselves.
252 252 *
253 253 * -----------------------------
254 254 * Destinations and Plugin Types
255 255 * -----------------------------
256 256 *
257 257 * Both encapsulation and lookup plugins define the kinds of destinations that
258 258 * they know how to support. There are three different pieces of information
259 259 * that can be used to address to a destination currently, all of which is
260 260 * summarized in the type overlay_point_t. Any combination of these is
261 261 * supported.
262 262 *
263 263 * OVERLAY_PLUGIN_D_ETHERNET
264 264 *
265 265 * An Ethernet MAC address is required.
266 266 *
267 267 * OVERLAY_PLUGIN_D_IP
268 268 *
269 269 * An IP address is required. All IP addresses used by the overlay
270 270 * system are transmitted as IPv6 addresses. IPv4 addresses can be
271 271 * represented by using IPv4-mapped IPv6 addresses.
272 272 *
273 273 * OVERLAY_PLUGIN_D_PORT
274 274 *
275 275 * A TCP/UDP port is required.
276 276 *
277 277 * A kernel encapsulation plugin declares which of these that it requires, it's
278 278 * a static set. On the other hand, a userland lookup plugin can be built to
279 279 * support all of these or any combination thereof. It gets passed the required
280 280 * destination type, based on the kernel encapsulation method, and then it makes
281 281 * the determination as to whether or not it supports it. For example, the
282 282 * direct plugin can support either an IP or both an IP and a port, it simply
283 283 * doesn't display the direct/dest_port property in the cases where a port is
284 284 * not required to support this.
285 285 *
286 286 * The user lookup plugins have two different modes of operation which
287 287 * determines how they interact with the broader system and how look ups are
288 288 * performed. These types are:
289 289 *
290 290 * OVERLAY_TARGET_POINT
291 291 *
292 292 * A point to point plugin has a single static definition for where
293 293 * to send all traffic. Every packet in the system always gets sent
294 294 * to the exact same destination which is programmed into the
295 295 * kernel when the general device is activated.
296 296 *
297 297 * OVERLAY_TARGET_DYNAMIC
298 298 *
299 299 * A dynamic plugin does not have a single static definition.
300 300 * Instead, for each destination, the kernel makes an asynchronous
301 301 * request to varpd to determine where the packet should be routed,
302 302 * and if a specific destination is found, then that destination is
303 303 * cached in the overlay device's target cache.
304 304 *
305 305 * This distinction, while important for the general overlay device's operation,
306 306 * is not important to the encapsulation plugins. They don't need to know about
307 307 * any of these pieces. It's just a concern for varpd, the userland plugin, and
308 308 * the general overlay scaffolding.
309 309 *
310 310 * When an overlay device is set to OVERLAY_TARGET_POINT, then it does not
311 311 * maintain a target cache, and instead just keeps track of the destination and
312 312 * always sends encapsulated packets to that address. When the target type is of
313 313 * OVERLAY_TARGET_DYNAMIC, then the kernel maintains a cache of all such
314 314 * destinations. These destinations are kept around in an instance of a
315 315 * reference hash that is specific to the given overlay device. Entries in the
316 316 * cache can be invalidated and replaced by varpd and its lookup plugins.
317 317 *
318 318 * ----------------------------------
319 319 * Kernel Components and Architecture
320 320 * ----------------------------------
321 321 *
322 322 * There are multiple pieces inside the kernel that work together, there is the
323 323 * general overlay_dev_t structure, which is the logical GLDv3 device, but it
324 324 * itself has references to things like an instance of an encapsulation plugin,
325 325 * a pointer to a mux and a target cache. It can roughly be summarized in the
326 326 * following image:
327 327 *
328 328 * +------------------+
329 329 * | global |
330 330 * | overlay list |
331 331 * | overlay_dev_list |
332 332 * +------------------+
333 333 * |
334 334 * | +-----------------------+ +---------------+
335 335 * +->| GLDv3 Device |----------->| GLDv3 Device | -> ...
336 336 * | overlay_dev_t | | overlay_dev_t |
337 337 * | | +---------------+
338 338 * | |
339 339 * | mac_handle_t -----+---> GLDv3 handle to MAC
340 340 * | datalink_id_t -----+---> Datalink ID used by DLS
341 341 * | overlay_dev_flag_t ---+---> Device state
342 342 * | uint_t -----+---> Current device MTU
343 343 * | uint_t -----+---> In-progress RX operations
344 344 * | uint_t -----+---> In-progress TX operations
345 345 * | char[] -----+---> FMA degraded message
346 346 * | void * -----+---> plugin private data
347 347 * | overlay_target_t * ---+---------------------+
348 348 * | overlay_plugin_t * ---+---------+ |
349 349 * +-----------------------+ | |
350 350 * ^ | |
351 351 * +--------------------+ | | |
352 352 * | Kernel Socket | | | |
353 353 * | Multiplexor | | | |
354 354 * | overlay_mux_t | | | |
355 355 * | | | | |
356 356 * | avl_tree_t -+--+ | |
357 357 * | uint_t -+--> socket family | |
358 358 * | uint_t -+--> socket type | |
359 359 * | uint_t -+--> socket protocol | |
360 360 * | ksocket_t -+--> I/O socket | |
361 361 * | struct sockaddr * -+--> ksocket address | |
362 362 * | overlay_plugin_t --+--------+ | |
363 363 * +--------------------+ | | |
364 364 * | | |
365 365 * +-------------------------+ | | |
366 366 * | Encap Plugin |<--+-----------+ |
367 367 * | overlay_plugin_t | |
368 368 * | | |
369 369 * | char * ---+--> plugin name |
370 370 * | overlay_plugin_ops_t * -+--> plugin downcalls |
371 371 * | char ** (props) ---+--> property list |
372 372 * | uint_t ---+--> id length |
373 373 * | overlay_plugin_flags_t -+--> plugin flags |
374 374 * | overlay_plugin_dest_t --+--> destination type v
375 375 * +-------------------------+ +-------------------------+
376 376 * | Target Cache |
377 377 * | overlay_target_t |
378 378 * | |
379 379 * cache mode <--+- overlay_target_mode_t |
380 380 * dest type <--+- overlay_plugin_dest_t |
381 381 * cache flags <--+- overlay_target_flag_t |
382 382 * varpd id <--+- uint64_t |
383 383 * outstanding varpd reqs. <--+- uint_t |
384 384 * OVERLAY_TARGET_POINT state <--+- overlay_target_point_t |
385 385 * OVERLAY_TARGET_DYNAMIC state <-+---+- overlay_target_dyn_t |
386 386 * | +-------------------------+
387 387 * +-----------------------+
388 388 * |
389 389 * v
390 390 * +-------------------------------+ +------------------------+
391 391 * | Target Entry |-->| Target Entry |--> ...
392 392 * | overlay_target_entry_t | | overlay_target_entry_t |
393 393 * | | +------------------------+
394 394 * | |
395 395 * | overlay_target_entry_flags_t -+--> Entry flags
396 396 * | uint8_t[ETHERADDRL] ---+--> Target MAC address
397 397 * | overlay_target_point_t ---+--> Target underlay address
398 398 * | mblk_t * ---+--> outstanding mblk head
399 399 * | mblk_t * ---+--> outstanding mblk tail
400 400 * | size_t ---+--> outstanding mblk size
401 401 * +-------------------------------+
402 402 *
403 403 * The primary entries that we care about are the overlay_dev_t, which
404 404 * correspond to each overlay device that is created with dladm(1M). Globally,
405 405 * these devices are maintained in a simple list_t which is protected with a
406 406 * lock. Hence, these include important information such as the mac_handle_t
407 407 * and a datalink_id_t which is used to interact with the broader MAC and DLS
408 408 * ecosystem. We also maintain additional information such as the current state,
409 409 * outstanding operations, the mtu, and importantly, the plugin's private data.
410 410 * This is the instance of an encapsulation plugin that gets created as part of
411 411 * creating an overlay device. Another aspect of this is that the overlay_dev_t
412 412 * also includes information with respect to FMA. For more information, see the
413 413 * FMA section.
414 414 *
415 415 * Each overlay_dev_t has a pointer to a plugin, a mux, and a target. The plugin
416 416 * is the encapsulation plugin. This allows the device to make downcalls into it
417 417 * based on doing things like getting and setting properties. Otherwise, the
418 418 * plugin itself is a fairly straightforward entity. They are maintained in an
419 419 * (not pictured above) list. The plugins themselves mostly maintain things like
420 420 * the static list of properties, what kind of destination they require, and the
421 421 * operations vector. A given module may contain more if necessary.
422 422 *
423 423 * The next piece of the puzzle is the mux, or a multiplexor. The mux itself
424 424 * maintains a ksocket and it is through the mux that we send and receive
425 425 * message blocks. The mux represents a socket type and address, as well as a
426 426 * plugin. Multiple overlay_dev_t devices may then share the same mux. For
427 427 * example, consider the case where you have different instances of vxlan all on
428 428 * the same underlay network. These would all logically share the same IP
429 429 * address and port that packets are sent and received on; however, what differs
430 430 * is the decapsulation ID.
431 431 *
432 432 * Each mux maintains a ksocket_t which is similar to a socket(3SOCKET). Unlike
433 433 * a socket, we enable a direct callback on the ksocket. This means that
434 434 * whenever a message block chain is received, rather than sitting there and
435 435 * getting a callback in a context and kicking that back out to a taskq, the
436 436 * data instead comes into the callback function overlay_mux_recv().
437 437 *
438 438 * The mux is given encapsulated packets (via overlay_m_tx, the GLDv3 tx
439 439 * function) to transmit. It receives encapsulated packets, decapsulates them to
440 440 * determine the overlay identifier, looks up the given device that matches that
441 441 * identifier, and then causes the broader MAC world to receive the packet with
442 442 * a call to mac_rx().
443 443 *
444 444 * Today, we don't do too much that's special with the ksocket; however, as
445 445 * hardware is gaining understanding for these encapsulation protocols, we'll
446 446 * probably want to think of better ways to get those capabilities passed down
447 447 * and potentially better ways to program receive filters so they get directly
448 448 * to us. Though, that's all fantasy future land.
449 449 *
450 450 * The next part of the puzzle is the target cache. The purpose of the target
451 451 * cache is to cache where we should send a packet on the underlay network,
452 452 * given its mac address. The target cache operates in two modes depending on
453 453 * whether the lookup module was declared to OVERLAY_TARGET_POINT or
454 454 * OVERLAY_TARGET_DYNAMIC.
455 455 *
456 456 * In the case where the target cache has been programmed to be
457 457 * OVERLAY_TARGET_POINT, then we only maintain a single overlay_target_point_t
458 458 * which has the destination that we send everything, no matter the destination
459 459 * mac address.
460 460 *
461 461 * On the other hand, when we have an instance of OVERLAY_TARGET_DYNAMIC, things
462 462 * are much more interesting and as a result, more complicated. We primarily
463 463 * store lists of overlay_target_entry_t's which are stored in both an avl tree
464 464 * and a refhash_t. The primary look up path uses the refhash_t and the avl tree
465 465 * is only used for a few of the target ioctls used to dump data such that we
466 466 * can get a consistent iteration order for things like dladm show-overlay -t.
467 467 * The key that we use for the reference hashtable is based on the mac address
468 468 * in the cache and currently we just do a simple CRC32 to transform it into a
469 469 * hash.
470 470 *
471 471 * Each entry maintains a set of flags to indicate the current status of the
472 472 * request. The flags may indicate one of three states: that current cache entry
473 473 * is valid, that the current cache entry has been directed to drop all output,
474 474 * and that the current cache entry is invalid and may be being looked up. In
475 475 * the case where it's valid, we just take the destination address and run with
476 476 * it.
477 477 *
478 478 * If it's invalid and a lookup has not been made, then we start the process
479 479 * that prepares a query that will make its way up to varpd. The cache entry
480 480 * maintains a message block chain of outstanding message blocks and a
481 481 * size. These lists are populated only when we don't know the answer as to
482 482 * where should these be sent. The size entry is used to cap the amount of
483 483 * outstanding data that we don't know the answer to. If we exceed a cap on the
484 484 * amount of outstanding data (currently 1 Mb), then we'll drop any additional
485 485 * packets. Once we get an answer indicating a valid destination, we transmit
486 486 * any outstanding data to that place. The full story on how we look that up
487 487 * is discussed in the section on the Target Cache Life Cycle.
488 488 *
489 489 * ------------------------
490 490 * FMA and Degraded Devices
491 491 * ------------------------
492 492 *
493 493 * Every kernel overlay device keeps track of its FMA state. Today in FMA we
494 494 * cannot represent partitions between resources nor can we represent that a
495 495 * given minor node of a pseudo device has failed -- if we degrade the overlay
496 496 * device, then the entire dev_info_t is degraded. However, we still want to be
497 497 * able to indicate to administrators that things may go wrong.
498 498 *
499 499 * To this end, we've added a notion of a degraded state to every overlay
500 500 * device. This state is primarily dictated by userland and it can happen for
501 501 * various reasons. Generally, because a userland lookup plugin has been
502 502 * partitioned, or something has gone wrong such that there is no longer any
503 503 * userland lookup module for a device, then we'll mark it degraded.
504 504 *
505 505 * As long as any of our minor instances is degraded, then we'll fire off the
506 506 * FMA event to note that. Once the last degraded instance is no longer
507 507 * degraded, then we'll end up telling FMA that we're all clean.
508 508 *
509 509 * To help administrators get a better sense of which of the various minor
510 510 * devices is wrong, we store the odd_fmamsg[] character array. This character
511 511 * array can be fetched with doing a dladm show-overlay -f.
512 512 *
513 513 * Note, that it's important that we do not update the link status of the
514 514 * devices. We want to remain up as much as possible. By changing the link in a
515 515 * degraded state, this may end up making things worse. We may still actually
516 516 * have information in the target cache and if we mark the link down, that'll
517 517 * result in not being able to use it. The reason being that this'll mark all
518 518 * the downstream VNICs down which will go to IP and from there we end up
519 519 * dealing with sadness.
520 520 *
521 521 * -----------------------
522 522 * Target Cache Life Cycle
523 523 * -----------------------
524 524 *
525 525 * This section only applies when we have a lookup plugin of
526 526 * OVERLAY_TARGET_DYNAMIC. None of this applies to those of type
527 527 * OVERLAY_TARGET_POINT.
528 528 *
529 529 * While we got into the target cache in the general architecture section, it's
530 530 * worth going into more details as to how this actually works and showing some
531 531 * examples and state machines. Recall that a target cache entry basically has
532 532 * the following state transition diagram:
533 533 *
534 534 * Initial state
535 535 * . . . . . . first access . . . varpd lookup enqueued
536 536 * . . .
537 537 * . . .
538 538 * +-------+ . +----------+ .
539 539 * | No |------*---->| Invalid |-------*----+
540 540 * | Entry | | Entry | |
541 541 * +-------+ +----------+ |
542 542 * varpd ^ ^ varpd |
543 543 * invalidate | | drop |
544 544 * . . . * * . . v
545 545 * +-------+ | | +---------+
546 546 * | Entry |--->-----+ +----<----| Entry |
547 547 * | Valid |<----------*---------<----| Pending |->-+ varpd
548 548 * +-------+ . +---------+ * . . drop, but
549 549 * . varpd ^ | other queued
550 550 * . success | | entries
551 551 * +-----+
552 552 *
553 553 * When the table is first created, it is empty. As we attempt to lookup entries
554 554 * and we find there is no entry at all, we'll create a new table entry for it.
555 555 * At that point the entry is technically in an invalid state, that means that
556 556 * we have no valid data from varpd. In that case, we'll go ahead and queue the
557 557 * packet into the entry's pending chain, and queue a varpd lookup, setting the
558 558 * OVERLAY_ENTRY_F_PENDING flag in the process.
559 559 *
560 560 * If additional mblk_t's come in for this entry, we end up appending them to
561 561 * the tail of the chain, if and only if, we don't exceed the threshold for the
562 562 * amount of space they can take up. An entry remains pending until we get a
563 563 * varpd reply. If varpd replies with a valid result, we move to the valid
564 564 * entry state, and remove the OVERLAY_ENTRY_F_PENDING flag and set it with one
565 565 * of OVERLAY_ENTRY_F_VALID or OVERLAY_ENTRY_F_DROP as appropriate.
566 566 *
567 567 * Once an entry is valid, it stays valid until user land tells us to invalidate
568 568 * it with an ioctl or replace it, OVERLAY_TARG_CACHE_REMOVE and
569 569 * OVERLAY_TARG_CACHE_SET respectively.
570 570 *
571 571 * If the lookup fails with a call to drop the packet, then the next state is
572 572 * determined by the state of the queue. If the set of outstanding entries is
573 573 * empty, then we just transition back to the invalid state. If instead, the
574 574 * set of outstanding entries is not empty, then we'll queue another entry and
575 575 * stay in the same state, repeating this until the number of requests is
576 576 * drained.
577 577 *
578 578 * The following images describe the flow of a given lookup and where the
579 579 * overlay_target_entry_t is at any given time.
580 580 *
581 581 * +-------------------+
582 582 * | Invalid Entry | An entry starts off as an invalid entry
583 583 * | de:ad:be:ef:00:00 | and only exists in the target cache.
584 584 * +-------------------+
585 585 *
586 586 * ~~~~
587 587 *
588 588 * +---------------------+
589 589 * | Global list_t | A mblk_t comes in for an entry. We
590 590 * | overlay_target_list | append it to the overlay_target_list.
591 591 * +---------------------+
592 592 * |
593 593 * v
594 594 * +-------------------+ +-------------------+
595 595 * | Pending Entry |----->| Pending Entry |--->...
596 596 * | 42:5e:1a:10:d6:2d | | de:ad:be:ef:00:00 |
597 597 * +-------------------+ +-------------------+
598 598 *
599 599 * ~~~~
600 600 *
601 601 * +--------------------------+
602 602 * | /dev/overlay minor state | User land said that it would look up an
603 603 * | overlay_target_hdl_t | entry for us. We remove it from the
604 604 * +--------------------------+ global list and add it to the handle's
605 605 * | outstanding list.
606 606 * |
607 607 * v
608 608 * +-------------------+ +-------------------+
609 609 * | Pending Entry |----->| Pending Entry |
610 610 * | 90:b8:d0:79:02:dd | | de:ad:be:ef:00:00 |
611 611 * +-------------------+ +-------------------+
612 612 *
613 613 * ~~~~
614 614 *
615 615 * +-------------------+
616 616 * | Valid Entry | varpd returned an answer with
617 617 * | de:ad:be:ef:00:00 | OVERLAY_TARG_RESPOND and the target cache
618 618 * | 10.169.23.42:4789 | entry is now populated with a
619 619 * +-------------------+ destination and marked as valid
620 620 *
621 621 *
622 622 * The lookup mechanism is performed via a series of operations on the character
623 623 * pseudo-device /dev/overlay. The only thing that uses this device is the
624 624 * userland daemon varpd. /dev/overlay is a cloneable device, each open of it
625 625 * granting a new minor number which maintains its own state. We maintain this
626 626 * state so that way if an outstanding lookup was queued to something that
627 627 * crashed or closed its handle without responding, we can know about this and
628 628 * thus handle it appropriately.
629 629 *
630 630 * When a lookup is first created it's added to our global list of outstanding
631 631 * lookups. To service requests, userland is required to perform an ioctl to ask
632 632 * for a request. We will block it in the kernel a set amount of time waiting
633 633 * for a request. When we give a request to a given minor instance of the
634 634 * device, we remove it from the global list and append the request to the
635 635 * device's list of outstanding entries, for the reasons we discussed above.
636 636 * When a lookup comes in, we give user land a smaller amount of information
637 637 * specific to that packet, the overlay_targ_lookup_t. It includes a request id
638 638 * to identify this, and then the overlay id, the varpd id, the header and
639 639 * packet size, the source and destination mac address, the SAP, and any
640 640 * potential VLAN header.
641 641 *
642 642 * At that point, it stays in that outstanding list until one of two ioctls are
643 643 * returned: OVERLAY_TARG_RESPOND or OVERLAY_TARG_DROP. During this time,
644 644 * userland may also perform other operations. For example, it may use
645 645 * OVERLAY_TARG_PKT to get a copy of this packet so it can perform more in-depth
646 646 * analysis of what to do beyond what we gave it initially. This is useful for
647 647 * providing proxy arp and the like. Finally, there are two other ioctls that
648 648 * varpd can then do. The first is OVERLAY_TARG_INJECT which injects the
649 649 * non-jumbo frame packet up into that mac device and OVERLAY_TARG_RESEND which
650 650 * causes us to encapsulate and send out the packet they've given us.
651 651 *
652 652 *
653 653 * Finally, through the target cache, several ioctls are provided to allow for
654 654 * interrogation and management of the cache. They allow for individual entries
655 655 * to be retrieved, set, or have the entire table flushed. For the full set of
656 656 * ioctls here and what they do, take a look at uts/common/sys/overlay_target.h.
657 657 *
658 658 * ------------------
659 659 * Sample Packet Flow
660 660 * ------------------
661 661 *
662 662 * There's a lot of pieces here, hopefully an example of how this all fits
663 663 * together will help clarify and elucidate what's going on. We're going to
664 664 * first track an outgoing packet, eg. one that is sent from an IP interface on
665 665 * a VNIC on top of an overlay device, and then we'll look at what it means to
666 666 * respond to that.
667 667 *
668 668 *
669 669 * +----------------+ +--------------+ +------------------+
670 670 * | IP/DLS send |------->| MAC sends it |----------->| mblk_t reaches |
671 671 * | packet to MAC | | to the GLDv3 | | overlay GLDv3 tx |
672 672 * +----------------+ | VNIC device | | overlay_m_tx() |
673 673 * +--------------+ +------------------+
674 674 * |
675 675 * . lookup . cache |
676 676 * . drop . miss v
677 677 * +---------+ . +--------+ . +------------------+
678 678 * | freemsg |<-----*-------| varpd |<---*------| Lookup each mblk |
679 679 * | mblk_t | | lookup | | in the target |
680 680 * +---------+ | queued | | cache |
681 681 * ^ +--------+ +------------------+
682 682 * on send | | | cache
683 683 * error . . * *. . lookup * . . hit
684 684 * | | success v
685 685 * | | +------------------+
686 686 * +-----------------+ +--------------->| call plugin |
687 687 * | Send out | | ovpo_encap() to |
688 688 * | overlay_mux_t's |<----------------------------------| get encap mblk_t |
689 689 * | ksocket | +------------------+
690 690 * +-----------------+
691 691 *
692 692 * The receive end point looks a little different and looks more like:
693 693 *
694 694 * +------------------+ +----------------+ +-----------+
695 695 * | mblk_t comes off |---->| enter netstack |--->| delivered |---+
696 696 * | the physical | | IP stack | | to | * . . direct
697 697 * | device | +----------------+ | ksocket | | callback
698 698 * +------------------+ +-----------+ |
699 699 * . overlay id |
700 700 * . not found v
701 701 * +-----------+ . +-----------------+ +--------------------+
702 702 * | freemsg |<--*------| call plugin |<------| overlay_mux_recv() |
703 703 * | mblk_t | | ovpo_decap() to | +--------------------+
704 704 * +-----------+ | decap mblk_t |
705 705 * +-----------------+
706 706 * |
707 707 * * . . overlay id
708 708 * v found
709 709 * +--------+ +----------------+
710 710 * | adjust |----->| call mac_rx |
711 711 * | mblk_t | | on original |
712 712 * +--------+ | decaped packet |
713 713 * +----------------+
714 714 *
715 715 * ------------------
716 716 * Netstack Awareness
717 717 * ------------------
718 718 *
719 719 * In the above image we note that this enters a netstack. Today the only
720 720 * netstack that can be is the global zone as the overlay driver itself is not
721 721 * exactly netstack aware. What this really means is that varpd cannot run in a
722 722 * non-global zone and an overlay device cannot belong to a non-global zone.
723 723 * Non-global zones can still have a VNIC assigned to them that's been created
724 724 * over the overlay device the same way they would if it had been created over
725 725 * an etherstub or a physical device.
726 726 *
727 727 * The majority of the work to make it netstack aware is straightforward and the
728 728 * biggest thing is to create a netstack module that allows us to hook into
729 729 * netstack (and thus zone) creation and destruction. From there, we need to
730 730 * amend the target cache lookup routines that we discussed earlier to not have
731 731 * a global outstanding list and a global list of handles, but rather, one per
732 732 * netstack.
733 733 *
734 734 * For the mux, we'll need to open the ksocket in the context of the zone, we
735 735 * can likely do this with a properly composed credential, but we'll need to do
736 736 * some more work on that path. Finally, we'll want to make sure the dld ioctls
737 737 * are aware of the zoneid of the caller and we use that appropriately and store
738 738 * it in the overlay_dev_t.
739 739 *
740 740 * -----------
741 741 * GLDv3 Notes
742 742 * -----------
743 743 *
744 744 * The overlay driver implements a GLDv3 device. Parts of GLDv3 are more
745 745 * relevant and other parts are much less relevant for us. For example, the
746 746 * GLDv3 is used to toggle the device being put into and out of promiscuous
747 747 * mode, to program MAC addresses for unicast and multicast hardware filters.
748 748 * Today, an overlay device doesn't have a notion of promiscuous mode nor does
749 749 * it have a notion of unicast and multicast addresses programmed into the
750 750 * device. Instead, for the purposes of the hardware filter, we don't do
751 751 * anything and just always accept new addresses being added and removed.
752 752 *
753 753 * If the GLDv3 start function has not been called, then we will not use this
754 754 * device for I/O purposes. Any calls to transmit or receive should be dropped,
755 755 * though the GLDv3 guarantees us that transmit will not be called without
756 756 * calling start. Similarly, once stop is called, then no packets can be dealt
757 757 * with.
758 758 *
759 759 * Today we don't support the stat interfaces, though there's no good reason
760 760 * that we shouldn't assemble some of the stats based on what we have in the
761 761 * future.
762 762 *
763 763 * When it comes to link properties, many of the traditional link properties do
764 764 * not apply and many others MAC handles for us. For example, we don't need to
765 765 * implement anything for overlay_m_getprop() to deal with returning the MTU, as
766 766 * MAC never calls into us for that. As such, there isn't much of anything to
767 767 * support in terms of properties.
768 768 *
769 769 * Today, we don't support any notion of hardware capabilities. However, if
770 770 * future NIC hardware or other changes to the system cause it to make sense for
771 771 * us to emulate logical groups, then we should do that. However, we still do
772 772 * implement a capab function so that we can identify ourselves as an overlay
773 773 * device to the broader MAC framework. This is done mostly so that a device
774 774 * created on top of us can have fanout rings as we don't try to lie about a
775 775 * speed for our device.
776 776 *
777 777 * The other question is what should be done for a device's MTU and margin. We
778 778 * set our minimum supported MTU to be the minimum value that an IP network may
779 779 * be set to 576 -- which mimics what an etherstub does. On the flip side, we
780 780 * have our upper bound set to 8900. This value comes from the fact that a lot
781 781 * of jumbo networks use their maximum as 9000. As such, we want to reserve 100
782 782 * bytes, which isn't exactly the most accurate number, but it'll be good enough
783 783 * for now. Because of that, our default MTU off of these devices is 1400, as
784 784 * the default MTU for everything is usually 1500 or whatever the underlying
785 785 * device is at; however, this is a bit simpler than asking the netstack what
786 786 * are all the IP interfaces at. It also calls into question how PMTU and PMTU
787 787 * discovery should work here. The challenge, especially for
788 788 * OVERLAY_TARG_DYNAMIC is that the MTU to any of the places will vary and it's
789 789 * not clear that if you have a single bad entry that the overall MTU should be
790 790 * lowered. Instead, we should figure out a better way of determining these
791 791 * kinds of PMTU errors and appropriately alerting the administrator via FMA.
792 792 *
793 793 * Regarding margin, we allow a margin of up to VLAN_TAGSZ depending on whether
794 794 * or not the underlying encapsulation device supports VLAN tags. If it does,
795 795 * then we'll set the margin to allow for it, otherwise, we will not.
796 796 */
797 797
798 798 #include <sys/conf.h>
799 799 #include <sys/errno.h>
800 800 #include <sys/stat.h>
801 801 #include <sys/ddi.h>
802 802 #include <sys/sunddi.h>
803 803 #include <sys/modctl.h>
804 804 #include <sys/policy.h>
805 805 #include <sys/stream.h>
806 806 #include <sys/strsubr.h>
807 807 #include <sys/strsun.h>
808 808 #include <sys/types.h>
809 809 #include <sys/kmem.h>
810 810 #include <sys/param.h>
811 811 #include <sys/sysmacros.h>
812 812 #include <sys/ddifm.h>
813 813
814 814 #include <sys/dls.h>
815 815 #include <sys/dld_ioc.h>
816 816 #include <sys/mac_provider.h>
817 817 #include <sys/mac_client_priv.h>
818 818 #include <sys/mac_ether.h>
819 819 #include <sys/vlan.h>
820 820
821 821 #include <sys/overlay_impl.h>
822 822
823 823 dev_info_t *overlay_dip;
824 824 static kmutex_t overlay_dev_lock;
825 825 static list_t overlay_dev_list;
826 826 static uint8_t overlay_macaddr[ETHERADDRL] =
827 827 { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
828 828
829 829 typedef enum overlay_dev_prop {
830 830 OVERLAY_DEV_P_MTU = 0,
831 831 OVERLAY_DEV_P_VNETID,
832 832 OVERLAY_DEV_P_ENCAP,
833 833 OVERLAY_DEV_P_VARPDID
834 834 } overlay_dev_prop_t;
835 835
836 836 #define OVERLAY_DEV_NPROPS 4
837 837 static const char *overlay_dev_props[] = {
838 838 "mtu",
839 839 "vnetid",
840 840 "encap",
841 841 "varpd/id"
842 842 };
843 843
844 844 #define OVERLAY_MTU_MIN 576
845 845 #define OVERLAY_MTU_DEF 1400
846 846 #define OVERLAY_MTU_MAX 8900
847 847
848 848 overlay_dev_t *
849 849 overlay_hold_by_dlid(datalink_id_t id)
850 850 {
851 851 overlay_dev_t *o;
852 852
853 853 mutex_enter(&overlay_dev_lock);
854 854 for (o = list_head(&overlay_dev_list); o != NULL;
855 855 o = list_next(&overlay_dev_list, o)) {
856 856 if (id == o->odd_linkid) {
857 857 mutex_enter(&o->odd_lock);
858 858 o->odd_ref++;
859 859 mutex_exit(&o->odd_lock);
860 860 mutex_exit(&overlay_dev_lock);
861 861 return (o);
862 862 }
863 863 }
864 864
865 865 mutex_exit(&overlay_dev_lock);
866 866 return (NULL);
867 867 }
868 868
869 869 void
870 870 overlay_hold_rele(overlay_dev_t *odd)
871 871 {
872 872 mutex_enter(&odd->odd_lock);
873 873 ASSERT(odd->odd_ref > 0);
874 874 odd->odd_ref--;
875 875 mutex_exit(&odd->odd_lock);
876 876 }
877 877
878 878 void
879 879 overlay_io_start(overlay_dev_t *odd, overlay_dev_flag_t flag)
880 880 {
881 881 ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX);
882 882 ASSERT(MUTEX_HELD(&odd->odd_lock));
883 883
884 884 if (flag & OVERLAY_F_IN_RX)
885 885 odd->odd_rxcount++;
886 886 if (flag & OVERLAY_F_IN_TX)
887 887 odd->odd_txcount++;
888 888 odd->odd_flags |= flag;
889 889 }
890 890
891 891 void
892 892 overlay_io_done(overlay_dev_t *odd, overlay_dev_flag_t flag)
893 893 {
894 894 boolean_t signal = B_FALSE;
895 895
896 896 ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX);
897 897 ASSERT(MUTEX_HELD(&odd->odd_lock));
898 898
899 899 if (flag & OVERLAY_F_IN_RX) {
900 900 ASSERT(odd->odd_rxcount > 0);
901 901 odd->odd_rxcount--;
902 902 if (odd->odd_rxcount == 0) {
903 903 signal = B_TRUE;
904 904 odd->odd_flags &= ~OVERLAY_F_IN_RX;
905 905 }
906 906 }
907 907 if (flag & OVERLAY_F_IN_TX) {
908 908 ASSERT(odd->odd_txcount > 0);
909 909 odd->odd_txcount--;
910 910 if (odd->odd_txcount == 0) {
911 911 signal = B_TRUE;
912 912 odd->odd_flags &= ~OVERLAY_F_IN_TX;
913 913 }
914 914 }
915 915
916 916 if (signal == B_TRUE)
917 917 cv_broadcast(&odd->odd_iowait);
918 918 }
919 919
920 920 static void
921 921 overlay_io_wait(overlay_dev_t *odd, overlay_dev_flag_t flag)
922 922 {
923 923 ASSERT((flag & ~OVERLAY_F_IOMASK) == 0);
924 924 ASSERT(MUTEX_HELD(&odd->odd_lock));
925 925
926 926 while (odd->odd_flags & flag) {
927 927 cv_wait(&odd->odd_iowait, &odd->odd_lock);
928 928 }
929 929 }
930 930
931 931 void
932 932 overlay_dev_iter(overlay_dev_iter_f func, void *arg)
933 933 {
934 934 overlay_dev_t *odd;
935 935
936 936 mutex_enter(&overlay_dev_lock);
937 937 for (odd = list_head(&overlay_dev_list); odd != NULL;
938 938 odd = list_next(&overlay_dev_list, odd)) {
939 939 if (func(odd, arg) != 0) {
940 940 mutex_exit(&overlay_dev_lock);
941 941 return;
942 942 }
943 943 }
944 944 mutex_exit(&overlay_dev_lock);
945 945 }
946 946
947 947 /* ARGSUSED */
948 948 static int
949 949 overlay_m_stat(void *arg, uint_t stat, uint64_t *val)
950 950 {
951 951 return (ENOTSUP);
952 952 }
953 953
954 954 static int
955 955 overlay_m_start(void *arg)
956 956 {
957 957 overlay_dev_t *odd = arg;
958 958 overlay_mux_t *mux;
959 959 int ret, domain, family, prot;
960 960 struct sockaddr_storage storage;
961 961 socklen_t slen;
962 962
963 963 mutex_enter(&odd->odd_lock);
964 964 if ((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0) {
965 965 mutex_exit(&odd->odd_lock);
966 966 return (EAGAIN);
967 967 }
968 968 mutex_exit(&odd->odd_lock);
969 969
970 970 ret = odd->odd_plugin->ovp_ops->ovpo_socket(odd->odd_pvoid, &domain,
971 971 &family, &prot, (struct sockaddr *)&storage, &slen);
972 972 if (ret != 0)
973 973 return (ret);
974 974
975 975 mux = overlay_mux_open(odd->odd_plugin, domain, family, prot,
976 976 (struct sockaddr *)&storage, slen, &ret);
977 977 if (mux == NULL)
978 978 return (ret);
979 979
980 980 overlay_mux_add_dev(mux, odd);
981 981 odd->odd_mux = mux;
982 982 mutex_enter(&odd->odd_lock);
983 983 ASSERT(!(odd->odd_flags & OVERLAY_F_IN_MUX));
984 984 odd->odd_flags |= OVERLAY_F_IN_MUX;
985 985 mutex_exit(&odd->odd_lock);
986 986
987 987 return (0);
988 988 }
989 989
990 990 static void
991 991 overlay_m_stop(void *arg)
992 992 {
993 993 overlay_dev_t *odd = arg;
994 994
995 995 /*
996 996 * The MAC Perimeter is held here, so we don't have to worry about
997 997 * synchronizing this with respect to metadata operations.
998 998 */
999 999 mutex_enter(&odd->odd_lock);
1000 1000 VERIFY(odd->odd_flags & OVERLAY_F_IN_MUX);
1001 1001 VERIFY(!(odd->odd_flags & OVERLAY_F_MDDROP));
1002 1002 odd->odd_flags |= OVERLAY_F_MDDROP;
1003 1003 overlay_io_wait(odd, OVERLAY_F_IOMASK);
1004 1004 mutex_exit(&odd->odd_lock);
1005 1005
1006 1006 overlay_mux_remove_dev(odd->odd_mux, odd);
1007 1007 overlay_mux_close(odd->odd_mux);
1008 1008 odd->odd_mux = NULL;
1009 1009
1010 1010 mutex_enter(&odd->odd_lock);
1011 1011 odd->odd_flags &= ~OVERLAY_F_IN_MUX;
1012 1012 odd->odd_flags &= ~OVERLAY_F_MDDROP;
1013 1013 VERIFY((odd->odd_flags & OVERLAY_F_STOPMASK) == 0);
1014 1014 mutex_exit(&odd->odd_lock);
1015 1015 }
1016 1016
1017 1017 /*
1018 1018 * For more info on this, see the big theory statement.
1019 1019 */
1020 1020 /* ARGSUSED */
1021 1021 static int
1022 1022 overlay_m_promisc(void *arg, boolean_t on)
1023 1023 {
1024 1024 return (0);
1025 1025 }
1026 1026
1027 1027 /*
1028 1028 * For more info on this, see the big theory statement.
1029 1029 */
1030 1030 /* ARGSUSED */
1031 1031 static int
1032 1032 overlay_m_multicast(void *arg, boolean_t add, const uint8_t *addrp)
1033 1033 {
1034 1034 return (0);
1035 1035 }
1036 1036
1037 1037 /*
1038 1038 * For more info on this, see the big theory statement.
1039 1039 */
1040 1040 /* ARGSUSED */
1041 1041 static int
1042 1042 overlay_m_unicast(void *arg, const uint8_t *macaddr)
1043 1043 {
1044 1044 return (0);
1045 1045 }
1046 1046
|
↓ open down ↓ |
1023 lines elided |
↑ open up ↑ |
1047 1047 mblk_t *
1048 1048 overlay_m_tx(void *arg, mblk_t *mp_chain)
1049 1049 {
1050 1050 overlay_dev_t *odd = arg;
1051 1051 mblk_t *mp, *ep;
1052 1052 int ret;
1053 1053 ovep_encap_info_t einfo;
1054 1054 struct msghdr hdr;
1055 1055
1056 1056 mutex_enter(&odd->odd_lock);
1057 + ASSERT0(odd->odd_flags & OVERLAY_F_TXSTOPPED);
1057 1058 if ((odd->odd_flags & OVERLAY_F_MDDROP) ||
1058 1059 !(odd->odd_flags & OVERLAY_F_IN_MUX)) {
1059 1060 mutex_exit(&odd->odd_lock);
1060 1061 freemsgchain(mp_chain);
1061 1062 return (NULL);
1062 1063 }
1063 1064 overlay_io_start(odd, OVERLAY_F_IN_TX);
1064 1065 mutex_exit(&odd->odd_lock);
1065 1066
1066 1067 bzero(&hdr, sizeof (struct msghdr));
1067 1068
1068 1069 bzero(&einfo, sizeof (ovep_encap_info_t));
1069 1070 einfo.ovdi_id = odd->odd_vid;
1070 1071 mp = mp_chain;
1071 1072 while (mp != NULL) {
1072 1073 socklen_t slen;
1073 1074 struct sockaddr_storage storage;
1075 +#ifdef OVERLAY_FC_TEST
1076 + /* Can deal with it being NULL later... */
1077 + mblk_t *save_mp = msgpullup(mp, -1);
1078 +#endif
1074 1079
1075 1080 mp_chain = mp->b_next;
1076 1081 mp->b_next = NULL;
1077 1082 ep = NULL;
1078 1083
1079 1084 ret = overlay_target_lookup(odd, mp,
1080 1085 (struct sockaddr *)&storage, &slen);
1081 1086 if (ret != OVERLAY_TARGET_OK) {
1082 1087 if (ret == OVERLAY_TARGET_DROP)
1083 1088 freemsg(mp);
1084 1089 mp = mp_chain;
1090 +#ifdef OVERLAY_FC_TEST
1091 + freemsg(save_mp); /* Handles NULL and non-NULL */
1092 +#endif
1085 1093 continue;
1086 1094 }
1087 1095
1088 1096 hdr.msg_name = &storage;
1089 1097 hdr.msg_namelen = slen;
1090 1098
1091 1099 ret = odd->odd_plugin->ovp_ops->ovpo_encap(odd->odd_mh, mp,
1092 1100 &einfo, &ep);
1093 1101 if (ret != 0 || ep == NULL) {
1102 +#ifdef OVERLAY_FC_TEST
1103 + freemsg(save_mp); /* Handles NULL and non-NULL */
1104 +#endif
1094 1105 freemsg(mp);
1095 1106 goto out;
1096 1107 }
1097 1108
1098 1109 ASSERT(ep->b_cont == mp || ep == mp);
1099 1110 ret = overlay_mux_tx(odd->odd_mux, &hdr, ep);
1100 - if (ret != 0)
1111 + if (ret != 0) {
1112 + if (ret != EWOULDBLOCK) {
1113 + /*
1114 + * Get rid of the packets, something ELSE is
1115 + * wrong with the socket, and we really should
1116 + * just drop the packets for now.
1117 + */
1118 +#ifdef OVERLAY_FC_TEST
1119 + freemsg(save_mp);
1120 + save_mp = NULL;
1121 +#endif
1122 + freemsgchain(mp_chain);
1123 + mp_chain = NULL;
1124 + }
1125 +#ifdef OVERLAY_FC_TEST
1126 + if (save_mp != NULL) {
1127 + /*
1128 + * Return the dropped mp here to see how
1129 + * upper-layer MAC reacts to it.
1130 + */
1131 + save_mp->b_next = mp_chain;
1132 + mp_chain = save_mp;
1133 + }
1134 +#endif
1135 + /*
1136 + * EWOULDBLOCK is a special case. Return the rest of
1137 + * the mp_chain to MAC and have this instance be
1138 + * marked as unable to transmit. Re-enable this
1139 + * instance when the mux's socket is able to send data
1140 + * again ("cansend" callback).
1141 + */
1101 1142 goto out;
1143 + }
1102 1144
1103 1145 mp = mp_chain;
1104 1146 }
1105 1147
1106 1148 out:
1107 1149 mutex_enter(&odd->odd_lock);
1108 1150 overlay_io_done(odd, OVERLAY_F_IN_TX);
1151 + if (mp_chain != NULL) {
1152 + /* Note that we're returning an unsent chain to MAC. */
1153 + odd->odd_flags |= OVERLAY_F_TXSTOPPED;
1154 + }
1109 1155 mutex_exit(&odd->odd_lock);
1110 1156 return (mp_chain);
1111 1157 }
1112 1158
1113 1159 /* ARGSUSED */
1114 1160 static void
1115 1161 overlay_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
1116 1162 {
1117 1163 miocnak(q, mp, 0, ENOTSUP);
1118 1164 }
1119 1165
1120 1166 /* ARGSUSED */
1121 1167 static boolean_t
1122 1168 overlay_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
1123 1169 {
1124 1170 /*
1125 1171 * Tell MAC we're an overlay.
1126 1172 */
1127 1173 if (cap == MAC_CAPAB_OVERLAY)
1128 1174 return (B_TRUE);
1129 1175 return (B_FALSE);
1130 1176 }
1131 1177
1132 1178 /* ARGSUSED */
1133 1179 static int
1134 1180 overlay_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1135 1181 uint_t pr_valsize, const void *pr_val)
1136 1182 {
1137 1183 uint32_t mtu, old;
1138 1184 int err;
1139 1185 overlay_dev_t *odd = arg;
1140 1186
1141 1187 if (pr_num != MAC_PROP_MTU)
1142 1188 return (ENOTSUP);
1143 1189
1144 1190 bcopy(pr_val, &mtu, sizeof (mtu));
1145 1191 if (mtu < OVERLAY_MTU_MIN || mtu > OVERLAY_MTU_MAX)
1146 1192 return (EINVAL);
1147 1193
1148 1194 mutex_enter(&odd->odd_lock);
1149 1195 old = odd->odd_mtu;
1150 1196 odd->odd_mtu = mtu;
1151 1197 err = mac_maxsdu_update(odd->odd_mh, mtu);
1152 1198 if (err != 0)
1153 1199 odd->odd_mtu = old;
1154 1200 mutex_exit(&odd->odd_lock);
1155 1201
1156 1202 return (err);
1157 1203 }
1158 1204
1159 1205 /* ARGSUSED */
1160 1206 static int
1161 1207 overlay_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1162 1208 uint_t pr_valsize, void *pr_val)
1163 1209 {
1164 1210 return (ENOTSUP);
1165 1211 }
1166 1212
1167 1213 /* ARGSUSED */
1168 1214 static void
1169 1215 overlay_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
1170 1216 mac_prop_info_handle_t prh)
1171 1217 {
1172 1218 if (pr_num != MAC_PROP_MTU)
1173 1219 return;
1174 1220
1175 1221 mac_prop_info_set_default_uint32(prh, OVERLAY_MTU_DEF);
1176 1222 mac_prop_info_set_range_uint32(prh, OVERLAY_MTU_MIN, OVERLAY_MTU_MAX);
1177 1223 }
1178 1224
1179 1225 static mac_callbacks_t overlay_m_callbacks = {
1180 1226 .mc_callbacks = (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP |
1181 1227 MC_PROPINFO),
1182 1228 .mc_getstat = overlay_m_stat,
1183 1229 .mc_start = overlay_m_start,
1184 1230 .mc_stop = overlay_m_stop,
1185 1231 .mc_setpromisc = overlay_m_promisc,
1186 1232 .mc_multicst = overlay_m_multicast,
1187 1233 .mc_unicst = overlay_m_unicast,
1188 1234 .mc_tx = overlay_m_tx,
1189 1235 .mc_ioctl = overlay_m_ioctl,
1190 1236 .mc_getcapab = overlay_m_getcapab,
1191 1237 .mc_getprop = overlay_m_getprop,
1192 1238 .mc_setprop = overlay_m_setprop,
1193 1239 .mc_propinfo = overlay_m_propinfo
1194 1240 };
1195 1241
1196 1242 static boolean_t
1197 1243 overlay_valid_name(const char *name, size_t buflen)
1198 1244 {
1199 1245 size_t actlen;
1200 1246 int err, i;
1201 1247
1202 1248 for (i = 0; i < buflen; i++) {
1203 1249 if (name[i] == '\0')
1204 1250 break;
1205 1251 }
1206 1252
1207 1253 if (i == 0 || i == buflen)
1208 1254 return (B_FALSE);
1209 1255 actlen = i;
1210 1256 if (strchr(name, '/') != NULL)
1211 1257 return (B_FALSE);
1212 1258 if (u8_validate((char *)name, actlen, NULL,
1213 1259 U8_VALIDATE_ENTIRE, &err) < 0)
1214 1260 return (B_FALSE);
1215 1261 return (B_TRUE);
1216 1262 }
1217 1263
1218 1264 /* ARGSUSED */
1219 1265 static int
1220 1266 overlay_i_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
1221 1267 {
1222 1268 int err;
1223 1269 uint64_t maxid;
1224 1270 overlay_dev_t *odd, *o;
1225 1271 mac_register_t *mac;
1226 1272 overlay_ioc_create_t *oicp = karg;
1227 1273
1228 1274 if (overlay_valid_name(oicp->oic_encap, MAXLINKNAMELEN) == B_FALSE)
1229 1275 return (EINVAL);
1230 1276
1231 1277 odd = kmem_zalloc(sizeof (overlay_dev_t), KM_SLEEP);
1232 1278 odd->odd_linkid = oicp->oic_linkid;
1233 1279 odd->odd_plugin = overlay_plugin_lookup(oicp->oic_encap);
1234 1280 if (odd->odd_plugin == NULL) {
1235 1281 kmem_free(odd, sizeof (overlay_dev_t));
1236 1282 return (ENOENT);
1237 1283 }
1238 1284 err = odd->odd_plugin->ovp_ops->ovpo_init((overlay_handle_t)odd,
1239 1285 &odd->odd_pvoid);
1240 1286 if (err != 0) {
1241 1287 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1242 1288 overlay_plugin_rele(odd->odd_plugin);
1243 1289 kmem_free(odd, sizeof (overlay_dev_t));
1244 1290 return (EINVAL);
1245 1291 }
1246 1292
1247 1293 /*
1248 1294 * Make sure that our virtual network id is valid for the given plugin
1249 1295 * that we're working with.
1250 1296 */
1251 1297 ASSERT(odd->odd_plugin->ovp_id_size <= 8);
1252 1298 maxid = UINT64_MAX;
1253 1299 if (odd->odd_plugin->ovp_id_size != 8)
1254 1300 maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) - 1ULL;
1255 1301 if (oicp->oic_vnetid > maxid) {
1256 1302 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1257 1303 overlay_plugin_rele(odd->odd_plugin);
1258 1304 kmem_free(odd, sizeof (overlay_dev_t));
1259 1305 return (EINVAL);
1260 1306 }
1261 1307 odd->odd_vid = oicp->oic_vnetid;
1262 1308
1263 1309 mac = mac_alloc(MAC_VERSION);
1264 1310 if (mac == NULL) {
1265 1311 mutex_exit(&overlay_dev_lock);
1266 1312 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1267 1313 overlay_plugin_rele(odd->odd_plugin);
1268 1314 kmem_free(odd, sizeof (overlay_dev_t));
1269 1315 return (EINVAL);
1270 1316 }
1271 1317
1272 1318 mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1273 1319 mac->m_driver = odd;
1274 1320 mac->m_dip = overlay_dip;
1275 1321 mac->m_dst_addr = NULL;
1276 1322 mac->m_callbacks = &overlay_m_callbacks;
1277 1323 mac->m_pdata = NULL;
1278 1324 mac->m_pdata_size = 0;
1279 1325
1280 1326 mac->m_priv_props = NULL;
1281 1327
1282 1328 /* Let mac handle this itself. */
1283 1329 mac->m_instance = (uint_t)-1;
1284 1330
1285 1331 /*
1286 1332 * There is no real source address that should be used here, but saying
1287 1333 * that we're not ethernet is going to cause its own problems. At the
1288 1334 * end of the day, this is fine.
1289 1335 */
1290 1336 mac->m_src_addr = overlay_macaddr;
1291 1337
1292 1338 /*
1293 1339 * Start with the default MTU as the max SDU. If the MTU is changed, the
1294 1340 * SDU will be changed to reflect that.
1295 1341 */
1296 1342 mac->m_min_sdu = 1;
1297 1343 mac->m_max_sdu = OVERLAY_MTU_DEF;
1298 1344 mac->m_multicast_sdu = 0;
1299 1345
1300 1346 /*
1301 1347 * The underlying device doesn't matter, instead this comes from the
1302 1348 * encapsulation protocol and whether or not they allow VLAN tags.
1303 1349 */
1304 1350 if (odd->odd_plugin->ovp_flags & OVEP_F_VLAN_TAG) {
1305 1351 mac->m_margin = VLAN_TAGSZ;
1306 1352 } else {
1307 1353 mac->m_margin = 0;
1308 1354 }
1309 1355
1310 1356 /*
1311 1357 * Today, we have no MAC virtualization, it may make sense in the future
1312 1358 * to go ahead and emulate some subset of this, but it doesn't today.
1313 1359 */
1314 1360 mac->m_v12n = MAC_VIRT_NONE;
1315 1361
1316 1362 mutex_enter(&overlay_dev_lock);
1317 1363 for (o = list_head(&overlay_dev_list); o != NULL;
1318 1364 o = list_next(&overlay_dev_list, o)) {
1319 1365 if (o->odd_linkid == oicp->oic_linkid) {
1320 1366 mutex_exit(&overlay_dev_lock);
1321 1367 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1322 1368 overlay_plugin_rele(odd->odd_plugin);
1323 1369 kmem_free(odd, sizeof (overlay_dev_t));
1324 1370 return (EEXIST);
1325 1371 }
1326 1372
1327 1373 if (o->odd_vid == oicp->oic_vnetid &&
1328 1374 o->odd_plugin == odd->odd_plugin) {
1329 1375 mutex_exit(&overlay_dev_lock);
1330 1376 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1331 1377 overlay_plugin_rele(odd->odd_plugin);
1332 1378 kmem_free(odd, sizeof (overlay_dev_t));
1333 1379 return (EEXIST);
1334 1380 }
1335 1381 }
1336 1382
1337 1383 err = mac_register(mac, &odd->odd_mh);
1338 1384 mac_free(mac);
1339 1385 if (err != 0) {
1340 1386 mutex_exit(&overlay_dev_lock);
1341 1387 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1342 1388 overlay_plugin_rele(odd->odd_plugin);
1343 1389 kmem_free(odd, sizeof (overlay_dev_t));
1344 1390 return (err);
1345 1391 }
1346 1392
1347 1393 err = dls_devnet_create(odd->odd_mh, odd->odd_linkid,
1348 1394 crgetzoneid(cred));
1349 1395 if (err != 0) {
1350 1396 mutex_exit(&overlay_dev_lock);
1351 1397 (void) mac_unregister(odd->odd_mh);
1352 1398 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1353 1399 overlay_plugin_rele(odd->odd_plugin);
1354 1400 kmem_free(odd, sizeof (overlay_dev_t));
1355 1401 return (err);
1356 1402 }
1357 1403
1358 1404 mutex_init(&odd->odd_lock, NULL, MUTEX_DRIVER, NULL);
1359 1405 cv_init(&odd->odd_iowait, NULL, CV_DRIVER, NULL);
1360 1406 odd->odd_ref = 0;
1361 1407 odd->odd_flags = 0;
1362 1408 list_insert_tail(&overlay_dev_list, odd);
1363 1409 mutex_exit(&overlay_dev_lock);
1364 1410
1365 1411 return (0);
1366 1412 }
1367 1413
1368 1414 /* ARGSUSED */
1369 1415 static int
1370 1416 overlay_i_activate(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
1371 1417 {
1372 1418 int i, ret;
1373 1419 overlay_dev_t *odd;
1374 1420 mac_perim_handle_t mph;
1375 1421 overlay_ioc_activate_t *oiap = karg;
1376 1422 overlay_ioc_propinfo_t *infop;
1377 1423 overlay_ioc_prop_t *oip;
1378 1424 overlay_prop_handle_t phdl;
1379 1425
1380 1426 odd = overlay_hold_by_dlid(oiap->oia_linkid);
1381 1427 if (odd == NULL)
1382 1428 return (ENOENT);
1383 1429
1384 1430 infop = kmem_alloc(sizeof (overlay_ioc_propinfo_t), KM_SLEEP);
1385 1431 oip = kmem_alloc(sizeof (overlay_ioc_prop_t), KM_SLEEP);
1386 1432 phdl = (overlay_prop_handle_t)infop;
1387 1433
1388 1434 mac_perim_enter_by_mh(odd->odd_mh, &mph);
1389 1435 mutex_enter(&odd->odd_lock);
1390 1436 if (odd->odd_flags & OVERLAY_F_ACTIVATED) {
1391 1437 mutex_exit(&odd->odd_lock);
1392 1438 mac_perim_exit(mph);
1393 1439 overlay_hold_rele(odd);
1394 1440 kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1395 1441 kmem_free(oip, sizeof (overlay_ioc_prop_t));
1396 1442 return (EEXIST);
1397 1443 }
1398 1444 mutex_exit(&odd->odd_lock);
1399 1445
1400 1446 for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) {
1401 1447 const char *pname = odd->odd_plugin->ovp_props[i];
1402 1448 bzero(infop, sizeof (overlay_ioc_propinfo_t));
1403 1449 overlay_prop_init(phdl);
1404 1450 ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(pname, phdl);
1405 1451 if (ret != 0) {
1406 1452 mac_perim_exit(mph);
1407 1453 overlay_hold_rele(odd);
1408 1454 kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1409 1455 kmem_free(oip, sizeof (overlay_ioc_prop_t));
1410 1456 return (ret);
1411 1457 }
1412 1458
1413 1459 if ((infop->oipi_prot & OVERLAY_PROP_PERM_REQ) == 0)
1414 1460 continue;
1415 1461 bzero(oip, sizeof (overlay_ioc_prop_t));
1416 1462 oip->oip_size = sizeof (oip->oip_value);
1417 1463 ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid,
1418 1464 pname, oip->oip_value, &oip->oip_size);
1419 1465 if (ret != 0) {
1420 1466 mac_perim_exit(mph);
1421 1467 overlay_hold_rele(odd);
1422 1468 kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1423 1469 kmem_free(oip, sizeof (overlay_ioc_prop_t));
1424 1470 return (ret);
1425 1471 }
1426 1472 if (oip->oip_size == 0) {
1427 1473 mac_perim_exit(mph);
1428 1474 overlay_hold_rele(odd);
1429 1475 kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1430 1476 kmem_free(oip, sizeof (overlay_ioc_prop_t));
1431 1477 return (EINVAL);
1432 1478 }
1433 1479 }
1434 1480
1435 1481 mutex_enter(&odd->odd_lock);
1436 1482 if ((odd->odd_flags & OVERLAY_F_VARPD) == 0) {
1437 1483 mutex_exit(&odd->odd_lock);
1438 1484 mac_perim_exit(mph);
1439 1485 overlay_hold_rele(odd);
1440 1486 kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1441 1487 kmem_free(oip, sizeof (overlay_ioc_prop_t));
1442 1488 return (ENXIO);
1443 1489 }
1444 1490
1445 1491 ASSERT((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0);
1446 1492 odd->odd_flags |= OVERLAY_F_ACTIVATED;
1447 1493
1448 1494 /*
1449 1495 * Now that we've activated ourselves, we should indicate to the world
1450 1496 * that we're up. Note that we may not be able to perform lookups at
1451 1497 * this time, but our notion of being 'up' isn't dependent on that
1452 1498 * ability.
1453 1499 */
1454 1500 mac_link_update(odd->odd_mh, LINK_STATE_UP);
1455 1501 mutex_exit(&odd->odd_lock);
1456 1502
1457 1503 mac_perim_exit(mph);
1458 1504 overlay_hold_rele(odd);
1459 1505 kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
1460 1506 kmem_free(oip, sizeof (overlay_ioc_prop_t));
1461 1507
1462 1508 return (0);
1463 1509 }
1464 1510
1465 1511 /* ARGSUSED */
1466 1512 static int
1467 1513 overlay_i_delete(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
1468 1514 {
1469 1515 overlay_ioc_delete_t *oidp = karg;
1470 1516 overlay_dev_t *odd;
1471 1517 datalink_id_t tid;
1472 1518 int ret;
1473 1519
1474 1520 odd = overlay_hold_by_dlid(oidp->oid_linkid);
1475 1521 if (odd == NULL) {
1476 1522 return (ENOENT);
1477 1523 }
1478 1524
1479 1525 mutex_enter(&odd->odd_lock);
1480 1526 /* If we're not the only hold, we're busy */
1481 1527 if (odd->odd_ref != 1) {
1482 1528 mutex_exit(&odd->odd_lock);
1483 1529 overlay_hold_rele(odd);
1484 1530 return (EBUSY);
1485 1531 }
1486 1532
1487 1533 if (odd->odd_flags & OVERLAY_F_IN_MUX) {
1488 1534 mutex_exit(&odd->odd_lock);
1489 1535 overlay_hold_rele(odd);
1490 1536 return (EBUSY);
1491 1537 }
1492 1538
1493 1539 /*
1494 1540 * To remove this, we need to first remove it from dls and then remove
1495 1541 * it from mac. The act of removing it from mac will check if there are
1496 1542 * devices on top of this, eg. vnics. If there are, then that will fail
1497 1543 * and we'll have to go through and recreate the dls entry. Only after
1498 1544 * mac_unregister has succeeded, then we'll go through and actually free
1499 1545 * everything and drop the dev lock.
1500 1546 */
1501 1547 ret = dls_devnet_destroy(odd->odd_mh, &tid, B_TRUE);
1502 1548 if (ret != 0) {
1503 1549 overlay_hold_rele(odd);
1504 1550 return (ret);
1505 1551 }
1506 1552
1507 1553 ASSERT(oidp->oid_linkid == tid);
1508 1554 ret = mac_disable(odd->odd_mh);
1509 1555 if (ret != 0) {
1510 1556 (void) dls_devnet_create(odd->odd_mh, odd->odd_linkid,
1511 1557 crgetzoneid(cred));
1512 1558 overlay_hold_rele(odd);
1513 1559 return (ret);
1514 1560 }
1515 1561
1516 1562 overlay_target_quiesce(odd->odd_target);
1517 1563
1518 1564 mutex_enter(&overlay_dev_lock);
1519 1565 list_remove(&overlay_dev_list, odd);
1520 1566 mutex_exit(&overlay_dev_lock);
1521 1567
1522 1568 cv_destroy(&odd->odd_iowait);
1523 1569 mutex_destroy(&odd->odd_lock);
1524 1570 overlay_target_free(odd);
1525 1571 odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
1526 1572 overlay_plugin_rele(odd->odd_plugin);
1527 1573 kmem_free(odd, sizeof (overlay_dev_t));
1528 1574
1529 1575 return (0);
1530 1576 }
1531 1577
1532 1578 /* ARGSUSED */
1533 1579 static int
1534 1580 overlay_i_nprops(void *karg, intptr_t arg, int mode, cred_t *cred,
1535 1581 int *rvalp)
1536 1582 {
1537 1583 overlay_dev_t *odd;
1538 1584 overlay_ioc_nprops_t *on = karg;
1539 1585
1540 1586 odd = overlay_hold_by_dlid(on->oipn_linkid);
1541 1587 if (odd == NULL)
1542 1588 return (ENOENT);
1543 1589 on->oipn_nprops = odd->odd_plugin->ovp_nprops + OVERLAY_DEV_NPROPS;
1544 1590 overlay_hold_rele(odd);
1545 1591
1546 1592 return (0);
1547 1593 }
1548 1594
1549 1595 static int
1550 1596 overlay_propinfo_plugin_cb(overlay_plugin_t *opp, void *arg)
1551 1597 {
1552 1598 overlay_prop_handle_t phdl = arg;
1553 1599 overlay_prop_set_range_str(phdl, opp->ovp_name);
1554 1600 return (0);
1555 1601 }
1556 1602
1557 1603 static int
1558 1604 overlay_i_name_to_propid(overlay_dev_t *odd, const char *name, uint_t *id)
1559 1605 {
1560 1606 int i;
1561 1607
1562 1608 for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
1563 1609 if (strcmp(overlay_dev_props[i], name) == 0) {
1564 1610 *id = i;
1565 1611 return (0);
1566 1612 }
1567 1613 }
1568 1614
1569 1615 for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) {
1570 1616 if (strcmp(odd->odd_plugin->ovp_props[i], name) == 0) {
1571 1617 *id = i + OVERLAY_DEV_NPROPS;
1572 1618 return (0);
1573 1619 }
1574 1620 }
1575 1621
1576 1622 return (ENOENT);
1577 1623 }
1578 1624
1579 1625 static void
1580 1626 overlay_i_propinfo_mtu(overlay_dev_t *odd, overlay_prop_handle_t phdl)
1581 1627 {
1582 1628 uint32_t def;
1583 1629 mac_propval_range_t range;
1584 1630 uint_t perm;
1585 1631
1586 1632 ASSERT(MAC_PERIM_HELD(odd->odd_mh));
1587 1633
1588 1634 bzero(&range, sizeof (mac_propval_range_t));
1589 1635 range.mpr_count = 1;
1590 1636 if (mac_prop_info(odd->odd_mh, MAC_PROP_MTU, "mtu", &def,
1591 1637 sizeof (def), &range, &perm) != 0)
1592 1638 return;
1593 1639
1594 1640 if (perm == MAC_PROP_PERM_READ)
1595 1641 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
1596 1642 else if (perm == MAC_PROP_PERM_WRITE)
1597 1643 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_WRITE);
1598 1644 else if (perm == MAC_PROP_PERM_RW)
1599 1645 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
1600 1646
1601 1647 overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
1602 1648 overlay_prop_set_default(phdl, &def, sizeof (def));
1603 1649 overlay_prop_set_range_uint32(phdl, range.mpr_range_uint32[0].mpur_min,
1604 1650 range.mpr_range_uint32[0].mpur_max);
1605 1651 }
1606 1652
1607 1653 /* ARGSUSED */
1608 1654 static int
1609 1655 overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred,
1610 1656 int *rvalp)
1611 1657 {
1612 1658 overlay_dev_t *odd;
1613 1659 int ret;
1614 1660 mac_perim_handle_t mph;
1615 1661 uint_t propid = UINT_MAX;
1616 1662 overlay_ioc_propinfo_t *oip = karg;
1617 1663 overlay_prop_handle_t phdl = (overlay_prop_handle_t)oip;
1618 1664
1619 1665 odd = overlay_hold_by_dlid(oip->oipi_linkid);
1620 1666 if (odd == NULL)
1621 1667 return (ENOENT);
1622 1668
1623 1669 overlay_prop_init(phdl);
1624 1670 mac_perim_enter_by_mh(odd->odd_mh, &mph);
1625 1671
1626 1672 /*
1627 1673 * If the id is -1, then the property that we're looking for is named in
1628 1674 * oipi_name and we should fill in its id. Otherwise, we've been given
1629 1675 * an id and we need to turn that into a name for our plugin's sake. The
1630 1676 * id is our own fabrication for property discovery.
1631 1677 */
1632 1678 if (oip->oipi_id == -1) {
1633 1679 /*
1634 1680 * Determine if it's a known generic property or it belongs to a
1635 1681 * module by checking against the list of known names.
1636 1682 */
1637 1683 oip->oipi_name[OVERLAY_PROP_NAMELEN-1] = '\0';
1638 1684 if ((ret = overlay_i_name_to_propid(odd, oip->oipi_name,
1639 1685 &propid)) != 0) {
1640 1686 overlay_hold_rele(odd);
1641 1687 mac_perim_exit(mph);
1642 1688 return (ret);
1643 1689 }
1644 1690 oip->oipi_id = propid;
1645 1691 if (propid >= OVERLAY_DEV_NPROPS) {
1646 1692 ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(
1647 1693 oip->oipi_name, phdl);
1648 1694 overlay_hold_rele(odd);
1649 1695 mac_perim_exit(mph);
1650 1696 return (ret);
1651 1697
1652 1698 }
1653 1699 } else if (oip->oipi_id >= OVERLAY_DEV_NPROPS) {
1654 1700 uint_t id = oip->oipi_id - OVERLAY_DEV_NPROPS;
1655 1701
1656 1702 if (id >= odd->odd_plugin->ovp_nprops) {
1657 1703 overlay_hold_rele(odd);
1658 1704 mac_perim_exit(mph);
1659 1705 return (EINVAL);
1660 1706 }
1661 1707 ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(
1662 1708 odd->odd_plugin->ovp_props[id], phdl);
1663 1709 overlay_hold_rele(odd);
1664 1710 mac_perim_exit(mph);
1665 1711 return (ret);
1666 1712 } else if (oip->oipi_id < -1) {
1667 1713 overlay_hold_rele(odd);
1668 1714 mac_perim_exit(mph);
1669 1715 return (EINVAL);
1670 1716 } else {
1671 1717 ASSERT(oip->oipi_id < OVERLAY_DEV_NPROPS);
1672 1718 ASSERT(oip->oipi_id >= 0);
1673 1719 propid = oip->oipi_id;
1674 1720 (void) strlcpy(oip->oipi_name, overlay_dev_props[propid],
1675 1721 sizeof (oip->oipi_name));
1676 1722 }
1677 1723
1678 1724 switch (propid) {
1679 1725 case OVERLAY_DEV_P_MTU:
1680 1726 overlay_i_propinfo_mtu(odd, phdl);
1681 1727 break;
1682 1728 case OVERLAY_DEV_P_VNETID:
1683 1729 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW);
1684 1730 overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
1685 1731 overlay_prop_set_nodefault(phdl);
1686 1732 break;
1687 1733 case OVERLAY_DEV_P_ENCAP:
1688 1734 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
1689 1735 overlay_prop_set_type(phdl, OVERLAY_PROP_T_STRING);
1690 1736 overlay_prop_set_nodefault(phdl);
1691 1737 overlay_plugin_walk(overlay_propinfo_plugin_cb, phdl);
1692 1738 break;
1693 1739 case OVERLAY_DEV_P_VARPDID:
1694 1740 overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ);
1695 1741 overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT);
1696 1742 overlay_prop_set_nodefault(phdl);
1697 1743 break;
1698 1744 default:
1699 1745 overlay_hold_rele(odd);
1700 1746 mac_perim_exit(mph);
1701 1747 return (ENOENT);
1702 1748 }
1703 1749
1704 1750 overlay_hold_rele(odd);
1705 1751 mac_perim_exit(mph);
1706 1752 return (0);
1707 1753 }
1708 1754
1709 1755 /* ARGSUSED */
1710 1756 static int
1711 1757 overlay_i_getprop(void *karg, intptr_t arg, int mode, cred_t *cred,
1712 1758 int *rvalp)
1713 1759 {
1714 1760 int ret;
1715 1761 overlay_dev_t *odd;
1716 1762 mac_perim_handle_t mph;
1717 1763 overlay_ioc_prop_t *oip = karg;
1718 1764 uint_t propid, mtu;
1719 1765
1720 1766 odd = overlay_hold_by_dlid(oip->oip_linkid);
1721 1767 if (odd == NULL)
1722 1768 return (ENOENT);
1723 1769
1724 1770 mac_perim_enter_by_mh(odd->odd_mh, &mph);
1725 1771 oip->oip_size = OVERLAY_PROP_SIZEMAX;
1726 1772 oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0';
1727 1773 if (oip->oip_id == -1) {
1728 1774 int i;
1729 1775
1730 1776 for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
1731 1777 if (strcmp(overlay_dev_props[i], oip->oip_name) == 0)
1732 1778 break;
1733 1779 if (i == OVERLAY_DEV_NPROPS) {
1734 1780 ret = odd->odd_plugin->ovp_ops->ovpo_getprop(
1735 1781 odd->odd_pvoid, oip->oip_name,
1736 1782 oip->oip_value, &oip->oip_size);
1737 1783 overlay_hold_rele(odd);
1738 1784 mac_perim_exit(mph);
1739 1785 return (ret);
1740 1786 }
1741 1787 }
1742 1788
1743 1789 propid = i;
1744 1790 } else if (oip->oip_id >= OVERLAY_DEV_NPROPS) {
1745 1791 uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS;
1746 1792
1747 1793 if (id > odd->odd_plugin->ovp_nprops) {
1748 1794 overlay_hold_rele(odd);
1749 1795 mac_perim_exit(mph);
1750 1796 return (EINVAL);
1751 1797 }
1752 1798 ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid,
1753 1799 odd->odd_plugin->ovp_props[id], oip->oip_value,
1754 1800 &oip->oip_size);
1755 1801 overlay_hold_rele(odd);
1756 1802 mac_perim_exit(mph);
1757 1803 return (ret);
1758 1804 } else if (oip->oip_id < -1) {
1759 1805 overlay_hold_rele(odd);
1760 1806 mac_perim_exit(mph);
1761 1807 return (EINVAL);
1762 1808 } else {
1763 1809 ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS);
1764 1810 ASSERT(oip->oip_id >= 0);
1765 1811 propid = oip->oip_id;
1766 1812 }
1767 1813
1768 1814 ret = 0;
1769 1815 switch (propid) {
1770 1816 case OVERLAY_DEV_P_MTU:
1771 1817 /*
1772 1818 * The MTU is always set and retrieved through MAC, to allow for
1773 1819 * MAC to do whatever it wants, as really that property belongs
1774 1820 * to MAC. This is important for things where vnics have hold on
1775 1821 * the MTU.
1776 1822 */
1777 1823 mac_sdu_get(odd->odd_mh, NULL, &mtu);
1778 1824 bcopy(&mtu, oip->oip_value, sizeof (uint_t));
1779 1825 oip->oip_size = sizeof (uint_t);
1780 1826 break;
1781 1827 case OVERLAY_DEV_P_VNETID:
1782 1828 /*
1783 1829 * While it's read-only while inside of a mux, we're not in a
1784 1830 * context that can guarantee that. Therefore we always grab the
1785 1831 * overlay_dev_t's odd_lock.
1786 1832 */
1787 1833 mutex_enter(&odd->odd_lock);
1788 1834 bcopy(&odd->odd_vid, oip->oip_value, sizeof (uint64_t));
1789 1835 mutex_exit(&odd->odd_lock);
1790 1836 oip->oip_size = sizeof (uint64_t);
1791 1837 break;
1792 1838 case OVERLAY_DEV_P_ENCAP:
1793 1839 oip->oip_size = strlcpy((char *)oip->oip_value,
1794 1840 odd->odd_plugin->ovp_name, oip->oip_size);
1795 1841 break;
1796 1842 case OVERLAY_DEV_P_VARPDID:
1797 1843 mutex_enter(&odd->odd_lock);
1798 1844 if (odd->odd_flags & OVERLAY_F_VARPD) {
1799 1845 const uint64_t val = odd->odd_target->ott_id;
1800 1846 bcopy(&val, oip->oip_value, sizeof (uint64_t));
1801 1847 oip->oip_size = sizeof (uint64_t);
1802 1848 } else {
1803 1849 oip->oip_size = 0;
1804 1850 }
1805 1851 mutex_exit(&odd->odd_lock);
1806 1852 break;
1807 1853 default:
1808 1854 ret = ENOENT;
1809 1855 }
1810 1856
1811 1857 overlay_hold_rele(odd);
1812 1858 mac_perim_exit(mph);
1813 1859 return (ret);
1814 1860 }
1815 1861
1816 1862 static void
1817 1863 overlay_setprop_vnetid(overlay_dev_t *odd, uint64_t vnetid)
1818 1864 {
1819 1865 mutex_enter(&odd->odd_lock);
1820 1866
1821 1867 /* Simple case, not active */
1822 1868 if (!(odd->odd_flags & OVERLAY_F_IN_MUX)) {
1823 1869 odd->odd_vid = vnetid;
1824 1870 mutex_exit(&odd->odd_lock);
1825 1871 return;
1826 1872 }
1827 1873
1828 1874 /*
1829 1875 * In the hard case, we need to set the drop flag, quiesce I/O and then
1830 1876 * we can go ahead and do everything.
1831 1877 */
1832 1878 odd->odd_flags |= OVERLAY_F_MDDROP;
1833 1879 overlay_io_wait(odd, OVERLAY_F_IOMASK);
1834 1880 mutex_exit(&odd->odd_lock);
1835 1881
1836 1882 overlay_mux_remove_dev(odd->odd_mux, odd);
1837 1883 mutex_enter(&odd->odd_lock);
1838 1884 odd->odd_vid = vnetid;
1839 1885 mutex_exit(&odd->odd_lock);
1840 1886 overlay_mux_add_dev(odd->odd_mux, odd);
1841 1887
1842 1888 mutex_enter(&odd->odd_lock);
1843 1889 ASSERT(odd->odd_flags & OVERLAY_F_IN_MUX);
1844 1890 odd->odd_flags &= ~OVERLAY_F_IN_MUX;
1845 1891 mutex_exit(&odd->odd_lock);
1846 1892 }
1847 1893
1848 1894 /* ARGSUSED */
1849 1895 static int
1850 1896 overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred,
1851 1897 int *rvalp)
1852 1898 {
1853 1899 int ret;
1854 1900 overlay_dev_t *odd;
1855 1901 overlay_ioc_prop_t *oip = karg;
1856 1902 uint_t propid = UINT_MAX;
1857 1903 mac_perim_handle_t mph;
1858 1904 uint64_t maxid, *vidp;
1859 1905
1860 1906 if (oip->oip_size > OVERLAY_PROP_SIZEMAX)
1861 1907 return (EINVAL);
1862 1908
1863 1909 odd = overlay_hold_by_dlid(oip->oip_linkid);
1864 1910 if (odd == NULL)
1865 1911 return (ENOENT);
1866 1912
1867 1913 oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0';
1868 1914 mac_perim_enter_by_mh(odd->odd_mh, &mph);
1869 1915 mutex_enter(&odd->odd_lock);
1870 1916 if (odd->odd_flags & OVERLAY_F_ACTIVATED) {
1871 1917 mac_perim_exit(mph);
1872 1918 mutex_exit(&odd->odd_lock);
1873 1919 return (ENOTSUP);
1874 1920 }
1875 1921 mutex_exit(&odd->odd_lock);
1876 1922 if (oip->oip_id == -1) {
1877 1923 int i;
1878 1924
1879 1925 for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
1880 1926 if (strcmp(overlay_dev_props[i], oip->oip_name) == 0)
1881 1927 break;
1882 1928 if (i == OVERLAY_DEV_NPROPS) {
1883 1929 ret = odd->odd_plugin->ovp_ops->ovpo_setprop(
1884 1930 odd->odd_pvoid, oip->oip_name,
1885 1931 oip->oip_value, oip->oip_size);
1886 1932 overlay_hold_rele(odd);
1887 1933 mac_perim_exit(mph);
1888 1934 return (ret);
1889 1935 }
1890 1936 }
1891 1937
1892 1938 propid = i;
1893 1939 } else if (oip->oip_id >= OVERLAY_DEV_NPROPS) {
1894 1940 uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS;
1895 1941
1896 1942 if (id > odd->odd_plugin->ovp_nprops) {
1897 1943 mac_perim_exit(mph);
1898 1944 overlay_hold_rele(odd);
1899 1945 return (EINVAL);
1900 1946 }
1901 1947 ret = odd->odd_plugin->ovp_ops->ovpo_setprop(odd->odd_pvoid,
1902 1948 odd->odd_plugin->ovp_props[id], oip->oip_value,
1903 1949 oip->oip_size);
1904 1950 mac_perim_exit(mph);
1905 1951 overlay_hold_rele(odd);
1906 1952 return (ret);
1907 1953 } else if (oip->oip_id < -1) {
1908 1954 mac_perim_exit(mph);
1909 1955 overlay_hold_rele(odd);
1910 1956 return (EINVAL);
1911 1957 } else {
1912 1958 ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS);
1913 1959 ASSERT(oip->oip_id >= 0);
1914 1960 propid = oip->oip_id;
1915 1961 }
1916 1962
1917 1963 ret = 0;
1918 1964 switch (propid) {
1919 1965 case OVERLAY_DEV_P_MTU:
1920 1966 ret = mac_set_prop(odd->odd_mh, MAC_PROP_MTU, "mtu",
1921 1967 oip->oip_value, oip->oip_size);
1922 1968 break;
1923 1969 case OVERLAY_DEV_P_VNETID:
1924 1970 if (oip->oip_size != sizeof (uint64_t)) {
1925 1971 ret = EINVAL;
1926 1972 break;
1927 1973 }
1928 1974 vidp = (uint64_t *)oip->oip_value;
1929 1975 ASSERT(odd->odd_plugin->ovp_id_size <= 8);
1930 1976 maxid = UINT64_MAX;
1931 1977 if (odd->odd_plugin->ovp_id_size != 8)
1932 1978 maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) -
1933 1979 1ULL;
1934 1980 if (*vidp >= maxid) {
1935 1981 ret = EINVAL;
1936 1982 break;
1937 1983 }
1938 1984 overlay_setprop_vnetid(odd, *vidp);
1939 1985 break;
1940 1986 case OVERLAY_DEV_P_ENCAP:
1941 1987 case OVERLAY_DEV_P_VARPDID:
1942 1988 ret = EPERM;
1943 1989 break;
1944 1990 default:
1945 1991 ret = ENOENT;
1946 1992 }
1947 1993
1948 1994 mac_perim_exit(mph);
1949 1995 overlay_hold_rele(odd);
1950 1996 return (ret);
1951 1997 }
1952 1998
1953 1999 /* ARGSUSED */
1954 2000 static int
1955 2001 overlay_i_status(void *karg, intptr_t arg, int mode, cred_t *cred,
1956 2002 int *rvalp)
1957 2003 {
1958 2004 overlay_dev_t *odd;
1959 2005 overlay_ioc_status_t *os = karg;
1960 2006
1961 2007 odd = overlay_hold_by_dlid(os->ois_linkid);
1962 2008 if (odd == NULL)
1963 2009 return (ENOENT);
1964 2010
1965 2011 mutex_enter(&odd->odd_lock);
1966 2012 if ((odd->odd_flags & OVERLAY_F_DEGRADED) != 0) {
1967 2013 os->ois_status = OVERLAY_I_DEGRADED;
1968 2014 if (odd->odd_fmamsg != NULL) {
1969 2015 (void) strlcpy(os->ois_message, odd->odd_fmamsg,
1970 2016 OVERLAY_STATUS_BUFLEN);
1971 2017 } else {
1972 2018 os->ois_message[0] = '\0';
1973 2019 }
1974 2020
1975 2021 } else {
1976 2022 os->ois_status = OVERLAY_I_OK;
1977 2023 os->ois_message[0] = '\0';
1978 2024 }
1979 2025 mutex_exit(&odd->odd_lock);
1980 2026 overlay_hold_rele(odd);
1981 2027
1982 2028 return (0);
1983 2029 }
1984 2030
1985 2031 static dld_ioc_info_t overlay_ioc_list[] = {
1986 2032 { OVERLAY_IOC_CREATE, DLDCOPYIN, sizeof (overlay_ioc_create_t),
1987 2033 overlay_i_create, secpolicy_dl_config },
1988 2034 { OVERLAY_IOC_ACTIVATE, DLDCOPYIN, sizeof (overlay_ioc_activate_t),
1989 2035 overlay_i_activate, secpolicy_dl_config },
1990 2036 { OVERLAY_IOC_DELETE, DLDCOPYIN, sizeof (overlay_ioc_delete_t),
1991 2037 overlay_i_delete, secpolicy_dl_config },
1992 2038 { OVERLAY_IOC_PROPINFO, DLDCOPYIN | DLDCOPYOUT,
1993 2039 sizeof (overlay_ioc_propinfo_t), overlay_i_propinfo,
1994 2040 secpolicy_dl_config },
1995 2041 { OVERLAY_IOC_GETPROP, DLDCOPYIN | DLDCOPYOUT,
1996 2042 sizeof (overlay_ioc_prop_t), overlay_i_getprop,
1997 2043 secpolicy_dl_config },
1998 2044 { OVERLAY_IOC_SETPROP, DLDCOPYIN,
1999 2045 sizeof (overlay_ioc_prop_t), overlay_i_setprop,
2000 2046 secpolicy_dl_config },
2001 2047 { OVERLAY_IOC_NPROPS, DLDCOPYIN | DLDCOPYOUT,
2002 2048 sizeof (overlay_ioc_nprops_t), overlay_i_nprops,
2003 2049 secpolicy_dl_config },
2004 2050 { OVERLAY_IOC_STATUS, DLDCOPYIN | DLDCOPYOUT,
2005 2051 sizeof (overlay_ioc_status_t), overlay_i_status,
2006 2052 NULL }
2007 2053 };
2008 2054
2009 2055 static int
2010 2056 overlay_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2011 2057 {
2012 2058 int fmcap = DDI_FM_EREPORT_CAPABLE;
2013 2059 if (cmd != DDI_ATTACH)
2014 2060 return (DDI_FAILURE);
2015 2061
2016 2062 if (overlay_dip != NULL || ddi_get_instance(dip) != 0)
2017 2063 return (DDI_FAILURE);
2018 2064
2019 2065 ddi_fm_init(dip, &fmcap, NULL);
2020 2066
2021 2067 if (ddi_create_minor_node(dip, OVERLAY_CTL, S_IFCHR,
2022 2068 ddi_get_instance(dip), DDI_PSEUDO, 0) == DDI_FAILURE)
2023 2069 return (DDI_FAILURE);
2024 2070
2025 2071 if (dld_ioc_register(OVERLAY_IOC, overlay_ioc_list,
2026 2072 DLDIOCCNT(overlay_ioc_list)) != 0) {
2027 2073 ddi_remove_minor_node(dip, OVERLAY_CTL);
2028 2074 return (DDI_FAILURE);
2029 2075 }
2030 2076
2031 2077 overlay_dip = dip;
2032 2078 return (DDI_SUCCESS);
2033 2079 }
2034 2080
2035 2081 /* ARGSUSED */
2036 2082 static int
2037 2083 overlay_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp)
2038 2084 {
2039 2085 int error;
2040 2086
2041 2087 switch (cmd) {
2042 2088 case DDI_INFO_DEVT2DEVINFO:
2043 2089 *resp = (void *)overlay_dip;
2044 2090 error = DDI_SUCCESS;
2045 2091 break;
2046 2092 case DDI_INFO_DEVT2INSTANCE:
2047 2093 *resp = (void *)0;
2048 2094 error = DDI_SUCCESS;
2049 2095 break;
2050 2096 default:
2051 2097 error = DDI_FAILURE;
2052 2098 break;
2053 2099 }
2054 2100
2055 2101 return (error);
2056 2102 }
2057 2103
2058 2104 static int
2059 2105 overlay_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2060 2106 {
2061 2107 if (cmd != DDI_DETACH)
2062 2108 return (DDI_FAILURE);
2063 2109
2064 2110 mutex_enter(&overlay_dev_lock);
2065 2111 if (!list_is_empty(&overlay_dev_list) || overlay_target_busy()) {
2066 2112 mutex_exit(&overlay_dev_lock);
2067 2113 return (EBUSY);
2068 2114 }
2069 2115 mutex_exit(&overlay_dev_lock);
2070 2116
2071 2117
2072 2118 dld_ioc_unregister(OVERLAY_IOC);
2073 2119 ddi_remove_minor_node(dip, OVERLAY_CTL);
2074 2120 ddi_fm_fini(dip);
2075 2121 overlay_dip = NULL;
2076 2122 return (DDI_SUCCESS);
2077 2123 }
2078 2124
2079 2125 static struct cb_ops overlay_cbops = {
2080 2126 overlay_target_open, /* cb_open */
2081 2127 overlay_target_close, /* cb_close */
2082 2128 nodev, /* cb_strategy */
2083 2129 nodev, /* cb_print */
2084 2130 nodev, /* cb_dump */
2085 2131 nodev, /* cb_read */
2086 2132 nodev, /* cb_write */
2087 2133 overlay_target_ioctl, /* cb_ioctl */
2088 2134 nodev, /* cb_devmap */
2089 2135 nodev, /* cb_mmap */
2090 2136 nodev, /* cb_segmap */
2091 2137 nochpoll, /* cb_chpoll */
2092 2138 ddi_prop_op, /* cb_prop_op */
2093 2139 NULL, /* cb_stream */
2094 2140 D_MP, /* cb_flag */
2095 2141 CB_REV, /* cb_rev */
2096 2142 nodev, /* cb_aread */
2097 2143 nodev, /* cb_awrite */
2098 2144 };
2099 2145
2100 2146 static struct dev_ops overlay_dev_ops = {
2101 2147 DEVO_REV, /* devo_rev */
2102 2148 0, /* devo_refcnt */
2103 2149 overlay_getinfo, /* devo_getinfo */
2104 2150 nulldev, /* devo_identify */
2105 2151 nulldev, /* devo_probe */
2106 2152 overlay_attach, /* devo_attach */
2107 2153 overlay_detach, /* devo_detach */
2108 2154 nulldev, /* devo_reset */
2109 2155 &overlay_cbops, /* devo_cb_ops */
2110 2156 NULL, /* devo_bus_ops */
2111 2157 NULL, /* devo_power */
2112 2158 ddi_quiesce_not_supported /* devo_quiesce */
2113 2159 };
2114 2160
2115 2161 static struct modldrv overlay_modldrv = {
2116 2162 &mod_driverops,
2117 2163 "Overlay Network Driver",
2118 2164 &overlay_dev_ops
2119 2165 };
2120 2166
2121 2167 static struct modlinkage overlay_linkage = {
2122 2168 MODREV_1,
2123 2169 &overlay_modldrv
2124 2170 };
2125 2171
2126 2172 static int
2127 2173 overlay_init(void)
2128 2174 {
2129 2175 mutex_init(&overlay_dev_lock, NULL, MUTEX_DRIVER, NULL);
2130 2176 list_create(&overlay_dev_list, sizeof (overlay_dev_t),
2131 2177 offsetof(overlay_dev_t, odd_link));
2132 2178 overlay_mux_init();
2133 2179 overlay_plugin_init();
2134 2180 overlay_target_init();
2135 2181
2136 2182 return (DDI_SUCCESS);
2137 2183 }
2138 2184
2139 2185 static void
2140 2186 overlay_fini(void)
2141 2187 {
2142 2188 overlay_target_fini();
2143 2189 overlay_plugin_fini();
2144 2190 overlay_mux_fini();
2145 2191 mutex_destroy(&overlay_dev_lock);
2146 2192 list_destroy(&overlay_dev_list);
2147 2193 }
2148 2194
2149 2195 int
2150 2196 _init(void)
2151 2197 {
2152 2198 int err;
2153 2199
2154 2200 if ((err = overlay_init()) != DDI_SUCCESS)
2155 2201 return (err);
2156 2202
2157 2203 mac_init_ops(NULL, "overlay");
2158 2204 err = mod_install(&overlay_linkage);
2159 2205 if (err != DDI_SUCCESS) {
2160 2206 overlay_fini();
2161 2207 return (err);
2162 2208 }
2163 2209
2164 2210 return (0);
2165 2211 }
2166 2212
2167 2213 int
2168 2214 _info(struct modinfo *modinfop)
2169 2215 {
2170 2216 return (mod_info(&overlay_linkage, modinfop));
2171 2217 }
2172 2218
2173 2219 int
2174 2220 _fini(void)
2175 2221 {
2176 2222 int err;
2177 2223
2178 2224 err = mod_remove(&overlay_linkage);
2179 2225 if (err != 0)
2180 2226 return (err);
2181 2227
2182 2228 overlay_fini();
2183 2229 return (0);
2184 2230 }
|
↓ open down ↓ |
1066 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX