1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2018 Joyent, Inc.
14 */
15
16 /*
17 * Overlay device target cache management
18 *
19 * For more information, see the big theory statement in
20 * uts/common/io/overlay/overlay.c
21 */
22
23 #include <sys/types.h>
24 #include <sys/ethernet.h>
25 #include <sys/kmem.h>
26 #include <sys/policy.h>
27 #include <sys/sysmacros.h>
28 #include <sys/stream.h>
29 #include <sys/strsun.h>
30 #include <sys/strsubr.h>
31 #include <sys/mac_provider.h>
32 #include <sys/mac_client.h>
33 #include <sys/mac_client_priv.h>
34 #include <sys/vlan.h>
35 #include <sys/crc32.h>
36 #include <sys/cred.h>
37 #include <sys/file.h>
38 #include <sys/errno.h>
39 #include <sys/ddi.h>
40 #include <sys/sunddi.h>
41
42 #include <sys/overlay_impl.h>
43 #include <sys/sdt.h>
44
45 /*
46 * This is total straw man, but at least it's a prime number. Here we're
47 * going to have to go through and do a lot of evaluation and understanding as
48 * to how these target caches should grow and shrink, as well as, memory
49 * pressure and evictions. This just gives us a starting point that'll be 'good
50 * enough', until it's not.
51 */
52 #define OVERLAY_HSIZE 823
53
54 /*
55 * We use this data structure to keep track of what requests have been actively
56 * allocated to a given instance so we know what to put back on the pending
57 * list.
58 */
typedef struct overlay_target_hdl {
	minor_t oth_minor;		/* RO */
	zoneid_t oth_zoneid;		/* RO */
	int oth_oflags;			/* RO */
	list_node_t oth_link;		/* overlay_target_lock */
	kmutex_t oth_lock;		/* protects oth_outstanding */
	list_t oth_outstanding;		/* oth_lock */
} overlay_target_hdl_t;
67
68 typedef int (*overlay_target_copyin_f)(const void *, void **, size_t *, int);
69 typedef int (*overlay_target_ioctl_f)(overlay_target_hdl_t *, void *);
70 typedef int (*overlay_target_copyout_f)(void *, void *, size_t, int);
71
/*
 * Describes a single target ioctl: its command id, the privilege it
 * requires, and the copyin/handler/copyout functions used to service it.
 *
 * NOTE(review): the struct tag "overaly_target_ioctl" is misspelled; the
 * typedef name below is the one consumers use, so the tag is left as-is.
 */
typedef struct overaly_target_ioctl {
	int oti_cmd;			/* ioctl id */
	boolean_t oti_write;		/* ioctl requires FWRITE */
	boolean_t oti_ncopyout;		/* copyout data? */
	overlay_target_copyin_f oti_copyin;	/* copyin func */
	overlay_target_ioctl_f oti_func;	/* function to call */
	overlay_target_copyout_f oti_copyout;	/* copyout func */
	size_t oti_size;		/* size of user level structure */
} overlay_target_ioctl_t;
81
82 static kmem_cache_t *overlay_target_cache;
83 kmem_cache_t *overlay_entry_cache;
84 static id_space_t *overlay_thdl_idspace;
85 static void *overlay_thdl_state;
86
87 /*
88 * When we support overlay devices in the NGZ, then all of these need to become
89 * zone aware, by plugging into the netstack engine and becoming per-netstack
90 * data.
91 */
92 static list_t overlay_thdl_list;
93 static kmutex_t overlay_target_lock;
94 static kcondvar_t overlay_target_condvar;
95 static list_t overlay_target_list;
96 static boolean_t overlay_target_excl;
97
98 /*
99 * Outstanding data per hash table entry.
100 */
101 int overlay_ent_size = 128 * 1024;
102
103 /* ARGSUSED */
104 static int
105 overlay_target_cache_constructor(void *buf, void *arg, int kmflgs)
106 {
107 overlay_target_t *ott = buf;
108
109 mutex_init(&ott->ott_lock, NULL, MUTEX_DRIVER, NULL);
110 cv_init(&ott->ott_cond, NULL, CV_DRIVER, NULL);
111 return (0);
112 }
113
114 /* ARGSUSED */
115 static void
116 overlay_target_cache_destructor(void *buf, void *arg)
117 {
118 overlay_target_t *ott = buf;
119
120 cv_destroy(&ott->ott_cond);
121 mutex_destroy(&ott->ott_lock);
122 }
123
124 /* ARGSUSED */
125 static int
126 overlay_entry_cache_constructor(void *buf, void *arg, int kmflgs)
127 {
128 overlay_target_entry_t *ote = buf;
129
130 bzero(ote, sizeof (overlay_target_entry_t));
131 mutex_init(&ote->ote_lock, NULL, MUTEX_DRIVER, NULL);
132 return (0);
133 }
134
135 /* ARGSUSED */
136 static void
137 overlay_entry_cache_destructor(void *buf, void *arg)
138 {
139 overlay_target_entry_t *ote = buf;
140
141 mutex_destroy(&ote->ote_lock);
142 }
143
144 static uint64_t
145 overlay_mac_hash(const void *v)
146 {
147 uint32_t crc;
148 CRC32(crc, v, ETHERADDRL, -1U, crc32_table);
149 return (crc);
150 }
151
152 static int
153 overlay_mac_cmp(const void *a, const void *b)
154 {
155 return (bcmp(a, b, ETHERADDRL));
156 }
157
158 static uint64_t
159 overlay_vl3_hash(const void *v)
160 {
161 const overlay_target_entry_t *ote = v;
162 uint32_t crc;
163
164 CRC32(crc, &ote->ote_ip, sizeof (ote->ote_ip), -1U, crc32_table);
165 CRC32(crc, &ote->ote_fab, sizeof (ote->ote_fab), crc, crc32_table);
166 return (crc);
167 }
168
169 static int
170 overlay_vl3_cmp(const void *a, const void *b)
171 {
172 const overlay_target_entry_t *l = a;
173 const overlay_target_entry_t *r = b;
174
175 if (l->ote_fab != r->ote_fab ||
176 bcmp(&l->ote_ip, &r->ote_ip, sizeof (struct in6_addr)) != 0)
177 return (1);
178 return (0);
179 }
180
181 static int
182 overlay_vl3_avl(const void *a, const void *b)
183 {
184 const overlay_target_entry_t *l = a;
185 const overlay_target_entry_t *r = b;
186
187 if (l->ote_fab < r->ote_fab)
188 return (-1);
189 if (l->ote_fab > r->ote_fab)
190 return (1);
191 return (memcmp(&l->ote_ip, &r->ote_ip, sizeof (struct in6_addr)));
192 }
193
/* ARGSUSED */
/*
 * Refhash element destructor that intentionally does nothing: entry
 * lifetime is managed by the entry reference count (see
 * overlay_target_entry_dtor below), not by the hash tables.
 */
void
overlay_target_entry_null_dtor(void *arg)
{
}
199
/* ARGSUSED */
/*
 * Final teardown for a target entry once its reference count has dropped to
 * zero: scrub all cached state, free any packets still queued on it, and
 * return it to the entry cache.
 */
void
overlay_target_entry_dtor(void *arg)
{
	overlay_target_entry_t *ote = arg;

	ASSERT3U(ote->ote_refcnt, ==, 0);

	ote->ote_flags = 0;
	bzero(ote->ote_addr, ETHERADDRL);
	bzero(&ote->ote_ip, sizeof (ote->ote_ip));
	ote->ote_ott = NULL;
	ote->ote_odd = NULL;
	/*
	 * NOTE(review): ote_fab is compared with <, > and != in the
	 * comparators above; if it is an integral id rather than a pointer,
	 * assigning NULL here deserves cleanup -- confirm its declared type.
	 */
	ote->ote_fab = NULL;
	/* Drop any packets that were still waiting on a lookup. */
	freemsgchain(ote->ote_chead);
	ote->ote_chead = ote->ote_ctail = NULL;
	ote->ote_mbsize = 0;
	ote->ote_vtime = 0;
	kmem_cache_free(overlay_entry_cache, ote);
}
220
221 static int
222 overlay_mac_avl(const void *a, const void *b)
223 {
224 int i;
225 const overlay_target_entry_t *l, *r;
226 l = a;
227 r = b;
228
229 for (i = 0; i < ETHERADDRL; i++) {
230 if (l->ote_addr[i] > r->ote_addr[i])
231 return (1);
232 else if (l->ote_addr[i] < r->ote_addr[i])
233 return (-1);
234 }
235
236 return (0);
237 }
238
/*
 * Set up the global state for the target subsystem: the soft state for
 * per-open handles, the kmem caches for targets and entries, the pending
 * request list with its lock and condvar, the list of open handles, and the
 * minor-number id space.  Called once at module load.
 */
void
overlay_target_init(void)
{
	int ret;
	ret = ddi_soft_state_init(&overlay_thdl_state,
	    sizeof (overlay_target_hdl_t), 1);
	VERIFY(ret == 0);
	overlay_target_cache = kmem_cache_create("overlay_target",
	    sizeof (overlay_target_t), 0, overlay_target_cache_constructor,
	    overlay_target_cache_destructor, NULL, NULL, NULL, 0);
	overlay_entry_cache = kmem_cache_create("overlay_entry",
	    sizeof (overlay_target_entry_t), 0, overlay_entry_cache_constructor,
	    overlay_entry_cache_destructor, NULL, NULL, NULL, 0);
	mutex_init(&overlay_target_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&overlay_target_condvar, NULL, CV_DRIVER, NULL);
	list_create(&overlay_target_list, sizeof (overlay_target_entry_t),
	    offsetof(overlay_target_entry_t, ote_qlink));
	list_create(&overlay_thdl_list, sizeof (overlay_target_hdl_t),
	    offsetof(overlay_target_hdl_t, oth_link));
	overlay_thdl_idspace = id_space_create("overlay_target_minors",
	    1, INT32_MAX);
}
261
/*
 * Tear down the global target state created by overlay_target_init(), in
 * reverse order of creation.  Callers must guarantee that no handles or
 * queued requests remain.
 */
void
overlay_target_fini(void)
{
	id_space_destroy(overlay_thdl_idspace);
	list_destroy(&overlay_thdl_list);
	list_destroy(&overlay_target_list);
	cv_destroy(&overlay_target_condvar);
	mutex_destroy(&overlay_target_lock);
	kmem_cache_destroy(overlay_entry_cache);
	kmem_cache_destroy(overlay_target_cache);
	ddi_soft_state_fini(&overlay_thdl_state);
}
274
/*
 * Release all target state hanging off of an overlay device.  For dynamic
 * targets this drains both the VL2 (MAC) and VL3 (IP) indexes; each live
 * entry holds a reference per index it sits in, so removal from each
 * structure is paired with a REFRELE and the final release runs the entry
 * dtor.  Callers must have already quiesced the target.
 */
void
overlay_target_free(overlay_dev_t *odd)
{
	if (odd->odd_target == NULL)
		return;

	if (odd->odd_target->ott_mode == OVERLAY_TARGET_DYNAMIC) {
		refhash_t *rp = odd->odd_target->ott_u.ott_dyn.ott_dhash;
		refhash_t *r3p = odd->odd_target->ott_u.ott_dyn.ott_l3dhash;
		avl_tree_t *ap = &odd->odd_target->ott_u.ott_dyn.ott_tree;
		avl_tree_t *a3p = &odd->odd_target->ott_u.ott_dyn.ott_l3tree;
		overlay_target_entry_t *ote;

		/*
		 * Our AVL tree and hashtable contain the same elements,
		 * therefore we should just remove it from the tree, but then
		 * delete the entries when we remove them from the hash table
		 * (which happens through the refhash dtor).
		 */
		while ((ote = avl_first(ap)) != NULL) {
			avl_remove(ap, ote);
			OVERLAY_TARG_ENTRY_REFRELE(ote);
		}
		avl_destroy(ap);

		while ((ote = avl_first(a3p)) != NULL) {
			avl_remove(a3p, ote);
			OVERLAY_TARG_ENTRY_REFRELE(ote);
		}
		avl_destroy(a3p);

		/*
		 * NOTE(review): removing elements while iterating with
		 * refhash_first/refhash_next relies on refhash deferring the
		 * unlink while a reference is held -- confirm against the
		 * refhash implementation.
		 */
		for (ote = refhash_first(rp); ote != NULL;
		    ote = refhash_next(rp, ote)) {
			refhash_remove(rp, ote);
			OVERLAY_TARG_ENTRY_REFRELE(ote);
		}
		refhash_destroy(rp);

		for (ote = refhash_first(r3p); ote != NULL;
		    ote = refhash_next(r3p, ote)) {
			refhash_remove(r3p, ote);
			OVERLAY_TARG_ENTRY_REFRELE(ote);
		}
		refhash_destroy(r3p);
	}

	ASSERT(odd->odd_target->ott_ocount == 0);
	bzero(&odd->odd_target->ott_u, sizeof (odd->odd_target->ott_u));
	kmem_cache_free(overlay_target_cache, odd->odd_target);
	odd->odd_target = NULL;
}
326
327 int
328 overlay_target_busy()
329 {
330 int ret;
331
332 mutex_enter(&overlay_target_lock);
333 ret = !list_is_empty(&overlay_thdl_list);
334 mutex_exit(&overlay_target_lock);
335
336 return (ret);
337 }
338
339 void
340 overlay_target_queue(overlay_target_entry_t *entry)
341 {
342 mutex_enter(&overlay_target_lock);
343 mutex_enter(&entry->ote_ott->ott_lock);
344 if (entry->ote_ott->ott_flags & OVERLAY_T_TEARDOWN) {
345 mutex_exit(&entry->ote_ott->ott_lock);
346 mutex_exit(&overlay_target_lock);
347 return;
348 }
349 entry->ote_ott->ott_ocount++;
350 mutex_exit(&entry->ote_ott->ott_lock);
351 list_insert_tail(&overlay_target_list, entry);
352 cv_signal(&overlay_target_condvar);
353 mutex_exit(&overlay_target_lock);
354 }
355
/*
 * Block new lookups from being queued against this target and wait for all
 * requests currently outstanding in varpd to be answered or dropped.  After
 * this returns, overlay_target_queue() refuses the target
 * (OVERLAY_T_TEARDOWN) and ott_ocount is zero.
 */
void
overlay_target_quiesce(overlay_target_t *ott)
{
	if (ott == NULL)
		return;
	mutex_enter(&ott->ott_lock);
	ott->ott_flags |= OVERLAY_T_TEARDOWN;
	while (ott->ott_ocount != 0)
		cv_wait(&ott->ott_cond, &ott->ott_lock);
	mutex_exit(&ott->ott_lock);
}
367
368 /*
369 * This functions assumes that the destination mode is OVERLAY_PLUGIN_D_IP |
370 * OVERLAY_PLUGIN_D_PORT. As we don't have an implementation of anything else at
371 * this time, say for NVGRE, we drop all packets that match this.
372 */
int
overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock,
    socklen_t *slenp, uint64_t *vidp)
{
	int ret;
	struct sockaddr_in6 *v6;
	overlay_target_t *ott;
	mac_header_info_t mhi;
	overlay_target_entry_t *entry;

	ASSERT(odd->odd_target != NULL);

	*vidp = odd->odd_vid;

	/*
	 * At this point, the overlay device is in a mux which means that it's
	 * been activated. At this point, parts of the target, such as the mode
	 * and the destination are now read-only and we don't have to worry
	 * about synchronization for them.
	 */
	ott = odd->odd_target;
	if (ott->ott_dest != (OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT))
		return (OVERLAY_TARGET_DROP);

	v6 = (struct sockaddr_in6 *)sock;
	bzero(v6, sizeof (struct sockaddr_in6));
	v6->sin6_family = AF_INET6;

	/* Point-to-point targets always use the single configured dest. */
	if (ott->ott_mode == OVERLAY_TARGET_POINT) {
		mutex_enter(&ott->ott_lock);
		bcopy(&ott->ott_u.ott_point.otp_ip, &v6->sin6_addr,
		    sizeof (struct in6_addr));
		v6->sin6_port = htons(ott->ott_u.ott_point.otp_port);
		mutex_exit(&ott->ott_lock);
		*slenp = sizeof (struct sockaddr_in6);

		return (OVERLAY_TARGET_OK);
	}

	ASSERT(ott->ott_mode == OVERLAY_TARGET_DYNAMIC);

	/*
	 * VL2 -> UL3 lookups only need the destination VL2 mac address,
	 * however, if we end up having to route the packet, we will need
	 * the source vlan as part of the destination selection.
	 */
	if (mac_vlan_header_info(odd->odd_mh, mp, &mhi) != 0)
		return (OVERLAY_TARGET_DROP);

	mutex_enter(&ott->ott_lock);
	entry = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
	    mhi.mhi_daddr);
	if (entry == NULL) {
		/*
		 * First packet for this MAC: create a pending entry with the
		 * packet queued on it and kick off a varpd lookup.  The
		 * entry takes one reference for the refhash and one for the
		 * AVL tree.
		 */
		entry = kmem_cache_alloc(overlay_entry_cache,
		    KM_NOSLEEP | KM_NORMALPRI);
		if (entry == NULL) {
			mutex_exit(&ott->ott_lock);
			return (OVERLAY_TARGET_DROP);
		}
		bcopy(mhi.mhi_daddr, entry->ote_addr, ETHERADDRL);
		entry->ote_chead = entry->ote_ctail = mp;
		entry->ote_mbsize = msgsize(mp);
		entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
		entry->ote_ott = ott;
		entry->ote_odd = odd;

		OVERLAY_TARG_ENTRY_REFHOLD(entry);
		refhash_insert(ott->ott_u.ott_dyn.ott_dhash, entry);

		OVERLAY_TARG_ENTRY_REFHOLD(entry);
		avl_add(&ott->ott_u.ott_dyn.ott_tree, entry);

		mutex_exit(&ott->ott_lock);
		overlay_target_queue(entry);
		return (OVERLAY_TARGET_ASYNC);
	}
	/* Hold the entry across the lock drop so it cannot be torn down. */
	OVERLAY_TARG_ENTRY_REFHOLD(entry);
	mutex_exit(&ott->ott_lock);

	mutex_enter(&entry->ote_lock);
	if (entry->ote_flags & OVERLAY_ENTRY_F_DROP) {
		ret = OVERLAY_TARGET_DROP;
	} else if (entry->ote_flags & OVERLAY_ENTRY_F_ROUTER) {
		/* Destination is an overlay router; do a route lookup. */
		ret = overlay_route_lookup(odd, mp, &mhi, sock, slenp, vidp);
	} else if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
		bcopy(&entry->ote_dest.otp_ip, &v6->sin6_addr,
		    sizeof (struct in6_addr));
		v6->sin6_port = htons(entry->ote_dest.otp_port);
		*slenp = sizeof (struct sockaddr_in6);
		ret = OVERLAY_TARGET_OK;
	} else {
		/*
		 * Lookup still in flight: queue the packet on the entry,
		 * bounded by overlay_ent_size, and (re)issue the lookup if
		 * one isn't already pending.
		 */
		size_t mlen = msgsize(mp);

		if (mlen + entry->ote_mbsize > overlay_ent_size) {
			ret = OVERLAY_TARGET_DROP;
		} else {
			if (entry->ote_ctail != NULL) {
				ASSERT(entry->ote_ctail->b_next ==
				    NULL);
				entry->ote_ctail->b_next = mp;
				entry->ote_ctail = mp;
			} else {
				entry->ote_chead = mp;
				entry->ote_ctail = mp;
			}
			entry->ote_mbsize += mlen;
			if ((entry->ote_flags &
			    OVERLAY_ENTRY_F_PENDING) == 0) {
				entry->ote_flags |=
				    OVERLAY_ENTRY_F_PENDING;
				overlay_target_queue(entry);
			}
			ret = OVERLAY_TARGET_ASYNC;
		}
	}
	mutex_exit(&entry->ote_lock);

	mutex_enter(&ott->ott_lock);
	OVERLAY_TARG_ENTRY_REFRELE(entry);
	mutex_exit(&ott->ott_lock);

	return (ret);
}
496
497 /* ARGSUSED */
498 static int
499 overlay_target_info(overlay_target_hdl_t *thdl, void *arg)
500 {
501 overlay_dev_t *odd;
502 overlay_targ_info_t *oti = arg;
503
504 odd = overlay_hold_by_dlid(oti->oti_linkid);
505 if (odd == NULL)
506 return (ENOENT);
507
508 mutex_enter(&odd->odd_lock);
509 oti->oti_flags = 0;
510 oti->oti_needs = odd->odd_plugin->ovp_dest;
511 if (odd->odd_flags & OVERLAY_F_DEGRADED)
512 oti->oti_flags |= OVERLAY_TARG_INFO_F_DEGRADED;
513 if (odd->odd_flags & OVERLAY_F_ACTIVATED)
514 oti->oti_flags |= OVERLAY_TARG_INFO_F_ACTIVE;
515 oti->oti_vnetid = odd->odd_vid;
516 oti->oti_dcid = odd->odd_dcid;
517 mutex_exit(&odd->odd_lock);
518 overlay_hold_rele(odd);
519 return (0);
520 }
521
522 /* ARGSUSED */
523 static int
524 overlay_target_associate(overlay_target_hdl_t *thdl, void *arg)
525 {
526 overlay_dev_t *odd;
527 overlay_target_t *ott;
528 overlay_targ_associate_t *ota = arg;
529 overlay_router_t *ort;
530
531 odd = overlay_hold_by_dlid(ota->ota_linkid);
532 if (odd == NULL)
533 return (ENOENT);
534
535 if (ota->ota_id == 0) {
536 overlay_hold_rele(odd);
537 return (EINVAL);
538 }
539
540 if (ota->ota_mode != OVERLAY_TARGET_POINT &&
541 ota->ota_mode != OVERLAY_TARGET_DYNAMIC) {
542 overlay_hold_rele(odd);
543 return (EINVAL);
544 }
545
546 if (ota->ota_provides != odd->odd_plugin->ovp_dest) {
547 overlay_hold_rele(odd);
548 return (EINVAL);
549 }
550
551 if (ota->ota_mode == OVERLAY_TARGET_POINT) {
552 if (ota->ota_provides & OVERLAY_PLUGIN_D_IP) {
553 if (IN6_IS_ADDR_UNSPECIFIED(&ota->ota_point.otp_ip) ||
554 IN6_IS_ADDR_V4COMPAT(&ota->ota_point.otp_ip) ||
555 IN6_IS_ADDR_V4MAPPED_ANY(&ota->ota_point.otp_ip)) {
556 overlay_hold_rele(odd);
557 return (EINVAL);
558 }
559 }
560
561 if (ota->ota_provides & OVERLAY_PLUGIN_D_PORT) {
562 if (ota->ota_point.otp_port == 0) {
563 overlay_hold_rele(odd);
564 return (EINVAL);
565 }
566 }
567 }
568
569 ott = kmem_cache_alloc(overlay_target_cache, KM_SLEEP);
570 ott->ott_flags = 0;
571 ott->ott_ocount = 0;
572 ott->ott_mode = ota->ota_mode;
573 ott->ott_dest = ota->ota_provides;
574 ott->ott_id = ota->ota_id;
575
576 if (ott->ott_mode == OVERLAY_TARGET_POINT) {
577 bcopy(&ota->ota_point, &ott->ott_u.ott_point,
578 sizeof (overlay_target_point_t));
579 } else {
580 ott->ott_u.ott_dyn.ott_dhash = refhash_create(OVERLAY_HSIZE,
581 overlay_mac_hash, overlay_mac_cmp,
582 overlay_target_entry_null_dtor,
583 sizeof (overlay_target_entry_t),
584 offsetof(overlay_target_entry_t, ote_reflink),
585 offsetof(overlay_target_entry_t, ote_addr), KM_SLEEP);
586 ott->ott_u.ott_dyn.ott_l3dhash = refhash_create(OVERLAY_HSIZE,
587 overlay_vl3_hash, overlay_vl3_cmp,
588 overlay_target_entry_null_dtor,
589 sizeof (overlay_target_entry_t),
590 offsetof(overlay_target_entry_t, ote_l3_reflink), 0,
591 KM_SLEEP);
592 avl_create(&ott->ott_u.ott_dyn.ott_tree, overlay_mac_avl,
593 sizeof (overlay_target_entry_t),
594 offsetof(overlay_target_entry_t, ote_avllink));
595 avl_create(&ott->ott_u.ott_dyn.ott_l3tree, overlay_vl3_avl,
596 sizeof (overlay_target_entry_t),
597 offsetof(overlay_target_entry_t, ote_l3_avllink));
598
599 ort = kmem_zalloc(sizeof (*ort), KM_SLEEP);
600 mutex_init(&ort->otr_lock, NULL, MUTEX_DRIVER, NULL);
601 list_create(&ort->otr_tables, sizeof (overlay_route_table_t),
602 offsetof(overlay_route_table_t, ort_link));
603 avl_create(&ort->otr_tree, overlay_fabric_avl,
604 sizeof (overlay_fabric_entry_t),
605 offsetof(overlay_fabric_entry_t, ofe_avllink));
606 }
607 mutex_enter(&odd->odd_lock);
608 if (odd->odd_flags & OVERLAY_F_VARPD) {
609 mutex_exit(&odd->odd_lock);
610 kmem_cache_free(overlay_target_cache, ott);
611 overlay_hold_rele(odd);
612 return (EEXIST);
613 }
614
615 odd->odd_flags |= OVERLAY_F_VARPD;
616 odd->odd_target = ott;
617 mutex_exit(&odd->odd_lock);
618
619 overlay_hold_rele(odd);
620
621 return (0);
622 }
623
624
625 /* ARGSUSED */
626 static int
627 overlay_target_degrade(overlay_target_hdl_t *thdl, void *arg)
628 {
629 overlay_dev_t *odd;
630 overlay_targ_degrade_t *otd = arg;
631
632 odd = overlay_hold_by_dlid(otd->otd_linkid);
633 if (odd == NULL)
634 return (ENOENT);
635
636 overlay_fm_degrade(odd, otd->otd_buf);
637 overlay_hold_rele(odd);
638 return (0);
639 }
640
641 /* ARGSUSED */
642 static int
643 overlay_target_restore(overlay_target_hdl_t *thdl, void *arg)
644 {
645 overlay_dev_t *odd;
646 overlay_targ_id_t *otid = arg;
647
648 odd = overlay_hold_by_dlid(otid->otid_linkid);
649 if (odd == NULL)
650 return (ENOENT);
651
652 overlay_fm_restore(odd);
653 overlay_hold_rele(odd);
654 return (0);
655 }
656
657 /* ARGSUSED */
658 static int
659 overlay_target_disassociate(overlay_target_hdl_t *thdl, void *arg)
660 {
661 overlay_dev_t *odd;
662 overlay_targ_id_t *otid = arg;
663
664 odd = overlay_hold_by_dlid(otid->otid_linkid);
665 if (odd == NULL)
666 return (ENOENT);
667
668 mutex_enter(&odd->odd_lock);
669 odd->odd_flags &= ~OVERLAY_F_VARPD;
670 mutex_exit(&odd->odd_lock);
671
672 overlay_hold_rele(odd);
673 return (0);
674
675 }
676
677 static int
678 overlay_target_lookup_request(overlay_target_hdl_t *thdl, void *arg)
679 {
680 overlay_targ_lookup_t *otl = arg;
681 overlay_target_entry_t *entry;
682 void *src, *dst;
683 clock_t ret, timeout;
684 mac_header_info_t mhi;
685 timeout = ddi_get_lbolt() + drv_usectohz(MICROSEC);
686 again:
687 mutex_enter(&overlay_target_lock);
688 while (list_is_empty(&overlay_target_list)) {
689 ret = cv_timedwait(&overlay_target_condvar,
690 &overlay_target_lock, timeout);
691 if (ret == -1) {
692 mutex_exit(&overlay_target_lock);
693 return (ETIME);
694 }
695 }
696 entry = list_remove_head(&overlay_target_list);
697 mutex_exit(&overlay_target_lock);
698 mutex_enter(&entry->ote_lock);
699 if (entry->ote_flags &
700 (OVERLAY_ENTRY_F_PENDING | OVERLAY_ENTRY_F_VL3_PENDING)) {
701 ASSERT(entry->ote_chead == NULL);
702 mutex_exit(&entry->ote_lock);
703 goto again;
704 }
705 ASSERT(entry->ote_chead != NULL);
706
707
708 otl->otl_l3req = (entry->ote_flags & OVERLAY_ENTRY_F_VL3_PENDING) ?
709 B_TRUE : B_FALSE;
710
711 if (otl->otl_l3req) {
712 src = &otl->otl_addru.otlu_l3.otl3_srcip;
713 dst = &otl->otl_addru.otlu_l3.otl3_dstip;
714 } else {
715 src = &otl->otl_addru.otlu_l2.otl2_srcaddr;
716 dst = &otl->otl_addru.otlu_l2.otl2_dstaddr;
717 }
718
719 /*
720 * If we have a bogon that doesn't have a valid mac header, or an
721 * invalid IP header for IP requests, drop it and try again.
722 */
723 if ((mac_vlan_header_info(entry->ote_odd->odd_mh, entry->ote_chead,
724 &mhi) != 0) ||
725 (otl->otl_l3req && overlay_mblk_vl3ip(entry->ote_chead, src,
726 dst) != 0)) {
727 boolean_t queue = B_FALSE;
728 mblk_t *mp = entry->ote_chead;
729 entry->ote_chead = mp->b_next;
730 mp->b_next = NULL;
731 if (entry->ote_ctail == mp)
732 entry->ote_ctail = entry->ote_chead;
733 entry->ote_mbsize -= msgsize(mp);
734 if (entry->ote_chead != NULL)
735 queue = B_TRUE;
736 mutex_exit(&entry->ote_lock);
737 if (queue == B_TRUE)
738 overlay_target_queue(entry);
739 freemsg(mp);
740 goto again;
741 }
742
743 otl->otl_dlid = entry->ote_odd->odd_linkid;
744 otl->otl_reqid = (uintptr_t)entry;
745 otl->otl_varpdid = entry->ote_ott->ott_id;
746 otl->otl_vnetid = entry->ote_odd->odd_vid;
747
748 otl->otl_hdrsize = mhi.mhi_hdrsize;
749 otl->otl_pktsize = msgsize(entry->ote_chead) - otl->otl_hdrsize;
750 otl->otl_addru.otlu_l2.otl2_dsttype = mhi.mhi_dsttype;
751 otl->otl_addru.otlu_l2.otl2_sap = mhi.mhi_bindsap;
752 otl->otl_vlan = VLAN_ID(mhi.mhi_tci);
753
754 /*
755 * The overlay_mblk_vl3ip() call above fills in dst & src for
756 * VL3->UL3 requests, so only need to care about VL2->UL3 here.
757 */
758 if (!otl->otl_l3req) {
759 bcopy(mhi.mhi_daddr, dst, ETHERADDRL);
760 bcopy(mhi.mhi_saddr, src, ETHERADDRL);
761 }
762 mutex_exit(&entry->ote_lock);
763
764 mutex_enter(&thdl->oth_lock);
765 list_insert_tail(&thdl->oth_outstanding, entry);
766 mutex_exit(&thdl->oth_lock);
767
768 return (0);
769 }
770
/*
 * OVERLAY_TARG_RESPOND handler: userland has answered an outstanding lookup.
 * Record the answer on the entry, mark it valid (or as a router when the
 * answer is all zeros), and transmit any packets that were queued while the
 * lookup was pending.  Finally drop the target's outstanding count.
 */
static int
overlay_target_lookup_respond(overlay_target_hdl_t *thdl, void *arg)
{
	const overlay_targ_resp_t *otr = arg;
	overlay_target_entry_t *entry;
	mblk_t *mp;
	boolean_t is_router = B_FALSE;

	/*
	 * If we ever support a protocol that uses MAC addresses for the UL
	 * destination addr, we probably should expand this to check that
	 * all of otr is zero.
	 */
	if (IN6_IS_ADDR_UNSPECIFIED(&otr->otr_answer.otp_ip) &&
	    otr->otr_answer.otp_port == 0)
		is_router = B_TRUE;

	/* The reqid is the entry pointer; validate it is really ours. */
	mutex_enter(&thdl->oth_lock);
	for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
	    entry = list_next(&thdl->oth_outstanding, entry)) {
		if ((uintptr_t)entry == otr->otr_reqid)
			break;
	}

	if (entry == NULL) {
		mutex_exit(&thdl->oth_lock);
		return (EINVAL);
	}
	list_remove(&thdl->oth_outstanding, entry);
	mutex_exit(&thdl->oth_lock);

	mutex_enter(&entry->ote_lock);
	bcopy(&otr->otr_answer, &entry->ote_dest,
	    sizeof (overlay_target_point_t));
	entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
	entry->ote_flags |= OVERLAY_ENTRY_F_VALID;
	if (is_router)
		entry->ote_flags |= OVERLAY_ENTRY_F_ROUTER;
	/* Detach the queued chain so it can be transmitted unlocked. */
	mp = entry->ote_chead;
	entry->ote_chead = NULL;
	entry->ote_ctail = NULL;
	entry->ote_mbsize = 0;
	entry->ote_vtime = gethrtime();
	mutex_exit(&entry->ote_lock);

	/*
	 * For now do an in-situ drain.
	 *
	 * TODO: overlay_m_tx() will need to perform remote fabric attachment
	 * checks, which may leave mblk_t's left in the msg chain for
	 * mblk_t's whose connectivity with the target entry are unknown.
	 * This will then need to deal with the leftovers.
	 */
	mp = overlay_m_tx(entry->ote_odd, mp);
	freemsgchain(mp);

	mutex_enter(&entry->ote_ott->ott_lock);
	entry->ote_ott->ott_ocount--;
	cv_signal(&entry->ote_ott->ott_cond);
	mutex_exit(&entry->ote_ott->ott_lock);

	return (0);
}
834
/*
 * OVERLAY_TARG_DROP handler: userland could not resolve the lookup.  Free
 * the head packet that triggered the request and, if more packets queued up
 * behind it, re-queue the entry for another lookup.  Finally drop the
 * target's outstanding count.
 */
static int
overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg)
{
	const overlay_targ_resp_t *otr = arg;
	overlay_target_entry_t *entry;
	mblk_t *mp;
	boolean_t queue = B_FALSE;

	/* The reqid is the entry pointer; validate it is really ours. */
	mutex_enter(&thdl->oth_lock);
	for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
	    entry = list_next(&thdl->oth_outstanding, entry)) {
		if ((uintptr_t)entry == otr->otr_reqid)
			break;
	}

	if (entry == NULL) {
		mutex_exit(&thdl->oth_lock);
		return (EINVAL);
	}
	list_remove(&thdl->oth_outstanding, entry);
	mutex_exit(&thdl->oth_lock);

	mutex_enter(&entry->ote_lock);

	/* Safeguard against a confused varpd */
	if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
		entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
		DTRACE_PROBE1(overlay__target__valid__drop,
		    overlay_target_entry_t *, entry);
		mutex_exit(&entry->ote_lock);
		goto done;
	}

	/*
	 * TODO: This will need to be smarter. This drop can only apply to
	 * packets from the same source fabric as the first mblk_t in the
	 * chain. If the target exists, packets from other fabrics which
	 * are chained to this target entry may be able to be sent (if we
	 * already know they are attached), or we might need to query from
	 * those other source fabrics if we don't know if the two are
	 * attached.
	 */
	mp = entry->ote_chead;
	if (mp != NULL) {
		/* Unlink only the head message; it is freed below. */
		entry->ote_chead = mp->b_next;
		mp->b_next = NULL;
		if (entry->ote_ctail == mp)
			entry->ote_ctail = entry->ote_chead;
		entry->ote_mbsize -= msgsize(mp);
	}
	if (entry->ote_chead != NULL) {
		queue = B_TRUE;
		entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
	} else {
		entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
	}
	mutex_exit(&entry->ote_lock);

	if (queue == B_TRUE)
		overlay_target_queue(entry);
	freemsg(mp);

done:
	mutex_enter(&entry->ote_ott->ott_lock);
	entry->ote_ott->ott_ocount--;
	cv_signal(&entry->ote_ott->ott_cond);
	mutex_exit(&entry->ote_ott->ott_lock);

	return (0);
}
905
/* ARGSUSED */
/*
 * Copy in an overlay_targ_pkt_t from userland, normalizing for the caller's
 * data model.  For ILP32 callers only the 32-bit structure is copied in and
 * the buffer pointer is widened in place.
 *
 * NOTE(review): the in-place widening assumes overlay_targ_pkt32_t is
 * layout-compatible with the 64-bit structure for every field other than
 * otp_buf -- confirm against the structure definitions.
 */
static int
overlay_target_pkt_copyin(const void *ubuf, void **outp, size_t *bsize,
    int flags)
{
	overlay_targ_pkt_t *pkt;
	overlay_targ_pkt32_t *pkt32;

	pkt = kmem_alloc(sizeof (overlay_targ_pkt_t), KM_SLEEP);
	*outp = pkt;
	*bsize = sizeof (overlay_targ_pkt_t);
	if (ddi_model_convert_from(flags & FMODELS) == DDI_MODEL_ILP32) {
		uintptr_t addr;

		if (ddi_copyin(ubuf, pkt, sizeof (overlay_targ_pkt32_t),
		    flags & FKIOCTL) != 0) {
			kmem_free(pkt, *bsize);
			return (EFAULT);
		}
		/* Read the narrow pointer before overwriting it in place. */
		pkt32 = (overlay_targ_pkt32_t *)pkt;
		addr = pkt32->otp_buf;
		pkt->otp_buf = (void *)addr;
	} else {
		if (ddi_copyin(ubuf, pkt, *bsize, flags & FKIOCTL) != 0) {
			kmem_free(pkt, *bsize);
			return (EFAULT);
		}
	}
	return (0);
}
936
/*
 * Copy an overlay_targ_pkt_t back out to userland, narrowing the buffer
 * pointer in place for ILP32 callers.  Note that the 32-bit case mutates
 * the kernel copy (otp_buf) before the copyout.
 */
static int
overlay_target_pkt_copyout(void *ubuf, void *buf, size_t bufsize,
    int flags)
{
	if (ddi_model_convert_from(flags & FMODELS) == DDI_MODEL_ILP32) {
		overlay_targ_pkt_t *pkt = buf;
		overlay_targ_pkt32_t *pkt32 = buf;
		uintptr_t addr = (uintptr_t)pkt->otp_buf;
		pkt32->otp_buf = (caddr32_t)addr;
		if (ddi_copyout(buf, ubuf, sizeof (overlay_targ_pkt32_t),
		    flags & FKIOCTL) != 0)
			return (EFAULT);
	} else {
		if (ddi_copyout(buf, ubuf, bufsize, flags & FKIOCTL) != 0)
			return (EFAULT);
	}
	return (0);
}
955
/*
 * OVERLAY_TARG_PKT handler: copy the first queued packet of an outstanding
 * lookup entry out to the userland buffer.  The packet remains queued; at
 * most otp_size bytes are copied and otp_size is updated to the amount
 * actually written.
 */
static int
overlay_target_packet(overlay_target_hdl_t *thdl, void *arg)
{
	overlay_targ_pkt_t *pkt = arg;
	overlay_target_entry_t *entry;
	mblk_t *mp;
	size_t mlen;
	size_t boff;

	/* The reqid is the entry pointer; validate it is really ours. */
	mutex_enter(&thdl->oth_lock);
	for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
	    entry = list_next(&thdl->oth_outstanding, entry)) {
		if ((uintptr_t)entry == pkt->otp_reqid)
			break;
	}

	if (entry == NULL) {
		mutex_exit(&thdl->oth_lock);
		return (EINVAL);
	}
	/* Take the entry lock before letting go of the handle lock. */
	mutex_enter(&entry->ote_lock);
	mutex_exit(&thdl->oth_lock);
	mp = entry->ote_chead;
	/* Protect against a rogue varpd */
	if (mp == NULL) {
		mutex_exit(&entry->ote_lock);
		return (EINVAL);
	}
	mlen = MIN(msgsize(mp), pkt->otp_size);
	pkt->otp_size = mlen;
	boff = 0;
	/* Walk the mblk chain, copying each fragment out in turn. */
	while (mlen > 0) {
		size_t wlen = MIN(MBLKL(mp), mlen);
		if (ddi_copyout(mp->b_rptr,
		    (void *)((uintptr_t)pkt->otp_buf + boff),
		    wlen, 0) != 0) {
			mutex_exit(&entry->ote_lock);
			return (EFAULT);
		}
		mlen -= wlen;
		boff += wlen;
		mp = mp->b_cont;
	}
	mutex_exit(&entry->ote_lock);
	return (0);
}
1002
1003 static int
1004 overlay_target_inject(overlay_target_hdl_t *thdl, void *arg)
1005 {
1006 overlay_targ_pkt_t *pkt = arg;
1007 overlay_target_entry_t *entry;
1008 overlay_dev_t *odd;
1009 mblk_t *mp;
1010
1011 if (pkt->otp_size > ETHERMAX + VLAN_TAGSZ)
1012 return (EINVAL);
1013
1014 mp = allocb(pkt->otp_size, 0);
1015 if (mp == NULL)
1016 return (ENOMEM);
1017
1018 if (ddi_copyin(pkt->otp_buf, mp->b_rptr, pkt->otp_size, 0) != 0) {
1019 freeb(mp);
1020 return (EFAULT);
1021 }
1022 mp->b_wptr += pkt->otp_size;
1023
1024 if (pkt->otp_linkid != UINT64_MAX) {
1025 odd = overlay_hold_by_dlid(pkt->otp_linkid);
1026 if (odd == NULL) {
1027 freeb(mp);
1028 return (ENOENT);
1029 }
1030 } else {
1031 mutex_enter(&thdl->oth_lock);
1032 for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
1033 entry = list_next(&thdl->oth_outstanding, entry)) {
1034 if ((uintptr_t)entry == pkt->otp_reqid)
1035 break;
1036 }
1037
1038 if (entry == NULL) {
1039 mutex_exit(&thdl->oth_lock);
1040 freeb(mp);
1041 return (ENOENT);
1042 }
1043 odd = entry->ote_odd;
1044 mutex_exit(&thdl->oth_lock);
1045 }
1046
1047 mutex_enter(&odd->odd_lock);
1048 overlay_io_start(odd, OVERLAY_F_IN_RX);
1049 mutex_exit(&odd->odd_lock);
1050
1051 mac_rx(odd->odd_mh, NULL, mp);
1052
1053 mutex_enter(&odd->odd_lock);
1054 overlay_io_done(odd, OVERLAY_F_IN_RX);
1055 mutex_exit(&odd->odd_lock);
1056
1057 return (0);
1058 }
1059
1060 static int
1061 overlay_target_resend(overlay_target_hdl_t *thdl, void *arg)
1062 {
1063 overlay_targ_pkt_t *pkt = arg;
1064 overlay_target_entry_t *entry;
1065 overlay_dev_t *odd;
1066 mblk_t *mp;
1067
1068 if (pkt->otp_size > ETHERMAX + VLAN_TAGSZ)
1069 return (EINVAL);
1070
1071 mp = allocb(pkt->otp_size, 0);
1072 if (mp == NULL)
1073 return (ENOMEM);
1074
1075 if (ddi_copyin(pkt->otp_buf, mp->b_rptr, pkt->otp_size, 0) != 0) {
1076 freeb(mp);
1077 return (EFAULT);
1078 }
1079 mp->b_wptr += pkt->otp_size;
1080
1081 if (pkt->otp_linkid != UINT64_MAX) {
1082 odd = overlay_hold_by_dlid(pkt->otp_linkid);
1083 if (odd == NULL) {
1084 freeb(mp);
1085 return (ENOENT);
1086 }
1087 } else {
1088 mutex_enter(&thdl->oth_lock);
1089 for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
1090 entry = list_next(&thdl->oth_outstanding, entry)) {
1091 if ((uintptr_t)entry == pkt->otp_reqid)
1092 break;
1093 }
1094
1095 if (entry == NULL) {
1096 mutex_exit(&thdl->oth_lock);
1097 freeb(mp);
1098 return (ENOENT);
1099 }
1100 odd = entry->ote_odd;
1101 mutex_exit(&thdl->oth_lock);
1102 }
1103
1104 mp = overlay_m_tx(odd, mp);
1105 freemsgchain(mp);
1106
1107 return (0);
1108 }
1109
/*
 * Internal representation of an OVERLAY_TARG_LIST request.  When the caller
 * supplies zero entries (otli_count set) we only report how many devices
 * exist; otherwise otli_ents is filled with up to otli_nents link ids.
 */
typedef struct overlay_targ_list_int {
	boolean_t otli_count;	/* count-only request? */
	uint32_t otli_cur;	/* number of devices seen so far */
	uint32_t otli_nents;	/* capacity of otli_ents */
	uint32_t otli_ents[];
} overlay_targ_list_int_t;
1116
/*
 * Copy in an overlay_targ_list_t request.  A request with zero entries
 * means the caller only wants the count of devices; otherwise the caller's
 * entry array is copied in so the list ioctl can fill it.
 */
static int
overlay_target_list_copyin(const void *ubuf, void **outp, size_t *bsize,
    int flags)
{
	overlay_targ_list_t n;
	overlay_targ_list_int_t *otl;

	if (ddi_copyin(ubuf, &n, sizeof (overlay_targ_list_t),
	    flags & FKIOCTL) != 0)
		return (EFAULT);

	/*
	 * Bound the entry count so that the allocation size computed below
	 * (header plus otl_nents 32-bit ids) cannot overflow.
	 */
	if (n.otl_nents >= INT32_MAX / sizeof (uint32_t))
		return (EINVAL);
	*bsize = sizeof (overlay_targ_list_int_t) +
	    sizeof (uint32_t) * n.otl_nents;
	otl = kmem_zalloc(*bsize, KM_SLEEP);
	otl->otli_cur = 0;
	otl->otli_nents = n.otl_nents;
	if (otl->otli_nents != 0) {
		otl->otli_count = B_FALSE;
		if (ddi_copyin((void *)((uintptr_t)ubuf +
		    offsetof(overlay_targ_list_t, otl_ents)),
		    otl->otli_ents, n.otl_nents * sizeof (uint32_t),
		    flags & FKIOCTL) != 0) {
			kmem_free(otl, *bsize);
			return (EFAULT);
		}
	} else {
		otl->otli_count = B_TRUE;
	}

	*outp = otl;
	return (0);
}
1153
1154 static int
1155 overlay_target_ioctl_list_cb(overlay_dev_t *odd, void *arg)
1156 {
1157 overlay_targ_list_int_t *otl = arg;
1158
1159 if (otl->otli_cur < otl->otli_nents)
1160 otl->otli_ents[otl->otli_cur] = odd->odd_linkid;
1161 otl->otli_cur++;
1162 return (0);
1163 }
1164
/*
 * OVERLAY_TARG_LIST ioctl handler: walk every overlay device, filling the
 * caller's buffer via overlay_target_ioctl_list_cb().
 */
/* ARGSUSED */
static int
overlay_target_ioctl_list(overlay_target_hdl_t *thdl, void *arg)
{
	overlay_dev_iter(overlay_target_ioctl_list_cb, arg);
	return (0);
}
1172
/*
 * Copyout handler for OVERLAY_TARG_LIST.  The total device count (otli_cur)
 * is written to the start of the user's buffer -- overlaying the otl_nents
 * field -- and, when entries were requested, the datalink ids gathered
 * during iteration follow at otl_ents.
 */
/* ARGSUSED */
static int
overlay_target_list_copyout(void *ubuf, void *buf, size_t bufsize, int flags)
{
	overlay_targ_list_int_t *otl = buf;

	if (ddi_copyout(&otl->otli_cur, ubuf, sizeof (uint32_t),
	    flags & FKIOCTL) != 0)
		return (EFAULT);

	if (otl->otli_count == B_FALSE) {
		if (ddi_copyout(otl->otli_ents,
		    (void *)((uintptr_t)ubuf +
		    offsetof(overlay_targ_list_t, otl_ents)),
		    sizeof (uint32_t) * otl->otli_nents,
		    flags & FKIOCTL) != 0)
			return (EFAULT);
	}
	return (0);
}
1193
/*
 * OVERLAY_TARG_CACHE_GET ioctl handler: fetch the cached destination for a
 * single MAC address on an overlay device.  For point-to-point targets the
 * single shared destination is returned; for dynamic targets the MAC is
 * looked up in the target cache.
 *
 * Returns ENOENT if the device or entry doesn't exist (or the entry holds
 * no valid destination), ENXIO if varpd is not attached, and ENOTSUP for
 * target modes that have no cache concept.
 */
/* ARGSUSED */
static int
overlay_target_cache_get(overlay_target_hdl_t *thdl, void *arg)
{
	int ret = 0;
	overlay_dev_t *odd;
	overlay_target_t *ott;
	overlay_targ_cache_t *otc = arg;

	odd = overlay_hold_by_dlid(otc->otc_linkid);
	if (odd == NULL)
		return (ENOENT);

	mutex_enter(&odd->odd_lock);
	if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
		mutex_exit(&odd->odd_lock);
		overlay_hold_rele(odd);
		return (ENXIO);
	}
	ott = odd->odd_target;
	if (ott->ott_mode != OVERLAY_TARGET_POINT &&
	    ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
		mutex_exit(&odd->odd_lock);
		overlay_hold_rele(odd);
		return (ENOTSUP);
	}
	/* Hand off from the device lock to the target lock. */
	mutex_enter(&ott->ott_lock);
	mutex_exit(&odd->odd_lock);

	if (ott->ott_mode == OVERLAY_TARGET_POINT) {
		otc->otc_entry.otce_flags = 0;
		bcopy(&ott->ott_u.ott_point, &otc->otc_entry.otce_dest,
		    sizeof (overlay_target_point_t));
	} else {
		overlay_target_entry_t *ote;
		ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
		    otc->otc_entry.otce_mac);
		if (ote == NULL) {
			ret = ENOENT;
			goto done;
		}

		mutex_enter(&ote->ote_lock);
		if ((ote->ote_flags & OVERLAY_ENTRY_F_VALID_MASK) != 0) {
			if (ote->ote_flags & OVERLAY_ENTRY_F_DROP) {
				otc->otc_entry.otce_flags =
				    OVERLAY_TARGET_CACHE_DROP;
			} else if (ote->ote_flags & OVERLAY_ENTRY_F_ROUTER) {
				otc->otc_entry.otce_flags =
				    OVERLAY_TARGET_CACHE_ROUTER;
			} else {
				otc->otc_entry.otce_flags = 0;
				bcopy(&ote->ote_dest, &otc->otc_entry.otce_dest,
				    sizeof (overlay_target_point_t));
			}
			ret = 0;
		} else {
			/* Entry exists but currently has no valid state. */
			ret = ENOENT;
		}
		mutex_exit(&ote->ote_lock);
	}

done:
	mutex_exit(&ott->ott_lock);
	overlay_hold_rele(odd);

	return (ret);
}
1262
/*
 * OVERLAY_TARG_CACHE_SET ioctl handler: install or update the target cache
 * entry for a MAC address on a dynamic overlay device.  If the entry
 * becomes valid, any packets that were queued on it awaiting resolution
 * are transmitted once the locks have been dropped.
 */
/* ARGSUSED */
static int
overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg)
{
	overlay_dev_t *odd;
	overlay_target_t *ott;
	overlay_target_entry_t *ote;
	overlay_targ_cache_t *otc = arg;
	mblk_t *mp = NULL;

	/* DROP and ROUTER are the only valid flags... */
	if (otc->otc_entry.otce_flags &
	    ~(OVERLAY_TARGET_CACHE_DROP | OVERLAY_TARGET_CACHE_ROUTER))
		return (EINVAL);

	/* ... and they are mutually exclusive. */
	if (otc->otc_entry.otce_flags ==
	    (OVERLAY_TARGET_CACHE_DROP | OVERLAY_TARGET_CACHE_ROUTER))
		return (EINVAL);

	odd = overlay_hold_by_dlid(otc->otc_linkid);
	if (odd == NULL)
		return (ENOENT);

	mutex_enter(&odd->odd_lock);
	if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
		mutex_exit(&odd->odd_lock);
		overlay_hold_rele(odd);
		return (ENXIO);
	}
	ott = odd->odd_target;
	if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
		mutex_exit(&odd->odd_lock);
		overlay_hold_rele(odd);
		return (ENOTSUP);
	}
	/* Hand off from the device lock to the target lock. */
	mutex_enter(&ott->ott_lock);
	mutex_exit(&odd->odd_lock);

	ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
	    otc->otc_entry.otce_mac);
	if (ote == NULL) {
		/* No entry yet for this MAC; create and insert one. */
		ote = kmem_cache_alloc(overlay_entry_cache, KM_SLEEP);
		bcopy(otc->otc_entry.otce_mac, ote->ote_addr, ETHERADDRL);
		ote->ote_chead = ote->ote_ctail = NULL;
		ote->ote_mbsize = 0;
		ote->ote_ott = ott;
		ote->ote_odd = odd;
		mutex_enter(&ote->ote_lock);
		refhash_insert(ott->ott_u.ott_dyn.ott_dhash, ote);
		avl_add(&ott->ott_u.ott_dyn.ott_tree, ote);
	} else {
		mutex_enter(&ote->ote_lock);
	}

	if (otc->otc_entry.otce_flags & OVERLAY_TARGET_CACHE_DROP) {
		ote->ote_flags |= OVERLAY_ENTRY_F_DROP;
	} else {
		ote->ote_flags |= OVERLAY_ENTRY_F_VALID;
		if (otc->otc_entry.otce_flags & OVERLAY_TARGET_CACHE_ROUTER)
			ote->ote_flags |= OVERLAY_ENTRY_F_ROUTER;
		bcopy(&otc->otc_entry.otce_dest, &ote->ote_dest,
		    sizeof (overlay_target_point_t));
		/* Detach any queued packet chain; transmitted below. */
		mp = ote->ote_chead;
		ote->ote_chead = NULL;
		ote->ote_ctail = NULL;
		ote->ote_mbsize = 0;
		ote->ote_vtime = gethrtime();
	}

	mutex_exit(&ote->ote_lock);
	mutex_exit(&ott->ott_lock);

	/*
	 * Transmit the formerly-queued chain with no locks held; the device
	 * hold taken above keeps ote_odd valid here.
	 */
	if (mp != NULL) {
		mp = overlay_m_tx(ote->ote_odd, mp);
		freemsgchain(mp);
	}

	overlay_hold_rele(odd);

	return (0);
}
1343
1344 /* ARGSUSED */
1345 static int
1346 overlay_target_cache_remove(overlay_target_hdl_t *thdl, void *arg)
1347 {
1348 int ret = 0;
1349 overlay_dev_t *odd;
1350 overlay_target_t *ott;
1351 overlay_target_entry_t *ote;
1352 overlay_targ_cache_t *otc = arg;
1353
1354 odd = overlay_hold_by_dlid(otc->otc_linkid);
1355 if (odd == NULL)
1356 return (ENOENT);
1357
1358 mutex_enter(&odd->odd_lock);
1359 if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
1360 mutex_exit(&odd->odd_lock);
1361 overlay_hold_rele(odd);
1362 return (ENXIO);
1363 }
1364 ott = odd->odd_target;
1365 if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
1366 mutex_exit(&odd->odd_lock);
1367 overlay_hold_rele(odd);
1368 return (ENOTSUP);
1369 }
1370 mutex_enter(&ott->ott_lock);
1371 mutex_exit(&odd->odd_lock);
1372
1373 ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
1374 otc->otc_entry.otce_mac);
1375 if (ote != NULL) {
1376 mutex_enter(&ote->ote_lock);
1377 ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK;
1378 mutex_exit(&ote->ote_lock);
1379 ret = 0;
1380 } else {
1381 ret = ENOENT;
1382 }
1383
1384 mutex_exit(&ott->ott_lock);
1385 overlay_hold_rele(odd);
1386
1387 return (ret);
1388 }
1389
1390 /* ARGSUSED */
1391 static int
1392 overlay_target_cache_flush(overlay_target_hdl_t *thdl, void *arg)
1393 {
1394 avl_tree_t *avl;
1395 overlay_dev_t *odd;
1396 overlay_target_t *ott;
1397 overlay_target_entry_t *ote;
1398 overlay_targ_cache_t *otc = arg;
1399
1400 odd = overlay_hold_by_dlid(otc->otc_linkid);
1401 if (odd == NULL)
1402 return (ENOENT);
1403
1404 mutex_enter(&odd->odd_lock);
1405 if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
1406 mutex_exit(&odd->odd_lock);
1407 overlay_hold_rele(odd);
1408 return (ENXIO);
1409 }
1410 ott = odd->odd_target;
1411 if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
1412 mutex_exit(&odd->odd_lock);
1413 overlay_hold_rele(odd);
1414 return (ENOTSUP);
1415 }
1416 mutex_enter(&ott->ott_lock);
1417 mutex_exit(&odd->odd_lock);
1418 avl = &ott->ott_u.ott_dyn.ott_tree;
1419
1420 for (ote = avl_first(avl); ote != NULL; ote = AVL_NEXT(avl, ote)) {
1421 mutex_enter(&ote->ote_lock);
1422 ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK;
1423 mutex_exit(&ote->ote_lock);
1424 }
1425 ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
1426 otc->otc_entry.otce_mac);
1427
1428 mutex_exit(&ott->ott_lock);
1429 overlay_hold_rele(odd);
1430
1431 return (0);
1432 }
1433
1434 static int
1435 overlay_target_cache_iter_copyin(const void *ubuf, void **outp, size_t *bsize,
1436 int flags)
1437 {
1438 overlay_targ_cache_iter_t base, *iter;
1439
1440 if (ddi_copyin(ubuf, &base, sizeof (overlay_targ_cache_iter_t),
1441 flags & FKIOCTL) != 0)
1442 return (EFAULT);
1443
1444 if (base.otci_count > OVERLAY_TARGET_ITER_MAX)
1445 return (E2BIG);
1446
1447 if (base.otci_count == 0)
1448 return (EINVAL);
1449
1450 *bsize = sizeof (overlay_targ_cache_iter_t) +
1451 base.otci_count * sizeof (overlay_targ_cache_entry_t);
1452 iter = kmem_alloc(*bsize, KM_SLEEP);
1453 bcopy(&base, iter, sizeof (overlay_targ_cache_iter_t));
1454 *outp = iter;
1455
1456 return (0);
1457 }
1458
/*
 * Iteration state for OVERLAY_TARG_CACHE_ITER, stashed opaquely in the
 * ioctl's otci_marker field between successive calls.
 */
typedef struct overlay_targ_cache_marker {
	uint8_t otcm_mac[ETHERADDRL];	/* MAC address to resume from */
	uint16_t otcm_done;		/* non-zero once iteration finished */
} overlay_targ_cache_marker_t;
1463
1464 /* ARGSUSED */
1465 static int
1466 overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg)
1467 {
1468 overlay_dev_t *odd;
1469 overlay_target_t *ott;
1470 overlay_target_entry_t lookup, *ent;
1471 overlay_targ_cache_marker_t *mark;
1472 avl_index_t where;
1473 avl_tree_t *avl;
1474 uint16_t written = 0;
1475
1476 overlay_targ_cache_iter_t *iter = arg;
1477 mark = (void *)&iter->otci_marker;
1478
1479 if (mark->otcm_done != 0) {
1480 iter->otci_count = 0;
1481 return (0);
1482 }
1483
1484 odd = overlay_hold_by_dlid(iter->otci_linkid);
1485 if (odd == NULL)
1486 return (ENOENT);
1487
1488 mutex_enter(&odd->odd_lock);
1489 if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
1490 mutex_exit(&odd->odd_lock);
1491 overlay_hold_rele(odd);
1492 return (ENXIO);
1493 }
1494 ott = odd->odd_target;
1495 if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC &&
1496 ott->ott_mode != OVERLAY_TARGET_POINT) {
1497 mutex_exit(&odd->odd_lock);
1498 overlay_hold_rele(odd);
1499 return (ENOTSUP);
1500 }
1501
1502 /*
1503 * Holding this lock across the entire iteration probably isn't very
1504 * good. We should perhaps add an r/w lock for the avl tree. But we'll
1505 * wait until we now it's necessary before we do more.
1506 */
1507 mutex_enter(&ott->ott_lock);
1508 mutex_exit(&odd->odd_lock);
1509
1510 if (ott->ott_mode == OVERLAY_TARGET_POINT) {
1511 overlay_targ_cache_entry_t *out = &iter->otci_ents[0];
1512 bzero(out->otce_mac, ETHERADDRL);
1513 out->otce_flags = 0;
1514 bcopy(&ott->ott_u.ott_point, &out->otce_dest,
1515 sizeof (overlay_target_point_t));
1516 written++;
1517 mark->otcm_done = 1;
1518 }
1519
1520 avl = &ott->ott_u.ott_dyn.ott_tree;
1521 bcopy(mark->otcm_mac, lookup.ote_addr, ETHERADDRL);
1522 ent = avl_find(avl, &lookup, &where);
1523
1524 /*
1525 * NULL ent means that the entry does not exist, so we want to start
1526 * with the closest node in the tree. This means that we implicitly rely
1527 * on the tree's order and the first node will be the mac 00:00:00:00:00
1528 * and the last will be ff:ff:ff:ff:ff:ff.
1529 */
1530 if (ent == NULL) {
1531 ent = avl_nearest(avl, where, AVL_AFTER);
1532 if (ent == NULL) {
1533 mark->otcm_done = 1;
1534 goto done;
1535 }
1536 }
1537
1538 for (; ent != NULL && written < iter->otci_count;
1539 ent = AVL_NEXT(avl, ent)) {
1540 overlay_targ_cache_entry_t *out = &iter->otci_ents[written];
1541 mutex_enter(&ent->ote_lock);
1542 if ((ent->ote_flags & OVERLAY_ENTRY_F_VALID_MASK) == 0) {
1543 mutex_exit(&ent->ote_lock);
1544 continue;
1545 }
1546 bcopy(ent->ote_addr, out->otce_mac, ETHERADDRL);
1547 out->otce_flags = 0;
1548 if (ent->ote_flags & OVERLAY_ENTRY_F_DROP)
1549 out->otce_flags |= OVERLAY_TARGET_CACHE_DROP;
1550 if (ent->ote_flags & OVERLAY_ENTRY_F_VALID)
1551 bcopy(&ent->ote_dest, &out->otce_dest,
1552 sizeof (overlay_target_point_t));
1553 written++;
1554 mutex_exit(&ent->ote_lock);
1555 }
1556
1557 if (ent != NULL) {
1558 bcopy(ent->ote_addr, mark->otcm_mac, ETHERADDRL);
1559 } else {
1560 mark->otcm_done = 1;
1561 }
1562
1563 done:
1564 iter->otci_count = written;
1565 mutex_exit(&ott->ott_lock);
1566 overlay_hold_rele(odd);
1567
1568 return (0);
1569 }
1570
1571 /* ARGSUSED */
1572 static int
1573 overlay_target_cache_iter_copyout(void *ubuf, void *buf, size_t bufsize,
1574 int flags)
1575 {
1576 size_t outsize;
1577 const overlay_targ_cache_iter_t *iter = buf;
1578
1579 outsize = sizeof (overlay_targ_cache_iter_t) +
1580 iter->otci_count * sizeof (overlay_targ_cache_entry_t);
1581
1582 if (ddi_copyout(buf, ubuf, outsize, flags & FKIOCTL) != 0)
1583 return (EFAULT);
1584
1585 return (0);
1586 }
1587
/*
 * Dispatch table for the overlay target ioctls.  Each row is:
 *   { oti_cmd, oti_write, oti_ncopyout, oti_copyin, oti_func, oti_copyout,
 *     oti_size }
 * oti_write requires the handle to have been opened FWRITE; oti_ncopyout
 * requests a copyout after a successful handler.  NULL copyin/copyout
 * entries get the default flat ddi_copyin/ddi_copyout of oti_size bytes.
 */
static overlay_target_ioctl_t overlay_target_ioctab[] = {
	{ OVERLAY_TARG_INFO, B_TRUE, B_TRUE,
		NULL, overlay_target_info,
		NULL, sizeof (overlay_targ_info_t)	},
	{ OVERLAY_TARG_ASSOCIATE, B_TRUE, B_FALSE,
		NULL, overlay_target_associate,
		NULL, sizeof (overlay_targ_associate_t)	},
	{ OVERLAY_TARG_DISASSOCIATE, B_TRUE, B_FALSE,
		NULL, overlay_target_disassociate,
		NULL, sizeof (overlay_targ_id_t)	},
	{ OVERLAY_TARG_DEGRADE, B_TRUE, B_FALSE,
		NULL, overlay_target_degrade,
		NULL, sizeof (overlay_targ_degrade_t)	},
	{ OVERLAY_TARG_RESTORE, B_TRUE, B_FALSE,
		NULL, overlay_target_restore,
		NULL, sizeof (overlay_targ_id_t)	},
	{ OVERLAY_TARG_LOOKUP, B_FALSE, B_TRUE,
		NULL, overlay_target_lookup_request,
		NULL, sizeof (overlay_targ_lookup_t)	},
	{ OVERLAY_TARG_RESPOND, B_TRUE, B_FALSE,
		NULL, overlay_target_lookup_respond,
		NULL, sizeof (overlay_targ_resp_t)	},
	{ OVERLAY_TARG_DROP, B_TRUE, B_FALSE,
		NULL, overlay_target_lookup_drop,
		NULL, sizeof (overlay_targ_resp_t)	},
	{ OVERLAY_TARG_PKT, B_TRUE, B_TRUE,
		overlay_target_pkt_copyin,
		overlay_target_packet,
		overlay_target_pkt_copyout,
		sizeof (overlay_targ_pkt_t)		},
	{ OVERLAY_TARG_INJECT, B_TRUE, B_FALSE,
		overlay_target_pkt_copyin,
		overlay_target_inject,
		NULL, sizeof (overlay_targ_pkt_t)	},
	{ OVERLAY_TARG_RESEND, B_TRUE, B_FALSE,
		overlay_target_pkt_copyin,
		overlay_target_resend,
		NULL, sizeof (overlay_targ_pkt_t)	},
	{ OVERLAY_TARG_LIST, B_FALSE, B_TRUE,
		overlay_target_list_copyin,
		overlay_target_ioctl_list,
		overlay_target_list_copyout,
		sizeof (overlay_targ_list_t)		},
	{ OVERLAY_TARG_CACHE_GET, B_FALSE, B_TRUE,
		NULL, overlay_target_cache_get,
		NULL, sizeof (overlay_targ_cache_t)	},
	{ OVERLAY_TARG_CACHE_SET, B_TRUE, B_TRUE,
		NULL, overlay_target_cache_set,
		NULL, sizeof (overlay_targ_cache_t)	},
	{ OVERLAY_TARG_CACHE_REMOVE, B_TRUE, B_TRUE,
		NULL, overlay_target_cache_remove,
		NULL, sizeof (overlay_targ_cache_t)	},
	{ OVERLAY_TARG_CACHE_FLUSH, B_TRUE, B_TRUE,
		NULL, overlay_target_cache_flush,
		NULL, sizeof (overlay_targ_cache_t)	},
	{ OVERLAY_TARG_CACHE_ITER, B_FALSE, B_TRUE,
		overlay_target_cache_iter_copyin,
		overlay_target_cache_iter,
		overlay_target_cache_iter_copyout,
		sizeof (overlay_targ_cache_iter_t)	},
	{ 0 }
};
1650
/*
 * open(9E) entry point for the overlay target device.  Only the global
 * zone, with datalink-configuration privilege, may open it; a write-capable
 * open must also be exclusive (FEXCL), and at most one exclusive opener may
 * exist at any time.  Each open receives its own minor and handle state.
 */
int
overlay_target_open(dev_t *devp, int flags, int otype, cred_t *credp)
{
	minor_t mid;
	overlay_target_hdl_t *thdl;

	if (secpolicy_dl_config(credp) != 0)
		return (EPERM);

	/* Only minor 0 may be opened directly; clones get fresh minors. */
	if (getminor(*devp) != 0)
		return (ENXIO);

	if (otype & OTYP_BLK)
		return (EINVAL);

	if (flags & ~(FREAD | FWRITE | FEXCL))
		return (EINVAL);

	/* Writers must also be exclusive openers. */
	if ((flags & FWRITE) &&
	    !(flags & FEXCL))
		return (EINVAL);

	if (!(flags & FREAD) && !(flags & FWRITE))
		return (EINVAL);

	if (crgetzoneid(credp) != GLOBAL_ZONEID)
		return (EPERM);

	mid = id_alloc(overlay_thdl_idspace);
	if (ddi_soft_state_zalloc(overlay_thdl_state, mid) != 0) {
		id_free(overlay_thdl_idspace, mid);
		return (ENXIO);
	}

	thdl = ddi_get_soft_state(overlay_thdl_state, mid);
	VERIFY(thdl != NULL);
	thdl->oth_minor = mid;
	thdl->oth_zoneid = crgetzoneid(credp);
	thdl->oth_oflags = flags;
	mutex_init(&thdl->oth_lock, NULL, MUTEX_DRIVER, NULL);
	list_create(&thdl->oth_outstanding, sizeof (overlay_target_entry_t),
	    offsetof(overlay_target_entry_t, ote_qlink));
	*devp = makedevice(getmajor(*devp), mid);

	mutex_enter(&overlay_target_lock);
	if ((flags & FEXCL) && overlay_target_excl == B_TRUE) {
		/* Lost the exclusivity race; undo everything above. */
		mutex_exit(&overlay_target_lock);
		list_destroy(&thdl->oth_outstanding);
		mutex_destroy(&thdl->oth_lock);
		ddi_soft_state_free(overlay_thdl_state, mid);
		id_free(overlay_thdl_idspace, mid);
		return (EEXIST);
	} else if ((flags & FEXCL) != 0) {
		VERIFY(overlay_target_excl == B_FALSE);
		overlay_target_excl = B_TRUE;
	}
	list_insert_tail(&overlay_thdl_list, thdl);
	mutex_exit(&overlay_target_lock);

	return (0);
}
1712
/*
 * ioctl(9E) entry point for the overlay target device.  Looks the command
 * up in overlay_target_ioctab and drives the generic copyin -> handler ->
 * copyout sequence, honoring any per-command copyin/copyout overrides in
 * the table.
 */
/* ARGSUSED */
int
overlay_target_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
    int *rvalp)
{
	overlay_target_ioctl_t *ioc;
	overlay_target_hdl_t *thdl;

	if (secpolicy_dl_config(credp) != 0)
		return (EPERM);

	if ((thdl = ddi_get_soft_state(overlay_thdl_state,
	    getminor(dev))) == NULL)
		return (ENXIO);

	for (ioc = &overlay_target_ioctab[0]; ioc->oti_cmd != 0; ioc++) {
		int ret;
		caddr_t buf;
		size_t bufsize;

		if (ioc->oti_cmd != cmd)
			continue;

		/* Mutating ioctls require a write-capable open. */
		if (ioc->oti_write == B_TRUE && !(mode & FWRITE))
			return (EBADF);

		if (ioc->oti_copyin == NULL) {
			/* Default: flat copy of the command structure. */
			bufsize = ioc->oti_size;
			buf = kmem_alloc(bufsize, KM_SLEEP);
			if (ddi_copyin((void *)(uintptr_t)arg, buf, bufsize,
			    mode & FKIOCTL) != 0) {
				kmem_free(buf, bufsize);
				return (EFAULT);
			}
		} else {
			if ((ret = ioc->oti_copyin((void *)(uintptr_t)arg,
			    (void **)&buf, &bufsize, mode)) != 0)
				return (ret);
		}

		ret = ioc->oti_func(thdl, buf);
		if (ret == 0 && ioc->oti_size != 0 &&
		    ioc->oti_ncopyout == B_TRUE) {
			if (ioc->oti_copyout == NULL) {
				if (ddi_copyout(buf, (void *)(uintptr_t)arg,
				    bufsize, mode & FKIOCTL) != 0)
					ret = EFAULT;
			} else {
				ret = ioc->oti_copyout((void *)(uintptr_t)arg,
				    buf, bufsize, mode);
			}
		}

		/* Custom copyins allocate *bsize bytes, so this pairs up. */
		kmem_free(buf, bufsize);
		return (ret);
	}

	return (ENOTTY);
}
1772
/*
 * close(9E) entry point for the overlay target device.  Any lookup
 * requests this handle had claimed are pushed back onto the global pending
 * list so another opener can service them, the exclusive-open flag is
 * released if held, and the handle's resources are torn down.
 */
/* ARGSUSED */
int
overlay_target_close(dev_t dev, int flags, int otype, cred_t *credp)
{
	overlay_target_hdl_t *thdl;
	overlay_target_entry_t *entry;
	minor_t mid = getminor(dev);

	if ((thdl = ddi_get_soft_state(overlay_thdl_state, mid)) == NULL)
		return (ENXIO);

	mutex_enter(&overlay_target_lock);
	list_remove(&overlay_thdl_list, thdl);
	mutex_enter(&thdl->oth_lock);
	while ((entry = list_remove_head(&thdl->oth_outstanding)) != NULL)
		list_insert_tail(&overlay_target_list, entry);
	/*
	 * NOTE(review): cv_signal wakes at most one waiter even though
	 * several entries may have been requeued -- presumably waiters
	 * re-check the list after waking; confirm against the lookup path.
	 */
	cv_signal(&overlay_target_condvar);
	mutex_exit(&thdl->oth_lock);
	if ((thdl->oth_oflags & FEXCL) != 0) {
		VERIFY(overlay_target_excl == B_TRUE);
		overlay_target_excl = B_FALSE;
	}
	mutex_exit(&overlay_target_lock);

	list_destroy(&thdl->oth_outstanding);
	mutex_destroy(&thdl->oth_lock);
	mid = thdl->oth_minor;
	ddi_soft_state_free(overlay_thdl_state, mid);
	id_free(overlay_thdl_idspace, mid);

	return (0);
}