1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright 2018 Joyent, Inc.
  14  */
  15 
  16 /*
  17  * Overlay device target cache management
  18  *
  19  * For more information, see the big theory statement in
  20  * uts/common/io/overlay/overlay.c
  21  */
  22 
  23 #include <sys/types.h>
  24 #include <sys/ethernet.h>
  25 #include <sys/kmem.h>
  26 #include <sys/policy.h>
  27 #include <sys/sysmacros.h>
  28 #include <sys/stream.h>
  29 #include <sys/strsun.h>
  30 #include <sys/strsubr.h>
  31 #include <sys/mac_provider.h>
  32 #include <sys/mac_client.h>
  33 #include <sys/mac_client_priv.h>
  34 #include <sys/vlan.h>
  35 #include <sys/crc32.h>
  36 #include <sys/cred.h>
  37 #include <sys/file.h>
  38 #include <sys/errno.h>
  39 #include <sys/ddi.h>
  40 #include <sys/sunddi.h>
  41 
  42 #include <sys/overlay_impl.h>
  43 #include <sys/sdt.h>
  44 
/*
 * This is a total straw man, but at least it's a prime number. Here we're
 * going to have to go through and do a lot of evaluation and understanding as
 * to how these target caches should grow and shrink, as well as how to handle
 * memory pressure and evictions. This just gives us a starting point that'll
 * be 'good enough', until it's not.
 */
  52 #define OVERLAY_HSIZE   823
  53 
  54 /*
  55  * We use this data structure to keep track of what requests have been actively
  56  * allocated to a given instance so we know what to put back on the pending
  57  * list.
  58  */
typedef struct overlay_target_hdl {
        /*
         * The trailing comment on each field names the lock that protects it.
         * Fields marked RO are presumably fixed once the handle has been set
         * up -- confirm against the code that creates handles.
         */
        minor_t oth_minor;              /* RO */
        zoneid_t oth_zoneid;            /* RO */
        int oth_oflags;                 /* RO */
        /* Linkage used by overlay_thdl_list. */
        list_node_t oth_link;           /* overlay_target_lock */
        kmutex_t oth_lock;
        /*
         * Entries that overlay_target_lookup_request() has handed out through
         * this handle and that have not yet been answered by a respond or drop
         * request.
         */
        list_t  oth_outstanding;        /* oth_lock */
} overlay_target_hdl_t;
  67 
/*
 * Signatures of the three stages an ioctl can go through.  The argument
 * meanings below are taken from the implementations in this file:
 *
 *   copyin:  (user buffer, out: kernel buffer, out: kernel buffer size,
 *            ioctl mode flags)
 *   ioctl:   (target handle, kernel copy of the argument)
 *   copyout: (user buffer, kernel buffer, kernel buffer size, ioctl mode
 *            flags)
 *
 * Each returns 0 on success or an errno value.
 */
typedef int (*overlay_target_copyin_f)(const void *, void **, size_t *, int);
typedef int (*overlay_target_ioctl_f)(overlay_target_hdl_t *, void *);
typedef int (*overlay_target_copyout_f)(void *, void *, size_t, int);

/*
 * Description of a single target ioctl.
 *
 * NOTE(review): the struct tag is misspelled ("overaly"); only the typedef
 * name overlay_target_ioctl_t is spelled correctly.
 */
typedef struct overaly_target_ioctl {
        int             oti_cmd;        /* ioctl id */
        boolean_t       oti_write;      /* ioctl requires FWRITE */
        /*
         * NOTE(review): the field name reads as "no copyout" while the comment
         * reads as "copyout data?" -- confirm the polarity against the ioctl
         * dispatcher before relying on either.
         */
        boolean_t       oti_ncopyout;   /* copyout data? */
        overlay_target_copyin_f oti_copyin;     /* copyin func */
        overlay_target_ioctl_f oti_func; /* function to call */
        overlay_target_copyout_f oti_copyout;   /* copyout func */
        size_t          oti_size;       /* size of user level structure */
} overlay_target_ioctl_t;
  81 
  82 static kmem_cache_t *overlay_target_cache;
  83 static kmem_cache_t *overlay_entry_cache;
  84 static id_space_t *overlay_thdl_idspace;
  85 static void *overlay_thdl_state;
  86 
  87 /*
  88  * When we support overlay devices in the NGZ, then all of these need to become
  89  * zone aware, by plugging into the netstack engine and becoming per-netstack
  90  * data.
  91  */
  92 static list_t overlay_thdl_list;
  93 static kmutex_t overlay_target_lock;
  94 static kcondvar_t overlay_target_condvar;
  95 static list_t overlay_target_list;
  96 static boolean_t overlay_target_excl;
  97 
  98 /*
  99  * Outstanding data per hash table entry.
 100  */
 101 static int overlay_ent_size = 128 * 1024;
 102 
 103 /* ARGSUSED */
 104 static int
 105 overlay_target_cache_constructor(void *buf, void *arg, int kmflgs)
 106 {
 107         overlay_target_t *ott = buf;
 108 
 109         mutex_init(&ott->ott_lock, NULL, MUTEX_DRIVER, NULL);
 110         cv_init(&ott->ott_cond, NULL, CV_DRIVER, NULL);
 111         return (0);
 112 }
 113 
 114 /* ARGSUSED */
 115 static void
 116 overlay_target_cache_destructor(void *buf, void *arg)
 117 {
 118         overlay_target_t *ott = buf;
 119 
 120         cv_destroy(&ott->ott_cond);
 121         mutex_destroy(&ott->ott_lock);
 122 }
 123 
 124 /* ARGSUSED */
 125 static int
 126 overlay_entry_cache_constructor(void *buf, void *arg, int kmflgs)
 127 {
 128         overlay_target_entry_t *ote = buf;
 129 
 130         bzero(ote, sizeof (overlay_target_entry_t));
 131         mutex_init(&ote->ote_lock, NULL, MUTEX_DRIVER, NULL);
 132         return (0);
 133 }
 134 
 135 /* ARGSUSED */
 136 static void
 137 overlay_entry_cache_destructor(void *buf, void *arg)
 138 {
 139         overlay_target_entry_t *ote = buf;
 140 
 141         mutex_destroy(&ote->ote_lock);
 142 }
 143 
 144 /* TODO: we will need to modify these to hash/cmp DCID + MAC */
 145 
 146 static uint64_t
 147 overlay_mac_hash(const void *v)
 148 {
 149         uint32_t crc;
 150         CRC32(crc, v, ETHERADDRL, -1U, crc32_table);
 151         return (crc);
 152 }
 153 
 154 static int
 155 overlay_mac_cmp(const void *a, const void *b)
 156 {
 157         return (bcmp(a, b, ETHERADDRL));
 158 }
 159 
 160 /* ARGSUSED */
 161 static void
 162 overlay_target_entry_dtor(void *arg)
 163 {
 164         overlay_target_entry_t *ote = arg;
 165 
 166         ote->ote_flags = 0;
 167         bzero(ote->ote_addr, ETHERADDRL);
 168         ote->ote_ott = NULL;
 169         ote->ote_odd = NULL;
 170         freemsgchain(ote->ote_chead);
 171         ote->ote_chead = ote->ote_ctail = NULL;
 172         ote->ote_mbsize = 0;
 173         ote->ote_vtime = 0;
 174         kmem_cache_free(overlay_entry_cache, ote);
 175 }
 176 
 177 static int
 178 overlay_mac_avl(const void *a, const void *b)
 179 {
 180         int i;
 181         const overlay_target_entry_t *l, *r;
 182         l = a;
 183         r = b;
 184 
 185         for (i = 0; i < ETHERADDRL; i++) {
 186                 if (l->ote_addr[i] > r->ote_addr[i])
 187                         return (1);
 188                 else if (l->ote_addr[i] < r->ote_addr[i])
 189                         return (-1);
 190         }
 191 
 192         return (0);
 193 }
 194 
/*
 * Set up the global state used by this file: the handle soft state, the two
 * kmem caches, the global lock and condition variable, the list of queued
 * entries, the list of handles and the id space named
 * "overlay_target_minors".  A failure to initialize the soft state is fatal
 * (VERIFY).
 */
void
overlay_target_init(void)
{
        int ret;
        /* Soft state sized to hold overlay_target_hdl_t instances. */
        ret = ddi_soft_state_init(&overlay_thdl_state,
            sizeof (overlay_target_hdl_t), 1);
        VERIFY(ret == 0);
        /* Object caches; each one has a constructor/destructor pair above. */
        overlay_target_cache = kmem_cache_create("overlay_target",
            sizeof (overlay_target_t), 0, overlay_target_cache_constructor,
            overlay_target_cache_destructor, NULL, NULL, NULL, 0);
        overlay_entry_cache = kmem_cache_create("overlay_entry",
            sizeof (overlay_target_entry_t), 0, overlay_entry_cache_constructor,
            overlay_entry_cache_destructor, NULL, NULL, NULL, 0);
        mutex_init(&overlay_target_lock, NULL, MUTEX_DRIVER, NULL);
        cv_init(&overlay_target_condvar, NULL, CV_DRIVER, NULL);
        /* Entries waiting to be picked up by a lookup request. */
        list_create(&overlay_target_list, sizeof (overlay_target_entry_t),
            offsetof(overlay_target_entry_t, ote_qlink));
        /* All target handles, linked through oth_link. */
        list_create(&overlay_thdl_list, sizeof (overlay_target_hdl_t),
            offsetof(overlay_target_hdl_t, oth_link));
        overlay_thdl_idspace = id_space_create("overlay_target_minors",
            1, INT32_MAX);
}
 217 
/*
 * Tear down everything overlay_target_init() created, in the reverse order
 * of its creation.
 */
void
overlay_target_fini(void)
{
        id_space_destroy(overlay_thdl_idspace);
        list_destroy(&overlay_thdl_list);
        list_destroy(&overlay_target_list);
        cv_destroy(&overlay_target_condvar);
        mutex_destroy(&overlay_target_lock);
        kmem_cache_destroy(overlay_entry_cache);
        kmem_cache_destroy(overlay_target_cache);
        ddi_soft_state_fini(&overlay_thdl_state);
}
 230 
 231 void
 232 overlay_target_free(overlay_dev_t *odd)
 233 {
 234         if (odd->odd_target == NULL)
 235                 return;
 236 
 237         if (odd->odd_target->ott_mode == OVERLAY_TARGET_DYNAMIC) {
 238                 refhash_t *rp = odd->odd_target->ott_u.ott_dyn.ott_dhash;
 239                 avl_tree_t *ap = &odd->odd_target->ott_u.ott_dyn.ott_tree;
 240                 overlay_target_entry_t *ote;
 241 
 242                 /* TODO: remove from L3 trees */
 243 
 244                 /*
 245                  * Our AVL tree and hashtable contain the same elements,
 246                  * therefore we should just remove it from the tree, but then
 247                  * delete the entries when we remove them from the hash table
 248                  * (which happens through the refhash dtor).
 249                  */
 250                 while ((ote = avl_first(ap)) != NULL)
 251                         avl_remove(ap, ote);
 252 
 253                 avl_destroy(ap);
 254                 for (ote = refhash_first(rp); ote != NULL;
 255                     ote = refhash_next(rp, ote)) {
 256                         refhash_remove(rp, ote);
 257                 }
 258                 refhash_destroy(rp);
 259         }
 260 
 261         ASSERT(odd->odd_target->ott_ocount == 0);
 262         kmem_cache_free(overlay_target_cache, odd->odd_target);
 263 }
 264 
 265 int
 266 overlay_target_busy()
 267 {
 268         int ret;
 269 
 270         mutex_enter(&overlay_target_lock);
 271         ret = !list_is_empty(&overlay_thdl_list);
 272         mutex_exit(&overlay_target_lock);
 273 
 274         return (ret);
 275 }
 276 
 277 static void
 278 overlay_target_queue(overlay_target_entry_t *entry)
 279 {
 280         mutex_enter(&overlay_target_lock);
 281         mutex_enter(&entry->ote_ott->ott_lock);
 282         if (entry->ote_ott->ott_flags & OVERLAY_T_TEARDOWN) {
 283                 mutex_exit(&entry->ote_ott->ott_lock);
 284                 mutex_exit(&overlay_target_lock);
 285                 return;
 286         }
 287         entry->ote_ott->ott_ocount++;
 288         mutex_exit(&entry->ote_ott->ott_lock);
 289         list_insert_tail(&overlay_target_list, entry);
 290         cv_signal(&overlay_target_condvar);
 291         mutex_exit(&overlay_target_lock);
 292 }
 293 
 294 void
 295 overlay_target_quiesce(overlay_target_t *ott)
 296 {
 297         if (ott == NULL)
 298                 return;
 299         mutex_enter(&ott->ott_lock);
 300         ott->ott_flags |= OVERLAY_T_TEARDOWN;
 301         while (ott->ott_ocount != 0)
 302                 cv_wait(&ott->ott_cond, &ott->ott_lock);
 303         mutex_exit(&ott->ott_lock);
 304 }
 305 
/*
 * Work out where a packet leaving the overlay device should be sent.
 *
 * This function assumes that the destination mode is OVERLAY_PLUGIN_D_IP |
 * OVERLAY_PLUGIN_D_PORT. As we don't have an implementation of anything else at
 * this time, say for NVGRE, we drop all packets for targets that use any
 * other destination mode.
 *
 * Return values:
 *   OVERLAY_TARGET_OK     'sock' holds an AF_INET6 sockaddr_in6 with the
 *                         destination address and port, and *slenp its size.
 *   OVERLAY_TARGET_ASYNC  the destination is not known yet; 'mp' has been
 *                         chained onto the cache entry for its destination
 *                         MAC address.
 *   OVERLAY_TARGET_DROP   no destination is available; 'mp' was not chained
 *                         onto any entry.
 *
 * 'sock' is written as a struct sockaddr_in6, so the caller's buffer must be
 * at least that large.
 *
 * XXX: It might be better to replace the 'sock' argument with
 * overlay_target_entry_t** and set it with the found entry in the case
 * of OVERLAY_TARGET_OK.
 */
int
overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock,
    socklen_t *slenp)
{
        int ret;
        struct sockaddr_in6 *v6;
        overlay_target_t *ott;
        mac_header_info_t mhi;
        overlay_target_entry_t *entry;

        ASSERT(odd->odd_target != NULL);

        /*
         * At this point, the overlay device is in a mux which means that it's
         * been activated. Parts of the target, such as the mode and the
         * destination, are now read-only and we don't have to worry about
         * synchronization for them.
         */
        ott = odd->odd_target;
        if (ott->ott_dest != (OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT))
                return (OVERLAY_TARGET_DROP);

        v6 = (struct sockaddr_in6 *)sock;
        bzero(v6, sizeof (struct sockaddr_in6));
        v6->sin6_family = AF_INET6;

        /* A point target has one fixed destination for every packet. */
        if (ott->ott_mode == OVERLAY_TARGET_POINT) {
                mutex_enter(&ott->ott_lock);
                bcopy(&ott->ott_u.ott_point.otp_ip, &v6->sin6_addr,
                    sizeof (struct in6_addr));
                v6->sin6_port = htons(ott->ott_u.ott_point.otp_port);
                mutex_exit(&ott->ott_lock);
                *slenp = sizeof (struct sockaddr_in6);

                return (OVERLAY_TARGET_OK);
        }

        ASSERT(ott->ott_mode == OVERLAY_TARGET_DYNAMIC);

        /*
         * Note we only want the MAC address here, therefore we won't bother
         * using mac_vlan_header_info(). If any caller needs the vlan info at
         * this point, this should change to a call to mac_vlan_header_info().
         */
        if (mac_header_info(odd->odd_mh, mp, &mhi) != 0)
                return (OVERLAY_TARGET_DROP);

        /*
         * TODO: compare mhi.mhi_daddr with odd->macaddr.
         * If match,
         *      get VL3 dest from mp
         *      lookup target using VL3 dest
         * otherwise,
         *      lookup target using VL2 dest (existing refhash_lookup() call
         *      below)
         */
        mutex_enter(&ott->ott_lock);
        entry = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
            mhi.mhi_daddr);
        if (entry == NULL) {
                /*
                 * First packet for this MAC address: create a pending entry
                 * that holds the packet and queue it for lookup. We must not
                 * sleep here, so an allocation failure drops the packet.
                 */
                entry = kmem_cache_alloc(overlay_entry_cache,
                    KM_NOSLEEP | KM_NORMALPRI);
                if (entry == NULL) {
                        mutex_exit(&ott->ott_lock);
                        return (OVERLAY_TARGET_DROP);
                }
                /*
                 * TODO: set entry->ote_dcid, if VL3 lookup, copy dst addr
                 * into entry->ote_ip.  Probably zero out the address we're
                 * not looking up (VL2 or VL3) as well.
                 */
                bcopy(mhi.mhi_daddr, entry->ote_addr, ETHERADDRL);
                entry->ote_chead = entry->ote_ctail = mp;
                entry->ote_mbsize = msgsize(mp);
                entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
                entry->ote_ott = ott;
                entry->ote_odd = odd;
                /* The entry lives in both the hash and the AVL tree. */
                refhash_insert(ott->ott_u.ott_dyn.ott_dhash, entry);
                avl_add(&ott->ott_u.ott_dyn.ott_tree, entry);
                mutex_exit(&ott->ott_lock);
                overlay_target_queue(entry);
                return (OVERLAY_TARGET_ASYNC);
        }
        /*
         * Hold the entry so that it stays around while we inspect it without
         * ott_lock held.
         */
        refhash_hold(ott->ott_u.ott_dyn.ott_dhash, entry);
        mutex_exit(&ott->ott_lock);

        mutex_enter(&entry->ote_lock);
        if (entry->ote_flags & OVERLAY_ENTRY_F_DROP) {
                ret = OVERLAY_TARGET_DROP;
        } else if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
                bcopy(&entry->ote_dest.otp_ip, &v6->sin6_addr,
                    sizeof (struct in6_addr));
                v6->sin6_port = htons(entry->ote_dest.otp_port);
                *slenp = sizeof (struct sockaddr_in6);
                ret = OVERLAY_TARGET_OK;
        } else {
                /*
                 * The destination is still unknown. Chain the packet onto the
                 * entry unless that would push the queued data past
                 * overlay_ent_size bytes.
                 */
                size_t mlen = msgsize(mp);

                if (mlen + entry->ote_mbsize > overlay_ent_size) {
                        ret = OVERLAY_TARGET_DROP;
                } else {
                        if (entry->ote_ctail != NULL) {
                                ASSERT(entry->ote_ctail->b_next ==
                                    NULL);
                                entry->ote_ctail->b_next = mp;
                                entry->ote_ctail = mp;
                        } else {
                                entry->ote_chead = mp;
                                entry->ote_ctail = mp;
                        }
                        entry->ote_mbsize += mlen;
                        /* Only queue the entry if it is not already pending. */
                        if ((entry->ote_flags &
                            OVERLAY_ENTRY_F_PENDING) == 0) {
                                entry->ote_flags |=
                                    OVERLAY_ENTRY_F_PENDING;
                                overlay_target_queue(entry);
                        }
                        ret = OVERLAY_TARGET_ASYNC;
                }
        }
        mutex_exit(&entry->ote_lock);

        /* Drop the hold taken above. */
        mutex_enter(&ott->ott_lock);
        refhash_rele(ott->ott_u.ott_dyn.ott_dhash, entry);
        mutex_exit(&ott->ott_lock);

        return (ret);
}
 443 
 444 /* ARGSUSED */
 445 static int
 446 overlay_target_info(overlay_target_hdl_t *thdl, void *arg)
 447 {
 448         overlay_dev_t *odd;
 449         overlay_targ_info_t *oti = arg;
 450 
 451         odd = overlay_hold_by_dlid(oti->oti_linkid);
 452         if (odd == NULL)
 453                 return (ENOENT);
 454 
 455         mutex_enter(&odd->odd_lock);
 456         oti->oti_flags = 0;
 457         oti->oti_needs = odd->odd_plugin->ovp_dest;
 458         if (odd->odd_flags & OVERLAY_F_DEGRADED)
 459                 oti->oti_flags |= OVERLAY_TARG_INFO_F_DEGRADED;
 460         if (odd->odd_flags & OVERLAY_F_ACTIVATED)
 461                 oti->oti_flags |= OVERLAY_TARG_INFO_F_ACTIVE;
 462         oti->oti_vnetid = odd->odd_vid;
 463         oti->oti_dcid = odd->odd_dcid;
 464         mutex_exit(&odd->odd_lock);
 465         overlay_hold_rele(odd);
 466         return (0);
 467 }
 468 
 469 /* ARGSUSED */
 470 static int
 471 overlay_target_associate(overlay_target_hdl_t *thdl, void *arg)
 472 {
 473         overlay_dev_t *odd;
 474         overlay_target_t *ott;
 475         overlay_targ_associate_t *ota = arg;
 476 
 477         odd = overlay_hold_by_dlid(ota->ota_linkid);
 478         if (odd == NULL)
 479                 return (ENOENT);
 480 
 481         if (ota->ota_id == 0) {
 482                 overlay_hold_rele(odd);
 483                 return (EINVAL);
 484         }
 485 
 486         if (ota->ota_mode != OVERLAY_TARGET_POINT &&
 487             ota->ota_mode != OVERLAY_TARGET_DYNAMIC) {
 488                 overlay_hold_rele(odd);
 489                 return (EINVAL);
 490         }
 491 
 492         if (ota->ota_provides != odd->odd_plugin->ovp_dest) {
 493                 overlay_hold_rele(odd);
 494                 return (EINVAL);
 495         }
 496 
 497         if (ota->ota_mode == OVERLAY_TARGET_POINT) {
 498                 if (ota->ota_provides & OVERLAY_PLUGIN_D_IP) {
 499                         if (IN6_IS_ADDR_UNSPECIFIED(&ota->ota_point.otp_ip) ||
 500                             IN6_IS_ADDR_V4COMPAT(&ota->ota_point.otp_ip) ||
 501                             IN6_IS_ADDR_V4MAPPED_ANY(&ota->ota_point.otp_ip)) {
 502                                 overlay_hold_rele(odd);
 503                                 return (EINVAL);
 504                         }
 505                 }
 506 
 507                 if (ota->ota_provides & OVERLAY_PLUGIN_D_PORT) {
 508                         if (ota->ota_point.otp_port == 0) {
 509                                 overlay_hold_rele(odd);
 510                                 return (EINVAL);
 511                         }
 512                 }
 513         }
 514 
 515         ott = kmem_cache_alloc(overlay_target_cache, KM_SLEEP);
 516         ott->ott_flags = 0;
 517         ott->ott_ocount = 0;
 518         ott->ott_mode = ota->ota_mode;
 519         ott->ott_dest = ota->ota_provides;
 520         ott->ott_id = ota->ota_id;
 521 
 522         if (ott->ott_mode == OVERLAY_TARGET_POINT) {
 523                 bcopy(&ota->ota_point, &ott->ott_u.ott_point,
 524                     sizeof (overlay_target_point_t));
 525         } else {
 526                 ott->ott_u.ott_dyn.ott_dhash = refhash_create(OVERLAY_HSIZE,
 527                     overlay_mac_hash, overlay_mac_cmp,
 528                     overlay_target_entry_dtor, sizeof (overlay_target_entry_t),
 529                     offsetof(overlay_target_entry_t, ote_reflink),
 530                     offsetof(overlay_target_entry_t, ote_addr), KM_SLEEP);
 531                 avl_create(&ott->ott_u.ott_dyn.ott_tree, overlay_mac_avl,
 532                     sizeof (overlay_target_entry_t),
 533                     offsetof(overlay_target_entry_t, ote_avllink));
 534         }
 535         mutex_enter(&odd->odd_lock);
 536         if (odd->odd_flags & OVERLAY_F_VARPD) {
 537                 mutex_exit(&odd->odd_lock);
 538                 kmem_cache_free(overlay_target_cache, ott);
 539                 overlay_hold_rele(odd);
 540                 return (EEXIST);
 541         }
 542 
 543         odd->odd_flags |= OVERLAY_F_VARPD;
 544         odd->odd_target = ott;
 545         mutex_exit(&odd->odd_lock);
 546 
 547         overlay_hold_rele(odd);
 548 
 549 
 550         return (0);
 551 }
 552 
 553 
 554 /* ARGSUSED */
 555 static int
 556 overlay_target_degrade(overlay_target_hdl_t *thdl, void *arg)
 557 {
 558         overlay_dev_t *odd;
 559         overlay_targ_degrade_t *otd = arg;
 560 
 561         odd = overlay_hold_by_dlid(otd->otd_linkid);
 562         if (odd == NULL)
 563                 return (ENOENT);
 564 
 565         overlay_fm_degrade(odd, otd->otd_buf);
 566         overlay_hold_rele(odd);
 567         return (0);
 568 }
 569 
 570 /* ARGSUSED */
 571 static int
 572 overlay_target_restore(overlay_target_hdl_t *thdl, void *arg)
 573 {
 574         overlay_dev_t *odd;
 575         overlay_targ_id_t *otid = arg;
 576 
 577         odd = overlay_hold_by_dlid(otid->otid_linkid);
 578         if (odd == NULL)
 579                 return (ENOENT);
 580 
 581         overlay_fm_restore(odd);
 582         overlay_hold_rele(odd);
 583         return (0);
 584 }
 585 
 586 /* ARGSUSED */
 587 static int
 588 overlay_target_disassociate(overlay_target_hdl_t *thdl, void *arg)
 589 {
 590         overlay_dev_t *odd;
 591         overlay_targ_id_t *otid = arg;
 592 
 593         odd = overlay_hold_by_dlid(otid->otid_linkid);
 594         if (odd == NULL)
 595                 return (ENOENT);
 596 
 597         mutex_enter(&odd->odd_lock);
 598         odd->odd_flags &= ~OVERLAY_F_VARPD;
 599         mutex_exit(&odd->odd_lock);
 600 
 601         overlay_hold_rele(odd);
 602         return (0);
 603 
 604 }
 605 
 606 static int
 607 overlay_target_lookup_request(overlay_target_hdl_t *thdl, void *arg)
 608 {
 609         overlay_targ_lookup_t *otl = arg;
 610         overlay_target_entry_t *entry;
 611         clock_t ret, timeout;
 612         mac_header_info_t mhi;
 613 
 614         timeout = ddi_get_lbolt() + drv_usectohz(MICROSEC);
 615 again:
 616         mutex_enter(&overlay_target_lock);
 617         while (list_is_empty(&overlay_target_list)) {
 618                 ret = cv_timedwait(&overlay_target_condvar,
 619                     &overlay_target_lock, timeout);
 620                 if (ret == -1) {
 621                         mutex_exit(&overlay_target_lock);
 622                         return (ETIME);
 623                 }
 624         }
 625         entry = list_remove_head(&overlay_target_list);
 626         mutex_exit(&overlay_target_lock);
 627         mutex_enter(&entry->ote_lock);
 628         if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
 629                 ASSERT(entry->ote_chead == NULL);
 630                 mutex_exit(&entry->ote_lock);
 631                 goto again;
 632         }
 633         ASSERT(entry->ote_chead != NULL);
 634 
 635         /*
 636          * If we have a bogon that doesn't have a valid mac header, drop it and
 637          * try again.
 638          */
 639         if (mac_vlan_header_info(entry->ote_odd->odd_mh, entry->ote_chead,
 640             &mhi) != 0) {
 641                 boolean_t queue = B_FALSE;
 642                 mblk_t *mp = entry->ote_chead;
 643                 entry->ote_chead = mp->b_next;
 644                 mp->b_next = NULL;
 645                 if (entry->ote_ctail == mp)
 646                         entry->ote_ctail = entry->ote_chead;
 647                 entry->ote_mbsize -= msgsize(mp);
 648                 if (entry->ote_chead != NULL)
 649                         queue = B_TRUE;
 650                 mutex_exit(&entry->ote_lock);
 651                 if (queue == B_TRUE)
 652                         overlay_target_queue(entry);
 653                 freemsg(mp);
 654                 goto again;
 655         }
 656 
 657         /*
 658          * TODO: If VL3 request,
 659          *      set otl->otl_l3req
 660          *      Fill in otl_{src,dst}ip
 661          * Else
 662          *      clear otl->otl_l3req
 663          */
 664         otl->otl_dlid = entry->ote_odd->odd_linkid;
 665         otl->otl_reqid = (uintptr_t)entry;
 666         otl->otl_varpdid = entry->ote_ott->ott_id;
 667         otl->otl_vnetid = entry->ote_odd->odd_vid;
 668 
 669         otl->otl_hdrsize = mhi.mhi_hdrsize;
 670         otl->otl_pktsize = msgsize(entry->ote_chead) - otl->otl_hdrsize;
 671         bcopy(mhi.mhi_daddr, otl->otl_addru.otlu_l2.otl2_dstaddr, ETHERADDRL);
 672         bcopy(mhi.mhi_saddr, otl->otl_addru.otlu_l2.otl2_srcaddr, ETHERADDRL);
 673         otl->otl_addru.otlu_l2.otl2_dsttype = mhi.mhi_dsttype;
 674         otl->otl_addru.otlu_l2.otl2_sap = mhi.mhi_bindsap;
 675         otl->otl_vlan = VLAN_ID(mhi.mhi_tci);
 676         mutex_exit(&entry->ote_lock);
 677 
 678         mutex_enter(&thdl->oth_lock);
 679         list_insert_tail(&thdl->oth_outstanding, entry);
 680         mutex_exit(&thdl->oth_lock);
 681 
 682         return (0);
 683 }
 684 
 685 static int
 686 overlay_target_lookup_respond(overlay_target_hdl_t *thdl, void *arg)
 687 {
 688         const overlay_targ_resp_t *otr = arg;
 689         overlay_target_entry_t *entry;
 690         mblk_t *mp;
 691 
 692         mutex_enter(&thdl->oth_lock);
 693         for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
 694             entry = list_next(&thdl->oth_outstanding, entry)) {
 695                 if ((uintptr_t)entry == otr->otr_reqid)
 696                         break;
 697         }
 698 
 699         if (entry == NULL) {
 700                 mutex_exit(&thdl->oth_lock);
 701                 return (EINVAL);
 702         }
 703         list_remove(&thdl->oth_outstanding, entry);
 704         mutex_exit(&thdl->oth_lock);
 705 
 706         mutex_enter(&entry->ote_lock);
 707         bcopy(&otr->otr_answer, &entry->ote_dest,
 708             sizeof (overlay_target_point_t));
 709         entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
 710         entry->ote_flags |= OVERLAY_ENTRY_F_VALID;
 711         mp = entry->ote_chead;
 712         entry->ote_chead = NULL;
 713         entry->ote_ctail = NULL;
 714         entry->ote_mbsize = 0;
 715         entry->ote_vtime = gethrtime();
 716         mutex_exit(&entry->ote_lock);
 717 
 718         /*
 719          * For now do an in-situ drain.
 720          *
 721          * TODO: overlay_m_tx() will need to perform remote fabric attachment
 722          * checks, which may leave mblk_t's left in the msg chain for
 723          * mblk_t's whose connectivity with the target entry are unknown.
 724          * This will then need to deal with the leftovers.
 725          */
 726         mp = overlay_m_tx(entry->ote_odd, mp);
 727         freemsgchain(mp);
 728 
 729         mutex_enter(&entry->ote_ott->ott_lock);
 730         entry->ote_ott->ott_ocount--;
 731         cv_signal(&entry->ote_ott->ott_cond);
 732         mutex_exit(&entry->ote_ott->ott_lock);
 733 
 734         return (0);
 735 }
 736 
 737 static int
 738 overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg)
 739 {
 740         const overlay_targ_resp_t *otr = arg;
 741         overlay_target_entry_t *entry;
 742         mblk_t *mp;
 743         boolean_t queue = B_FALSE;
 744 
 745         mutex_enter(&thdl->oth_lock);
 746         for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
 747             entry = list_next(&thdl->oth_outstanding, entry)) {
 748                 if ((uintptr_t)entry == otr->otr_reqid)
 749                         break;
 750         }
 751 
 752         if (entry == NULL) {
 753                 mutex_exit(&thdl->oth_lock);
 754                 return (EINVAL);
 755         }
 756         list_remove(&thdl->oth_outstanding, entry);
 757         mutex_exit(&thdl->oth_lock);
 758 
 759         mutex_enter(&entry->ote_lock);
 760 
 761         /* Safeguard against a confused varpd */
 762         if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
 763                 entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
 764                 DTRACE_PROBE1(overlay__target__valid__drop,
 765                     overlay_target_entry_t *, entry);
 766                 mutex_exit(&entry->ote_lock);
 767                 goto done;
 768         }
 769 
 770         /*
 771          * TODO: This will need to be smarter.  This drop can only apply to
 772          * packets from the same source fabric as the first mblk_t in the
 773          * chain.  If the target exists, packets from other fabrics which
 774          * are chained to this target entry may be able to be sent (if we
 775          * already know they are attached), or we might need to query from
 776          * those other source fabrics if we don't know if the two are
 777          * attached.
 778          */
 779         mp = entry->ote_chead;
 780         if (mp != NULL) {
 781                 entry->ote_chead = mp->b_next;
 782                 mp->b_next = NULL;
 783                 if (entry->ote_ctail == mp)
 784                         entry->ote_ctail = entry->ote_chead;
 785                 entry->ote_mbsize -= msgsize(mp);
 786         }
 787         if (entry->ote_chead != NULL) {
 788                 queue = B_TRUE;
 789                 entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
 790         } else {
 791                 entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
 792         }
 793         mutex_exit(&entry->ote_lock);
 794 
 795         if (queue == B_TRUE)
 796                 overlay_target_queue(entry);
 797         freemsg(mp);
 798 
 799 done:
 800         mutex_enter(&entry->ote_ott->ott_lock);
 801         entry->ote_ott->ott_ocount--;
 802         cv_signal(&entry->ote_ott->ott_cond);
 803         mutex_exit(&entry->ote_ott->ott_lock);
 804 
 805         return (0);
 806 }
 807 
 808 /* ARGSUSED */
 809 static int
 810 overlay_target_pkt_copyin(const void *ubuf, void **outp, size_t *bsize,
 811     int flags)
 812 {
 813         overlay_targ_pkt_t *pkt;
 814         overlay_targ_pkt32_t *pkt32;
 815 
 816         pkt = kmem_alloc(sizeof (overlay_targ_pkt_t), KM_SLEEP);
 817         *outp = pkt;
 818         *bsize = sizeof (overlay_targ_pkt_t);
 819         if (ddi_model_convert_from(flags & FMODELS) == DDI_MODEL_ILP32) {
 820                 uintptr_t addr;
 821 
 822                 if (ddi_copyin(ubuf, pkt, sizeof (overlay_targ_pkt32_t),
 823                     flags & FKIOCTL) != 0) {
 824                         kmem_free(pkt, *bsize);
 825                         return (EFAULT);
 826                 }
 827                 pkt32 = (overlay_targ_pkt32_t *)pkt;
 828                 addr = pkt32->otp_buf;
 829                 pkt->otp_buf = (void *)addr;
 830         } else {
 831                 if (ddi_copyin(ubuf, pkt, *bsize, flags & FKIOCTL) != 0) {
 832                         kmem_free(pkt, *bsize);
 833                         return (EFAULT);
 834                 }
 835         }
 836         return (0);
 837 }
 838 
 839 static int
 840 overlay_target_pkt_copyout(void *ubuf, void *buf, size_t bufsize,
 841     int flags)
 842 {
 843         if (ddi_model_convert_from(flags & FMODELS) == DDI_MODEL_ILP32) {
 844                 overlay_targ_pkt_t *pkt = buf;
 845                 overlay_targ_pkt32_t *pkt32 = buf;
 846                 uintptr_t addr = (uintptr_t)pkt->otp_buf;
 847                 pkt32->otp_buf = (caddr32_t)addr;
 848                 if (ddi_copyout(buf, ubuf, sizeof (overlay_targ_pkt32_t),
 849                     flags & FKIOCTL) != 0)
 850                         return (EFAULT);
 851         } else {
 852                 if (ddi_copyout(buf, ubuf, bufsize, flags & FKIOCTL) != 0)
 853                         return (EFAULT);
 854         }
 855         return (0);
 856 }
 857 
 858 static int
 859 overlay_target_packet(overlay_target_hdl_t *thdl, void *arg)
 860 {
 861         overlay_targ_pkt_t *pkt = arg;
 862         overlay_target_entry_t *entry;
 863         mblk_t *mp;
 864         size_t mlen;
 865         size_t boff;
 866 
 867         mutex_enter(&thdl->oth_lock);
 868         for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
 869             entry = list_next(&thdl->oth_outstanding, entry)) {
 870                 if ((uintptr_t)entry == pkt->otp_reqid)
 871                         break;
 872         }
 873 
 874         if (entry == NULL) {
 875                 mutex_exit(&thdl->oth_lock);
 876                 return (EINVAL);
 877         }
 878         mutex_enter(&entry->ote_lock);
 879         mutex_exit(&thdl->oth_lock);
 880         mp = entry->ote_chead;
 881         /* Protect against a rogue varpd */
 882         if (mp == NULL) {
 883                 mutex_exit(&entry->ote_lock);
 884                 return (EINVAL);
 885         }
 886         mlen = MIN(msgsize(mp), pkt->otp_size);
 887         pkt->otp_size = mlen;
 888         boff = 0;
 889         while (mlen > 0) {
 890                 size_t wlen = MIN(MBLKL(mp), mlen);
 891                 if (ddi_copyout(mp->b_rptr,
 892                     (void *)((uintptr_t)pkt->otp_buf + boff),
 893                     wlen, 0) != 0) {
 894                         mutex_exit(&entry->ote_lock);
 895                         return (EFAULT);
 896                 }
 897                 mlen -= wlen;
 898                 boff += wlen;
 899                 mp = mp->b_cont;
 900         }
 901         mutex_exit(&entry->ote_lock);
 902         return (0);
 903 }
 904 
 905 static int
 906 overlay_target_inject(overlay_target_hdl_t *thdl, void *arg)
 907 {
 908         overlay_targ_pkt_t *pkt = arg;
 909         overlay_target_entry_t *entry;
 910         overlay_dev_t *odd;
 911         mblk_t *mp;
 912 
 913         if (pkt->otp_size > ETHERMAX + VLAN_TAGSZ)
 914                 return (EINVAL);
 915 
 916         mp = allocb(pkt->otp_size, 0);
 917         if (mp == NULL)
 918                 return (ENOMEM);
 919 
 920         if (ddi_copyin(pkt->otp_buf, mp->b_rptr, pkt->otp_size, 0) != 0) {
 921                 freeb(mp);
 922                 return (EFAULT);
 923         }
 924         mp->b_wptr += pkt->otp_size;
 925 
 926         if (pkt->otp_linkid != UINT64_MAX) {
 927                 odd = overlay_hold_by_dlid(pkt->otp_linkid);
 928                 if (odd == NULL) {
 929                         freeb(mp);
 930                         return (ENOENT);
 931                 }
 932         } else {
 933                 mutex_enter(&thdl->oth_lock);
 934                 for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
 935                     entry = list_next(&thdl->oth_outstanding, entry)) {
 936                         if ((uintptr_t)entry == pkt->otp_reqid)
 937                                 break;
 938                 }
 939 
 940                 if (entry == NULL) {
 941                         mutex_exit(&thdl->oth_lock);
 942                         freeb(mp);
 943                         return (ENOENT);
 944                 }
 945                 odd = entry->ote_odd;
 946                 mutex_exit(&thdl->oth_lock);
 947         }
 948 
 949         mutex_enter(&odd->odd_lock);
 950         overlay_io_start(odd, OVERLAY_F_IN_RX);
 951         mutex_exit(&odd->odd_lock);
 952 
 953         mac_rx(odd->odd_mh, NULL, mp);
 954 
 955         mutex_enter(&odd->odd_lock);
 956         overlay_io_done(odd, OVERLAY_F_IN_RX);
 957         mutex_exit(&odd->odd_lock);
 958 
 959         return (0);
 960 }
 961 
 962 static int
 963 overlay_target_resend(overlay_target_hdl_t *thdl, void *arg)
 964 {
 965         overlay_targ_pkt_t *pkt = arg;
 966         overlay_target_entry_t *entry;
 967         overlay_dev_t *odd;
 968         mblk_t *mp;
 969 
 970         if (pkt->otp_size > ETHERMAX + VLAN_TAGSZ)
 971                 return (EINVAL);
 972 
 973         mp = allocb(pkt->otp_size, 0);
 974         if (mp == NULL)
 975                 return (ENOMEM);
 976 
 977         if (ddi_copyin(pkt->otp_buf, mp->b_rptr, pkt->otp_size, 0) != 0) {
 978                 freeb(mp);
 979                 return (EFAULT);
 980         }
 981         mp->b_wptr += pkt->otp_size;
 982 
 983         if (pkt->otp_linkid != UINT64_MAX) {
 984                 odd = overlay_hold_by_dlid(pkt->otp_linkid);
 985                 if (odd == NULL) {
 986                         freeb(mp);
 987                         return (ENOENT);
 988                 }
 989         } else {
 990                 mutex_enter(&thdl->oth_lock);
 991                 for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
 992                     entry = list_next(&thdl->oth_outstanding, entry)) {
 993                         if ((uintptr_t)entry == pkt->otp_reqid)
 994                                 break;
 995                 }
 996 
 997                 if (entry == NULL) {
 998                         mutex_exit(&thdl->oth_lock);
 999                         freeb(mp);
1000                         return (ENOENT);
1001                 }
1002                 odd = entry->ote_odd;
1003                 mutex_exit(&thdl->oth_lock);
1004         }
1005 
1006         mp = overlay_m_tx(odd, mp);
1007         freemsgchain(mp);
1008 
1009         return (0);
1010 }
1011 
1012 typedef struct overlay_targ_list_int {
1013         boolean_t       otli_count;
1014         uint32_t        otli_cur;
1015         uint32_t        otli_nents;
1016         uint32_t        otli_ents[];
1017 } overlay_targ_list_int_t;
1018 
1019 static int
1020 overlay_target_list_copyin(const void *ubuf, void **outp, size_t *bsize,
1021     int flags)
1022 {
1023         overlay_targ_list_t n;
1024         overlay_targ_list_int_t *otl;
1025 
1026         if (ddi_copyin(ubuf, &n, sizeof (overlay_targ_list_t),
1027             flags & FKIOCTL) != 0)
1028                 return (EFAULT);
1029 
1030         /*
1031          */
1032         if (n.otl_nents >= INT32_MAX / sizeof (uint32_t))
1033                 return (EINVAL);
1034         *bsize = sizeof (overlay_targ_list_int_t) +
1035             sizeof (uint32_t) * n.otl_nents;
1036         otl = kmem_zalloc(*bsize, KM_SLEEP);
1037         otl->otli_cur = 0;
1038         otl->otli_nents = n.otl_nents;
1039         if (otl->otli_nents != 0) {
1040                 otl->otli_count = B_FALSE;
1041                 if (ddi_copyin((void *)((uintptr_t)ubuf +
1042                     offsetof(overlay_targ_list_t, otl_ents)),
1043                     otl->otli_ents, n.otl_nents * sizeof (uint32_t),
1044                     flags & FKIOCTL) != 0) {
1045                         kmem_free(otl, *bsize);
1046                         return (EFAULT);
1047                 }
1048         } else {
1049                 otl->otli_count = B_TRUE;
1050         }
1051 
1052         *outp = otl;
1053         return (0);
1054 }
1055 
1056 static int
1057 overlay_target_ioctl_list_cb(overlay_dev_t *odd, void *arg)
1058 {
1059         overlay_targ_list_int_t *otl = arg;
1060 
1061         if (otl->otli_cur < otl->otli_nents)
1062                 otl->otli_ents[otl->otli_cur] = odd->odd_linkid;
1063         otl->otli_cur++;
1064         return (0);
1065 }
1066 
1067 /* ARGSUSED */
1068 static int
1069 overlay_target_ioctl_list(overlay_target_hdl_t *thdl, void *arg)
1070 {
1071         overlay_dev_iter(overlay_target_ioctl_list_cb, arg);
1072         return (0);
1073 }
1074 
1075 /* ARGSUSED */
1076 static int
1077 overlay_target_list_copyout(void *ubuf, void *buf, size_t bufsize, int flags)
1078 {
1079         overlay_targ_list_int_t *otl = buf;
1080 
1081         if (ddi_copyout(&otl->otli_cur, ubuf, sizeof (uint32_t),
1082             flags & FKIOCTL) != 0)
1083                 return (EFAULT);
1084 
1085         if (otl->otli_count == B_FALSE) {
1086                 if (ddi_copyout(otl->otli_ents,
1087                     (void *)((uintptr_t)ubuf +
1088                     offsetof(overlay_targ_list_t, otl_ents)),
1089                     sizeof (uint32_t) * otl->otli_nents,
1090                     flags & FKIOCTL) != 0)
1091                         return (EFAULT);
1092         }
1093         return (0);
1094 }
1095 
1096 /* ARGSUSED */
1097 static int
1098 overlay_target_cache_get(overlay_target_hdl_t *thdl, void *arg)
1099 {
1100         int ret = 0;
1101         overlay_dev_t *odd;
1102         overlay_target_t *ott;
1103         overlay_targ_cache_t *otc = arg;
1104 
1105         odd = overlay_hold_by_dlid(otc->otc_linkid);
1106         if (odd == NULL)
1107                 return (ENOENT);
1108 
1109         mutex_enter(&odd->odd_lock);
1110         if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
1111                 mutex_exit(&odd->odd_lock);
1112                 overlay_hold_rele(odd);
1113                 return (ENXIO);
1114         }
1115         ott = odd->odd_target;
1116         if (ott->ott_mode != OVERLAY_TARGET_POINT &&
1117             ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
1118                 mutex_exit(&odd->odd_lock);
1119                 overlay_hold_rele(odd);
1120                 return (ENOTSUP);
1121         }
1122         mutex_enter(&ott->ott_lock);
1123         mutex_exit(&odd->odd_lock);
1124 
1125         if (ott->ott_mode == OVERLAY_TARGET_POINT) {
1126                 otc->otc_entry.otce_flags = 0;
1127                 bcopy(&ott->ott_u.ott_point, &otc->otc_entry.otce_dest,
1128                     sizeof (overlay_target_point_t));
1129         } else {
1130                 overlay_target_entry_t *ote;
1131                 ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
1132                     otc->otc_entry.otce_mac);
1133                 if (ote != NULL) {
1134                         mutex_enter(&ote->ote_lock);
1135                         if ((ote->ote_flags &
1136                             OVERLAY_ENTRY_F_VALID_MASK) != 0) {
1137                                 if (ote->ote_flags & OVERLAY_ENTRY_F_DROP) {
1138                                         otc->otc_entry.otce_flags =
1139                                             OVERLAY_TARGET_CACHE_DROP;
1140                                 } else {
1141                                         otc->otc_entry.otce_flags = 0;
1142                                         bcopy(&ote->ote_dest,
1143                                             &otc->otc_entry.otce_dest,
1144                                             sizeof (overlay_target_point_t));
1145                                 }
1146                                 ret = 0;
1147                         } else {
1148                                 ret = ENOENT;
1149                         }
1150                         mutex_exit(&ote->ote_lock);
1151                 } else {
1152                         ret = ENOENT;
1153                 }
1154         }
1155 
1156         mutex_exit(&ott->ott_lock);
1157         overlay_hold_rele(odd);
1158 
1159         return (ret);
1160 }
1161 
1162 /* ARGSUSED */
1163 static int
1164 overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg)
1165 {
1166         overlay_dev_t *odd;
1167         overlay_target_t *ott;
1168         overlay_target_entry_t *ote;
1169         overlay_targ_cache_t *otc = arg;
1170         mblk_t *mp = NULL;
1171 
1172         if (otc->otc_entry.otce_flags & ~OVERLAY_TARGET_CACHE_DROP)
1173                 return (EINVAL);
1174 
1175         odd = overlay_hold_by_dlid(otc->otc_linkid);
1176         if (odd == NULL)
1177                 return (ENOENT);
1178 
1179         mutex_enter(&odd->odd_lock);
1180         if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
1181                 mutex_exit(&odd->odd_lock);
1182                 overlay_hold_rele(odd);
1183                 return (ENXIO);
1184         }
1185         ott = odd->odd_target;
1186         if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
1187                 mutex_exit(&odd->odd_lock);
1188                 overlay_hold_rele(odd);
1189                 return (ENOTSUP);
1190         }
1191         mutex_enter(&ott->ott_lock);
1192         mutex_exit(&odd->odd_lock);
1193 
1194         ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
1195             otc->otc_entry.otce_mac);
1196         if (ote == NULL) {
1197                 ote = kmem_cache_alloc(overlay_entry_cache, KM_SLEEP);
1198                 bcopy(otc->otc_entry.otce_mac, ote->ote_addr, ETHERADDRL);
1199                 ote->ote_chead = ote->ote_ctail = NULL;
1200                 ote->ote_mbsize = 0;
1201                 ote->ote_ott = ott;
1202                 ote->ote_odd = odd;
1203                 mutex_enter(&ote->ote_lock);
1204                 refhash_insert(ott->ott_u.ott_dyn.ott_dhash, ote);
1205                 avl_add(&ott->ott_u.ott_dyn.ott_tree, ote);
1206         } else {
1207                 mutex_enter(&ote->ote_lock);
1208         }
1209 
1210         if (otc->otc_entry.otce_flags & OVERLAY_TARGET_CACHE_DROP) {
1211                 ote->ote_flags |= OVERLAY_ENTRY_F_DROP;
1212         } else {
1213                 ote->ote_flags |= OVERLAY_ENTRY_F_VALID;
1214                 bcopy(&otc->otc_entry.otce_dest, &ote->ote_dest,
1215                     sizeof (overlay_target_point_t));
1216                 mp = ote->ote_chead;
1217                 ote->ote_chead = NULL;
1218                 ote->ote_ctail = NULL;
1219                 ote->ote_mbsize = 0;
1220                 ote->ote_vtime = gethrtime();
1221         }
1222 
1223         mutex_exit(&ote->ote_lock);
1224         mutex_exit(&ott->ott_lock);
1225 
1226         if (mp != NULL) {
1227                 mp = overlay_m_tx(ote->ote_odd, mp);
1228                 freemsgchain(mp);
1229         }
1230 
1231         overlay_hold_rele(odd);
1232 
1233         return (0);
1234 }
1235 
1236 /* ARGSUSED */
1237 static int
1238 overlay_target_cache_remove(overlay_target_hdl_t *thdl, void *arg)
1239 {
1240         int ret = 0;
1241         overlay_dev_t *odd;
1242         overlay_target_t *ott;
1243         overlay_target_entry_t *ote;
1244         overlay_targ_cache_t *otc = arg;
1245 
1246         odd = overlay_hold_by_dlid(otc->otc_linkid);
1247         if (odd == NULL)
1248                 return (ENOENT);
1249 
1250         mutex_enter(&odd->odd_lock);
1251         if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
1252                 mutex_exit(&odd->odd_lock);
1253                 overlay_hold_rele(odd);
1254                 return (ENXIO);
1255         }
1256         ott = odd->odd_target;
1257         if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
1258                 mutex_exit(&odd->odd_lock);
1259                 overlay_hold_rele(odd);
1260                 return (ENOTSUP);
1261         }
1262         mutex_enter(&ott->ott_lock);
1263         mutex_exit(&odd->odd_lock);
1264 
1265         ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
1266             otc->otc_entry.otce_mac);
1267         if (ote != NULL) {
1268                 mutex_enter(&ote->ote_lock);
1269                 ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK;
1270                 mutex_exit(&ote->ote_lock);
1271                 ret = 0;
1272         } else {
1273                 ret = ENOENT;
1274         }
1275 
1276         mutex_exit(&ott->ott_lock);
1277         overlay_hold_rele(odd);
1278 
1279         return (ret);
1280 }
1281 
1282 /* ARGSUSED */
1283 static int
1284 overlay_target_cache_flush(overlay_target_hdl_t *thdl, void *arg)
1285 {
1286         avl_tree_t *avl;
1287         overlay_dev_t *odd;
1288         overlay_target_t *ott;
1289         overlay_target_entry_t *ote;
1290         overlay_targ_cache_t *otc = arg;
1291 
1292         odd = overlay_hold_by_dlid(otc->otc_linkid);
1293         if (odd == NULL)
1294                 return (ENOENT);
1295 
1296         mutex_enter(&odd->odd_lock);
1297         if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
1298                 mutex_exit(&odd->odd_lock);
1299                 overlay_hold_rele(odd);
1300                 return (ENXIO);
1301         }
1302         ott = odd->odd_target;
1303         if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) {
1304                 mutex_exit(&odd->odd_lock);
1305                 overlay_hold_rele(odd);
1306                 return (ENOTSUP);
1307         }
1308         mutex_enter(&ott->ott_lock);
1309         mutex_exit(&odd->odd_lock);
1310         avl = &ott->ott_u.ott_dyn.ott_tree;
1311 
1312         for (ote = avl_first(avl); ote != NULL; ote = AVL_NEXT(avl, ote)) {
1313                 mutex_enter(&ote->ote_lock);
1314                 ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK;
1315                 mutex_exit(&ote->ote_lock);
1316         }
1317         ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
1318             otc->otc_entry.otce_mac);
1319 
1320         mutex_exit(&ott->ott_lock);
1321         overlay_hold_rele(odd);
1322 
1323         return (0);
1324 }
1325 
1326 static int
1327 overlay_target_cache_iter_copyin(const void *ubuf, void **outp, size_t *bsize,
1328     int flags)
1329 {
1330         overlay_targ_cache_iter_t base, *iter;
1331 
1332         if (ddi_copyin(ubuf, &base, sizeof (overlay_targ_cache_iter_t),
1333             flags & FKIOCTL) != 0)
1334                 return (EFAULT);
1335 
1336         if (base.otci_count > OVERLAY_TARGET_ITER_MAX)
1337                 return (E2BIG);
1338 
1339         if (base.otci_count == 0)
1340                 return (EINVAL);
1341 
1342         *bsize = sizeof (overlay_targ_cache_iter_t) +
1343             base.otci_count * sizeof (overlay_targ_cache_entry_t);
1344         iter = kmem_alloc(*bsize, KM_SLEEP);
1345         bcopy(&base, iter, sizeof (overlay_targ_cache_iter_t));
1346         *outp = iter;
1347 
1348         return (0);
1349 }
1350 
1351 typedef struct overlay_targ_cache_marker {
1352         uint8_t         otcm_mac[ETHERADDRL];
1353         uint16_t        otcm_done;
1354 } overlay_targ_cache_marker_t;
1355 
1356 /* ARGSUSED */
1357 static int
1358 overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg)
1359 {
1360         overlay_dev_t *odd;
1361         overlay_target_t *ott;
1362         overlay_target_entry_t lookup, *ent;
1363         overlay_targ_cache_marker_t *mark;
1364         avl_index_t where;
1365         avl_tree_t *avl;
1366         uint16_t written = 0;
1367 
1368         overlay_targ_cache_iter_t *iter = arg;
1369         mark = (void *)&iter->otci_marker;
1370 
1371         if (mark->otcm_done != 0) {
1372                 iter->otci_count = 0;
1373                 return (0);
1374         }
1375 
1376         odd = overlay_hold_by_dlid(iter->otci_linkid);
1377         if (odd == NULL)
1378                 return (ENOENT);
1379 
1380         mutex_enter(&odd->odd_lock);
1381         if (!(odd->odd_flags & OVERLAY_F_VARPD)) {
1382                 mutex_exit(&odd->odd_lock);
1383                 overlay_hold_rele(odd);
1384                 return (ENXIO);
1385         }
1386         ott = odd->odd_target;
1387         if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC &&
1388             ott->ott_mode != OVERLAY_TARGET_POINT) {
1389                 mutex_exit(&odd->odd_lock);
1390                 overlay_hold_rele(odd);
1391                 return (ENOTSUP);
1392         }
1393 
1394         /*
1395          * Holding this lock across the entire iteration probably isn't very
1396          * good. We should perhaps add an r/w lock for the avl tree. But we'll
1397          * wait until we now it's necessary before we do more.
1398          */
1399         mutex_enter(&ott->ott_lock);
1400         mutex_exit(&odd->odd_lock);
1401 
1402         if (ott->ott_mode == OVERLAY_TARGET_POINT) {
1403                 overlay_targ_cache_entry_t *out = &iter->otci_ents[0];
1404                 bzero(out->otce_mac, ETHERADDRL);
1405                 out->otce_flags = 0;
1406                 bcopy(&ott->ott_u.ott_point, &out->otce_dest,
1407                     sizeof (overlay_target_point_t));
1408                 written++;
1409                 mark->otcm_done = 1;
1410         }
1411 
1412         avl = &ott->ott_u.ott_dyn.ott_tree;
1413         bcopy(mark->otcm_mac, lookup.ote_addr, ETHERADDRL);
1414         ent = avl_find(avl, &lookup, &where);
1415 
1416         /*
1417          * NULL ent means that the entry does not exist, so we want to start
1418          * with the closest node in the tree. This means that we implicitly rely
1419          * on the tree's order and the first node will be the mac 00:00:00:00:00
1420          * and the last will be ff:ff:ff:ff:ff:ff.
1421          */
1422         if (ent == NULL) {
1423                 ent = avl_nearest(avl, where, AVL_AFTER);
1424                 if (ent == NULL) {
1425                         mark->otcm_done = 1;
1426                         goto done;
1427                 }
1428         }
1429 
1430         for (; ent != NULL && written < iter->otci_count;
1431             ent = AVL_NEXT(avl, ent)) {
1432                 overlay_targ_cache_entry_t *out = &iter->otci_ents[written];
1433                 mutex_enter(&ent->ote_lock);
1434                 if ((ent->ote_flags & OVERLAY_ENTRY_F_VALID_MASK) == 0) {
1435                         mutex_exit(&ent->ote_lock);
1436                         continue;
1437                 }
1438                 bcopy(ent->ote_addr, out->otce_mac, ETHERADDRL);
1439                 out->otce_flags = 0;
1440                 if (ent->ote_flags & OVERLAY_ENTRY_F_DROP)
1441                         out->otce_flags |= OVERLAY_TARGET_CACHE_DROP;
1442                 if (ent->ote_flags & OVERLAY_ENTRY_F_VALID)
1443                         bcopy(&ent->ote_dest, &out->otce_dest,
1444                             sizeof (overlay_target_point_t));
1445                 written++;
1446                 mutex_exit(&ent->ote_lock);
1447         }
1448 
1449         if (ent != NULL) {
1450                 bcopy(ent->ote_addr, mark->otcm_mac, ETHERADDRL);
1451         } else {
1452                 mark->otcm_done = 1;
1453         }
1454 
1455 done:
1456         iter->otci_count = written;
1457         mutex_exit(&ott->ott_lock);
1458         overlay_hold_rele(odd);
1459 
1460         return (0);
1461 }
1462 
1463 /* ARGSUSED */
1464 static int
1465 overlay_target_cache_iter_copyout(void *ubuf, void *buf, size_t bufsize,
1466     int flags)
1467 {
1468         size_t outsize;
1469         const overlay_targ_cache_iter_t *iter = buf;
1470 
1471         outsize = sizeof (overlay_targ_cache_iter_t) +
1472             iter->otci_count * sizeof (overlay_targ_cache_entry_t);
1473 
1474         if (ddi_copyout(buf, ubuf, outsize, flags & FKIOCTL) != 0)
1475                 return (EFAULT);
1476 
1477         return (0);
1478 }
1479 
1480 static overlay_target_ioctl_t overlay_target_ioctab[] = {
1481         { OVERLAY_TARG_INFO, B_TRUE, B_TRUE,
1482                 NULL, overlay_target_info,
1483                 NULL, sizeof (overlay_targ_info_t)      },
1484         { OVERLAY_TARG_ASSOCIATE, B_TRUE, B_FALSE,
1485                 NULL, overlay_target_associate,
1486                 NULL, sizeof (overlay_targ_associate_t) },
1487         { OVERLAY_TARG_DISASSOCIATE, B_TRUE, B_FALSE,
1488                 NULL, overlay_target_disassociate,
1489                 NULL, sizeof (overlay_targ_id_t)        },
1490         { OVERLAY_TARG_DEGRADE, B_TRUE, B_FALSE,
1491                 NULL, overlay_target_degrade,
1492                 NULL, sizeof (overlay_targ_degrade_t)   },
1493         { OVERLAY_TARG_RESTORE, B_TRUE, B_FALSE,
1494                 NULL, overlay_target_restore,
1495                 NULL, sizeof (overlay_targ_id_t)        },
1496         { OVERLAY_TARG_LOOKUP, B_FALSE, B_TRUE,
1497                 NULL, overlay_target_lookup_request,
1498                 NULL, sizeof (overlay_targ_lookup_t)    },
1499         { OVERLAY_TARG_RESPOND, B_TRUE, B_FALSE,
1500                 NULL, overlay_target_lookup_respond,
1501                 NULL, sizeof (overlay_targ_resp_t)      },
1502         { OVERLAY_TARG_DROP, B_TRUE, B_FALSE,
1503                 NULL, overlay_target_lookup_drop,
1504                 NULL, sizeof (overlay_targ_resp_t)      },
1505         { OVERLAY_TARG_PKT, B_TRUE, B_TRUE,
1506                 overlay_target_pkt_copyin,
1507                 overlay_target_packet,
1508                 overlay_target_pkt_copyout,
1509                 sizeof (overlay_targ_pkt_t)             },
1510         { OVERLAY_TARG_INJECT, B_TRUE, B_FALSE,
1511                 overlay_target_pkt_copyin,
1512                 overlay_target_inject,
1513                 NULL, sizeof (overlay_targ_pkt_t)       },
1514         { OVERLAY_TARG_RESEND, B_TRUE, B_FALSE,
1515                 overlay_target_pkt_copyin,
1516                 overlay_target_resend,
1517                 NULL, sizeof (overlay_targ_pkt_t)       },
1518         { OVERLAY_TARG_LIST, B_FALSE, B_TRUE,
1519                 overlay_target_list_copyin,
1520                 overlay_target_ioctl_list,
1521                 overlay_target_list_copyout,
1522                 sizeof (overlay_targ_list_t)            },
1523         { OVERLAY_TARG_CACHE_GET, B_FALSE, B_TRUE,
1524                 NULL, overlay_target_cache_get,
1525                 NULL, sizeof (overlay_targ_cache_t)     },
1526         { OVERLAY_TARG_CACHE_SET, B_TRUE, B_TRUE,
1527                 NULL, overlay_target_cache_set,
1528                 NULL, sizeof (overlay_targ_cache_t)     },
1529         { OVERLAY_TARG_CACHE_REMOVE, B_TRUE, B_TRUE,
1530                 NULL, overlay_target_cache_remove,
1531                 NULL, sizeof (overlay_targ_cache_t)     },
1532         { OVERLAY_TARG_CACHE_FLUSH, B_TRUE, B_TRUE,
1533                 NULL, overlay_target_cache_flush,
1534                 NULL, sizeof (overlay_targ_cache_t)     },
1535         { OVERLAY_TARG_CACHE_ITER, B_FALSE, B_TRUE,
1536                 overlay_target_cache_iter_copyin,
1537                 overlay_target_cache_iter,
1538                 overlay_target_cache_iter_copyout,
1539                 sizeof (overlay_targ_cache_iter_t)              },
1540         { 0 }
1541 };
1542 
1543 int
1544 overlay_target_open(dev_t *devp, int flags, int otype, cred_t *credp)
1545 {
1546         minor_t mid;
1547         overlay_target_hdl_t *thdl;
1548 
1549         if (secpolicy_dl_config(credp) != 0)
1550                 return (EPERM);
1551 
1552         if (getminor(*devp) != 0)
1553                 return (ENXIO);
1554 
1555         if (otype & OTYP_BLK)
1556                 return (EINVAL);
1557 
1558         if (flags & ~(FREAD | FWRITE | FEXCL))
1559                 return (EINVAL);
1560 
1561         if ((flags & FWRITE) &&
1562             !(flags & FEXCL))
1563                 return (EINVAL);
1564 
1565         if (!(flags & FREAD) && !(flags & FWRITE))
1566                 return (EINVAL);
1567 
1568         if (crgetzoneid(credp) != GLOBAL_ZONEID)
1569                 return (EPERM);
1570 
1571         mid = id_alloc(overlay_thdl_idspace);
1572         if (ddi_soft_state_zalloc(overlay_thdl_state, mid) != 0) {
1573                 id_free(overlay_thdl_idspace, mid);
1574                 return (ENXIO);
1575         }
1576 
1577         thdl = ddi_get_soft_state(overlay_thdl_state, mid);
1578         VERIFY(thdl != NULL);
1579         thdl->oth_minor = mid;
1580         thdl->oth_zoneid = crgetzoneid(credp);
1581         thdl->oth_oflags = flags;
1582         mutex_init(&thdl->oth_lock, NULL, MUTEX_DRIVER, NULL);
1583         list_create(&thdl->oth_outstanding, sizeof (overlay_target_entry_t),
1584             offsetof(overlay_target_entry_t, ote_qlink));
1585         *devp = makedevice(getmajor(*devp), mid);
1586 
1587         mutex_enter(&overlay_target_lock);
1588         if ((flags & FEXCL) && overlay_target_excl == B_TRUE) {
1589                 mutex_exit(&overlay_target_lock);
1590                 list_destroy(&thdl->oth_outstanding);
1591                 mutex_destroy(&thdl->oth_lock);
1592                 ddi_soft_state_free(overlay_thdl_state, mid);
1593                 id_free(overlay_thdl_idspace, mid);
1594                 return (EEXIST);
1595         } else if ((flags & FEXCL) != 0) {
1596                 VERIFY(overlay_target_excl == B_FALSE);
1597                 overlay_target_excl = B_TRUE;
1598         }
1599         list_insert_tail(&overlay_thdl_list, thdl);
1600         mutex_exit(&overlay_target_lock);
1601 
1602         return (0);
1603 }
1604 
1605 /* ARGSUSED */
1606 int
1607 overlay_target_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
1608     int *rvalp)
1609 {
1610         overlay_target_ioctl_t *ioc;
1611         overlay_target_hdl_t *thdl;
1612 
1613         if (secpolicy_dl_config(credp) != 0)
1614                 return (EPERM);
1615 
1616         if ((thdl = ddi_get_soft_state(overlay_thdl_state,
1617             getminor(dev))) == NULL)
1618                 return (ENXIO);
1619 
1620         for (ioc = &overlay_target_ioctab[0]; ioc->oti_cmd != 0; ioc++) {
1621                 int ret;
1622                 caddr_t buf;
1623                 size_t bufsize;
1624 
1625                 if (ioc->oti_cmd != cmd)
1626                         continue;
1627 
1628                 if (ioc->oti_write == B_TRUE && !(mode & FWRITE))
1629                         return (EBADF);
1630 
1631                 if (ioc->oti_copyin == NULL) {
1632                         bufsize = ioc->oti_size;
1633                         buf = kmem_alloc(bufsize, KM_SLEEP);
1634                         if (ddi_copyin((void *)(uintptr_t)arg, buf, bufsize,
1635                             mode & FKIOCTL) != 0) {
1636                                 kmem_free(buf, bufsize);
1637                                 return (EFAULT);
1638                         }
1639                 } else {
1640                         if ((ret = ioc->oti_copyin((void *)(uintptr_t)arg,
1641                             (void **)&buf, &bufsize, mode)) != 0)
1642                                 return (ret);
1643                 }
1644 
1645                 ret = ioc->oti_func(thdl, buf);
1646                 if (ret == 0 && ioc->oti_size != 0 &&
1647                     ioc->oti_ncopyout == B_TRUE) {
1648                         if (ioc->oti_copyout == NULL) {
1649                                 if (ddi_copyout(buf, (void *)(uintptr_t)arg,
1650                                     bufsize, mode & FKIOCTL) != 0)
1651                                         ret = EFAULT;
1652                         } else {
1653                                 ret = ioc->oti_copyout((void *)(uintptr_t)arg,
1654                                     buf, bufsize, mode);
1655                         }
1656                 }
1657 
1658                 kmem_free(buf, bufsize);
1659                 return (ret);
1660         }
1661 
1662         return (ENOTTY);
1663 }
1664 
1665 /* ARGSUSED */
1666 int
1667 overlay_target_close(dev_t dev, int flags, int otype, cred_t *credp)
1668 {
1669         overlay_target_hdl_t *thdl;
1670         overlay_target_entry_t *entry;
1671         minor_t mid = getminor(dev);
1672 
1673         if ((thdl = ddi_get_soft_state(overlay_thdl_state, mid)) == NULL)
1674                 return (ENXIO);
1675 
1676         mutex_enter(&overlay_target_lock);
1677         list_remove(&overlay_thdl_list, thdl);
1678         mutex_enter(&thdl->oth_lock);
1679         while ((entry = list_remove_head(&thdl->oth_outstanding)) != NULL)
1680                 list_insert_tail(&overlay_target_list, entry);
1681         cv_signal(&overlay_target_condvar);
1682         mutex_exit(&thdl->oth_lock);
1683         if ((thdl->oth_oflags & FEXCL) != 0) {
1684                 VERIFY(overlay_target_excl == B_TRUE);
1685                 overlay_target_excl = B_FALSE;
1686         }
1687         mutex_exit(&overlay_target_lock);
1688 
1689         list_destroy(&thdl->oth_outstanding);
1690         mutex_destroy(&thdl->oth_lock);
1691         mid = thdl->oth_minor;
1692         ddi_soft_state_free(overlay_thdl_state, mid);
1693         id_free(overlay_thdl_idspace, mid);
1694 
1695         return (0);
1696 }