Print this page
Overlay fabric router
@@ -78,11 +78,11 @@
overlay_target_copyout_f oti_copyout; /* copyin func */
size_t oti_size; /* size of user level structure */
} overlay_target_ioctl_t;
static kmem_cache_t *overlay_target_cache;
-static kmem_cache_t *overlay_entry_cache;
+kmem_cache_t *overlay_entry_cache;
static id_space_t *overlay_thdl_idspace;
static void *overlay_thdl_state;
/*
* When we support overlay devices in the NGZ, then all of these need to become
@@ -96,11 +96,11 @@
static boolean_t overlay_target_excl;
/*
* Outstanding data per hash table entry.
*/
-static int overlay_ent_size = 128 * 1024;
+int overlay_ent_size = 128 * 1024;
/* ARGSUSED */
static int
overlay_target_cache_constructor(void *buf, void *arg, int kmflgs)
{
@@ -139,12 +139,10 @@
overlay_target_entry_t *ote = buf;
mutex_destroy(&ote->ote_lock);
}
-/* TODO: we will need to modify these to hash/cmp DCID + MAC */
-
static uint64_t
overlay_mac_hash(const void *v)
{
uint32_t crc;
CRC32(crc, v, ETHERADDRL, -1U, crc32_table);
@@ -155,20 +153,66 @@
overlay_mac_cmp(const void *a, const void *b)
{
return (bcmp(a, b, ETHERADDRL));
}
+/*
+ * Hash callback for VL3 target entries.  The argument is treated as an
+ * overlay_target_entry_t; the hash is a CRC32 over the entry's ote_ip that
+ * is then continued over its ote_fab pointer value, i.e. the same two fields
+ * that overlay_vl3_cmp() compares.
+ */
+static uint64_t
+overlay_vl3_hash(const void *v)
+{
+	const overlay_target_entry_t *entry = v;
+	uint32_t sum;
+
+	/* Seed with -1U for the address, then chain in the fabric pointer. */
+	CRC32(sum, &entry->ote_ip, sizeof (entry->ote_ip), -1U, crc32_table);
+	CRC32(sum, &entry->ote_fab, sizeof (entry->ote_fab), sum,
+	    crc32_table);
+
+	return ((uint64_t)sum);
+}
+
+/*
+ * Equality callback for VL3 target entries.  Returns 0 when both entries
+ * carry the same ote_fab pointer and byte-identical ote_ip values, and 1
+ * otherwise.
+ */
+static int
+overlay_vl3_cmp(const void *a, const void *b)
+{
+	const overlay_target_entry_t *lhs = a;
+	const overlay_target_entry_t *rhs = b;
+
+	/* The cheap pointer comparison runs first; bcmp() only on a match. */
+	if (lhs->ote_fab == rhs->ote_fab &&
+	    bcmp(&lhs->ote_ip, &rhs->ote_ip, sizeof (struct in6_addr)) == 0)
+		return (0);
+
+	return (1);
+}
+
+/*
+ * AVL comparator for VL3 target entries.  Entries are ordered first by their
+ * ote_fab pointer value and then by the raw bytes of ote_ip.
+ *
+ * The AVL code requires a comparator to return exactly -1, 0 or 1, whereas
+ * memcmp() only promises a negative, zero or positive value.  The memcmp()
+ * result is therefore normalized rather than returned directly.
+ */
+static int
+overlay_vl3_avl(const void *a, const void *b)
+{
+	const overlay_target_entry_t *l = a;
+	const overlay_target_entry_t *r = b;
+	int ret;
+
+	if (l->ote_fab < r->ote_fab)
+		return (-1);
+	if (l->ote_fab > r->ote_fab)
+		return (1);
+
+	ret = memcmp(&l->ote_ip, &r->ote_ip, sizeof (struct in6_addr));
+	if (ret < 0)
+		return (-1);
+	if (ret > 0)
+		return (1);
+	return (0);
+}
+/*
+ * Destructor callback that intentionally does nothing.  It is the dtor
+ * handed to refhash_create() for the target hashes set up in
+ * overlay_target_associate().
+ * NOTE(review): presumably entries are now freed through their own reference
+ * count (OVERLAY_TARG_ENTRY_REFRELE) rather than by the refhash -- confirm.
+ */
/* ARGSUSED */
-static void
+void
+overlay_target_entry_null_dtor(void *arg)
+{
+}
+
+/* ARGSUSED */
+void
overlay_target_entry_dtor(void *arg)
{
overlay_target_entry_t *ote = arg;
+ ASSERT3U(ote->ote_refcnt, ==, 0);
+
ote->ote_flags = 0;
bzero(ote->ote_addr, ETHERADDRL);
+ bzero(&ote->ote_ip, sizeof (ote->ote_ip));
ote->ote_ott = NULL;
ote->ote_odd = NULL;
+ ote->ote_fab = NULL;
freemsgchain(ote->ote_chead);
ote->ote_chead = ote->ote_ctail = NULL;
ote->ote_mbsize = 0;
ote->ote_vtime = 0;
kmem_cache_free(overlay_entry_cache, ote);
@@ -234,34 +278,52 @@
if (odd->odd_target == NULL)
return;
if (odd->odd_target->ott_mode == OVERLAY_TARGET_DYNAMIC) {
refhash_t *rp = odd->odd_target->ott_u.ott_dyn.ott_dhash;
+ refhash_t *r3p = odd->odd_target->ott_u.ott_dyn.ott_l3dhash;
avl_tree_t *ap = &odd->odd_target->ott_u.ott_dyn.ott_tree;
+ avl_tree_t *a3p = &odd->odd_target->ott_u.ott_dyn.ott_l3tree;
overlay_target_entry_t *ote;
- /* TODO: remove from L3 trees */
-
/*
* Our AVL tree and hashtable contain the same elements,
* therefore we should just remove it from the tree, but then
* delete the entries when we remove them from the hash table
* (which happens through the refhash dtor).
*/
- while ((ote = avl_first(ap)) != NULL)
+ while ((ote = avl_first(ap)) != NULL) {
avl_remove(ap, ote);
-
+ OVERLAY_TARG_ENTRY_REFRELE(ote);
+ }
avl_destroy(ap);
+
+ while ((ote = avl_first(a3p)) != NULL) {
+ avl_remove(a3p, ote);
+ OVERLAY_TARG_ENTRY_REFRELE(ote);
+ }
+ avl_destroy(a3p);
+
for (ote = refhash_first(rp); ote != NULL;
ote = refhash_next(rp, ote)) {
refhash_remove(rp, ote);
+ OVERLAY_TARG_ENTRY_REFRELE(ote);
}
refhash_destroy(rp);
+
+ for (ote = refhash_first(r3p); ote != NULL;
+ ote = refhash_next(r3p, ote)) {
+ refhash_remove(r3p, ote);
+ OVERLAY_TARG_ENTRY_REFRELE(ote);
}
+ refhash_destroy(r3p);
+ }
ASSERT(odd->odd_target->ott_ocount == 0);
+ bzero(&odd->odd_target->ott_u, sizeof (odd->odd_target->ott_u));
kmem_cache_free(overlay_target_cache, odd->odd_target);
+ odd->odd_target = NULL;
}
int
overlay_target_busy()
{
@@ -272,11 +334,11 @@
mutex_exit(&overlay_target_lock);
return (ret);
}
-static void
+void
overlay_target_queue(overlay_target_entry_t *entry)
{
mutex_enter(&overlay_target_lock);
mutex_enter(&entry->ote_ott->ott_lock);
if (entry->ote_ott->ott_flags & OVERLAY_T_TEARDOWN) {
@@ -304,28 +366,26 @@
}
/*
* This function assumes that the destination mode is OVERLAY_PLUGIN_D_IP |
* OVERLAY_PLUGIN_D_PORT. As we don't have an implementation of anything else at
- * this time, say for NVGRE, we drop all packets that mcuh this.
- *
- * XXX: It might be better to replace the 'sock' argument with
- * overlay_target_entry_t** and set it with the found entry in the case
- * of OVERLAY_TARGET_OK.
+ * this time, say for NVGRE, we drop all packets that match this.
*/
int
overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock,
- socklen_t *slenp)
+ socklen_t *slenp, uint64_t *vidp)
{
int ret;
struct sockaddr_in6 *v6;
overlay_target_t *ott;
mac_header_info_t mhi;
overlay_target_entry_t *entry;
ASSERT(odd->odd_target != NULL);
+ *vidp = odd->odd_vid;
+
/*
* At this point, the overlay device is in a mux which means that it's
* been activated. At this point, parts of the target, such as the mode
* and the destination are now read-only and we don't have to worry
* about synchronization for them.
@@ -350,26 +410,17 @@
}
ASSERT(ott->ott_mode == OVERLAY_TARGET_DYNAMIC);
/*
- * Note we only want the MAC address here, therefore we won't bother
- * using mac_vlan_header_info(). If any caller needs the vlan info at
- * this point, this should change to a call to mac_vlan_header_info().
+ * VL2 -> UL3 lookups only need the destination VL2 mac address,
+ * however, if we end up having to route the packet, we will need
+ * the source vlan as part of the destination selection.
*/
- if (mac_header_info(odd->odd_mh, mp, &mhi) != 0)
+ if (mac_vlan_header_info(odd->odd_mh, mp, &mhi) != 0)
return (OVERLAY_TARGET_DROP);
- /*
- * TODO: compare mhi.mhi_daddr with odd->macaddr.
- * If match,
- * get VL3 dest from mp
- * lookup target using VL3 dest
- * otherwise,
- * lookup target using VL2 dest (existing refhash_lookup() call
- * below)
- */
mutex_enter(&ott->ott_lock);
entry = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
mhi.mhi_daddr);
if (entry == NULL) {
entry = kmem_cache_alloc(overlay_entry_cache,
@@ -376,33 +427,35 @@
KM_NOSLEEP | KM_NORMALPRI);
if (entry == NULL) {
mutex_exit(&ott->ott_lock);
return (OVERLAY_TARGET_DROP);
}
- /*
- * TODO: set entry->ote_dcid, if VL3 lookup, copy dst addr
- * into entry->ote_ip. Probably zero out the address we're
- * not lookup up (VL2 or VL3) as well.
- */
bcopy(mhi.mhi_daddr, entry->ote_addr, ETHERADDRL);
entry->ote_chead = entry->ote_ctail = mp;
entry->ote_mbsize = msgsize(mp);
entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
entry->ote_ott = ott;
entry->ote_odd = odd;
+
+ OVERLAY_TARG_ENTRY_REFHOLD(entry);
refhash_insert(ott->ott_u.ott_dyn.ott_dhash, entry);
+
+ OVERLAY_TARG_ENTRY_REFHOLD(entry);
avl_add(&ott->ott_u.ott_dyn.ott_tree, entry);
+
mutex_exit(&ott->ott_lock);
overlay_target_queue(entry);
return (OVERLAY_TARGET_ASYNC);
}
- refhash_hold(ott->ott_u.ott_dyn.ott_dhash, entry);
+ OVERLAY_TARG_ENTRY_REFHOLD(entry);
mutex_exit(&ott->ott_lock);
mutex_enter(&entry->ote_lock);
if (entry->ote_flags & OVERLAY_ENTRY_F_DROP) {
ret = OVERLAY_TARGET_DROP;
+ } else if (entry->ote_flags & OVERLAY_ENTRY_F_ROUTER) {
+ ret = overlay_route_lookup(odd, mp, &mhi, sock, slenp, vidp);
} else if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
bcopy(&entry->ote_dest.otp_ip, &v6->sin6_addr,
sizeof (struct in6_addr));
v6->sin6_port = htons(entry->ote_dest.otp_port);
*slenp = sizeof (struct sockaddr_in6);
@@ -433,11 +486,11 @@
}
}
mutex_exit(&entry->ote_lock);
mutex_enter(&ott->ott_lock);
- refhash_rele(ott->ott_u.ott_dyn.ott_dhash, entry);
+ OVERLAY_TARG_ENTRY_REFRELE(entry);
mutex_exit(&ott->ott_lock);
return (ret);
}
@@ -471,10 +524,11 @@
overlay_target_associate(overlay_target_hdl_t *thdl, void *arg)
{
overlay_dev_t *odd;
overlay_target_t *ott;
overlay_targ_associate_t *ota = arg;
+ overlay_router_t *ort;
odd = overlay_hold_by_dlid(ota->ota_linkid);
if (odd == NULL)
return (ENOENT);
@@ -523,16 +577,34 @@
bcopy(&ota->ota_point, &ott->ott_u.ott_point,
sizeof (overlay_target_point_t));
} else {
ott->ott_u.ott_dyn.ott_dhash = refhash_create(OVERLAY_HSIZE,
overlay_mac_hash, overlay_mac_cmp,
- overlay_target_entry_dtor, sizeof (overlay_target_entry_t),
+ overlay_target_entry_null_dtor,
+ sizeof (overlay_target_entry_t),
offsetof(overlay_target_entry_t, ote_reflink),
offsetof(overlay_target_entry_t, ote_addr), KM_SLEEP);
+ ott->ott_u.ott_dyn.ott_l3dhash = refhash_create(OVERLAY_HSIZE,
+ overlay_vl3_hash, overlay_vl3_cmp,
+ overlay_target_entry_null_dtor,
+ sizeof (overlay_target_entry_t),
+ offsetof(overlay_target_entry_t, ote_l3_reflink), 0,
+ KM_SLEEP);
avl_create(&ott->ott_u.ott_dyn.ott_tree, overlay_mac_avl,
sizeof (overlay_target_entry_t),
offsetof(overlay_target_entry_t, ote_avllink));
+ avl_create(&ott->ott_u.ott_dyn.ott_l3tree, overlay_vl3_avl,
+ sizeof (overlay_target_entry_t),
+ offsetof(overlay_target_entry_t, ote_l3_avllink));
+
+ ort = kmem_zalloc(sizeof (*ort), KM_SLEEP);
+ mutex_init(&ort->otr_lock, NULL, MUTEX_DRIVER, NULL);
+ list_create(&ort->otr_tables, sizeof (overlay_route_table_t),
+ offsetof(overlay_route_table_t, ort_link));
+ avl_create(&ort->otr_tree, overlay_fabric_avl,
+ sizeof (overlay_fabric_entry_t),
+ offsetof(overlay_fabric_entry_t, ofe_avllink));
}
mutex_enter(&odd->odd_lock);
if (odd->odd_flags & OVERLAY_F_VARPD) {
mutex_exit(&odd->odd_lock);
kmem_cache_free(overlay_target_cache, ott);
@@ -544,11 +616,10 @@
odd->odd_target = ott;
mutex_exit(&odd->odd_lock);
overlay_hold_rele(odd);
-
return (0);
}
/* ARGSUSED */
@@ -606,13 +677,13 @@
static int
overlay_target_lookup_request(overlay_target_hdl_t *thdl, void *arg)
{
overlay_targ_lookup_t *otl = arg;
overlay_target_entry_t *entry;
+ void *src, *dst;
clock_t ret, timeout;
mac_header_info_t mhi;
-
timeout = ddi_get_lbolt() + drv_usectohz(MICROSEC);
again:
mutex_enter(&overlay_target_lock);
while (list_is_empty(&overlay_target_list)) {
ret = cv_timedwait(&overlay_target_condvar,
@@ -623,23 +694,38 @@
}
}
entry = list_remove_head(&overlay_target_list);
mutex_exit(&overlay_target_lock);
mutex_enter(&entry->ote_lock);
- if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
+ if (entry->ote_flags &
+ (OVERLAY_ENTRY_F_PENDING | OVERLAY_ENTRY_F_VL3_PENDING)) {
ASSERT(entry->ote_chead == NULL);
mutex_exit(&entry->ote_lock);
goto again;
}
ASSERT(entry->ote_chead != NULL);
+
+ otl->otl_l3req = (entry->ote_flags & OVERLAY_ENTRY_F_VL3_PENDING) ?
+ B_TRUE : B_FALSE;
+
+ if (otl->otl_l3req) {
+ src = &otl->otl_addru.otlu_l3.otl3_srcip;
+ dst = &otl->otl_addru.otlu_l3.otl3_dstip;
+ } else {
+ src = &otl->otl_addru.otlu_l2.otl2_srcaddr;
+ dst = &otl->otl_addru.otlu_l2.otl2_dstaddr;
+ }
+
/*
- * If we have a bogon that doesn't have a valid mac header, drop it and
- * try again.
+ * If we have a bogon that doesn't have a valid mac header, or an
+ * invalid IP header for IP requests, drop it and try again.
*/
- if (mac_vlan_header_info(entry->ote_odd->odd_mh, entry->ote_chead,
- &mhi) != 0) {
+ if ((mac_vlan_header_info(entry->ote_odd->odd_mh, entry->ote_chead,
+ &mhi) != 0) ||
+ (otl->otl_l3req && overlay_mblk_vl3ip(entry->ote_chead, src,
+ dst) != 0)) {
boolean_t queue = B_FALSE;
mblk_t *mp = entry->ote_chead;
entry->ote_chead = mp->b_next;
mp->b_next = NULL;
if (entry->ote_ctail == mp)
@@ -652,29 +738,29 @@
overlay_target_queue(entry);
freemsg(mp);
goto again;
}
- /*
- * TODO: If VL3 request,
- * set otl->otl_l3req
- * Fill in otl_{src,dst}ip
- * Else
- * clear otl->otl_l3req
- */
otl->otl_dlid = entry->ote_odd->odd_linkid;
otl->otl_reqid = (uintptr_t)entry;
otl->otl_varpdid = entry->ote_ott->ott_id;
otl->otl_vnetid = entry->ote_odd->odd_vid;
otl->otl_hdrsize = mhi.mhi_hdrsize;
otl->otl_pktsize = msgsize(entry->ote_chead) - otl->otl_hdrsize;
- bcopy(mhi.mhi_daddr, otl->otl_addru.otlu_l2.otl2_dstaddr, ETHERADDRL);
- bcopy(mhi.mhi_saddr, otl->otl_addru.otlu_l2.otl2_srcaddr, ETHERADDRL);
otl->otl_addru.otlu_l2.otl2_dsttype = mhi.mhi_dsttype;
otl->otl_addru.otlu_l2.otl2_sap = mhi.mhi_bindsap;
otl->otl_vlan = VLAN_ID(mhi.mhi_tci);
+
+ /*
+ * The overlay_mblk_vl3ip() call above fills in dst & src for
+ * VL3->UL3 requests, so only need to care about VL2->UL3 here.
+ */
+ if (!otl->otl_l3req) {
+ bcopy(mhi.mhi_daddr, dst, ETHERADDRL);
+ bcopy(mhi.mhi_saddr, src, ETHERADDRL);
+ }
mutex_exit(&entry->ote_lock);
mutex_enter(&thdl->oth_lock);
list_insert_tail(&thdl->oth_outstanding, entry);
mutex_exit(&thdl->oth_lock);
@@ -686,11 +772,21 @@
overlay_target_lookup_respond(overlay_target_hdl_t *thdl, void *arg)
{
const overlay_targ_resp_t *otr = arg;
overlay_target_entry_t *entry;
mblk_t *mp;
+ boolean_t is_router = B_FALSE;
+ /*
+ * If we ever support a protocol that uses MAC addresses for the UL
+ * destination addr, we probably should expand this to check that
+ * all of otr is zero.
+ */
+ if (IN6_IS_ADDR_UNSPECIFIED(&otr->otr_answer.otp_ip) &&
+ otr->otr_answer.otp_port == 0)
+ is_router = B_TRUE;
+
mutex_enter(&thdl->oth_lock);
for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
entry = list_next(&thdl->oth_outstanding, entry)) {
if ((uintptr_t)entry == otr->otr_reqid)
break;
@@ -706,10 +802,12 @@
mutex_enter(&entry->ote_lock);
bcopy(&otr->otr_answer, &entry->ote_dest,
sizeof (overlay_target_point_t));
entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
entry->ote_flags |= OVERLAY_ENTRY_F_VALID;
+ if (is_router)
+ entry->ote_flags |= OVERLAY_ENTRY_F_ROUTER;
mp = entry->ote_chead;
entry->ote_chead = NULL;
entry->ote_ctail = NULL;
entry->ote_mbsize = 0;
entry->ote_vtime = gethrtime();
@@ -1128,33 +1226,36 @@
sizeof (overlay_target_point_t));
} else {
overlay_target_entry_t *ote;
ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
otc->otc_entry.otce_mac);
- if (ote != NULL) {
+ if (ote == NULL) {
+ ret = ENOENT;
+ goto done;
+ }
+
mutex_enter(&ote->ote_lock);
- if ((ote->ote_flags &
- OVERLAY_ENTRY_F_VALID_MASK) != 0) {
+ if ((ote->ote_flags & OVERLAY_ENTRY_F_VALID_MASK) != 0) {
if (ote->ote_flags & OVERLAY_ENTRY_F_DROP) {
otc->otc_entry.otce_flags =
OVERLAY_TARGET_CACHE_DROP;
+ } else if (ote->ote_flags & OVERLAY_ENTRY_F_ROUTER) {
+ otc->otc_entry.otce_flags =
+ OVERLAY_TARGET_CACHE_ROUTER;
} else {
otc->otc_entry.otce_flags = 0;
- bcopy(&ote->ote_dest,
- &otc->otc_entry.otce_dest,
+ bcopy(&ote->ote_dest, &otc->otc_entry.otce_dest,
sizeof (overlay_target_point_t));
}
ret = 0;
} else {
ret = ENOENT;
}
mutex_exit(&ote->ote_lock);
- } else {
- ret = ENOENT;
}
- }
+done:
mutex_exit(&ott->ott_lock);
overlay_hold_rele(odd);
return (ret);
}
@@ -1167,13 +1268,18 @@
overlay_target_t *ott;
overlay_target_entry_t *ote;
overlay_targ_cache_t *otc = arg;
mblk_t *mp = NULL;
- if (otc->otc_entry.otce_flags & ~OVERLAY_TARGET_CACHE_DROP)
+ if (otc->otc_entry.otce_flags &
+ ~(OVERLAY_TARGET_CACHE_DROP | OVERLAY_TARGET_CACHE_ROUTER))
return (EINVAL);
+ if (otc->otc_entry.otce_flags ==
+ (OVERLAY_TARGET_CACHE_DROP | OVERLAY_TARGET_CACHE_ROUTER))
+ return (EINVAL);
+
odd = overlay_hold_by_dlid(otc->otc_linkid);
if (odd == NULL)
return (ENOENT);
mutex_enter(&odd->odd_lock);
@@ -1209,10 +1315,12 @@
if (otc->otc_entry.otce_flags & OVERLAY_TARGET_CACHE_DROP) {
ote->ote_flags |= OVERLAY_ENTRY_F_DROP;
} else {
ote->ote_flags |= OVERLAY_ENTRY_F_VALID;
+ if (otc->otc_entry.otce_flags & OVERLAY_TARGET_CACHE_ROUTER)
+ ote->ote_flags |= OVERLAY_ENTRY_F_ROUTER;
bcopy(&otc->otc_entry.otce_dest, &ote->ote_dest,
sizeof (overlay_target_point_t));
mp = ote->ote_chead;
ote->ote_chead = NULL;
ote->ote_ctail = NULL;