Print this page
Overlay fabric router

@@ -78,11 +78,11 @@
         overlay_target_copyout_f oti_copyout;   /* copyout func */
         size_t          oti_size;       /* size of user level structure */
 } overlay_target_ioctl_t;
 
 static kmem_cache_t *overlay_target_cache;
-static kmem_cache_t *overlay_entry_cache;
+kmem_cache_t *overlay_entry_cache;
 static id_space_t *overlay_thdl_idspace;
 static void *overlay_thdl_state;
 
 /*
  * When we support overlay devices in the NGZ, then all of these need to become

@@ -96,11 +96,11 @@
 static boolean_t overlay_target_excl;
 
 /*
  * Outstanding data per hash table entry.
  */
-static int overlay_ent_size = 128 * 1024;
+int overlay_ent_size = 128 * 1024;
 
 /* ARGSUSED */
 static int
 overlay_target_cache_constructor(void *buf, void *arg, int kmflgs)
 {

@@ -139,12 +139,10 @@
         overlay_target_entry_t *ote = buf;
 
         mutex_destroy(&ote->ote_lock);
 }
 
-/* TODO: we will need to modify these to hash/cmp DCID + MAC */
-
 static uint64_t
 overlay_mac_hash(const void *v)
 {
         uint32_t crc;
         CRC32(crc, v, ETHERADDRL, -1U, crc32_table);

@@ -155,20 +153,66 @@
 overlay_mac_cmp(const void *a, const void *b)
 {
         return (bcmp(a, b, ETHERADDRL));
 }
 
+/*
+ * refhash hash callback for VL3 entries.  The argument is treated as an
+ * overlay_target_entry_t; the CRC32 is computed over its ote_ip field and
+ * then continued over its ote_fab field.
+ */
+static uint64_t
+overlay_vl3_hash(const void *v)
+{
+        const overlay_target_entry_t *entry = v;
+        uint32_t sum;
+
+        /* Start from the usual all-ones seed with the address bytes ... */
+        CRC32(sum, &entry->ote_ip, sizeof (entry->ote_ip), -1U, crc32_table);
+        /* ... then chain the running value through the ote_fab bytes. */
+        CRC32(sum, &entry->ote_fab, sizeof (entry->ote_fab), sum,
+            crc32_table);
+
+        return (sum);
+}
+
+/*
+ * refhash comparison callback for VL3 entries.  Returns 0 when both entries
+ * have the same ote_fab and identical ote_ip bytes, 1 otherwise.
+ */
+static int
+overlay_vl3_cmp(const void *a, const void *b)
+{
+        const overlay_target_entry_t *lhs = a;
+        const overlay_target_entry_t *rhs = b;
+
+        /* The address bytes are only compared once ote_fab matches. */
+        if (lhs->ote_fab == rhs->ote_fab &&
+            bcmp(&lhs->ote_ip, &rhs->ote_ip, sizeof (struct in6_addr)) == 0)
+                return (0);
+
+        return (1);
+}
+
+/*
+ * AVL ordering callback for VL3 entries: order first by ote_fab, then by the
+ * bytes of ote_ip.
+ *
+ * AVL comparators must return exactly -1, 0 or 1.  memcmp() only guarantees
+ * the sign of its result, so the result is normalized here instead of being
+ * returned directly.
+ */
+static int
+overlay_vl3_avl(const void *a, const void *b)
+{
+        const overlay_target_entry_t *l = a;
+        const overlay_target_entry_t *r = b;
+        int cmp;
+
+        if (l->ote_fab < r->ote_fab)
+                return (-1);
+        if (l->ote_fab > r->ote_fab)
+                return (1);
+
+        cmp = memcmp(&l->ote_ip, &r->ote_ip, sizeof (struct in6_addr));
+        if (cmp < 0)
+                return (-1);
+        if (cmp > 0)
+                return (1);
+        return (0);
+}
+
 /* ARGSUSED */
-static void
+void
+overlay_target_entry_null_dtor(void *arg)
+{
+}
+
+/* ARGSUSED */
+void
 overlay_target_entry_dtor(void *arg)
 {
         overlay_target_entry_t *ote = arg;
 
+        ASSERT3U(ote->ote_refcnt, ==, 0);
+
         ote->ote_flags = 0;
         bzero(ote->ote_addr, ETHERADDRL);
+        bzero(&ote->ote_ip, sizeof (ote->ote_ip));
         ote->ote_ott = NULL;
         ote->ote_odd = NULL;
+        ote->ote_fab = NULL;
         freemsgchain(ote->ote_chead);
         ote->ote_chead = ote->ote_ctail = NULL;
         ote->ote_mbsize = 0;
         ote->ote_vtime = 0;
         kmem_cache_free(overlay_entry_cache, ote);

@@ -234,34 +278,52 @@
         if (odd->odd_target == NULL)
                 return;
 
         if (odd->odd_target->ott_mode == OVERLAY_TARGET_DYNAMIC) {
                 refhash_t *rp = odd->odd_target->ott_u.ott_dyn.ott_dhash;
+                refhash_t *r3p = odd->odd_target->ott_u.ott_dyn.ott_l3dhash;
                 avl_tree_t *ap = &odd->odd_target->ott_u.ott_dyn.ott_tree;
+                avl_tree_t *a3p = &odd->odd_target->ott_u.ott_dyn.ott_l3tree;
                 overlay_target_entry_t *ote;
 
-                /* TODO: remove from L3 trees */
-
                 /*
                  * Our AVL tree and hashtable contain the same elements,
                  * therefore we should just remove it from the tree, but then
                  * delete the entries when we remove them from the hash table
                  * (which happens through the refhash dtor).
                  */
-                while ((ote = avl_first(ap)) != NULL)
+                while ((ote = avl_first(ap)) != NULL) {
                         avl_remove(ap, ote);
-
+                        OVERLAY_TARG_ENTRY_REFRELE(ote);
+                }
                 avl_destroy(ap);
+
+                while ((ote = avl_first(a3p)) != NULL) {
+                        avl_remove(a3p, ote);
+                        OVERLAY_TARG_ENTRY_REFRELE(ote);
+                }
+                avl_destroy(a3p);
+
                 for (ote = refhash_first(rp); ote != NULL;
                     ote = refhash_next(rp, ote)) {
                         refhash_remove(rp, ote);
+                        OVERLAY_TARG_ENTRY_REFRELE(ote);
                 }
                 refhash_destroy(rp);
+
+                for (ote = refhash_first(r3p); ote != NULL;
+                    ote = refhash_next(r3p, ote)) {
+                        refhash_remove(r3p, ote);
+                        OVERLAY_TARG_ENTRY_REFRELE(ote);
         }
+                refhash_destroy(r3p);
+        }
 
         ASSERT(odd->odd_target->ott_ocount == 0);
+        bzero(&odd->odd_target->ott_u, sizeof (odd->odd_target->ott_u));
         kmem_cache_free(overlay_target_cache, odd->odd_target);
+        odd->odd_target = NULL;
 }
 
 int
 overlay_target_busy()
 {

@@ -272,11 +334,11 @@
         mutex_exit(&overlay_target_lock);
 
         return (ret);
 }
 
-static void
+void
 overlay_target_queue(overlay_target_entry_t *entry)
 {
         mutex_enter(&overlay_target_lock);
         mutex_enter(&entry->ote_ott->ott_lock);
         if (entry->ote_ott->ott_flags & OVERLAY_T_TEARDOWN) {

@@ -304,28 +366,26 @@
 }
 
 /*
  * This function assumes that the destination mode is OVERLAY_PLUGIN_D_IP |
  * OVERLAY_PLUGIN_D_PORT. As we don't have an implementation of anything else at
- * this time, say for NVGRE, we drop all packets that mcuh this.
- *
- * XXX: It might be better to replace the 'sock' argument with
- * overlay_target_entry_t** and set it with the found entry in the case
- * of OVERLAY_TARGET_OK.
+ * this time, say for NVGRE, we drop all packets that match this.
  */
 int
 overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock,
-    socklen_t *slenp)
+    socklen_t *slenp, uint64_t *vidp)
 {
         int ret;
         struct sockaddr_in6 *v6;
         overlay_target_t *ott;
         mac_header_info_t mhi;
         overlay_target_entry_t *entry;
 
         ASSERT(odd->odd_target != NULL);
 
+        *vidp = odd->odd_vid;
+
         /*
          * At this point, the overlay device is in a mux which means that it's
          * been activated. At this point, parts of the target, such as the mode
          * and the destination are now read-only and we don't have to worry
          * about synchronization for them.

@@ -350,26 +410,17 @@
         }
 
         ASSERT(ott->ott_mode == OVERLAY_TARGET_DYNAMIC);
 
         /*
-         * Note we only want the MAC address here, therefore we won't bother
-         * using mac_vlan_header_info(). If any caller needs the vlan info at
-         * this point, this should change to a call to mac_vlan_header_info().
+         * VL2 -> UL3 lookups only need the destination VL2 mac address,
+         * however, if we end up having to route the packet, we will need
+         * the source vlan as part of the destination selection.
          */
-        if (mac_header_info(odd->odd_mh, mp, &mhi) != 0)
+        if (mac_vlan_header_info(odd->odd_mh, mp, &mhi) != 0)
                 return (OVERLAY_TARGET_DROP);
 
-        /*
-         * TODO: compare mhi.mhi_daddr with odd->macaddr.
-         * If match,
-         *      get VL3 dest from mp
-         *      lookup target using VL3 dest
-         * otherwise,
-         *      lookup target using VL2 dest (existing refhash_lookup() call
-         *      below)
-         */
         mutex_enter(&ott->ott_lock);
         entry = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
             mhi.mhi_daddr);
         if (entry == NULL) {
                 entry = kmem_cache_alloc(overlay_entry_cache,

@@ -376,33 +427,35 @@
                     KM_NOSLEEP | KM_NORMALPRI);
                 if (entry == NULL) {
                         mutex_exit(&ott->ott_lock);
                         return (OVERLAY_TARGET_DROP);
                 }
-                /*
-                 * TODO: set entry->ote_dcid, if VL3 lookup, copy dst addr
-                 * into entry->ote_ip.  Probably zero out the address we're
-                 * not lookup up (VL2 or VL3) as well.
-                 */
                 bcopy(mhi.mhi_daddr, entry->ote_addr, ETHERADDRL);
                 entry->ote_chead = entry->ote_ctail = mp;
                 entry->ote_mbsize = msgsize(mp);
                 entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
                 entry->ote_ott = ott;
                 entry->ote_odd = odd;
+
+                OVERLAY_TARG_ENTRY_REFHOLD(entry);
                 refhash_insert(ott->ott_u.ott_dyn.ott_dhash, entry);
+
+                OVERLAY_TARG_ENTRY_REFHOLD(entry);
                 avl_add(&ott->ott_u.ott_dyn.ott_tree, entry);
+
                 mutex_exit(&ott->ott_lock);
                 overlay_target_queue(entry);
                 return (OVERLAY_TARGET_ASYNC);
         }
-        refhash_hold(ott->ott_u.ott_dyn.ott_dhash, entry);
+        OVERLAY_TARG_ENTRY_REFHOLD(entry);
         mutex_exit(&ott->ott_lock);
 
         mutex_enter(&entry->ote_lock);
         if (entry->ote_flags & OVERLAY_ENTRY_F_DROP) {
                 ret = OVERLAY_TARGET_DROP;
+        } else if (entry->ote_flags & OVERLAY_ENTRY_F_ROUTER) {
+                ret = overlay_route_lookup(odd, mp, &mhi, sock, slenp, vidp);
         } else if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
                 bcopy(&entry->ote_dest.otp_ip, &v6->sin6_addr,
                     sizeof (struct in6_addr));
                 v6->sin6_port = htons(entry->ote_dest.otp_port);
                 *slenp = sizeof (struct sockaddr_in6);

@@ -433,11 +486,11 @@
                 }
         }
         mutex_exit(&entry->ote_lock);
 
         mutex_enter(&ott->ott_lock);
-        refhash_rele(ott->ott_u.ott_dyn.ott_dhash, entry);
+        OVERLAY_TARG_ENTRY_REFRELE(entry);
         mutex_exit(&ott->ott_lock);
 
         return (ret);
 }
 

@@ -471,10 +524,11 @@
 overlay_target_associate(overlay_target_hdl_t *thdl, void *arg)
 {
         overlay_dev_t *odd;
         overlay_target_t *ott;
         overlay_targ_associate_t *ota = arg;
+        overlay_router_t *ort;
 
         odd = overlay_hold_by_dlid(ota->ota_linkid);
         if (odd == NULL)
                 return (ENOENT);
 

@@ -523,16 +577,34 @@
                 bcopy(&ota->ota_point, &ott->ott_u.ott_point,
                     sizeof (overlay_target_point_t));
         } else {
                 ott->ott_u.ott_dyn.ott_dhash = refhash_create(OVERLAY_HSIZE,
                     overlay_mac_hash, overlay_mac_cmp,
-                    overlay_target_entry_dtor, sizeof (overlay_target_entry_t),
+                    overlay_target_entry_null_dtor,
+                    sizeof (overlay_target_entry_t),
                     offsetof(overlay_target_entry_t, ote_reflink),
                     offsetof(overlay_target_entry_t, ote_addr), KM_SLEEP);
+                ott->ott_u.ott_dyn.ott_l3dhash = refhash_create(OVERLAY_HSIZE,
+                    overlay_vl3_hash, overlay_vl3_cmp,
+                    overlay_target_entry_null_dtor,
+                    sizeof (overlay_target_entry_t),
+                    offsetof(overlay_target_entry_t, ote_l3_reflink), 0,
+                    KM_SLEEP);
                 avl_create(&ott->ott_u.ott_dyn.ott_tree, overlay_mac_avl,
                     sizeof (overlay_target_entry_t),
                     offsetof(overlay_target_entry_t, ote_avllink));
+                avl_create(&ott->ott_u.ott_dyn.ott_l3tree, overlay_vl3_avl,
+                    sizeof (overlay_target_entry_t),
+                    offsetof(overlay_target_entry_t, ote_l3_avllink));
+
+                ort = kmem_zalloc(sizeof (*ort), KM_SLEEP);
+                mutex_init(&ort->otr_lock, NULL, MUTEX_DRIVER, NULL);
+                list_create(&ort->otr_tables, sizeof (overlay_route_table_t),
+                    offsetof(overlay_route_table_t, ort_link));
+                avl_create(&ort->otr_tree, overlay_fabric_avl,
+                    sizeof (overlay_fabric_entry_t),
+                    offsetof(overlay_fabric_entry_t, ofe_avllink));
         }
         mutex_enter(&odd->odd_lock);
         if (odd->odd_flags & OVERLAY_F_VARPD) {
                 mutex_exit(&odd->odd_lock);
                 kmem_cache_free(overlay_target_cache, ott);

@@ -544,11 +616,10 @@
         odd->odd_target = ott;
         mutex_exit(&odd->odd_lock);
 
         overlay_hold_rele(odd);
 
-
         return (0);
 }
 
 
 /* ARGSUSED */

@@ -606,13 +677,13 @@
 static int
 overlay_target_lookup_request(overlay_target_hdl_t *thdl, void *arg)
 {
         overlay_targ_lookup_t *otl = arg;
         overlay_target_entry_t *entry;
+        void *src, *dst;
         clock_t ret, timeout;
         mac_header_info_t mhi;
-
         timeout = ddi_get_lbolt() + drv_usectohz(MICROSEC);
 again:
         mutex_enter(&overlay_target_lock);
         while (list_is_empty(&overlay_target_list)) {
                 ret = cv_timedwait(&overlay_target_condvar,

@@ -623,23 +694,38 @@
                 }
         }
         entry = list_remove_head(&overlay_target_list);
         mutex_exit(&overlay_target_lock);
         mutex_enter(&entry->ote_lock);
-        if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
+        if (entry->ote_flags &
+            (OVERLAY_ENTRY_F_PENDING | OVERLAY_ENTRY_F_VL3_PENDING)) {
                 ASSERT(entry->ote_chead == NULL);
                 mutex_exit(&entry->ote_lock);
                 goto again;
         }
         ASSERT(entry->ote_chead != NULL);
 
+
+        otl->otl_l3req = (entry->ote_flags & OVERLAY_ENTRY_F_VL3_PENDING) ?
+            B_TRUE : B_FALSE;
+
+        if (otl->otl_l3req) {
+                src = &otl->otl_addru.otlu_l3.otl3_srcip;
+                dst = &otl->otl_addru.otlu_l3.otl3_dstip;
+        } else {
+                src = &otl->otl_addru.otlu_l2.otl2_srcaddr;
+                dst = &otl->otl_addru.otlu_l2.otl2_dstaddr;
+        }
+
         /*
-         * If we have a bogon that doesn't have a valid mac header, drop it and
-         * try again.
+         * If we have a bogon that doesn't have a valid mac header, or an
+         * invalid IP header for IP requests, drop it and try again.
          */
-        if (mac_vlan_header_info(entry->ote_odd->odd_mh, entry->ote_chead,
-            &mhi) != 0) {
+        if ((mac_vlan_header_info(entry->ote_odd->odd_mh, entry->ote_chead,
+            &mhi) != 0) ||
+            (otl->otl_l3req && overlay_mblk_vl3ip(entry->ote_chead, src,
+            dst) != 0)) {
                 boolean_t queue = B_FALSE;
                 mblk_t *mp = entry->ote_chead;
                 entry->ote_chead = mp->b_next;
                 mp->b_next = NULL;
                 if (entry->ote_ctail == mp)

@@ -652,29 +738,29 @@
                         overlay_target_queue(entry);
                 freemsg(mp);
                 goto again;
         }
 
-        /*
-         * TODO: If VL3 request,
-         *      set otl->otl_l3req
-         *      Fill in otl_{src,dst}ip
-         * Else
-         *      clear otl->otl_l3req
-         */
         otl->otl_dlid = entry->ote_odd->odd_linkid;
         otl->otl_reqid = (uintptr_t)entry;
         otl->otl_varpdid = entry->ote_ott->ott_id;
         otl->otl_vnetid = entry->ote_odd->odd_vid;
 
         otl->otl_hdrsize = mhi.mhi_hdrsize;
         otl->otl_pktsize = msgsize(entry->ote_chead) - otl->otl_hdrsize;
-        bcopy(mhi.mhi_daddr, otl->otl_addru.otlu_l2.otl2_dstaddr, ETHERADDRL);
-        bcopy(mhi.mhi_saddr, otl->otl_addru.otlu_l2.otl2_srcaddr, ETHERADDRL);
         otl->otl_addru.otlu_l2.otl2_dsttype = mhi.mhi_dsttype;
         otl->otl_addru.otlu_l2.otl2_sap = mhi.mhi_bindsap;
         otl->otl_vlan = VLAN_ID(mhi.mhi_tci);
+
+        /*
+         * The overlay_mblk_vl3ip() call above fills in dst & src for
+         * VL3->UL3 requests, so we only need to care about VL2->UL3 here.
+         */
+        if (!otl->otl_l3req) {
+                bcopy(mhi.mhi_daddr, dst, ETHERADDRL);
+                bcopy(mhi.mhi_saddr, src, ETHERADDRL);
+        }
         mutex_exit(&entry->ote_lock);
 
         mutex_enter(&thdl->oth_lock);
         list_insert_tail(&thdl->oth_outstanding, entry);
         mutex_exit(&thdl->oth_lock);

@@ -686,11 +772,21 @@
 overlay_target_lookup_respond(overlay_target_hdl_t *thdl, void *arg)
 {
         const overlay_targ_resp_t *otr = arg;
         overlay_target_entry_t *entry;
         mblk_t *mp;
+        boolean_t is_router = B_FALSE;
 
+        /*
+         * If we ever support a protocol that uses MAC addresses for the UL
+         * destination addr, we probably should expand this to check that
+         * all of otr is zero.
+         */
+        if (IN6_IS_ADDR_UNSPECIFIED(&otr->otr_answer.otp_ip) &&
+            otr->otr_answer.otp_port == 0)
+                is_router = B_TRUE;
+
         mutex_enter(&thdl->oth_lock);
         for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
             entry = list_next(&thdl->oth_outstanding, entry)) {
                 if ((uintptr_t)entry == otr->otr_reqid)
                         break;

@@ -706,10 +802,12 @@
         mutex_enter(&entry->ote_lock);
         bcopy(&otr->otr_answer, &entry->ote_dest,
             sizeof (overlay_target_point_t));
         entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING;
         entry->ote_flags |= OVERLAY_ENTRY_F_VALID;
+        if (is_router)
+                entry->ote_flags |= OVERLAY_ENTRY_F_ROUTER;
         mp = entry->ote_chead;
         entry->ote_chead = NULL;
         entry->ote_ctail = NULL;
         entry->ote_mbsize = 0;
         entry->ote_vtime = gethrtime();

@@ -1128,33 +1226,36 @@
                     sizeof (overlay_target_point_t));
         } else {
                 overlay_target_entry_t *ote;
                 ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
                     otc->otc_entry.otce_mac);
-                if (ote != NULL) {
+                if (ote == NULL) {
+                        ret = ENOENT;
+                        goto done;
+                }
+
                         mutex_enter(&ote->ote_lock);
-                        if ((ote->ote_flags &
-                            OVERLAY_ENTRY_F_VALID_MASK) != 0) {
+                if ((ote->ote_flags & OVERLAY_ENTRY_F_VALID_MASK) != 0) {
                                 if (ote->ote_flags & OVERLAY_ENTRY_F_DROP) {
                                         otc->otc_entry.otce_flags =
                                             OVERLAY_TARGET_CACHE_DROP;
+                        } else if (ote->ote_flags & OVERLAY_ENTRY_F_ROUTER) {
+                                otc->otc_entry.otce_flags =
+                                    OVERLAY_TARGET_CACHE_ROUTER;
                                 } else {
                                         otc->otc_entry.otce_flags = 0;
-                                        bcopy(&ote->ote_dest,
-                                            &otc->otc_entry.otce_dest,
+                                bcopy(&ote->ote_dest, &otc->otc_entry.otce_dest,
                                             sizeof (overlay_target_point_t));
                                 }
                                 ret = 0;
                         } else {
                                 ret = ENOENT;
                         }
                         mutex_exit(&ote->ote_lock);
-                } else {
-                        ret = ENOENT;
                 }
-        }
 
+done:
         mutex_exit(&ott->ott_lock);
         overlay_hold_rele(odd);
 
         return (ret);
 }

@@ -1167,13 +1268,18 @@
         overlay_target_t *ott;
         overlay_target_entry_t *ote;
         overlay_targ_cache_t *otc = arg;
         mblk_t *mp = NULL;
 
-        if (otc->otc_entry.otce_flags & ~OVERLAY_TARGET_CACHE_DROP)
+        if (otc->otc_entry.otce_flags &
+            ~(OVERLAY_TARGET_CACHE_DROP | OVERLAY_TARGET_CACHE_ROUTER))
                 return (EINVAL);
 
+        if (otc->otc_entry.otce_flags ==
+            (OVERLAY_TARGET_CACHE_DROP | OVERLAY_TARGET_CACHE_ROUTER))
+                return (EINVAL);
+
         odd = overlay_hold_by_dlid(otc->otc_linkid);
         if (odd == NULL)
                 return (ENOENT);
 
         mutex_enter(&odd->odd_lock);

@@ -1209,10 +1315,12 @@
 
         if (otc->otc_entry.otce_flags & OVERLAY_TARGET_CACHE_DROP) {
                 ote->ote_flags |= OVERLAY_ENTRY_F_DROP;
         } else {
                 ote->ote_flags |= OVERLAY_ENTRY_F_VALID;
+                if (otc->otc_entry.otce_flags & OVERLAY_TARGET_CACHE_ROUTER)
+                        ote->ote_flags |= OVERLAY_ENTRY_F_ROUTER;
                 bcopy(&otc->otc_entry.otce_dest, &ote->ote_dest,
                     sizeof (overlay_target_point_t));
                 mp = ote->ote_chead;
                 ote->ote_chead = NULL;
                 ote->ote_ctail = NULL;