1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright 2018, Joyent, Inc.
  14  */
  15 
  16 #include <sys/types.h>
  17 #include <sys/ethernet.h>
  18 #include <sys/mac_client.h>
  19 #include <sys/vlan.h>
  20 #include <sys/ddi.h>
  21 #include <sys/strsun.h>
  22 #include <inet/ip.h>
  23 #include <inet/ip6.h>
  24 #include <netinet/in.h>
  25 
  26 #include <sys/overlay_impl.h>
  27 
  28 extern kmem_cache_t *overlay_entry_cache;
  29 extern int overlay_ent_size;
  30 
  31 int
  32 overlay_fabric_avl(const void *a, const void *b)
  33 {
  34         const overlay_fabric_t *l;
  35         const overlay_fabric_t *r;
  36         int i;
  37 
  38         l = &((const overlay_fabric_entry_t *)a)->ofe_fabric;
  39         r = &((const overlay_fabric_entry_t *)b)->ofe_fabric;
  40 
  41         if (l->ofb_dcid < r->ofb_dcid)
  42                 return (-1);
  43         if (l->ofb_dcid > r->ofb_dcid)
  44                 return (1);
  45         if (l->ofb_vid < r->ofb_vid)
  46                 return (-1);
  47         if (l->ofb_vid > r->ofb_vid)
  48                 return (1);
  49         if (l->ofb_vlan < r->ofb_vlan)
  50                 return (-1);
  51         if (l->ofb_vlan > r->ofb_vlan)
  52                 return (1);
  53         for (i = 0; i < 4; i++) {
  54                 if (l->ofb_addr.s6_addr32[i] < r->ofb_addr.s6_addr32[i])
  55                         return (-1);
  56                 if (l->ofb_addr.s6_addr32[i] > r->ofb_addr.s6_addr32[i])
  57                         return (1);
  58         }
  59         return (0);
  60 }
  61 
  62 overlay_fabric_entry_t *
  63 overlay_fabric_entry_new(uint64_t vid, uint32_t dcid, uint16_t vlan,
  64     const struct in6_addr *addr, uint8_t prefixlen, const uint8_t *mac)
  65 {
  66         overlay_fabric_entry_t *ofe = NULL;
  67 
  68         ofe = kmem_zalloc(sizeof (*ofe), KM_SLEEP);
  69 
  70         ofe->ofe_fabric.ofb_vid = vid;
  71         ofe->ofe_fabric.ofb_dcid = dcid;
  72         ofe->ofe_fabric.ofb_vlan = vlan;
  73         ofe->ofe_fabric.ofb_prefixlen = prefixlen;
  74         bcopy(addr, &ofe->ofe_fabric.ofb_addr, sizeof (*addr));
  75         bcopy(mac, ofe->ofe_fabric.ofb_mac, ETHERADDRL);
  76         return (ofe);
  77 }
  78 
  79 void
  80 overlay_fabric_entry_free(overlay_fabric_entry_t *ofe)
  81 {
  82         if (ofe == NULL)
  83                 return;
  84 
  85         VERIFY3U(ofe->ofe_refcnt, ==, 0);
  86         kmem_free(ofe, sizeof (*ofe));
  87 }
  88 
  89 /*
  90  * Using the source and destination IP address, locate the target fabric, or
  91  * create larval entries and queue for missing information.
  92  */
  93 static overlay_fabric_entry_t *
  94 overlay_route_find(overlay_dev_t *odd, const mac_header_info_t *mhi,
  95     const struct in6_addr *dst)
  96 {
  97         overlay_target_t *ott = odd->odd_target;
  98         overlay_router_t *otr;
  99         overlay_fabric_entry_t *ofb_src, *ofb_dst;
 100         overlay_target_entry_t *src_entry;
 101         overlay_route_table_t *orte;
 102         uint16_t vlan = VLAN_ID(mhi->mhi_tci);
 103         boolean_t dst_found = B_FALSE;
 104 
 105         /* First attempt to find the overlay_fabric_t for the source */
 106         otr = odd->odd_router;
 107 
 108         mutex_enter(&ott->ott_lock);
 109         if ((src_entry = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash,
 110             mhi->mhi_saddr)) != NULL)
 111                 OVERLAY_TARG_ENTRY_REFHOLD(src_entry);
 112         mutex_exit(&ott->ott_lock);
 113 
 114         if (src_entry == NULL)
 115                 return (NULL);
 116 
 117         mutex_enter(&src_entry->ote_lock);
 118         if (src_entry->ote_fab == NULL) {
 119                 mutex_exit(&src_entry->ote_lock);
 120                 OVERLAY_TARG_ENTRY_REFRELE(src_entry);
 121                 return (NULL);
 122         }
 123         if ((ofb_src = src_entry->ote_fab) != NULL)
 124                 OVERLAY_FAB_ENTRY_REFHOLD(ofb_src);
 125         mutex_exit(&src_entry->ote_lock);
 126 
 127         OVERLAY_TARG_ENTRY_REFRELE(src_entry);
 128         src_entry = NULL;
 129 
 130         if (ofb_src == NULL || ofb_src->ofe_route_table == NULL) {
 131                 mutex_exit(&otr->otr_lock);
 132                 return (NULL);
 133         }
 134 
 135         /* Go through the fabric route table and try to find a destination */
 136         orte = ofb_src->ofe_route_table;
 137         for (size_t i = 0; orte->ort_dest[i] != NULL; i++) {
 138                 ofb_dst = orte->ort_dest[i];
 139                 if (IN6_ARE_PREFIXEDADDR_EQUAL(dst,
 140                     &ofb_dst->ofe_fabric.ofb_addr,
 141                     ofb_dst->ofe_fabric.ofb_prefixlen)) {
 142                         dst_found = B_TRUE;
 143                         break;
 144                 }
 145         }
 146 
 147         if (dst_found)
 148                 OVERLAY_FAB_ENTRY_REFHOLD(ofb_dst);
 149 
 150         mutex_exit(&otr->otr_lock);
 151 
 152         return (dst_found ? ofb_dst : NULL);
 153 }
 154 
 155 /*
 156  * Adjust packet fields as necessary for delivery of an encapsulated packet
 157  * to a remote target (e.g. cross-DC).
 158  */
 159 static int
 160 overlay_route(overlay_dev_t *odd, mblk_t *mp, const overlay_fabric_entry_t *ofe,
 161     const overlay_target_entry_t *ote)
 162 {
 163         if (MBLKL(mp) >= sizeof (struct ether_vlan_header)) {
 164                 /* The easy way */
 165                 struct ether_vlan_header *evh;
 166 
 167                 evh = (struct ether_vlan_header *)mp->b_rptr;
 168 
 169                 /*
 170                  * Today, we require all encapsulated frames to be vlan tagged.
 171                  * In the future, we might relax this and insert the vlan tag if
 172                  * the destination is tagged and the source is not.
 173                  */
 174                 if (evh->ether_tpid != ETHERTYPE_VLAN)
 175                         return (OVERLAY_TARGET_DROP);
 176 
 177                 evh->ether_tci = evh->ether_tci & ~(VLAN_ID_MASK);
 178                 evh->ether_tci |= ofe->ofe_fabric.ofb_vlan;
 179                 bcopy(ote->ote_addr, &evh->ether_dhost, ETHERADDRL);
 180                 bcopy(ofe->ofe_fabric.ofb_mac, &evh->ether_shost, ETHERADDRL);
 181                 return (OVERLAY_TARGET_OK);
 182         }
 183 
 184         /* The painful, long, tedious way */
 185         unsigned char *p = mp->b_rptr;
 186         unsigned char *end = mp->b_wptr;
 187         unsigned char *vlanp = NULL;
 188         uint16_t sap = 0;
 189         uint16_t vlan = 0;
 190         size_t i;
 191 
 192         /*
 193          * XXX: This method seems so absolutely awful, I wonder if there
 194          * shouldn't be a counter and/or dtrace probe to allow us to find
 195          * out what upstack causes this path to be taken.
 196          */
 197         for (i = 0; i < ETHERADDRL; i++) {
 198                 if (p == end) {
 199                         mp = mp->b_cont;
 200                         if (mp == NULL)
 201                                 return (OVERLAY_TARGET_DROP);
 202                         p = mp->b_rptr;
 203                         end = mp->b_wptr;
 204                 }
 205                 *p++ = ote->ote_addr[i];
 206         }
 207 
 208         for (i = 0; i < ETHERADDRL; i++) {
 209                 if (p == end) {
 210                         mp = mp->b_cont;
 211                         if (mp == NULL)
 212                                 return (OVERLAY_TARGET_DROP);
 213                         p = mp->b_rptr;
 214                         end = mp->b_wptr;
 215                 }
 216                 *p++ = ofe->ofe_fabric.ofb_mac[i];
 217         }
 218 
 219         for (i = 0; i < 2; i++) {
 220                 if (p == end) {
 221                         mp = mp->b_cont;
 222                         if (mp == NULL)
 223                                 return (OVERLAY_TARGET_DROP);
 224                         p = mp->b_rptr;
 225                         end = mp->b_wptr;
 226                 }
 227                 sap |= *p++ << (8 - i*8);
 228         }
 229 
 230         if (sap != ETHERTYPE_VLAN)
 231                 return (OVERLAY_TARGET_DROP);
 232 
 233         if (p == end) {
 234                 mp = mp->b_cont;
 235                 if (mp == NULL)
 236                         return (OVERLAY_TARGET_DROP);
 237                 p = mp->b_rptr;
 238                 end = mp->b_wptr;
 239         }
 240         vlanp = p;
 241 
 242         for (i = 0; i < 2; i++) {
 243                 if (p == end) {
 244                         mp = mp->b_cont;
 245                         if (mp == NULL)
 246                                 return (OVERLAY_TARGET_DROP);
 247                         p = mp->b_rptr;
 248                         end = mp->b_wptr;
 249                 }
 250                 vlan |= *p++ << (8 - i*8);
 251         }
 252 
 253         vlan &= ~(VLAN_ID_MASK);
 254         vlan |= ofe->ofe_fabric.ofb_vlan;
 255 
 256         for (p = vlanp, i = 0; i < 2; i++) {
 257                 if (p == end) {
 258                         mp = mp->b_cont;
 259                         if (mp == NULL)
 260                                 return (OVERLAY_TARGET_DROP);
 261                         p = mp->b_rptr;
 262                         end = mp->b_wptr;
 263                 }
 264                 *p++ = vlan >> (8 - i*8);
 265         }
 266 
 267         return (OVERLAY_TARGET_OK);
 268 }
 269 
 270 int
 271 overlay_route_lookup(overlay_dev_t *odd, mblk_t *mp,
 272     const mac_header_info_t *mhi, struct sockaddr *sock, socklen_t *slenp,
 273         uint64_t *vidp)
 274 {
 275         overlay_target_t *ott = odd->odd_target;
 276         overlay_router_t *otr = odd->odd_router;
 277         overlay_fabric_entry_t *dst_fab;
 278         overlay_target_entry_t *entry;
 279         overlay_target_entry_t search = { 0 };
 280         struct sockaddr_in6 *v6;
 281         struct in6_addr src, dst;
 282         int ret;
 283 
 284         ASSERT(ott->ott_mode == OVERLAY_TARGET_DYNAMIC);
 285 
 286         if (odd->odd_router == NULL)
 287                 return (OVERLAY_TARGET_DROP);
 288 
 289         if ((ret = overlay_mblk_vl3ip(mp, &src, &dst)) != 0)
 290                 return (OVERLAY_TARGET_DROP);
 291 
 292         if ((dst_fab = overlay_route_find(odd, mhi, &dst)) == NULL)
 293                 return (OVERLAY_TARGET_DROP);
 294 
 295         v6 = (struct sockaddr_in6 *)sock;
 296         bzero(v6, sizeof (struct sockaddr_in6));
 297         v6->sin6_family = AF_INET6;
 298 
 299         bcopy(&dst, &search.ote_ip, sizeof (dst));
 300         search.ote_fab = dst_fab;
 301 
 302         mutex_enter(&ott->ott_lock);
 303         if ((entry = refhash_lookup(ott->ott_u.ott_dyn.ott_l3dhash,
 304             &search)) == NULL) {
 305                 if ((entry = kmem_cache_alloc(overlay_entry_cache,
 306                     KM_NOSLEEP | KM_NORMALPRI)) == NULL) {
 307                         mutex_exit(&ott->ott_lock);
 308                         return (OVERLAY_TARGET_DROP);
 309                 }
 310 
 311                 bcopy(&dst, &entry->ote_ip, sizeof (dst));
 312                 entry->ote_chead = entry->ote_ctail = mp;
 313                 entry->ote_mbsize = msgsize(mp);
 314                 entry->ote_flags |= OVERLAY_ENTRY_F_VL3_PENDING;
 315                 entry->ote_ott = ott;
 316                 entry->ote_odd = odd;
 317                 OVERLAY_TARG_ENTRY_REFHOLD(entry);
 318                 refhash_insert(ott->ott_u.ott_dyn.ott_l3dhash, entry);
 319                 mutex_exit(&ott->ott_lock);
 320                 overlay_target_queue(entry);
 321                 OVERLAY_FAB_ENTRY_REFRELE(dst_fab);
 322                 return (OVERLAY_TARGET_ASYNC);
 323         }
 324         OVERLAY_TARG_ENTRY_REFHOLD(entry);
 325         mutex_exit(&ott->ott_lock);
 326 
 327         mutex_enter(&entry->ote_lock);
 328         /* There's no point in sending packets to a remote fabric's router IP */
 329         if (entry->ote_flags & (OVERLAY_ENTRY_F_DROP|OVERLAY_ENTRY_F_ROUTER)) {
 330                 ret = OVERLAY_TARGET_DROP;
 331         } else if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
 332                 *vidp = dst_fab->ofe_fabric.ofb_vid;
 333 
 334                 bcopy(&entry->ote_dest.otp_ip, &v6->sin6_addr,
 335                     sizeof (struct in6_addr));
 336                 v6->sin6_port = htons(entry->ote_dest.otp_port);
 337                 *slenp = sizeof (struct sockaddr_in6);
 338 
 339                 ret = overlay_route(odd, mp, dst_fab, entry);
 340         } else {
 341                 size_t mlen = msgsize(mp);
 342 
 343                 if (mlen + entry->ote_mbsize > overlay_ent_size) {
 344                         ret = OVERLAY_TARGET_DROP;
 345                 } else {
 346                         if (entry->ote_ctail != NULL) {
 347                                 ASSERT(entry->ote_ctail->b_next == NULL);
 348                                 entry->ote_ctail->b_next = mp;
 349                                 entry->ote_ctail = mp;
 350                         } else {
 351                                 entry->ote_chead = mp;
 352                                 entry->ote_ctail = mp;
 353                         }
 354                         entry->ote_mbsize += mlen;
 355                         if ((entry->ote_flags & OVERLAY_ENTRY_F_PENDING) == 0) {
 356                                 entry->ote_flags |= OVERLAY_ENTRY_F_PENDING;
 357                                 overlay_target_queue(entry);
 358                         }
 359                         ret = OVERLAY_TARGET_ASYNC;
 360                 }
 361         }
 362         mutex_exit(&entry->ote_lock);
 363 
 364         OVERLAY_TARG_ENTRY_REFRELE(entry);
 365         OVERLAY_FAB_ENTRY_REFRELE(dst_fab);
 366 
 367         return (ret);
 368 }
 369 
 370 /*
 371  * Obtain the source and/or destination VL3 IPs of a packet.  If this is
 372  * IPV4 packet, the addresses are returned as IPV6 mapped addresses.
 373  *
 374  * XXX: The name seems a bit ugly, anyone have better suggestions?
 375  */
 376 int
 377 overlay_mblk_vl3ip(mblk_t *mp, struct in6_addr *src, struct in6_addr *dst)
 378 {
 379         struct ether_header *ehp;
 380         ipha_t *iphp;
 381         ip6_t *ip6hp;
 382         size_t skip_len;
 383         uint16_t sap;
 384 
 385         /*
 386          * This is only used for outbound packets, so we use a similar
 387          * approach as inet_pkt_hash().
 388          */
 389         ASSERT(IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)));
 390         ASSERT(MBLKL(mp) >= sizeof (struct ether_header));
 391 
 392         ehp = (struct ether_header *)mp->b_rptr;
 393         sap = ntohs(ehp->ether_type);
 394         if (sap == ETHERTYPE_VLAN) {
 395                 struct ether_vlan_header *evhp;
 396                 mblk_t *newmp = NULL;
 397 
 398                 skip_len = sizeof (struct ether_vlan_header);
 399                 if (MBLKL(mp) < skip_len) {
 400                         /* the vlan tag is the payload, pull up first */
 401                         newmp = msgpullup(mp, -1);
 402                         if ((newmp == NULL) || (MBLKL(newmp) < skip_len)) {
 403                                 freemsg(newmp);
 404                                 return (ENOMEM);
 405                         }
 406                         evhp = (struct ether_vlan_header *)newmp->b_rptr;
 407                 } else {
 408                         evhp = (struct ether_vlan_header *)mp->b_rptr;
 409                 }
 410                 sap = ntohs(evhp->ether_type);
 411                 freemsg(newmp);
 412         } else {
 413                 skip_len = sizeof (struct ether_header);
 414         }
 415 
 416         /* if the ethernet header is in its own mblk, skip it */
 417         if (MBLKL(mp) <= skip_len) {
 418                 skip_len -= MBLKL(mp);
 419                 mp = mp->b_cont;
 420                 if (mp == NULL)
 421                         return (EINVAL);
 422         }
 423 
 424         switch (sap) {
 425         case ETHERTYPE_IP:
 426                 iphp = (ipha_t *)(mp->b_rptr + skip_len);
 427                 if (((unsigned char *)iphp + sizeof (ipha_t) > mp->b_wptr) ||
 428                     !OK_32PTR((char *)iphp))
 429                         return (EINVAL);        /* XXX: better error code? */
 430 
 431                 IN6_IPADDR_TO_V4MAPPED(iphp->ipha_src, src);
 432                 IN6_IPADDR_TO_V4MAPPED(iphp->ipha_dst, dst);
 433                 return (0);
 434         case ETHERTYPE_IPV6:
 435                 ip6hp = (ip6_t *)(mp->b_rptr + skip_len);
 436                 if (((unsigned char *)ip6hp + IPV6_HDR_LEN > mp->b_wptr) ||
 437                     !OK_32PTR((char *)ip6hp))
 438                         return (EINVAL);
 439 
 440                 bcopy(&ip6hp->ip6_src, src, sizeof (*src));
 441                 bcopy(&ip6hp->ip6_dst, dst, sizeof (*dst));
 442                 return (0);
 443         default:
 444                 return (EINVAL);
 445         }
 446 
 447         return (0);
 448 }
 449 
 450 /* ARGSUSED */
 451 int
 452 overlay_router_ioctl(dev_t dev, int cmt, intptr_t arg, int mode, cred_t *credp,
 453     int *rvalp)
 454 {
 455         /* TODO */
 456         return (ENOTTY);
 457 }