1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright 2019 Joyent, Inc.
  14  */
  15 
  16 /*
  17  * Overlay device ksocket multiplexer.
  18  *
  19  * For more information, see the big theory statement in
  20  * uts/common/io/overlay/overlay.c
  21  */
  22 
  23 #include <sys/types.h>
  24 #include <sys/socket.h>
  25 #include <sys/ksynch.h>
  26 #include <sys/ksocket.h>
  27 #include <sys/avl.h>
  28 #include <sys/list.h>
  29 #include <sys/pattr.h>
  30 #include <sys/sysmacros.h>
  31 #include <sys/strsubr.h>
  32 #include <sys/strsun.h>
  33 #include <sys/tihdr.h>
  34 
  35 #include <sys/overlay_impl.h>
  36 
  37 #include <sys/sdt.h>
  38 
  39 #define OVERLAY_FREEMSG(mp, reason) \
  40     DTRACE_PROBE2(overlay__fremsg, mblk_t *, mp, char *, reason)
  41 
  42 static list_t overlay_mux_list;
  43 static kmutex_t overlay_mux_lock;
  44 
  45 void
  46 overlay_mux_init(void)
  47 {
  48         list_create(&overlay_mux_list, sizeof (overlay_mux_t),
  49             offsetof(overlay_mux_t, omux_lnode));
  50         mutex_init(&overlay_mux_lock, NULL, MUTEX_DRIVER, NULL);
  51 }
  52 
  53 void
  54 overlay_mux_fini(void)
  55 {
  56         mutex_destroy(&overlay_mux_lock);
  57         list_destroy(&overlay_mux_list);
  58 }
  59 
  60 static int
  61 overlay_mux_comparator(const void *a, const void *b)
  62 {
  63         const overlay_dev_t *odl, *odr;
  64         odl = a;
  65         odr = b;
  66         if (odl->odd_vid > odr->odd_vid)
  67                 return (1);
  68         else if (odl->odd_vid < odr->odd_vid)
  69                 return (-1);
  70         else
  71                 return (0);
  72 }
  73 
  74 /*
  75  * This is the central receive data path. We need to decode the packet, if we
  76  * can, and then deliver it to the appropriate overlay.
  77  */
  78 /* ARGSUSED */
  79 static boolean_t
  80 overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob,
  81     void *arg)
  82 {
  83         mblk_t *mp, *nmp, *fmp;
  84         overlay_mux_t *mux = arg;
  85 
  86         /*
  87          * We may have a received a chain of messages. Each messsage in the
  88          * chain will likely have a T_unitdata_ind attached to it as an M_PROTO.
  89          * If we aren't getting that, we should probably drop that for the
  90          * moment.
  91          */
  92         for (mp = mpchain; mp != NULL; mp = nmp) {
  93                 struct T_unitdata_ind *tudi;
  94                 ovep_encap_info_t infop;
  95                 overlay_dev_t od, *odd;
  96                 int ret;
  97 
  98                 nmp = mp->b_next;
  99                 mp->b_next = NULL;
 100 
 101                 if (DB_TYPE(mp) != M_PROTO) {
 102                         OVERLAY_FREEMSG(mp, "first one isn't M_PROTO");
 103                         freemsg(mp);
 104                         continue;
 105                 }
 106 
 107                 if (mp->b_cont == NULL) {
 108                         OVERLAY_FREEMSG(mp, "missing a b_cont");
 109                         freemsg(mp);
 110                         continue;
 111                 }
 112 
 113                 tudi = (struct T_unitdata_ind *)mp->b_rptr;
 114                 if (tudi->PRIM_type != T_UNITDATA_IND) {
 115                         OVERLAY_FREEMSG(mp, "Not a T_unitdata_ind *");
 116                         freemsg(mp);
 117                         continue;
 118                 }
 119 
 120                 /*
 121                  * In the future, we'll care about the source information
 122                  * for purposes of telling varpd for oob invalidation. But for
 123                  * now, just drop that block.
 124                  */
 125                 fmp = mp;
 126                 mp = fmp->b_cont;
 127                 freeb(fmp);
 128 
 129                 /*
 130                  * Until we have VXLAN-or-other-decap HW acceleration support
 131                  * (e.g.  we support NICs that reach into VXLAN-encapsulated
 132                  * packets and check the inside-VXLAN IP packets' checksums,
 133                  * or do LSO with VXLAN), we should clear any HW-accelerated-
 134                  * performed bits.
 135                  */
 136                 DB_CKSUMFLAGS(mp) = 0;
 137 
 138                 /*
 139                  * Decap and deliver.
 140                  */
 141                 bzero(&infop, sizeof (ovep_encap_info_t));
 142                 ret = mux->omux_plugin->ovp_ops->ovpo_decap(NULL, mp, &infop);
 143                 if (ret != 0) {
 144                         OVERLAY_FREEMSG(mp, "decap failed");
 145                         freemsg(mp);
 146                         continue;
 147                 }
 148                 if (MBLKL(mp) > infop.ovdi_hdr_size) {
 149                         mp->b_rptr += infop.ovdi_hdr_size;
 150                 } else {
 151                         while (infop.ovdi_hdr_size != 0) {
 152                                 size_t rem, blkl;
 153 
 154                                 if (mp == NULL)
 155                                         break;
 156 
 157                                 blkl = MBLKL(mp);
 158                                 rem = MIN(infop.ovdi_hdr_size, blkl);
 159                                 infop.ovdi_hdr_size -= rem;
 160                                 mp->b_rptr += rem;
 161                                 if (rem == blkl) {
 162                                         fmp = mp;
 163                                         mp = fmp->b_cont;
 164                                         fmp->b_cont = NULL;
 165                                         OVERLAY_FREEMSG(mp,
 166                                             "freed a fmp block");
 167                                         freemsg(fmp);
 168                                 }
 169                         }
 170                         if (mp == NULL) {
 171                                 OVERLAY_FREEMSG(mp, "freed it all...");
 172                                 continue;
 173                         }
 174                 }
 175 
 176 
 177                 od.odd_vid = infop.ovdi_id;
 178                 mutex_enter(&mux->omux_lock);
 179                 odd = avl_find(&mux->omux_devices, &od, NULL);
 180                 if (odd == NULL) {
 181                         mutex_exit(&mux->omux_lock);
 182                         OVERLAY_FREEMSG(mp, "no matching vid");
 183                         freemsg(mp);
 184                         continue;
 185                 }
 186                 mutex_enter(&odd->odd_lock);
 187                 if ((odd->odd_flags & OVERLAY_F_MDDROP) ||
 188                     !(odd->odd_flags & OVERLAY_F_IN_MUX)) {
 189                         mutex_exit(&odd->odd_lock);
 190                         mutex_exit(&mux->omux_lock);
 191                         OVERLAY_FREEMSG(mp, "dev dropped");
 192                         freemsg(mp);
 193                         continue;
 194                 }
 195                 overlay_io_start(odd, OVERLAY_F_IN_RX);
 196                 mutex_exit(&odd->odd_lock);
 197                 mutex_exit(&mux->omux_lock);
 198 
 199                 mac_rx(odd->odd_mh, NULL, mp);
 200 
 201                 mutex_enter(&odd->odd_lock);
 202                 overlay_io_done(odd, OVERLAY_F_IN_RX);
 203                 mutex_exit(&odd->odd_lock);
 204         }
 205 
 206         return (B_TRUE);
 207 }
 208 
 209 /*
 210  * Register a given device with a socket backend. If no such device socket
 211  * exists, create a new one.
 212  */
 213 overlay_mux_t *
 214 overlay_mux_open(overlay_plugin_t *opp, int domain, int family, int protocol,
 215     struct sockaddr *addr, socklen_t len, int *errp)
 216 {
 217         int err;
 218         overlay_mux_t *mux;
 219         ksocket_t ksock;
 220 
 221         if (errp == NULL)
 222                 errp = &err;
 223 
 224         mutex_enter(&overlay_mux_lock);
 225         for (mux = list_head(&overlay_mux_list); mux != NULL;
 226             mux = list_next(&overlay_mux_list, mux)) {
 227                 if (domain == mux->omux_domain &&
 228                     family == mux->omux_family &&
 229                     protocol == mux->omux_protocol &&
 230                     len == mux->omux_alen &&
 231                     bcmp(addr, mux->omux_addr, len) == 0) {
 232 
 233                         if (opp != mux->omux_plugin) {
 234                                 *errp = EEXIST;
 235                                 return (NULL);
 236                         }
 237 
 238                         mutex_enter(&mux->omux_lock);
 239                         mux->omux_count++;
 240                         mutex_exit(&mux->omux_lock);
 241                         mutex_exit(&overlay_mux_lock);
 242                         *errp = 0;
 243                         return (mux);
 244                 }
 245         }
 246 
 247         /*
 248          * Today we aren't zone-aware and only exist in the global zone. When we
 249          * allow for things to exist in the non-global zone, we'll want to use a
 250          * credential that's actually specific to the zone.
 251          */
 252         *errp = ksocket_socket(&ksock, domain, family, protocol, KSOCKET_SLEEP,
 253             kcred);
 254         if (*errp != 0) {
 255                 mutex_exit(&overlay_mux_lock);
 256                 return (NULL);
 257         }
 258 
 259         *errp = ksocket_bind(ksock, addr, len, kcred);
 260         if (*errp != 0) {
 261                 mutex_exit(&overlay_mux_lock);
 262                 ksocket_close(ksock, kcred);
 263                 return (NULL);
 264         }
 265 
 266         /*
 267          * Ask our lower layer to optionally toggle anything they need on this
 268          * socket. Because a socket is owned by a single type of plugin, we can
 269          * then ask it to perform any additional socket set up it'd like to do.
 270          */
 271         if (opp->ovp_ops->ovpo_sockopt != NULL &&
 272             (*errp = opp->ovp_ops->ovpo_sockopt(ksock)) != 0) {
 273                 mutex_exit(&overlay_mux_lock);
 274                 ksocket_close(ksock, kcred);
 275                 return (NULL);
 276         }
 277 
 278         mux = kmem_alloc(sizeof (overlay_mux_t), KM_SLEEP);
 279         list_link_init(&mux->omux_lnode);
 280         mux->omux_ksock = ksock;
 281         mux->omux_plugin = opp;
 282         mux->omux_domain = domain;
 283         mux->omux_family = family;
 284         mux->omux_protocol = protocol;
 285         mux->omux_addr = kmem_alloc(len, KM_SLEEP);
 286         bcopy(addr, mux->omux_addr, len);
 287         mux->omux_alen = len;
 288         mux->omux_count = 1;
 289         avl_create(&mux->omux_devices, overlay_mux_comparator,
 290             sizeof (overlay_dev_t), offsetof(overlay_dev_t, odd_muxnode));
 291         mutex_init(&mux->omux_lock, NULL, MUTEX_DRIVER, NULL);
 292 
 293 
 294         /* Once this is called, we need to expect to rx data */
 295         *errp = ksocket_krecv_set(ksock, overlay_mux_recv, mux);
 296         if (*errp != 0) {
 297                 ksocket_close(ksock, kcred);
 298                 mutex_destroy(&mux->omux_lock);
 299                 avl_destroy(&mux->omux_devices);
 300                 kmem_free(mux->omux_addr, len);
 301                 kmem_free(mux, sizeof (overlay_mux_t));
 302                 return (NULL);
 303         }
 304 
 305         list_insert_tail(&overlay_mux_list, mux);
 306         mutex_exit(&overlay_mux_lock);
 307 
 308         *errp = 0;
 309         return (mux);
 310 }
 311 
 312 void
 313 overlay_mux_close(overlay_mux_t *mux)
 314 {
 315         mutex_enter(&overlay_mux_lock);
 316         mutex_enter(&mux->omux_lock);
 317         mux->omux_count--;
 318         if (mux->omux_count != 0) {
 319                 mutex_exit(&mux->omux_lock);
 320                 mutex_exit(&overlay_mux_lock);
 321                 return;
 322         }
 323         list_remove(&overlay_mux_list, mux);
 324         mutex_exit(&mux->omux_lock);
 325         mutex_exit(&overlay_mux_lock);
 326 
 327         ksocket_close(mux->omux_ksock, kcred);
 328         avl_destroy(&mux->omux_devices);
 329         kmem_free(mux->omux_addr, mux->omux_alen);
 330         kmem_free(mux, sizeof (overlay_mux_t));
 331 }
 332 
 333 void
 334 overlay_mux_add_dev(overlay_mux_t *mux, overlay_dev_t *odd)
 335 {
 336         mutex_enter(&mux->omux_lock);
 337         avl_add(&mux->omux_devices, odd);
 338         mutex_exit(&mux->omux_lock);
 339 }
 340 
 341 void
 342 overlay_mux_remove_dev(overlay_mux_t *mux, overlay_dev_t *odd)
 343 {
 344         mutex_enter(&mux->omux_lock);
 345         avl_remove(&mux->omux_devices, odd);
 346         mutex_exit(&mux->omux_lock);
 347 }
 348 
 349 int
 350 overlay_mux_tx(overlay_mux_t *mux, struct msghdr *hdr, mblk_t *mp)
 351 {
 352         int ret;
 353 
 354         /*
 355          * It'd be nice to be able to use MSG_MBLK_QUICKRELE, unfortunately,
 356          * that isn't actually supported by UDP at this time.
 357          */
 358         ret = ksocket_sendmblk(mux->omux_ksock, hdr, 0, &mp, kcred);
 359         if (ret != 0)
 360                 freemsg(mp);
 361 
 362         return (ret);
 363 }