1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright 2019 Joyent, Inc.
  14  */
  15 
  16 /*
  17  * Overlay device ksocket multiplexer.
  18  *
  19  * For more information, see the big theory statement in
  20  * uts/common/io/overlay/overlay.c
  21  */
  22 
  23 #include <sys/types.h>
  24 #include <sys/socket.h>
  25 #include <sys/ksynch.h>
  26 #include <sys/ksocket.h>
  27 #include <sys/avl.h>
  28 #include <sys/list.h>
  29 #include <sys/pattr.h>
  30 #include <sys/sysmacros.h>
  31 #include <sys/strsubr.h>
  32 #include <sys/strsun.h>
  33 #include <sys/tihdr.h>
  34 
  35 #include <sys/overlay_impl.h>
  36 
  37 #include <sys/sdt.h>
  38 
/*
 * Fire an SDT probe recording that the message 'mp' is being dropped, along
 * with a short human-readable 'reason' string. This macro only fires the
 * probe; it does not free anything, so callers still call freemsg()/freeb()
 * themselves.
 *
 * NOTE(review): the probe name is spelled "fremsg". It is left alone here
 * because existing DTrace scripts may match on that spelling -- confirm
 * before renaming.
 */
#define OVERLAY_FREEMSG(mp, reason) \
    DTRACE_PROBE2(overlay__fremsg, mblk_t *, mp, char *, reason)

/*
 * All currently open muxes (linked through omux_lnode), and the lock that is
 * held while the list is searched or modified.
 */
static list_t overlay_mux_list;
static kmutex_t overlay_mux_lock;
  44 
  45 void
  46 overlay_mux_init(void)
  47 {
  48         list_create(&overlay_mux_list, sizeof (overlay_mux_t),
  49             offsetof(overlay_mux_t, omux_lnode));
  50         mutex_init(&overlay_mux_lock, NULL, MUTEX_DRIVER, NULL);
  51 }
  52 
  53 void
  54 overlay_mux_fini(void)
  55 {
  56         mutex_destroy(&overlay_mux_lock);
  57         list_destroy(&overlay_mux_list);
  58 }
  59 
  60 static int
  61 overlay_mux_comparator(const void *a, const void *b)
  62 {
  63         const overlay_dev_t *odl, *odr;
  64         odl = a;
  65         odr = b;
  66         if (odl->odd_vid > odr->odd_vid)
  67                 return (1);
  68         else if (odl->odd_vid < odr->odd_vid)
  69                 return (-1);
  70         else
  71                 return (0);
  72 }
  73 
  74 /*
  75  * This is the central receive data path. We need to decode the packet, if we
  76  * can, and then deliver it to the appropriate overlay.
  77  */
  78 /* ARGSUSED */
  79 static boolean_t
  80 overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob,
  81     void *arg)
  82 {
  83         mblk_t *mp, *nmp, *fmp;
  84         overlay_mux_t *mux = arg;
  85 
  86         /*
  87          * We may have a received a chain of messages. Each messsage in the
  88          * chain will likely have a T_unitdata_ind attached to it as an M_PROTO.
  89          * If we aren't getting that, we should probably drop that for the
  90          * moment.
  91          */
  92         for (mp = mpchain; mp != NULL; mp = nmp) {
  93                 struct T_unitdata_ind *tudi;
  94                 ovep_encap_info_t infop;
  95                 overlay_dev_t od, *odd;
  96                 int ret;
  97 
  98                 nmp = mp->b_next;
  99                 mp->b_next = NULL;
 100 
 101                 if (DB_TYPE(mp) != M_PROTO) {
 102                         OVERLAY_FREEMSG(mp, "first one isn't M_PROTO");
 103                         freemsg(mp);
 104                         continue;
 105                 }
 106 
 107                 if (mp->b_cont == NULL) {
 108                         OVERLAY_FREEMSG(mp, "missing a b_cont");
 109                         freemsg(mp);
 110                         continue;
 111                 }
 112 
 113                 tudi = (struct T_unitdata_ind *)mp->b_rptr;
 114                 if (tudi->PRIM_type != T_UNITDATA_IND) {
 115                         OVERLAY_FREEMSG(mp, "Not a T_unitdata_ind *");
 116                         freemsg(mp);
 117                         continue;
 118                 }
 119 
 120                 /*
 121                  * In the future, we'll care about the source information
 122                  * for purposes of telling varpd for oob invalidation. But for
 123                  * now, just drop that block.
 124                  */
 125                 fmp = mp;
 126                 mp = fmp->b_cont;
 127                 freeb(fmp);
 128 
 129                 /*
 130                  * Until we have VXLAN-or-other-decap HW acceleration support
 131                  * (e.g.  we support NICs that reach into VXLAN-encapsulated
 132                  * packets and check the inside-VXLAN IP packets' checksums,
 133                  * or do LSO with VXLAN), we should clear any HW-accelerated-
 134                  * performed bits.
 135                  */
 136                 DB_CKSUMFLAGS(mp) = 0;
 137 
 138                 /*
 139                  * Decap and deliver.
 140                  */
 141                 bzero(&infop, sizeof (ovep_encap_info_t));
 142                 ret = mux->omux_plugin->ovp_ops->ovpo_decap(NULL, mp, &infop);
 143                 if (ret != 0) {
 144                         OVERLAY_FREEMSG(mp, "decap failed");
 145                         freemsg(mp);
 146                         continue;
 147                 }
 148                 if (MBLKL(mp) > infop.ovdi_hdr_size) {
 149                         mp->b_rptr += infop.ovdi_hdr_size;
 150                 } else {
 151                         while (infop.ovdi_hdr_size != 0) {
 152                                 size_t rem, blkl;
 153 
 154                                 if (mp == NULL)
 155                                         break;
 156 
 157                                 blkl = MBLKL(mp);
 158                                 rem = MIN(infop.ovdi_hdr_size, blkl);
 159                                 infop.ovdi_hdr_size -= rem;
 160                                 mp->b_rptr += rem;
 161                                 if (rem == blkl) {
 162                                         fmp = mp;
 163                                         mp = fmp->b_cont;
 164                                         fmp->b_cont = NULL;
 165                                         OVERLAY_FREEMSG(mp,
 166                                             "freed a fmp block");
 167                                         freemsg(fmp);
 168                                 }
 169                         }
 170                         if (mp == NULL) {
 171                                 OVERLAY_FREEMSG(mp, "freed it all...");
 172                                 continue;
 173                         }
 174                 }
 175 
 176 
 177                 od.odd_vid = infop.ovdi_id;
 178                 mutex_enter(&mux->omux_lock);
 179                 odd = avl_find(&mux->omux_devices, &od, NULL);
 180                 if (odd == NULL) {
 181                         mutex_exit(&mux->omux_lock);
 182                         OVERLAY_FREEMSG(mp, "no matching vid");
 183                         freemsg(mp);
 184                         continue;
 185                 }
 186                 mutex_enter(&odd->odd_lock);
 187                 if ((odd->odd_flags & OVERLAY_F_MDDROP) ||
 188                     !(odd->odd_flags & OVERLAY_F_IN_MUX)) {
 189                         mutex_exit(&odd->odd_lock);
 190                         mutex_exit(&mux->omux_lock);
 191                         OVERLAY_FREEMSG(mp, "dev dropped");
 192                         freemsg(mp);
 193                         continue;
 194                 }
 195                 overlay_io_start(odd, OVERLAY_F_IN_RX);
 196                 mutex_exit(&odd->odd_lock);
 197                 mutex_exit(&mux->omux_lock);
 198 
 199                 mac_rx(odd->odd_mh, NULL, mp);
 200 
 201                 mutex_enter(&odd->odd_lock);
 202                 overlay_io_done(odd, OVERLAY_F_IN_RX);
 203                 mutex_exit(&odd->odd_lock);
 204         }
 205 
 206         return (B_TRUE);
 207 }
 208 
 209 /*
 210  * Kernel socket callback to indicate the socket itself is able to send
 211  * data again.  Check for devices on this mux that were send-blocked,
 212  * and clear them.
 213  */
 214 /* ARGSUSED */
 215 static void
 216 overlay_mux_cansend_now(ksocket_t ksock, ksocket_callback_event_t event,
 217     void *arg, uintptr_t ignore_me)
 218 {
 219         overlay_mux_t *mux = (overlay_mux_t *)arg;
 220         overlay_dev_t *odd;
 221         mac_handle_t *mhs_to_update, *current_mh;
 222         size_t allocsize;
 223 
 224         ASSERT3P(ksock, ==, mux->omux_ksock);
 225         ASSERT3U(event, ==, KSOCKET_EV_CANSEND);
 226 
 227         /* Traverse omux_devices and check for ones marked as send-blocked. */
 228         mutex_enter(&mux->omux_lock);
 229         if (mux->omux_count == 0) {
 230                 /* Nothing to wake up. */
 231                 mutex_exit(&mux->omux_lock);
 232                 return;
 233         }
 234         allocsize = sizeof (mac_handle_t) * mux->omux_count;
 235         mhs_to_update = kmem_zalloc(allocsize, KM_NOSLEEP);
 236         VERIFY(mhs_to_update != NULL);  /* Failure should be rare. */
 237         current_mh = mhs_to_update;
 238 
 239         for (odd = avl_first(&mux->omux_devices); odd != NULL;
 240             odd = AVL_NEXT(&mux->omux_devices, odd)) {
 241                 mac_handle_t odd_mh = NULL;
 242 
 243                 mutex_enter(&odd->odd_lock);
 244                 if ((odd->odd_flags & OVERLAY_F_TXSTOPPED) != 0) {
 245                         /* Get ready to tell MAC it can transmit again. */
 246                         odd->odd_flags &= ~OVERLAY_F_TXSTOPPED;
 247                         odd_mh = odd->odd_mh;
 248                 }
 249                 mutex_exit(&odd->odd_lock);
 250                 if (odd_mh != NULL) {
 251                         *current_mh = odd_mh;
 252                         current_mh++;
 253                 }
 254         }
 255         mutex_exit(&mux->omux_lock);
 256 
 257         /*
 258          * Yes, I'm using the value-then-decrement.  "current_mh" is
 259          * guaranteed to be at least one ahead of mhs_to_update if there are
 260          * any mac handles that need updating.  I also have to do this outside
 261          * the omux lock because the tx_update may trigger immediate or
 262          * concurrent packet transmission.
 263          */
 264         while (current_mh-- != mhs_to_update)
 265                 mac_tx_update(*current_mh);
 266 
 267         kmem_free(mhs_to_update, allocsize);
 268 }
 269 
 270 /*
 271  * Register a given device with a socket backend. If no such device socket
 272  * exists, create a new one.
 273  */
 274 overlay_mux_t *
 275 overlay_mux_open(overlay_plugin_t *opp, int domain, int family, int protocol,
 276     struct sockaddr *addr, socklen_t len, int *errp)
 277 {
 278         int err;
 279         overlay_mux_t *mux;
 280         ksocket_t ksock;
 281         ksocket_callbacks_t ks_cb = { 0 };
 282 
 283         if (errp == NULL)
 284                 errp = &err;
 285 
 286         mutex_enter(&overlay_mux_lock);
 287         for (mux = list_head(&overlay_mux_list); mux != NULL;
 288             mux = list_next(&overlay_mux_list, mux)) {
 289                 if (domain == mux->omux_domain &&
 290                     family == mux->omux_family &&
 291                     protocol == mux->omux_protocol &&
 292                     len == mux->omux_alen &&
 293                     bcmp(addr, mux->omux_addr, len) == 0) {
 294 
 295                         if (opp != mux->omux_plugin) {
 296                                 *errp = EEXIST;
 297                                 return (NULL);
 298                         }
 299 
 300                         mutex_enter(&mux->omux_lock);
 301                         mux->omux_count++;
 302                         mutex_exit(&mux->omux_lock);
 303                         mutex_exit(&overlay_mux_lock);
 304                         *errp = 0;
 305                         return (mux);
 306                 }
 307         }
 308 
 309         /*
 310          * Today we aren't zone-aware and only exist in the global zone. When we
 311          * allow for things to exist in the non-global zone, we'll want to use a
 312          * credential that's actually specific to the zone.
 313          */
 314         *errp = ksocket_socket(&ksock, domain, family, protocol, KSOCKET_SLEEP,
 315             kcred);
 316         if (*errp != 0) {
 317                 mutex_exit(&overlay_mux_lock);
 318                 return (NULL);
 319         }
 320 
 321         *errp = ksocket_bind(ksock, addr, len, kcred);
 322         if (*errp != 0) {
 323                 mutex_exit(&overlay_mux_lock);
 324                 ksocket_close(ksock, kcred);
 325                 return (NULL);
 326         }
 327 
 328         /*
 329          * Ask our lower layer to optionally toggle anything they need on this
 330          * socket. Because a socket is owned by a single type of plugin, we can
 331          * then ask it to perform any additional socket set up it'd like to do.
 332          */
 333         if (opp->ovp_ops->ovpo_sockopt != NULL &&
 334             (*errp = opp->ovp_ops->ovpo_sockopt(ksock)) != 0) {
 335                 mutex_exit(&overlay_mux_lock);
 336                 ksocket_close(ksock, kcred);
 337                 return (NULL);
 338         }
 339 
 340         mux = kmem_alloc(sizeof (overlay_mux_t), KM_SLEEP);
 341         list_link_init(&mux->omux_lnode);
 342         mux->omux_ksock = ksock;
 343         mux->omux_plugin = opp;
 344         mux->omux_domain = domain;
 345         mux->omux_family = family;
 346         mux->omux_protocol = protocol;
 347         mux->omux_addr = kmem_alloc(len, KM_SLEEP);
 348         bcopy(addr, mux->omux_addr, len);
 349         mux->omux_alen = len;
 350         mux->omux_count = 1;
 351         avl_create(&mux->omux_devices, overlay_mux_comparator,
 352             sizeof (overlay_dev_t), offsetof(overlay_dev_t, odd_muxnode));
 353         mutex_init(&mux->omux_lock, NULL, MUTEX_DRIVER, NULL);
 354 
 355 #if defined(OVERLAY_PINCH) || defined(OVERLAY_FC_TEST)
 356         /* Set the xmit buf to a REALLY SMALL value, say 12k (1-3 packets) */
 357         int bufsize = 12 * 1024;
 358 
 359         if (ksocket_setsockopt(ksock, SOL_SOCKET, SO_SNDBUF,
 360                 (const void *)&bufsize, sizeof (bufsize), CRED()) != 0) {
 361                 ksocket_close(ksock, kcred);
 362                 mutex_destroy(&mux->omux_lock);
 363                 avl_destroy(&mux->omux_devices);
 364                 kmem_free(mux->omux_addr, len);
 365                 kmem_free(mux, sizeof (overlay_mux_t));
 366                 return (NULL);
 367         }
 368 #endif
 369         /*
 370          * Set a callback in case we hit socket flow control and need to know
 371          * when it's ready to send again.  See the aforementioned
 372          * ksocket_socket() comments about the use of kcred vs. being
 373          * zone-aware.
 374          */
 375         ks_cb.ksock_cb_cansend = overlay_mux_cansend_now;
 376         if (ksocket_setcallbacks(ksock, &ks_cb, mux, kcred) != 0) {
 377                 ksocket_close(ksock, kcred);
 378                 mutex_destroy(&mux->omux_lock);
 379                 avl_destroy(&mux->omux_devices);
 380                 kmem_free(mux->omux_addr, len);
 381                 kmem_free(mux, sizeof (overlay_mux_t));
 382                 return (NULL);
 383         }
 384 
 385         /* Once this is called, we need to expect to rx data */
 386         *errp = ksocket_krecv_set(ksock, overlay_mux_recv, mux);
 387         if (*errp != 0) {
 388                 ksocket_close(ksock, kcred);
 389                 mutex_destroy(&mux->omux_lock);
 390                 avl_destroy(&mux->omux_devices);
 391                 kmem_free(mux->omux_addr, len);
 392                 kmem_free(mux, sizeof (overlay_mux_t));
 393                 return (NULL);
 394         }
 395 
 396         list_insert_tail(&overlay_mux_list, mux);
 397         mutex_exit(&overlay_mux_lock);
 398 
 399         *errp = 0;
 400         return (mux);
 401 }
 402 
 403 void
 404 overlay_mux_close(overlay_mux_t *mux)
 405 {
 406         mutex_enter(&overlay_mux_lock);
 407         mutex_enter(&mux->omux_lock);
 408         mux->omux_count--;
 409         if (mux->omux_count != 0) {
 410                 mutex_exit(&mux->omux_lock);
 411                 mutex_exit(&overlay_mux_lock);
 412                 return;
 413         }
 414         list_remove(&overlay_mux_list, mux);
 415         mutex_exit(&mux->omux_lock);
 416         mutex_exit(&overlay_mux_lock);
 417 
 418         ksocket_close(mux->omux_ksock, kcred);
 419         avl_destroy(&mux->omux_devices);
 420         kmem_free(mux->omux_addr, mux->omux_alen);
 421         kmem_free(mux, sizeof (overlay_mux_t));
 422 }
 423 
 424 void
 425 overlay_mux_add_dev(overlay_mux_t *mux, overlay_dev_t *odd)
 426 {
 427         mutex_enter(&mux->omux_lock);
 428         avl_add(&mux->omux_devices, odd);
 429         mutex_exit(&mux->omux_lock);
 430 }
 431 
 432 void
 433 overlay_mux_remove_dev(overlay_mux_t *mux, overlay_dev_t *odd)
 434 {
 435         mutex_enter(&mux->omux_lock);
 436         avl_remove(&mux->omux_devices, odd);
 437         mutex_exit(&mux->omux_lock);
 438 }
 439 
 440 int
 441 overlay_mux_tx(overlay_mux_t *mux, struct msghdr *hdr, mblk_t *mp)
 442 {
 443         int ret;
 444 
 445         /*
 446          * It'd be nice to be able to use MSG_MBLK_QUICKRELE, unfortunately,
 447          * that isn't actually supported by UDP at this time.
 448          *
 449          * Send with MSG_DONTWAIT to indicate clogged UDP sockets upstack.
 450          */
 451         ret = ksocket_sendmblk(mux->omux_ksock, hdr, MSG_DONTWAIT, &mp, kcred);
 452         /*
 453          * NOTE: ksocket_sendmblk() may send partial packets downstack,
 454          * returning what's not sent in &mp (i.e. mp pre-call might be a
 455          * b_cont of mp post-call).  We can't hold up this message (it's a
 456          * datagram), so we drop, and let the caller cope.
 457          */
 458         if (ret != 0)
 459                 freemsg(mp);
 460 
 461         return (ret);
 462 }