1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright 2015 Joyent, Inc.
  14  */
  15 
  16 /*
  17  * Overlay device ksocket multiplexer.
  18  *
  19  * For more information, see the big theory statement in
  20  * uts/common/io/overlay/overlay.c
  21  */
  22 
  23 #include <sys/types.h>
  24 #include <sys/socket.h>
  25 #include <sys/ksynch.h>
  26 #include <sys/ksocket.h>
  27 #include <sys/avl.h>
  28 #include <sys/list.h>
  29 #include <sys/sysmacros.h>
  30 #include <sys/strsubr.h>
  31 #include <sys/strsun.h>
  32 #include <sys/tihdr.h>
  33 
  34 #include <sys/overlay_impl.h>
  35 
  36 #include <sys/sdt.h>
  37 
/*
 * Fire an SDT probe recording that the message mp is being dropped, along
 * with a human readable reason. The macro only reports the drop; it does not
 * free anything itself.
 */
#define OVERLAY_FREEMSG(mp, reason) \
    DTRACE_PROBE2(overlay__fremsg, mblk_t *, mp, char *, reason)

/*
 * List of overlay_mux_t structures, linked through omux_lnode.
 * overlay_mux_open() and overlay_mux_close() hold overlay_mux_lock while
 * they search or modify the list.
 */
static list_t overlay_mux_list;
static kmutex_t overlay_mux_lock;
  43 
  44 void
  45 overlay_mux_init(void)
  46 {
  47         list_create(&overlay_mux_list, sizeof (overlay_mux_t),
  48             offsetof(overlay_mux_t, omux_lnode));
  49         mutex_init(&overlay_mux_lock, NULL, MUTEX_DRIVER, NULL);
  50 }
  51 
  52 void
  53 overlay_mux_fini(void)
  54 {
  55         mutex_destroy(&overlay_mux_lock);
  56         list_destroy(&overlay_mux_list);
  57 }
  58 
  59 static int
  60 overlay_mux_comparator(const void *a, const void *b)
  61 {
  62         const overlay_dev_t *odl, *odr;
  63         odl = a;
  64         odr = b;
  65         if (odl->odd_vid > odr->odd_vid)
  66                 return (1);
  67         else if (odl->odd_vid < odr->odd_vid)
  68                 return (-1);
  69         else
  70                 return (0);
  71 }
  72 
  73 /*
  74  * This is the central receive data path. We need to decode the packet, if we
  75  * can, and then deliver it to the appropriate overlay.
  76  */
  77 /* ARGSUSED */
  78 static boolean_t
  79 overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob,
  80     void *arg)
  81 {
  82         mblk_t *mp, *nmp, *fmp;
  83         overlay_mux_t *mux = arg;
  84 
  85         /*
  86          * We may have a received a chain of messages. Each messsage in the
  87          * chain will likely have a T_unitdata_ind attached to it as an M_PROTO.
  88          * If we aren't getting that, we should probably drop that for the
  89          * moment.
  90          */
  91         for (mp = mpchain; mp != NULL; mp = nmp) {
  92                 struct T_unitdata_ind *tudi;
  93                 ovep_encap_info_t infop;
  94                 overlay_dev_t od, *odd;
  95                 int ret;
  96 
  97                 nmp = mp->b_next;
  98                 mp->b_next = NULL;
  99 
 100                 if (DB_TYPE(mp) != M_PROTO) {
 101                         OVERLAY_FREEMSG(mp, "first one isn't M_PROTO");
 102                         freemsg(mp);
 103                         continue;
 104                 }
 105 
 106                 if (mp->b_cont == NULL) {
 107                         OVERLAY_FREEMSG(mp, "missing a b_cont");
 108                         freemsg(mp);
 109                         continue;
 110                 }
 111 
 112                 tudi = (struct T_unitdata_ind *)mp->b_rptr;
 113                 if (tudi->PRIM_type != T_UNITDATA_IND) {
 114                         OVERLAY_FREEMSG(mp, "Not a T_unitdata_ind *");
 115                         freemsg(mp);
 116                         continue;
 117                 }
 118 
 119                 /*
 120                  * In the future, we'll care about the source information
 121                  * for purposes of telling varpd for oob invalidation. But for
 122                  * now, just drop that block.
 123                  */
 124                 fmp = mp;
 125                 mp = fmp->b_cont;
 126                 fmp->b_cont = NULL;
 127                 freemsg(fmp);
 128 
 129                 /*
 130                  * Decap and deliver.
 131                  */
 132                 bzero(&infop, sizeof (ovep_encap_info_t));
 133                 ret = mux->omux_plugin->ovp_ops->ovpo_decap(NULL, mp, &infop);
 134                 if (ret != 0) {
 135                         OVERLAY_FREEMSG(mp, "decap failed");
 136                         freemsg(mp);
 137                         continue;
 138                 }
 139                 if (MBLKL(mp) > infop.ovdi_hdr_size) {
 140                         mp->b_rptr += infop.ovdi_hdr_size;
 141                 } else {
 142                         while (infop.ovdi_hdr_size != 0) {
 143                                 size_t rem, blkl;
 144 
 145                                 if (mp == NULL)
 146                                         break;
 147 
 148                                 blkl = MBLKL(mp);
 149                                 rem = MIN(infop.ovdi_hdr_size, blkl);
 150                                 infop.ovdi_hdr_size -= rem;
 151                                 mp->b_rptr += rem;
 152                                 if (rem == blkl) {
 153                                         fmp = mp;
 154                                         mp = fmp->b_cont;
 155                                         fmp->b_cont = NULL;
 156                                         OVERLAY_FREEMSG(mp,
 157                                             "freed a fmp block");
 158                                         freemsg(fmp);
 159                                 }
 160                         }
 161                         if (mp == NULL) {
 162                                 OVERLAY_FREEMSG(mp, "freed it all...");
 163                                 continue;
 164                         }
 165                 }
 166 
 167 
 168                 od.odd_vid = infop.ovdi_id;
 169                 mutex_enter(&mux->omux_lock);
 170                 odd = avl_find(&mux->omux_devices, &od, NULL);
 171                 if (odd == NULL) {
 172                         mutex_exit(&mux->omux_lock);
 173                         OVERLAY_FREEMSG(mp, "no matching vid");
 174                         freemsg(mp);
 175                         continue;
 176                 }
 177                 mutex_enter(&odd->odd_lock);
 178                 if ((odd->odd_flags & OVERLAY_F_MDDROP) ||
 179                     !(odd->odd_flags & OVERLAY_F_IN_MUX)) {
 180                         mutex_exit(&odd->odd_lock);
 181                         mutex_exit(&mux->omux_lock);
 182                         OVERLAY_FREEMSG(mp, "dev dropped");
 183                         freemsg(mp);
 184                         continue;
 185                 }
 186                 overlay_io_start(odd, OVERLAY_F_IN_RX);
 187                 mutex_exit(&odd->odd_lock);
 188                 mutex_exit(&mux->omux_lock);
 189 
 190                 mac_rx(odd->odd_mh, NULL, mp);
 191 
 192                 mutex_enter(&odd->odd_lock);
 193                 overlay_io_done(odd, OVERLAY_F_IN_RX);
 194                 mutex_exit(&odd->odd_lock);
 195         }
 196 
 197         return (B_TRUE);
 198 }
 199 
 200 /*
 201  * Register a given device with a socket backend. If no such device socket
 202  * exists, create a new one.
 203  */
 204 overlay_mux_t *
 205 overlay_mux_open(overlay_plugin_t *opp, int domain, int family, int protocol,
 206     struct sockaddr *addr, socklen_t len, int *errp)
 207 {
 208         int err;
 209         overlay_mux_t *mux;
 210         ksocket_t ksock;
 211 
 212         if (errp == NULL)
 213                 errp = &err;
 214 
 215         mutex_enter(&overlay_mux_lock);
 216         for (mux = list_head(&overlay_mux_list); mux != NULL;
 217             mux = list_next(&overlay_mux_list, mux)) {
 218                 if (domain == mux->omux_domain &&
 219                     family == mux->omux_family &&
 220                     protocol == mux->omux_protocol &&
 221                     len == mux->omux_alen &&
 222                     bcmp(addr, mux->omux_addr, len) == 0) {
 223 
 224                         if (opp != mux->omux_plugin) {
 225                                 *errp = EEXIST;
 226                                 return (NULL);
 227                         }
 228 
 229                         mutex_enter(&mux->omux_lock);
 230                         mux->omux_count++;
 231                         mutex_exit(&mux->omux_lock);
 232                         mutex_exit(&overlay_mux_lock);
 233                         *errp = 0;
 234                         return (mux);
 235                 }
 236         }
 237 
 238         /*
 239          * Today we aren't zone-aware and only exist in the global zone. When we
 240          * allow for things to exist in the non-global zone, we'll want to use a
 241          * credential that's actually specific to the zone.
 242          */
 243         *errp = ksocket_socket(&ksock, domain, family, protocol, KSOCKET_SLEEP,
 244             kcred);
 245         if (*errp != 0) {
 246                 mutex_exit(&overlay_mux_lock);
 247                 return (NULL);
 248         }
 249 
 250         *errp = ksocket_bind(ksock, addr, len, kcred);
 251         if (*errp != 0) {
 252                 mutex_exit(&overlay_mux_lock);
 253                 ksocket_close(ksock, kcred);
 254                 return (NULL);
 255         }
 256 
 257         /*
 258          * Ask our lower layer to optionally toggle anything they need on this
 259          * socket. Because a socket is owned by a single type of plugin, we can
 260          * then ask it to perform any additional socket set up it'd like to do.
 261          */
 262         if (opp->ovp_ops->ovpo_sockopt != NULL &&
 263             (*errp = opp->ovp_ops->ovpo_sockopt(ksock)) != 0) {
 264                 mutex_exit(&overlay_mux_lock);
 265                 ksocket_close(ksock, kcred);
 266                 return (NULL);
 267         }
 268 
 269         mux = kmem_alloc(sizeof (overlay_mux_t), KM_SLEEP);
 270         list_link_init(&mux->omux_lnode);
 271         mux->omux_ksock = ksock;
 272         mux->omux_plugin = opp;
 273         mux->omux_domain = domain;
 274         mux->omux_family = family;
 275         mux->omux_protocol = protocol;
 276         mux->omux_addr = kmem_alloc(len, KM_SLEEP);
 277         bcopy(addr, mux->omux_addr, len);
 278         mux->omux_alen = len;
 279         mux->omux_count = 1;
 280         avl_create(&mux->omux_devices, overlay_mux_comparator,
 281             sizeof (overlay_dev_t), offsetof(overlay_dev_t, odd_muxnode));
 282         mutex_init(&mux->omux_lock, NULL, MUTEX_DRIVER, NULL);
 283 
 284 
 285         /* Once this is called, we need to expect to rx data */
 286         *errp = ksocket_krecv_set(ksock, overlay_mux_recv, mux);
 287         if (*errp != 0) {
 288                 ksocket_close(ksock, kcred);
 289                 mutex_destroy(&mux->omux_lock);
 290                 avl_destroy(&mux->omux_devices);
 291                 kmem_free(mux->omux_addr, len);
 292                 kmem_free(mux, sizeof (overlay_mux_t));
 293                 return (NULL);
 294         }
 295 
 296         list_insert_tail(&overlay_mux_list, mux);
 297         mutex_exit(&overlay_mux_lock);
 298 
 299         *errp = 0;
 300         return (mux);
 301 }
 302 
 303 void
 304 overlay_mux_close(overlay_mux_t *mux)
 305 {
 306         mutex_enter(&overlay_mux_lock);
 307         mutex_enter(&mux->omux_lock);
 308         mux->omux_count--;
 309         if (mux->omux_count != 0) {
 310                 mutex_exit(&mux->omux_lock);
 311                 mutex_exit(&overlay_mux_lock);
 312                 return;
 313         }
 314         list_remove(&overlay_mux_list, mux);
 315         mutex_exit(&mux->omux_lock);
 316         mutex_exit(&overlay_mux_lock);
 317 
 318         ksocket_close(mux->omux_ksock, kcred);
 319         avl_destroy(&mux->omux_devices);
 320         kmem_free(mux->omux_addr, mux->omux_alen);
 321         kmem_free(mux, sizeof (overlay_mux_t));
 322 }
 323 
 324 void
 325 overlay_mux_add_dev(overlay_mux_t *mux, overlay_dev_t *odd)
 326 {
 327         mutex_enter(&mux->omux_lock);
 328         avl_add(&mux->omux_devices, odd);
 329         mutex_exit(&mux->omux_lock);
 330 }
 331 
 332 void
 333 overlay_mux_remove_dev(overlay_mux_t *mux, overlay_dev_t *odd)
 334 {
 335         mutex_enter(&mux->omux_lock);
 336         avl_remove(&mux->omux_devices, odd);
 337         mutex_exit(&mux->omux_lock);
 338 }
 339 
 340 int
 341 overlay_mux_tx(overlay_mux_t *mux, struct msghdr *hdr, mblk_t *mp)
 342 {
 343         int ret;
 344 
 345         /*
 346          * It'd be nice to be able to use MSG_MBLK_QUICKRELE, unfortunately,
 347          * that isn't actually supported by UDP at this time.
 348          *
 349          * Send with MSG_DONTWAIT to indicate clogged UDP sockets upstack.
 350          */
 351         ret = ksocket_sendmblk(mux->omux_ksock, hdr, MSG_DONTWAIT, &mp, kcred);
 352         /*
 353          * NOTE: ksocket_sendmblk() may send partial packets downstack,
 354          * returning what's not sent in &mp (i.e. mp pre-call might be a
 355          * b_cont of mp post-call).  We can't hold up this message (it's a
 356          * datagram), so we drop, and let the caller cope.
 357          */
 358         if (ret != 0)
 359                 freemsg(mp);
 360 
 361         return (ret);
 362 }