Print this page
    
OS-7088 cyclics corked on overlay socket with full queue
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/io/overlay/overlay_mux.c
          +++ new/usr/src/uts/common/io/overlay/overlay_mux.c
   1    1  /*
   2    2   * This file and its contents are supplied under the terms of the
   3    3   * Common Development and Distribution License ("CDDL"), version 1.0.
   4    4   * You may only use this file in accordance with the terms of version
   5    5   * 1.0 of the CDDL.
   6    6   *
   7    7   * A full copy of the text of the CDDL should have accompanied this
   8    8   * source.  A copy of the CDDL is also available via the Internet at
   9    9   * http://www.illumos.org/license/CDDL.
  10   10   */
  11   11  
  12   12  /*
  13   13   * Copyright 2019 Joyent, Inc.
  14   14   */
  15   15  
  16   16  /*
  17   17   * Overlay device ksocket multiplexer.
  18   18   *
  19   19   * For more information, see the big theory statement in
  20   20   * uts/common/io/overlay/overlay.c
  21   21   */
  22   22  
  23   23  #include <sys/types.h>
  24   24  #include <sys/socket.h>
  25   25  #include <sys/ksynch.h>
  26   26  #include <sys/ksocket.h>
  27   27  #include <sys/avl.h>
  28   28  #include <sys/list.h>
  29   29  #include <sys/pattr.h>
  30   30  #include <sys/sysmacros.h>
  31   31  #include <sys/strsubr.h>
  32   32  #include <sys/strsun.h>
  33   33  #include <sys/tihdr.h>
  34   34  
  35   35  #include <sys/overlay_impl.h>
  36   36  
  37   37  #include <sys/sdt.h>
  38   38  
  39   39  #define OVERLAY_FREEMSG(mp, reason) \
  40   40      DTRACE_PROBE2(overlay__fremsg, mblk_t *, mp, char *, reason)
  41   41  
  42   42  static list_t overlay_mux_list;
  43   43  static kmutex_t overlay_mux_lock;
  44   44  
  45   45  void
  46   46  overlay_mux_init(void)
  47   47  {
  48   48          list_create(&overlay_mux_list, sizeof (overlay_mux_t),
  49   49              offsetof(overlay_mux_t, omux_lnode));
  50   50          mutex_init(&overlay_mux_lock, NULL, MUTEX_DRIVER, NULL);
  51   51  }
  52   52  
  53   53  void
  54   54  overlay_mux_fini(void)
  55   55  {
  56   56          mutex_destroy(&overlay_mux_lock);
  57   57          list_destroy(&overlay_mux_list);
  58   58  }
  59   59  
  60   60  static int
  61   61  overlay_mux_comparator(const void *a, const void *b)
  62   62  {
  63   63          const overlay_dev_t *odl, *odr;
  64   64          odl = a;
  65   65          odr = b;
  66   66          if (odl->odd_vid > odr->odd_vid)
  67   67                  return (1);
  68   68          else if (odl->odd_vid < odr->odd_vid)
  69   69                  return (-1);
  70   70          else
  71   71                  return (0);
  72   72  }
  73   73  
  74   74  /*
  75   75   * This is the central receive data path. We need to decode the packet, if we
  76   76   * can, and then deliver it to the appropriate overlay.
  77   77   */
  78   78  /* ARGSUSED */
  79   79  static boolean_t
  80   80  overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob,
  81   81      void *arg)
  82   82  {
  83   83          mblk_t *mp, *nmp, *fmp;
  84   84          overlay_mux_t *mux = arg;
  85   85  
  86   86          /*
  87   87           * We may have a received a chain of messages. Each messsage in the
  88   88           * chain will likely have a T_unitdata_ind attached to it as an M_PROTO.
  89   89           * If we aren't getting that, we should probably drop that for the
  90   90           * moment.
  91   91           */
  92   92          for (mp = mpchain; mp != NULL; mp = nmp) {
  93   93                  struct T_unitdata_ind *tudi;
  94   94                  ovep_encap_info_t infop;
  95   95                  overlay_dev_t od, *odd;
  96   96                  int ret;
  97   97  
  98   98                  nmp = mp->b_next;
  99   99                  mp->b_next = NULL;
 100  100  
 101  101                  if (DB_TYPE(mp) != M_PROTO) {
 102  102                          OVERLAY_FREEMSG(mp, "first one isn't M_PROTO");
 103  103                          freemsg(mp);
 104  104                          continue;
 105  105                  }
 106  106  
 107  107                  if (mp->b_cont == NULL) {
 108  108                          OVERLAY_FREEMSG(mp, "missing a b_cont");
 109  109                          freemsg(mp);
 110  110                          continue;
 111  111                  }
 112  112  
 113  113                  tudi = (struct T_unitdata_ind *)mp->b_rptr;
 114  114                  if (tudi->PRIM_type != T_UNITDATA_IND) {
 115  115                          OVERLAY_FREEMSG(mp, "Not a T_unitdata_ind *");
 116  116                          freemsg(mp);
 117  117                          continue;
 118  118                  }
 119  119  
 120  120                  /*
 121  121                   * In the future, we'll care about the source information
 122  122                   * for purposes of telling varpd for oob invalidation. But for
 123  123                   * now, just drop that block.
 124  124                   */
 125  125                  fmp = mp;
 126  126                  mp = fmp->b_cont;
 127  127                  freeb(fmp);
 128  128  
 129  129                  /*
 130  130                   * Until we have VXLAN-or-other-decap HW acceleration support
 131  131                   * (e.g.  we support NICs that reach into VXLAN-encapsulated
 132  132                   * packets and check the inside-VXLAN IP packets' checksums,
 133  133                   * or do LSO with VXLAN), we should clear any HW-accelerated-
 134  134                   * performed bits.
 135  135                   */
 136  136                  DB_CKSUMFLAGS(mp) = 0;
 137  137  
 138  138                  /*
 139  139                   * Decap and deliver.
 140  140                   */
 141  141                  bzero(&infop, sizeof (ovep_encap_info_t));
 142  142                  ret = mux->omux_plugin->ovp_ops->ovpo_decap(NULL, mp, &infop);
 143  143                  if (ret != 0) {
 144  144                          OVERLAY_FREEMSG(mp, "decap failed");
 145  145                          freemsg(mp);
 146  146                          continue;
 147  147                  }
 148  148                  if (MBLKL(mp) > infop.ovdi_hdr_size) {
 149  149                          mp->b_rptr += infop.ovdi_hdr_size;
 150  150                  } else {
 151  151                          while (infop.ovdi_hdr_size != 0) {
 152  152                                  size_t rem, blkl;
 153  153  
 154  154                                  if (mp == NULL)
 155  155                                          break;
 156  156  
 157  157                                  blkl = MBLKL(mp);
 158  158                                  rem = MIN(infop.ovdi_hdr_size, blkl);
 159  159                                  infop.ovdi_hdr_size -= rem;
 160  160                                  mp->b_rptr += rem;
 161  161                                  if (rem == blkl) {
 162  162                                          fmp = mp;
 163  163                                          mp = fmp->b_cont;
 164  164                                          fmp->b_cont = NULL;
 165  165                                          OVERLAY_FREEMSG(mp,
 166  166                                              "freed a fmp block");
 167  167                                          freemsg(fmp);
 168  168                                  }
 169  169                          }
 170  170                          if (mp == NULL) {
 171  171                                  OVERLAY_FREEMSG(mp, "freed it all...");
 172  172                                  continue;
 173  173                          }
 174  174                  }
 175  175  
 176  176  
 177  177                  od.odd_vid = infop.ovdi_id;
 178  178                  mutex_enter(&mux->omux_lock);
 179  179                  odd = avl_find(&mux->omux_devices, &od, NULL);
 180  180                  if (odd == NULL) {
 181  181                          mutex_exit(&mux->omux_lock);
 182  182                          OVERLAY_FREEMSG(mp, "no matching vid");
 183  183                          freemsg(mp);
 184  184                          continue;
 185  185                  }
 186  186                  mutex_enter(&odd->odd_lock);
 187  187                  if ((odd->odd_flags & OVERLAY_F_MDDROP) ||
 188  188                      !(odd->odd_flags & OVERLAY_F_IN_MUX)) {
 189  189                          mutex_exit(&odd->odd_lock);
 190  190                          mutex_exit(&mux->omux_lock);
 191  191                          OVERLAY_FREEMSG(mp, "dev dropped");
 192  192                          freemsg(mp);
 193  193                          continue;
 194  194                  }
 195  195                  overlay_io_start(odd, OVERLAY_F_IN_RX);
 196  196                  mutex_exit(&odd->odd_lock);
 197  197                  mutex_exit(&mux->omux_lock);
 198  198  
 199  199                  mac_rx(odd->odd_mh, NULL, mp);
  
    | 
      ↓ open down ↓ | 
    199 lines elided | 
    
      ↑ open up ↑ | 
  
 200  200  
 201  201                  mutex_enter(&odd->odd_lock);
 202  202                  overlay_io_done(odd, OVERLAY_F_IN_RX);
 203  203                  mutex_exit(&odd->odd_lock);
 204  204          }
 205  205  
 206  206          return (B_TRUE);
 207  207  }
 208  208  
 209  209  /*
      210 + * Kernel socket callback to indicate the socket itself is able to send
      211 + * data again.  Check for devices on this mux that were send-blocked,
      212 + * and clear them.
      213 + */
      214 +/* ARGSUSED */
      215 +static void
      216 +overlay_mux_cansend_now(ksocket_t ksock, ksocket_callback_event_t event,
      217 +    void *arg, uintptr_t ignore_me)
      218 +{
      219 +        overlay_mux_t *mux = (overlay_mux_t *)arg;
      220 +        overlay_dev_t *odd;
      221 +        mac_handle_t *mhs_to_update, *current_mh;
      222 +        size_t allocsize;
      223 +
      224 +        ASSERT3P(ksock, ==, mux->omux_ksock);
      225 +        ASSERT3U(event, ==, KSOCKET_EV_CANSEND);
      226 +
      227 +        /* Traverse omux_devices and check for ones marked as send-blocked. */
      228 +        mutex_enter(&mux->omux_lock);
      229 +        if (mux->omux_count == 0) {
      230 +                /* Nothing to wake up. */
      231 +                mutex_exit(&mux->omux_lock);
      232 +                return;
      233 +        }
      234 +        allocsize = sizeof (mac_handle_t) * mux->omux_count;
      235 +        mhs_to_update = kmem_zalloc(allocsize, KM_NOSLEEP);
      236 +        VERIFY(mhs_to_update != NULL);  /* Failure should be rare. */
      237 +        current_mh = mhs_to_update;
      238 +
      239 +        for (odd = avl_first(&mux->omux_devices); odd != NULL;
      240 +            odd = AVL_NEXT(&mux->omux_devices, odd)) {
      241 +                mac_handle_t odd_mh = NULL;
      242 +
      243 +                mutex_enter(&odd->odd_lock);
      244 +                if ((odd->odd_flags & OVERLAY_F_TXSTOPPED) != 0) {
      245 +                        /* Get ready to tell MAC it can transmit again. */
      246 +                        odd->odd_flags &= ~OVERLAY_F_TXSTOPPED;
      247 +                        odd_mh = odd->odd_mh;
      248 +                }
      249 +                mutex_exit(&odd->odd_lock);
      250 +                if (odd_mh != NULL) {
      251 +                        *current_mh = odd_mh;
      252 +                        current_mh++;
      253 +                }
      254 +        }
      255 +        mutex_exit(&mux->omux_lock);
      256 +
      257 +        /*
      258 +         * Yes, I'm using the value-then-decrement.  "current_mh" is
      259 +         * guaranteed to be at least one ahead of mhs_to_update if there are
      260 +         * any mac handles that need updating.  I also have to do this outside
      261 +         * the omux lock because the tx_update may trigger immediate or
      262 +         * concurrent packet transmission.
      263 +         */
      264 +        while (current_mh-- != mhs_to_update)
      265 +                mac_tx_update(*current_mh);
      266 +
      267 +        kmem_free(mhs_to_update, allocsize);
      268 +}
      269 +
      270 +/*
 210  271   * Register a given device with a socket backend. If no such device socket
 211  272   * exists, create a new one.
 212  273   */
 213  274  overlay_mux_t *
 214  275  overlay_mux_open(overlay_plugin_t *opp, int domain, int family, int protocol,
 215  276      struct sockaddr *addr, socklen_t len, int *errp)
 216  277  {
 217  278          int err;
 218  279          overlay_mux_t *mux;
 219  280          ksocket_t ksock;
      281 +        ksocket_callbacks_t ks_cb = { 0 };
 220  282  
 221  283          if (errp == NULL)
 222  284                  errp = &err;
 223  285  
 224  286          mutex_enter(&overlay_mux_lock);
 225  287          for (mux = list_head(&overlay_mux_list); mux != NULL;
 226  288              mux = list_next(&overlay_mux_list, mux)) {
 227  289                  if (domain == mux->omux_domain &&
 228  290                      family == mux->omux_family &&
 229  291                      protocol == mux->omux_protocol &&
 230  292                      len == mux->omux_alen &&
 231  293                      bcmp(addr, mux->omux_addr, len) == 0) {
 232  294  
 233  295                          if (opp != mux->omux_plugin) {
 234  296                                  *errp = EEXIST;
 235  297                                  return (NULL);
 236  298                          }
 237  299  
 238  300                          mutex_enter(&mux->omux_lock);
 239  301                          mux->omux_count++;
 240  302                          mutex_exit(&mux->omux_lock);
 241  303                          mutex_exit(&overlay_mux_lock);
 242  304                          *errp = 0;
 243  305                          return (mux);
 244  306                  }
 245  307          }
 246  308  
 247  309          /*
 248  310           * Today we aren't zone-aware and only exist in the global zone. When we
 249  311           * allow for things to exist in the non-global zone, we'll want to use a
 250  312           * credential that's actually specific to the zone.
 251  313           */
 252  314          *errp = ksocket_socket(&ksock, domain, family, protocol, KSOCKET_SLEEP,
 253  315              kcred);
 254  316          if (*errp != 0) {
 255  317                  mutex_exit(&overlay_mux_lock);
 256  318                  return (NULL);
 257  319          }
 258  320  
 259  321          *errp = ksocket_bind(ksock, addr, len, kcred);
 260  322          if (*errp != 0) {
 261  323                  mutex_exit(&overlay_mux_lock);
 262  324                  ksocket_close(ksock, kcred);
 263  325                  return (NULL);
 264  326          }
 265  327  
 266  328          /*
 267  329           * Ask our lower layer to optionally toggle anything they need on this
 268  330           * socket. Because a socket is owned by a single type of plugin, we can
 269  331           * then ask it to perform any additional socket set up it'd like to do.
 270  332           */
 271  333          if (opp->ovp_ops->ovpo_sockopt != NULL &&
 272  334              (*errp = opp->ovp_ops->ovpo_sockopt(ksock)) != 0) {
 273  335                  mutex_exit(&overlay_mux_lock);
 274  336                  ksocket_close(ksock, kcred);
 275  337                  return (NULL);
 276  338          }
 277  339  
 278  340          mux = kmem_alloc(sizeof (overlay_mux_t), KM_SLEEP);
 279  341          list_link_init(&mux->omux_lnode);
 280  342          mux->omux_ksock = ksock;
 281  343          mux->omux_plugin = opp;
 282  344          mux->omux_domain = domain;
  
    | 
      ↓ open down ↓ | 
    53 lines elided | 
    
      ↑ open up ↑ | 
  
 283  345          mux->omux_family = family;
 284  346          mux->omux_protocol = protocol;
 285  347          mux->omux_addr = kmem_alloc(len, KM_SLEEP);
 286  348          bcopy(addr, mux->omux_addr, len);
 287  349          mux->omux_alen = len;
 288  350          mux->omux_count = 1;
 289  351          avl_create(&mux->omux_devices, overlay_mux_comparator,
 290  352              sizeof (overlay_dev_t), offsetof(overlay_dev_t, odd_muxnode));
 291  353          mutex_init(&mux->omux_lock, NULL, MUTEX_DRIVER, NULL);
 292  354  
      355 +#if defined(OVERLAY_PINCH) || defined(OVERLAY_FC_TEST)
      356 +        /* Set the xmit buf to a REALLY SMALL value, say 12k (1-3 packets) */
      357 +        int bufsize = 12 * 1024;
 293  358  
      359 +        if (ksocket_setsockopt(ksock, SOL_SOCKET, SO_SNDBUF,
      360 +                (const void *)&bufsize, sizeof (bufsize), CRED()) != 0) {
      361 +                ksocket_close(ksock, kcred);
      362 +                mutex_destroy(&mux->omux_lock);
      363 +                avl_destroy(&mux->omux_devices);
      364 +                kmem_free(mux->omux_addr, len);
      365 +                kmem_free(mux, sizeof (overlay_mux_t));
      366 +                return (NULL);
      367 +        }
      368 +#endif
      369 +        /*
      370 +         * Set a callback in case we hit socket flow control and need to know
      371 +         * when it's ready to send again.  See the aforementioned
      372 +         * ksocket_socket() comments about the use of kcred vs. being
      373 +         * zone-aware.
      374 +         */
      375 +        ks_cb.ksock_cb_cansend = overlay_mux_cansend_now;
      376 +        if (ksocket_setcallbacks(ksock, &ks_cb, mux, kcred) != 0) {
      377 +                ksocket_close(ksock, kcred);
      378 +                mutex_destroy(&mux->omux_lock);
      379 +                avl_destroy(&mux->omux_devices);
      380 +                kmem_free(mux->omux_addr, len);
      381 +                kmem_free(mux, sizeof (overlay_mux_t));
      382 +                return (NULL);
      383 +        }
      384 +
 294  385          /* Once this is called, we need to expect to rx data */
 295  386          *errp = ksocket_krecv_set(ksock, overlay_mux_recv, mux);
 296  387          if (*errp != 0) {
 297  388                  ksocket_close(ksock, kcred);
 298  389                  mutex_destroy(&mux->omux_lock);
 299  390                  avl_destroy(&mux->omux_devices);
 300  391                  kmem_free(mux->omux_addr, len);
 301  392                  kmem_free(mux, sizeof (overlay_mux_t));
 302  393                  return (NULL);
 303  394          }
 304  395  
 305  396          list_insert_tail(&overlay_mux_list, mux);
 306  397          mutex_exit(&overlay_mux_lock);
 307  398  
 308  399          *errp = 0;
 309  400          return (mux);
 310  401  }
 311  402  
 312  403  void
 313  404  overlay_mux_close(overlay_mux_t *mux)
 314  405  {
 315  406          mutex_enter(&overlay_mux_lock);
 316  407          mutex_enter(&mux->omux_lock);
 317  408          mux->omux_count--;
 318  409          if (mux->omux_count != 0) {
 319  410                  mutex_exit(&mux->omux_lock);
 320  411                  mutex_exit(&overlay_mux_lock);
 321  412                  return;
 322  413          }
 323  414          list_remove(&overlay_mux_list, mux);
 324  415          mutex_exit(&mux->omux_lock);
 325  416          mutex_exit(&overlay_mux_lock);
 326  417  
 327  418          ksocket_close(mux->omux_ksock, kcred);
 328  419          avl_destroy(&mux->omux_devices);
 329  420          kmem_free(mux->omux_addr, mux->omux_alen);
 330  421          kmem_free(mux, sizeof (overlay_mux_t));
 331  422  }
 332  423  
 333  424  void
 334  425  overlay_mux_add_dev(overlay_mux_t *mux, overlay_dev_t *odd)
 335  426  {
 336  427          mutex_enter(&mux->omux_lock);
 337  428          avl_add(&mux->omux_devices, odd);
 338  429          mutex_exit(&mux->omux_lock);
 339  430  }
 340  431  
 341  432  void
 342  433  overlay_mux_remove_dev(overlay_mux_t *mux, overlay_dev_t *odd)
 343  434  {
 344  435          mutex_enter(&mux->omux_lock);
 345  436          avl_remove(&mux->omux_devices, odd);
 346  437          mutex_exit(&mux->omux_lock);
  
    | 
      ↓ open down ↓ | 
    43 lines elided | 
    
      ↑ open up ↑ | 
  
 347  438  }
 348  439  
 349  440  int
 350  441  overlay_mux_tx(overlay_mux_t *mux, struct msghdr *hdr, mblk_t *mp)
 351  442  {
 352  443          int ret;
 353  444  
 354  445          /*
 355  446           * It'd be nice to be able to use MSG_MBLK_QUICKRELE, unfortunately,
 356  447           * that isn't actually supported by UDP at this time.
      448 +         *
      449 +         * Send with MSG_DONTWAIT to indicate clogged UDP sockets upstack.
 357  450           */
 358      -        ret = ksocket_sendmblk(mux->omux_ksock, hdr, 0, &mp, kcred);
      451 +        ret = ksocket_sendmblk(mux->omux_ksock, hdr, MSG_DONTWAIT, &mp, kcred);
      452 +        /*
      453 +         * NOTE: ksocket_sendmblk() may send partial packets downstack,
      454 +         * returning what's not sent in &mp (i.e. mp pre-call might be a
      455 +         * b_cont of mp post-call).  We can't hold up this message (it's a
      456 +         * datagram), so we drop, and let the caller cope.
      457 +         */
 359  458          if (ret != 0)
 360  459                  freemsg(mp);
 361  460  
 362  461          return (ret);
 363  462  }
    
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX