9832 Original bug discovered as 9560 has friends: IPv4 packets coming in as IPv6 creating chaos
Reviewed by: Robert Mustacchi <rm@joyent.com>
--- old/usr/src/uts/common/io/mac/mac_flow.c
+++ new/usr/src/uts/common/io/mac/mac_flow.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
24 24 * Use is subject to license terms.
25 + *
26 + * Copyright 2019, Joyent, Inc.
25 27 */
26 28
27 29 #include <sys/strsun.h>
28 30 #include <sys/sdt.h>
29 31 #include <sys/mac.h>
30 32 #include <sys/mac_impl.h>
31 33 #include <sys/mac_client_impl.h>
32 34 #include <sys/mac_stat.h>
33 35 #include <sys/dls.h>
34 36 #include <sys/dls_impl.h>
35 37 #include <sys/mac_soft_ring.h>
36 38 #include <sys/ethernet.h>
37 39 #include <sys/cpupart.h>
38 40 #include <sys/pool.h>
39 41 #include <sys/pool_pset.h>
40 42 #include <sys/vlan.h>
41 43 #include <inet/ip.h>
42 44 #include <inet/ip6.h>
43 45 #include <netinet/tcp.h>
44 46 #include <netinet/udp.h>
45 47 #include <netinet/sctp.h>
46 48
47 49 typedef struct flow_stats_s {
48 50 uint64_t fs_obytes;
49 51 uint64_t fs_opackets;
50 52 uint64_t fs_oerrors;
51 53 uint64_t fs_ibytes;
52 54 uint64_t fs_ipackets;
53 55 uint64_t fs_ierrors;
54 56 } flow_stats_t;
55 57
56 58
57 59 /* global flow table, will be a per exclusive-zone table later */
58 60 static mod_hash_t *flow_hash;
59 61 static krwlock_t flow_tab_lock;
60 62
61 63 static kmem_cache_t *flow_cache;
62 64 static kmem_cache_t *flow_tab_cache;
63 65 static flow_ops_t flow_l2_ops;
64 66
65 67 typedef struct {
66 68 const char *fs_name;
67 69 uint_t fs_offset;
68 70 } flow_stats_info_t;
69 71
70 72 #define FS_OFF(f) (offsetof(flow_stats_t, f))
71 73 static flow_stats_info_t flow_stats_list[] = {
72 74 {"rbytes", FS_OFF(fs_ibytes)},
73 75 {"ipackets", FS_OFF(fs_ipackets)},
74 76 {"ierrors", FS_OFF(fs_ierrors)},
75 77 {"obytes", FS_OFF(fs_obytes)},
76 78 {"opackets", FS_OFF(fs_opackets)},
77 79 {"oerrors", FS_OFF(fs_oerrors)}
78 80 };
79 81 #define FS_SIZE (sizeof (flow_stats_list) / sizeof (flow_stats_info_t))
80 82
81 83 /*
82 84 * Checks whether a flow mask is legal.
83 85 */
84 86 static flow_tab_info_t *mac_flow_tab_info_get(flow_mask_t);
85 87
86 88 static void
87 89 flow_stat_init(kstat_named_t *knp)
88 90 {
89 91 int i;
90 92
91 93 for (i = 0; i < FS_SIZE; i++, knp++) {
92 94 kstat_named_init(knp, flow_stats_list[i].fs_name,
93 95 KSTAT_DATA_UINT64);
94 96 }
95 97 }
96 98
97 99 static int
98 100 flow_stat_update(kstat_t *ksp, int rw)
99 101 {
100 102 flow_entry_t *fep = ksp->ks_private;
101 103 kstat_named_t *knp = ksp->ks_data;
102 104 uint64_t *statp;
103 105 int i;
104 106 mac_rx_stats_t *mac_rx_stat;
105 107 mac_tx_stats_t *mac_tx_stat;
106 108 flow_stats_t flow_stats;
107 109 mac_soft_ring_set_t *mac_srs;
108 110
109 111 if (rw != KSTAT_READ)
110 112 return (EACCES);
111 113
112 114 bzero(&flow_stats, sizeof (flow_stats_t));
113 115
114 116 for (i = 0; i < fep->fe_rx_srs_cnt; i++) {
115 117 mac_srs = (mac_soft_ring_set_t *)fep->fe_rx_srs[i];
116 - if (mac_srs == NULL) /* Multicast flow */
118 + if (mac_srs == NULL) /* Multicast flow */
117 119 break;
118 120 mac_rx_stat = &mac_srs->srs_rx.sr_stat;
119 121
120 122 flow_stats.fs_ibytes += mac_rx_stat->mrs_intrbytes +
121 123 mac_rx_stat->mrs_pollbytes + mac_rx_stat->mrs_lclbytes;
122 124
123 125 flow_stats.fs_ipackets += mac_rx_stat->mrs_intrcnt +
124 126 mac_rx_stat->mrs_pollcnt + mac_rx_stat->mrs_lclcnt;
125 127
126 128 flow_stats.fs_ierrors += mac_rx_stat->mrs_ierrors;
127 129 }
128 130
129 131 mac_srs = (mac_soft_ring_set_t *)fep->fe_tx_srs;
130 - if (mac_srs == NULL) /* Multicast flow */
132 + if (mac_srs == NULL) /* Multicast flow */
131 133 goto done;
132 134 mac_tx_stat = &mac_srs->srs_tx.st_stat;
133 135
134 136 flow_stats.fs_obytes = mac_tx_stat->mts_obytes;
135 137 flow_stats.fs_opackets = mac_tx_stat->mts_opackets;
136 138 flow_stats.fs_oerrors = mac_tx_stat->mts_oerrors;
137 139
138 140 done:
139 141 for (i = 0; i < FS_SIZE; i++, knp++) {
140 142 statp = (uint64_t *)
141 143 ((uchar_t *)&flow_stats + flow_stats_list[i].fs_offset);
142 144 knp->value.ui64 = *statp;
143 145 }
144 146 return (0);
145 147 }
146 148
147 149 static void
148 150 flow_stat_create(flow_entry_t *fep)
149 151 {
150 152 kstat_t *ksp;
151 153 kstat_named_t *knp;
152 154 uint_t nstats = FS_SIZE;
153 155
154 156 /*
155 157	 * For now, flow entries are only manipulated and visible from the
156 158 * global zone.
157 159 */
158 160 ksp = kstat_create_zone("unix", 0, (char *)fep->fe_flow_name, "flow",
159 161 KSTAT_TYPE_NAMED, nstats, 0, GLOBAL_ZONEID);
160 162 if (ksp == NULL)
161 163 return;
162 164
163 165 ksp->ks_update = flow_stat_update;
164 166 ksp->ks_private = fep;
165 167 fep->fe_ksp = ksp;
166 168
167 169 knp = (kstat_named_t *)ksp->ks_data;
168 170 flow_stat_init(knp);
169 171 kstat_install(ksp);
170 172 }
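Since each flow's kstat is created under module "unix", instance 0, class "flow", with the flow name as the kstat name, the counters become visible from userland as soon as the flow exists. A quick sanity check (the flow name here is illustrative):

    # kstat -m unix -c flow -n https-flow

This should list the rbytes/ipackets/ierrors/obytes/opackets/oerrors counters defined in flow_stats_list above.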
171 173
172 174 void
173 175 flow_stat_destroy(flow_entry_t *fep)
174 176 {
175 177 if (fep->fe_ksp != NULL) {
176 178 kstat_delete(fep->fe_ksp);
177 179 fep->fe_ksp = NULL;
178 180 }
179 181 }
180 182
181 183 /*
182 184 * Initialize the flow table
183 185 */
184 186 void
185 187 mac_flow_init()
186 188 {
187 189 flow_cache = kmem_cache_create("flow_entry_cache",
188 190 sizeof (flow_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
189 191 flow_tab_cache = kmem_cache_create("flow_tab_cache",
190 192 sizeof (flow_tab_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
191 193 flow_hash = mod_hash_create_extended("flow_hash",
192 194 100, mod_hash_null_keydtor, mod_hash_null_valdtor,
193 195 mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
194 196 rw_init(&flow_tab_lock, NULL, RW_DEFAULT, NULL);
195 197 }
196 198
197 199 /*
198 200 * Cleanup and release the flow table
199 201 */
200 202 void
201 203 mac_flow_fini()
202 204 {
203 205 kmem_cache_destroy(flow_cache);
204 206 kmem_cache_destroy(flow_tab_cache);
205 207 mod_hash_destroy_hash(flow_hash);
206 208 rw_destroy(&flow_tab_lock);
207 209 }
208 210
209 211 /*
210 212 * mac_create_flow(): create a flow_entry_t.
211 213 */
212 214 int
213 215 mac_flow_create(flow_desc_t *fd, mac_resource_props_t *mrp, char *name,
214 216 void *client_cookie, uint_t type, flow_entry_t **flentp)
215 217 {
216 218 flow_entry_t *flent = *flentp;
217 219 int err = 0;
218 220
219 221 if (mrp != NULL) {
220 222 err = mac_validate_props(NULL, mrp);
221 223 if (err != 0)
222 224 return (err);
223 225 }
224 226
225 227 if (flent == NULL) {
226 228 flent = kmem_cache_alloc(flow_cache, KM_SLEEP);
227 229 bzero(flent, sizeof (*flent));
228 230 mutex_init(&flent->fe_lock, NULL, MUTEX_DEFAULT, NULL);
229 231 cv_init(&flent->fe_cv, NULL, CV_DEFAULT, NULL);
230 232
231 233 /* Initialize the receiver function to a safe routine */
232 234 flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop;
233 235 flent->fe_index = -1;
234 236 }
235 237 (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
236 238
237 239	/* This is an initial flow; it will be configured later */
238 240 if (fd == NULL) {
239 241 *flentp = flent;
240 242 return (0);
241 243 }
242 244
243 245 flent->fe_client_cookie = client_cookie;
244 246 flent->fe_type = type;
245 247
246 248 /* Save flow desc */
247 249 bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
248 250
249 251 if (mrp != NULL) {
250 252 /*
251 253 * We have already set fe_resource_props for a Link.
252 254 */
253 255 if (type & FLOW_USER) {
254 256 bcopy(mrp, &flent->fe_resource_props,
255 257 sizeof (mac_resource_props_t));
256 258 }
257 259 /*
258 260 * The effective resource list should reflect the priority
259 261 * that we set implicitly.
260 262 */
261 263 if (!(mrp->mrp_mask & MRP_PRIORITY))
262 264 mrp->mrp_mask |= MRP_PRIORITY;
263 265 if (type & FLOW_USER)
264 266 mrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
265 267 else
266 268 mrp->mrp_priority = MPL_LINK_DEFAULT;
267 269 bzero(mrp->mrp_pool, MAXPATHLEN);
268 270 bzero(&mrp->mrp_cpus, sizeof (mac_cpus_t));
269 271 bcopy(mrp, &flent->fe_effective_props,
270 272 sizeof (mac_resource_props_t));
271 273 }
272 274 flow_stat_create(flent);
273 275
274 276 *flentp = flent;
275 277 return (0);
276 278 }
277 279
278 280 /*
279 281 * Validate flow entry and add it to a flow table.
280 282 */
281 283 int
282 284 mac_flow_add(flow_tab_t *ft, flow_entry_t *flent)
283 285 {
284 286 flow_entry_t **headp, **p;
285 287 flow_ops_t *ops = &ft->ft_ops;
286 288 flow_mask_t mask;
287 289 uint32_t index;
288 290 int err;
289 291
290 292 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
291 293
292 294 /*
293 295 * Check for invalid bits in mask.
294 296 */
295 297 mask = flent->fe_flow_desc.fd_mask;
296 298 if ((mask & ft->ft_mask) == 0 || (mask & ~ft->ft_mask) != 0)
297 299 return (EOPNOTSUPP);
298 300
299 301 /*
300 302 * Validate flent.
301 303 */
302 304 if ((err = ops->fo_accept_fe(ft, flent)) != 0) {
303 305 DTRACE_PROBE3(accept_failed, flow_tab_t *, ft,
304 306 flow_entry_t *, flent, int, err);
305 307 return (err);
306 308 }
307 309
308 310 /*
309 311	 * Flent is valid. Now calculate its hash and insert it
310 312	 * into the hash table.
311 313 */
312 314 index = ops->fo_hash_fe(ft, flent);
313 315
314 316 /*
315 317 * We do not need a lock up until now because we were
316 318 * not accessing the flow table.
317 319 */
318 320 rw_enter(&ft->ft_lock, RW_WRITER);
319 321 headp = &ft->ft_table[index];
320 322
321 323 /*
322 324 * Check for duplicate flow.
323 325 */
324 326 for (p = headp; *p != NULL; p = &(*p)->fe_next) {
325 327 if ((*p)->fe_flow_desc.fd_mask !=
326 328 flent->fe_flow_desc.fd_mask)
327 329 continue;
328 330
329 331 if (ft->ft_ops.fo_match_fe(ft, *p, flent)) {
330 332 rw_exit(&ft->ft_lock);
331 333 DTRACE_PROBE3(dup_flow, flow_tab_t *, ft,
332 334 flow_entry_t *, flent, int, err);
333 335 return (EALREADY);
334 336 }
335 337 }
336 338
337 339 /*
338 340 * Insert flow to hash list.
339 341 */
340 342 err = ops->fo_insert_fe(ft, headp, flent);
341 343 if (err != 0) {
342 344 rw_exit(&ft->ft_lock);
343 345 DTRACE_PROBE3(insert_failed, flow_tab_t *, ft,
344 346 flow_entry_t *, flent, int, err);
345 347 return (err);
346 348 }
347 349
348 350 /*
349 351 * Save the hash index so it can be used by mac_flow_remove().
350 352 */
351 353 flent->fe_index = (int)index;
352 354
353 355 /*
354 356 * Save the flow tab back reference.
355 357 */
356 358 flent->fe_flow_tab = ft;
357 359 FLOW_MARK(flent, FE_FLOW_TAB);
358 360 ft->ft_flow_count++;
359 361 rw_exit(&ft->ft_lock);
360 362 return (0);
361 363 }
362 364
363 365 /*
364 366 * Remove a flow from a mac client's subflow table
365 367 */
366 368 void
367 369 mac_flow_rem_subflow(flow_entry_t *flent)
368 370 {
369 371 flow_tab_t *ft = flent->fe_flow_tab;
370 372 mac_client_impl_t *mcip = ft->ft_mcip;
371 373 mac_handle_t mh = (mac_handle_t)ft->ft_mip;
372 374
373 375 ASSERT(MAC_PERIM_HELD(mh));
374 376
375 377 mac_flow_remove(ft, flent, B_FALSE);
376 378 if (flent->fe_mcip == NULL) {
377 379 /*
378 380 * The interface is not yet plumbed and mac_client_flow_add
379 381 * was not done.
380 382 */
381 383 if (FLOW_TAB_EMPTY(ft)) {
382 384 mac_flow_tab_destroy(ft);
383 385 mcip->mci_subflow_tab = NULL;
384 386 }
385 387 } else {
386 388 mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
387 389 mac_link_flow_clean((mac_client_handle_t)mcip, flent);
388 390 }
389 391 mac_fastpath_enable(mh);
390 392 }
391 393
392 394 /*
393 395 * Add a flow to a mac client's subflow table and instantiate the flow
394 396 * in the mac by creating the associated SRSs etc.
395 397 */
396 398 int
397 399 mac_flow_add_subflow(mac_client_handle_t mch, flow_entry_t *flent,
398 400 boolean_t instantiate_flow)
399 401 {
400 402 mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
401 403 mac_handle_t mh = (mac_handle_t)mcip->mci_mip;
402 404 flow_tab_info_t *ftinfo;
403 405 flow_mask_t mask;
404 406 flow_tab_t *ft;
405 407 int err;
406 408 boolean_t ft_created = B_FALSE;
407 409
408 410 ASSERT(MAC_PERIM_HELD(mh));
409 411
410 412 if ((err = mac_fastpath_disable(mh)) != 0)
411 413 return (err);
412 414
413 415 /*
414 416	 * If the subflow table exists already, just add the new subflow
415 417 * to the existing table, else we create a new subflow table below.
416 418 */
417 419 ft = mcip->mci_subflow_tab;
418 420 if (ft == NULL) {
419 421 mask = flent->fe_flow_desc.fd_mask;
420 422 /*
421 423 * Try to create a new table and then add the subflow to the
422 424 * newly created subflow table
423 425 */
424 426 if ((ftinfo = mac_flow_tab_info_get(mask)) == NULL) {
425 427 mac_fastpath_enable(mh);
426 428 return (EOPNOTSUPP);
427 429 }
428 430
429 431 mac_flow_tab_create(ftinfo->fti_ops, mask, ftinfo->fti_size,
430 432 mcip->mci_mip, &ft);
431 433 ft_created = B_TRUE;
432 434 }
433 435
434 436 err = mac_flow_add(ft, flent);
435 437 if (err != 0) {
436 438 if (ft_created)
437 439 mac_flow_tab_destroy(ft);
438 440 mac_fastpath_enable(mh);
439 441 return (err);
440 442 }
441 443
442 444 if (instantiate_flow) {
443 445 /* Now activate the flow by creating its SRSs */
444 446 ASSERT(MCIP_DATAPATH_SETUP(mcip));
445 447 err = mac_link_flow_init((mac_client_handle_t)mcip, flent);
446 448 if (err != 0) {
447 449 mac_flow_remove(ft, flent, B_FALSE);
448 450 if (ft_created)
449 451 mac_flow_tab_destroy(ft);
450 452 mac_fastpath_enable(mh);
451 453 return (err);
452 454 }
453 455 } else {
454 456 FLOW_MARK(flent, FE_UF_NO_DATAPATH);
455 457 }
456 458 if (ft_created) {
457 459 ASSERT(mcip->mci_subflow_tab == NULL);
458 460 ft->ft_mcip = mcip;
459 461 mcip->mci_subflow_tab = ft;
460 462 if (instantiate_flow)
461 463 mac_client_update_classifier(mcip, B_TRUE);
462 464 }
463 465 return (0);
464 466 }
465 467
466 468 /*
467 469 * Remove flow entry from flow table.
468 470 */
469 471 void
470 472 mac_flow_remove(flow_tab_t *ft, flow_entry_t *flent, boolean_t temp)
471 473 {
472 474 flow_entry_t **fp;
473 475
474 476 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
475 477 if (!(flent->fe_flags & FE_FLOW_TAB))
476 478 return;
477 479
478 480 rw_enter(&ft->ft_lock, RW_WRITER);
479 481 /*
480 482 * If this is a permanent removal from the flow table, mark it
481 483 * CONDEMNED to prevent future references. If this is a temporary
482 484	 * removal from the table, say to update the flow descriptor, then
483 485	 * we don't mark it CONDEMNED.
484 486 */
485 487 if (!temp)
486 488 FLOW_MARK(flent, FE_CONDEMNED);
487 489 /*
488 490 * Locate the specified flent.
489 491 */
490 492 fp = &ft->ft_table[flent->fe_index];
491 493 while (*fp != flent)
492 494 fp = &(*fp)->fe_next;
493 495
494 496 /*
495 497 * The flent must exist. Otherwise it's a bug.
496 498 */
497 499 ASSERT(fp != NULL);
498 500 *fp = flent->fe_next;
499 501 flent->fe_next = NULL;
500 502
501 503 /*
502 504 * Reset fe_index to -1 so any attempt to call mac_flow_remove()
503 505 * on a flent that is supposed to be in the table (FE_FLOW_TAB)
504 506 * will panic.
505 507 */
506 508 flent->fe_index = -1;
507 509 FLOW_UNMARK(flent, FE_FLOW_TAB);
508 510 ft->ft_flow_count--;
509 511 rw_exit(&ft->ft_lock);
510 512 }
511 513
512 514 /*
513 515 * This is the flow lookup routine used by the mac sw classifier engine.
514 516 */
515 517 int
516 518 mac_flow_lookup(flow_tab_t *ft, mblk_t *mp, uint_t flags, flow_entry_t **flentp)
517 519 {
518 520 flow_state_t s;
519 521 flow_entry_t *flent;
520 522 flow_ops_t *ops = &ft->ft_ops;
521 523 boolean_t retried = B_FALSE;
522 524 int i, err;
523 525
524 526 s.fs_flags = flags;
525 527 retry:
526 528 s.fs_mp = mp;
527 529
528 530 /*
529 531 * Walk the list of predeclared accept functions.
530 532 * Each of these would accumulate enough state to allow the next
531 533 * accept routine to make progress.
532 534 */
533 535 for (i = 0; i < FLOW_MAX_ACCEPT && ops->fo_accept[i] != NULL; i++) {
534 536 if ((err = (ops->fo_accept[i])(ft, &s)) != 0) {
535 537 mblk_t *last;
536 538
537 539 /*
538 540 * ENOBUFS indicates that the mp could be too short
539 541 * and may need a pullup.
540 542 */
541 543 if (err != ENOBUFS || retried)
542 544 return (err);
543 545
544 546 /*
545 547 * The pullup is done on the last processed mblk, not
546 548	 * the starting one. A pullup is not done if the mblk
547 549 * has references or if b_cont is NULL.
548 550 */
549 551 last = s.fs_mp;
550 552 if (DB_REF(last) > 1 || last->b_cont == NULL ||
551 553 pullupmsg(last, -1) == 0)
552 554 return (EINVAL);
553 555
554 556 retried = B_TRUE;
555 557 DTRACE_PROBE2(need_pullup, flow_tab_t *, ft,
556 558 flow_state_t *, &s);
557 559 goto retry;
558 560 }
559 561 }
560 562
561 563 /*
562 564 * The packet is considered sane. We may now attempt to
563 565 * find the corresponding flent.
564 566 */
565 567 rw_enter(&ft->ft_lock, RW_READER);
566 568 flent = ft->ft_table[ops->fo_hash(ft, &s)];
567 569 for (; flent != NULL; flent = flent->fe_next) {
568 570 if (flent->fe_match(ft, flent, &s)) {
569 571 FLOW_TRY_REFHOLD(flent, err);
570 572 if (err != 0)
571 573 continue;
572 574 *flentp = flent;
573 575 rw_exit(&ft->ft_lock);
574 576 return (0);
575 577 }
576 578 }
577 579 rw_exit(&ft->ft_lock);
578 580 return (ENOENT);
579 581 }
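For reference, the caller side of this lookup looks roughly as follows (a minimal sketch, assuming the FLOW_INBOUND flag and FLOW_REFRELE macro from the mac flow headers). On success the flent is returned with a reference held, so the caller must drop it when done:

    flow_entry_t *flent;

    /* sketch of a datapath consumer; mcip and mp come from the caller */
    if (mac_flow_lookup(mcip->mci_subflow_tab, mp, FLOW_INBOUND,
        &flent) == 0) {
            /* ... deliver the packet through flent->fe_cb_fn ... */
            FLOW_REFRELE(flent);
    }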
580 582
581 583 /*
582 584 * Walk flow table.
583 585 * The caller is assumed to have proper perimeter protection.
584 586 */
585 587 int
586 588 mac_flow_walk_nolock(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
587 589 void *arg)
588 590 {
589 591 int err, i, cnt = 0;
590 592 flow_entry_t *flent;
591 593
592 594 if (ft == NULL)
593 595 return (0);
594 596
595 597 for (i = 0; i < ft->ft_size; i++) {
596 598 for (flent = ft->ft_table[i]; flent != NULL;
597 599 flent = flent->fe_next) {
598 600 cnt++;
599 601 err = (*fn)(flent, arg);
600 602 if (err != 0)
601 603 return (err);
602 604 }
603 605 }
604 606 VERIFY(cnt == ft->ft_flow_count);
605 607 return (0);
606 608 }
607 609
608 610 /*
609 611	 * Same as the above except the flow table lock is held for protection here.
610 612 */
611 613 int
612 614 mac_flow_walk(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
613 615 void *arg)
614 616 {
615 617 int err;
616 618
617 619 if (ft == NULL)
618 620 return (0);
619 621
620 622 rw_enter(&ft->ft_lock, RW_WRITER);
621 623 err = mac_flow_walk_nolock(ft, fn, arg);
622 624 rw_exit(&ft->ft_lock);
623 625 return (err);
624 626 }
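As a usage illustration, a walker callback returns non-zero to abort the walk early; the hypothetical helper below (not part of this change) counts subflows whose datapath is not set up:

    /* Count subflows that have no datapath set up yet. */
    static int
    count_no_datapath_cb(flow_entry_t *flent, void *arg)
    {
            uint_t *cntp = arg;

            if (flent->fe_flags & FE_UF_NO_DATAPATH)
                    (*cntp)++;
            return (0);     /* non-zero would stop the walk */
    }

    uint_t cnt = 0;
    (void) mac_flow_walk(mcip->mci_subflow_tab, count_no_datapath_cb, &cnt);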
625 627
626 628 static boolean_t mac_flow_clean(flow_entry_t *);
627 629
628 630 /*
629 631 * Destroy a flow entry. Called when the last reference on a flow is released.
630 632 */
631 633 void
632 634 mac_flow_destroy(flow_entry_t *flent)
633 635 {
634 636 ASSERT(flent->fe_refcnt == 0);
635 637
636 638 if ((flent->fe_type & FLOW_USER) != 0) {
637 639 ASSERT(mac_flow_clean(flent));
638 640 } else {
639 641 mac_flow_cleanup(flent);
640 642 }
641 643 mac_misc_stat_delete(flent);
642 644 mutex_destroy(&flent->fe_lock);
643 645 cv_destroy(&flent->fe_cv);
644 646 flow_stat_destroy(flent);
645 647 kmem_cache_free(flow_cache, flent);
646 648 }
647 649
648 650 /*
649 651 * XXX eric
650 652 * The MAC_FLOW_PRIORITY checks in mac_resource_ctl_set() and
651 653 * mac_link_flow_modify() should really be moved/reworked into the
652 654 * two functions below. This would consolidate all the mac property
653 655 * checking in one place. I'm leaving this alone for now since it's
654 656 * out of scope of the new flows work.
655 657 */
656 658 /* ARGSUSED */
657 659 uint32_t
658 660 mac_flow_modify_props(flow_entry_t *flent, mac_resource_props_t *mrp)
659 661 {
660 662 uint32_t changed_mask = 0;
661 663 mac_resource_props_t *fmrp = &flent->fe_effective_props;
662 664 int i;
663 665
664 666 if ((mrp->mrp_mask & MRP_MAXBW) != 0 &&
665 667 (!(fmrp->mrp_mask & MRP_MAXBW) ||
666 668 (fmrp->mrp_maxbw != mrp->mrp_maxbw))) {
667 669 changed_mask |= MRP_MAXBW;
668 670 if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) {
669 671 fmrp->mrp_mask &= ~MRP_MAXBW;
670 672 fmrp->mrp_maxbw = 0;
671 673 } else {
672 674 fmrp->mrp_mask |= MRP_MAXBW;
673 675 fmrp->mrp_maxbw = mrp->mrp_maxbw;
674 676 }
675 677 }
676 678
677 679 if ((mrp->mrp_mask & MRP_PRIORITY) != 0) {
678 680 if (fmrp->mrp_priority != mrp->mrp_priority)
679 681 changed_mask |= MRP_PRIORITY;
680 682 if (mrp->mrp_priority == MPL_RESET) {
681 683 fmrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
682 684 fmrp->mrp_mask &= ~MRP_PRIORITY;
683 685 } else {
684 686 fmrp->mrp_priority = mrp->mrp_priority;
685 687 fmrp->mrp_mask |= MRP_PRIORITY;
686 688 }
687 689 }
688 690
689 691 /* modify fanout */
690 692 if ((mrp->mrp_mask & MRP_CPUS) != 0) {
691 693 if ((fmrp->mrp_ncpus == mrp->mrp_ncpus) &&
692 694 (fmrp->mrp_fanout_mode == mrp->mrp_fanout_mode)) {
693 695 for (i = 0; i < mrp->mrp_ncpus; i++) {
694 696 if (mrp->mrp_cpu[i] != fmrp->mrp_cpu[i])
695 697 break;
696 698 }
697 699 if (i == mrp->mrp_ncpus) {
698 700 /*
699 701 * The new set of cpus passed is exactly
700 702 * the same as the existing set.
701 703 */
702 704 return (changed_mask);
703 705 }
704 706 }
705 707 changed_mask |= MRP_CPUS;
706 708 MAC_COPY_CPUS(mrp, fmrp);
707 709 }
708 710
709 711 /*
710 712 * Modify the rings property.
711 713 */
712 714 if (mrp->mrp_mask & MRP_RX_RINGS || mrp->mrp_mask & MRP_TX_RINGS)
713 715 mac_set_rings_effective(flent->fe_mcip);
714 716
715 717 if ((mrp->mrp_mask & MRP_POOL) != 0) {
716 718 if (strcmp(fmrp->mrp_pool, mrp->mrp_pool) != 0)
717 719 changed_mask |= MRP_POOL;
718 720 if (strlen(mrp->mrp_pool) == 0)
719 721 fmrp->mrp_mask &= ~MRP_POOL;
720 722 else
721 723 fmrp->mrp_mask |= MRP_POOL;
722 724 (void) strncpy(fmrp->mrp_pool, mrp->mrp_pool, MAXPATHLEN);
723 725 }
724 726 return (changed_mask);
725 727 }
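For illustration, a caller clears a bandwidth cap through this path roughly as follows (a sketch; only the MRP_MAXBW handling above is exercised):

    mac_resource_props_t mrp;

    bzero(&mrp, sizeof (mrp));
    mrp.mrp_mask = MRP_MAXBW;
    mrp.mrp_maxbw = MRP_MAXBW_RESETVAL;     /* remove the maxbw limit */
    (void) mac_flow_modify_props(flent, &mrp);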
726 728
727 729 void
728 730 mac_flow_modify(flow_tab_t *ft, flow_entry_t *flent, mac_resource_props_t *mrp)
729 731 {
730 732 uint32_t changed_mask;
731 733 mac_client_impl_t *mcip = flent->fe_mcip;
732 734 mac_resource_props_t *mcip_mrp = MCIP_RESOURCE_PROPS(mcip);
733 735 mac_resource_props_t *emrp = MCIP_EFFECTIVE_PROPS(mcip);
734 736 cpupart_t *cpupart = NULL;
735 737 boolean_t use_default = B_FALSE;
736 738
737 739 ASSERT(flent != NULL);
738 740 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
739 741
740 742 rw_enter(&ft->ft_lock, RW_WRITER);
741 743
742 744 /* Update the cached values inside the subflow entry */
743 745 changed_mask = mac_flow_modify_props(flent, mrp);
744 746 rw_exit(&ft->ft_lock);
745 747 /*
746 748 * Push the changed parameters to the scheduling code in the
747 749 * SRS's, to take effect right away.
748 750 */
749 751 if (changed_mask & MRP_MAXBW) {
750 752 mac_srs_update_bwlimit(flent, mrp);
751 753 /*
752 754 * If bandwidth is changed, we may have to change
753 755 * the number of soft ring to be used for fanout.
754 756 * Call mac_flow_update_fanout() if MAC_BIND_CPU
755 757 * is not set and there is no user supplied cpu
756 758 * info. This applies only to link at this time.
757 759 */
758 760 if (!(flent->fe_type & FLOW_USER) &&
759 761 !(changed_mask & MRP_CPUS) &&
760 762 !(mcip_mrp->mrp_mask & MRP_CPUS_USERSPEC)) {
761 763 mac_fanout_setup(mcip, flent, mcip_mrp,
762 764 mac_rx_deliver, mcip, NULL, NULL);
763 765 }
764 766 }
765 767 if (mrp->mrp_mask & MRP_PRIORITY)
766 768 mac_flow_update_priority(mcip, flent);
767 769
768 770 if (changed_mask & MRP_CPUS)
769 771 mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL,
770 772 NULL);
771 773
772 774 if (mrp->mrp_mask & MRP_POOL) {
773 775 pool_lock();
774 776 cpupart = mac_pset_find(mrp, &use_default);
775 777 mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL,
776 778 cpupart);
777 779 mac_set_pool_effective(use_default, cpupart, mrp, emrp);
778 780 pool_unlock();
779 781 }
780 782 }
781 783
782 784 /*
783 785 * This function waits for a certain condition to be met and is generally
784 786 * used before a destructive or quiescing operation.
785 787 */
786 788 void
787 789 mac_flow_wait(flow_entry_t *flent, mac_flow_state_t event)
788 790 {
789 791 mutex_enter(&flent->fe_lock);
790 792 flent->fe_flags |= FE_WAITER;
791 793
792 794 switch (event) {
793 795 case FLOW_DRIVER_UPCALL:
794 796 /*
795 797 * We want to make sure the driver upcalls have finished before
796 798 * we signal the Rx SRS worker to quit.
797 799 */
798 800 while (flent->fe_refcnt != 1)
799 801 cv_wait(&flent->fe_cv, &flent->fe_lock);
800 802 break;
801 803
802 804 case FLOW_USER_REF:
803 805 /*
804 806 * Wait for the fe_user_refcnt to drop to 0. The flow has
805 807 * been removed from the global flow hash.
806 808 */
807 809 ASSERT(!(flent->fe_flags & FE_G_FLOW_HASH));
808 810 while (flent->fe_user_refcnt != 0)
809 811 cv_wait(&flent->fe_cv, &flent->fe_lock);
810 812 break;
811 813
812 814 default:
813 815 ASSERT(0);
814 816 }
815 817
816 818 flent->fe_flags &= ~FE_WAITER;
817 819 mutex_exit(&flent->fe_lock);
818 820 }
819 821
820 822 static boolean_t
821 823 mac_flow_clean(flow_entry_t *flent)
822 824 {
823 825 ASSERT(flent->fe_next == NULL);
824 826 ASSERT(flent->fe_tx_srs == NULL);
825 827 ASSERT(flent->fe_rx_srs_cnt == 0 && flent->fe_rx_srs[0] == NULL);
826 828 ASSERT(flent->fe_mbg == NULL);
827 829
828 830 return (B_TRUE);
829 831 }
830 832
831 833 void
832 834 mac_flow_cleanup(flow_entry_t *flent)
833 835 {
834 836 if ((flent->fe_type & FLOW_USER) == 0) {
835 837 ASSERT((flent->fe_mbg == NULL && flent->fe_mcip != NULL) ||
836 838 (flent->fe_mbg != NULL && flent->fe_mcip == NULL));
837 839 ASSERT(flent->fe_refcnt == 0);
838 840 } else {
839 841 ASSERT(flent->fe_refcnt == 1);
840 842 }
841 843
842 844 if (flent->fe_mbg != NULL) {
843 845 ASSERT(flent->fe_tx_srs == NULL);
844 846 /* This is a multicast or broadcast flow entry */
845 847 mac_bcast_grp_free(flent->fe_mbg);
846 848 flent->fe_mbg = NULL;
847 849 }
848 850
849 851 if (flent->fe_tx_srs != NULL) {
850 852 ASSERT(flent->fe_mbg == NULL);
851 853 mac_srs_free(flent->fe_tx_srs);
852 854 flent->fe_tx_srs = NULL;
853 855 }
854 856
855 857 /*
856 858	 * In the normal case fe_rx_srs_cnt is 1. However, in the error case
857 859	 * when mac_unicast_add fails, we may not have set up any SRS,
858 860	 * in which case fe_rx_srs_cnt will be zero.
859 861 */
860 862 if (flent->fe_rx_srs_cnt != 0) {
861 863 ASSERT(flent->fe_rx_srs_cnt == 1);
862 864 mac_srs_free(flent->fe_rx_srs[0]);
863 865 flent->fe_rx_srs[0] = NULL;
864 866 flent->fe_rx_srs_cnt = 0;
865 867 }
866 868 ASSERT(flent->fe_rx_srs[0] == NULL);
867 869 }
868 870
869 871 void
870 872 mac_flow_get_desc(flow_entry_t *flent, flow_desc_t *fd)
871 873 {
872 874 /*
873 875 * Grab the fe_lock to see a self-consistent fe_flow_desc.
874 876 * Updates to the fe_flow_desc happen under the fe_lock
875 877	 * after removing the flent from the flow table.
876 878 */
877 879 mutex_enter(&flent->fe_lock);
878 880 bcopy(&flent->fe_flow_desc, fd, sizeof (*fd));
879 881 mutex_exit(&flent->fe_lock);
880 882 }
881 883
882 884 /*
883 885 * Update a field of a flow entry. The mac perimeter ensures that
884 886 * this is the only thread doing a modify operation on this mac end point.
885 887 * So the flow table can't change or disappear. The ft_lock protects access
886 888 * to the flow entry, and holding the lock ensures that there isn't any thread
887 889 * accessing the flow entry or attempting a flow table lookup. However
888 890 * data threads that are using the flow entry based on the old descriptor
889 891 * will continue to use the flow entry. If strong coherence is required
890 892 * then the flow will have to be quiesced before the descriptor can be
891 893 * changed.
892 894 */
893 895 void
894 896 mac_flow_set_desc(flow_entry_t *flent, flow_desc_t *fd)
895 897 {
896 898 flow_tab_t *ft = flent->fe_flow_tab;
897 899 flow_desc_t old_desc;
898 900 int err;
899 901
900 902 if (ft == NULL) {
901 903 /*
902 904 * The flow hasn't yet been inserted into the table,
903 905	 * so only the caller knows about this flow; however, for
904 906 * uniformity we grab the fe_lock here.
905 907 */
906 908 mutex_enter(&flent->fe_lock);
907 909 bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
908 910 mutex_exit(&flent->fe_lock);
909 911		return;
910 912	}
911 913 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
912 914
913 915 /*
914 916	 * Need to remove the flow entry from the table and reinsert it
915 917	 * into a potentially different hash line. The hash depends on
916 918 * the new descriptor fields. However access to fe_desc itself
917 919 * is always under the fe_lock. This helps log and stat functions
918 920 * see a self-consistent fe_flow_desc.
919 921 */
920 922 mac_flow_remove(ft, flent, B_TRUE);
921 923 old_desc = flent->fe_flow_desc;
922 924
923 925 mutex_enter(&flent->fe_lock);
924 926 bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
925 927 mutex_exit(&flent->fe_lock);
926 928
927 929 if (mac_flow_add(ft, flent) != 0) {
928 930 /*
929 931 * The add failed say due to an invalid flow descriptor.
930 932 * Undo the update
931 933 */
932 934 flent->fe_flow_desc = old_desc;
933 935 err = mac_flow_add(ft, flent);
934 936 ASSERT(err == 0);
935 937 }
936 938 }
937 939
938 940 void
939 941 mac_flow_set_name(flow_entry_t *flent, const char *name)
940 942 {
941 943 flow_tab_t *ft = flent->fe_flow_tab;
942 944
943 945 if (ft == NULL) {
944 946 /*
945 947 * The flow hasn't yet been inserted into the table,
946 948 * so only the caller knows about this flow
947 949 */
948 950 (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
949 951 } else {
950 952 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
951 953 }
952 954
953 955 mutex_enter(&flent->fe_lock);
954 956 (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
955 957 mutex_exit(&flent->fe_lock);
956 958 }
957 959
958 960 /*
959 961 * Return the client-private cookie that was associated with
960 962 * the flow when it was created.
961 963 */
962 964 void *
963 965 mac_flow_get_client_cookie(flow_entry_t *flent)
964 966 {
965 967 return (flent->fe_client_cookie);
966 968 }
967 969
968 970 /*
969 971 * Forward declarations.
970 972 */
971 973 static uint32_t flow_l2_hash(flow_tab_t *, flow_state_t *);
972 974 static uint32_t flow_l2_hash_fe(flow_tab_t *, flow_entry_t *);
973 975 static int flow_l2_accept(flow_tab_t *, flow_state_t *);
974 976 static uint32_t flow_ether_hash(flow_tab_t *, flow_state_t *);
975 977 static uint32_t flow_ether_hash_fe(flow_tab_t *, flow_entry_t *);
976 978 static int flow_ether_accept(flow_tab_t *, flow_state_t *);
977 979
978 980 /*
979 981 * Create flow table.
980 982 */
981 983 void
982 984 mac_flow_tab_create(flow_ops_t *ops, flow_mask_t mask, uint_t size,
983 985 mac_impl_t *mip, flow_tab_t **ftp)
984 986 {
985 987 flow_tab_t *ft;
986 988 flow_ops_t *new_ops;
987 989
988 990 ft = kmem_cache_alloc(flow_tab_cache, KM_SLEEP);
989 991 bzero(ft, sizeof (*ft));
990 992
991 993 ft->ft_table = kmem_zalloc(size * sizeof (flow_entry_t *), KM_SLEEP);
992 994
993 995 /*
994 996 * We make a copy of the ops vector instead of just pointing to it
995 997 * because we might want to customize the ops vector on a per table
996 998 * basis (e.g. for optimization).
997 999 */
998 1000 new_ops = &ft->ft_ops;
999 1001 bcopy(ops, new_ops, sizeof (*ops));
1000 1002 ft->ft_mask = mask;
1001 1003 ft->ft_size = size;
1002 1004 ft->ft_mip = mip;
1003 1005
1004 1006 /*
1005 1007 * Optimizations for DL_ETHER media.
1006 1008 */
1007 1009 if (mip->mi_info.mi_nativemedia == DL_ETHER) {
1008 1010 if (new_ops->fo_hash == flow_l2_hash)
1009 1011 new_ops->fo_hash = flow_ether_hash;
1010 1012 if (new_ops->fo_hash_fe == flow_l2_hash_fe)
1011 1013 new_ops->fo_hash_fe = flow_ether_hash_fe;
1012 1014 if (new_ops->fo_accept[0] == flow_l2_accept)
1013 1015 new_ops->fo_accept[0] = flow_ether_accept;
1014 1016 }
1015 1017 *ftp = ft;
1016 1018 }
1017 1019
1018 1020 void
1019 1021 mac_flow_l2tab_create(mac_impl_t *mip, flow_tab_t **ftp)
1020 1022 {
1021 1023 mac_flow_tab_create(&flow_l2_ops, FLOW_LINK_DST | FLOW_LINK_VID,
1022 1024 1024, mip, ftp);
1023 1025 }
1024 1026
1025 1027 /*
1026 1028 * Destroy flow table.
1027 1029 */
1028 1030 void
1029 1031 mac_flow_tab_destroy(flow_tab_t *ft)
1030 1032 {
1031 1033 if (ft == NULL)
1032 1034 return;
1033 1035
1034 1036 ASSERT(ft->ft_flow_count == 0);
1035 1037 kmem_free(ft->ft_table, ft->ft_size * sizeof (flow_entry_t *));
1036 1038 bzero(ft, sizeof (*ft));
1037 1039 kmem_cache_free(flow_tab_cache, ft);
1038 1040 }
1039 1041
1040 1042 /*
1041 1043 * Add a new flow entry to the global flow hash table
1042 1044 */
1043 1045 int
1044 1046 mac_flow_hash_add(flow_entry_t *flent)
1045 1047 {
1046 1048 int err;
1047 1049
1048 1050 rw_enter(&flow_tab_lock, RW_WRITER);
1049 1051 err = mod_hash_insert(flow_hash,
1050 1052 (mod_hash_key_t)flent->fe_flow_name, (mod_hash_val_t)flent);
1051 1053 if (err != 0) {
1052 1054 rw_exit(&flow_tab_lock);
1053 1055 return (EEXIST);
1054 1056 }
1055 1057 /* Mark as inserted into the global flow hash table */
1056 1058 FLOW_MARK(flent, FE_G_FLOW_HASH);
1057 1059 rw_exit(&flow_tab_lock);
1058 1060 return (err);
1059 1061 }
1060 1062
1061 1063 /*
1062 1064 * Remove a flow entry from the global flow hash table
1063 1065 */
1064 1066 void
1065 1067 mac_flow_hash_remove(flow_entry_t *flent)
1066 1068 {
1067 1069 mod_hash_val_t val;
1068 1070
1069 1071 rw_enter(&flow_tab_lock, RW_WRITER);
1070 1072 VERIFY(mod_hash_remove(flow_hash,
1071 1073 (mod_hash_key_t)flent->fe_flow_name, &val) == 0);
1072 1074
1073 1075 /* Clear the mark that says inserted into the global flow hash table */
1074 1076 FLOW_UNMARK(flent, FE_G_FLOW_HASH);
1075 1077 rw_exit(&flow_tab_lock);
1076 1078 }
1077 1079
1078 1080 /*
1079 1081 * Retrieve a flow entry from the global flow hash table.
1080 1082 */
1081 1083 int
1082 1084 mac_flow_lookup_byname(char *name, flow_entry_t **flentp)
1083 1085 {
1084 1086 int err;
1085 1087 flow_entry_t *flent;
1086 1088
1087 1089 rw_enter(&flow_tab_lock, RW_READER);
1088 1090 err = mod_hash_find(flow_hash, (mod_hash_key_t)name,
1089 1091 (mod_hash_val_t *)&flent);
1090 1092 if (err != 0) {
1091 1093 rw_exit(&flow_tab_lock);
1092 1094 return (ENOENT);
1093 1095 }
1094 1096 ASSERT(flent != NULL);
1095 1097 FLOW_USER_REFHOLD(flent);
1096 1098 rw_exit(&flow_tab_lock);
1097 1099
1098 1100 *flentp = flent;
1099 1101 return (0);
1100 1102 }
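On success the flent is returned with a user reference held; callers must release it with FLOW_USER_REFRELE(), as the mac_link_flow_* functions below do. A minimal caller sketch (the flow name is illustrative):

    flow_entry_t *flent;

    if (mac_flow_lookup_byname("https-flow", &flent) == 0) {
            /* ... inspect flent ... */
            FLOW_USER_REFRELE(flent);
    }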
1101 1103
1102 1104 /*
1103 1105 * Initialize or release mac client flows by walking the subflow table.
1104 1106 * These are typically invoked during plumb/unplumb of links.
1105 1107 */
1106 1108
1107 1109 static int
1108 1110 mac_link_init_flows_cb(flow_entry_t *flent, void *arg)
1109 1111 {
1110 1112 mac_client_impl_t *mcip = arg;
1111 1113
1112 1114 if (mac_link_flow_init(arg, flent) != 0) {
1113 1115 cmn_err(CE_WARN, "Failed to initialize flow '%s' on link '%s'",
1114 1116 flent->fe_flow_name, mcip->mci_name);
1115 1117 } else {
1116 1118 FLOW_UNMARK(flent, FE_UF_NO_DATAPATH);
1117 1119 }
1118 1120 return (0);
1119 1121 }
1120 1122
1121 1123 void
1122 1124 mac_link_init_flows(mac_client_handle_t mch)
1123 1125 {
1124 1126 mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1125 1127
1126 1128 (void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
1127 1129 mac_link_init_flows_cb, mcip);
1128 1130 /*
1129 1131	 * If the mac client had subflow(s) configured before plumb, change
1130 1132	 * the rx function to mac_rx_srs_subflow_process, and in case of hardware
1131 1133 * classification, disable polling.
1132 1134 */
1133 1135 mac_client_update_classifier(mcip, B_TRUE);
1134 1136
1135 1137 }
1136 1138
1137 1139 boolean_t
1138 1140 mac_link_has_flows(mac_client_handle_t mch)
1139 1141 {
1140 1142 mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1141 1143
1142 1144 if (!FLOW_TAB_EMPTY(mcip->mci_subflow_tab))
1143 1145 return (B_TRUE);
1144 1146
1145 1147 return (B_FALSE);
1146 1148 }
1147 1149
1148 1150 static int
1149 1151 mac_link_release_flows_cb(flow_entry_t *flent, void *arg)
1150 1152 {
1151 1153 FLOW_MARK(flent, FE_UF_NO_DATAPATH);
1152 1154 mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
1153 1155 mac_link_flow_clean(arg, flent);
1154 1156 return (0);
1155 1157 }
1156 1158
1157 1159 void
1158 1160 mac_link_release_flows(mac_client_handle_t mch)
1159 1161 {
1160 1162 mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1161 1163
1162 1164 /*
1163 1165 * Change the mci_flent callback back to mac_rx_srs_process()
1164 1166 * because flows are about to be deactivated.
1165 1167 */
1166 1168 mac_client_update_classifier(mcip, B_FALSE);
1167 1169 (void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
1168 1170 mac_link_release_flows_cb, mcip);
1169 1171 }
1170 1172
1171 1173 void
1172 1174 mac_rename_flow(flow_entry_t *fep, const char *new_name)
1173 1175 {
1174 1176 mac_flow_set_name(fep, new_name);
1175 1177 if (fep->fe_ksp != NULL) {
1176 1178 flow_stat_destroy(fep);
1177 1179 flow_stat_create(fep);
1178 1180 }
1179 1181 }
1180 1182
1181 1183 /*
1182 1184 * mac_link_flow_init()
1183 1185 * Internal flow interface used for allocating SRSs and related
1184 1186 * data structures. Not meant to be used by mac clients.
1185 1187 */
1186 1188 int
1187 1189 mac_link_flow_init(mac_client_handle_t mch, flow_entry_t *sub_flow)
1188 1190 {
1189 - mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1191 + mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1190 1192 mac_impl_t *mip = mcip->mci_mip;
1191 1193 int err;
1192 1194
1193 1195 ASSERT(mch != NULL);
1194 1196 ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1195 1197
1196 1198 if ((err = mac_datapath_setup(mcip, sub_flow, SRST_FLOW)) != 0)
1197 1199 return (err);
1198 1200
1199 1201 sub_flow->fe_mcip = mcip;
1200 1202
1201 1203 return (0);
1202 1204 }
1203 1205
1204 1206 /*
1205 1207 * mac_link_flow_add()
1206 1208	 * Used by flowadm(1M) or kernel mac clients for creating flows.
1207 1209 */
1208 1210 int
1209 1211 mac_link_flow_add(datalink_id_t linkid, char *flow_name,
1210 1212 flow_desc_t *flow_desc, mac_resource_props_t *mrp)
1211 1213 {
1212 1214 flow_entry_t *flent = NULL;
1213 1215 int err;
1214 1216 dls_dl_handle_t dlh;
1215 1217 dls_link_t *dlp;
1216 1218 boolean_t link_held = B_FALSE;
1217 1219 boolean_t hash_added = B_FALSE;
1218 1220 mac_perim_handle_t mph;
1219 1221
1220 1222 err = mac_flow_lookup_byname(flow_name, &flent);
1221 1223 if (err == 0) {
1222 1224 FLOW_USER_REFRELE(flent);
1223 1225 return (EEXIST);
1224 1226 }
1225 1227
1226 1228 /*
1227 1229 * First create a flow entry given the description provided
1228 1230 * by the caller.
1229 1231 */
1230 1232 err = mac_flow_create(flow_desc, mrp, flow_name, NULL,
1231 1233 FLOW_USER | FLOW_OTHER, &flent);
1232 1234
1233 1235 if (err != 0)
1234 1236 return (err);
1235 1237
1236 1238 /*
1237 1239 * We've got a local variable referencing this flow now, so we need
1238 1240 * to hold it. We'll release this flow before returning.
1239 1241	 * All failures until we return will undo any action that may have
1240 1242	 * internally held the flow, so the last REFRELE will ensure a clean freeing
1241 1243 * of resources.
1242 1244 */
1243 1245 FLOW_REFHOLD(flent);
1244 1246
1245 1247 flent->fe_link_id = linkid;
1246 1248 FLOW_MARK(flent, FE_INCIPIENT);
1247 1249
1248 1250 err = mac_perim_enter_by_linkid(linkid, &mph);
1249 1251 if (err != 0) {
1250 1252 FLOW_FINAL_REFRELE(flent);
1251 1253 return (err);
1252 1254 }
1253 1255
1254 1256 /*
1255 1257 * dls will eventually be merged with mac so it's ok
1256 1258 * to call dls' internal functions.
1257 1259 */
1258 1260 err = dls_devnet_hold_link(linkid, &dlh, &dlp);
1259 1261 if (err != 0)
1260 1262 goto bail;
1261 1263
1262 1264 link_held = B_TRUE;
1263 1265
1264 1266 /*
1265 1267	 * Add the flow to the global flow table; this table will be per
1266 1268	 * exclusive zone so each zone can have its own flow namespace.
1267 1269 * RFE 6625651 will fix this.
1268 1270 *
1269 1271 */
1270 1272 if ((err = mac_flow_hash_add(flent)) != 0)
1271 1273 goto bail;
1272 1274
1273 1275 hash_added = B_TRUE;
1274 1276
1275 1277 /*
1276 1278 * do not allow flows to be configured on an anchor VNIC
1277 1279 */
1278 1280 if (mac_capab_get(dlp->dl_mh, MAC_CAPAB_ANCHOR_VNIC, NULL)) {
1279 1281 err = ENOTSUP;
1280 1282 goto bail;
1281 1283 }
1282 1284
1283 1285 /*
1284 1286 * Add the subflow to the subflow table. Also instantiate the flow
1285 1287 * in the mac if there is an active user (we check if the MAC client's
1286 1288 * datapath has been setup).
1287 1289 */
1288 1290 err = mac_flow_add_subflow(dlp->dl_mch, flent,
1289 1291 MCIP_DATAPATH_SETUP((mac_client_impl_t *)dlp->dl_mch));
1290 1292 if (err != 0)
1291 1293 goto bail;
1292 1294
1293 1295 FLOW_UNMARK(flent, FE_INCIPIENT);
1294 1296 dls_devnet_rele_link(dlh, dlp);
1295 1297 mac_perim_exit(mph);
1296 1298 return (0);
1297 1299
1298 1300 bail:
1299 1301 if (hash_added)
1300 1302 mac_flow_hash_remove(flent);
1301 1303
1302 1304 if (link_held)
1303 1305 dls_devnet_rele_link(dlh, dlp);
1304 1306
1305 1307 /*
1306 1308 * Wait for any transient global flow hash refs to clear
1307 1309 * and then release the creation reference on the flow
1308 1310 */
1309 1311 mac_flow_wait(flent, FLOW_USER_REF);
1310 1312 FLOW_FINAL_REFRELE(flent);
1311 1313 mac_perim_exit(mph);
1312 1314 return (err);
1313 1315 }
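This is the kernel entry point behind flowadm(1M) add-flow. An illustrative invocation that reaches it (link and flow names are examples):

    # flowadm add-flow -l net0 -a transport=tcp,local_port=443 https-flow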
1314 1316
1315 1317 /*
1316 1318 * mac_link_flow_clean()
1317 1319 * Internal flow interface used for freeing SRSs and related
1318 1320 * data structures. Not meant to be used by mac clients.
1319 1321 */
1320 1322 void
1321 1323 mac_link_flow_clean(mac_client_handle_t mch, flow_entry_t *sub_flow)
1322 1324 {
1323 - mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1325 + mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1324 1326 mac_impl_t *mip = mcip->mci_mip;
1325 1327 boolean_t last_subflow;
1326 1328
1327 1329 ASSERT(mch != NULL);
1328 1330 ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1329 1331
1330 1332 /*
1331 1333 * This sub flow entry may fail to be fully initialized by
1332 1334 * mac_link_flow_init(). If so, simply return.
1333 1335 */
1334 1336 if (sub_flow->fe_mcip == NULL)
1335 1337 return;
1336 1338
1337 1339 last_subflow = FLOW_TAB_EMPTY(mcip->mci_subflow_tab);
1338 1340 /*
1339 1341 * Tear down the data path
1340 1342 */
1341 1343 mac_datapath_teardown(mcip, sub_flow, SRST_FLOW);
1342 1344 sub_flow->fe_mcip = NULL;
1343 1345
1344 1346 /*
1345 1347 * Delete the SRSs associated with this subflow. If this is being
1346 1348 * driven by flowadm(1M) then the subflow will be deleted by
1347 1349 * dls_rem_flow. However if this is a result of the interface being
1348 1350 * unplumbed then the subflow itself won't be deleted.
1349 1351 */
1350 1352 mac_flow_cleanup(sub_flow);
1351 1353
1352 1354 /*
1353 1355	 * If all the subflows are gone, re-enable some of the functionality
1354 1356	 * we disabled when adding a subflow, such as polling.
1355 1357 */
1356 1358 if (last_subflow) {
1357 1359 /*
1358 1360 * The subflow table itself is not protected by any locks or
1359 1361 * refcnts. Hence quiesce the client upfront before clearing
1360 1362 * mci_subflow_tab.
1361 1363 */
1362 1364 mac_client_quiesce(mcip);
1363 1365 mac_client_update_classifier(mcip, B_FALSE);
1364 1366 mac_flow_tab_destroy(mcip->mci_subflow_tab);
1365 1367 mcip->mci_subflow_tab = NULL;
1366 1368 mac_client_restart(mcip);
1367 1369 }
1368 1370 }
1369 1371
1370 1372 /*
1371 1373 * mac_link_flow_remove()
1372 1374	 * Used by flowadm(1M) or kernel mac clients for removing flows.
1373 1375 */
1374 1376 int
1375 1377 mac_link_flow_remove(char *flow_name)
1376 1378 {
1377 1379 flow_entry_t *flent;
1378 1380 mac_perim_handle_t mph;
1379 1381 int err;
1380 1382 datalink_id_t linkid;
1381 1383
1382 1384 err = mac_flow_lookup_byname(flow_name, &flent);
1383 1385 if (err != 0)
1384 1386 return (err);
1385 1387
1386 1388 linkid = flent->fe_link_id;
1387 1389 FLOW_USER_REFRELE(flent);
1388 1390
1389 1391 /*
1390 1392 * The perim must be acquired before acquiring any other references
1391 1393 * to maintain the lock and perimeter hierarchy. Please note the
1392 1394 * FLOW_REFRELE above.
1393 1395 */
1394 1396 err = mac_perim_enter_by_linkid(linkid, &mph);
1395 1397 if (err != 0)
1396 1398 return (err);
1397 1399
1398 1400 /*
1399 1401 * Note the second lookup of the flow, because a concurrent thread
1400 1402 * may have removed it already while we were waiting to enter the
1401 1403 * link's perimeter.
1402 1404 */
1403 1405 err = mac_flow_lookup_byname(flow_name, &flent);
1404 1406 if (err != 0) {
1405 1407 mac_perim_exit(mph);
1406 1408 return (err);
1407 1409 }
1408 1410 FLOW_USER_REFRELE(flent);
1409 1411
1410 1412 /*
1411 1413 * Remove the flow from the subflow table and deactivate the flow
1412 1414	 * by quiescing and removing its SRSs.
1413 1415 */
1414 1416 mac_flow_rem_subflow(flent);
1415 1417
1416 1418 /*
1417 1419 * Finally, remove the flow from the global table.
1418 1420 */
1419 1421 mac_flow_hash_remove(flent);
1420 1422
1421 1423 /*
1422 1424 * Wait for any transient global flow hash refs to clear
1423 1425 * and then release the creation reference on the flow
1424 1426 */
1425 1427 mac_flow_wait(flent, FLOW_USER_REF);
1426 1428 FLOW_FINAL_REFRELE(flent);
1427 1429
1428 1430 mac_perim_exit(mph);
1429 1431
1430 1432 return (0);
1431 1433 }
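The corresponding administrative command that drives this path (flow name illustrative):

    # flowadm remove-flow https-flow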
1432 1434
1433 1435 /*
1434 1436 * mac_link_flow_modify()
1435 1437 * Modifies the properties of a flow identified by its name.
1436 1438 */
1437 1439 int
1438 1440 mac_link_flow_modify(char *flow_name, mac_resource_props_t *mrp)
1439 1441 {
1440 1442 flow_entry_t *flent;
1441 - mac_client_impl_t *mcip;
1443 + mac_client_impl_t *mcip;
1442 1444 int err = 0;
1443 1445 mac_perim_handle_t mph;
1444 1446 datalink_id_t linkid;
1445 1447 flow_tab_t *flow_tab;
1446 1448
1447 1449 err = mac_validate_props(NULL, mrp);
1448 1450 if (err != 0)
1449 1451 return (err);
1450 1452
1451 1453 err = mac_flow_lookup_byname(flow_name, &flent);
1452 1454 if (err != 0)
1453 1455 return (err);
1454 1456
1455 1457 linkid = flent->fe_link_id;
1456 1458 FLOW_USER_REFRELE(flent);
1457 1459
1458 1460 /*
1459 1461 * The perim must be acquired before acquiring any other references
1460 1462 * to maintain the lock and perimeter hierarchy. Please note the
1461 1463 * FLOW_REFRELE above.
1462 1464 */
1463 1465 err = mac_perim_enter_by_linkid(linkid, &mph);
1464 1466 if (err != 0)
1465 1467 return (err);
1466 1468
1467 1469 /*
1468 1470 * Note the second lookup of the flow, because a concurrent thread
1469 1471 * may have removed it already while we were waiting to enter the
1470 1472 * link's perimeter.
1471 1473 */
1472 1474 err = mac_flow_lookup_byname(flow_name, &flent);
1473 1475 if (err != 0) {
1474 1476 mac_perim_exit(mph);
1475 1477 return (err);
1476 1478 }
1477 1479 FLOW_USER_REFRELE(flent);
1478 1480
1479 1481 /*
1480 1482 * If this flow is attached to a MAC client, then pass the request
1481 1483 * along to the client.
1482 1484 * Otherwise, just update the cached values.
1483 1485 */
1484 1486 mcip = flent->fe_mcip;
1485 1487 mac_update_resources(mrp, &flent->fe_resource_props, B_TRUE);
1486 1488 if (mcip != NULL) {
1487 1489 if ((flow_tab = mcip->mci_subflow_tab) == NULL) {
1488 1490 err = ENOENT;
1489 1491 } else {
1490 1492 mac_flow_modify(flow_tab, flent, mrp);
1491 1493 }
1492 1494 } else {
1493 1495 (void) mac_flow_modify_props(flent, mrp);
1494 1496 }
1495 1497
1496 1498 done:
1497 1499 mac_perim_exit(mph);
1498 1500 return (err);
1499 1501 }
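This is the path behind flowadm(1M) set-flowprop; an illustrative invocation (property value and flow name are examples):

    # flowadm set-flowprop -p maxbw=50M https-flow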
1500 1502
1501 1503
1502 1504 /*
1503 1505 * State structure and misc functions used by mac_link_flow_walk().
1504 1506 */
1505 1507 typedef struct {
1506 1508 int (*ws_func)(mac_flowinfo_t *, void *);
1507 1509 void *ws_arg;
1508 1510 } flow_walk_state_t;
1509 1511
1510 1512 static void
1511 1513 mac_link_flowinfo_copy(mac_flowinfo_t *finfop, flow_entry_t *flent)
1512 1514 {
1513 1515 (void) strlcpy(finfop->fi_flow_name, flent->fe_flow_name,
1514 1516 MAXFLOWNAMELEN);
1515 1517 finfop->fi_link_id = flent->fe_link_id;
1516 1518 finfop->fi_flow_desc = flent->fe_flow_desc;
1517 1519 finfop->fi_resource_props = flent->fe_resource_props;
1518 1520 }
1519 1521
1520 1522 static int
1521 1523 mac_link_flow_walk_cb(flow_entry_t *flent, void *arg)
1522 1524 {
1523 1525 flow_walk_state_t *statep = arg;
1524 1526 mac_flowinfo_t *finfo;
1525 1527 int err;
1526 1528
1527 1529 finfo = kmem_zalloc(sizeof (*finfo), KM_SLEEP);
1528 1530 mac_link_flowinfo_copy(finfo, flent);
1529 1531 err = statep->ws_func(finfo, statep->ws_arg);
1530 1532 kmem_free(finfo, sizeof (*finfo));
1531 1533 return (err);
1532 1534 }
1533 1535
1534 1536 /*
1535 1537 * mac_link_flow_walk()
1536 1538 * Invokes callback 'func' for all flows belonging to the specified link.
1537 1539 */
1538 1540 int
1539 1541 mac_link_flow_walk(datalink_id_t linkid,
1540 1542 int (*func)(mac_flowinfo_t *, void *), void *arg)
1541 1543 {
1542 1544 mac_client_impl_t *mcip;
1543 1545 mac_perim_handle_t mph;
1544 1546 flow_walk_state_t state;
1545 1547 dls_dl_handle_t dlh;
1546 1548 dls_link_t *dlp;
1547 1549 int err;
1548 1550
1549 1551 err = mac_perim_enter_by_linkid(linkid, &mph);
1550 1552 if (err != 0)
1551 1553 return (err);
1552 1554
1553 1555 err = dls_devnet_hold_link(linkid, &dlh, &dlp);
1554 1556 if (err != 0) {
1555 1557 mac_perim_exit(mph);
1556 1558 return (err);
1557 1559 }
1558 1560
1559 1561 mcip = (mac_client_impl_t *)dlp->dl_mch;
1560 1562 state.ws_func = func;
1561 1563 state.ws_arg = arg;
1562 1564
1563 1565 err = mac_flow_walk_nolock(mcip->mci_subflow_tab,
1564 1566 mac_link_flow_walk_cb, &state);
1565 1567
1566 1568 dls_devnet_rele_link(dlh, dlp);
1567 1569 mac_perim_exit(mph);
1568 1570 return (err);
1569 1571 }
1570 1572
1571 1573 /*
1572 1574 * mac_link_flow_info()
1573 1575 * Retrieves information about a specific flow.
1574 1576 */
1575 1577 int
1576 1578 mac_link_flow_info(char *flow_name, mac_flowinfo_t *finfo)
1577 1579 {
1578 1580 flow_entry_t *flent;
1579 1581 int err;
1580 1582
1581 1583 err = mac_flow_lookup_byname(flow_name, &flent);
1582 1584 if (err != 0)
1583 1585 return (err);
1584 1586
1585 1587 mac_link_flowinfo_copy(finfo, flent);
1586 1588 FLOW_USER_REFRELE(flent);
1587 1589 return (0);
1588 1590 }
1589 1591
1590 1592 /*
1591 1593 * Hash function macro that takes an Ethernet address and VLAN id as input.
1592 1594 */
1593 1595 #define HASH_ETHER_VID(a, v, s) \
1594 1596 ((((uint32_t)(a)[3] + (a)[4] + (a)[5]) ^ (v)) % (s))
1595 1597
1596 1598 /*
1597 1599 * Generic layer-2 address hashing function that takes an address and address
1598 1600 * length as input. This is the DJB hash function.
1599 1601 */
1600 1602 static uint32_t
1601 1603 flow_l2_addrhash(uint8_t *addr, size_t addrlen, size_t htsize)
1602 1604 {
1603 1605 uint32_t hash = 5381;
1604 1606 size_t i;
1605 1607
1606 1608 for (i = 0; i < addrlen; i++)
1607 1609 hash = ((hash << 5) + hash) + addr[i];
1608 1610 return (hash % htsize);
1609 1611 }
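To make the hashing above concrete, here is a standalone user-level sketch (not kernel code) that computes both the DJB index and the HASH_ETHER_VID index for an example destination MAC over a 1024-bucket table:

    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    int
    main(void)
    {
            uint8_t a[6] = { 0x00, 0x0c, 0x29, 0x3e, 0x4f, 0x50 };
            uint32_t hash = 5381;
            size_t i;

            /* DJB: hash = hash * 33 + byte, then fold into the table size */
            for (i = 0; i < sizeof (a); i++)
                    hash = ((hash << 5) + hash) + a[i];
            (void) printf("djb index = %u\n", hash % 1024);

            /* HASH_ETHER_VID: sum of the last three octets, xor'ed with VID 0 */
            (void) printf("ether/vid index = %u\n",
                (((uint32_t)a[3] + a[4] + a[5]) ^ 0) % 1024);
            return (0);
    }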
1610 1612
1611 1613 #define PKT_TOO_SMALL(s, end) ((s)->fs_mp->b_wptr < (end))
1612 1614
1613 1615 #define CHECK_AND_ADJUST_START_PTR(s, start) { \
1614 1616 if ((s)->fs_mp->b_wptr == (start)) { \
1615 1617 mblk_t *next = (s)->fs_mp->b_cont; \
1616 1618 if (next == NULL) \
1617 1619 return (EINVAL); \
1618 1620 \
1619 1621 (s)->fs_mp = next; \
1620 1622 (start) = next->b_rptr; \
1621 1623 } \
1622 1624 }
1623 1625
1624 1626 /* ARGSUSED */
1625 1627 static boolean_t
1626 1628 flow_l2_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1627 1629 {
1628 1630 flow_l2info_t *l2 = &s->fs_l2info;
1629 1631 flow_desc_t *fd = &flent->fe_flow_desc;
1630 1632
1631 1633 return (l2->l2_vid == fd->fd_vid &&
1632 1634 bcmp(l2->l2_daddr, fd->fd_dst_mac, fd->fd_mac_len) == 0);
1633 1635 }
1634 1636
1635 1637 /*
1636 1638 * Layer 2 hash function.
1637 1639 * Must be paired with flow_l2_accept() within a set of flow_ops
1638 1640 * because it assumes the dest address is already extracted.
1639 1641 */
1640 1642 static uint32_t
1641 1643 flow_l2_hash(flow_tab_t *ft, flow_state_t *s)
1642 1644 {
1643 1645 return (flow_l2_addrhash(s->fs_l2info.l2_daddr,
1644 1646 ft->ft_mip->mi_type->mt_addr_length, ft->ft_size));
1645 1647 }
1646 1648
1647 1649 /*
1648 1650 * This is the generic layer 2 accept function.
1649 1651 * It makes use of mac_header_info() to extract the header length,
1650 1652 * sap, vlan ID and destination address.
1651 1653 */
1652 1654 static int
1653 1655 flow_l2_accept(flow_tab_t *ft, flow_state_t *s)
1654 1656 {
1655 1657 boolean_t is_ether;
1656 1658 flow_l2info_t *l2 = &s->fs_l2info;
1657 1659 mac_header_info_t mhi;
1658 1660 int err;
1659 1661
1660 1662 is_ether = (ft->ft_mip->mi_info.mi_nativemedia == DL_ETHER);
1661 1663 if ((err = mac_header_info((mac_handle_t)ft->ft_mip,
1662 1664 s->fs_mp, &mhi)) != 0) {
1663 1665 if (err == EINVAL)
1664 1666 err = ENOBUFS;
1665 1667
1666 1668 return (err);
1667 1669 }
1668 1670
1669 1671 l2->l2_start = s->fs_mp->b_rptr;
1670 1672 l2->l2_daddr = (uint8_t *)mhi.mhi_daddr;
1671 1673
1672 1674 if (is_ether && mhi.mhi_bindsap == ETHERTYPE_VLAN &&
1673 1675 ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
1674 1676 struct ether_vlan_header *evhp =
1675 1677 (struct ether_vlan_header *)l2->l2_start;
1676 1678
1677 1679 if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
1678 1680 return (ENOBUFS);
1679 1681
1680 1682 l2->l2_sap = ntohs(evhp->ether_type);
1681 1683 l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
1682 1684 l2->l2_hdrsize = sizeof (*evhp);
1683 1685 } else {
1684 1686 l2->l2_sap = mhi.mhi_bindsap;
1685 1687 l2->l2_vid = 0;
1686 1688 l2->l2_hdrsize = (uint32_t)mhi.mhi_hdrsize;
1687 1689 }
1688 1690 return (0);
1689 1691 }
1690 1692
1691 1693 /*
1692 1694 * flow_ether_hash()/accept() are optimized versions of flow_l2_hash()/
1693 1695 * accept(). The notable difference is that dest address is now extracted
1694 1696 * by hash() rather than by accept(). This saves a few memory references
1695 1697 * for flow tables that do not care about mac addresses.
1696 1698 */
1697 1699 static uint32_t
1698 1700 flow_ether_hash(flow_tab_t *ft, flow_state_t *s)
1699 1701 {
1700 1702 flow_l2info_t *l2 = &s->fs_l2info;
1701 1703 struct ether_vlan_header *evhp;
1702 1704
1703 1705 evhp = (struct ether_vlan_header *)l2->l2_start;
1704 1706 l2->l2_daddr = evhp->ether_dhost.ether_addr_octet;
1705 1707 return (HASH_ETHER_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size));
1706 1708 }
1707 1709
1708 1710 static uint32_t
1709 1711 flow_ether_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
1710 1712 {
1711 1713 flow_desc_t *fd = &flent->fe_flow_desc;
1712 1714
1713 1715 ASSERT((fd->fd_mask & FLOW_LINK_VID) != 0 || fd->fd_vid == 0);
1714 1716 return (HASH_ETHER_VID(fd->fd_dst_mac, fd->fd_vid, ft->ft_size));
1715 1717 }
1716 1718
1717 1719 /* ARGSUSED */
1718 1720 static int
1719 1721 flow_ether_accept(flow_tab_t *ft, flow_state_t *s)
1720 1722 {
1721 1723 flow_l2info_t *l2 = &s->fs_l2info;
1722 1724 struct ether_vlan_header *evhp;
1723 1725 uint16_t sap;
1724 1726
1725 1727 evhp = (struct ether_vlan_header *)s->fs_mp->b_rptr;
1726 1728 l2->l2_start = (uchar_t *)evhp;
1727 1729
1728 1730 if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (struct ether_header)))
1729 1731 return (ENOBUFS);
1730 1732
1731 1733 if ((sap = ntohs(evhp->ether_tpid)) == ETHERTYPE_VLAN &&
1732 1734 ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
1733 1735 if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
1734 1736 return (ENOBUFS);
1735 1737
1736 1738 l2->l2_sap = ntohs(evhp->ether_type);
1737 1739 l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
1738 1740 l2->l2_hdrsize = sizeof (struct ether_vlan_header);
1739 1741 } else {
1740 1742 l2->l2_sap = sap;
1741 1743 l2->l2_vid = 0;
1742 1744 l2->l2_hdrsize = sizeof (struct ether_header);
1743 1745 }
1744 1746 return (0);
1745 1747 }
1746 1748
1747 1749 /*
1748 1750 * Validates a layer 2 flow entry.
1749 1751 */
1750 1752 static int
1751 1753 flow_l2_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
1752 1754 {
1753 1755 flow_desc_t *fd = &flent->fe_flow_desc;
1754 1756
1755 1757 /*
1756 1758 * Dest address is mandatory, and 0 length addresses are not yet
1757 1759 * supported.
1758 1760 */
1759 1761 if ((fd->fd_mask & FLOW_LINK_DST) == 0 || fd->fd_mac_len == 0)
1760 1762 return (EINVAL);
1761 1763
1762 1764 if ((fd->fd_mask & FLOW_LINK_VID) != 0) {
1763 1765 /*
1764 1766 * VLAN flows are only supported over ethernet macs.
1765 1767 */
1766 1768 if (ft->ft_mip->mi_info.mi_nativemedia != DL_ETHER)
1767 1769 return (EINVAL);
1768 1770
1769 1771 if (fd->fd_vid == 0)
1770 1772 return (EINVAL);
1771 1773
1772 1774 }
1773 1775 flent->fe_match = flow_l2_match;
1774 1776 return (0);
1775 1777 }
1776 1778
1777 1779 /*
1778 1780 * Calculates hash index of flow entry.
1779 1781 */
1780 1782 static uint32_t
1781 1783 flow_l2_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
1782 1784 {
1783 1785 flow_desc_t *fd = &flent->fe_flow_desc;
1784 1786
1785 1787 ASSERT((fd->fd_mask & FLOW_LINK_VID) == 0 && fd->fd_vid == 0);
1786 1788 return (flow_l2_addrhash(fd->fd_dst_mac,
1787 1789 ft->ft_mip->mi_type->mt_addr_length, ft->ft_size));
1788 1790 }
1789 1791
1790 1792 /*
1791 1793 * This is used for duplicate flow checking.
1792 1794 */
1793 1795 /* ARGSUSED */
1794 1796 static boolean_t
1795 1797 flow_l2_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
1796 1798 {
1797 1799 flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
1798 1800
1799 1801 ASSERT(fd1->fd_mac_len == fd2->fd_mac_len && fd1->fd_mac_len != 0);
1800 1802 return (bcmp(&fd1->fd_dst_mac, &fd2->fd_dst_mac,
1801 1803 fd1->fd_mac_len) == 0 && fd1->fd_vid == fd2->fd_vid);
1802 1804 }
1803 1805
1804 1806 /*
1805 1807 * Generic flow entry insertion function.
1806 1808 * Used by flow tables that do not have ordering requirements.
1807 1809 */
1808 1810 /* ARGSUSED */
1809 1811 static int
1810 1812 flow_generic_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
1811 1813 flow_entry_t *flent)
1812 1814 {
1813 1815 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
1814 1816
1815 1817 if (*headp != NULL) {
1816 1818 ASSERT(flent->fe_next == NULL);
1817 1819 flent->fe_next = *headp;
1818 1820 }
1819 1821 *headp = flent;
1820 1822 return (0);
1821 1823 }
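
flow_generic_insert_fe() is plain head insertion into a singly linked bucket list; that is why it needs no ordering logic at all. The same pattern in miniature, with illustrative names:

#include <assert.h>
#include <stddef.h>

typedef struct entry {
	struct entry	*e_next;
	int		e_key;
} entry_t;

/* Push a detached entry onto the head of the bucket list. */
static void
insert_head(entry_t **headp, entry_t *ep)
{
	assert(ep->e_next == NULL);	/* must not already be linked */
	ep->e_next = *headp;		/* NULL-safe on an empty list */
	*headp = ep;
}
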
1822 1824
1823 1825 /*
1824 1826 * IP version independent DSField matching function.
1825 1827 */
1826 1828 /* ARGSUSED */
1827 1829 static boolean_t
1828 1830 flow_ip_dsfield_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1829 1831 {
1830 1832 flow_l3info_t *l3info = &s->fs_l3info;
1831 1833 flow_desc_t *fd = &flent->fe_flow_desc;
1832 1834
1833 1835 switch (l3info->l3_version) {
1834 1836 case IPV4_VERSION: {
1835 1837 ipha_t *ipha = (ipha_t *)l3info->l3_start;
1836 1838
1837 1839 return ((ipha->ipha_type_of_service &
1838 1840 fd->fd_dsfield_mask) == fd->fd_dsfield);
1839 1841 }
1840 1842 case IPV6_VERSION: {
1841 1843 ip6_t *ip6h = (ip6_t *)l3info->l3_start;
1842 1844
1843 1845 return ((IPV6_FLOW_TCLASS(ip6h->ip6_vcf) &
1844 1846 fd->fd_dsfield_mask) == fd->fd_dsfield);
1845 1847 }
1846 1848 default:
1847 1849 return (B_FALSE);
1848 1850 }
1849 1851 }
1850 1852
1851 1853 /*
1852 1854 * IP v4 and v6 address matching.
1853 1855  * The netmask only needs to be applied to the packet address, not to the
1854 1856  * flow_desc, since fd_local_addr/fd_remote_addr are premasked subnets.
1855 1857 */
1856 1858
1857 1859 /* ARGSUSED */
1858 1860 static boolean_t
1859 1861 flow_ip_v4_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1860 1862 {
1861 1863 flow_l3info_t *l3info = &s->fs_l3info;
1862 1864 flow_desc_t *fd = &flent->fe_flow_desc;
1863 1865 ipha_t *ipha = (ipha_t *)l3info->l3_start;
1864 1866 in_addr_t addr;
1865 1867
1866 1868 addr = (l3info->l3_dst_or_src ? ipha->ipha_dst : ipha->ipha_src);
1867 1869 if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
1868 1870 return ((addr & V4_PART_OF_V6(fd->fd_local_netmask)) ==
1869 1871 V4_PART_OF_V6(fd->fd_local_addr));
1870 1872 }
1871 1873 return ((addr & V4_PART_OF_V6(fd->fd_remote_netmask)) ==
1872 1874 V4_PART_OF_V6(fd->fd_remote_addr));
1873 1875 }
1874 1876
1875 1877 /* ARGSUSED */
1876 1878 static boolean_t
1877 1879 flow_ip_v6_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1878 1880 {
1879 1881 flow_l3info_t *l3info = &s->fs_l3info;
1880 1882 flow_desc_t *fd = &flent->fe_flow_desc;
1881 1883 ip6_t *ip6h = (ip6_t *)l3info->l3_start;
1882 1884 in6_addr_t *addrp;
1883 1885
1884 1886 addrp = (l3info->l3_dst_or_src ? &ip6h->ip6_dst : &ip6h->ip6_src);
1885 1887 if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
1886 1888 return (V6_MASK_EQ(*addrp, fd->fd_local_netmask,
1887 1889 fd->fd_local_addr));
1888 1890 }
1889 1891 return (V6_MASK_EQ(*addrp, fd->fd_remote_netmask, fd->fd_remote_addr));
1890 1892 }
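
Because flow_ip_accept_fe() below premasks the descriptor address with V6_MASK_COPY(), both matchers above reduce to one AND plus one compare against the packet address. A minimal sketch of the v4 case, with illustrative names:

#include <stdbool.h>
#include <stdint.h>

/*
 * The descriptor side stores subnet = addr & mask once, at accept
 * time, so the per-packet match applies the mask to the packet only.
 */
static bool
v4_subnet_match(uint32_t pkt_addr, uint32_t subnet, uint32_t mask)
{
	return ((pkt_addr & mask) == subnet);
}
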
1891 1893
1892 1894 /* ARGSUSED */
1893 1895 static boolean_t
1894 1896 flow_ip_proto_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1895 1897 {
1896 1898 flow_l3info_t *l3info = &s->fs_l3info;
1897 1899 flow_desc_t *fd = &flent->fe_flow_desc;
1898 1900
1899 1901 return (l3info->l3_protocol == fd->fd_protocol);
1900 1902 }
1901 1903
1902 1904 static uint32_t
1903 1905 flow_ip_hash(flow_tab_t *ft, flow_state_t *s)
1904 1906 {
1905 1907 flow_l3info_t *l3info = &s->fs_l3info;
1906 1908 flow_mask_t mask = ft->ft_mask;
1907 1909
1908 1910 if ((mask & FLOW_IP_LOCAL) != 0) {
1909 1911 l3info->l3_dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
1910 1912 } else if ((mask & FLOW_IP_REMOTE) != 0) {
1911 1913 l3info->l3_dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
1912 1914 } else if ((mask & FLOW_IP_DSFIELD) != 0) {
1913 1915 /*
1914 1916 * DSField flents are arranged as a single list.
1915 1917 */
1916 1918 return (0);
1917 1919 }
1918 1920 /*
1919 1921 * IP addr flents are hashed into two lists, v4 or v6.
1920 1922 */
1921 1923 ASSERT(ft->ft_size >= 2);
1922 1924 return ((l3info->l3_version == IPV4_VERSION) ? 0 : 1);
1923 1925 }
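
The l3_dst_or_src assignment above encodes a small truth table: a local-address flow compares the packet's destination on inbound traffic and its source on outbound traffic, while a remote-address flow does the opposite. Just that selection, as a sketch with hypothetical names:

#include <stdbool.h>

/* Which packet address to compare: true = destination, false = source. */
static bool
pick_dst(bool match_local, bool inbound)
{
	/* Local flows: dst inbound, src outbound; remote flows: inverse. */
	return (match_local ? inbound : !inbound);
}
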
1924 1926
1925 1927 static uint32_t
1926 1928 flow_ip_proto_hash(flow_tab_t *ft, flow_state_t *s)
1927 1929 {
1928 1930 flow_l3info_t *l3info = &s->fs_l3info;
1929 1931
1930 1932 return (l3info->l3_protocol % ft->ft_size);
1931 1933 }
1932 1934
1933 1935 /* ARGSUSED */
1934 1936 static int
1935 1937 flow_ip_accept(flow_tab_t *ft, flow_state_t *s)
1936 1938 {
1937 1939 flow_l2info_t *l2info = &s->fs_l2info;
1938 1940 flow_l3info_t *l3info = &s->fs_l3info;
1939 1941 uint16_t sap = l2info->l2_sap;
1940 1942 uchar_t *l3_start;
1941 1943
1942 1944 l3_start = l2info->l2_start + l2info->l2_hdrsize;
1943 1945
1944 1946 /*
1945 1947 * Adjust start pointer if we're at the end of an mblk.
1946 1948 */
1947 1949 CHECK_AND_ADJUST_START_PTR(s, l3_start);
1948 1950
1949 1951 l3info->l3_start = l3_start;
1950 1952 if (!OK_32PTR(l3_start))
1951 1953 return (EINVAL);
1952 1954
1953 1955 switch (sap) {
1954 1956 case ETHERTYPE_IP: {
1955 1957 ipha_t *ipha = (ipha_t *)l3_start;
1956 1958
1959 + if (IPH_HDR_VERSION(ipha) != IPV4_VERSION)
1960 + return (EINVAL);
1957 1961 if (PKT_TOO_SMALL(s, l3_start + IP_SIMPLE_HDR_LENGTH))
1958 1962 return (ENOBUFS);
1959 1963
1960 1964 l3info->l3_hdrsize = IPH_HDR_LENGTH(ipha);
1961 1965 l3info->l3_protocol = ipha->ipha_protocol;
1962 1966 l3info->l3_version = IPV4_VERSION;
1963 1967 l3info->l3_fragmented =
1964 1968 IS_V4_FRAGMENT(ipha->ipha_fragment_offset_and_flags);
1965 1969 break;
1966 1970 }
1967 1971 case ETHERTYPE_IPV6: {
1968 1972 ip6_t *ip6h = (ip6_t *)l3_start;
1969 1973 ip6_frag_t *frag = NULL;
1970 1974 uint16_t ip6_hdrlen;
1971 1975 uint8_t nexthdr;
1976 + int errno;
1972 1977
1973 - if (!mac_ip_hdr_length_v6(ip6h, s->fs_mp->b_wptr, &ip6_hdrlen,
1974 - &nexthdr, &frag)) {
1975 - return (ENOBUFS);
1976 - }
1978 + errno = mac_ip_hdr_length_v6(ip6h, s->fs_mp->b_wptr,
1979 + &ip6_hdrlen, &nexthdr, &frag);
1980 + /*
1981 + * ENOBUFS is not ENOSPC, but the semantics are the
1982 + * same for this caller.
1983 + */
1984 + if (errno != 0)
1985 + return (errno == ENOSPC ? ENOBUFS : errno);
1977 1986 l3info->l3_hdrsize = ip6_hdrlen;
1978 1987 l3info->l3_protocol = nexthdr;
1979 1988 l3info->l3_version = IPV6_VERSION;
1980 1989 l3info->l3_fragmented = (frag != NULL);
1981 1990 break;
1982 1991 }
1983 1992 default:
1984 1993 return (EINVAL);
1985 1994 }
1986 1995 return (0);
1987 1996 }
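
The IPH_HDR_VERSION() check added to the ETHERTYPE_IP arm is the heart of this change: the link-layer SAP may claim IPv4 while the header is actually something else, and the version nibble (the high four bits of the first header byte, in both IPv4 and IPv6) is the cheap way to cross-check before reading any IPv4-specific field. On the v6 arm the equivalent validation happens inside mac_ip_hdr_length_v6(), whose errno-style return this diff adopts. A user-space sketch of the cross-check, generalized to both directions for illustration; the helper names are hypothetical:

#include <errno.h>
#include <stdint.h>

#define ETHERTYPE_IP	0x0800
#define ETHERTYPE_IPV6	0x86dd

/* IPv4 and IPv6 both keep the version in the top nibble of byte 0. */
static int
ip_hdr_version(const uint8_t *l3_start)
{
	return (l3_start[0] >> 4);
}

/* Reject packets whose L2 SAP and L3 version nibble disagree. */
static int
check_sap_matches_header(uint16_t sap, const uint8_t *l3_start)
{
	if (sap == ETHERTYPE_IP && ip_hdr_version(l3_start) != 4)
		return (EINVAL);
	if (sap == ETHERTYPE_IPV6 && ip_hdr_version(l3_start) != 6)
		return (EINVAL);
	return (0);
}
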
1988 1997
1989 1998 /* ARGSUSED */
1990 1999 static int
1991 2000 flow_ip_proto_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
1992 2001 {
1993 2002 flow_desc_t *fd = &flent->fe_flow_desc;
1994 2003
1995 2004 switch (fd->fd_protocol) {
1996 2005 case IPPROTO_TCP:
1997 2006 case IPPROTO_UDP:
1998 2007 case IPPROTO_SCTP:
1999 2008 case IPPROTO_ICMP:
2000 2009 case IPPROTO_ICMPV6:
2001 2010 flent->fe_match = flow_ip_proto_match;
2002 2011 return (0);
2003 2012 default:
2004 2013 return (EINVAL);
2005 2014 }
2006 2015 }
2007 2016
2008 2017 /* ARGSUSED */
2009 2018 static int
2010 2019 flow_ip_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
2011 2020 {
2012 2021 flow_desc_t *fd = &flent->fe_flow_desc;
2013 2022 flow_mask_t mask;
2014 2023 uint8_t version;
2015 2024 in6_addr_t *addr, *netmask;
2016 2025
2017 2026 /*
2018 2027  * DSField does not require an IP version.
2019 2028 */
2020 2029 if (fd->fd_mask == FLOW_IP_DSFIELD) {
2021 2030 if (fd->fd_dsfield_mask == 0)
2022 2031 return (EINVAL);
2023 2032
2024 2033 flent->fe_match = flow_ip_dsfield_match;
2025 2034 return (0);
2026 2035 }
2027 2036
2028 2037 /*
2029 2038 * IP addresses must come with a version to avoid ambiguity.
2030 2039 */
2031 2040 if ((fd->fd_mask & FLOW_IP_VERSION) == 0)
2032 2041 return (EINVAL);
2033 2042
2034 2043 version = fd->fd_ipversion;
2035 2044 if (version != IPV4_VERSION && version != IPV6_VERSION)
2036 2045 return (EINVAL);
2037 2046
2038 2047 mask = fd->fd_mask & ~FLOW_IP_VERSION;
2039 2048 switch (mask) {
2040 2049 case FLOW_IP_LOCAL:
2041 2050 addr = &fd->fd_local_addr;
2042 2051 netmask = &fd->fd_local_netmask;
2043 2052 break;
2044 2053 case FLOW_IP_REMOTE:
2045 2054 addr = &fd->fd_remote_addr;
2046 2055 netmask = &fd->fd_remote_netmask;
2047 2056 break;
2048 2057 default:
2049 2058 return (EINVAL);
2050 2059 }
2051 2060
2052 2061 /*
2053 2062  * Apply the netmask to the specified address.
2054 2063 */
2055 2064 V6_MASK_COPY(*addr, *netmask, *addr);
2056 2065 if (version == IPV4_VERSION) {
2057 2066 ipaddr_t v4addr = V4_PART_OF_V6((*addr));
2058 2067 ipaddr_t v4mask = V4_PART_OF_V6((*netmask));
2059 2068
2060 2069 if (v4addr == 0 || v4mask == 0)
2061 2070 return (EINVAL);
2062 2071 flent->fe_match = flow_ip_v4_match;
2063 2072 } else {
2064 2073 if (IN6_IS_ADDR_UNSPECIFIED(addr) ||
2065 2074 IN6_IS_ADDR_UNSPECIFIED(netmask))
2066 2075 return (EINVAL);
2067 2076 flent->fe_match = flow_ip_v6_match;
2068 2077 }
2069 2078 return (0);
2070 2079 }
2071 2080
2072 2081 static uint32_t
2073 2082 flow_ip_proto_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
2074 2083 {
2075 2084 flow_desc_t *fd = &flent->fe_flow_desc;
2076 2085
2077 2086 return (fd->fd_protocol % ft->ft_size);
2078 2087 }
2079 2088
2080 2089 static uint32_t
2081 2090 flow_ip_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
2082 2091 {
2083 2092 flow_desc_t *fd = &flent->fe_flow_desc;
2084 2093
2085 2094 /*
2086 2095 * DSField flents are arranged as a single list.
2087 2096 */
2088 2097 if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
2089 2098 return (0);
2090 2099
2091 2100 /*
2092 2101 * IP addr flents are hashed into two lists, v4 or v6.
2093 2102 */
2094 2103 ASSERT(ft->ft_size >= 2);
2095 2104 return ((fd->fd_ipversion == IPV4_VERSION) ? 0 : 1);
2096 2105 }
2097 2106
2098 2107 /* ARGSUSED */
2099 2108 static boolean_t
2100 2109 flow_ip_proto_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
2101 2110 {
2102 2111 flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
2103 2112
2104 2113 return (fd1->fd_protocol == fd2->fd_protocol);
2105 2114 }
2106 2115
2107 2116 /* ARGSUSED */
2108 2117 static boolean_t
2109 2118 flow_ip_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
2110 2119 {
2111 2120 flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
2112 2121 in6_addr_t *a1, *m1, *a2, *m2;
2113 2122
2114 2123 ASSERT(fd1->fd_mask == fd2->fd_mask);
2115 2124 if (fd1->fd_mask == FLOW_IP_DSFIELD) {
2116 2125 return (fd1->fd_dsfield == fd2->fd_dsfield &&
2117 2126 fd1->fd_dsfield_mask == fd2->fd_dsfield_mask);
2118 2127 }
2119 2128
2120 2129 /*
2121 2130 * flow_ip_accept_fe() already validated the version.
2122 2131 */
2123 2132 ASSERT((fd1->fd_mask & FLOW_IP_VERSION) != 0);
2124 2133 if (fd1->fd_ipversion != fd2->fd_ipversion)
2125 2134 return (B_FALSE);
2126 2135
2127 2136 switch (fd1->fd_mask & ~FLOW_IP_VERSION) {
2128 2137 case FLOW_IP_LOCAL:
2129 2138 a1 = &fd1->fd_local_addr;
2130 2139 m1 = &fd1->fd_local_netmask;
2131 2140 a2 = &fd2->fd_local_addr;
2132 2141 m2 = &fd2->fd_local_netmask;
2133 2142 break;
2134 2143 case FLOW_IP_REMOTE:
2135 2144 a1 = &fd1->fd_remote_addr;
2136 2145 m1 = &fd1->fd_remote_netmask;
2137 2146 a2 = &fd2->fd_remote_addr;
2138 2147 m2 = &fd2->fd_remote_netmask;
2139 2148 break;
2140 2149 default:
2141 2150 /*
2142 2151 * This is unreachable given the checks in
2143 2152 * flow_ip_accept_fe().
2144 2153 */
2145 2154 return (B_FALSE);
2146 2155 }
2147 2156
2148 2157 if (fd1->fd_ipversion == IPV4_VERSION) {
2149 2158 return (V4_PART_OF_V6((*a1)) == V4_PART_OF_V6((*a2)) &&
2150 2159 V4_PART_OF_V6((*m1)) == V4_PART_OF_V6((*m2)));
2151 2160
2152 2161 } else {
2153 2162 return (IN6_ARE_ADDR_EQUAL(a1, a2) &&
2154 2163 IN6_ARE_ADDR_EQUAL(m1, m2));
2155 2164 }
2156 2165 }
2157 2166
2158 2167 static int
2159 2168 flow_ip_mask2plen(in6_addr_t *v6mask)
2160 2169 {
2161 2170 int bits;
2162 2171 int plen = IPV6_ABITS;
2163 2172 int i;
2164 2173
2165 2174 for (i = 3; i >= 0; i--) {
2166 2175 if (v6mask->s6_addr32[i] == 0) {
2167 2176 plen -= 32;
2168 2177 continue;
2169 2178 }
2170 2179 bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1;
2171 2180 if (bits == 0)
2172 2181 break;
2173 2182 plen -= bits;
2174 2183 }
2175 2184 return (plen);
2176 2185 }
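
flow_ip_mask2plen() walks the mask's four 32-bit words from least to most significant: an all-zero word subtracts 32 bits, a partially-set word subtracts its count of trailing zero bits via ffs(), and the first word that ends in a one bit stops the walk. For a /64 mask, the two low words remove 64 bits and the loop breaks on the first all-ones word. A self-contained restatement that compiles in user space; the in6-like struct is local to the sketch:

#include <arpa/inet.h>	/* ntohl(), htonl() */
#include <stdint.h>
#include <stdio.h>
#include <strings.h>	/* ffs() */

typedef struct {
	uint32_t w[4];	/* stand-in for in6_addr_t's 32-bit words */
} mask6_t;

static int
mask2plen(const mask6_t *m)
{
	int plen = 128;
	int bits, i;

	for (i = 3; i >= 0; i--) {
		if (m->w[i] == 0) {		/* whole word of zeros */
			plen -= 32;
			continue;
		}
		bits = ffs(ntohl(m->w[i])) - 1;	/* trailing zero count */
		if (bits == 0)			/* word ends in a one: done */
			break;
		plen -= bits;
	}
	return (plen);
}

int
main(void)
{
	/* ffff:ffff:ffff:ffff:: is a /64 prefix. */
	mask6_t m = { { htonl(0xffffffff), htonl(0xffffffff), 0, 0 } };

	printf("plen = %d\n", mask2plen(&m));	/* prints plen = 64 */
	return (0);
}
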
2177 2186
2178 2187 /* ARGSUSED */
2179 2188 static int
2180 2189 flow_ip_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
2181 2190 flow_entry_t *flent)
2182 2191 {
2183 2192 flow_entry_t **p = headp;
2184 2193 flow_desc_t *fd0, *fd;
2185 2194 in6_addr_t *m0, *m;
2186 2195 int plen0, plen;
2187 2196
2188 2197 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
2189 2198
2190 2199 /*
2191 2200 * No special ordering needed for dsfield.
2192 2201 */
2193 2202 fd0 = &flent->fe_flow_desc;
2194 2203 if ((fd0->fd_mask & FLOW_IP_DSFIELD) != 0) {
2195 2204 if (*p != NULL) {
2196 2205 ASSERT(flent->fe_next == NULL);
2197 2206 flent->fe_next = *p;
2198 2207 }
2199 2208 *p = flent;
2200 2209 return (0);
2201 2210 }
2202 2211
2203 2212 /*
2204 2213 * IP address flows are arranged in descending prefix length order.
2205 2214 */
2206 2215 m0 = ((fd0->fd_mask & FLOW_IP_LOCAL) != 0) ?
2207 2216 &fd0->fd_local_netmask : &fd0->fd_remote_netmask;
2208 2217 plen0 = flow_ip_mask2plen(m0);
2209 2218 ASSERT(plen0 != 0);
2210 2219
2211 2220 for (; *p != NULL; p = &(*p)->fe_next) {
2212 2221 fd = &(*p)->fe_flow_desc;
2213 2222
2214 2223 /*
2215 2224 * Normally a dsfield flent shouldn't end up on the same
2216 2225 * list as an IP address because flow tables are (for now)
2217 2226 * disjoint. If we decide to support both IP and dsfield
2218 2227 * in the same table in the future, this check will allow
2219 2228 * for that.
2220 2229 */
2221 2230 if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
2222 2231 continue;
2223 2232
2224 2233 /*
2225 2234 * We also allow for the mixing of local and remote address
2226 2235 * flents within one list.
2227 2236 */
2228 2237 m = ((fd->fd_mask & FLOW_IP_LOCAL) != 0) ?
2229 2238 &fd->fd_local_netmask : &fd->fd_remote_netmask;
2230 2239 plen = flow_ip_mask2plen(m);
2231 2240
2232 2241 if (plen <= plen0)
2233 2242 break;
2234 2243 }
2235 2244 if (*p != NULL) {
2236 2245 ASSERT(flent->fe_next == NULL);
2237 2246 flent->fe_next = *p;
2238 2247 }
2239 2248 *p = flent;
2240 2249 return (0);
2241 2250 }
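
Stripped of the dsfield and local/remote bookkeeping, the ordering invariant above is simply "insert before the first entry whose prefix is no longer than ours", keeping each bucket in descending prefix-length order so the most specific subnet matches first. A minimal sketch of that insertion, using integer prefix lengths directly:

#include <stddef.h>

typedef struct pfx {
	struct pfx	*p_next;
	int		p_plen;		/* prefix length, 1..128 */
} pfx_t;

/* Keep the list sorted longest-prefix-first. */
static void
insert_by_plen(pfx_t **headp, pfx_t *np)
{
	pfx_t **p;

	for (p = headp; *p != NULL; p = &(*p)->p_next) {
		if ((*p)->p_plen <= np->p_plen)
			break;		/* insert before shorter/equal */
	}
	np->p_next = *p;		/* NULL at the tail is fine */
	*p = np;
}
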
2242 2251
2243 2252 /*
2244 2253 * Transport layer protocol and port matching functions.
2245 2254 */
2246 2255
2247 2256 /* ARGSUSED */
2248 2257 static boolean_t
2249 2258 flow_transport_lport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
2250 2259 {
2251 2260 flow_l3info_t *l3info = &s->fs_l3info;
2252 2261 flow_l4info_t *l4info = &s->fs_l4info;
2253 2262 flow_desc_t *fd = &flent->fe_flow_desc;
2254 2263
2255 2264 return (fd->fd_protocol == l3info->l3_protocol &&
2256 2265 fd->fd_local_port == l4info->l4_hash_port);
2257 2266 }
2258 2267
2259 2268 /* ARGSUSED */
2260 2269 static boolean_t
2261 2270 flow_transport_rport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
2262 2271 {
2263 2272 flow_l3info_t *l3info = &s->fs_l3info;
2264 2273 flow_l4info_t *l4info = &s->fs_l4info;
2265 2274 flow_desc_t *fd = &flent->fe_flow_desc;
2266 2275
2267 2276 return (fd->fd_protocol == l3info->l3_protocol &&
2268 2277 fd->fd_remote_port == l4info->l4_hash_port);
2269 2278 }
2270 2279
2271 2280 /*
2272 2281 * Transport hash function.
2273 2282 * Since we only support either local or remote port flows,
2274 2283 * we only need to extract one of the ports to be used for
2275 2284 * matching.
2276 2285 */
2277 2286 static uint32_t
2278 2287 flow_transport_hash(flow_tab_t *ft, flow_state_t *s)
2279 2288 {
2280 2289 flow_l3info_t *l3info = &s->fs_l3info;
2281 2290 flow_l4info_t *l4info = &s->fs_l4info;
2282 2291 uint8_t proto = l3info->l3_protocol;
2283 2292 boolean_t dst_or_src;
2284 2293
2285 2294 if ((ft->ft_mask & FLOW_ULP_PORT_LOCAL) != 0) {
2286 2295 dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
2287 2296 } else {
2288 2297 dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
2289 2298 }
2290 2299
2291 2300 l4info->l4_hash_port = dst_or_src ? l4info->l4_dst_port :
2292 2301 l4info->l4_src_port;
2293 2302
2294 2303 return ((l4info->l4_hash_port ^ (proto << 4)) % ft->ft_size);
2295 2304 }
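
The same mix is used on this packet path and on the flow-entry path in flow_transport_hash_fe() below; the two must agree, or a packet would hash to a different bucket than the flow entry meant to match it. The computation itself is one line (the function name is illustrative):

#include <stdint.h>

/* Shared bucket mix for packets and flow entries. */
static uint32_t
transport_bucket(uint16_t port, uint8_t proto, uint32_t ft_size)
{
	return (((uint32_t)port ^ ((uint32_t)proto << 4)) % ft_size);
}
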
2296 2305
2297 2306 /*
2298 2307 * Unlike other accept() functions above, we do not need to get the header
2299 2308  * size because this is our highest layer so far. If we want to support
2300 2309  * other higher-layer protocols, we would need to save the l4_hdrsize
2301 2310 * in the code below.
2302 2311 */
2303 2312
2304 2313 /* ARGSUSED */
2305 2314 static int
2306 2315 flow_transport_accept(flow_tab_t *ft, flow_state_t *s)
2307 2316 {
2308 2317 flow_l3info_t *l3info = &s->fs_l3info;
2309 2318 flow_l4info_t *l4info = &s->fs_l4info;
2310 2319 uint8_t proto = l3info->l3_protocol;
2311 2320 uchar_t *l4_start;
2312 2321
2313 2322 l4_start = l3info->l3_start + l3info->l3_hdrsize;
2314 2323
2315 2324 /*
2316 2325 * Adjust start pointer if we're at the end of an mblk.
2317 2326 */
2318 2327 CHECK_AND_ADJUST_START_PTR(s, l4_start);
2319 2328
2320 2329 l4info->l4_start = l4_start;
2321 2330 if (!OK_32PTR(l4_start))
2322 2331 return (EINVAL);
2323 2332
2324 2333 if (l3info->l3_fragmented == B_TRUE)
2325 2334 return (EINVAL);
2326 2335
2327 2336 switch (proto) {
2328 2337 case IPPROTO_TCP: {
2329 2338 struct tcphdr *tcph = (struct tcphdr *)l4_start;
2330 2339
2331 2340 if (PKT_TOO_SMALL(s, l4_start + sizeof (*tcph)))
2332 2341 return (ENOBUFS);
2333 2342
2334 2343 l4info->l4_src_port = tcph->th_sport;
2335 2344 l4info->l4_dst_port = tcph->th_dport;
2336 2345 break;
2337 2346 }
2338 2347 case IPPROTO_UDP: {
2339 2348 struct udphdr *udph = (struct udphdr *)l4_start;
2340 2349
2341 2350 if (PKT_TOO_SMALL(s, l4_start + sizeof (*udph)))
2342 2351 return (ENOBUFS);
2343 2352
2344 2353 l4info->l4_src_port = udph->uh_sport;
2345 2354 l4info->l4_dst_port = udph->uh_dport;
2346 2355 break;
2347 2356 }
2348 2357 case IPPROTO_SCTP: {
2349 2358 sctp_hdr_t *sctph = (sctp_hdr_t *)l4_start;
2350 2359
2351 2360 if (PKT_TOO_SMALL(s, l4_start + sizeof (*sctph)))
2352 2361 return (ENOBUFS);
2353 2362
2354 2363 l4info->l4_src_port = sctph->sh_sport;
2355 2364 l4info->l4_dst_port = sctph->sh_dport;
2356 2365 break;
2357 2366 }
2358 2367 default:
2359 2368 return (EINVAL);
2360 2369 }
2361 2370
2362 2371 return (0);
2363 2372 }
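
A layout property the switch above relies on is that TCP, UDP, and SCTP all put the 16-bit source and destination ports in the first four bytes of their headers; only the minimum-length checks differ by protocol. A sketch that exploits that layout (names hypothetical; ports stay in network byte order, just as l4_src_port/l4_dst_port do):

#include <stdint.h>
#include <string.h>

/* TCP, UDP, and SCTP headers all begin: src port, then dst port. */
static void
extract_ports(const uint8_t *l4_start, uint16_t *sport, uint16_t *dport)
{
	(void) memcpy(sport, l4_start, sizeof (*sport));
	(void) memcpy(dport, l4_start + 2, sizeof (*dport));
	/* Values remain in network byte order, like the kernel fields. */
}
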
2364 2373
2365 2374 /*
2366 2375 * Validates transport flow entry.
2367 2376 * The protocol field must be present.
2368 2377 */
2369 2378
2370 2379 /* ARGSUSED */
2371 2380 static int
2372 2381 flow_transport_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
2373 2382 {
2374 2383 flow_desc_t *fd = &flent->fe_flow_desc;
2375 2384 flow_mask_t mask = fd->fd_mask;
2376 2385
2377 2386 if ((mask & FLOW_IP_PROTOCOL) == 0)
2378 2387 return (EINVAL);
2379 2388
2380 2389 switch (fd->fd_protocol) {
2381 2390 case IPPROTO_TCP:
2382 2391 case IPPROTO_UDP:
2383 2392 case IPPROTO_SCTP:
2384 2393 break;
2385 2394 default:
2386 2395 return (EINVAL);
2387 2396 }
2388 2397
2389 2398 switch (mask & ~FLOW_IP_PROTOCOL) {
2390 2399 case FLOW_ULP_PORT_LOCAL:
2391 2400 if (fd->fd_local_port == 0)
2392 2401 return (EINVAL);
2393 2402
2394 2403 flent->fe_match = flow_transport_lport_match;
2395 2404 break;
2396 2405 case FLOW_ULP_PORT_REMOTE:
2397 2406 if (fd->fd_remote_port == 0)
2398 2407 return (EINVAL);
2399 2408
2400 2409 flent->fe_match = flow_transport_rport_match;
2401 2410 break;
2402 2411 case 0:
2403 2412 /*
2404 2413  * Transport-only flows conflict with our table type.
2405 2414 */
2406 2415 return (EOPNOTSUPP);
2407 2416 default:
2408 2417 return (EINVAL);
2409 2418 }
2410 2419
2411 2420 return (0);
2412 2421 }
2413 2422
2414 2423 static uint32_t
2415 2424 flow_transport_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
2416 2425 {
2417 2426 flow_desc_t *fd = &flent->fe_flow_desc;
2418 2427 uint16_t port = 0;
2419 2428
2420 2429 port = ((fd->fd_mask & FLOW_ULP_PORT_LOCAL) != 0) ?
2421 2430 fd->fd_local_port : fd->fd_remote_port;
2422 2431
2423 2432 return ((port ^ (fd->fd_protocol << 4)) % ft->ft_size);
2424 2433 }
2425 2434
2426 2435 /* ARGSUSED */
2427 2436 static boolean_t
2428 2437 flow_transport_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
2429 2438 {
2430 2439 flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
2431 2440
2432 2441 if (fd1->fd_protocol != fd2->fd_protocol)
2433 2442 return (B_FALSE);
2434 2443
2435 2444 if ((fd1->fd_mask & FLOW_ULP_PORT_LOCAL) != 0)
2436 2445 return (fd1->fd_local_port == fd2->fd_local_port);
2437 2446
2438 2447 if ((fd1->fd_mask & FLOW_ULP_PORT_REMOTE) != 0)
2439 2448 return (fd1->fd_remote_port == fd2->fd_remote_port);
2440 2449
2441 2450 return (B_TRUE);
2442 2451 }
2443 2452
2444 2453 static flow_ops_t flow_l2_ops = {
2445 2454 flow_l2_accept_fe,
2446 2455 flow_l2_hash_fe,
2447 2456 flow_l2_match_fe,
2448 2457 flow_generic_insert_fe,
2449 2458 flow_l2_hash,
2450 2459 {flow_l2_accept}
2451 2460 };
2452 2461
2453 2462 static flow_ops_t flow_ip_ops = {
2454 2463 flow_ip_accept_fe,
2455 2464 flow_ip_hash_fe,
2456 2465 flow_ip_match_fe,
2457 2466 flow_ip_insert_fe,
2458 2467 flow_ip_hash,
2459 2468 {flow_l2_accept, flow_ip_accept}
2460 2469 };
2461 2470
2462 2471 static flow_ops_t flow_ip_proto_ops = {
2463 2472 flow_ip_proto_accept_fe,
2464 2473 flow_ip_proto_hash_fe,
2465 2474 flow_ip_proto_match_fe,
2466 2475 flow_generic_insert_fe,
2467 2476 flow_ip_proto_hash,
2468 2477 {flow_l2_accept, flow_ip_accept}
2469 2478 };
2470 2479
2471 2480 static flow_ops_t flow_transport_ops = {
2472 2481 flow_transport_accept_fe,
2473 2482 flow_transport_hash_fe,
2474 2483 flow_transport_match_fe,
2475 2484 flow_generic_insert_fe,
2476 2485 flow_transport_hash,
2477 2486 {flow_l2_accept, flow_ip_accept, flow_transport_accept}
2478 2487 };
2479 2488
2480 2489 static flow_tab_info_t flow_tab_info_list[] = {
2481 2490 {&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_LOCAL, 2},
2482 2491 {&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_REMOTE, 2},
2483 2492 {&flow_ip_ops, FLOW_IP_DSFIELD, 1},
2484 2493 {&flow_ip_proto_ops, FLOW_IP_PROTOCOL, 256},
2485 2494 {&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_LOCAL, 1024},
2486 2495 {&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_REMOTE, 1024}
2487 2496 };
2488 2497
2489 2498 #define FLOW_MAX_TAB_INFO \
2490 2499 ((sizeof (flow_tab_info_list)) / sizeof (flow_tab_info_t))
2491 2500
2492 2501 static flow_tab_info_t *
2493 2502 mac_flow_tab_info_get(flow_mask_t mask)
2494 2503 {
2495 2504 int i;
2496 2505
2497 2506 for (i = 0; i < FLOW_MAX_TAB_INFO; i++) {
2498 2507 if (mask == flow_tab_info_list[i].fti_mask)
2499 2508 return (&flow_tab_info_list[i]);
2500 2509 }
2501 2510 return (NULL);
2502 2511 }
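
mac_flow_tab_info_get() is an exact-match linear scan: a mask is legal only if it equals one of the six combinations enumerated in flow_tab_info_list, and anything else is rejected with NULL. The same shape in miniature, with simplified types and made-up mask bits:

#include <stddef.h>
#include <stdint.h>

typedef uint64_t mask_t;

typedef struct tab_info {
	mask_t	ti_mask;	/* exact mask this table type serves */
	size_t	ti_size;	/* hash table size for that type */
} tab_info_t;

/* Made-up mask bits, standing in for FLOW_* flags. */
static tab_info_t tab_list[] = {
	{ 0x1 | 0x2, 2 },
	{ 0x1 | 0x4, 2 },
};

static tab_info_t *
tab_info_get(mask_t mask)
{
	size_t i;

	/* Exact match only: a superset or subset mask is not legal. */
	for (i = 0; i < sizeof (tab_list) / sizeof (tab_list[0]); i++) {
		if (tab_list[i].ti_mask == mask)
			return (&tab_list[i]);
	}
	return (NULL);
}
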