1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 #include <sys/strsun.h>
  28 #include <sys/sdt.h>
  29 #include <sys/mac.h>
  30 #include <sys/mac_impl.h>
  31 #include <sys/mac_client_impl.h>
  32 #include <sys/mac_stat.h>
  33 #include <sys/dls.h>
  34 #include <sys/dls_impl.h>
  35 #include <sys/mac_soft_ring.h>
  36 #include <sys/ethernet.h>
  37 #include <sys/cpupart.h>
  38 #include <sys/pool.h>
  39 #include <sys/pool_pset.h>
  40 #include <sys/vlan.h>
  41 #include <inet/ip.h>
  42 #include <inet/ip6.h>
  43 #include <netinet/tcp.h>
  44 #include <netinet/udp.h>
  45 #include <netinet/sctp.h>
  46 
/*
 * Aggregated per-flow statistics, assembled on demand from the flow's
 * Rx/Tx soft ring sets by flow_stat_update().
 */
typedef struct flow_stats_s {
	uint64_t	fs_obytes;	/* bytes transmitted */
	uint64_t	fs_opackets;	/* packets transmitted */
	uint64_t	fs_oerrors;	/* transmit errors */
	uint64_t	fs_ibytes;	/* bytes received */
	uint64_t	fs_ipackets;	/* packets received */
	uint64_t	fs_ierrors;	/* receive errors */
} flow_stats_t;
  55 

/* global flow table, will be a per exclusive-zone table later */
static mod_hash_t	*flow_hash;	/* flow name -> flow_entry_t */
static krwlock_t	flow_tab_lock;	/* protects flow_hash */

static kmem_cache_t	*flow_cache;	/* allocator for flow_entry_t */
static kmem_cache_t	*flow_tab_cache;	/* allocator for flow_tab_t */
static flow_ops_t	flow_l2_ops;	/* L2 ops vector; set up elsewhere in this file */
  64 
/*
 * Maps a kstat name to the byte offset of the corresponding counter
 * within flow_stats_t, so flow_stat_update() can publish the counters
 * generically.
 */
typedef struct {
	const char	*fs_name;	/* kstat name */
	uint_t		fs_offset;	/* offsetof() into flow_stats_t */
} flow_stats_info_t;

#define	FS_OFF(f)	(offsetof(flow_stats_t, f))
static flow_stats_info_t flow_stats_list[] = {
	{"rbytes",	FS_OFF(fs_ibytes)},
	{"ipackets",	FS_OFF(fs_ipackets)},
	{"ierrors",	FS_OFF(fs_ierrors)},
	{"obytes",	FS_OFF(fs_obytes)},
	{"opackets",	FS_OFF(fs_opackets)},
	{"oerrors",	FS_OFF(fs_oerrors)}
};
/* number of entries in flow_stats_list */
#define	FS_SIZE		(sizeof (flow_stats_list) / sizeof (flow_stats_info_t))
  80 
/*
 * Returns the flow_tab_info_t describing how to classify on the given
 * flow mask, or NULL if the mask is not a supported combination.
 */
  84 static flow_tab_info_t  *mac_flow_tab_info_get(flow_mask_t);
  85 
  86 static void
  87 flow_stat_init(kstat_named_t *knp)
  88 {
  89         int     i;
  90 
  91         for (i = 0; i < FS_SIZE; i++, knp++) {
  92                 kstat_named_init(knp, flow_stats_list[i].fs_name,
  93                     KSTAT_DATA_UINT64);
  94         }
  95 }
  96 
/*
 * kstat update callback for a flow's statistics.  Sums the Rx counters
 * across all of the flow's Rx SRSes, copies the Tx counters from its
 * Tx SRS, and publishes the totals into the named kstats in
 * flow_stats_list order.  Returns EACCES for write attempts.
 */
static int
flow_stat_update(kstat_t *ksp, int rw)
{
	flow_entry_t		*fep = ksp->ks_private;
	kstat_named_t		*knp = ksp->ks_data;
	uint64_t		*statp;
	int			i;
	mac_rx_stats_t		*mac_rx_stat;
	mac_tx_stats_t		*mac_tx_stat;
	flow_stats_t		flow_stats;
	mac_soft_ring_set_t	*mac_srs;

	/* These kstats are read-only. */
	if (rw != KSTAT_READ)
		return (EACCES);

	bzero(&flow_stats, sizeof (flow_stats_t));

	/* Accumulate Rx stats over every Rx SRS of the flow. */
	for (i = 0; i < fep->fe_rx_srs_cnt; i++) {
		mac_srs = (mac_soft_ring_set_t *)fep->fe_rx_srs[i];
		if (mac_srs == NULL)		/* Multicast flow */
			break;
		mac_rx_stat = &mac_srs->srs_rx.sr_stat;

		/* interrupt, poll and local (loopback) paths all count */
		flow_stats.fs_ibytes += mac_rx_stat->mrs_intrbytes +
		    mac_rx_stat->mrs_pollbytes + mac_rx_stat->mrs_lclbytes;

		flow_stats.fs_ipackets += mac_rx_stat->mrs_intrcnt +
		    mac_rx_stat->mrs_pollcnt + mac_rx_stat->mrs_lclcnt;

		flow_stats.fs_ierrors += mac_rx_stat->mrs_ierrors;
	}

	mac_srs = (mac_soft_ring_set_t *)fep->fe_tx_srs;
	if (mac_srs == NULL)		/* Multicast flow */
		goto done;
	mac_tx_stat = &mac_srs->srs_tx.st_stat;

	flow_stats.fs_obytes = mac_tx_stat->mts_obytes;
	flow_stats.fs_opackets = mac_tx_stat->mts_opackets;
	flow_stats.fs_oerrors = mac_tx_stat->mts_oerrors;

done:
	/* Publish the accumulated totals via the offset table. */
	for (i = 0; i < FS_SIZE; i++, knp++) {
		statp = (uint64_t *)
		    ((uchar_t *)&flow_stats + flow_stats_list[i].fs_offset);
		knp->value.ui64 = *statp;
	}
	return (0);
}
 146 
/*
 * Create and install the named "flow" kstats for the given flow entry.
 * On kstat_create_zone() failure the flow simply has no kstats
 * (fe_ksp stays NULL).
 */
static void
flow_stat_create(flow_entry_t *fep)
{
	kstat_t		*ksp;
	kstat_named_t	*knp;
	uint_t		nstats = FS_SIZE;

	/*
	 * For now, flow entries are only manipulated and visible from the
	 * global zone.
	 */
	ksp = kstat_create_zone("unix", 0, (char *)fep->fe_flow_name, "flow",
	    KSTAT_TYPE_NAMED, nstats, 0, GLOBAL_ZONEID);
	if (ksp == NULL)
		return;

	ksp->ks_update = flow_stat_update;
	ksp->ks_private = fep;
	fep->fe_ksp = ksp;

	knp = (kstat_named_t *)ksp->ks_data;
	flow_stat_init(knp);
	kstat_install(ksp);
}
 171 
 172 void
 173 flow_stat_destroy(flow_entry_t *fep)
 174 {
 175         if (fep->fe_ksp != NULL) {
 176                 kstat_delete(fep->fe_ksp);
 177                 fep->fe_ksp = NULL;
 178         }
 179 }
 180 
 181 /*
 182  * Initialize the flow table
 183  */
 184 void
 185 mac_flow_init()
 186 {
 187         flow_cache = kmem_cache_create("flow_entry_cache",
 188             sizeof (flow_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 189         flow_tab_cache = kmem_cache_create("flow_tab_cache",
 190             sizeof (flow_tab_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 191         flow_hash = mod_hash_create_extended("flow_hash",
 192             100, mod_hash_null_keydtor, mod_hash_null_valdtor,
 193             mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
 194         rw_init(&flow_tab_lock, NULL, RW_DEFAULT, NULL);
 195 }
 196 
 197 /*
 198  * Cleanup and release the flow table
 199  */
 200 void
 201 mac_flow_fini()
 202 {
 203         kmem_cache_destroy(flow_cache);
 204         kmem_cache_destroy(flow_tab_cache);
 205         mod_hash_destroy_hash(flow_hash);
 206         rw_destroy(&flow_tab_lock);
 207 }
 208 
/*
 * mac_flow_create(): create (or finish initializing) a flow_entry_t.
 *
 * If *flentp is NULL a new entry is allocated and minimally
 * initialized; otherwise the caller's entry is reused.  When fd is
 * NULL only the flow name is set and the rest is configured later.
 * If mrp is non-NULL it is validated and copied into the flow's
 * resource/effective properties; note that mrp itself is modified
 * as a side effect (priority forced on, pool name and cpus cleared).
 * Returns 0, or the error from mac_validate_props().
 */
int
mac_flow_create(flow_desc_t *fd, mac_resource_props_t *mrp, char *name,
    void *client_cookie, uint_t type, flow_entry_t **flentp)
{
	flow_entry_t		*flent = *flentp;
	int			err = 0;

	/* Validate caller-supplied resource properties before any work. */
	if (mrp != NULL) {
		err = mac_validate_props(NULL, mrp);
		if (err != 0)
			return (err);
	}

	if (flent == NULL) {
		flent = kmem_cache_alloc(flow_cache, KM_SLEEP);
		bzero(flent, sizeof (*flent));
		mutex_init(&flent->fe_lock, NULL, MUTEX_DEFAULT, NULL);
		cv_init(&flent->fe_cv, NULL, CV_DEFAULT, NULL);

		/* Initialize the receiver function to a safe routine */
		flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop;
		flent->fe_index = -1;
	}
	(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);

	/* This is an initial flow, will be configured later */
	if (fd == NULL) {
		*flentp = flent;
		return (0);
	}

	flent->fe_client_cookie = client_cookie;
	flent->fe_type = type;

	/* Save flow desc */
	bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));

	if (mrp != NULL) {
		/*
		 * We have already set fe_resource_props for a Link.
		 */
		if (type & FLOW_USER) {
			bcopy(mrp, &flent->fe_resource_props,
			    sizeof (mac_resource_props_t));
		}
		/*
		 * The effective resource list should reflect the priority
		 * that we set implicitly.
		 */
		if (!(mrp->mrp_mask & MRP_PRIORITY))
			mrp->mrp_mask |= MRP_PRIORITY;
		if (type & FLOW_USER)
			mrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
		else
			mrp->mrp_priority = MPL_LINK_DEFAULT;
		/* pool/cpu bindings are not part of the initial effective set */
		bzero(mrp->mrp_pool, MAXPATHLEN);
		bzero(&mrp->mrp_cpus, sizeof (mac_cpus_t));
		bcopy(mrp, &flent->fe_effective_props,
		    sizeof (mac_resource_props_t));
	}
	flow_stat_create(flent);

	*flentp = flent;
	return (0);
}
 277 
 278 /*
 279  * Validate flow entry and add it to a flow table.
 280  */
 281 int
 282 mac_flow_add(flow_tab_t *ft, flow_entry_t *flent)
 283 {
 284         flow_entry_t    **headp, **p;
 285         flow_ops_t      *ops = &ft->ft_ops;
 286         flow_mask_t     mask;
 287         uint32_t        index;
 288         int             err;
 289 
 290         ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
 291 
 292         /*
 293          * Check for invalid bits in mask.
 294          */
 295         mask = flent->fe_flow_desc.fd_mask;
 296         if ((mask & ft->ft_mask) == 0 || (mask & ~ft->ft_mask) != 0)
 297                 return (EOPNOTSUPP);
 298 
 299         /*
 300          * Validate flent.
 301          */
 302         if ((err = ops->fo_accept_fe(ft, flent)) != 0) {
 303                 DTRACE_PROBE3(accept_failed, flow_tab_t *, ft,
 304                     flow_entry_t *, flent, int, err);
 305                 return (err);
 306         }
 307 
 308         /*
 309          * Flent is valid. now calculate hash and insert it
 310          * into hash table.
 311          */
 312         index = ops->fo_hash_fe(ft, flent);
 313 
 314         /*
 315          * We do not need a lock up until now because we were
 316          * not accessing the flow table.
 317          */
 318         rw_enter(&ft->ft_lock, RW_WRITER);
 319         headp = &ft->ft_table[index];
 320 
 321         /*
 322          * Check for duplicate flow.
 323          */
 324         for (p = headp; *p != NULL; p = &(*p)->fe_next) {
 325                 if ((*p)->fe_flow_desc.fd_mask !=
 326                     flent->fe_flow_desc.fd_mask)
 327                         continue;
 328 
 329                 if (ft->ft_ops.fo_match_fe(ft, *p, flent)) {
 330                         rw_exit(&ft->ft_lock);
 331                         DTRACE_PROBE3(dup_flow, flow_tab_t *, ft,
 332                             flow_entry_t *, flent, int, err);
 333                         return (EALREADY);
 334                 }
 335         }
 336 
 337         /*
 338          * Insert flow to hash list.
 339          */
 340         err = ops->fo_insert_fe(ft, headp, flent);
 341         if (err != 0) {
 342                 rw_exit(&ft->ft_lock);
 343                 DTRACE_PROBE3(insert_failed, flow_tab_t *, ft,
 344                     flow_entry_t *, flent, int, err);
 345                 return (err);
 346         }
 347 
 348         /*
 349          * Save the hash index so it can be used by mac_flow_remove().
 350          */
 351         flent->fe_index = (int)index;
 352 
 353         /*
 354          * Save the flow tab back reference.
 355          */
 356         flent->fe_flow_tab = ft;
 357         FLOW_MARK(flent, FE_FLOW_TAB);
 358         ft->ft_flow_count++;
 359         rw_exit(&ft->ft_lock);
 360         return (0);
 361 }
 362 
/*
 * Remove a flow from a mac client's subflow table.
 *
 * If the interface was never plumbed (fe_mcip == NULL) and the table
 * becomes empty, the subflow table itself is destroyed.  Otherwise,
 * wait for in-progress driver upcalls to drain and then tear down the
 * flow's datapath state.  In either case, re-enable the fastpath that
 * mac_flow_add_subflow() disabled.
 */
void
mac_flow_rem_subflow(flow_entry_t *flent)
{
	flow_tab_t		*ft = flent->fe_flow_tab;
	mac_client_impl_t	*mcip = ft->ft_mcip;
	mac_handle_t		mh = (mac_handle_t)ft->ft_mip;

	ASSERT(MAC_PERIM_HELD(mh));

	mac_flow_remove(ft, flent, B_FALSE);
	if (flent->fe_mcip == NULL) {
		/*
		 * The interface is not yet plumbed and mac_client_flow_add
		 * was not done.
		 */
		if (FLOW_TAB_EMPTY(ft)) {
			mac_flow_tab_destroy(ft);
			mcip->mci_subflow_tab = NULL;
		}
	} else {
		mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
		mac_link_flow_clean((mac_client_handle_t)mcip, flent);
	}
	mac_fastpath_enable(mh);
}
 391 
/*
 * Add a flow to a mac client's subflow table and instantiate the flow
 * in the mac by creating the associated SRSs etc.
 *
 * Creates the subflow table on first use.  On any failure all partial
 * state is rolled back (table destroyed if we created it, flow removed,
 * fastpath re-enabled) before returning the error.  When
 * instantiate_flow is B_FALSE the flow is added but marked
 * FE_UF_NO_DATAPATH, leaving SRS creation for later.
 */
int
mac_flow_add_subflow(mac_client_handle_t mch, flow_entry_t *flent,
    boolean_t instantiate_flow)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
	mac_handle_t		mh = (mac_handle_t)mcip->mci_mip;
	flow_tab_info_t		*ftinfo;
	flow_mask_t		mask;
	flow_tab_t		*ft;
	int			err;
	boolean_t		ft_created = B_FALSE;

	ASSERT(MAC_PERIM_HELD(mh));

	/* Subflows require sw classification; take the fastpath down. */
	if ((err = mac_fastpath_disable(mh)) != 0)
		return (err);

	/*
	 * If the subflow table exists already just add the new subflow
	 * to the existing table, else we create a new subflow table below.
	 */
	ft = mcip->mci_subflow_tab;
	if (ft == NULL) {
		mask = flent->fe_flow_desc.fd_mask;
		/*
		 * Try to create a new table and then add the subflow to the
		 * newly created subflow table
		 */
		if ((ftinfo = mac_flow_tab_info_get(mask)) == NULL) {
			mac_fastpath_enable(mh);
			return (EOPNOTSUPP);
		}

		mac_flow_tab_create(ftinfo->fti_ops, mask, ftinfo->fti_size,
		    mcip->mci_mip, &ft);
		ft_created = B_TRUE;
	}

	err = mac_flow_add(ft, flent);
	if (err != 0) {
		if (ft_created)
			mac_flow_tab_destroy(ft);
		mac_fastpath_enable(mh);
		return (err);
	}

	if (instantiate_flow) {
		/* Now activate the flow by creating its SRSs */
		ASSERT(MCIP_DATAPATH_SETUP(mcip));
		err = mac_link_flow_init((mac_client_handle_t)mcip, flent);
		if (err != 0) {
			mac_flow_remove(ft, flent, B_FALSE);
			if (ft_created)
				mac_flow_tab_destroy(ft);
			mac_fastpath_enable(mh);
			return (err);
		}
	} else {
		FLOW_MARK(flent, FE_UF_NO_DATAPATH);
	}
	/* Publish a newly created table only after the add succeeded. */
	if (ft_created) {
		ASSERT(mcip->mci_subflow_tab == NULL);
		ft->ft_mcip = mcip;
		mcip->mci_subflow_tab = ft;
		if (instantiate_flow)
			mac_client_update_classifier(mcip, B_TRUE);
	}
	return (0);
}
 465 
/*
 * Remove flow entry from flow table.
 *
 * 'temp' distinguishes a temporary removal (e.g. to update the flow
 * descriptor, flow will be re-added) from a permanent one, which also
 * marks the flent FE_CONDEMNED.  No-op if the flent is not currently
 * in a table (FE_FLOW_TAB clear).
 */
void
mac_flow_remove(flow_tab_t *ft, flow_entry_t *flent, boolean_t temp)
{
	flow_entry_t	**fp;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
	if (!(flent->fe_flags & FE_FLOW_TAB))
		return;

	rw_enter(&ft->ft_lock, RW_WRITER);
	/*
	 * If this is a permanent removal from the flow table, mark it
	 * CONDEMNED to prevent future references. If this is a temporary
	 * removal from the table, say to update the flow descriptor then
	 * we don't mark it CONDEMNED
	 */
	if (!temp)
		FLOW_MARK(flent, FE_CONDEMNED);
	/*
	 * Locate the specified flent, using fe_index saved by
	 * mac_flow_add() to pick the hash bucket.
	 */
	fp = &ft->ft_table[flent->fe_index];
	while (*fp != flent)
		fp = &(*fp)->fe_next;

	/*
	 * The flent must exist. Otherwise it's a bug.
	 * NOTE(review): this ASSERT is vacuous -- fp holds the address of
	 * a list link and can never be NULL; if flent were absent, the
	 * walk above would dereference a NULL *fp first and panic there.
	 */
	ASSERT(fp != NULL);
	*fp = flent->fe_next;
	flent->fe_next = NULL;

	/*
	 * Reset fe_index to -1 so any attempt to call mac_flow_remove()
	 * on a flent that is supposed to be in the table (FE_FLOW_TAB)
	 * will panic.
	 */
	flent->fe_index = -1;
	FLOW_UNMARK(flent, FE_FLOW_TAB);
	ft->ft_flow_count--;
	rw_exit(&ft->ft_lock);
}
 511 
/*
 * This is the flow lookup routine used by the mac sw classifier engine.
 *
 * Runs the table's chain of accept functions over the packet to build
 * classification state (retrying once after a pullup if a header spans
 * mblks, signalled by ENOBUFS), then hashes into the table and returns
 * the first matching flent with a reference held in *flentp.
 *
 * Returns 0 on success, ENOENT if no flow matches, EINVAL if a needed
 * pullup cannot be done, or the accept function's error.
 */
int
mac_flow_lookup(flow_tab_t *ft, mblk_t *mp, uint_t flags, flow_entry_t **flentp)
{
	flow_state_t	s;
	flow_entry_t	*flent;
	flow_ops_t	*ops = &ft->ft_ops;
	boolean_t	retried = B_FALSE;
	int		i, err;

	s.fs_flags = flags;
retry:
	s.fs_mp = mp;

	/*
	 * Walk the list of predeclared accept functions.
	 * Each of these would accumulate enough state to allow the next
	 * accept routine to make progress.
	 */
	for (i = 0; i < FLOW_MAX_ACCEPT && ops->fo_accept[i] != NULL; i++) {
		if ((err = (ops->fo_accept[i])(ft, &s)) != 0) {
			mblk_t	*last;

			/*
			 * ENOBUFS indicates that the mp could be too short
			 * and may need a pullup.  Only one retry is allowed.
			 */
			if (err != ENOBUFS || retried)
				return (err);

			/*
			 * The pullup is done on the last processed mblk, not
			 * the starting one. pullup is not done if the mblk
			 * has references or if b_cont is NULL.
			 */
			last = s.fs_mp;
			if (DB_REF(last) > 1 || last->b_cont == NULL ||
			    pullupmsg(last, -1) == 0)
				return (EINVAL);

			retried = B_TRUE;
			DTRACE_PROBE2(need_pullup, flow_tab_t *, ft,
			    flow_state_t *, &s);
			goto retry;
		}
	}

	/*
	 * The packet is considered sane. We may now attempt to
	 * find the corresponding flent.
	 */
	rw_enter(&ft->ft_lock, RW_READER);
	flent = ft->ft_table[ops->fo_hash(ft, &s)];
	for (; flent != NULL; flent = flent->fe_next) {
		if (flent->fe_match(ft, flent, &s)) {
			/* Skip entries we cannot get a reference on. */
			FLOW_TRY_REFHOLD(flent, err);
			if (err != 0)
				continue;
			*flentp = flent;
			rw_exit(&ft->ft_lock);
			return (0);
		}
	}
	rw_exit(&ft->ft_lock);
	return (ENOENT);
}
 580 
 581 /*
 582  * Walk flow table.
 583  * The caller is assumed to have proper perimeter protection.
 584  */
 585 int
 586 mac_flow_walk_nolock(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
 587     void *arg)
 588 {
 589         int             err, i, cnt = 0;
 590         flow_entry_t    *flent;
 591 
 592         if (ft == NULL)
 593                 return (0);
 594 
 595         for (i = 0; i < ft->ft_size; i++) {
 596                 for (flent = ft->ft_table[i]; flent != NULL;
 597                     flent = flent->fe_next) {
 598                         cnt++;
 599                         err = (*fn)(flent, arg);
 600                         if (err != 0)
 601                                 return (err);
 602                 }
 603         }
 604         VERIFY(cnt == ft->ft_flow_count);
 605         return (0);
 606 }
 607 
 608 /*
 609  * Same as the above except a mutex is used for protection here.
 610  */
 611 int
 612 mac_flow_walk(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
 613     void *arg)
 614 {
 615         int             err;
 616 
 617         if (ft == NULL)
 618                 return (0);
 619 
 620         rw_enter(&ft->ft_lock, RW_WRITER);
 621         err = mac_flow_walk_nolock(ft, fn, arg);
 622         rw_exit(&ft->ft_lock);
 623         return (err);
 624 }
 625 
static boolean_t	mac_flow_clean(flow_entry_t *);

/*
 * Destroy a flow entry. Called when the last reference on a flow is released.
 *
 * User flows are expected to be fully cleaned up already (verified on
 * DEBUG kernels); other flows get their datapath state torn down via
 * mac_flow_cleanup() here.
 */
void
mac_flow_destroy(flow_entry_t *flent)
{
	ASSERT(flent->fe_refcnt == 0);

	if ((flent->fe_type & FLOW_USER) != 0) {
		/* mac_flow_clean() only asserts; safe inside ASSERT() */
		ASSERT(mac_flow_clean(flent));
	} else {
		mac_flow_cleanup(flent);
	}
	mac_misc_stat_delete(flent);
	mutex_destroy(&flent->fe_lock);
	cv_destroy(&flent->fe_cv);
	flow_stat_destroy(flent);
	kmem_cache_free(flow_cache, flent);
}
 647 
/*
 * XXX eric
 * The MAC_FLOW_PRIORITY checks in mac_resource_ctl_set() and
 * mac_link_flow_modify() should really be moved/reworked into the
 * two functions below. This would consolidate all the mac property
 * checking in one place. I'm leaving this alone for now since it's
 * out of scope of the new flows work.
 */
/*
 * Fold the properties in mrp into the flow's cached effective
 * properties (fe_effective_props) and return a bitmask of which
 * property classes actually changed.  Reset sentinels
 * (MRP_MAXBW_RESETVAL, MPL_RESET, empty pool name) clear the
 * corresponding bit from the effective mask instead of setting a
 * value.
 */
/* ARGSUSED */
uint32_t
mac_flow_modify_props(flow_entry_t *flent, mac_resource_props_t *mrp)
{
	uint32_t		changed_mask = 0;
	mac_resource_props_t	*fmrp = &flent->fe_effective_props;
	int			i;

	/* bandwidth limit: changed if newly set, cleared, or different */
	if ((mrp->mrp_mask & MRP_MAXBW) != 0 &&
	    (!(fmrp->mrp_mask & MRP_MAXBW) ||
	    (fmrp->mrp_maxbw != mrp->mrp_maxbw))) {
		changed_mask |= MRP_MAXBW;
		if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) {
			fmrp->mrp_mask &= ~MRP_MAXBW;
			fmrp->mrp_maxbw = 0;
		} else {
			fmrp->mrp_mask |= MRP_MAXBW;
			fmrp->mrp_maxbw = mrp->mrp_maxbw;
		}
	}

	if ((mrp->mrp_mask & MRP_PRIORITY) != 0) {
		if (fmrp->mrp_priority != mrp->mrp_priority)
			changed_mask |= MRP_PRIORITY;
		if (mrp->mrp_priority == MPL_RESET) {
			fmrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
			fmrp->mrp_mask &= ~MRP_PRIORITY;
		} else {
			fmrp->mrp_priority = mrp->mrp_priority;
			fmrp->mrp_mask |= MRP_PRIORITY;
		}
	}

	/* modify fanout */
	if ((mrp->mrp_mask & MRP_CPUS) != 0) {
		if ((fmrp->mrp_ncpus == mrp->mrp_ncpus) &&
		    (fmrp->mrp_fanout_mode == mrp->mrp_fanout_mode)) {
			for (i = 0; i < mrp->mrp_ncpus; i++) {
				if (mrp->mrp_cpu[i] != fmrp->mrp_cpu[i])
					break;
			}
			if (i == mrp->mrp_ncpus) {
				/*
				 * The new set of cpus passed is exactly
				 * the same as the existing set.
				 */
				return (changed_mask);
			}
		}
		changed_mask |= MRP_CPUS;
		MAC_COPY_CPUS(mrp, fmrp);
	}

	/*
	 * Modify the rings property.
	 */
	if (mrp->mrp_mask & MRP_RX_RINGS || mrp->mrp_mask & MRP_TX_RINGS)
		mac_set_rings_effective(flent->fe_mcip);

	if ((mrp->mrp_mask & MRP_POOL) != 0) {
		if (strcmp(fmrp->mrp_pool, mrp->mrp_pool) != 0)
			changed_mask |= MRP_POOL;
		if (strlen(mrp->mrp_pool) == 0)
			fmrp->mrp_mask &= ~MRP_POOL;
		else
			fmrp->mrp_mask |= MRP_POOL;
		(void) strncpy(fmrp->mrp_pool, mrp->mrp_pool, MAXPATHLEN);
	}
	return (changed_mask);
}
 726 
/*
 * Apply modified resource properties to a flow: update the cached
 * effective properties under ft_lock, then push bandwidth, priority,
 * cpu/fanout and pool changes down into the flow's SRSes so they take
 * effect immediately.
 */
void
mac_flow_modify(flow_tab_t *ft, flow_entry_t *flent, mac_resource_props_t *mrp)
{
	uint32_t changed_mask;
	mac_client_impl_t *mcip = flent->fe_mcip;
	mac_resource_props_t *mcip_mrp = MCIP_RESOURCE_PROPS(mcip);
	mac_resource_props_t *emrp = MCIP_EFFECTIVE_PROPS(mcip);
	cpupart_t *cpupart = NULL;
	boolean_t use_default = B_FALSE;

	ASSERT(flent != NULL);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));

	rw_enter(&ft->ft_lock, RW_WRITER);

	/* Update the cached values inside the subflow entry */
	changed_mask = mac_flow_modify_props(flent, mrp);
	rw_exit(&ft->ft_lock);
	/*
	 * Push the changed parameters to the scheduling code in the
	 * SRS's, to take effect right away.
	 */
	if (changed_mask & MRP_MAXBW) {
		mac_srs_update_bwlimit(flent, mrp);
		/*
		 * If bandwidth is changed, we may have to change
		 * the number of soft ring to be used for fanout.
		 * Call mac_flow_update_fanout() if MAC_BIND_CPU
		 * is not set and there is no user supplied cpu
		 * info. This applies only to link at this time.
		 */
		if (!(flent->fe_type & FLOW_USER) &&
		    !(changed_mask & MRP_CPUS) &&
		    !(mcip_mrp->mrp_mask & MRP_CPUS_USERSPEC)) {
			mac_fanout_setup(mcip, flent, mcip_mrp,
			    mac_rx_deliver, mcip, NULL, NULL);
		}
	}
	if (mrp->mrp_mask & MRP_PRIORITY)
		mac_flow_update_priority(mcip, flent);

	if (changed_mask & MRP_CPUS)
		mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL,
		    NULL);

	if (mrp->mrp_mask & MRP_POOL) {
		/* pool_lock must be held across the pset lookup and setup */
		pool_lock();
		cpupart = mac_pset_find(mrp, &use_default);
		mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL,
		    cpupart);
		mac_set_pool_effective(use_default, cpupart, mrp, emrp);
		pool_unlock();
	}
}
 781 
/*
 * This function waits for a certain condition to be met and is generally
 * used before a destructive or quiescing operation.
 *
 * FLOW_DRIVER_UPCALL waits until the caller holds the only remaining
 * reference (fe_refcnt == 1); FLOW_USER_REF waits until all userland
 * references are dropped (fe_user_refcnt == 0).  Sleeps on fe_cv under
 * fe_lock with FE_WAITER set (presumably so the refrele path knows to
 * signal -- that code is not in this view).
 */
void
mac_flow_wait(flow_entry_t *flent, mac_flow_state_t event)
{
	mutex_enter(&flent->fe_lock);
	flent->fe_flags |= FE_WAITER;

	switch (event) {
	case FLOW_DRIVER_UPCALL:
		/*
		 * We want to make sure the driver upcalls have finished before
		 * we signal the Rx SRS worker to quit.
		 */
		while (flent->fe_refcnt != 1)
			cv_wait(&flent->fe_cv, &flent->fe_lock);
		break;

	case FLOW_USER_REF:
		/*
		 * Wait for the fe_user_refcnt to drop to 0. The flow has
		 * been removed from the global flow hash.
		 */
		ASSERT(!(flent->fe_flags & FE_G_FLOW_HASH));
		while (flent->fe_user_refcnt != 0)
			cv_wait(&flent->fe_cv, &flent->fe_lock);
		break;

	default:
		ASSERT(0);
	}

	flent->fe_flags &= ~FE_WAITER;
	mutex_exit(&flent->fe_lock);
}
 819 
 820 static boolean_t
 821 mac_flow_clean(flow_entry_t *flent)
 822 {
 823         ASSERT(flent->fe_next == NULL);
 824         ASSERT(flent->fe_tx_srs == NULL);
 825         ASSERT(flent->fe_rx_srs_cnt == 0 && flent->fe_rx_srs[0] == NULL);
 826         ASSERT(flent->fe_mbg == NULL);
 827 
 828         return (B_TRUE);
 829 }
 830 
/*
 * Release a flow's datapath resources: the broadcast group for
 * multicast/broadcast flows (fe_mbg), or the Tx and Rx SRSes
 * otherwise.  A flow has one or the other, never both.
 */
void
mac_flow_cleanup(flow_entry_t *flent)
{
	if ((flent->fe_type & FLOW_USER) == 0) {
		ASSERT((flent->fe_mbg == NULL && flent->fe_mcip != NULL) ||
		    (flent->fe_mbg != NULL && flent->fe_mcip == NULL));
		ASSERT(flent->fe_refcnt == 0);
	} else {
		ASSERT(flent->fe_refcnt == 1);
	}

	if (flent->fe_mbg != NULL) {
		ASSERT(flent->fe_tx_srs == NULL);
		/* This is a multicast or broadcast flow entry */
		mac_bcast_grp_free(flent->fe_mbg);
		flent->fe_mbg = NULL;
	}

	if (flent->fe_tx_srs != NULL) {
		ASSERT(flent->fe_mbg == NULL);
		mac_srs_free(flent->fe_tx_srs);
		flent->fe_tx_srs = NULL;
	}

	/*
	 * In the normal case fe_rx_srs_cnt is 1. However in the error case
	 * when mac_unicast_add fails we may not have set up any SRS
	 * in which case fe_rx_srs_cnt will be zero.
	 */
	if (flent->fe_rx_srs_cnt != 0) {
		ASSERT(flent->fe_rx_srs_cnt == 1);
		mac_srs_free(flent->fe_rx_srs[0]);
		flent->fe_rx_srs[0] = NULL;
		flent->fe_rx_srs_cnt = 0;
	}
	ASSERT(flent->fe_rx_srs[0] == NULL);
}
 868 
 869 void
 870 mac_flow_get_desc(flow_entry_t *flent, flow_desc_t *fd)
 871 {
 872         /*
 873          * Grab the fe_lock to see a self-consistent fe_flow_desc.
 874          * Updates to the fe_flow_desc happen under the fe_lock
 875          * after removing the flent from the flow table
 876          */
 877         mutex_enter(&flent->fe_lock);
 878         bcopy(&flent->fe_flow_desc, fd, sizeof (*fd));
 879         mutex_exit(&flent->fe_lock);
 880 }
 881 
 882 /*
 883  * Update a field of a flow entry. The mac perimeter ensures that
 884  * this is the only thread doing a modify operation on this mac end point.
 885  * So the flow table can't change or disappear. The ft_lock protects access
 886  * to the flow entry, and holding the lock ensures that there isn't any thread
 887  * accessing the flow entry or attempting a flow table lookup. However
 888  * data threads that are using the flow entry based on the old descriptor
 889  * will continue to use the flow entry. If strong coherence is required
 890  * then the flow will have to be quiesced before the descriptor can be
 891  * changed.
 892  */
 893 void
 894 mac_flow_set_desc(flow_entry_t *flent, flow_desc_t *fd)
 895 {
 896         flow_tab_t      *ft = flent->fe_flow_tab;
 897         flow_desc_t     old_desc;
 898         int             err;
 899 
 900         if (ft == NULL) {
 901                 /*
 902                  * The flow hasn't yet been inserted into the table,
 903                  * so only the caller knows about this flow, however for
 904                  * uniformity we grab the fe_lock here.
 905                  */
 906                 mutex_enter(&flent->fe_lock);
 907                 bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
 908                 mutex_exit(&flent->fe_lock);
 909         }
 910 
 911         ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
 912 
 913         /*
 914          * Need to remove the flow entry from the table and reinsert it,
 915          * into a potentially diference hash line. The hash depends on
 916          * the new descriptor fields. However access to fe_desc itself
 917          * is always under the fe_lock. This helps log and stat functions
 918          * see a self-consistent fe_flow_desc.
 919          */
 920         mac_flow_remove(ft, flent, B_TRUE);
 921         old_desc = flent->fe_flow_desc;
 922 
 923         mutex_enter(&flent->fe_lock);
 924         bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
 925         mutex_exit(&flent->fe_lock);
 926 
 927         if (mac_flow_add(ft, flent) != 0) {
 928                 /*
 929                  * The add failed say due to an invalid flow descriptor.
 930                  * Undo the update
 931                  */
 932                 flent->fe_flow_desc = old_desc;
 933                 err = mac_flow_add(ft, flent);
 934                 ASSERT(err == 0);
 935         }
 936 }
 937 
/*
 * Set the flow's name. If the flow is already in a flow table the mac
 * perimeter must be held; otherwise only the caller knows about the
 * flow. In both cases the name is (re)copied under fe_lock so readers
 * holding fe_lock see a consistent fe_flow_name.
 */
void
mac_flow_set_name(flow_entry_t *flent, const char *name)
{
	flow_tab_t	*ft = flent->fe_flow_tab;

	if (ft == NULL) {
		/*
		 * The flow hasn't yet been inserted into the table,
		 * so only the caller knows about this flow.
		 */
		(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
	} else {
		ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
	}

	/* The locked copy below also covers the not-yet-inserted case. */
	mutex_enter(&flent->fe_lock);
	(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
	mutex_exit(&flent->fe_lock);
}
 957 
 958 /*
 959  * Return the client-private cookie that was associated with
 960  * the flow when it was created.
 961  */
 962 void *
 963 mac_flow_get_client_cookie(flow_entry_t *flent)
 964 {
 965         return (flent->fe_client_cookie);
 966 }
 967 
 968 /*
 969  * Forward declarations.
 970  */
 971 static uint32_t flow_l2_hash(flow_tab_t *, flow_state_t *);
 972 static uint32_t flow_l2_hash_fe(flow_tab_t *, flow_entry_t *);
 973 static int      flow_l2_accept(flow_tab_t *, flow_state_t *);
 974 static uint32_t flow_ether_hash(flow_tab_t *, flow_state_t *);
 975 static uint32_t flow_ether_hash_fe(flow_tab_t *, flow_entry_t *);
 976 static int      flow_ether_accept(flow_tab_t *, flow_state_t *);
 977 
 978 /*
 979  * Create flow table.
 980  */
 981 void
 982 mac_flow_tab_create(flow_ops_t *ops, flow_mask_t mask, uint_t size,
 983     mac_impl_t *mip, flow_tab_t **ftp)
 984 {
 985         flow_tab_t      *ft;
 986         flow_ops_t      *new_ops;
 987 
 988         ft = kmem_cache_alloc(flow_tab_cache, KM_SLEEP);
 989         bzero(ft, sizeof (*ft));
 990 
 991         ft->ft_table = kmem_zalloc(size * sizeof (flow_entry_t *), KM_SLEEP);
 992 
 993         /*
 994          * We make a copy of the ops vector instead of just pointing to it
 995          * because we might want to customize the ops vector on a per table
 996          * basis (e.g. for optimization).
 997          */
 998         new_ops = &ft->ft_ops;
 999         bcopy(ops, new_ops, sizeof (*ops));
1000         ft->ft_mask = mask;
1001         ft->ft_size = size;
1002         ft->ft_mip = mip;
1003 
1004         /*
1005          * Optimizations for DL_ETHER media.
1006          */
1007         if (mip->mi_info.mi_nativemedia == DL_ETHER) {
1008                 if (new_ops->fo_hash == flow_l2_hash)
1009                         new_ops->fo_hash = flow_ether_hash;
1010                 if (new_ops->fo_hash_fe == flow_l2_hash_fe)
1011                         new_ops->fo_hash_fe = flow_ether_hash_fe;
1012                 if (new_ops->fo_accept[0] == flow_l2_accept)
1013                         new_ops->fo_accept[0] = flow_ether_accept;
1014         }
1015         *ftp = ft;
1016 }
1017 
/*
 * Create the layer-2 flow table for 'mip': 1024 buckets, matching on
 * destination link address and VLAN id, using the l2 ops vector.
 */
void
mac_flow_l2tab_create(mac_impl_t *mip, flow_tab_t **ftp)
{
	mac_flow_tab_create(&flow_l2_ops, FLOW_LINK_DST | FLOW_LINK_VID,
	    1024, mip, ftp);
}
1024 
1025 /*
1026  * Destroy flow table.
1027  */
1028 void
1029 mac_flow_tab_destroy(flow_tab_t *ft)
1030 {
1031         if (ft == NULL)
1032                 return;
1033 
1034         ASSERT(ft->ft_flow_count == 0);
1035         kmem_free(ft->ft_table, ft->ft_size * sizeof (flow_entry_t *));
1036         bzero(ft, sizeof (*ft));
1037         kmem_cache_free(flow_tab_cache, ft);
1038 }
1039 
1040 /*
1041  * Add a new flow entry to the global flow hash table
1042  */
1043 int
1044 mac_flow_hash_add(flow_entry_t *flent)
1045 {
1046         int     err;
1047 
1048         rw_enter(&flow_tab_lock, RW_WRITER);
1049         err = mod_hash_insert(flow_hash,
1050             (mod_hash_key_t)flent->fe_flow_name, (mod_hash_val_t)flent);
1051         if (err != 0) {
1052                 rw_exit(&flow_tab_lock);
1053                 return (EEXIST);
1054         }
1055         /* Mark as inserted into the global flow hash table */
1056         FLOW_MARK(flent, FE_G_FLOW_HASH);
1057         rw_exit(&flow_tab_lock);
1058         return (err);
1059 }
1060 
1061 /*
1062  * Remove a flow entry from the global flow hash table
1063  */
1064 void
1065 mac_flow_hash_remove(flow_entry_t *flent)
1066 {
1067         mod_hash_val_t  val;
1068 
1069         rw_enter(&flow_tab_lock, RW_WRITER);
1070         VERIFY(mod_hash_remove(flow_hash,
1071             (mod_hash_key_t)flent->fe_flow_name, &val) == 0);
1072 
1073         /* Clear the mark that says inserted into the global flow hash table */
1074         FLOW_UNMARK(flent, FE_G_FLOW_HASH);
1075         rw_exit(&flow_tab_lock);
1076 }
1077 
1078 /*
1079  * Retrieve a flow entry from the global flow hash table.
1080  */
1081 int
1082 mac_flow_lookup_byname(char *name, flow_entry_t **flentp)
1083 {
1084         int             err;
1085         flow_entry_t    *flent;
1086 
1087         rw_enter(&flow_tab_lock, RW_READER);
1088         err = mod_hash_find(flow_hash, (mod_hash_key_t)name,
1089             (mod_hash_val_t *)&flent);
1090         if (err != 0) {
1091                 rw_exit(&flow_tab_lock);
1092                 return (ENOENT);
1093         }
1094         ASSERT(flent != NULL);
1095         FLOW_USER_REFHOLD(flent);
1096         rw_exit(&flow_tab_lock);
1097 
1098         *flentp = flent;
1099         return (0);
1100 }
1101 
1102 /*
1103  * Initialize or release mac client flows by walking the subflow table.
1104  * These are typically invoked during plumb/unplumb of links.
1105  */
1106 
1107 static int
1108 mac_link_init_flows_cb(flow_entry_t *flent, void *arg)
1109 {
1110         mac_client_impl_t       *mcip = arg;
1111 
1112         if (mac_link_flow_init(arg, flent) != 0) {
1113                 cmn_err(CE_WARN, "Failed to initialize flow '%s' on link '%s'",
1114                     flent->fe_flow_name, mcip->mci_name);
1115         } else {
1116                 FLOW_UNMARK(flent, FE_UF_NO_DATAPATH);
1117         }
1118         return (0);
1119 }
1120 
/*
 * Initialize (plumb) every subflow configured on the given mac client
 * by walking its subflow table, then switch the client's RX processing
 * over to the subflow classifier.
 */
void
mac_link_init_flows(mac_client_handle_t mch)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;

	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
	    mac_link_init_flows_cb, mcip);
	/*
	 * If mac client had subflow(s) configured before plumb, change
	 * function to mac_rx_srs_subflow_process and in case of hardware
	 * classification, disable polling.
	 */
	mac_client_update_classifier(mcip, B_TRUE);

}
1136 
1137 boolean_t
1138 mac_link_has_flows(mac_client_handle_t mch)
1139 {
1140         mac_client_impl_t       *mcip = (mac_client_impl_t *)mch;
1141 
1142         if (!FLOW_TAB_EMPTY(mcip->mci_subflow_tab))
1143                 return (B_TRUE);
1144 
1145         return (B_FALSE);
1146 }
1147 
/*
 * Per-flow callback for mac_link_release_flows(): mark the flow's
 * datapath as down, wait for in-flight driver upcalls to drain, then
 * tear down the flow's datapath state. Always returns 0 so the walk
 * visits every subflow.
 */
static int
mac_link_release_flows_cb(flow_entry_t *flent, void *arg)
{
	FLOW_MARK(flent, FE_UF_NO_DATAPATH);
	mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
	mac_link_flow_clean(arg, flent);
	return (0);
}
1156 
/*
 * Release (unplumb) every subflow configured on the given mac client.
 * The client's RX classifier is restored first, then each subflow's
 * datapath is torn down via mac_link_release_flows_cb().
 */
void
mac_link_release_flows(mac_client_handle_t mch)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;

	/*
	 * Change the mci_flent callback back to mac_rx_srs_process()
	 * because flows are about to be deactivated.
	 */
	mac_client_update_classifier(mcip, B_FALSE);
	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
	    mac_link_release_flows_cb, mcip);
}
1170 
/*
 * Rename a flow and, if kstats were created for it (fe_ksp != NULL),
 * destroy and re-create them so they appear under the new name.
 */
void
mac_rename_flow(flow_entry_t *fep, const char *new_name)
{
	mac_flow_set_name(fep, new_name);
	if (fep->fe_ksp != NULL) {
		flow_stat_destroy(fep);
		flow_stat_create(fep);
	}
}
1180 
1181 /*
1182  * mac_link_flow_init()
1183  * Internal flow interface used for allocating SRSs and related
1184  * data structures. Not meant to be used by mac clients.
1185  */
1186 int
1187 mac_link_flow_init(mac_client_handle_t mch, flow_entry_t *sub_flow)
1188 {
1189         mac_client_impl_t       *mcip = (mac_client_impl_t *)mch;
1190         mac_impl_t              *mip = mcip->mci_mip;
1191         int                     err;
1192 
1193         ASSERT(mch != NULL);
1194         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1195 
1196         if ((err = mac_datapath_setup(mcip, sub_flow, SRST_FLOW)) != 0)
1197                 return (err);
1198 
1199         sub_flow->fe_mcip = mcip;
1200 
1201         return (0);
1202 }
1203 
1204 /*
1205  * mac_link_flow_add()
1206  * Used by flowadm(1m) or kernel mac clients for creating flows.
1207  */
1208 int
1209 mac_link_flow_add(datalink_id_t linkid, char *flow_name,
1210     flow_desc_t *flow_desc, mac_resource_props_t *mrp)
1211 {
1212         flow_entry_t            *flent = NULL;
1213         int                     err;
1214         dls_dl_handle_t         dlh;
1215         dls_link_t              *dlp;
1216         boolean_t               link_held = B_FALSE;
1217         boolean_t               hash_added = B_FALSE;
1218         mac_perim_handle_t      mph;
1219 
1220         err = mac_flow_lookup_byname(flow_name, &flent);
1221         if (err == 0) {
1222                 FLOW_USER_REFRELE(flent);
1223                 return (EEXIST);
1224         }
1225 
1226         /*
1227          * First create a flow entry given the description provided
1228          * by the caller.
1229          */
1230         err = mac_flow_create(flow_desc, mrp, flow_name, NULL,
1231             FLOW_USER | FLOW_OTHER, &flent);
1232 
1233         if (err != 0)
1234                 return (err);
1235 
1236         /*
1237          * We've got a local variable referencing this flow now, so we need
1238          * to hold it. We'll release this flow before returning.
1239          * All failures until we return will undo any action that may internally
1240          * held the flow, so the last REFRELE will assure a clean freeing
1241          * of resources.
1242          */
1243         FLOW_REFHOLD(flent);
1244 
1245         flent->fe_link_id = linkid;
1246         FLOW_MARK(flent, FE_INCIPIENT);
1247 
1248         err = mac_perim_enter_by_linkid(linkid, &mph);
1249         if (err != 0) {
1250                 FLOW_FINAL_REFRELE(flent);
1251                 return (err);
1252         }
1253 
1254         /*
1255          * dls will eventually be merged with mac so it's ok
1256          * to call dls' internal functions.
1257          */
1258         err = dls_devnet_hold_link(linkid, &dlh, &dlp);
1259         if (err != 0)
1260                 goto bail;
1261 
1262         link_held = B_TRUE;
1263 
1264         /*
1265          * Add the flow to the global flow table, this table will be per
1266          * exclusive zone so each zone can have its own flow namespace.
1267          * RFE 6625651 will fix this.
1268          *
1269          */
1270         if ((err = mac_flow_hash_add(flent)) != 0)
1271                 goto bail;
1272 
1273         hash_added = B_TRUE;
1274 
1275         /*
1276          * do not allow flows to be configured on an anchor VNIC
1277          */
1278         if (mac_capab_get(dlp->dl_mh, MAC_CAPAB_ANCHOR_VNIC, NULL)) {
1279                 err = ENOTSUP;
1280                 goto bail;
1281         }
1282 
1283         /*
1284          * Add the subflow to the subflow table. Also instantiate the flow
1285          * in the mac if there is an active user (we check if the MAC client's
1286          * datapath has been setup).
1287          */
1288         err = mac_flow_add_subflow(dlp->dl_mch, flent,
1289             MCIP_DATAPATH_SETUP((mac_client_impl_t *)dlp->dl_mch));
1290         if (err != 0)
1291                 goto bail;
1292 
1293         FLOW_UNMARK(flent, FE_INCIPIENT);
1294         dls_devnet_rele_link(dlh, dlp);
1295         mac_perim_exit(mph);
1296         return (0);
1297 
1298 bail:
1299         if (hash_added)
1300                 mac_flow_hash_remove(flent);
1301 
1302         if (link_held)
1303                 dls_devnet_rele_link(dlh, dlp);
1304 
1305         /*
1306          * Wait for any transient global flow hash refs to clear
1307          * and then release the creation reference on the flow
1308          */
1309         mac_flow_wait(flent, FLOW_USER_REF);
1310         FLOW_FINAL_REFRELE(flent);
1311         mac_perim_exit(mph);
1312         return (err);
1313 }
1314 
1315 /*
1316  * mac_link_flow_clean()
1317  * Internal flow interface used for freeing SRSs and related
1318  * data structures. Not meant to be used by mac clients.
1319  */
1320 void
1321 mac_link_flow_clean(mac_client_handle_t mch, flow_entry_t *sub_flow)
1322 {
1323         mac_client_impl_t       *mcip = (mac_client_impl_t *)mch;
1324         mac_impl_t              *mip = mcip->mci_mip;
1325         boolean_t               last_subflow;
1326 
1327         ASSERT(mch != NULL);
1328         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1329 
1330         /*
1331          * This sub flow entry may fail to be fully initialized by
1332          * mac_link_flow_init(). If so, simply return.
1333          */
1334         if (sub_flow->fe_mcip == NULL)
1335                 return;
1336 
1337         last_subflow = FLOW_TAB_EMPTY(mcip->mci_subflow_tab);
1338         /*
1339          * Tear down the data path
1340          */
1341         mac_datapath_teardown(mcip, sub_flow, SRST_FLOW);
1342         sub_flow->fe_mcip = NULL;
1343 
1344         /*
1345          * Delete the SRSs associated with this subflow. If this is being
1346          * driven by flowadm(1M) then the subflow will be deleted by
1347          * dls_rem_flow. However if this is a result of the interface being
1348          * unplumbed then the subflow itself won't be deleted.
1349          */
1350         mac_flow_cleanup(sub_flow);
1351 
1352         /*
1353          * If all the subflows are gone, renable some of the stuff
1354          * we disabled when adding a subflow, polling etc.
1355          */
1356         if (last_subflow) {
1357                 /*
1358                  * The subflow table itself is not protected by any locks or
1359                  * refcnts. Hence quiesce the client upfront before clearing
1360                  * mci_subflow_tab.
1361                  */
1362                 mac_client_quiesce(mcip);
1363                 mac_client_update_classifier(mcip, B_FALSE);
1364                 mac_flow_tab_destroy(mcip->mci_subflow_tab);
1365                 mcip->mci_subflow_tab = NULL;
1366                 mac_client_restart(mcip);
1367         }
1368 }
1369 
1370 /*
1371  * mac_link_flow_remove()
1372  * Used by flowadm(1m) or kernel mac clients for removing flows.
1373  */
1374 int
1375 mac_link_flow_remove(char *flow_name)
1376 {
1377         flow_entry_t            *flent;
1378         mac_perim_handle_t      mph;
1379         int                     err;
1380         datalink_id_t           linkid;
1381 
1382         err = mac_flow_lookup_byname(flow_name, &flent);
1383         if (err != 0)
1384                 return (err);
1385 
1386         linkid = flent->fe_link_id;
1387         FLOW_USER_REFRELE(flent);
1388 
1389         /*
1390          * The perim must be acquired before acquiring any other references
1391          * to maintain the lock and perimeter hierarchy. Please note the
1392          * FLOW_REFRELE above.
1393          */
1394         err = mac_perim_enter_by_linkid(linkid, &mph);
1395         if (err != 0)
1396                 return (err);
1397 
1398         /*
1399          * Note the second lookup of the flow, because a concurrent thread
1400          * may have removed it already while we were waiting to enter the
1401          * link's perimeter.
1402          */
1403         err = mac_flow_lookup_byname(flow_name, &flent);
1404         if (err != 0) {
1405                 mac_perim_exit(mph);
1406                 return (err);
1407         }
1408         FLOW_USER_REFRELE(flent);
1409 
1410         /*
1411          * Remove the flow from the subflow table and deactivate the flow
1412          * by quiescing and removings its SRSs
1413          */
1414         mac_flow_rem_subflow(flent);
1415 
1416         /*
1417          * Finally, remove the flow from the global table.
1418          */
1419         mac_flow_hash_remove(flent);
1420 
1421         /*
1422          * Wait for any transient global flow hash refs to clear
1423          * and then release the creation reference on the flow
1424          */
1425         mac_flow_wait(flent, FLOW_USER_REF);
1426         FLOW_FINAL_REFRELE(flent);
1427 
1428         mac_perim_exit(mph);
1429 
1430         return (0);
1431 }
1432 
1433 /*
1434  * mac_link_flow_modify()
1435  * Modifies the properties of a flow identified by its name.
1436  */
1437 int
1438 mac_link_flow_modify(char *flow_name, mac_resource_props_t *mrp)
1439 {
1440         flow_entry_t            *flent;
1441         mac_client_impl_t       *mcip;
1442         int                     err = 0;
1443         mac_perim_handle_t      mph;
1444         datalink_id_t           linkid;
1445         flow_tab_t              *flow_tab;
1446 
1447         err = mac_validate_props(NULL, mrp);
1448         if (err != 0)
1449                 return (err);
1450 
1451         err = mac_flow_lookup_byname(flow_name, &flent);
1452         if (err != 0)
1453                 return (err);
1454 
1455         linkid = flent->fe_link_id;
1456         FLOW_USER_REFRELE(flent);
1457 
1458         /*
1459          * The perim must be acquired before acquiring any other references
1460          * to maintain the lock and perimeter hierarchy. Please note the
1461          * FLOW_REFRELE above.
1462          */
1463         err = mac_perim_enter_by_linkid(linkid, &mph);
1464         if (err != 0)
1465                 return (err);
1466 
1467         /*
1468          * Note the second lookup of the flow, because a concurrent thread
1469          * may have removed it already while we were waiting to enter the
1470          * link's perimeter.
1471          */
1472         err = mac_flow_lookup_byname(flow_name, &flent);
1473         if (err != 0) {
1474                 mac_perim_exit(mph);
1475                 return (err);
1476         }
1477         FLOW_USER_REFRELE(flent);
1478 
1479         /*
1480          * If this flow is attached to a MAC client, then pass the request
1481          * along to the client.
1482          * Otherwise, just update the cached values.
1483          */
1484         mcip = flent->fe_mcip;
1485         mac_update_resources(mrp, &flent->fe_resource_props, B_TRUE);
1486         if (mcip != NULL) {
1487                 if ((flow_tab = mcip->mci_subflow_tab) == NULL) {
1488                         err = ENOENT;
1489                 } else {
1490                         mac_flow_modify(flow_tab, flent, mrp);
1491                 }
1492         } else {
1493                 (void) mac_flow_modify_props(flent, mrp);
1494         }
1495 
1496 done:
1497         mac_perim_exit(mph);
1498         return (err);
1499 }
1500 
1501 
1502 /*
1503  * State structure and misc functions used by mac_link_flow_walk().
1504  */
1505 typedef struct {
1506         int     (*ws_func)(mac_flowinfo_t *, void *);
1507         void    *ws_arg;
1508 } flow_walk_state_t;
1509 
/*
 * Copy the externally visible attributes of 'flent' — name, link id,
 * flow descriptor, and resource properties — into the caller-supplied
 * mac_flowinfo_t.
 */
static void
mac_link_flowinfo_copy(mac_flowinfo_t *finfop, flow_entry_t *flent)
{
	(void) strlcpy(finfop->fi_flow_name, flent->fe_flow_name,
	    MAXFLOWNAMELEN);
	finfop->fi_link_id = flent->fe_link_id;
	finfop->fi_flow_desc = flent->fe_flow_desc;
	finfop->fi_resource_props = flent->fe_resource_props;
}
1519 
1520 static int
1521 mac_link_flow_walk_cb(flow_entry_t *flent, void *arg)
1522 {
1523         flow_walk_state_t       *statep = arg;
1524         mac_flowinfo_t          *finfo;
1525         int                     err;
1526 
1527         finfo = kmem_zalloc(sizeof (*finfo), KM_SLEEP);
1528         mac_link_flowinfo_copy(finfo, flent);
1529         err = statep->ws_func(finfo, statep->ws_arg);
1530         kmem_free(finfo, sizeof (*finfo));
1531         return (err);
1532 }
1533 
1534 /*
1535  * mac_link_flow_walk()
1536  * Invokes callback 'func' for all flows belonging to the specified link.
1537  */
1538 int
1539 mac_link_flow_walk(datalink_id_t linkid,
1540     int (*func)(mac_flowinfo_t *, void *), void *arg)
1541 {
1542         mac_client_impl_t       *mcip;
1543         mac_perim_handle_t      mph;
1544         flow_walk_state_t       state;
1545         dls_dl_handle_t         dlh;
1546         dls_link_t              *dlp;
1547         int                     err;
1548 
1549         err = mac_perim_enter_by_linkid(linkid, &mph);
1550         if (err != 0)
1551                 return (err);
1552 
1553         err = dls_devnet_hold_link(linkid, &dlh, &dlp);
1554         if (err != 0) {
1555                 mac_perim_exit(mph);
1556                 return (err);
1557         }
1558 
1559         mcip = (mac_client_impl_t *)dlp->dl_mch;
1560         state.ws_func = func;
1561         state.ws_arg = arg;
1562 
1563         err = mac_flow_walk_nolock(mcip->mci_subflow_tab,
1564             mac_link_flow_walk_cb, &state);
1565 
1566         dls_devnet_rele_link(dlh, dlp);
1567         mac_perim_exit(mph);
1568         return (err);
1569 }
1570 
1571 /*
1572  * mac_link_flow_info()
1573  * Retrieves information about a specific flow.
1574  */
1575 int
1576 mac_link_flow_info(char *flow_name, mac_flowinfo_t *finfo)
1577 {
1578         flow_entry_t    *flent;
1579         int             err;
1580 
1581         err = mac_flow_lookup_byname(flow_name, &flent);
1582         if (err != 0)
1583                 return (err);
1584 
1585         mac_link_flowinfo_copy(finfo, flent);
1586         FLOW_USER_REFRELE(flent);
1587         return (0);
1588 }
1589 
1590 /*
1591  * Hash function macro that takes an Ethernet address and VLAN id as input.
1592  */
1593 #define HASH_ETHER_VID(a, v, s) \
1594         ((((uint32_t)(a)[3] + (a)[4] + (a)[5]) ^ (v)) % (s))
1595 
1596 /*
1597  * Generic layer-2 address hashing function that takes an address and address
1598  * length as input.  This is the DJB hash function.
1599  */
1600 static uint32_t
1601 flow_l2_addrhash(uint8_t *addr, size_t addrlen, size_t htsize)
1602 {
1603         uint32_t        hash = 5381;
1604         size_t          i;
1605 
1606         for (i = 0; i < addrlen; i++)
1607                 hash = ((hash << 5) + hash) + addr[i];
1608         return (hash % htsize);
1609 }
1610 
/*
 * True when the current mblk ends before 'end', i.e. the packet data in
 * this mblk is too short to contain a header extending up to 'end'.
 */
#define	PKT_TOO_SMALL(s, end) ((s)->fs_mp->b_wptr < (end))

/*
 * If 'start' sits exactly at the end of the current mblk, advance the
 * flow state to the next mblk in the chain (failing with EINVAL if
 * there is none) and repoint 'start' at its first byte.
 */
#define	CHECK_AND_ADJUST_START_PTR(s, start) {		\
	if ((s)->fs_mp->b_wptr == (start)) {		\
		mblk_t	*next = (s)->fs_mp->b_cont;	\
		if (next == NULL)			\
			return (EINVAL);		\
							\
		(s)->fs_mp = next;			\
		(start) = next->b_rptr;			\
	}						\
}
1623 
/*
 * Layer-2 match: a packet matches the flow if its VLAN id equals
 * fd_vid and its destination address equals fd_dst_mac. l2_daddr must
 * already have been filled in by the paired accept/hash function.
 */
/* ARGSUSED */
static boolean_t
flow_l2_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l2info_t		*l2 = &s->fs_l2info;
	flow_desc_t		*fd = &flent->fe_flow_desc;

	return (l2->l2_vid == fd->fd_vid &&
	    bcmp(l2->l2_daddr, fd->fd_dst_mac, fd->fd_mac_len) == 0);
}
1634 
1635 /*
1636  * Layer 2 hash function.
1637  * Must be paired with flow_l2_accept() within a set of flow_ops
1638  * because it assumes the dest address is already extracted.
1639  */
1640 static uint32_t
1641 flow_l2_hash(flow_tab_t *ft, flow_state_t *s)
1642 {
1643         return (flow_l2_addrhash(s->fs_l2info.l2_daddr,
1644             ft->ft_mip->mi_type->mt_addr_length, ft->ft_size));
1645 }
1646 
1647 /*
1648  * This is the generic layer 2 accept function.
1649  * It makes use of mac_header_info() to extract the header length,
1650  * sap, vlan ID and destination address.
1651  */
1652 static int
1653 flow_l2_accept(flow_tab_t *ft, flow_state_t *s)
1654 {
1655         boolean_t               is_ether;
1656         flow_l2info_t           *l2 = &s->fs_l2info;
1657         mac_header_info_t       mhi;
1658         int                     err;
1659 
1660         is_ether = (ft->ft_mip->mi_info.mi_nativemedia == DL_ETHER);
1661         if ((err = mac_header_info((mac_handle_t)ft->ft_mip,
1662             s->fs_mp, &mhi)) != 0) {
1663                 if (err == EINVAL)
1664                         err = ENOBUFS;
1665 
1666                 return (err);
1667         }
1668 
1669         l2->l2_start = s->fs_mp->b_rptr;
1670         l2->l2_daddr = (uint8_t *)mhi.mhi_daddr;
1671 
1672         if (is_ether && mhi.mhi_bindsap == ETHERTYPE_VLAN &&
1673             ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
1674                 struct ether_vlan_header        *evhp =
1675                     (struct ether_vlan_header *)l2->l2_start;
1676 
1677                 if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
1678                         return (ENOBUFS);
1679 
1680                 l2->l2_sap = ntohs(evhp->ether_type);
1681                 l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
1682                 l2->l2_hdrsize = sizeof (*evhp);
1683         } else {
1684                 l2->l2_sap = mhi.mhi_bindsap;
1685                 l2->l2_vid = 0;
1686                 l2->l2_hdrsize = (uint32_t)mhi.mhi_hdrsize;
1687         }
1688         return (0);
1689 }
1690 
1691 /*
1692  * flow_ether_hash()/accept() are optimized versions of flow_l2_hash()/
1693  * accept(). The notable difference is that dest address is now extracted
1694  * by hash() rather than by accept(). This saves a few memory references
1695  * for flow tables that do not care about mac addresses.
1696  */
1697 static uint32_t
1698 flow_ether_hash(flow_tab_t *ft, flow_state_t *s)
1699 {
1700         flow_l2info_t                   *l2 = &s->fs_l2info;
1701         struct ether_vlan_header        *evhp;
1702 
1703         evhp = (struct ether_vlan_header *)l2->l2_start;
1704         l2->l2_daddr = evhp->ether_dhost.ether_addr_octet;
1705         return (HASH_ETHER_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size));
1706 }
1707 
/*
 * Hash an existing flow entry's ethernet address and VLAN id into the
 * table. A non-zero fd_vid is only legal when the flow matches on
 * FLOW_LINK_VID (asserted), so entries hash consistently with packets.
 */
static uint32_t
flow_ether_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;

	ASSERT((fd->fd_mask & FLOW_LINK_VID) != 0 || fd->fd_vid == 0);
	return (HASH_ETHER_VID(fd->fd_dst_mac, fd->fd_vid, ft->ft_size));
}
1716 
/*
 * Optimized ethernet accept: parse the ethernet header directly rather
 * than going through mac_header_info(). Note the first length check
 * covers sizeof (struct ether_header), which is enough to read
 * ether_tpid (it occupies the ether_header type field's offset); the
 * larger VLAN header is only required when a VLAN tag is present.
 */
/* ARGSUSED */
static int
flow_ether_accept(flow_tab_t *ft, flow_state_t *s)
{
	flow_l2info_t			*l2 = &s->fs_l2info;
	struct ether_vlan_header	*evhp;
	uint16_t			sap;

	evhp = (struct ether_vlan_header *)s->fs_mp->b_rptr;
	l2->l2_start = (uchar_t *)evhp;

	if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (struct ether_header)))
		return (ENOBUFS);

	if ((sap = ntohs(evhp->ether_tpid)) == ETHERTYPE_VLAN &&
	    ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
		/* Tagged frame: need the full VLAN header for tci/type. */
		if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
			return (ENOBUFS);

		l2->l2_sap = ntohs(evhp->ether_type);
		l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
		l2->l2_hdrsize = sizeof (struct ether_vlan_header);
	} else {
		l2->l2_sap = sap;
		l2->l2_vid = 0;
		l2->l2_hdrsize = sizeof (struct ether_header);
	}
	return (0);
}
1746 
1747 /*
1748  * Validates a layer 2 flow entry.
1749  */
1750 static int
1751 flow_l2_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
1752 {
1753         flow_desc_t     *fd = &flent->fe_flow_desc;
1754 
1755         /*
1756          * Dest address is mandatory, and 0 length addresses are not yet
1757          * supported.
1758          */
1759         if ((fd->fd_mask & FLOW_LINK_DST) == 0 || fd->fd_mac_len == 0)
1760                 return (EINVAL);
1761 
1762         if ((fd->fd_mask & FLOW_LINK_VID) != 0) {
1763                 /*
1764                  * VLAN flows are only supported over ethernet macs.
1765                  */
1766                 if (ft->ft_mip->mi_info.mi_nativemedia != DL_ETHER)
1767                         return (EINVAL);
1768 
1769                 if (fd->fd_vid == 0)
1770                         return (EINVAL);
1771 
1772         }
1773         flent->fe_match = flow_l2_match;
1774         return (0);
1775 }
1776 
1777 /*
1778  * Calculates hash index of flow entry.
1779  */
1780 static uint32_t
1781 flow_l2_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
1782 {
1783         flow_desc_t     *fd = &flent->fe_flow_desc;
1784 
1785         ASSERT((fd->fd_mask & FLOW_LINK_VID) == 0 && fd->fd_vid == 0);
1786         return (flow_l2_addrhash(fd->fd_dst_mac,
1787             ft->ft_mip->mi_type->mt_addr_length, ft->ft_size));
1788 }
1789 
1790 /*
1791  * This is used for duplicate flow checking.
1792  */
1793 /* ARGSUSED */
1794 static boolean_t
1795 flow_l2_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
1796 {
1797         flow_desc_t     *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
1798 
1799         ASSERT(fd1->fd_mac_len == fd2->fd_mac_len && fd1->fd_mac_len != 0);
1800         return (bcmp(&fd1->fd_dst_mac, &fd2->fd_dst_mac,
1801             fd1->fd_mac_len) == 0 && fd1->fd_vid == fd2->fd_vid);
1802 }
1803 
1804 /*
1805  * Generic flow entry insertion function.
1806  * Used by flow tables that do not have ordering requirements.
1807  */
1808 /* ARGSUSED */
1809 static int
1810 flow_generic_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
1811     flow_entry_t *flent)
1812 {
1813         ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
1814 
1815         if (*headp != NULL) {
1816                 ASSERT(flent->fe_next == NULL);
1817                 flent->fe_next = *headp;
1818         }
1819         *headp = flent;
1820         return (0);
1821 }
1822 
1823 /*
1824  * IP version independent DSField matching function.
1825  */
1826 /* ARGSUSED */
1827 static boolean_t
1828 flow_ip_dsfield_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1829 {
1830         flow_l3info_t   *l3info = &s->fs_l3info;
1831         flow_desc_t     *fd = &flent->fe_flow_desc;
1832 
1833         switch (l3info->l3_version) {
1834         case IPV4_VERSION: {
1835                 ipha_t          *ipha = (ipha_t *)l3info->l3_start;
1836 
1837                 return ((ipha->ipha_type_of_service &
1838                     fd->fd_dsfield_mask) == fd->fd_dsfield);
1839         }
1840         case IPV6_VERSION: {
1841                 ip6_t           *ip6h = (ip6_t *)l3info->l3_start;
1842 
1843                 return ((IPV6_FLOW_TCLASS(ip6h->ip6_vcf) &
1844                     fd->fd_dsfield_mask) == fd->fd_dsfield);
1845         }
1846         default:
1847                 return (B_FALSE);
1848         }
1849 }
1850 
1851 /*
1852  * IP v4 and v6 address matching.
1853  * The netmask only needs to be applied on the packet but not on the
1854  * flow_desc since fd_local_addr/fd_remote_addr are premasked subnets.
1855  */
1856 
1857 /* ARGSUSED */
1858 static boolean_t
1859 flow_ip_v4_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1860 {
1861         flow_l3info_t   *l3info = &s->fs_l3info;
1862         flow_desc_t     *fd = &flent->fe_flow_desc;
1863         ipha_t          *ipha = (ipha_t *)l3info->l3_start;
1864         in_addr_t       addr;
1865 
1866         addr = (l3info->l3_dst_or_src ? ipha->ipha_dst : ipha->ipha_src);
1867         if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
1868                 return ((addr & V4_PART_OF_V6(fd->fd_local_netmask)) ==
1869                     V4_PART_OF_V6(fd->fd_local_addr));
1870         }
1871         return ((addr & V4_PART_OF_V6(fd->fd_remote_netmask)) ==
1872             V4_PART_OF_V6(fd->fd_remote_addr));
1873 }
1874 
1875 /* ARGSUSED */
1876 static boolean_t
1877 flow_ip_v6_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1878 {
1879         flow_l3info_t   *l3info = &s->fs_l3info;
1880         flow_desc_t     *fd = &flent->fe_flow_desc;
1881         ip6_t           *ip6h = (ip6_t *)l3info->l3_start;
1882         in6_addr_t      *addrp;
1883 
1884         addrp = (l3info->l3_dst_or_src ? &ip6h->ip6_dst : &ip6h->ip6_src);
1885         if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
1886                 return (V6_MASK_EQ(*addrp, fd->fd_local_netmask,
1887                     fd->fd_local_addr));
1888         }
1889         return (V6_MASK_EQ(*addrp, fd->fd_remote_netmask, fd->fd_remote_addr));
1890 }
1891 
1892 /* ARGSUSED */
1893 static boolean_t
1894 flow_ip_proto_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1895 {
1896         flow_l3info_t   *l3info = &s->fs_l3info;
1897         flow_desc_t     *fd = &flent->fe_flow_desc;
1898 
1899         return (l3info->l3_protocol == fd->fd_protocol);
1900 }
1901 
1902 static uint32_t
1903 flow_ip_hash(flow_tab_t *ft, flow_state_t *s)
1904 {
1905         flow_l3info_t   *l3info = &s->fs_l3info;
1906         flow_mask_t     mask = ft->ft_mask;
1907 
1908         if ((mask & FLOW_IP_LOCAL) != 0) {
1909                 l3info->l3_dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
1910         } else if ((mask & FLOW_IP_REMOTE) != 0) {
1911                 l3info->l3_dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
1912         } else if ((mask & FLOW_IP_DSFIELD) != 0) {
1913                 /*
1914                  * DSField flents are arranged as a single list.
1915                  */
1916                 return (0);
1917         }
1918         /*
1919          * IP addr flents are hashed into two lists, v4 or v6.
1920          */
1921         ASSERT(ft->ft_size >= 2);
1922         return ((l3info->l3_version == IPV4_VERSION) ? 0 : 1);
1923 }
1924 
1925 static uint32_t
1926 flow_ip_proto_hash(flow_tab_t *ft, flow_state_t *s)
1927 {
1928         flow_l3info_t   *l3info = &s->fs_l3info;
1929 
1930         return (l3info->l3_protocol % ft->ft_size);
1931 }
1932 
/*
 * Parses the layer 3 (IPv4/IPv6) header of the packet described by s
 * and records the header size, upper-layer protocol, IP version and
 * fragmentation state in fs_l3info.  Requires fs_l2info to have been
 * filled in first.  Returns 0 on success, ENOBUFS if the packet is
 * truncated, or EINVAL for a non-IP SAP or a misaligned L3 header.
 */
/* ARGSUSED */
static int
flow_ip_accept(flow_tab_t *ft, flow_state_t *s)
{
	flow_l2info_t	*l2info = &s->fs_l2info;
	flow_l3info_t	*l3info = &s->fs_l3info;
	uint16_t	sap = l2info->l2_sap;
	uchar_t		*l3_start;

	/* The L3 header immediately follows the L2 header. */
	l3_start = l2info->l2_start + l2info->l2_hdrsize;

	/*
	 * Adjust start pointer if we're at the end of an mblk.
	 */
	CHECK_AND_ADJUST_START_PTR(s, l3_start);

	l3info->l3_start = l3_start;
	/* Reject headers not aligned on a 32-bit boundary. */
	if (!OK_32PTR(l3_start))
		return (EINVAL);

	switch (sap) {
	case ETHERTYPE_IP: {
		ipha_t	*ipha = (ipha_t *)l3_start;

		if (PKT_TOO_SMALL(s, l3_start + IP_SIMPLE_HDR_LENGTH))
			return (ENOBUFS);

		/* Header length includes any IPv4 options. */
		l3info->l3_hdrsize = IPH_HDR_LENGTH(ipha);
		l3info->l3_protocol = ipha->ipha_protocol;
		l3info->l3_version = IPV4_VERSION;
		l3info->l3_fragmented =
		    IS_V4_FRAGMENT(ipha->ipha_fragment_offset_and_flags);
		break;
	}
	case ETHERTYPE_IPV6: {
		ip6_t		*ip6h = (ip6_t *)l3_start;
		ip6_frag_t	*frag = NULL;
		uint16_t	ip6_hdrlen;
		uint8_t		nexthdr;

		/*
		 * Walk the extension headers to find the total header
		 * length, the upper-layer protocol, and a fragment
		 * header if one is present.
		 */
		if (!mac_ip_hdr_length_v6(ip6h, s->fs_mp->b_wptr, &ip6_hdrlen,
		    &nexthdr, &frag)) {
			return (ENOBUFS);
		}
		l3info->l3_hdrsize = ip6_hdrlen;
		l3info->l3_protocol = nexthdr;
		l3info->l3_version = IPV6_VERSION;
		l3info->l3_fragmented = (frag != NULL);
		break;
	}
	default:
		return (EINVAL);
	}
	return (0);
}
1988 
1989 /* ARGSUSED */
1990 static int
1991 flow_ip_proto_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
1992 {
1993         flow_desc_t     *fd = &flent->fe_flow_desc;
1994 
1995         switch (fd->fd_protocol) {
1996         case IPPROTO_TCP:
1997         case IPPROTO_UDP:
1998         case IPPROTO_SCTP:
1999         case IPPROTO_ICMP:
2000         case IPPROTO_ICMPV6:
2001                 flent->fe_match = flow_ip_proto_match;
2002                 return (0);
2003         default:
2004                 return (EINVAL);
2005         }
2006 }
2007 
2008 /* ARGSUSED */
2009 static int
2010 flow_ip_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
2011 {
2012         flow_desc_t     *fd = &flent->fe_flow_desc;
2013         flow_mask_t     mask;
2014         uint8_t         version;
2015         in6_addr_t      *addr, *netmask;
2016 
2017         /*
2018          * DSField does not require a IP version.
2019          */
2020         if (fd->fd_mask == FLOW_IP_DSFIELD) {
2021                 if (fd->fd_dsfield_mask == 0)
2022                         return (EINVAL);
2023 
2024                 flent->fe_match = flow_ip_dsfield_match;
2025                 return (0);
2026         }
2027 
2028         /*
2029          * IP addresses must come with a version to avoid ambiguity.
2030          */
2031         if ((fd->fd_mask & FLOW_IP_VERSION) == 0)
2032                 return (EINVAL);
2033 
2034         version = fd->fd_ipversion;
2035         if (version != IPV4_VERSION && version != IPV6_VERSION)
2036                 return (EINVAL);
2037 
2038         mask = fd->fd_mask & ~FLOW_IP_VERSION;
2039         switch (mask) {
2040         case FLOW_IP_LOCAL:
2041                 addr = &fd->fd_local_addr;
2042                 netmask = &fd->fd_local_netmask;
2043                 break;
2044         case FLOW_IP_REMOTE:
2045                 addr = &fd->fd_remote_addr;
2046                 netmask = &fd->fd_remote_netmask;
2047                 break;
2048         default:
2049                 return (EINVAL);
2050         }
2051 
2052         /*
2053          * Apply netmask onto specified address.
2054          */
2055         V6_MASK_COPY(*addr, *netmask, *addr);
2056         if (version == IPV4_VERSION) {
2057                 ipaddr_t        v4addr = V4_PART_OF_V6((*addr));
2058                 ipaddr_t        v4mask = V4_PART_OF_V6((*netmask));
2059 
2060                 if (v4addr == 0 || v4mask == 0)
2061                         return (EINVAL);
2062                 flent->fe_match = flow_ip_v4_match;
2063         } else {
2064                 if (IN6_IS_ADDR_UNSPECIFIED(addr) ||
2065                     IN6_IS_ADDR_UNSPECIFIED(netmask))
2066                         return (EINVAL);
2067                 flent->fe_match = flow_ip_v6_match;
2068         }
2069         return (0);
2070 }
2071 
2072 static uint32_t
2073 flow_ip_proto_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
2074 {
2075         flow_desc_t     *fd = &flent->fe_flow_desc;
2076 
2077         return (fd->fd_protocol % ft->ft_size);
2078 }
2079 
2080 static uint32_t
2081 flow_ip_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
2082 {
2083         flow_desc_t     *fd = &flent->fe_flow_desc;
2084 
2085         /*
2086          * DSField flents are arranged as a single list.
2087          */
2088         if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
2089                 return (0);
2090 
2091         /*
2092          * IP addr flents are hashed into two lists, v4 or v6.
2093          */
2094         ASSERT(ft->ft_size >= 2);
2095         return ((fd->fd_ipversion == IPV4_VERSION) ? 0 : 1);
2096 }
2097 
2098 /* ARGSUSED */
2099 static boolean_t
2100 flow_ip_proto_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
2101 {
2102         flow_desc_t     *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
2103 
2104         return (fd1->fd_protocol == fd2->fd_protocol);
2105 }
2106 
2107 /* ARGSUSED */
2108 static boolean_t
2109 flow_ip_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
2110 {
2111         flow_desc_t     *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
2112         in6_addr_t      *a1, *m1, *a2, *m2;
2113 
2114         ASSERT(fd1->fd_mask == fd2->fd_mask);
2115         if (fd1->fd_mask == FLOW_IP_DSFIELD) {
2116                 return (fd1->fd_dsfield == fd2->fd_dsfield &&
2117                     fd1->fd_dsfield_mask == fd2->fd_dsfield_mask);
2118         }
2119 
2120         /*
2121          * flow_ip_accept_fe() already validated the version.
2122          */
2123         ASSERT((fd1->fd_mask & FLOW_IP_VERSION) != 0);
2124         if (fd1->fd_ipversion != fd2->fd_ipversion)
2125                 return (B_FALSE);
2126 
2127         switch (fd1->fd_mask & ~FLOW_IP_VERSION) {
2128         case FLOW_IP_LOCAL:
2129                 a1 = &fd1->fd_local_addr;
2130                 m1 = &fd1->fd_local_netmask;
2131                 a2 = &fd2->fd_local_addr;
2132                 m2 = &fd2->fd_local_netmask;
2133                 break;
2134         case FLOW_IP_REMOTE:
2135                 a1 = &fd1->fd_remote_addr;
2136                 m1 = &fd1->fd_remote_netmask;
2137                 a2 = &fd2->fd_remote_addr;
2138                 m2 = &fd2->fd_remote_netmask;
2139                 break;
2140         default:
2141                 /*
2142                  * This is unreachable given the checks in
2143                  * flow_ip_accept_fe().
2144                  */
2145                 return (B_FALSE);
2146         }
2147 
2148         if (fd1->fd_ipversion == IPV4_VERSION) {
2149                 return (V4_PART_OF_V6((*a1)) == V4_PART_OF_V6((*a2)) &&
2150                     V4_PART_OF_V6((*m1)) == V4_PART_OF_V6((*m2)));
2151 
2152         } else {
2153                 return (IN6_ARE_ADDR_EQUAL(a1, a2) &&
2154                     IN6_ARE_ADDR_EQUAL(m1, m2));
2155         }
2156 }
2157 
2158 static int
2159 flow_ip_mask2plen(in6_addr_t *v6mask)
2160 {
2161         int             bits;
2162         int             plen = IPV6_ABITS;
2163         int             i;
2164 
2165         for (i = 3; i >= 0; i--) {
2166                 if (v6mask->s6_addr32[i] == 0) {
2167                         plen -= 32;
2168                         continue;
2169                 }
2170                 bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1;
2171                 if (bits == 0)
2172                         break;
2173                 plen -= bits;
2174         }
2175         return (plen);
2176 }
2177 
/*
 * Inserts an IP flow entry into the bucket list headed by *headp.
 * DSField entries are simply prepended; address entries are kept in
 * descending prefix-length order so that more specific subnets appear
 * earlier in the list.  Always returns 0.
 */
/* ARGSUSED */
static int
flow_ip_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
    flow_entry_t *flent)
{
	flow_entry_t    **p = headp;
	flow_desc_t     *fd0, *fd;
	in6_addr_t      *m0, *m;
	int             plen0, plen;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));

	/*
	 * No special ordering needed for dsfield.
	 */
	fd0 = &flent->fe_flow_desc;
	if ((fd0->fd_mask & FLOW_IP_DSFIELD) != 0) {
		if (*p != NULL) {
			ASSERT(flent->fe_next == NULL);
			flent->fe_next = *p;
		}
		*p = flent;
		return (0);
	}

	/*
	 * IP address flows are arranged in descending prefix length order.
	 */
	m0 = ((fd0->fd_mask & FLOW_IP_LOCAL) != 0) ?
	    &fd0->fd_local_netmask : &fd0->fd_remote_netmask;
	plen0 = flow_ip_mask2plen(m0);
	ASSERT(plen0 != 0);

	/* Find the first entry whose prefix is no longer than ours. */
	for (; *p != NULL; p = &(*p)->fe_next) {
		fd = &(*p)->fe_flow_desc;

		/*
		 * Normally a dsfield flent shouldn't end up on the same
		 * list as an IP address because flow tables are (for now)
		 * disjoint. If we decide to support both IP and dsfield
		 * in the same table in the future, this check will allow
		 * for that.
		 */
		if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
			continue;

		/*
		 * We also allow for the mixing of local and remote address
		 * flents within one list.
		 */
		m = ((fd->fd_mask & FLOW_IP_LOCAL) != 0) ?
		    &fd->fd_local_netmask : &fd->fd_remote_netmask;
		plen = flow_ip_mask2plen(m);

		if (plen <= plen0)
			break;
	}
	/* Link the new entry in ahead of *p (or at the tail). */
	if (*p != NULL) {
		ASSERT(flent->fe_next == NULL);
		flent->fe_next = *p;
	}
	*p = flent;
	return (0);
}
2242 
2243 /*
2244  * Transport layer protocol and port matching functions.
2245  */
2246 
2247 /* ARGSUSED */
2248 static boolean_t
2249 flow_transport_lport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
2250 {
2251         flow_l3info_t   *l3info = &s->fs_l3info;
2252         flow_l4info_t   *l4info = &s->fs_l4info;
2253         flow_desc_t     *fd = &flent->fe_flow_desc;
2254 
2255         return (fd->fd_protocol == l3info->l3_protocol &&
2256             fd->fd_local_port == l4info->l4_hash_port);
2257 }
2258 
2259 /* ARGSUSED */
2260 static boolean_t
2261 flow_transport_rport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
2262 {
2263         flow_l3info_t   *l3info = &s->fs_l3info;
2264         flow_l4info_t   *l4info = &s->fs_l4info;
2265         flow_desc_t     *fd = &flent->fe_flow_desc;
2266 
2267         return (fd->fd_protocol == l3info->l3_protocol &&
2268             fd->fd_remote_port == l4info->l4_hash_port);
2269 }
2270 
2271 /*
2272  * Transport hash function.
2273  * Since we only support either local or remote port flows,
2274  * we only need to extract one of the ports to be used for
2275  * matching.
2276  */
2277 static uint32_t
2278 flow_transport_hash(flow_tab_t *ft, flow_state_t *s)
2279 {
2280         flow_l3info_t   *l3info = &s->fs_l3info;
2281         flow_l4info_t   *l4info = &s->fs_l4info;
2282         uint8_t         proto = l3info->l3_protocol;
2283         boolean_t       dst_or_src;
2284 
2285         if ((ft->ft_mask & FLOW_ULP_PORT_LOCAL) != 0) {
2286                 dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
2287         } else {
2288                 dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
2289         }
2290 
2291         l4info->l4_hash_port = dst_or_src ? l4info->l4_dst_port :
2292             l4info->l4_src_port;
2293 
2294         return ((l4info->l4_hash_port ^ (proto << 4)) % ft->ft_size);
2295 }
2296 
2297 /*
2298  * Unlike other accept() functions above, we do not need to get the header
2299  * size because this is our highest layer so far. If we want to do support
2300  * other higher layer protocols, we would need to save the l4_hdrsize
2301  * in the code below.
2302  */
2303 
2304 /* ARGSUSED */
2305 static int
2306 flow_transport_accept(flow_tab_t *ft, flow_state_t *s)
2307 {
2308         flow_l3info_t   *l3info = &s->fs_l3info;
2309         flow_l4info_t   *l4info = &s->fs_l4info;
2310         uint8_t         proto = l3info->l3_protocol;
2311         uchar_t         *l4_start;
2312 
2313         l4_start = l3info->l3_start + l3info->l3_hdrsize;
2314 
2315         /*
2316          * Adjust start pointer if we're at the end of an mblk.
2317          */
2318         CHECK_AND_ADJUST_START_PTR(s, l4_start);
2319 
2320         l4info->l4_start = l4_start;
2321         if (!OK_32PTR(l4_start))
2322                 return (EINVAL);
2323 
2324         if (l3info->l3_fragmented == B_TRUE)
2325                 return (EINVAL);
2326 
2327         switch (proto) {
2328         case IPPROTO_TCP: {
2329                 struct tcphdr   *tcph = (struct tcphdr *)l4_start;
2330 
2331                 if (PKT_TOO_SMALL(s, l4_start + sizeof (*tcph)))
2332                         return (ENOBUFS);
2333 
2334                 l4info->l4_src_port = tcph->th_sport;
2335                 l4info->l4_dst_port = tcph->th_dport;
2336                 break;
2337         }
2338         case IPPROTO_UDP: {
2339                 struct udphdr   *udph = (struct udphdr *)l4_start;
2340 
2341                 if (PKT_TOO_SMALL(s, l4_start + sizeof (*udph)))
2342                         return (ENOBUFS);
2343 
2344                 l4info->l4_src_port = udph->uh_sport;
2345                 l4info->l4_dst_port = udph->uh_dport;
2346                 break;
2347         }
2348         case IPPROTO_SCTP: {
2349                 sctp_hdr_t      *sctph = (sctp_hdr_t *)l4_start;
2350 
2351                 if (PKT_TOO_SMALL(s, l4_start + sizeof (*sctph)))
2352                         return (ENOBUFS);
2353 
2354                 l4info->l4_src_port = sctph->sh_sport;
2355                 l4info->l4_dst_port = sctph->sh_dport;
2356                 break;
2357         }
2358         default:
2359                 return (EINVAL);
2360         }
2361 
2362         return (0);
2363 }
2364 
2365 /*
2366  * Validates transport flow entry.
2367  * The protocol field must be present.
2368  */
2369 
2370 /* ARGSUSED */
2371 static int
2372 flow_transport_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
2373 {
2374         flow_desc_t     *fd = &flent->fe_flow_desc;
2375         flow_mask_t     mask = fd->fd_mask;
2376 
2377         if ((mask & FLOW_IP_PROTOCOL) == 0)
2378                 return (EINVAL);
2379 
2380         switch (fd->fd_protocol) {
2381         case IPPROTO_TCP:
2382         case IPPROTO_UDP:
2383         case IPPROTO_SCTP:
2384                 break;
2385         default:
2386                 return (EINVAL);
2387         }
2388 
2389         switch (mask & ~FLOW_IP_PROTOCOL) {
2390         case FLOW_ULP_PORT_LOCAL:
2391                 if (fd->fd_local_port == 0)
2392                         return (EINVAL);
2393 
2394                 flent->fe_match = flow_transport_lport_match;
2395                 break;
2396         case FLOW_ULP_PORT_REMOTE:
2397                 if (fd->fd_remote_port == 0)
2398                         return (EINVAL);
2399 
2400                 flent->fe_match = flow_transport_rport_match;
2401                 break;
2402         case 0:
2403                 /*
2404                  * transport-only flows conflicts with our table type.
2405                  */
2406                 return (EOPNOTSUPP);
2407         default:
2408                 return (EINVAL);
2409         }
2410 
2411         return (0);
2412 }
2413 
2414 static uint32_t
2415 flow_transport_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
2416 {
2417         flow_desc_t     *fd = &flent->fe_flow_desc;
2418         uint16_t        port = 0;
2419 
2420         port = ((fd->fd_mask & FLOW_ULP_PORT_LOCAL) != 0) ?
2421             fd->fd_local_port : fd->fd_remote_port;
2422 
2423         return ((port ^ (fd->fd_protocol << 4)) % ft->ft_size);
2424 }
2425 
2426 /* ARGSUSED */
2427 static boolean_t
2428 flow_transport_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
2429 {
2430         flow_desc_t     *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
2431 
2432         if (fd1->fd_protocol != fd2->fd_protocol)
2433                 return (B_FALSE);
2434 
2435         if ((fd1->fd_mask & FLOW_ULP_PORT_LOCAL) != 0)
2436                 return (fd1->fd_local_port == fd2->fd_local_port);
2437 
2438         if ((fd1->fd_mask & FLOW_ULP_PORT_REMOTE) != 0)
2439                 return (fd1->fd_remote_port == fd2->fd_remote_port);
2440 
2441         return (B_TRUE);
2442 }
2443 
/*
 * Per-table-type operation vectors.  Each initializer lists, in order:
 * the flow-entry validate, hash and duplicate-match functions, the
 * entry insertion function, the packet hash function, and the chain of
 * packet accept (header parsing) functions applied in sequence.
 */
static flow_ops_t flow_l2_ops = {
	flow_l2_accept_fe,
	flow_l2_hash_fe,
	flow_l2_match_fe,
	flow_generic_insert_fe,
	flow_l2_hash,
	{flow_l2_accept}
};

static flow_ops_t flow_ip_ops = {
	flow_ip_accept_fe,
	flow_ip_hash_fe,
	flow_ip_match_fe,
	flow_ip_insert_fe,		/* ordered by prefix length */
	flow_ip_hash,
	{flow_l2_accept, flow_ip_accept}
};

static flow_ops_t flow_ip_proto_ops = {
	flow_ip_proto_accept_fe,
	flow_ip_proto_hash_fe,
	flow_ip_proto_match_fe,
	flow_generic_insert_fe,
	flow_ip_proto_hash,
	{flow_l2_accept, flow_ip_accept}
};

static flow_ops_t flow_transport_ops = {
	flow_transport_accept_fe,
	flow_transport_hash_fe,
	flow_transport_match_fe,
	flow_generic_insert_fe,
	flow_transport_hash,
	{flow_l2_accept, flow_ip_accept, flow_transport_accept}
};
2479 
/*
 * The set of supported flow table types: each entry pairs an ops
 * vector with the exact flow mask it serves (matched verbatim by
 * mac_flow_tab_info_get()) and the table size.
 */
static flow_tab_info_t flow_tab_info_list[] = {
	{&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_LOCAL, 2},
	{&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_REMOTE, 2},
	{&flow_ip_ops, FLOW_IP_DSFIELD, 1},
	{&flow_ip_proto_ops, FLOW_IP_PROTOCOL, 256},
	{&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_LOCAL, 1024},
	{&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_REMOTE, 1024}
};

/* Number of entries in flow_tab_info_list[]. */
#define FLOW_MAX_TAB_INFO \
	((sizeof (flow_tab_info_list)) / sizeof (flow_tab_info_t))
2491 
2492 static flow_tab_info_t *
2493 mac_flow_tab_info_get(flow_mask_t mask)
2494 {
2495         int     i;
2496 
2497         for (i = 0; i < FLOW_MAX_TAB_INFO; i++) {
2498                 if (mask == flow_tab_info_list[i].fti_mask)
2499                         return (&flow_tab_info_list[i]);
2500         }
2501         return (NULL);
2502 }