1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright 2019, Joyent, Inc.
  27  */
  28 
  29 #include <sys/strsun.h>
  30 #include <sys/sdt.h>
  31 #include <sys/mac.h>
  32 #include <sys/mac_impl.h>
  33 #include <sys/mac_client_impl.h>
  34 #include <sys/mac_stat.h>
  35 #include <sys/dls.h>
  36 #include <sys/dls_impl.h>
  37 #include <sys/mac_soft_ring.h>
  38 #include <sys/ethernet.h>
  39 #include <sys/cpupart.h>
  40 #include <sys/pool.h>
  41 #include <sys/pool_pset.h>
  42 #include <sys/vlan.h>
  43 #include <inet/ip.h>
  44 #include <inet/ip6.h>
  45 #include <netinet/tcp.h>
  46 #include <netinet/udp.h>
  47 #include <netinet/sctp.h>
  48 
  49 typedef struct flow_stats_s {
  50         uint64_t        fs_obytes;
  51         uint64_t        fs_opackets;
  52         uint64_t        fs_oerrors;
  53         uint64_t        fs_ibytes;
  54         uint64_t        fs_ipackets;
  55         uint64_t        fs_ierrors;
  56 } flow_stats_t;
  57 
  58 
  59 /* global flow table, will be a per exclusive-zone table later */
  60 static mod_hash_t       *flow_hash;
  61 static krwlock_t        flow_tab_lock;
  62 
  63 static kmem_cache_t     *flow_cache;
  64 static kmem_cache_t     *flow_tab_cache;
  65 static flow_ops_t       flow_l2_ops;
  66 
  67 typedef struct {
  68         const char      *fs_name;
  69         uint_t          fs_offset;
  70 } flow_stats_info_t;
  71 
  72 #define FS_OFF(f)       (offsetof(flow_stats_t, f))
  73 static flow_stats_info_t flow_stats_list[] = {
  74         {"rbytes",      FS_OFF(fs_ibytes)},
  75         {"ipackets",    FS_OFF(fs_ipackets)},
  76         {"ierrors",     FS_OFF(fs_ierrors)},
  77         {"obytes",      FS_OFF(fs_obytes)},
  78         {"opackets",    FS_OFF(fs_opackets)},
  79         {"oerrors",     FS_OFF(fs_oerrors)}
  80 };
  81 #define FS_SIZE         (sizeof (flow_stats_list) / sizeof (flow_stats_info_t))
  82 
/*
 * Returns the flow_tab_info_t describing a flow mask, or NULL if the
 * mask is not legal.
 */
  86 static flow_tab_info_t  *mac_flow_tab_info_get(flow_mask_t);
  87 
  88 static void
  89 flow_stat_init(kstat_named_t *knp)
  90 {
  91         int     i;
  92 
  93         for (i = 0; i < FS_SIZE; i++, knp++) {
  94                 kstat_named_init(knp, flow_stats_list[i].fs_name,
  95                     KSTAT_DATA_UINT64);
  96         }
  97 }
  98 
  99 static int
 100 flow_stat_update(kstat_t *ksp, int rw)
 101 {
 102         flow_entry_t            *fep = ksp->ks_private;
 103         kstat_named_t           *knp = ksp->ks_data;
 104         uint64_t                *statp;
 105         int                     i;
 106         mac_rx_stats_t          *mac_rx_stat;
 107         mac_tx_stats_t          *mac_tx_stat;
 108         flow_stats_t            flow_stats;
 109         mac_soft_ring_set_t     *mac_srs;
 110 
 111         if (rw != KSTAT_READ)
 112                 return (EACCES);
 113 
 114         bzero(&flow_stats, sizeof (flow_stats_t));
 115 
 116         for (i = 0; i < fep->fe_rx_srs_cnt; i++) {
 117                 mac_srs = (mac_soft_ring_set_t *)fep->fe_rx_srs[i];
 118                 if (mac_srs == NULL)            /* Multicast flow */
 119                         break;
 120                 mac_rx_stat = &mac_srs->srs_rx.sr_stat;
 121 
 122                 flow_stats.fs_ibytes += mac_rx_stat->mrs_intrbytes +
 123                     mac_rx_stat->mrs_pollbytes + mac_rx_stat->mrs_lclbytes;
 124 
 125                 flow_stats.fs_ipackets += mac_rx_stat->mrs_intrcnt +
 126                     mac_rx_stat->mrs_pollcnt + mac_rx_stat->mrs_lclcnt;
 127 
 128                 flow_stats.fs_ierrors += mac_rx_stat->mrs_ierrors;
 129         }
 130 
 131         mac_srs = (mac_soft_ring_set_t *)fep->fe_tx_srs;
 132         if (mac_srs == NULL)            /* Multicast flow */
 133                 goto done;
 134         mac_tx_stat = &mac_srs->srs_tx.st_stat;
 135 
 136         flow_stats.fs_obytes = mac_tx_stat->mts_obytes;
 137         flow_stats.fs_opackets = mac_tx_stat->mts_opackets;
 138         flow_stats.fs_oerrors = mac_tx_stat->mts_oerrors;
 139 
 140 done:
 141         for (i = 0; i < FS_SIZE; i++, knp++) {
 142                 statp = (uint64_t *)
 143                     ((uchar_t *)&flow_stats + flow_stats_list[i].fs_offset);
 144                 knp->value.ui64 = *statp;
 145         }
 146         return (0);
 147 }
 148 
 149 static void
 150 flow_stat_create(flow_entry_t *fep)
 151 {
 152         kstat_t         *ksp;
 153         kstat_named_t   *knp;
 154         uint_t          nstats = FS_SIZE;
 155 
        /*
         * For now, flow entries are only manipulated and visible from the
         * global zone.
         */
 160         ksp = kstat_create_zone("unix", 0, (char *)fep->fe_flow_name, "flow",
 161             KSTAT_TYPE_NAMED, nstats, 0, GLOBAL_ZONEID);
 162         if (ksp == NULL)
 163                 return;
 164 
 165         ksp->ks_update = flow_stat_update;
 166         ksp->ks_private = fep;
 167         fep->fe_ksp = ksp;
 168 
 169         knp = (kstat_named_t *)ksp->ks_data;
 170         flow_stat_init(knp);
 171         kstat_install(ksp);
 172 }
 173 
 174 void
 175 flow_stat_destroy(flow_entry_t *fep)
 176 {
 177         if (fep->fe_ksp != NULL) {
 178                 kstat_delete(fep->fe_ksp);
 179                 fep->fe_ksp = NULL;
 180         }
 181 }
 182 
 183 /*
 184  * Initialize the flow table
 185  */
 186 void
 187 mac_flow_init()
 188 {
 189         flow_cache = kmem_cache_create("flow_entry_cache",
 190             sizeof (flow_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 191         flow_tab_cache = kmem_cache_create("flow_tab_cache",
 192             sizeof (flow_tab_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 193         flow_hash = mod_hash_create_extended("flow_hash",
 194             100, mod_hash_null_keydtor, mod_hash_null_valdtor,
 195             mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
 196         rw_init(&flow_tab_lock, NULL, RW_DEFAULT, NULL);
 197 }
 198 
 199 /*
 200  * Cleanup and release the flow table
 201  */
 202 void
 203 mac_flow_fini()
 204 {
 205         kmem_cache_destroy(flow_cache);
 206         kmem_cache_destroy(flow_tab_cache);
 207         mod_hash_destroy_hash(flow_hash);
 208         rw_destroy(&flow_tab_lock);
 209 }
 210 
/*
 * mac_flow_create(): create a flow_entry_t.
 */
 214 int
 215 mac_flow_create(flow_desc_t *fd, mac_resource_props_t *mrp, char *name,
 216     void *client_cookie, uint_t type, flow_entry_t **flentp)
 217 {
 218         flow_entry_t            *flent = *flentp;
 219         int                     err = 0;
 220 
 221         if (mrp != NULL) {
 222                 err = mac_validate_props(NULL, mrp);
 223                 if (err != 0)
 224                         return (err);
 225         }
 226 
 227         if (flent == NULL) {
 228                 flent = kmem_cache_alloc(flow_cache, KM_SLEEP);
 229                 bzero(flent, sizeof (*flent));
 230                 mutex_init(&flent->fe_lock, NULL, MUTEX_DEFAULT, NULL);
 231                 cv_init(&flent->fe_cv, NULL, CV_DEFAULT, NULL);
 232 
 233                 /* Initialize the receiver function to a safe routine */
 234                 flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop;
 235                 flent->fe_index = -1;
 236         }
 237         (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
 238 
 239         /* This is an initial flow, will be configured later */
 240         if (fd == NULL) {
 241                 *flentp = flent;
 242                 return (0);
 243         }
 244 
 245         flent->fe_client_cookie = client_cookie;
 246         flent->fe_type = type;
 247 
 248         /* Save flow desc */
 249         bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
 250 
 251         if (mrp != NULL) {
 252                 /*
 253                  * We have already set fe_resource_props for a Link.
 254                  */
 255                 if (type & FLOW_USER) {
 256                         bcopy(mrp, &flent->fe_resource_props,
 257                             sizeof (mac_resource_props_t));
 258                 }
 259                 /*
 260                  * The effective resource list should reflect the priority
 261                  * that we set implicitly.
 262                  */
 263                 if (!(mrp->mrp_mask & MRP_PRIORITY))
 264                         mrp->mrp_mask |= MRP_PRIORITY;
 265                 if (type & FLOW_USER)
 266                         mrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
 267                 else
 268                         mrp->mrp_priority = MPL_LINK_DEFAULT;
 269                 bzero(mrp->mrp_pool, MAXPATHLEN);
 270                 bzero(&mrp->mrp_cpus, sizeof (mac_cpus_t));
 271                 bcopy(mrp, &flent->fe_effective_props,
 272                     sizeof (mac_resource_props_t));
 273         }
 274         flow_stat_create(flent);
 275 
 276         *flentp = flent;
 277         return (0);
 278 }
 279 
 280 /*
 281  * Validate flow entry and add it to a flow table.
 282  */
 283 int
 284 mac_flow_add(flow_tab_t *ft, flow_entry_t *flent)
 285 {
 286         flow_entry_t    **headp, **p;
 287         flow_ops_t      *ops = &ft->ft_ops;
 288         flow_mask_t     mask;
 289         uint32_t        index;
 290         int             err;
 291 
 292         ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
 293 
 294         /*
 295          * Check for invalid bits in mask.
 296          */
 297         mask = flent->fe_flow_desc.fd_mask;
 298         if ((mask & ft->ft_mask) == 0 || (mask & ~ft->ft_mask) != 0)
 299                 return (EOPNOTSUPP);
 300 
 301         /*
 302          * Validate flent.
 303          */
 304         if ((err = ops->fo_accept_fe(ft, flent)) != 0) {
 305                 DTRACE_PROBE3(accept_failed, flow_tab_t *, ft,
 306                     flow_entry_t *, flent, int, err);
 307                 return (err);
 308         }
 309 
        /*
         * The flent is valid. Now calculate its hash and insert it
         * into the hash table.
         */
 314         index = ops->fo_hash_fe(ft, flent);
 315 
        /*
         * We did not need a lock until now because we were
         * not accessing the flow table.
         */
 320         rw_enter(&ft->ft_lock, RW_WRITER);
 321         headp = &ft->ft_table[index];
 322 
 323         /*
 324          * Check for duplicate flow.
 325          */
 326         for (p = headp; *p != NULL; p = &(*p)->fe_next) {
 327                 if ((*p)->fe_flow_desc.fd_mask !=
 328                     flent->fe_flow_desc.fd_mask)
 329                         continue;
 330 
 331                 if (ft->ft_ops.fo_match_fe(ft, *p, flent)) {
 332                         rw_exit(&ft->ft_lock);
 333                         DTRACE_PROBE3(dup_flow, flow_tab_t *, ft,
 334                             flow_entry_t *, flent, int, err);
 335                         return (EALREADY);
 336                 }
 337         }
 338 
 339         /*
 340          * Insert flow to hash list.
 341          */
 342         err = ops->fo_insert_fe(ft, headp, flent);
 343         if (err != 0) {
 344                 rw_exit(&ft->ft_lock);
 345                 DTRACE_PROBE3(insert_failed, flow_tab_t *, ft,
 346                     flow_entry_t *, flent, int, err);
 347                 return (err);
 348         }
 349 
 350         /*
 351          * Save the hash index so it can be used by mac_flow_remove().
 352          */
 353         flent->fe_index = (int)index;
 354 
 355         /*
 356          * Save the flow tab back reference.
 357          */
 358         flent->fe_flow_tab = ft;
 359         FLOW_MARK(flent, FE_FLOW_TAB);
 360         ft->ft_flow_count++;
 361         rw_exit(&ft->ft_lock);
 362         return (0);
 363 }
 364 
 365 /*
 366  * Remove a flow from a mac client's subflow table
 367  */
 368 void
 369 mac_flow_rem_subflow(flow_entry_t *flent)
 370 {
 371         flow_tab_t              *ft = flent->fe_flow_tab;
 372         mac_client_impl_t       *mcip = ft->ft_mcip;
 373         mac_handle_t            mh = (mac_handle_t)ft->ft_mip;
 374 
 375         ASSERT(MAC_PERIM_HELD(mh));
 376 
 377         mac_flow_remove(ft, flent, B_FALSE);
 378         if (flent->fe_mcip == NULL) {
 379                 /*
 380                  * The interface is not yet plumbed and mac_client_flow_add
 381                  * was not done.
 382                  */
 383                 if (FLOW_TAB_EMPTY(ft)) {
 384                         mac_flow_tab_destroy(ft);
 385                         mcip->mci_subflow_tab = NULL;
 386                 }
 387         } else {
 388                 mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
 389                 mac_link_flow_clean((mac_client_handle_t)mcip, flent);
 390         }
 391         mac_fastpath_enable(mh);
 392 }
 393 
 394 /*
 395  * Add a flow to a mac client's subflow table and instantiate the flow
 396  * in the mac by creating the associated SRSs etc.
 397  */
 398 int
 399 mac_flow_add_subflow(mac_client_handle_t mch, flow_entry_t *flent,
 400     boolean_t instantiate_flow)
 401 {
 402         mac_client_impl_t       *mcip = (mac_client_impl_t *)mch;
 403         mac_handle_t            mh = (mac_handle_t)mcip->mci_mip;
 404         flow_tab_info_t         *ftinfo;
 405         flow_mask_t             mask;
 406         flow_tab_t              *ft;
 407         int                     err;
 408         boolean_t               ft_created = B_FALSE;
 409 
 410         ASSERT(MAC_PERIM_HELD(mh));
 411 
 412         if ((err = mac_fastpath_disable(mh)) != 0)
 413                 return (err);
 414 
        /*
         * If the subflow table already exists, just add the new subflow
         * to it; otherwise create a new subflow table below.
         */
 419         ft = mcip->mci_subflow_tab;
 420         if (ft == NULL) {
 421                 mask = flent->fe_flow_desc.fd_mask;
 422                 /*
 423                  * Try to create a new table and then add the subflow to the
 424                  * newly created subflow table
 425                  */
 426                 if ((ftinfo = mac_flow_tab_info_get(mask)) == NULL) {
 427                         mac_fastpath_enable(mh);
 428                         return (EOPNOTSUPP);
 429                 }
 430 
 431                 mac_flow_tab_create(ftinfo->fti_ops, mask, ftinfo->fti_size,
 432                     mcip->mci_mip, &ft);
 433                 ft_created = B_TRUE;
 434         }
 435 
 436         err = mac_flow_add(ft, flent);
 437         if (err != 0) {
 438                 if (ft_created)
 439                         mac_flow_tab_destroy(ft);
 440                 mac_fastpath_enable(mh);
 441                 return (err);
 442         }
 443 
 444         if (instantiate_flow) {
 445                 /* Now activate the flow by creating its SRSs */
 446                 ASSERT(MCIP_DATAPATH_SETUP(mcip));
 447                 err = mac_link_flow_init((mac_client_handle_t)mcip, flent);
 448                 if (err != 0) {
 449                         mac_flow_remove(ft, flent, B_FALSE);
 450                         if (ft_created)
 451                                 mac_flow_tab_destroy(ft);
 452                         mac_fastpath_enable(mh);
 453                         return (err);
 454                 }
 455         } else {
 456                 FLOW_MARK(flent, FE_UF_NO_DATAPATH);
 457         }
 458         if (ft_created) {
 459                 ASSERT(mcip->mci_subflow_tab == NULL);
 460                 ft->ft_mcip = mcip;
 461                 mcip->mci_subflow_tab = ft;
 462                 if (instantiate_flow)
 463                         mac_client_update_classifier(mcip, B_TRUE);
 464         }
 465         return (0);
 466 }
 467 
 468 /*
 469  * Remove flow entry from flow table.
 470  */
 471 void
 472 mac_flow_remove(flow_tab_t *ft, flow_entry_t *flent, boolean_t temp)
 473 {
 474         flow_entry_t    **fp;
 475 
 476         ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
 477         if (!(flent->fe_flags & FE_FLOW_TAB))
 478                 return;
 479 
 480         rw_enter(&ft->ft_lock, RW_WRITER);
        /*
         * If this is a permanent removal from the flow table, mark it
         * CONDEMNED to prevent future references. If this is a temporary
         * removal from the table, for example to update the flow descriptor,
         * we don't mark it CONDEMNED.
         */
 487         if (!temp)
 488                 FLOW_MARK(flent, FE_CONDEMNED);
 489         /*
 490          * Locate the specified flent.
 491          */
 492         fp = &ft->ft_table[flent->fe_index];
 493         while (*fp != flent)
 494                 fp = &(*fp)->fe_next;
 495 
 496         /*
 497          * The flent must exist. Otherwise it's a bug.
 498          */
 499         ASSERT(fp != NULL);
 500         *fp = flent->fe_next;
 501         flent->fe_next = NULL;
 502 
 503         /*
 504          * Reset fe_index to -1 so any attempt to call mac_flow_remove()
 505          * on a flent that is supposed to be in the table (FE_FLOW_TAB)
 506          * will panic.
 507          */
 508         flent->fe_index = -1;
 509         FLOW_UNMARK(flent, FE_FLOW_TAB);
 510         ft->ft_flow_count--;
 511         rw_exit(&ft->ft_lock);
 512 }
 513 
 514 /*
 515  * This is the flow lookup routine used by the mac sw classifier engine.
 516  */
 517 int
 518 mac_flow_lookup(flow_tab_t *ft, mblk_t *mp, uint_t flags, flow_entry_t **flentp)
 519 {
 520         flow_state_t    s;
 521         flow_entry_t    *flent;
 522         flow_ops_t      *ops = &ft->ft_ops;
 523         boolean_t       retried = B_FALSE;
 524         int             i, err;
 525 
 526         s.fs_flags = flags;
 527 retry:
 528         s.fs_mp = mp;
 529 
        /*
         * Walk the list of predeclared accept functions.
         * Each of these accumulates enough state to allow the next
         * accept routine to make progress.
         */
 535         for (i = 0; i < FLOW_MAX_ACCEPT && ops->fo_accept[i] != NULL; i++) {
 536                 if ((err = (ops->fo_accept[i])(ft, &s)) != 0) {
 537                         mblk_t  *last;
 538 
 539                         /*
 540                          * ENOBUFS indicates that the mp could be too short
 541                          * and may need a pullup.
 542                          */
 543                         if (err != ENOBUFS || retried)
 544                                 return (err);
 545 
 546                         /*
 547                          * The pullup is done on the last processed mblk, not
 548                          * the starting one. pullup is not done if the mblk
 549                          * has references or if b_cont is NULL.
 550                          */
 551                         last = s.fs_mp;
 552                         if (DB_REF(last) > 1 || last->b_cont == NULL ||
 553                             pullupmsg(last, -1) == 0)
 554                                 return (EINVAL);
 555 
 556                         retried = B_TRUE;
 557                         DTRACE_PROBE2(need_pullup, flow_tab_t *, ft,
 558                             flow_state_t *, &s);
 559                         goto retry;
 560                 }
 561         }
 562 
 563         /*
 564          * The packet is considered sane. We may now attempt to
 565          * find the corresponding flent.
 566          */
 567         rw_enter(&ft->ft_lock, RW_READER);
 568         flent = ft->ft_table[ops->fo_hash(ft, &s)];
 569         for (; flent != NULL; flent = flent->fe_next) {
 570                 if (flent->fe_match(ft, flent, &s)) {
 571                         FLOW_TRY_REFHOLD(flent, err);
 572                         if (err != 0)
 573                                 continue;
 574                         *flentp = flent;
 575                         rw_exit(&ft->ft_lock);
 576                         return (0);
 577                 }
 578         }
 579         rw_exit(&ft->ft_lock);
 580         return (ENOENT);
 581 }
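
/*
 * Illustrative sketch only (not part of the build): roughly how a caller
 * such as the software rx classifier might consult a flow table for an
 * inbound packet. FLOW_INBOUND and FLOW_REFRELE() are assumed to come
 * from the flow implementation header; the delivery step and error
 * handling are elided, and the function name is hypothetical.
 */
static void
flow_lookup_example(flow_tab_t *ft, mblk_t *mp)
{
        flow_entry_t    *flent;

        if (mac_flow_lookup(ft, mp, FLOW_INBOUND, &flent) != 0)
                return;         /* no matching flow; take the default path */

        /* ... hand the packet to the flow's callback or SRS here ... */

        FLOW_REFRELE(flent);    /* drop the hold taken by mac_flow_lookup() */
}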
 582 
 583 /*
 584  * Walk flow table.
 585  * The caller is assumed to have proper perimeter protection.
 586  */
 587 int
 588 mac_flow_walk_nolock(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
 589     void *arg)
 590 {
 591         int             err, i, cnt = 0;
 592         flow_entry_t    *flent;
 593 
 594         if (ft == NULL)
 595                 return (0);
 596 
 597         for (i = 0; i < ft->ft_size; i++) {
 598                 for (flent = ft->ft_table[i]; flent != NULL;
 599                     flent = flent->fe_next) {
 600                         cnt++;
 601                         err = (*fn)(flent, arg);
 602                         if (err != 0)
 603                                 return (err);
 604                 }
 605         }
 606         VERIFY(cnt == ft->ft_flow_count);
 607         return (0);
 608 }
 609 
/*
 * Same as the above, except that the flow table's rwlock is taken as
 * a writer for protection here.
 */
 613 int
 614 mac_flow_walk(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
 615     void *arg)
 616 {
 617         int             err;
 618 
 619         if (ft == NULL)
 620                 return (0);
 621 
 622         rw_enter(&ft->ft_lock, RW_WRITER);
 623         err = mac_flow_walk_nolock(ft, fn, arg);
 624         rw_exit(&ft->ft_lock);
 625         return (err);
 626 }
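
/*
 * Illustrative sketch only (not part of the build): a trivial walker
 * callback that counts the entries in a flow table, e.g.
 *
 *      uint_t count = 0;
 *      (void) mac_flow_walk(mcip->mci_subflow_tab, flow_count_cb, &count);
 *
 * The callback name is hypothetical; returning non-zero would abort the
 * walk and be passed back to the caller.
 */
/* ARGSUSED */
static int
flow_count_cb(flow_entry_t *flent, void *arg)
{
        uint_t  *countp = arg;

        (*countp)++;
        return (0);
}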
 627 
 628 static boolean_t        mac_flow_clean(flow_entry_t *);
 629 
 630 /*
 631  * Destroy a flow entry. Called when the last reference on a flow is released.
 632  */
 633 void
 634 mac_flow_destroy(flow_entry_t *flent)
 635 {
 636         ASSERT(flent->fe_refcnt == 0);
 637 
 638         if ((flent->fe_type & FLOW_USER) != 0) {
 639                 ASSERT(mac_flow_clean(flent));
 640         } else {
 641                 mac_flow_cleanup(flent);
 642         }
 643         mac_misc_stat_delete(flent);
 644         mutex_destroy(&flent->fe_lock);
 645         cv_destroy(&flent->fe_cv);
 646         flow_stat_destroy(flent);
 647         kmem_cache_free(flow_cache, flent);
 648 }
 649 
 650 /*
 651  * XXX eric
 652  * The MAC_FLOW_PRIORITY checks in mac_resource_ctl_set() and
 653  * mac_link_flow_modify() should really be moved/reworked into the
 654  * two functions below. This would consolidate all the mac property
 655  * checking in one place. I'm leaving this alone for now since it's
 656  * out of scope of the new flows work.
 657  */
 658 /* ARGSUSED */
 659 uint32_t
 660 mac_flow_modify_props(flow_entry_t *flent, mac_resource_props_t *mrp)
 661 {
 662         uint32_t                changed_mask = 0;
 663         mac_resource_props_t    *fmrp = &flent->fe_effective_props;
 664         int                     i;
 665 
 666         if ((mrp->mrp_mask & MRP_MAXBW) != 0 &&
 667             (!(fmrp->mrp_mask & MRP_MAXBW) ||
 668             (fmrp->mrp_maxbw != mrp->mrp_maxbw))) {
 669                 changed_mask |= MRP_MAXBW;
 670                 if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) {
 671                         fmrp->mrp_mask &= ~MRP_MAXBW;
 672                         fmrp->mrp_maxbw = 0;
 673                 } else {
 674                         fmrp->mrp_mask |= MRP_MAXBW;
 675                         fmrp->mrp_maxbw = mrp->mrp_maxbw;
 676                 }
 677         }
 678 
 679         if ((mrp->mrp_mask & MRP_PRIORITY) != 0) {
 680                 if (fmrp->mrp_priority != mrp->mrp_priority)
 681                         changed_mask |= MRP_PRIORITY;
 682                 if (mrp->mrp_priority == MPL_RESET) {
 683                         fmrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
 684                         fmrp->mrp_mask &= ~MRP_PRIORITY;
 685                 } else {
 686                         fmrp->mrp_priority = mrp->mrp_priority;
 687                         fmrp->mrp_mask |= MRP_PRIORITY;
 688                 }
 689         }
 690 
 691         /* modify fanout */
 692         if ((mrp->mrp_mask & MRP_CPUS) != 0) {
 693                 if ((fmrp->mrp_ncpus == mrp->mrp_ncpus) &&
 694                     (fmrp->mrp_fanout_mode == mrp->mrp_fanout_mode)) {
 695                         for (i = 0; i < mrp->mrp_ncpus; i++) {
 696                                 if (mrp->mrp_cpu[i] != fmrp->mrp_cpu[i])
 697                                         break;
 698                         }
 699                         if (i == mrp->mrp_ncpus) {
 700                                 /*
 701                                  * The new set of cpus passed is exactly
 702                                  * the same as the existing set.
 703                                  */
 704                                 return (changed_mask);
 705                         }
 706                 }
 707                 changed_mask |= MRP_CPUS;
 708                 MAC_COPY_CPUS(mrp, fmrp);
 709         }
 710 
 711         /*
 712          * Modify the rings property.
 713          */
 714         if (mrp->mrp_mask & MRP_RX_RINGS || mrp->mrp_mask & MRP_TX_RINGS)
 715                 mac_set_rings_effective(flent->fe_mcip);
 716 
 717         if ((mrp->mrp_mask & MRP_POOL) != 0) {
 718                 if (strcmp(fmrp->mrp_pool, mrp->mrp_pool) != 0)
 719                         changed_mask |= MRP_POOL;
 720                 if (strlen(mrp->mrp_pool) == 0)
 721                         fmrp->mrp_mask &= ~MRP_POOL;
 722                 else
 723                         fmrp->mrp_mask |= MRP_POOL;
 724                 (void) strncpy(fmrp->mrp_pool, mrp->mrp_pool, MAXPATHLEN);
 725         }
 726         return (changed_mask);
 727 }
 728 
 729 void
 730 mac_flow_modify(flow_tab_t *ft, flow_entry_t *flent, mac_resource_props_t *mrp)
 731 {
 732         uint32_t changed_mask;
 733         mac_client_impl_t *mcip = flent->fe_mcip;
 734         mac_resource_props_t *mcip_mrp = MCIP_RESOURCE_PROPS(mcip);
 735         mac_resource_props_t *emrp = MCIP_EFFECTIVE_PROPS(mcip);
 736         cpupart_t *cpupart = NULL;
 737         boolean_t use_default = B_FALSE;
 738 
 739         ASSERT(flent != NULL);
 740         ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
 741 
 742         rw_enter(&ft->ft_lock, RW_WRITER);
 743 
 744         /* Update the cached values inside the subflow entry */
 745         changed_mask = mac_flow_modify_props(flent, mrp);
 746         rw_exit(&ft->ft_lock);
 747         /*
 748          * Push the changed parameters to the scheduling code in the
 749          * SRS's, to take effect right away.
 750          */
 751         if (changed_mask & MRP_MAXBW) {
 752                 mac_srs_update_bwlimit(flent, mrp);
                /*
                 * If the bandwidth is changed, we may have to change
                 * the number of soft rings to be used for fanout.
                 * Call mac_fanout_setup() if MAC_BIND_CPU is not set
                 * and there is no user-supplied cpu info. This applies
                 * only to links at this time.
                 */
 760                 if (!(flent->fe_type & FLOW_USER) &&
 761                     !(changed_mask & MRP_CPUS) &&
 762                     !(mcip_mrp->mrp_mask & MRP_CPUS_USERSPEC)) {
 763                         mac_fanout_setup(mcip, flent, mcip_mrp,
 764                             mac_rx_deliver, mcip, NULL, NULL);
 765                 }
 766         }
 767         if (mrp->mrp_mask & MRP_PRIORITY)
 768                 mac_flow_update_priority(mcip, flent);
 769 
 770         if (changed_mask & MRP_CPUS)
 771                 mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL,
 772                     NULL);
 773 
 774         if (mrp->mrp_mask & MRP_POOL) {
 775                 pool_lock();
 776                 cpupart = mac_pset_find(mrp, &use_default);
 777                 mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL,
 778                     cpupart);
 779                 mac_set_pool_effective(use_default, cpupart, mrp, emrp);
 780                 pool_unlock();
 781         }
 782 }
 783 
 784 /*
 785  * This function waits for a certain condition to be met and is generally
 786  * used before a destructive or quiescing operation.
 787  */
 788 void
 789 mac_flow_wait(flow_entry_t *flent, mac_flow_state_t event)
 790 {
 791         mutex_enter(&flent->fe_lock);
 792         flent->fe_flags |= FE_WAITER;
 793 
 794         switch (event) {
 795         case FLOW_DRIVER_UPCALL:
 796                 /*
 797                  * We want to make sure the driver upcalls have finished before
 798                  * we signal the Rx SRS worker to quit.
 799                  */
 800                 while (flent->fe_refcnt != 1)
 801                         cv_wait(&flent->fe_cv, &flent->fe_lock);
 802                 break;
 803 
 804         case FLOW_USER_REF:
 805                 /*
 806                  * Wait for the fe_user_refcnt to drop to 0. The flow has
 807                  * been removed from the global flow hash.
 808                  */
 809                 ASSERT(!(flent->fe_flags & FE_G_FLOW_HASH));
 810                 while (flent->fe_user_refcnt != 0)
 811                         cv_wait(&flent->fe_cv, &flent->fe_lock);
 812                 break;
 813 
 814         default:
 815                 ASSERT(0);
 816         }
 817 
 818         flent->fe_flags &= ~FE_WAITER;
 819         mutex_exit(&flent->fe_lock);
 820 }
 821 
 822 static boolean_t
 823 mac_flow_clean(flow_entry_t *flent)
 824 {
 825         ASSERT(flent->fe_next == NULL);
 826         ASSERT(flent->fe_tx_srs == NULL);
 827         ASSERT(flent->fe_rx_srs_cnt == 0 && flent->fe_rx_srs[0] == NULL);
 828         ASSERT(flent->fe_mbg == NULL);
 829 
 830         return (B_TRUE);
 831 }
 832 
 833 void
 834 mac_flow_cleanup(flow_entry_t *flent)
 835 {
 836         if ((flent->fe_type & FLOW_USER) == 0) {
 837                 ASSERT((flent->fe_mbg == NULL && flent->fe_mcip != NULL) ||
 838                     (flent->fe_mbg != NULL && flent->fe_mcip == NULL));
 839                 ASSERT(flent->fe_refcnt == 0);
 840         } else {
 841                 ASSERT(flent->fe_refcnt == 1);
 842         }
 843 
 844         if (flent->fe_mbg != NULL) {
 845                 ASSERT(flent->fe_tx_srs == NULL);
 846                 /* This is a multicast or broadcast flow entry */
 847                 mac_bcast_grp_free(flent->fe_mbg);
 848                 flent->fe_mbg = NULL;
 849         }
 850 
 851         if (flent->fe_tx_srs != NULL) {
 852                 ASSERT(flent->fe_mbg == NULL);
 853                 mac_srs_free(flent->fe_tx_srs);
 854                 flent->fe_tx_srs = NULL;
 855         }
 856 
        /*
         * In the normal case fe_rx_srs_cnt is 1. However, in the error case
         * when mac_unicast_add() fails, we may not have set up any SRS,
         * in which case fe_rx_srs_cnt will be zero.
         */
 862         if (flent->fe_rx_srs_cnt != 0) {
 863                 ASSERT(flent->fe_rx_srs_cnt == 1);
 864                 mac_srs_free(flent->fe_rx_srs[0]);
 865                 flent->fe_rx_srs[0] = NULL;
 866                 flent->fe_rx_srs_cnt = 0;
 867         }
 868         ASSERT(flent->fe_rx_srs[0] == NULL);
 869 }
 870 
 871 void
 872 mac_flow_get_desc(flow_entry_t *flent, flow_desc_t *fd)
 873 {
 874         /*
 875          * Grab the fe_lock to see a self-consistent fe_flow_desc.
 876          * Updates to the fe_flow_desc happen under the fe_lock
 877          * after removing the flent from the flow table
 878          */
 879         mutex_enter(&flent->fe_lock);
 880         bcopy(&flent->fe_flow_desc, fd, sizeof (*fd));
 881         mutex_exit(&flent->fe_lock);
 882 }
 883 
 884 /*
 885  * Update a field of a flow entry. The mac perimeter ensures that
 886  * this is the only thread doing a modify operation on this mac end point.
 887  * So the flow table can't change or disappear. The ft_lock protects access
 888  * to the flow entry, and holding the lock ensures that there isn't any thread
 889  * accessing the flow entry or attempting a flow table lookup. However
 890  * data threads that are using the flow entry based on the old descriptor
 891  * will continue to use the flow entry. If strong coherence is required
 892  * then the flow will have to be quiesced before the descriptor can be
 893  * changed.
 894  */
 895 void
 896 mac_flow_set_desc(flow_entry_t *flent, flow_desc_t *fd)
 897 {
 898         flow_tab_t      *ft = flent->fe_flow_tab;
 899         flow_desc_t     old_desc;
 900         int             err;
 901 
        if (ft == NULL) {
                /*
                 * The flow hasn't yet been inserted into the table,
                 * so only the caller knows about this flow; however, for
                 * uniformity we grab the fe_lock here.
                 */
                mutex_enter(&flent->fe_lock);
                bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
                mutex_exit(&flent->fe_lock);
                return;
        }
 912 
 913         ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
 914 
        /*
         * Need to remove the flow entry from the table and reinsert it
         * into a potentially different hash bucket. The hash depends on
         * the new descriptor fields. However, access to fe_flow_desc itself
         * is always under the fe_lock. This helps log and stat functions
         * see a self-consistent fe_flow_desc.
         */
 922         mac_flow_remove(ft, flent, B_TRUE);
 923         old_desc = flent->fe_flow_desc;
 924 
 925         mutex_enter(&flent->fe_lock);
 926         bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
 927         mutex_exit(&flent->fe_lock);
 928 
 929         if (mac_flow_add(ft, flent) != 0) {
                /*
                 * The add failed, for example due to an invalid flow
                 * descriptor. Undo the update.
                 */
 934                 flent->fe_flow_desc = old_desc;
 935                 err = mac_flow_add(ft, flent);
 936                 ASSERT(err == 0);
 937         }
 938 }
 939 
 940 void
 941 mac_flow_set_name(flow_entry_t *flent, const char *name)
 942 {
 943         flow_tab_t      *ft = flent->fe_flow_tab;
 944 
 945         if (ft == NULL) {
                /*
                 * The flow hasn't yet been inserted into the table,
                 * so only the caller knows about this flow.
                 */
 950                 (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
 951         } else {
 952                 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
 953         }
 954 
 955         mutex_enter(&flent->fe_lock);
 956         (void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
 957         mutex_exit(&flent->fe_lock);
 958 }
 959 
 960 /*
 961  * Return the client-private cookie that was associated with
 962  * the flow when it was created.
 963  */
 964 void *
 965 mac_flow_get_client_cookie(flow_entry_t *flent)
 966 {
 967         return (flent->fe_client_cookie);
 968 }
 969 
 970 /*
 971  * Forward declarations.
 972  */
 973 static uint32_t flow_l2_hash(flow_tab_t *, flow_state_t *);
 974 static uint32_t flow_l2_hash_fe(flow_tab_t *, flow_entry_t *);
 975 static int      flow_l2_accept(flow_tab_t *, flow_state_t *);
 976 static uint32_t flow_ether_hash(flow_tab_t *, flow_state_t *);
 977 static uint32_t flow_ether_hash_fe(flow_tab_t *, flow_entry_t *);
 978 static int      flow_ether_accept(flow_tab_t *, flow_state_t *);
 979 
 980 /*
 981  * Create flow table.
 982  */
 983 void
 984 mac_flow_tab_create(flow_ops_t *ops, flow_mask_t mask, uint_t size,
 985     mac_impl_t *mip, flow_tab_t **ftp)
 986 {
 987         flow_tab_t      *ft;
 988         flow_ops_t      *new_ops;
 989 
 990         ft = kmem_cache_alloc(flow_tab_cache, KM_SLEEP);
 991         bzero(ft, sizeof (*ft));
 992 
 993         ft->ft_table = kmem_zalloc(size * sizeof (flow_entry_t *), KM_SLEEP);
 994 
 995         /*
 996          * We make a copy of the ops vector instead of just pointing to it
 997          * because we might want to customize the ops vector on a per table
 998          * basis (e.g. for optimization).
 999          */
1000         new_ops = &ft->ft_ops;
1001         bcopy(ops, new_ops, sizeof (*ops));
1002         ft->ft_mask = mask;
1003         ft->ft_size = size;
1004         ft->ft_mip = mip;
1005 
1006         /*
1007          * Optimizations for DL_ETHER media.
1008          */
1009         if (mip->mi_info.mi_nativemedia == DL_ETHER) {
1010                 if (new_ops->fo_hash == flow_l2_hash)
1011                         new_ops->fo_hash = flow_ether_hash;
1012                 if (new_ops->fo_hash_fe == flow_l2_hash_fe)
1013                         new_ops->fo_hash_fe = flow_ether_hash_fe;
1014                 if (new_ops->fo_accept[0] == flow_l2_accept)
1015                         new_ops->fo_accept[0] = flow_ether_accept;
1016         }
1017         *ftp = ft;
1018 }
1019 
1020 void
1021 mac_flow_l2tab_create(mac_impl_t *mip, flow_tab_t **ftp)
1022 {
1023         mac_flow_tab_create(&flow_l2_ops, FLOW_LINK_DST | FLOW_LINK_VID,
1024             1024, mip, ftp);
1025 }
1026 
1027 /*
1028  * Destroy flow table.
1029  */
1030 void
1031 mac_flow_tab_destroy(flow_tab_t *ft)
1032 {
1033         if (ft == NULL)
1034                 return;
1035 
1036         ASSERT(ft->ft_flow_count == 0);
1037         kmem_free(ft->ft_table, ft->ft_size * sizeof (flow_entry_t *));
1038         bzero(ft, sizeof (*ft));
1039         kmem_cache_free(flow_tab_cache, ft);
1040 }
1041 
1042 /*
1043  * Add a new flow entry to the global flow hash table
1044  */
1045 int
1046 mac_flow_hash_add(flow_entry_t *flent)
1047 {
1048         int     err;
1049 
1050         rw_enter(&flow_tab_lock, RW_WRITER);
1051         err = mod_hash_insert(flow_hash,
1052             (mod_hash_key_t)flent->fe_flow_name, (mod_hash_val_t)flent);
1053         if (err != 0) {
1054                 rw_exit(&flow_tab_lock);
1055                 return (EEXIST);
1056         }
1057         /* Mark as inserted into the global flow hash table */
1058         FLOW_MARK(flent, FE_G_FLOW_HASH);
1059         rw_exit(&flow_tab_lock);
1060         return (err);
1061 }
1062 
1063 /*
1064  * Remove a flow entry from the global flow hash table
1065  */
1066 void
1067 mac_flow_hash_remove(flow_entry_t *flent)
1068 {
1069         mod_hash_val_t  val;
1070 
1071         rw_enter(&flow_tab_lock, RW_WRITER);
1072         VERIFY(mod_hash_remove(flow_hash,
1073             (mod_hash_key_t)flent->fe_flow_name, &val) == 0);
1074 
1075         /* Clear the mark that says inserted into the global flow hash table */
1076         FLOW_UNMARK(flent, FE_G_FLOW_HASH);
1077         rw_exit(&flow_tab_lock);
1078 }
1079 
1080 /*
1081  * Retrieve a flow entry from the global flow hash table.
1082  */
1083 int
1084 mac_flow_lookup_byname(char *name, flow_entry_t **flentp)
1085 {
1086         int             err;
1087         flow_entry_t    *flent;
1088 
1089         rw_enter(&flow_tab_lock, RW_READER);
1090         err = mod_hash_find(flow_hash, (mod_hash_key_t)name,
1091             (mod_hash_val_t *)&flent);
1092         if (err != 0) {
1093                 rw_exit(&flow_tab_lock);
1094                 return (ENOENT);
1095         }
1096         ASSERT(flent != NULL);
1097         FLOW_USER_REFHOLD(flent);
1098         rw_exit(&flow_tab_lock);
1099 
1100         *flentp = flent;
1101         return (0);
1102 }
1103 
1104 /*
1105  * Initialize or release mac client flows by walking the subflow table.
1106  * These are typically invoked during plumb/unplumb of links.
1107  */
1108 
1109 static int
1110 mac_link_init_flows_cb(flow_entry_t *flent, void *arg)
1111 {
1112         mac_client_impl_t       *mcip = arg;
1113 
1114         if (mac_link_flow_init(arg, flent) != 0) {
1115                 cmn_err(CE_WARN, "Failed to initialize flow '%s' on link '%s'",
1116                     flent->fe_flow_name, mcip->mci_name);
1117         } else {
1118                 FLOW_UNMARK(flent, FE_UF_NO_DATAPATH);
1119         }
1120         return (0);
1121 }
1122 
1123 void
1124 mac_link_init_flows(mac_client_handle_t mch)
1125 {
1126         mac_client_impl_t       *mcip = (mac_client_impl_t *)mch;
1127 
1128         (void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
1129             mac_link_init_flows_cb, mcip);
        /*
         * If the mac client had subflow(s) configured before plumb, change
         * the rx function to mac_rx_srs_subflow_process() and, in the case
         * of hardware classification, disable polling.
         */
        mac_client_update_classifier(mcip, B_TRUE);
}
1138 
1139 boolean_t
1140 mac_link_has_flows(mac_client_handle_t mch)
1141 {
1142         mac_client_impl_t       *mcip = (mac_client_impl_t *)mch;
1143 
1144         if (!FLOW_TAB_EMPTY(mcip->mci_subflow_tab))
1145                 return (B_TRUE);
1146 
1147         return (B_FALSE);
1148 }
1149 
1150 static int
1151 mac_link_release_flows_cb(flow_entry_t *flent, void *arg)
1152 {
1153         FLOW_MARK(flent, FE_UF_NO_DATAPATH);
1154         mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
1155         mac_link_flow_clean(arg, flent);
1156         return (0);
1157 }
1158 
1159 void
1160 mac_link_release_flows(mac_client_handle_t mch)
1161 {
1162         mac_client_impl_t       *mcip = (mac_client_impl_t *)mch;
1163 
1164         /*
1165          * Change the mci_flent callback back to mac_rx_srs_process()
1166          * because flows are about to be deactivated.
1167          */
1168         mac_client_update_classifier(mcip, B_FALSE);
1169         (void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
1170             mac_link_release_flows_cb, mcip);
1171 }
1172 
1173 void
1174 mac_rename_flow(flow_entry_t *fep, const char *new_name)
1175 {
1176         mac_flow_set_name(fep, new_name);
1177         if (fep->fe_ksp != NULL) {
1178                 flow_stat_destroy(fep);
1179                 flow_stat_create(fep);
1180         }
1181 }
1182 
1183 /*
1184  * mac_link_flow_init()
1185  * Internal flow interface used for allocating SRSs and related
1186  * data structures. Not meant to be used by mac clients.
1187  */
1188 int
1189 mac_link_flow_init(mac_client_handle_t mch, flow_entry_t *sub_flow)
1190 {
1191         mac_client_impl_t       *mcip = (mac_client_impl_t *)mch;
1192         mac_impl_t              *mip = mcip->mci_mip;
1193         int                     err;
1194 
1195         ASSERT(mch != NULL);
1196         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1197 
1198         if ((err = mac_datapath_setup(mcip, sub_flow, SRST_FLOW)) != 0)
1199                 return (err);
1200 
1201         sub_flow->fe_mcip = mcip;
1202 
1203         return (0);
1204 }
1205 
1206 /*
1207  * mac_link_flow_add()
 * Used by flowadm(1M) or kernel mac clients for creating flows.
1209  */
1210 int
1211 mac_link_flow_add(datalink_id_t linkid, char *flow_name,
1212     flow_desc_t *flow_desc, mac_resource_props_t *mrp)
1213 {
1214         flow_entry_t            *flent = NULL;
1215         int                     err;
1216         dls_dl_handle_t         dlh;
1217         dls_link_t              *dlp;
1218         boolean_t               link_held = B_FALSE;
1219         boolean_t               hash_added = B_FALSE;
1220         mac_perim_handle_t      mph;
1221 
1222         err = mac_flow_lookup_byname(flow_name, &flent);
1223         if (err == 0) {
1224                 FLOW_USER_REFRELE(flent);
1225                 return (EEXIST);
1226         }
1227 
1228         /*
1229          * First create a flow entry given the description provided
1230          * by the caller.
1231          */
1232         err = mac_flow_create(flow_desc, mrp, flow_name, NULL,
1233             FLOW_USER | FLOW_OTHER, &flent);
1234 
1235         if (err != 0)
1236                 return (err);
1237 
        /*
         * We've got a local variable referencing this flow now, so we need
         * to hold it. We'll release this flow before returning.
         * All failures until we return will undo any action that may have
         * internally held the flow, so the last REFRELE will assure a clean
         * freeing of resources.
         */
1245         FLOW_REFHOLD(flent);
1246 
1247         flent->fe_link_id = linkid;
1248         FLOW_MARK(flent, FE_INCIPIENT);
1249 
1250         err = mac_perim_enter_by_linkid(linkid, &mph);
1251         if (err != 0) {
1252                 FLOW_FINAL_REFRELE(flent);
1253                 return (err);
1254         }
1255 
1256         /*
1257          * dls will eventually be merged with mac so it's ok
1258          * to call dls' internal functions.
1259          */
1260         err = dls_devnet_hold_link(linkid, &dlh, &dlp);
1261         if (err != 0)
1262                 goto bail;
1263 
1264         link_held = B_TRUE;
1265 
        /*
         * Add the flow to the global flow table. This table will eventually
         * be per exclusive zone so that each zone can have its own flow
         * namespace; RFE 6625651 will fix this.
         */
1272         if ((err = mac_flow_hash_add(flent)) != 0)
1273                 goto bail;
1274 
1275         hash_added = B_TRUE;
1276 
1277         /*
1278          * do not allow flows to be configured on an anchor VNIC
1279          */
1280         if (mac_capab_get(dlp->dl_mh, MAC_CAPAB_ANCHOR_VNIC, NULL)) {
1281                 err = ENOTSUP;
1282                 goto bail;
1283         }
1284 
1285         /*
1286          * Add the subflow to the subflow table. Also instantiate the flow
1287          * in the mac if there is an active user (we check if the MAC client's
1288          * datapath has been setup).
1289          */
1290         err = mac_flow_add_subflow(dlp->dl_mch, flent,
1291             MCIP_DATAPATH_SETUP((mac_client_impl_t *)dlp->dl_mch));
1292         if (err != 0)
1293                 goto bail;
1294 
1295         FLOW_UNMARK(flent, FE_INCIPIENT);
1296         dls_devnet_rele_link(dlh, dlp);
1297         mac_perim_exit(mph);
1298         return (0);
1299 
1300 bail:
1301         if (hash_added)
1302                 mac_flow_hash_remove(flent);
1303 
1304         if (link_held)
1305                 dls_devnet_rele_link(dlh, dlp);
1306 
1307         /*
1308          * Wait for any transient global flow hash refs to clear
1309          * and then release the creation reference on the flow
1310          */
1311         mac_flow_wait(flent, FLOW_USER_REF);
1312         FLOW_FINAL_REFRELE(flent);
1313         mac_perim_exit(mph);
1314         return (err);
1315 }
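
/*
 * Illustrative sketch only (not part of the build): how a kernel mac
 * client might use mac_link_flow_add() and mac_link_flow_remove() to
 * create and tear down a TCP local-port flow on a link. The flow name,
 * port and function name are hypothetical; FLOW_IP_PROTOCOL and
 * FLOW_ULP_PORT_LOCAL are assumed from sys/mac_flow.h, fd_local_port is
 * in network byte order, and a NULL mac_resource_props_t pointer is
 * accepted by mac_flow_create().
 */
static int
flow_add_example(datalink_id_t linkid)
{
        flow_desc_t     fd;
        int             err;

        bzero(&fd, sizeof (fd));
        fd.fd_mask = FLOW_IP_PROTOCOL | FLOW_ULP_PORT_LOCAL;
        fd.fd_protocol = IPPROTO_TCP;
        fd.fd_local_port = htons(80);

        err = mac_link_flow_add(linkid, "httpflow", &fd, NULL);
        if (err != 0)
                return (err);

        /* ... traffic matching the flow is now classified separately ... */

        return (mac_link_flow_remove("httpflow"));
}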
1316 
1317 /*
1318  * mac_link_flow_clean()
1319  * Internal flow interface used for freeing SRSs and related
1320  * data structures. Not meant to be used by mac clients.
1321  */
1322 void
1323 mac_link_flow_clean(mac_client_handle_t mch, flow_entry_t *sub_flow)
1324 {
1325         mac_client_impl_t       *mcip = (mac_client_impl_t *)mch;
1326         mac_impl_t              *mip = mcip->mci_mip;
1327         boolean_t               last_subflow;
1328 
1329         ASSERT(mch != NULL);
1330         ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1331 
1332         /*
1333          * This sub flow entry may fail to be fully initialized by
1334          * mac_link_flow_init(). If so, simply return.
1335          */
1336         if (sub_flow->fe_mcip == NULL)
1337                 return;
1338 
1339         last_subflow = FLOW_TAB_EMPTY(mcip->mci_subflow_tab);
1340         /*
1341          * Tear down the data path
1342          */
1343         mac_datapath_teardown(mcip, sub_flow, SRST_FLOW);
1344         sub_flow->fe_mcip = NULL;
1345 
1346         /*
1347          * Delete the SRSs associated with this subflow. If this is being
1348          * driven by flowadm(1M) then the subflow will be deleted by
1349          * dls_rem_flow. However if this is a result of the interface being
1350          * unplumbed then the subflow itself won't be deleted.
1351          */
1352         mac_flow_cleanup(sub_flow);
1353 
        /*
         * If all the subflows are gone, re-enable some of the functionality
         * we disabled when adding a subflow, such as polling.
         */
1358         if (last_subflow) {
1359                 /*
1360                  * The subflow table itself is not protected by any locks or
1361                  * refcnts. Hence quiesce the client upfront before clearing
1362                  * mci_subflow_tab.
1363                  */
1364                 mac_client_quiesce(mcip);
1365                 mac_client_update_classifier(mcip, B_FALSE);
1366                 mac_flow_tab_destroy(mcip->mci_subflow_tab);
1367                 mcip->mci_subflow_tab = NULL;
1368                 mac_client_restart(mcip);
1369         }
1370 }
1371 
1372 /*
1373  * mac_link_flow_remove()
 * Used by flowadm(1M) or kernel mac clients for removing flows.
1375  */
1376 int
1377 mac_link_flow_remove(char *flow_name)
1378 {
1379         flow_entry_t            *flent;
1380         mac_perim_handle_t      mph;
1381         int                     err;
1382         datalink_id_t           linkid;
1383 
1384         err = mac_flow_lookup_byname(flow_name, &flent);
1385         if (err != 0)
1386                 return (err);
1387 
1388         linkid = flent->fe_link_id;
1389         FLOW_USER_REFRELE(flent);
1390 
        /*
         * The perim must be acquired before acquiring any other references
         * to maintain the lock and perimeter hierarchy. Please note the
         * FLOW_USER_REFRELE above.
         */
1395          */
1396         err = mac_perim_enter_by_linkid(linkid, &mph);
1397         if (err != 0)
1398                 return (err);
1399 
1400         /*
1401          * Note the second lookup of the flow, because a concurrent thread
1402          * may have removed it already while we were waiting to enter the
1403          * link's perimeter.
1404          */
1405         err = mac_flow_lookup_byname(flow_name, &flent);
1406         if (err != 0) {
1407                 mac_perim_exit(mph);
1408                 return (err);
1409         }
1410         FLOW_USER_REFRELE(flent);
1411 
        /*
         * Remove the flow from the subflow table and deactivate the flow
         * by quiescing and removing its SRSs.
         */
1416         mac_flow_rem_subflow(flent);
1417 
1418         /*
1419          * Finally, remove the flow from the global table.
1420          */
1421         mac_flow_hash_remove(flent);
1422 
1423         /*
1424          * Wait for any transient global flow hash refs to clear
1425          * and then release the creation reference on the flow
1426          */
1427         mac_flow_wait(flent, FLOW_USER_REF);
1428         FLOW_FINAL_REFRELE(flent);
1429 
1430         mac_perim_exit(mph);
1431 
1432         return (0);
1433 }
1434 
1435 /*
1436  * mac_link_flow_modify()
1437  * Modifies the properties of a flow identified by its name.
1438  */
1439 int
1440 mac_link_flow_modify(char *flow_name, mac_resource_props_t *mrp)
1441 {
1442         flow_entry_t            *flent;
1443         mac_client_impl_t       *mcip;
1444         int                     err = 0;
1445         mac_perim_handle_t      mph;
1446         datalink_id_t           linkid;
1447         flow_tab_t              *flow_tab;
1448 
1449         err = mac_validate_props(NULL, mrp);
1450         if (err != 0)
1451                 return (err);
1452 
1453         err = mac_flow_lookup_byname(flow_name, &flent);
1454         if (err != 0)
1455                 return (err);
1456 
1457         linkid = flent->fe_link_id;
1458         FLOW_USER_REFRELE(flent);
1459 
        /*
         * The perim must be acquired before acquiring any other references
         * to maintain the lock and perimeter hierarchy. Please note the
         * FLOW_USER_REFRELE above.
         */
1464          */
1465         err = mac_perim_enter_by_linkid(linkid, &mph);
1466         if (err != 0)
1467                 return (err);
1468 
1469         /*
1470          * Note the second lookup of the flow, because a concurrent thread
1471          * may have removed it already while we were waiting to enter the
1472          * link's perimeter.
1473          */
1474         err = mac_flow_lookup_byname(flow_name, &flent);
1475         if (err != 0) {
1476                 mac_perim_exit(mph);
1477                 return (err);
1478         }
1479         FLOW_USER_REFRELE(flent);
1480 
1481         /*
1482          * If this flow is attached to a MAC client, then pass the request
1483          * along to the client.
1484          * Otherwise, just update the cached values.
1485          */
1486         mcip = flent->fe_mcip;
1487         mac_update_resources(mrp, &flent->fe_resource_props, B_TRUE);
1488         if (mcip != NULL) {
1489                 if ((flow_tab = mcip->mci_subflow_tab) == NULL) {
1490                         err = ENOENT;
1491                 } else {
1492                         mac_flow_modify(flow_tab, flent, mrp);
1493                 }
1494         } else {
1495                 (void) mac_flow_modify_props(flent, mrp);
1496         }
1497 
1498 done:
1499         mac_perim_exit(mph);
1500         return (err);
1501 }
1502 
1503 
1504 /*
1505  * State structure and misc functions used by mac_link_flow_walk().
1506  */
1507 typedef struct {
1508         int     (*ws_func)(mac_flowinfo_t *, void *);
1509         void    *ws_arg;
1510 } flow_walk_state_t;
1511 
1512 static void
1513 mac_link_flowinfo_copy(mac_flowinfo_t *finfop, flow_entry_t *flent)
1514 {
1515         (void) strlcpy(finfop->fi_flow_name, flent->fe_flow_name,
1516             MAXFLOWNAMELEN);
1517         finfop->fi_link_id = flent->fe_link_id;
1518         finfop->fi_flow_desc = flent->fe_flow_desc;
1519         finfop->fi_resource_props = flent->fe_resource_props;
1520 }
1521 
1522 static int
1523 mac_link_flow_walk_cb(flow_entry_t *flent, void *arg)
1524 {
1525         flow_walk_state_t       *statep = arg;
1526         mac_flowinfo_t          *finfo;
1527         int                     err;
1528 
1529         finfo = kmem_zalloc(sizeof (*finfo), KM_SLEEP);
1530         mac_link_flowinfo_copy(finfo, flent);
1531         err = statep->ws_func(finfo, statep->ws_arg);
1532         kmem_free(finfo, sizeof (*finfo));
1533         return (err);
1534 }
1535 
1536 /*
1537  * mac_link_flow_walk()
1538  * Invokes callback 'func' for all flows belonging to the specified link.
1539  */
1540 int
1541 mac_link_flow_walk(datalink_id_t linkid,
1542     int (*func)(mac_flowinfo_t *, void *), void *arg)
1543 {
1544         mac_client_impl_t       *mcip;
1545         mac_perim_handle_t      mph;
1546         flow_walk_state_t       state;
1547         dls_dl_handle_t         dlh;
1548         dls_link_t              *dlp;
1549         int                     err;
1550 
1551         err = mac_perim_enter_by_linkid(linkid, &mph);
1552         if (err != 0)
1553                 return (err);
1554 
1555         err = dls_devnet_hold_link(linkid, &dlh, &dlp);
1556         if (err != 0) {
1557                 mac_perim_exit(mph);
1558                 return (err);
1559         }
1560 
1561         mcip = (mac_client_impl_t *)dlp->dl_mch;
1562         state.ws_func = func;
1563         state.ws_arg = arg;
1564 
1565         err = mac_flow_walk_nolock(mcip->mci_subflow_tab,
1566             mac_link_flow_walk_cb, &state);
1567 
1568         dls_devnet_rele_link(dlh, dlp);
1569         mac_perim_exit(mph);
1570         return (err);
1571 }
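
/*
 * Illustrative usage sketch (hypothetical caller, not part of this file):
 * counting the flows configured on a link needs only a trivial callback,
 * since mac_link_flow_walk() takes care of the perimeter and link hold
 * itself.
 *
 *	static int
 *	count_flow_cb(mac_flowinfo_t *finfop, void *arg)
 *	{
 *		(*(uint_t *)arg)++;
 *		return (0);
 *	}
 *
 *	uint_t	nflows = 0;
 *	(void) mac_link_flow_walk(linkid, count_flow_cb, &nflows);
 */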
1572 
1573 /*
1574  * mac_link_flow_info()
1575  * Retrieves information about a specific flow.
1576  */
1577 int
1578 mac_link_flow_info(char *flow_name, mac_flowinfo_t *finfo)
1579 {
1580         flow_entry_t    *flent;
1581         int             err;
1582 
1583         err = mac_flow_lookup_byname(flow_name, &flent);
1584         if (err != 0)
1585                 return (err);
1586 
1587         mac_link_flowinfo_copy(finfo, flent);
1588         FLOW_USER_REFRELE(flent);
1589         return (0);
1590 }
1591 
1592 /*
1593  * Hash function macro that takes an Ethernet address and VLAN id as input.
1594  */
1595 #define HASH_ETHER_VID(a, v, s) \
1596         ((((uint32_t)(a)[3] + (a)[4] + (a)[5]) ^ (v)) % (s))
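
/*
 * Worked example (values assumed for illustration): for destination address
 * 00:11:22:33:44:55, VID 5 and a 1024-bucket table, HASH_ETHER_VID() yields
 * ((0x33 + 0x44 + 0x55) ^ 5) % 1024 = (204 ^ 5) % 1024 = 201.  Only the
 * three low-order octets are folded in, which is where unicast Ethernet
 * addresses typically differ.
 */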
1597 
1598 /*
1599  * Generic layer-2 address hashing function that takes an address and address
1600  * length as input.  This is the DJB hash function.
1601  */
1602 static uint32_t
1603 flow_l2_addrhash(uint8_t *addr, size_t addrlen, size_t htsize)
1604 {
1605         uint32_t        hash = 5381;
1606         size_t          i;
1607 
1608         for (i = 0; i < addrlen; i++)
1609                 hash = ((hash << 5) + hash) + addr[i];
1610         return (hash % htsize);
1611 }
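
/*
 * Worked example (values assumed): each iteration computes
 * hash = hash * 33 + addr[i].  For a one-byte address { 0x01 } and a
 * 1024-bucket table the result is (5381 * 33 + 1) % 1024 =
 * 177574 % 1024 = 422.
 */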
1612 
1613 #define PKT_TOO_SMALL(s, end) ((s)->fs_mp->b_wptr < (end))
1614 
1615 #define CHECK_AND_ADJUST_START_PTR(s, start) {          \
1616         if ((s)->fs_mp->b_wptr == (start)) {            \
1617                 mblk_t  *next = (s)->fs_mp->b_cont;     \
1618                 if (next == NULL)                       \
1619                         return (EINVAL);                \
1620                                                         \
1621                 (s)->fs_mp = next;                      \
1622                 (start) = next->b_rptr;                 \
1623         }                                               \
1624 }
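
/*
 * Sketch of the case CHECK_AND_ADJUST_START_PTR() handles (layout assumed
 * for illustration): 'start' points just past the previously parsed header.
 * When that coincides with b_wptr, the next header lives entirely in the
 * following mblk, so the walk hops to b_cont before dereferencing anything:
 *
 *	mblk 1: [ L2 header ......... ] b_wptr == start
 *	mblk 2: [ L3 header ......... ] start is reset to b_rptr
 *
 * If there is no b_cont the packet is truncated and the accept function
 * fails with EINVAL.
 */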
1625 
1626 /* ARGSUSED */
1627 static boolean_t
1628 flow_l2_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1629 {
1630         flow_l2info_t           *l2 = &s->fs_l2info;
1631         flow_desc_t             *fd = &flent->fe_flow_desc;
1632 
1633         return (l2->l2_vid == fd->fd_vid &&
1634             bcmp(l2->l2_daddr, fd->fd_dst_mac, fd->fd_mac_len) == 0);
1635 }
1636 
1637 /*
1638  * Layer 2 hash function.
1639  * Must be paired with flow_l2_accept() within a set of flow_ops
1640  * because it assumes the destination address has already been extracted.
1641  */
1642 static uint32_t
1643 flow_l2_hash(flow_tab_t *ft, flow_state_t *s)
1644 {
1645         return (flow_l2_addrhash(s->fs_l2info.l2_daddr,
1646             ft->ft_mip->mi_type->mt_addr_length, ft->ft_size));
1647 }
1648 
1649 /*
1650  * This is the generic layer 2 accept function.
1651  * It uses mac_header_info() to extract the header length,
1652  * SAP, VLAN ID, and destination address.
1653  */
1654 static int
1655 flow_l2_accept(flow_tab_t *ft, flow_state_t *s)
1656 {
1657         boolean_t               is_ether;
1658         flow_l2info_t           *l2 = &s->fs_l2info;
1659         mac_header_info_t       mhi;
1660         int                     err;
1661 
1662         is_ether = (ft->ft_mip->mi_info.mi_nativemedia == DL_ETHER);
1663         if ((err = mac_header_info((mac_handle_t)ft->ft_mip,
1664             s->fs_mp, &mhi)) != 0) {
1665                 if (err == EINVAL)
1666                         err = ENOBUFS;
1667 
1668                 return (err);
1669         }
1670 
1671         l2->l2_start = s->fs_mp->b_rptr;
1672         l2->l2_daddr = (uint8_t *)mhi.mhi_daddr;
1673 
1674         if (is_ether && mhi.mhi_bindsap == ETHERTYPE_VLAN &&
1675             ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
1676                 struct ether_vlan_header        *evhp =
1677                     (struct ether_vlan_header *)l2->l2_start;
1678 
1679                 if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
1680                         return (ENOBUFS);
1681 
1682                 l2->l2_sap = ntohs(evhp->ether_type);
1683                 l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
1684                 l2->l2_hdrsize = sizeof (*evhp);
1685         } else {
1686                 l2->l2_sap = mhi.mhi_bindsap;
1687                 l2->l2_vid = 0;
1688                 l2->l2_hdrsize = (uint32_t)mhi.mhi_hdrsize;
1689         }
1690         return (0);
1691 }
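
/*
 * Illustrative results (frame contents assumed): for an untagged IPv4 frame
 * mac_header_info() reports mhi_bindsap = ETHERTYPE_IP, so we end up with
 * l2_sap = 0x0800, l2_vid = 0 and l2_hdrsize = mhi_hdrsize (14 on
 * Ethernet).  For the same payload tagged with VLAN 10, mhi_bindsap is
 * ETHERTYPE_VLAN and we instead take l2_sap = 0x0800 from ether_type,
 * l2_vid = 10 from the TCI, and l2_hdrsize = sizeof (ether_vlan_header),
 * i.e. 18.
 */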
1692 
1693 /*
1694  * flow_ether_hash()/accept() are optimized versions of flow_l2_hash()/
1695  * accept(). The notable difference is that the destination address is
1696  * extracted by hash() rather than by accept(), which saves a few memory
1697  * references for flow tables that do not care about MAC addresses.
1698  */
1699 static uint32_t
1700 flow_ether_hash(flow_tab_t *ft, flow_state_t *s)
1701 {
1702         flow_l2info_t                   *l2 = &s->fs_l2info;
1703         struct ether_vlan_header        *evhp;
1704 
1705         evhp = (struct ether_vlan_header *)l2->l2_start;
1706         l2->l2_daddr = evhp->ether_dhost.ether_addr_octet;
1707         return (HASH_ETHER_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size));
1708 }
1709 
1710 static uint32_t
1711 flow_ether_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
1712 {
1713         flow_desc_t     *fd = &flent->fe_flow_desc;
1714 
1715         ASSERT((fd->fd_mask & FLOW_LINK_VID) != 0 || fd->fd_vid == 0);
1716         return (HASH_ETHER_VID(fd->fd_dst_mac, fd->fd_vid, ft->ft_size));
1717 }
1718 
1719 /* ARGSUSED */
1720 static int
1721 flow_ether_accept(flow_tab_t *ft, flow_state_t *s)
1722 {
1723         flow_l2info_t                   *l2 = &s->fs_l2info;
1724         struct ether_vlan_header        *evhp;
1725         uint16_t                        sap;
1726 
1727         evhp = (struct ether_vlan_header *)s->fs_mp->b_rptr;
1728         l2->l2_start = (uchar_t *)evhp;
1729 
1730         if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (struct ether_header)))
1731                 return (ENOBUFS);
1732 
1733         if ((sap = ntohs(evhp->ether_tpid)) == ETHERTYPE_VLAN &&
1734             ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
1735                 if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
1736                         return (ENOBUFS);
1737 
1738                 l2->l2_sap = ntohs(evhp->ether_type);
1739                 l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
1740                 l2->l2_hdrsize = sizeof (struct ether_vlan_header);
1741         } else {
1742                 l2->l2_sap = sap;
1743                 l2->l2_vid = 0;
1744                 l2->l2_hdrsize = sizeof (struct ether_header);
1745         }
1746         return (0);
1747 }
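
/*
 * Note on the fast path above (illustrative): for an untagged frame the
 * ether_tpid field of the ether_vlan_header overlay occupies the same
 * offset as ether_type in a plain ether_header, so reading it after the
 * sizeof (struct ether_header) length check is safe.  Only when that value
 * is ETHERTYPE_VLAN must the frame also be large enough for the full
 * tagged header.
 */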
1748 
1749 /*
1750  * Validates a layer 2 flow entry.
1751  */
1752 static int
1753 flow_l2_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
1754 {
1755         flow_desc_t     *fd = &flent->fe_flow_desc;
1756 
1757         /*
1758          * The destination address is mandatory, and zero-length addresses
1759          * are not yet supported.
1760          */
1761         if ((fd->fd_mask & FLOW_LINK_DST) == 0 || fd->fd_mac_len == 0)
1762                 return (EINVAL);
1763 
1764         if ((fd->fd_mask & FLOW_LINK_VID) != 0) {
1765                 /*
1766                  * VLAN flows are only supported over Ethernet MACs.
1767                  */
1768                 if (ft->ft_mip->mi_info.mi_nativemedia != DL_ETHER)
1769                         return (EINVAL);
1770 
1771                 if (fd->fd_vid == 0)
1772                         return (EINVAL);
1773 
1774         }
1775         flent->fe_match = flow_l2_match;
1776         return (0);
1777 }
1778 
1779 /*
1780  * Calculates the hash index of a flow entry.
1781  */
1782 static uint32_t
1783 flow_l2_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
1784 {
1785         flow_desc_t     *fd = &flent->fe_flow_desc;
1786 
1787         ASSERT((fd->fd_mask & FLOW_LINK_VID) == 0 && fd->fd_vid == 0);
1788         return (flow_l2_addrhash(fd->fd_dst_mac,
1789             ft->ft_mip->mi_type->mt_addr_length, ft->ft_size));
1790 }
1791 
1792 /*
1793  * This is used for duplicate flow checking.
1794  */
1795 /* ARGSUSED */
1796 static boolean_t
1797 flow_l2_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
1798 {
1799         flow_desc_t     *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
1800 
1801         ASSERT(fd1->fd_mac_len == fd2->fd_mac_len && fd1->fd_mac_len != 0);
1802         return (bcmp(&fd1->fd_dst_mac, &fd2->fd_dst_mac,
1803             fd1->fd_mac_len) == 0 && fd1->fd_vid == fd2->fd_vid);
1804 }
1805 
1806 /*
1807  * Generic flow entry insertion function.
1808  * Used by flow tables that do not have ordering requirements.
1809  */
1810 /* ARGSUSED */
1811 static int
1812 flow_generic_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
1813     flow_entry_t *flent)
1814 {
1815         ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
1816 
1817         if (*headp != NULL) {
1818                 ASSERT(flent->fe_next == NULL);
1819                 flent->fe_next = *headp;
1820         }
1821         *headp = flent;
1822         return (0);
1823 }
1824 
1825 /*
1826  * IP version independent DSField matching function.
1827  */
1828 /* ARGSUSED */
1829 static boolean_t
1830 flow_ip_dsfield_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1831 {
1832         flow_l3info_t   *l3info = &s->fs_l3info;
1833         flow_desc_t     *fd = &flent->fe_flow_desc;
1834 
1835         switch (l3info->l3_version) {
1836         case IPV4_VERSION: {
1837                 ipha_t          *ipha = (ipha_t *)l3info->l3_start;
1838 
1839                 return ((ipha->ipha_type_of_service &
1840                     fd->fd_dsfield_mask) == fd->fd_dsfield);
1841         }
1842         case IPV6_VERSION: {
1843                 ip6_t           *ip6h = (ip6_t *)l3info->l3_start;
1844 
1845                 return ((IPV6_FLOW_TCLASS(ip6h->ip6_vcf) &
1846                     fd->fd_dsfield_mask) == fd->fd_dsfield);
1847         }
1848         default:
1849                 return (B_FALSE);
1850         }
1851 }
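
/*
 * Worked example (field values assumed): a flow created with
 * fd_dsfield = 0xb8 (DSCP EF) and fd_dsfield_mask = 0xfc matches an IPv4
 * packet whose TOS byte is 0xb9, because (0xb9 & 0xfc) == 0xb8; the two
 * low-order ECN bits are masked off before the comparison.
 */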
1852 
1853 /*
1854  * IPv4 and IPv6 address matching.
1855  * The netmask only needs to be applied to the packet address, not to the
1856  * flow_desc, since fd_local_addr/fd_remote_addr are pre-masked subnets.
1857  */
1858 
1859 /* ARGSUSED */
1860 static boolean_t
1861 flow_ip_v4_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1862 {
1863         flow_l3info_t   *l3info = &s->fs_l3info;
1864         flow_desc_t     *fd = &flent->fe_flow_desc;
1865         ipha_t          *ipha = (ipha_t *)l3info->l3_start;
1866         in_addr_t       addr;
1867 
1868         addr = (l3info->l3_dst_or_src ? ipha->ipha_dst : ipha->ipha_src);
1869         if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
1870                 return ((addr & V4_PART_OF_V6(fd->fd_local_netmask)) ==
1871                     V4_PART_OF_V6(fd->fd_local_addr));
1872         }
1873         return ((addr & V4_PART_OF_V6(fd->fd_remote_netmask)) ==
1874             V4_PART_OF_V6(fd->fd_remote_addr));
1875 }
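
/*
 * Worked example (addresses assumed): for a local flow on 192.168.1.0/24,
 * fd_local_addr holds the pre-masked subnet 192.168.1.0 and
 * fd_local_netmask holds 255.255.255.0.  An inbound packet destined to
 * 192.168.1.57 matches because (192.168.1.57 & 255.255.255.0) equals
 * 192.168.1.0; only the packet side of the comparison needs masking.
 */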
1876 
1877 /* ARGSUSED */
1878 static boolean_t
1879 flow_ip_v6_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1880 {
1881         flow_l3info_t   *l3info = &s->fs_l3info;
1882         flow_desc_t     *fd = &flent->fe_flow_desc;
1883         ip6_t           *ip6h = (ip6_t *)l3info->l3_start;
1884         in6_addr_t      *addrp;
1885 
1886         addrp = (l3info->l3_dst_or_src ? &ip6h->ip6_dst : &ip6h->ip6_src);
1887         if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
1888                 return (V6_MASK_EQ(*addrp, fd->fd_local_netmask,
1889                     fd->fd_local_addr));
1890         }
1891         return (V6_MASK_EQ(*addrp, fd->fd_remote_netmask, fd->fd_remote_addr));
1892 }
1893 
1894 /* ARGSUSED */
1895 static boolean_t
1896 flow_ip_proto_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
1897 {
1898         flow_l3info_t   *l3info = &s->fs_l3info;
1899         flow_desc_t     *fd = &flent->fe_flow_desc;
1900 
1901         return (l3info->l3_protocol == fd->fd_protocol);
1902 }
1903 
1904 static uint32_t
1905 flow_ip_hash(flow_tab_t *ft, flow_state_t *s)
1906 {
1907         flow_l3info_t   *l3info = &s->fs_l3info;
1908         flow_mask_t     mask = ft->ft_mask;
1909 
1910         if ((mask & FLOW_IP_LOCAL) != 0) {
1911                 l3info->l3_dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
1912         } else if ((mask & FLOW_IP_REMOTE) != 0) {
1913                 l3info->l3_dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
1914         } else if ((mask & FLOW_IP_DSFIELD) != 0) {
1915                 /*
1916                  * DSField flents are arranged as a single list.
1917                  */
1918                 return (0);
1919         }
1920         /*
1921          * IP addr flents are hashed into two lists, v4 or v6.
1922          */
1923         ASSERT(ft->ft_size >= 2);
1924         return ((l3info->l3_version == IPV4_VERSION) ? 0 : 1);
1925 }
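
/*
 * Example of the direction handling above (traffic assumed): on a table
 * keyed by FLOW_IP_LOCAL, an inbound packet carries the local address in
 * its destination field, so l3_dst_or_src is set and flow_ip_v4_match()/
 * flow_ip_v6_match() compare against ipha_dst/ip6_dst; on transmit the
 * same flow is matched against the source address instead.
 */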
1926 
1927 static uint32_t
1928 flow_ip_proto_hash(flow_tab_t *ft, flow_state_t *s)
1929 {
1930         flow_l3info_t   *l3info = &s->fs_l3info;
1931 
1932         return (l3info->l3_protocol % ft->ft_size);
1933 }
1934 
1935 /* ARGSUSED */
1936 static int
1937 flow_ip_accept(flow_tab_t *ft, flow_state_t *s)
1938 {
1939         flow_l2info_t   *l2info = &s->fs_l2info;
1940         flow_l3info_t   *l3info = &s->fs_l3info;
1941         uint16_t        sap = l2info->l2_sap;
1942         uchar_t         *l3_start;
1943 
1944         l3_start = l2info->l2_start + l2info->l2_hdrsize;
1945 
1946         /*
1947          * Adjust start pointer if we're at the end of an mblk.
1948          */
1949         CHECK_AND_ADJUST_START_PTR(s, l3_start);
1950 
1951         l3info->l3_start = l3_start;
1952         if (!OK_32PTR(l3_start))
1953                 return (EINVAL);
1954 
1955         switch (sap) {
1956         case ETHERTYPE_IP: {
1957                 ipha_t  *ipha = (ipha_t *)l3_start;
1958 
1959                 if (IPH_HDR_VERSION(ipha) != IPV4_VERSION)
1960                         return (EINVAL);
1961                 if (PKT_TOO_SMALL(s, l3_start + IP_SIMPLE_HDR_LENGTH))
1962                         return (ENOBUFS);
1963 
1964                 l3info->l3_hdrsize = IPH_HDR_LENGTH(ipha);
1965                 l3info->l3_protocol = ipha->ipha_protocol;
1966                 l3info->l3_version = IPV4_VERSION;
1967                 l3info->l3_fragmented =
1968                     IS_V4_FRAGMENT(ipha->ipha_fragment_offset_and_flags);
1969                 break;
1970         }
1971         case ETHERTYPE_IPV6: {
1972                 ip6_t           *ip6h = (ip6_t *)l3_start;
1973                 ip6_frag_t      *frag = NULL;
1974                 uint16_t        ip6_hdrlen;
1975                 uint8_t         nexthdr;
1976                 int             errno;
1977 
1978                 errno = mac_ip_hdr_length_v6(ip6h, s->fs_mp->b_wptr,
1979                     &ip6_hdrlen, &nexthdr, &frag);
1980                 /*
1981                  * ENOBUFS is not ENOSPC, but the semantics are the
1982                  * same for this caller.
1983                  */
1984                 if (errno != 0)
1985                         return (errno == ENOSPC ? ENOBUFS : errno);
1986                 l3info->l3_hdrsize = ip6_hdrlen;
1987                 l3info->l3_protocol = nexthdr;
1988                 l3info->l3_version = IPV6_VERSION;
1989                 l3info->l3_fragmented = (frag != NULL);
1990                 break;
1991         }
1992         default:
1993                 return (EINVAL);
1994         }
1995         return (0);
1996 }
1997 
1998 /* ARGSUSED */
1999 static int
2000 flow_ip_proto_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
2001 {
2002         flow_desc_t     *fd = &flent->fe_flow_desc;
2003 
2004         switch (fd->fd_protocol) {
2005         case IPPROTO_TCP:
2006         case IPPROTO_UDP:
2007         case IPPROTO_SCTP:
2008         case IPPROTO_ICMP:
2009         case IPPROTO_ICMPV6:
2010                 flent->fe_match = flow_ip_proto_match;
2011                 return (0);
2012         default:
2013                 return (EINVAL);
2014         }
2015 }
2016 
2017 /* ARGSUSED */
2018 static int
2019 flow_ip_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
2020 {
2021         flow_desc_t     *fd = &flent->fe_flow_desc;
2022         flow_mask_t     mask;
2023         uint8_t         version;
2024         in6_addr_t      *addr, *netmask;
2025 
2026         /*
2027          * DSField does not require an IP version.
2028          */
2029         if (fd->fd_mask == FLOW_IP_DSFIELD) {
2030                 if (fd->fd_dsfield_mask == 0)
2031                         return (EINVAL);
2032 
2033                 flent->fe_match = flow_ip_dsfield_match;
2034                 return (0);
2035         }
2036 
2037         /*
2038          * IP addresses must come with a version to avoid ambiguity.
2039          */
2040         if ((fd->fd_mask & FLOW_IP_VERSION) == 0)
2041                 return (EINVAL);
2042 
2043         version = fd->fd_ipversion;
2044         if (version != IPV4_VERSION && version != IPV6_VERSION)
2045                 return (EINVAL);
2046 
2047         mask = fd->fd_mask & ~FLOW_IP_VERSION;
2048         switch (mask) {
2049         case FLOW_IP_LOCAL:
2050                 addr = &fd->fd_local_addr;
2051                 netmask = &fd->fd_local_netmask;
2052                 break;
2053         case FLOW_IP_REMOTE:
2054                 addr = &fd->fd_remote_addr;
2055                 netmask = &fd->fd_remote_netmask;
2056                 break;
2057         default:
2058                 return (EINVAL);
2059         }
2060 
2061         /*
2062          * Apply the netmask to the specified address.
2063          */
2064         V6_MASK_COPY(*addr, *netmask, *addr);
2065         if (version == IPV4_VERSION) {
2066                 ipaddr_t        v4addr = V4_PART_OF_V6((*addr));
2067                 ipaddr_t        v4mask = V4_PART_OF_V6((*netmask));
2068 
2069                 if (v4addr == 0 || v4mask == 0)
2070                         return (EINVAL);
2071                 flent->fe_match = flow_ip_v4_match;
2072         } else {
2073                 if (IN6_IS_ADDR_UNSPECIFIED(addr) ||
2074                     IN6_IS_ADDR_UNSPECIFIED(netmask))
2075                         return (EINVAL);
2076                 flent->fe_match = flow_ip_v6_match;
2077         }
2078         return (0);
2079 }
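
/*
 * Illustrative walk-through (descriptor values assumed): a flow created
 * with fd_mask = FLOW_IP_VERSION | FLOW_IP_LOCAL, fd_ipversion = 4, a
 * local address of 10.1.2.3 and a netmask of 255.255.255.0 is accepted;
 * V6_MASK_COPY() rewrites the stored address to the subnet 10.1.2.0 and
 * fe_match is set to flow_ip_v4_match.  Omitting FLOW_IP_VERSION, or
 * supplying an all-zero address or netmask, is rejected with EINVAL.
 */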
2080 
2081 static uint32_t
2082 flow_ip_proto_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
2083 {
2084         flow_desc_t     *fd = &flent->fe_flow_desc;
2085 
2086         return (fd->fd_protocol % ft->ft_size);
2087 }
2088 
2089 static uint32_t
2090 flow_ip_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
2091 {
2092         flow_desc_t     *fd = &flent->fe_flow_desc;
2093 
2094         /*
2095          * DSField flents are arranged as a single list.
2096          */
2097         if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
2098                 return (0);
2099 
2100         /*
2101          * IP addr flents are hashed into two lists, v4 or v6.
2102          */
2103         ASSERT(ft->ft_size >= 2);
2104         return ((fd->fd_ipversion == IPV4_VERSION) ? 0 : 1);
2105 }
2106 
2107 /* ARGSUSED */
2108 static boolean_t
2109 flow_ip_proto_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
2110 {
2111         flow_desc_t     *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
2112 
2113         return (fd1->fd_protocol == fd2->fd_protocol);
2114 }
2115 
2116 /* ARGSUSED */
2117 static boolean_t
2118 flow_ip_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
2119 {
2120         flow_desc_t     *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
2121         in6_addr_t      *a1, *m1, *a2, *m2;
2122 
2123         ASSERT(fd1->fd_mask == fd2->fd_mask);
2124         if (fd1->fd_mask == FLOW_IP_DSFIELD) {
2125                 return (fd1->fd_dsfield == fd2->fd_dsfield &&
2126                     fd1->fd_dsfield_mask == fd2->fd_dsfield_mask);
2127         }
2128 
2129         /*
2130          * flow_ip_accept_fe() already validated the version.
2131          */
2132         ASSERT((fd1->fd_mask & FLOW_IP_VERSION) != 0);
2133         if (fd1->fd_ipversion != fd2->fd_ipversion)
2134                 return (B_FALSE);
2135 
2136         switch (fd1->fd_mask & ~FLOW_IP_VERSION) {
2137         case FLOW_IP_LOCAL:
2138                 a1 = &fd1->fd_local_addr;
2139                 m1 = &fd1->fd_local_netmask;
2140                 a2 = &fd2->fd_local_addr;
2141                 m2 = &fd2->fd_local_netmask;
2142                 break;
2143         case FLOW_IP_REMOTE:
2144                 a1 = &fd1->fd_remote_addr;
2145                 m1 = &fd1->fd_remote_netmask;
2146                 a2 = &fd2->fd_remote_addr;
2147                 m2 = &fd2->fd_remote_netmask;
2148                 break;
2149         default:
2150                 /*
2151                  * This is unreachable given the checks in
2152                  * flow_ip_accept_fe().
2153                  */
2154                 return (B_FALSE);
2155         }
2156 
2157         if (fd1->fd_ipversion == IPV4_VERSION) {
2158                 return (V4_PART_OF_V6((*a1)) == V4_PART_OF_V6((*a2)) &&
2159                     V4_PART_OF_V6((*m1)) == V4_PART_OF_V6((*m2)));
2160 
2161         } else {
2162                 return (IN6_ARE_ADDR_EQUAL(a1, a2) &&
2163                     IN6_ARE_ADDR_EQUAL(m1, m2));
2164         }
2165 }
2166 
2167 static int
2168 flow_ip_mask2plen(in6_addr_t *v6mask)
2169 {
2170         int             bits;
2171         int             plen = IPV6_ABITS;
2172         int             i;
2173 
2174         for (i = 3; i >= 0; i--) {
2175                 if (v6mask->s6_addr32[i] == 0) {
2176                         plen -= 32;
2177                         continue;
2178                 }
2179                 bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1;
2180                 if (bits == 0)
2181                         break;
2182                 plen -= bits;
2183         }
2184         return (plen);
2185 }
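
/*
 * Worked examples (masks assumed): an IPv6 /64 netmask has its two
 * low-order 32-bit words zero, so the loop subtracts 32 twice and then
 * stops at the first all-ones word, returning 128 - 64 = 64.  A v4 /24
 * netmask stored with zero upper words yields 128 - 8 - 32 - 32 - 32 = 24:
 * ffs() finds the lowest set bit of 0xffffff00 at position 9, so 8 host
 * bits come off that word and 32 for each remaining zero word.
 */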
2186 
2187 /* ARGSUSED */
2188 static int
2189 flow_ip_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
2190     flow_entry_t *flent)
2191 {
2192         flow_entry_t    **p = headp;
2193         flow_desc_t     *fd0, *fd;
2194         in6_addr_t      *m0, *m;
2195         int             plen0, plen;
2196 
2197         ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
2198 
2199         /*
2200          * No special ordering needed for dsfield.
2201          */
2202         fd0 = &flent->fe_flow_desc;
2203         if ((fd0->fd_mask & FLOW_IP_DSFIELD) != 0) {
2204                 if (*p != NULL) {
2205                         ASSERT(flent->fe_next == NULL);
2206                         flent->fe_next = *p;
2207                 }
2208                 *p = flent;
2209                 return (0);
2210         }
2211 
2212         /*
2213          * IP address flows are arranged in descending prefix length order.
2214          */
2215         m0 = ((fd0->fd_mask & FLOW_IP_LOCAL) != 0) ?
2216             &fd0->fd_local_netmask : &fd0->fd_remote_netmask;
2217         plen0 = flow_ip_mask2plen(m0);
2218         ASSERT(plen0 != 0);
2219 
2220         for (; *p != NULL; p = &(*p)->fe_next) {
2221                 fd = &(*p)->fe_flow_desc;
2222 
2223                 /*
2224                  * Normally a dsfield flent shouldn't end up on the same
2225                  * list as an IP address because flow tables are (for now)
2226                  * disjoint. If we decide to support both IP and dsfield
2227                  * in the same table in the future, this check will allow
2228                  * for that.
2229                  */
2230                 if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
2231                         continue;
2232 
2233                 /*
2234                  * We also allow for the mixing of local and remote address
2235                  * flents within one list.
2236                  */
2237                 m = ((fd->fd_mask & FLOW_IP_LOCAL) != 0) ?
2238                     &fd->fd_local_netmask : &fd->fd_remote_netmask;
2239                 plen = flow_ip_mask2plen(m);
2240 
2241                 if (plen <= plen0)
2242                         break;
2243         }
2244         if (*p != NULL) {
2245                 ASSERT(flent->fe_next == NULL);
2246                 flent->fe_next = *p;
2247         }
2248         *p = flent;
2249         return (0);
2250 }
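
/*
 * Ordering example (prefixes assumed): inserting flows for 10.1.2.3/32,
 * 10.1.2.0/24 and 10.0.0.0/8 into the same bucket leaves the list ordered
 * /32, /24, /8.  Since a lookup walks the list from the head and stops at
 * the first match, the most specific (longest-prefix) flow wins when
 * subnets overlap.
 */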
2251 
2252 /*
2253  * Transport layer protocol and port matching functions.
2254  */
2255 
2256 /* ARGSUSED */
2257 static boolean_t
2258 flow_transport_lport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
2259 {
2260         flow_l3info_t   *l3info = &s->fs_l3info;
2261         flow_l4info_t   *l4info = &s->fs_l4info;
2262         flow_desc_t     *fd = &flent->fe_flow_desc;
2263 
2264         return (fd->fd_protocol == l3info->l3_protocol &&
2265             fd->fd_local_port == l4info->l4_hash_port);
2266 }
2267 
2268 /* ARGSUSED */
2269 static boolean_t
2270 flow_transport_rport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
2271 {
2272         flow_l3info_t   *l3info = &s->fs_l3info;
2273         flow_l4info_t   *l4info = &s->fs_l4info;
2274         flow_desc_t     *fd = &flent->fe_flow_desc;
2275 
2276         return (fd->fd_protocol == l3info->l3_protocol &&
2277             fd->fd_remote_port == l4info->l4_hash_port);
2278 }
2279 
2280 /*
2281  * Transport hash function.
2282  * Since we only support either local or remote port flows,
2283  * we only need to extract one of the ports to be used for
2284  * matching.
2285  */
2286 static uint32_t
2287 flow_transport_hash(flow_tab_t *ft, flow_state_t *s)
2288 {
2289         flow_l3info_t   *l3info = &s->fs_l3info;
2290         flow_l4info_t   *l4info = &s->fs_l4info;
2291         uint8_t         proto = l3info->l3_protocol;
2292         boolean_t       dst_or_src;
2293 
2294         if ((ft->ft_mask & FLOW_ULP_PORT_LOCAL) != 0) {
2295                 dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
2296         } else {
2297                 dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
2298         }
2299 
2300         l4info->l4_hash_port = dst_or_src ? l4info->l4_dst_port :
2301             l4info->l4_src_port;
2302 
2303         return ((l4info->l4_hash_port ^ (proto << 4)) % ft->ft_size);
2304 }
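
/*
 * Example of the bucket computation (values assumed): for a TCP flow
 * (protocol 6) on a FLOW_ULP_PORT_LOCAL table, an inbound packet hashes
 * with its destination port, so the bucket is
 * (l4_dst_port ^ (6 << 4)) % ft_size, i.e. (l4_dst_port ^ 0x60) % ft_size.
 * The port is used without byte-order conversion, matching
 * flow_transport_hash_fe(), so packets and flow descriptors hash
 * consistently as long as both sides use wire byte order.
 */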
2305 
2306 /*
2307  * Unlike the other accept() functions above, we do not need to compute the
2308  * header size because this is the highest layer we parse. If we want to
2309  * support other higher-layer protocols, we would need to save l4_hdrsize
2310  * in the code below.
2311  */
2312 
2313 /* ARGSUSED */
2314 static int
2315 flow_transport_accept(flow_tab_t *ft, flow_state_t *s)
2316 {
2317         flow_l3info_t   *l3info = &s->fs_l3info;
2318         flow_l4info_t   *l4info = &s->fs_l4info;
2319         uint8_t         proto = l3info->l3_protocol;
2320         uchar_t         *l4_start;
2321 
2322         l4_start = l3info->l3_start + l3info->l3_hdrsize;
2323 
2324         /*
2325          * Adjust start pointer if we're at the end of an mblk.
2326          */
2327         CHECK_AND_ADJUST_START_PTR(s, l4_start);
2328 
2329         l4info->l4_start = l4_start;
2330         if (!OK_32PTR(l4_start))
2331                 return (EINVAL);
2332 
2333         if (l3info->l3_fragmented == B_TRUE)
2334                 return (EINVAL);
2335 
2336         switch (proto) {
2337         case IPPROTO_TCP: {
2338                 struct tcphdr   *tcph = (struct tcphdr *)l4_start;
2339 
2340                 if (PKT_TOO_SMALL(s, l4_start + sizeof (*tcph)))
2341                         return (ENOBUFS);
2342 
2343                 l4info->l4_src_port = tcph->th_sport;
2344                 l4info->l4_dst_port = tcph->th_dport;
2345                 break;
2346         }
2347         case IPPROTO_UDP: {
2348                 struct udphdr   *udph = (struct udphdr *)l4_start;
2349 
2350                 if (PKT_TOO_SMALL(s, l4_start + sizeof (*udph)))
2351                         return (ENOBUFS);
2352 
2353                 l4info->l4_src_port = udph->uh_sport;
2354                 l4info->l4_dst_port = udph->uh_dport;
2355                 break;
2356         }
2357         case IPPROTO_SCTP: {
2358                 sctp_hdr_t      *sctph = (sctp_hdr_t *)l4_start;
2359 
2360                 if (PKT_TOO_SMALL(s, l4_start + sizeof (*sctph)))
2361                         return (ENOBUFS);
2362 
2363                 l4info->l4_src_port = sctph->sh_sport;
2364                 l4info->l4_dst_port = sctph->sh_dport;
2365                 break;
2366         }
2367         default:
2368                 return (EINVAL);
2369         }
2370 
2371         return (0);
2372 }
2373 
2374 /*
2375  * Validates a transport flow entry.
2376  * The protocol field must be present.
2377  */
2378 
2379 /* ARGSUSED */
2380 static int
2381 flow_transport_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
2382 {
2383         flow_desc_t     *fd = &flent->fe_flow_desc;
2384         flow_mask_t     mask = fd->fd_mask;
2385 
2386         if ((mask & FLOW_IP_PROTOCOL) == 0)
2387                 return (EINVAL);
2388 
2389         switch (fd->fd_protocol) {
2390         case IPPROTO_TCP:
2391         case IPPROTO_UDP:
2392         case IPPROTO_SCTP:
2393                 break;
2394         default:
2395                 return (EINVAL);
2396         }
2397 
2398         switch (mask & ~FLOW_IP_PROTOCOL) {
2399         case FLOW_ULP_PORT_LOCAL:
2400                 if (fd->fd_local_port == 0)
2401                         return (EINVAL);
2402 
2403                 flent->fe_match = flow_transport_lport_match;
2404                 break;
2405         case FLOW_ULP_PORT_REMOTE:
2406                 if (fd->fd_remote_port == 0)
2407                         return (EINVAL);
2408 
2409                 flent->fe_match = flow_transport_rport_match;
2410                 break;
2411         case 0:
2412                 /*
2413                  * Transport-only flows conflict with our table type.
2414                  */
2415                 return (EOPNOTSUPP);
2416         default:
2417                 return (EINVAL);
2418         }
2419 
2420         return (0);
2421 }
2422 
2423 static uint32_t
2424 flow_transport_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
2425 {
2426         flow_desc_t     *fd = &flent->fe_flow_desc;
2427         uint16_t        port = 0;
2428 
2429         port = ((fd->fd_mask & FLOW_ULP_PORT_LOCAL) != 0) ?
2430             fd->fd_local_port : fd->fd_remote_port;
2431 
2432         return ((port ^ (fd->fd_protocol << 4)) % ft->ft_size);
2433 }
2434 
2435 /* ARGSUSED */
2436 static boolean_t
2437 flow_transport_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
2438 {
2439         flow_desc_t     *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
2440 
2441         if (fd1->fd_protocol != fd2->fd_protocol)
2442                 return (B_FALSE);
2443 
2444         if ((fd1->fd_mask & FLOW_ULP_PORT_LOCAL) != 0)
2445                 return (fd1->fd_local_port == fd2->fd_local_port);
2446 
2447         if ((fd1->fd_mask & FLOW_ULP_PORT_REMOTE) != 0)
2448                 return (fd1->fd_remote_port == fd2->fd_remote_port);
2449 
2450         return (B_TRUE);
2451 }
2452 
2453 static flow_ops_t flow_l2_ops = {
2454         flow_l2_accept_fe,
2455         flow_l2_hash_fe,
2456         flow_l2_match_fe,
2457         flow_generic_insert_fe,
2458         flow_l2_hash,
2459         {flow_l2_accept}
2460 };
2461 
2462 static flow_ops_t flow_ip_ops = {
2463         flow_ip_accept_fe,
2464         flow_ip_hash_fe,
2465         flow_ip_match_fe,
2466         flow_ip_insert_fe,
2467         flow_ip_hash,
2468         {flow_l2_accept, flow_ip_accept}
2469 };
2470 
2471 static flow_ops_t flow_ip_proto_ops = {
2472         flow_ip_proto_accept_fe,
2473         flow_ip_proto_hash_fe,
2474         flow_ip_proto_match_fe,
2475         flow_generic_insert_fe,
2476         flow_ip_proto_hash,
2477         {flow_l2_accept, flow_ip_accept}
2478 };
2479 
2480 static flow_ops_t flow_transport_ops = {
2481         flow_transport_accept_fe,
2482         flow_transport_hash_fe,
2483         flow_transport_match_fe,
2484         flow_generic_insert_fe,
2485         flow_transport_hash,
2486         {flow_l2_accept, flow_ip_accept, flow_transport_accept}
2487 };
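
/*
 * The fo_accept chains above compose the parsers in layer order (a reading
 * of the ops vectors, not a new mechanism): a transport table runs
 * flow_l2_accept() to find the SAP and L2 header size, then
 * flow_ip_accept() for the protocol and L3 header size, and finally
 * flow_transport_accept() for the ports, each stage consuming the offsets
 * produced by the previous one.  A failure at any stage rejects the packet
 * for that table.
 */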
2488 
2489 static flow_tab_info_t flow_tab_info_list[] = {
2490         {&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_LOCAL, 2},
2491         {&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_REMOTE, 2},
2492         {&flow_ip_ops, FLOW_IP_DSFIELD, 1},
2493         {&flow_ip_proto_ops, FLOW_IP_PROTOCOL, 256},
2494         {&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_LOCAL, 1024},
2495         {&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_REMOTE, 1024}
2496 };
2497 
2498 #define FLOW_MAX_TAB_INFO \
2499         ((sizeof (flow_tab_info_list)) / sizeof (flow_tab_info_t))
2500 
2501 static flow_tab_info_t *
2502 mac_flow_tab_info_get(flow_mask_t mask)
2503 {
2504         int     i;
2505 
2506         for (i = 0; i < FLOW_MAX_TAB_INFO; i++) {
2507                 if (mask == flow_tab_info_list[i].fti_mask)
2508                         return (&flow_tab_info_list[i]);
2509         }
2510         return (NULL);
2511 }
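
/*
 * Illustrative lookups (masks assumed): a flow specified with
 * FLOW_IP_PROTOCOL | FLOW_ULP_PORT_LOCAL resolves to flow_transport_ops
 * with 1024 buckets, while FLOW_IP_DSFIELD resolves to flow_ip_ops with a
 * single bucket.  A mask combination not present in flow_tab_info_list[],
 * e.g. FLOW_ULP_PORT_LOCAL on its own, returns NULL and is treated as an
 * unsupported flow mask by callers.
 */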