/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright 2019, Joyent, Inc.
 */

#include <sys/strsun.h>
#include <sys/sdt.h>
#include <sys/mac.h>
#include <sys/mac_impl.h>
#include <sys/mac_client_impl.h>
#include <sys/mac_stat.h>
#include <sys/dls.h>
#include <sys/dls_impl.h>
#include <sys/mac_soft_ring.h>
#include <sys/ethernet.h>
#include <sys/cpupart.h>
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/vlan.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/sctp.h>

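/*
 * Snapshot of the byte, packet and error counters exported for a flow.
 * flow_stat_update() fills one in from the flow's SRS counters before
 * copying the values out to the flow's named kstats.
 */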
typedef struct flow_stats_s {
	uint64_t	fs_obytes;
	uint64_t	fs_opackets;
	uint64_t	fs_oerrors;
	uint64_t	fs_ibytes;
	uint64_t	fs_ipackets;
	uint64_t	fs_ierrors;
} flow_stats_t;

/* Global flow table; this will become a per-exclusive-zone table later. */
static mod_hash_t	*flow_hash;
static krwlock_t	flow_tab_lock;

static kmem_cache_t	*flow_cache;
static kmem_cache_t	*flow_tab_cache;
static flow_ops_t	flow_l2_ops;

typedef struct {
	const char	*fs_name;
	uint_t		fs_offset;
} flow_stats_info_t;

#define	FS_OFF(f)	(offsetof(flow_stats_t, f))
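/*
 * The order of this list determines both the layout of the named kstats
 * created by flow_stat_init() and the order in which flow_stat_update()
 * copies counters out of a flow_stats_t via fs_offset.
 */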
static flow_stats_info_t flow_stats_list[] = {
	{"rbytes",	FS_OFF(fs_ibytes)},
	{"ipackets",	FS_OFF(fs_ipackets)},
	{"ierrors",	FS_OFF(fs_ierrors)},
	{"obytes",	FS_OFF(fs_obytes)},
	{"opackets",	FS_OFF(fs_opackets)},
	{"oerrors",	FS_OFF(fs_oerrors)}
};
#define	FS_SIZE		(sizeof (flow_stats_list) / sizeof (flow_stats_info_t))

/*
 * Returns the flow_tab_info_t matching the given flow mask, or NULL if
 * the mask is not legal.
 */
static flow_tab_info_t	*mac_flow_tab_info_get(flow_mask_t);

static void
flow_stat_init(kstat_named_t *knp)
{
	int	i;

	for (i = 0; i < FS_SIZE; i++, knp++) {
		kstat_named_init(knp, flow_stats_list[i].fs_name,
		    KSTAT_DATA_UINT64);
	}
}

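/*
 * kstat update callback for a flow. Aggregates the counters of the flow's
 * Rx SRSes (interrupt, poll and local paths) and its Tx SRS into a
 * flow_stats_t snapshot, then copies the snapshot into the named kstats.
 * Multicast flows have no SRSes, so their counters remain zero.
 */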
static int
flow_stat_update(kstat_t *ksp, int rw)
{
	flow_entry_t		*fep = ksp->ks_private;
	kstat_named_t		*knp = ksp->ks_data;
	uint64_t		*statp;
	int			i;
	mac_rx_stats_t		*mac_rx_stat;
	mac_tx_stats_t		*mac_tx_stat;
	flow_stats_t		flow_stats;
	mac_soft_ring_set_t	*mac_srs;

	if (rw != KSTAT_READ)
		return (EACCES);

	bzero(&flow_stats, sizeof (flow_stats_t));

	for (i = 0; i < fep->fe_rx_srs_cnt; i++) {
		mac_srs = (mac_soft_ring_set_t *)fep->fe_rx_srs[i];
		if (mac_srs == NULL) /* Multicast flow */
			break;
		mac_rx_stat = &mac_srs->srs_rx.sr_stat;

		flow_stats.fs_ibytes += mac_rx_stat->mrs_intrbytes +
		    mac_rx_stat->mrs_pollbytes + mac_rx_stat->mrs_lclbytes;

		flow_stats.fs_ipackets += mac_rx_stat->mrs_intrcnt +
		    mac_rx_stat->mrs_pollcnt + mac_rx_stat->mrs_lclcnt;

		flow_stats.fs_ierrors += mac_rx_stat->mrs_ierrors;
	}

	mac_srs = (mac_soft_ring_set_t *)fep->fe_tx_srs;
	if (mac_srs == NULL) /* Multicast flow */
		goto done;
	mac_tx_stat = &mac_srs->srs_tx.st_stat;

	flow_stats.fs_obytes = mac_tx_stat->mts_obytes;
	flow_stats.fs_opackets = mac_tx_stat->mts_opackets;
	flow_stats.fs_oerrors = mac_tx_stat->mts_oerrors;

done:
	for (i = 0; i < FS_SIZE; i++, knp++) {
		statp = (uint64_t *)
		    ((uchar_t *)&flow_stats + flow_stats_list[i].fs_offset);
		knp->value.ui64 = *statp;
	}
	return (0);
}

static void
flow_stat_create(flow_entry_t *fep)
{
	kstat_t		*ksp;
	kstat_named_t	*knp;
	uint_t		nstats = FS_SIZE;

	/*
	 * For now, flow entries are only manipulated and visible from the
	 * global zone.
	 */
	ksp = kstat_create_zone("unix", 0, (char *)fep->fe_flow_name, "flow",
	    KSTAT_TYPE_NAMED, nstats, 0, GLOBAL_ZONEID);
	if (ksp == NULL)
		return;

	ksp->ks_update = flow_stat_update;
	ksp->ks_private = fep;
	fep->fe_ksp = ksp;

	knp = (kstat_named_t *)ksp->ks_data;
	flow_stat_init(knp);
	kstat_install(ksp);
}

void
flow_stat_destroy(flow_entry_t *fep)
{
	if (fep->fe_ksp != NULL) {
		kstat_delete(fep->fe_ksp);
		fep->fe_ksp = NULL;
	}
}

/*
 * Initialize the global flow table structures.
 */
void
mac_flow_init()
{
	flow_cache = kmem_cache_create("flow_entry_cache",
	    sizeof (flow_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	flow_tab_cache = kmem_cache_create("flow_tab_cache",
	    sizeof (flow_tab_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	flow_hash = mod_hash_create_extended("flow_hash",
	    100, mod_hash_null_keydtor, mod_hash_null_valdtor,
	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
	rw_init(&flow_tab_lock, NULL, RW_DEFAULT, NULL);
}

/*
 * Clean up and release the global flow table structures.
 */
void
mac_flow_fini()
{
	kmem_cache_destroy(flow_cache);
	kmem_cache_destroy(flow_tab_cache);
	mod_hash_destroy_hash(flow_hash);
	rw_destroy(&flow_tab_lock);
}

/*
 * mac_flow_create(): create a flow_entry_t.
 */
int
mac_flow_create(flow_desc_t *fd, mac_resource_props_t *mrp, char *name,
    void *client_cookie, uint_t type, flow_entry_t **flentp)
{
	flow_entry_t	*flent = *flentp;
	int		err = 0;

	if (mrp != NULL) {
		err = mac_validate_props(NULL, mrp);
		if (err != 0)
			return (err);
	}

	if (flent == NULL) {
		flent = kmem_cache_alloc(flow_cache, KM_SLEEP);
		bzero(flent, sizeof (*flent));
		mutex_init(&flent->fe_lock, NULL, MUTEX_DEFAULT, NULL);
		cv_init(&flent->fe_cv, NULL, CV_DEFAULT, NULL);

		/* Initialize the receiver function to a safe routine */
		flent->fe_cb_fn = (flow_fn_t)mac_pkt_drop;
		flent->fe_index = -1;
	}
	(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);

	/* This is an initial flow; it will be configured later */
	if (fd == NULL) {
		*flentp = flent;
		return (0);
	}

	flent->fe_client_cookie = client_cookie;
	flent->fe_type = type;

	/* Save the flow descriptor */
	bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));

	if (mrp != NULL) {
		/*
		 * We have already set fe_resource_props for a link.
		 */
		if (type & FLOW_USER) {
			bcopy(mrp, &flent->fe_resource_props,
			    sizeof (mac_resource_props_t));
		}
		/*
		 * The effective resource list should reflect the priority
		 * that we set implicitly.
		 */
		if (!(mrp->mrp_mask & MRP_PRIORITY))
			mrp->mrp_mask |= MRP_PRIORITY;
		if (type & FLOW_USER)
			mrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
		else
			mrp->mrp_priority = MPL_LINK_DEFAULT;
		bzero(mrp->mrp_pool, MAXPATHLEN);
		bzero(&mrp->mrp_cpus, sizeof (mac_cpus_t));
		bcopy(mrp, &flent->fe_effective_props,
		    sizeof (mac_resource_props_t));
	}
	flow_stat_create(flent);

	*flentp = flent;
	return (0);
}

/*
 * Validate a flow entry and add it to a flow table.
 */
int
mac_flow_add(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_entry_t	**headp, **p;
	flow_ops_t	*ops = &ft->ft_ops;
	flow_mask_t	mask;
	uint32_t	index;
	int		err;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));

	/*
	 * Check for invalid bits in the mask.
	 */
	mask = flent->fe_flow_desc.fd_mask;
	if ((mask & ft->ft_mask) == 0 || (mask & ~ft->ft_mask) != 0)
		return (EOPNOTSUPP);

	/*
	 * Validate the flent.
	 */
	if ((err = ops->fo_accept_fe(ft, flent)) != 0) {
		DTRACE_PROBE3(accept_failed, flow_tab_t *, ft,
		    flow_entry_t *, flent, int, err);
		return (err);
	}

	/*
	 * The flent is valid. Now calculate its hash and insert it
	 * into the hash table.
	 */
	index = ops->fo_hash_fe(ft, flent);

	/*
	 * We did not need a lock up until now because we were
	 * not accessing the flow table.
	 */
	rw_enter(&ft->ft_lock, RW_WRITER);
	headp = &ft->ft_table[index];

	/*
	 * Check for a duplicate flow.
	 */
	for (p = headp; *p != NULL; p = &(*p)->fe_next) {
		if ((*p)->fe_flow_desc.fd_mask !=
		    flent->fe_flow_desc.fd_mask)
			continue;

		if (ft->ft_ops.fo_match_fe(ft, *p, flent)) {
			rw_exit(&ft->ft_lock);
			DTRACE_PROBE3(dup_flow, flow_tab_t *, ft,
			    flow_entry_t *, flent, int, err);
			return (EALREADY);
		}
	}

	/*
	 * Insert the flow into the hash list.
	 */
	err = ops->fo_insert_fe(ft, headp, flent);
	if (err != 0) {
		rw_exit(&ft->ft_lock);
		DTRACE_PROBE3(insert_failed, flow_tab_t *, ft,
		    flow_entry_t *, flent, int, err);
		return (err);
	}

	/*
	 * Save the hash index so it can be used by mac_flow_remove().
	 */
	flent->fe_index = (int)index;

	/*
	 * Save the flow tab back reference.
	 */
	flent->fe_flow_tab = ft;
	FLOW_MARK(flent, FE_FLOW_TAB);
	ft->ft_flow_count++;
	rw_exit(&ft->ft_lock);
	return (0);
}

/*
 * Remove a flow from a mac client's subflow table
 */
void
mac_flow_rem_subflow(flow_entry_t *flent)
{
	flow_tab_t		*ft = flent->fe_flow_tab;
	mac_client_impl_t	*mcip = ft->ft_mcip;
	mac_handle_t		mh = (mac_handle_t)ft->ft_mip;

	ASSERT(MAC_PERIM_HELD(mh));

	mac_flow_remove(ft, flent, B_FALSE);
	if (flent->fe_mcip == NULL) {
		/*
		 * The interface is not yet plumbed and mac_client_flow_add
		 * was not done.
		 */
		if (FLOW_TAB_EMPTY(ft)) {
			mac_flow_tab_destroy(ft);
			mcip->mci_subflow_tab = NULL;
		}
	} else {
		mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
		mac_link_flow_clean((mac_client_handle_t)mcip, flent);
	}
	mac_fastpath_enable(mh);
}

/*
 * Add a flow to a mac client's subflow table and instantiate the flow
 * in the mac by creating the associated SRSs etc.
 */
int
mac_flow_add_subflow(mac_client_handle_t mch, flow_entry_t *flent,
    boolean_t instantiate_flow)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
	mac_handle_t		mh = (mac_handle_t)mcip->mci_mip;
	flow_tab_info_t		*ftinfo;
	flow_mask_t		mask;
	flow_tab_t		*ft;
	int			err;
	boolean_t		ft_created = B_FALSE;

	ASSERT(MAC_PERIM_HELD(mh));

	if ((err = mac_fastpath_disable(mh)) != 0)
		return (err);

	/*
	 * If the subflow table already exists, just add the new subflow
	 * to it; otherwise we create a new subflow table below.
	 */
	ft = mcip->mci_subflow_tab;
	if (ft == NULL) {
		mask = flent->fe_flow_desc.fd_mask;
		/*
		 * Try to create a new table and then add the subflow to the
		 * newly created subflow table
		 */
		if ((ftinfo = mac_flow_tab_info_get(mask)) == NULL) {
			mac_fastpath_enable(mh);
			return (EOPNOTSUPP);
		}

		mac_flow_tab_create(ftinfo->fti_ops, mask, ftinfo->fti_size,
		    mcip->mci_mip, &ft);
		ft_created = B_TRUE;
	}

	err = mac_flow_add(ft, flent);
	if (err != 0) {
		if (ft_created)
			mac_flow_tab_destroy(ft);
		mac_fastpath_enable(mh);
		return (err);
	}

	if (instantiate_flow) {
		/* Now activate the flow by creating its SRSs */
		ASSERT(MCIP_DATAPATH_SETUP(mcip));
		err = mac_link_flow_init((mac_client_handle_t)mcip, flent);
		if (err != 0) {
			mac_flow_remove(ft, flent, B_FALSE);
			if (ft_created)
				mac_flow_tab_destroy(ft);
			mac_fastpath_enable(mh);
			return (err);
		}
	} else {
		FLOW_MARK(flent, FE_UF_NO_DATAPATH);
	}
	if (ft_created) {
		ASSERT(mcip->mci_subflow_tab == NULL);
		ft->ft_mcip = mcip;
		mcip->mci_subflow_tab = ft;
		if (instantiate_flow)
			mac_client_update_classifier(mcip, B_TRUE);
	}
	return (0);
}

/*
 * Remove a flow entry from a flow table.
 */
void
mac_flow_remove(flow_tab_t *ft, flow_entry_t *flent, boolean_t temp)
{
	flow_entry_t	**fp;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
	if (!(flent->fe_flags & FE_FLOW_TAB))
		return;

	rw_enter(&ft->ft_lock, RW_WRITER);
	/*
	 * If this is a permanent removal from the flow table, mark the
	 * flent CONDEMNED to prevent future references. If this is a
	 * temporary removal from the table, say to update the flow
	 * descriptor, we don't mark it CONDEMNED.
	 */
	if (!temp)
		FLOW_MARK(flent, FE_CONDEMNED);
	/*
	 * Locate the specified flent.
	 */
	fp = &ft->ft_table[flent->fe_index];
	while (*fp != flent)
		fp = &(*fp)->fe_next;

	/*
	 * The flent must exist. Otherwise it's a bug.
	 */
	ASSERT(fp != NULL);
	*fp = flent->fe_next;
	flent->fe_next = NULL;

	/*
	 * Reset fe_index to -1 so any attempt to call mac_flow_remove()
	 * on a flent that is supposed to be in the table (FE_FLOW_TAB)
	 * will panic.
	 */
	flent->fe_index = -1;
	FLOW_UNMARK(flent, FE_FLOW_TAB);
	ft->ft_flow_count--;
	rw_exit(&ft->ft_lock);
}

/*
 * This is the flow lookup routine used by the mac sw classifier engine.
 */
int
mac_flow_lookup(flow_tab_t *ft, mblk_t *mp, uint_t flags, flow_entry_t **flentp)
{
	flow_state_t	s;
	flow_entry_t	*flent;
	flow_ops_t	*ops = &ft->ft_ops;
	boolean_t	retried = B_FALSE;
	int		i, err;

	s.fs_flags = flags;
retry:
	s.fs_mp = mp;

	/*
	 * Walk the list of predeclared accept functions.
	 * Each of these would accumulate enough state to allow the next
	 * accept routine to make progress.
	 */
	for (i = 0; i < FLOW_MAX_ACCEPT && ops->fo_accept[i] != NULL; i++) {
		if ((err = (ops->fo_accept[i])(ft, &s)) != 0) {
			mblk_t	*last;

			/*
			 * ENOBUFS indicates that the mp could be too short
			 * and may need a pullup.
			 */
			if (err != ENOBUFS || retried)
				return (err);

			/*
			 * The pullup is done on the last processed mblk, not
			 * the starting one. The pullup is not done if the
			 * mblk has references or if b_cont is NULL.
			 */
			last = s.fs_mp;
			if (DB_REF(last) > 1 || last->b_cont == NULL ||
			    pullupmsg(last, -1) == 0)
				return (EINVAL);

			retried = B_TRUE;
			DTRACE_PROBE2(need_pullup, flow_tab_t *, ft,
			    flow_state_t *, &s);
			goto retry;
		}
	}

	/*
	 * The packet is considered sane. We may now attempt to
	 * find the corresponding flent.
	 */
	rw_enter(&ft->ft_lock, RW_READER);
	flent = ft->ft_table[ops->fo_hash(ft, &s)];
	for (; flent != NULL; flent = flent->fe_next) {
		if (flent->fe_match(ft, flent, &s)) {
			FLOW_TRY_REFHOLD(flent, err);
			if (err != 0)
				continue;
			*flentp = flent;
			rw_exit(&ft->ft_lock);
			return (0);
		}
	}
	rw_exit(&ft->ft_lock);
	return (ENOENT);
}

/*
 * Walk the flow table.
 * The caller is assumed to have proper perimeter protection.
 */
int
mac_flow_walk_nolock(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
    void *arg)
{
	int		err, i, cnt = 0;
	flow_entry_t	*flent;

	if (ft == NULL)
		return (0);

	for (i = 0; i < ft->ft_size; i++) {
		for (flent = ft->ft_table[i]; flent != NULL;
		    flent = flent->fe_next) {
			cnt++;
			err = (*fn)(flent, arg);
			if (err != 0)
				return (err);
		}
	}
	VERIFY(cnt == ft->ft_flow_count);
	return (0);
}

/*
 * Same as the above except that the table's rwlock is taken as writer
 * for protection here.
 */
int
mac_flow_walk(flow_tab_t *ft, int (*fn)(flow_entry_t *, void *),
    void *arg)
{
	int	err;

	if (ft == NULL)
		return (0);

	rw_enter(&ft->ft_lock, RW_WRITER);
	err = mac_flow_walk_nolock(ft, fn, arg);
	rw_exit(&ft->ft_lock);
	return (err);
}

static boolean_t mac_flow_clean(flow_entry_t *);

/*
 * Destroy a flow entry. Called when the last reference on a flow is released.
 */
void
mac_flow_destroy(flow_entry_t *flent)
{
	ASSERT(flent->fe_refcnt == 0);

	if ((flent->fe_type & FLOW_USER) != 0) {
		ASSERT(mac_flow_clean(flent));
	} else {
		mac_flow_cleanup(flent);
	}
	mac_misc_stat_delete(flent);
	mutex_destroy(&flent->fe_lock);
	cv_destroy(&flent->fe_cv);
	flow_stat_destroy(flent);
	kmem_cache_free(flow_cache, flent);
}

/*
 * XXX eric
 * The MAC_FLOW_PRIORITY checks in mac_resource_ctl_set() and
 * mac_link_flow_modify() should really be moved/reworked into the
 * two functions below. This would consolidate all the mac property
 * checking in one place. I'm leaving this alone for now since it's
 * out of scope of the new flows work.
 */
/* ARGSUSED */
uint32_t
mac_flow_modify_props(flow_entry_t *flent, mac_resource_props_t *mrp)
{
	uint32_t		changed_mask = 0;
	mac_resource_props_t	*fmrp = &flent->fe_effective_props;
	int			i;

	if ((mrp->mrp_mask & MRP_MAXBW) != 0 &&
	    (!(fmrp->mrp_mask & MRP_MAXBW) ||
	    (fmrp->mrp_maxbw != mrp->mrp_maxbw))) {
		changed_mask |= MRP_MAXBW;
		if (mrp->mrp_maxbw == MRP_MAXBW_RESETVAL) {
			fmrp->mrp_mask &= ~MRP_MAXBW;
			fmrp->mrp_maxbw = 0;
		} else {
			fmrp->mrp_mask |= MRP_MAXBW;
			fmrp->mrp_maxbw = mrp->mrp_maxbw;
		}
	}

	if ((mrp->mrp_mask & MRP_PRIORITY) != 0) {
		if (fmrp->mrp_priority != mrp->mrp_priority)
			changed_mask |= MRP_PRIORITY;
		if (mrp->mrp_priority == MPL_RESET) {
			fmrp->mrp_priority = MPL_SUBFLOW_DEFAULT;
			fmrp->mrp_mask &= ~MRP_PRIORITY;
		} else {
			fmrp->mrp_priority = mrp->mrp_priority;
			fmrp->mrp_mask |= MRP_PRIORITY;
		}
	}

	/* modify fanout */
	if ((mrp->mrp_mask & MRP_CPUS) != 0) {
		if ((fmrp->mrp_ncpus == mrp->mrp_ncpus) &&
		    (fmrp->mrp_fanout_mode == mrp->mrp_fanout_mode)) {
			for (i = 0; i < mrp->mrp_ncpus; i++) {
				if (mrp->mrp_cpu[i] != fmrp->mrp_cpu[i])
					break;
			}
			if (i == mrp->mrp_ncpus) {
				/*
				 * The new set of cpus passed is exactly
				 * the same as the existing set.
				 */
				return (changed_mask);
			}
		}
		changed_mask |= MRP_CPUS;
		MAC_COPY_CPUS(mrp, fmrp);
	}

	/*
	 * Modify the rings property.
	 */
	if (mrp->mrp_mask & MRP_RX_RINGS || mrp->mrp_mask & MRP_TX_RINGS)
		mac_set_rings_effective(flent->fe_mcip);

	if ((mrp->mrp_mask & MRP_POOL) != 0) {
		if (strcmp(fmrp->mrp_pool, mrp->mrp_pool) != 0)
			changed_mask |= MRP_POOL;
		if (strlen(mrp->mrp_pool) == 0)
			fmrp->mrp_mask &= ~MRP_POOL;
		else
			fmrp->mrp_mask |= MRP_POOL;
		(void) strncpy(fmrp->mrp_pool, mrp->mrp_pool, MAXPATHLEN);
	}
	return (changed_mask);
}

void
mac_flow_modify(flow_tab_t *ft, flow_entry_t *flent, mac_resource_props_t *mrp)
{
	uint32_t		changed_mask;
	mac_client_impl_t	*mcip = flent->fe_mcip;
	mac_resource_props_t	*mcip_mrp = MCIP_RESOURCE_PROPS(mcip);
	mac_resource_props_t	*emrp = MCIP_EFFECTIVE_PROPS(mcip);
	cpupart_t		*cpupart = NULL;
	boolean_t		use_default = B_FALSE;

	ASSERT(flent != NULL);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));

	rw_enter(&ft->ft_lock, RW_WRITER);

	/* Update the cached values inside the subflow entry */
	changed_mask = mac_flow_modify_props(flent, mrp);
	rw_exit(&ft->ft_lock);
	/*
	 * Push the changed parameters to the scheduling code in the
	 * SRSs, so they take effect right away.
	 */
	if (changed_mask & MRP_MAXBW) {
		mac_srs_update_bwlimit(flent, mrp);
		/*
		 * If the bandwidth is changed, we may have to change
		 * the number of soft rings to be used for fanout.
		 * Call mac_flow_update_fanout() if MAC_BIND_CPU
		 * is not set and there is no user supplied cpu
		 * info. This applies only to links at this time.
		 */
		if (!(flent->fe_type & FLOW_USER) &&
		    !(changed_mask & MRP_CPUS) &&
		    !(mcip_mrp->mrp_mask & MRP_CPUS_USERSPEC)) {
			mac_fanout_setup(mcip, flent, mcip_mrp,
			    mac_rx_deliver, mcip, NULL, NULL);
		}
	}
	if (mrp->mrp_mask & MRP_PRIORITY)
		mac_flow_update_priority(mcip, flent);

	if (changed_mask & MRP_CPUS)
		mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL,
		    NULL);

	if (mrp->mrp_mask & MRP_POOL) {
		pool_lock();
		cpupart = mac_pset_find(mrp, &use_default);
		mac_fanout_setup(mcip, flent, mrp, mac_rx_deliver, mcip, NULL,
		    cpupart);
		mac_set_pool_effective(use_default, cpupart, mrp, emrp);
		pool_unlock();
	}
}

/*
 * This function waits for a certain condition to be met and is generally
 * used before a destructive or quiescing operation.
 */
void
mac_flow_wait(flow_entry_t *flent, mac_flow_state_t event)
{
	mutex_enter(&flent->fe_lock);
	flent->fe_flags |= FE_WAITER;

	switch (event) {
	case FLOW_DRIVER_UPCALL:
		/*
		 * We want to make sure the driver upcalls have finished before
		 * we signal the Rx SRS worker to quit.
		 */
		while (flent->fe_refcnt != 1)
			cv_wait(&flent->fe_cv, &flent->fe_lock);
		break;

	case FLOW_USER_REF:
		/*
		 * Wait for the fe_user_refcnt to drop to 0. The flow has
		 * been removed from the global flow hash.
		 */
		ASSERT(!(flent->fe_flags & FE_G_FLOW_HASH));
		while (flent->fe_user_refcnt != 0)
			cv_wait(&flent->fe_cv, &flent->fe_lock);
		break;

	default:
		ASSERT(0);
	}

	flent->fe_flags &= ~FE_WAITER;
	mutex_exit(&flent->fe_lock);
}

static boolean_t
mac_flow_clean(flow_entry_t *flent)
{
	ASSERT(flent->fe_next == NULL);
	ASSERT(flent->fe_tx_srs == NULL);
	ASSERT(flent->fe_rx_srs_cnt == 0 && flent->fe_rx_srs[0] == NULL);
	ASSERT(flent->fe_mbg == NULL);

	return (B_TRUE);
}

void
mac_flow_cleanup(flow_entry_t *flent)
{
	if ((flent->fe_type & FLOW_USER) == 0) {
		ASSERT((flent->fe_mbg == NULL && flent->fe_mcip != NULL) ||
		    (flent->fe_mbg != NULL && flent->fe_mcip == NULL));
		ASSERT(flent->fe_refcnt == 0);
	} else {
		ASSERT(flent->fe_refcnt == 1);
	}

	if (flent->fe_mbg != NULL) {
		ASSERT(flent->fe_tx_srs == NULL);
		/* This is a multicast or broadcast flow entry */
		mac_bcast_grp_free(flent->fe_mbg);
		flent->fe_mbg = NULL;
	}

	if (flent->fe_tx_srs != NULL) {
		ASSERT(flent->fe_mbg == NULL);
		mac_srs_free(flent->fe_tx_srs);
		flent->fe_tx_srs = NULL;
	}

	/*
	 * In the normal case fe_rx_srs_cnt is 1. However, in the error
	 * case, when mac_unicast_add fails, we may not have set up any
	 * SRS, in which case fe_rx_srs_cnt will be zero.
	 */
	if (flent->fe_rx_srs_cnt != 0) {
		ASSERT(flent->fe_rx_srs_cnt == 1);
		mac_srs_free(flent->fe_rx_srs[0]);
		flent->fe_rx_srs[0] = NULL;
		flent->fe_rx_srs_cnt = 0;
	}
	ASSERT(flent->fe_rx_srs[0] == NULL);
}

void
mac_flow_get_desc(flow_entry_t *flent, flow_desc_t *fd)
{
	/*
	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
	 * Updates to the fe_flow_desc happen under the fe_lock
	 * after removing the flent from the flow table.
	 */
	mutex_enter(&flent->fe_lock);
	bcopy(&flent->fe_flow_desc, fd, sizeof (*fd));
	mutex_exit(&flent->fe_lock);
}

/*
 * Update a field of a flow entry. The mac perimeter ensures that
 * this is the only thread doing a modify operation on this mac end point.
 * So the flow table can't change or disappear. The ft_lock protects access
 * to the flow entry, and holding the lock ensures that there isn't any thread
 * accessing the flow entry or attempting a flow table lookup. However,
 * data threads that are using the flow entry based on the old descriptor
 * will continue to use the flow entry. If strong coherence is required
 * then the flow will have to be quiesced before the descriptor can be
 * changed.
 */
void
mac_flow_set_desc(flow_entry_t *flent, flow_desc_t *fd)
{
	flow_tab_t	*ft = flent->fe_flow_tab;
	flow_desc_t	old_desc;
	int		err;

	if (ft == NULL) {
		/*
		 * The flow hasn't yet been inserted into the table,
		 * so only the caller knows about this flow; however, for
		 * uniformity we grab the fe_lock here.
		 */
		mutex_enter(&flent->fe_lock);
		bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
		mutex_exit(&flent->fe_lock);
		return;
	}

	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));

	/*
	 * Need to remove the flow entry from the table and reinsert it
	 * into a potentially different hash line. The hash depends on
	 * the new descriptor fields. However, access to fe_desc itself
	 * is always under the fe_lock. This helps log and stat functions
	 * see a self-consistent fe_flow_desc.
	 */
	mac_flow_remove(ft, flent, B_TRUE);
	old_desc = flent->fe_flow_desc;

	mutex_enter(&flent->fe_lock);
	bcopy(fd, &flent->fe_flow_desc, sizeof (*fd));
	mutex_exit(&flent->fe_lock);

	if (mac_flow_add(ft, flent) != 0) {
		/*
		 * The add failed, say due to an invalid flow descriptor.
		 * Undo the update.
		 */
		flent->fe_flow_desc = old_desc;
		err = mac_flow_add(ft, flent);
		ASSERT(err == 0);
	}
}

void
mac_flow_set_name(flow_entry_t *flent, const char *name)
{
	flow_tab_t	*ft = flent->fe_flow_tab;

	if (ft == NULL) {
		/*
		 * The flow hasn't yet been inserted into the table,
		 * so only the caller knows about this flow
		 */
		(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
	} else {
		ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
	}

	mutex_enter(&flent->fe_lock);
	(void) strlcpy(flent->fe_flow_name, name, MAXFLOWNAMELEN);
	mutex_exit(&flent->fe_lock);
}

/*
 * Return the client-private cookie that was associated with
 * the flow when it was created.
 */
void *
mac_flow_get_client_cookie(flow_entry_t *flent)
{
	return (flent->fe_client_cookie);
}

/*
 * Forward declarations.
 */
static uint32_t	flow_l2_hash(flow_tab_t *, flow_state_t *);
static uint32_t	flow_l2_hash_fe(flow_tab_t *, flow_entry_t *);
static int	flow_l2_accept(flow_tab_t *, flow_state_t *);
static uint32_t	flow_ether_hash(flow_tab_t *, flow_state_t *);
static uint32_t	flow_ether_hash_fe(flow_tab_t *, flow_entry_t *);
static int	flow_ether_accept(flow_tab_t *, flow_state_t *);

/*
 * Create flow table.
 */
void
mac_flow_tab_create(flow_ops_t *ops, flow_mask_t mask, uint_t size,
    mac_impl_t *mip, flow_tab_t **ftp)
{
	flow_tab_t	*ft;
	flow_ops_t	*new_ops;

	ft = kmem_cache_alloc(flow_tab_cache, KM_SLEEP);
	bzero(ft, sizeof (*ft));

	ft->ft_table = kmem_zalloc(size * sizeof (flow_entry_t *), KM_SLEEP);

	/*
	 * We make a copy of the ops vector instead of just pointing to it
	 * because we might want to customize the ops vector on a per table
	 * basis (e.g. for optimization).
	 */
	new_ops = &ft->ft_ops;
	bcopy(ops, new_ops, sizeof (*ops));
	ft->ft_mask = mask;
	ft->ft_size = size;
	ft->ft_mip = mip;

	/*
	 * Optimizations for DL_ETHER media.
	 */
	if (mip->mi_info.mi_nativemedia == DL_ETHER) {
		if (new_ops->fo_hash == flow_l2_hash)
			new_ops->fo_hash = flow_ether_hash;
		if (new_ops->fo_hash_fe == flow_l2_hash_fe)
			new_ops->fo_hash_fe = flow_ether_hash_fe;
		if (new_ops->fo_accept[0] == flow_l2_accept)
			new_ops->fo_accept[0] = flow_ether_accept;
	}
	*ftp = ft;
}

void
mac_flow_l2tab_create(mac_impl_t *mip, flow_tab_t **ftp)
{
	mac_flow_tab_create(&flow_l2_ops, FLOW_LINK_DST | FLOW_LINK_VID,
	    1024, mip, ftp);
}

/*
 * Destroy flow table.
 */
void
mac_flow_tab_destroy(flow_tab_t *ft)
{
	if (ft == NULL)
		return;

	ASSERT(ft->ft_flow_count == 0);
	kmem_free(ft->ft_table, ft->ft_size * sizeof (flow_entry_t *));
	bzero(ft, sizeof (*ft));
	kmem_cache_free(flow_tab_cache, ft);
}

/*
 * Add a new flow entry to the global flow hash table
 */
int
mac_flow_hash_add(flow_entry_t *flent)
{
	int	err;

	rw_enter(&flow_tab_lock, RW_WRITER);
	err = mod_hash_insert(flow_hash,
	    (mod_hash_key_t)flent->fe_flow_name, (mod_hash_val_t)flent);
	if (err != 0) {
		rw_exit(&flow_tab_lock);
		return (EEXIST);
	}
	/* Mark as inserted into the global flow hash table */
	FLOW_MARK(flent, FE_G_FLOW_HASH);
	rw_exit(&flow_tab_lock);
	return (err);
}

/*
 * Remove a flow entry from the global flow hash table
 */
void
mac_flow_hash_remove(flow_entry_t *flent)
{
	mod_hash_val_t	val;

	rw_enter(&flow_tab_lock, RW_WRITER);
	VERIFY(mod_hash_remove(flow_hash,
	    (mod_hash_key_t)flent->fe_flow_name, &val) == 0);

	/* Clear the mark that says inserted into the global flow hash table */
	FLOW_UNMARK(flent, FE_G_FLOW_HASH);
	rw_exit(&flow_tab_lock);
}

/*
 * Retrieve a flow entry from the global flow hash table.
 */
int
mac_flow_lookup_byname(char *name, flow_entry_t **flentp)
{
	int		err;
	flow_entry_t	*flent;

	rw_enter(&flow_tab_lock, RW_READER);
	err = mod_hash_find(flow_hash, (mod_hash_key_t)name,
	    (mod_hash_val_t *)&flent);
	if (err != 0) {
		rw_exit(&flow_tab_lock);
		return (ENOENT);
	}
	ASSERT(flent != NULL);
	FLOW_USER_REFHOLD(flent);
	rw_exit(&flow_tab_lock);

	*flentp = flent;
	return (0);
}

/*
 * Initialize or release mac client flows by walking the subflow table.
 * These are typically invoked during plumb/unplumb of links.
 */

static int
mac_link_init_flows_cb(flow_entry_t *flent, void *arg)
{
	mac_client_impl_t	*mcip = arg;

	if (mac_link_flow_init(arg, flent) != 0) {
		cmn_err(CE_WARN, "Failed to initialize flow '%s' on link '%s'",
		    flent->fe_flow_name, mcip->mci_name);
	} else {
		FLOW_UNMARK(flent, FE_UF_NO_DATAPATH);
	}
	return (0);
}

void
mac_link_init_flows(mac_client_handle_t mch)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;

	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
	    mac_link_init_flows_cb, mcip);
	/*
	 * If the mac client had subflow(s) configured before plumb, change
	 * its receive function to mac_rx_srs_subflow_process and, in case
	 * of hardware classification, disable polling.
	 */
	mac_client_update_classifier(mcip, B_TRUE);
}

boolean_t
mac_link_has_flows(mac_client_handle_t mch)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;

	if (!FLOW_TAB_EMPTY(mcip->mci_subflow_tab))
		return (B_TRUE);

	return (B_FALSE);
}

static int
mac_link_release_flows_cb(flow_entry_t *flent, void *arg)
{
	FLOW_MARK(flent, FE_UF_NO_DATAPATH);
	mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
	mac_link_flow_clean(arg, flent);
	return (0);
}

void
mac_link_release_flows(mac_client_handle_t mch)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;

	/*
	 * Change the mci_flent callback back to mac_rx_srs_process()
	 * because flows are about to be deactivated.
	 */
	mac_client_update_classifier(mcip, B_FALSE);
	(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
	    mac_link_release_flows_cb, mcip);
}

void
mac_rename_flow(flow_entry_t *fep, const char *new_name)
{
	mac_flow_set_name(fep, new_name);
	if (fep->fe_ksp != NULL) {
		flow_stat_destroy(fep);
		flow_stat_create(fep);
	}
}

/*
 * mac_link_flow_init()
 * Internal flow interface used for allocating SRSs and related
 * data structures. Not meant to be used by mac clients.
 */
int
mac_link_flow_init(mac_client_handle_t mch, flow_entry_t *sub_flow)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
	mac_impl_t		*mip = mcip->mci_mip;
	int			err;

	ASSERT(mch != NULL);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	if ((err = mac_datapath_setup(mcip, sub_flow, SRST_FLOW)) != 0)
		return (err);

	sub_flow->fe_mcip = mcip;

	return (0);
}

/*
 * mac_link_flow_add()
 * Used by flowadm(1M) or kernel mac clients for creating flows.
 */
int
mac_link_flow_add(datalink_id_t linkid, char *flow_name,
    flow_desc_t *flow_desc, mac_resource_props_t *mrp)
{
	flow_entry_t		*flent = NULL;
	int			err;
	dls_dl_handle_t		dlh;
	dls_link_t		*dlp;
	boolean_t		link_held = B_FALSE;
	boolean_t		hash_added = B_FALSE;
	mac_perim_handle_t	mph;

	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err == 0) {
		FLOW_USER_REFRELE(flent);
		return (EEXIST);
	}

	/*
	 * First create a flow entry given the description provided
	 * by the caller.
	 */
	err = mac_flow_create(flow_desc, mrp, flow_name, NULL,
	    FLOW_USER | FLOW_OTHER, &flent);

	if (err != 0)
		return (err);

	/*
	 * We've got a local variable referencing this flow now, so we need
	 * to hold it. We'll release this flow before returning.
	 * All failures until we return will undo any action that may
	 * internally have held the flow, so the last REFRELE will assure
	 * a clean freeing of resources.
	 */
	FLOW_REFHOLD(flent);

	flent->fe_link_id = linkid;
	FLOW_MARK(flent, FE_INCIPIENT);

	err = mac_perim_enter_by_linkid(linkid, &mph);
	if (err != 0) {
		FLOW_FINAL_REFRELE(flent);
		return (err);
	}

	/*
	 * dls will eventually be merged with mac, so it's ok
	 * to call dls' internal functions.
	 */
	err = dls_devnet_hold_link(linkid, &dlh, &dlp);
	if (err != 0)
		goto bail;

	link_held = B_TRUE;

	/*
	 * Add the flow to the global flow table. This table will be per
	 * exclusive zone so each zone can have its own flow namespace;
	 * RFE 6625651 will fix this.
	 */
	if ((err = mac_flow_hash_add(flent)) != 0)
		goto bail;

	hash_added = B_TRUE;

	/*
	 * Do not allow flows to be configured on an anchor VNIC.
	 */
	if (mac_capab_get(dlp->dl_mh, MAC_CAPAB_ANCHOR_VNIC, NULL)) {
		err = ENOTSUP;
		goto bail;
	}

	/*
	 * Add the subflow to the subflow table. Also instantiate the flow
	 * in the mac if there is an active user (we check if the MAC client's
	 * datapath has been setup).
	 */
	err = mac_flow_add_subflow(dlp->dl_mch, flent,
	    MCIP_DATAPATH_SETUP((mac_client_impl_t *)dlp->dl_mch));
	if (err != 0)
		goto bail;

	FLOW_UNMARK(flent, FE_INCIPIENT);
	dls_devnet_rele_link(dlh, dlp);
	mac_perim_exit(mph);
	return (0);

bail:
	if (hash_added)
		mac_flow_hash_remove(flent);

	if (link_held)
		dls_devnet_rele_link(dlh, dlp);

	/*
	 * Wait for any transient global flow hash refs to clear
	 * and then release the creation reference on the flow.
	 */
	mac_flow_wait(flent, FLOW_USER_REF);
	FLOW_FINAL_REFRELE(flent);
	mac_perim_exit(mph);
	return (err);
}

/*
 * mac_link_flow_clean()
 * Internal flow interface used for freeing SRSs and related
 * data structures. Not meant to be used by mac clients.
 */
void
mac_link_flow_clean(mac_client_handle_t mch, flow_entry_t *sub_flow)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
	mac_impl_t		*mip = mcip->mci_mip;
	boolean_t		last_subflow;

	ASSERT(mch != NULL);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	/*
	 * This sub flow entry may have failed to be fully initialized by
	 * mac_link_flow_init(). If so, simply return.
	 */
	if (sub_flow->fe_mcip == NULL)
		return;

	last_subflow = FLOW_TAB_EMPTY(mcip->mci_subflow_tab);
	/*
	 * Tear down the data path.
	 */
	mac_datapath_teardown(mcip, sub_flow, SRST_FLOW);
	sub_flow->fe_mcip = NULL;

	/*
	 * Delete the SRSs associated with this subflow. If this is being
	 * driven by flowadm(1M) then the subflow will be deleted by
	 * dls_rem_flow. However, if this is a result of the interface being
	 * unplumbed then the subflow itself won't be deleted.
	 */
	mac_flow_cleanup(sub_flow);

	/*
	 * If all the subflows are gone, re-enable some of the functionality
	 * we disabled when adding a subflow, such as polling.
	 */
	if (last_subflow) {
		/*
		 * The subflow table itself is not protected by any locks or
		 * refcnts. Hence quiesce the client upfront before clearing
		 * mci_subflow_tab.
		 */
		mac_client_quiesce(mcip);
		mac_client_update_classifier(mcip, B_FALSE);
		mac_flow_tab_destroy(mcip->mci_subflow_tab);
		mcip->mci_subflow_tab = NULL;
		mac_client_restart(mcip);
	}
}

/*
 * mac_link_flow_remove()
 * Used by flowadm(1M) or kernel mac clients for removing flows.
 */
int
mac_link_flow_remove(char *flow_name)
{
	flow_entry_t		*flent;
	mac_perim_handle_t	mph;
	int			err;
	datalink_id_t		linkid;

	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err != 0)
		return (err);

	linkid = flent->fe_link_id;
	FLOW_USER_REFRELE(flent);

	/*
	 * The perim must be acquired before acquiring any other references
	 * to maintain the lock and perimeter hierarchy. Please note the
	 * FLOW_USER_REFRELE above.
	 */
	err = mac_perim_enter_by_linkid(linkid, &mph);
	if (err != 0)
		return (err);

	/*
	 * Note the second lookup of the flow, because a concurrent thread
	 * may have removed it already while we were waiting to enter the
	 * link's perimeter.
	 */
	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err != 0) {
		mac_perim_exit(mph);
		return (err);
	}
	FLOW_USER_REFRELE(flent);

	/*
	 * Remove the flow from the subflow table and deactivate the flow
	 * by quiescing and removing its SRSs.
	 */
	mac_flow_rem_subflow(flent);

	/*
	 * Finally, remove the flow from the global table.
	 */
	mac_flow_hash_remove(flent);

	/*
	 * Wait for any transient global flow hash refs to clear
	 * and then release the creation reference on the flow.
	 */
	mac_flow_wait(flent, FLOW_USER_REF);
	FLOW_FINAL_REFRELE(flent);

	mac_perim_exit(mph);

	return (0);
}

/*
 * mac_link_flow_modify()
 * Modifies the properties of a flow identified by its name.
 */
int
mac_link_flow_modify(char *flow_name, mac_resource_props_t *mrp)
{
	flow_entry_t		*flent;
	mac_client_impl_t	*mcip;
	int			err = 0;
	mac_perim_handle_t	mph;
	datalink_id_t		linkid;
	flow_tab_t		*flow_tab;

	err = mac_validate_props(NULL, mrp);
	if (err != 0)
		return (err);

	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err != 0)
		return (err);

	linkid = flent->fe_link_id;
	FLOW_USER_REFRELE(flent);

	/*
	 * The perim must be acquired before acquiring any other references
	 * to maintain the lock and perimeter hierarchy. Please note the
	 * FLOW_USER_REFRELE above.
	 */
	err = mac_perim_enter_by_linkid(linkid, &mph);
	if (err != 0)
		return (err);

	/*
	 * Note the second lookup of the flow, because a concurrent thread
	 * may have removed it already while we were waiting to enter the
	 * link's perimeter.
	 */
	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err != 0) {
		mac_perim_exit(mph);
		return (err);
	}
	FLOW_USER_REFRELE(flent);

	/*
	 * If this flow is attached to a MAC client, then pass the request
	 * along to the client.
	 * Otherwise, just update the cached values.
	 */
	mcip = flent->fe_mcip;
	mac_update_resources(mrp, &flent->fe_resource_props, B_TRUE);
	if (mcip != NULL) {
		if ((flow_tab = mcip->mci_subflow_tab) == NULL)
			err = ENOENT;
		else
			mac_flow_modify(flow_tab, flent, mrp);
	} else {
		(void) mac_flow_modify_props(flent, mrp);
	}

	mac_perim_exit(mph);
	return (err);
}

/*
 * State structure and misc functions used by mac_link_flow_walk().
 */
typedef struct {
	int	(*ws_func)(mac_flowinfo_t *, void *);
	void	*ws_arg;
} flow_walk_state_t;

static void
mac_link_flowinfo_copy(mac_flowinfo_t *finfop, flow_entry_t *flent)
{
	(void) strlcpy(finfop->fi_flow_name, flent->fe_flow_name,
	    MAXFLOWNAMELEN);
	finfop->fi_link_id = flent->fe_link_id;
	finfop->fi_flow_desc = flent->fe_flow_desc;
	finfop->fi_resource_props = flent->fe_resource_props;
}

static int
mac_link_flow_walk_cb(flow_entry_t *flent, void *arg)
{
	flow_walk_state_t	*statep = arg;
	mac_flowinfo_t		*finfo;
	int			err;

	finfo = kmem_zalloc(sizeof (*finfo), KM_SLEEP);
	mac_link_flowinfo_copy(finfo, flent);
	err = statep->ws_func(finfo, statep->ws_arg);
	kmem_free(finfo, sizeof (*finfo));
	return (err);
}

/*
 * mac_link_flow_walk()
 * Invokes callback 'func' for all flows belonging to the specified link.
 */
int
mac_link_flow_walk(datalink_id_t linkid,
    int (*func)(mac_flowinfo_t *, void *), void *arg)
{
	mac_client_impl_t	*mcip;
	mac_perim_handle_t	mph;
	flow_walk_state_t	state;
	dls_dl_handle_t		dlh;
	dls_link_t		*dlp;
	int			err;

	err = mac_perim_enter_by_linkid(linkid, &mph);
	if (err != 0)
		return (err);

	err = dls_devnet_hold_link(linkid, &dlh, &dlp);
	if (err != 0) {
		mac_perim_exit(mph);
		return (err);
	}

	mcip = (mac_client_impl_t *)dlp->dl_mch;
	state.ws_func = func;
	state.ws_arg = arg;

	err = mac_flow_walk_nolock(mcip->mci_subflow_tab,
	    mac_link_flow_walk_cb, &state);

	dls_devnet_rele_link(dlh, dlp);
	mac_perim_exit(mph);
	return (err);
}

/*
 * mac_link_flow_info()
 * Retrieves information about a specific flow.
 */
int
mac_link_flow_info(char *flow_name, mac_flowinfo_t *finfo)
{
	flow_entry_t	*flent;
	int		err;

	err = mac_flow_lookup_byname(flow_name, &flent);
	if (err != 0)
		return (err);

	mac_link_flowinfo_copy(finfo, flent);
	FLOW_USER_REFRELE(flent);
	return (0);
}

/*
 * Hash function macro that takes an Ethernet address and VLAN id as input:
 * the three low-order octets of the address are summed, XORed with the
 * VLAN id, and reduced modulo the table size.
 */
#define	HASH_ETHER_VID(a, v, s)	\
	((((uint32_t)(a)[3] + (a)[4] + (a)[5]) ^ (v)) % (s))

/*
 * Generic layer-2 address hashing function that takes an address and address
 * length as input. This is the DJB hash function.
 */
static uint32_t
flow_l2_addrhash(uint8_t *addr, size_t addrlen, size_t htsize)
{
	uint32_t	hash = 5381;
	size_t		i;

	for (i = 0; i < addrlen; i++)
		hash = ((hash << 5) + hash) + addr[i];
	return (hash % htsize);
}

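/*
 * Returns true if the current mblk ends before `end', i.e. the bytes
 * we are about to examine are not all present in this mblk.
 */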
#define	PKT_TOO_SMALL(s, end)	((s)->fs_mp->b_wptr < (end))

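/*
 * If `start' sits exactly at the end of the current mblk, advance to
 * the next mblk in the chain and point `start' at its first byte;
 * bail out of the caller with EINVAL if there is no next mblk.
 */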
#define	CHECK_AND_ADJUST_START_PTR(s, start) {		\
	if ((s)->fs_mp->b_wptr == (start)) {		\
		mblk_t	*next = (s)->fs_mp->b_cont;	\
		if (next == NULL)			\
			return (EINVAL);		\
							\
		(s)->fs_mp = next;			\
		(start) = next->b_rptr;			\
	}						\
}

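/*
 * Layer 2 match function: compares the VLAN id and destination address
 * recorded in the flow state against the flow descriptor.
 */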
/* ARGSUSED */
static boolean_t
flow_l2_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l2info_t	*l2 = &s->fs_l2info;
	flow_desc_t	*fd = &flent->fe_flow_desc;

	return (l2->l2_vid == fd->fd_vid &&
	    bcmp(l2->l2_daddr, fd->fd_dst_mac, fd->fd_mac_len) == 0);
}

/*
 * Layer 2 hash function.
 * Must be paired with flow_l2_accept() within a set of flow_ops
 * because it assumes the dest address is already extracted.
 */
static uint32_t
flow_l2_hash(flow_tab_t *ft, flow_state_t *s)
{
	return (flow_l2_addrhash(s->fs_l2info.l2_daddr,
	    ft->ft_mip->mi_type->mt_addr_length, ft->ft_size));
}

/*
 * This is the generic layer 2 accept function.
 * It makes use of mac_header_info() to extract the header length,
 * sap, vlan ID and destination address.
 */
static int
flow_l2_accept(flow_tab_t *ft, flow_state_t *s)
{
	boolean_t		is_ether;
	flow_l2info_t		*l2 = &s->fs_l2info;
	mac_header_info_t	mhi;
	int			err;

	is_ether = (ft->ft_mip->mi_info.mi_nativemedia == DL_ETHER);
	if ((err = mac_header_info((mac_handle_t)ft->ft_mip,
	    s->fs_mp, &mhi)) != 0) {
		if (err == EINVAL)
			err = ENOBUFS;

		return (err);
	}

	l2->l2_start = s->fs_mp->b_rptr;
	l2->l2_daddr = (uint8_t *)mhi.mhi_daddr;

	if (is_ether && mhi.mhi_bindsap == ETHERTYPE_VLAN &&
	    ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
		struct ether_vlan_header	*evhp =
		    (struct ether_vlan_header *)l2->l2_start;

		if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
			return (ENOBUFS);

		l2->l2_sap = ntohs(evhp->ether_type);
		l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
		l2->l2_hdrsize = sizeof (*evhp);
	} else {
		l2->l2_sap = mhi.mhi_bindsap;
		l2->l2_vid = 0;
		l2->l2_hdrsize = (uint32_t)mhi.mhi_hdrsize;
	}
	return (0);
}

/*
 * flow_ether_hash()/accept() are optimized versions of flow_l2_hash()/
 * accept(). The notable difference is that dest address is now extracted
 * by hash() rather than by accept(). This saves a few memory references
 * for flow tables that do not care about mac addresses.
 */
static uint32_t
flow_ether_hash(flow_tab_t *ft, flow_state_t *s)
{
	flow_l2info_t			*l2 = &s->fs_l2info;
	struct ether_vlan_header	*evhp;

	evhp = (struct ether_vlan_header *)l2->l2_start;
	l2->l2_daddr = evhp->ether_dhost.ether_addr_octet;
	return (HASH_ETHER_VID(l2->l2_daddr, l2->l2_vid, ft->ft_size));
}

static uint32_t
flow_ether_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;

	ASSERT((fd->fd_mask & FLOW_LINK_VID) != 0 || fd->fd_vid == 0);
	return (HASH_ETHER_VID(fd->fd_dst_mac, fd->fd_vid, ft->ft_size));
}

/* ARGSUSED */
static int
flow_ether_accept(flow_tab_t *ft, flow_state_t *s)
{
	flow_l2info_t			*l2 = &s->fs_l2info;
	struct ether_vlan_header	*evhp;
	uint16_t			sap;

	evhp = (struct ether_vlan_header *)s->fs_mp->b_rptr;
	l2->l2_start = (uchar_t *)evhp;

	if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (struct ether_header)))
		return (ENOBUFS);

	if ((sap = ntohs(evhp->ether_tpid)) == ETHERTYPE_VLAN &&
	    ((s->fs_flags & FLOW_IGNORE_VLAN) == 0)) {
		if (PKT_TOO_SMALL(s, l2->l2_start + sizeof (*evhp)))
			return (ENOBUFS);

		l2->l2_sap = ntohs(evhp->ether_type);
		l2->l2_vid = VLAN_ID(ntohs(evhp->ether_tci));
		l2->l2_hdrsize = sizeof (struct ether_vlan_header);
	} else {
		l2->l2_sap = sap;
		l2->l2_vid = 0;
		l2->l2_hdrsize = sizeof (struct ether_header);
	}
	return (0);
}

/*
 * Validates a layer 2 flow entry.
 */
static int
flow_l2_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;

	/*
	 * The dest address is mandatory, and zero-length addresses are
	 * not yet supported.
	 */
	if ((fd->fd_mask & FLOW_LINK_DST) == 0 || fd->fd_mac_len == 0)
		return (EINVAL);

	if ((fd->fd_mask & FLOW_LINK_VID) != 0) {
		/*
		 * VLAN flows are only supported over ethernet macs.
		 */
		if (ft->ft_mip->mi_info.mi_nativemedia != DL_ETHER)
			return (EINVAL);

		if (fd->fd_vid == 0)
			return (EINVAL);
	}
	flent->fe_match = flow_l2_match;
	return (0);
}

/*
 * Calculates hash index of flow entry.
 */
static uint32_t
flow_l2_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;

	ASSERT((fd->fd_mask & FLOW_LINK_VID) == 0 && fd->fd_vid == 0);
	return (flow_l2_addrhash(fd->fd_dst_mac,
	    ft->ft_mip->mi_type->mt_addr_length, ft->ft_size));
}

/*
 * This is used for duplicate flow checking.
 */
/* ARGSUSED */
static boolean_t
flow_l2_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
{
	flow_desc_t	*fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;

	ASSERT(fd1->fd_mac_len == fd2->fd_mac_len && fd1->fd_mac_len != 0);
	return (bcmp(&fd1->fd_dst_mac, &fd2->fd_dst_mac,
	    fd1->fd_mac_len) == 0 && fd1->fd_vid == fd2->fd_vid);
}

/*
 * Generic flow entry insertion function.
 * Used by flow tables that do not have ordering requirements.
 */
/* ARGSUSED */
static int
flow_generic_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
    flow_entry_t *flent)
{
	ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));

	if (*headp != NULL) {
		ASSERT(flent->fe_next == NULL);
		flent->fe_next = *headp;
	}
	*headp = flent;
	return (0);
}

/*
 * IP version independent DSField matching function.
 */
/* ARGSUSED */
static boolean_t
flow_ip_dsfield_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_desc_t	*fd = &flent->fe_flow_desc;

	switch (l3info->l3_version) {
	case IPV4_VERSION: {
		ipha_t	*ipha = (ipha_t *)l3info->l3_start;

		return ((ipha->ipha_type_of_service &
		    fd->fd_dsfield_mask) == fd->fd_dsfield);
	}
	case IPV6_VERSION: {
		ip6_t	*ip6h = (ip6_t *)l3info->l3_start;

		return ((IPV6_FLOW_TCLASS(ip6h->ip6_vcf) &
		    fd->fd_dsfield_mask) == fd->fd_dsfield);
	}
	default:
		return (B_FALSE);
	}
}

/*
 * IP v4 and v6 address matching.
 * The netmask only needs to be applied to the packet address, not to the
 * flow_desc, since fd_local_addr/fd_remote_addr are premasked subnets.
 */

/* ARGSUSED */
static boolean_t
flow_ip_v4_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_desc_t	*fd = &flent->fe_flow_desc;
	ipha_t		*ipha = (ipha_t *)l3info->l3_start;
	in_addr_t	addr;

	addr = (l3info->l3_dst_or_src ? ipha->ipha_dst : ipha->ipha_src);
	if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
		return ((addr & V4_PART_OF_V6(fd->fd_local_netmask)) ==
		    V4_PART_OF_V6(fd->fd_local_addr));
	}
	return ((addr & V4_PART_OF_V6(fd->fd_remote_netmask)) ==
	    V4_PART_OF_V6(fd->fd_remote_addr));
}

/* ARGSUSED */
static boolean_t
flow_ip_v6_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_desc_t	*fd = &flent->fe_flow_desc;
	ip6_t		*ip6h = (ip6_t *)l3info->l3_start;
	in6_addr_t	*addrp;

	addrp = (l3info->l3_dst_or_src ? &ip6h->ip6_dst : &ip6h->ip6_src);
	if ((fd->fd_mask & FLOW_IP_LOCAL) != 0) {
		return (V6_MASK_EQ(*addrp, fd->fd_local_netmask,
		    fd->fd_local_addr));
	}
	return (V6_MASK_EQ(*addrp, fd->fd_remote_netmask, fd->fd_remote_addr));
}

/* ARGSUSED */
static boolean_t
flow_ip_proto_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_desc_t	*fd = &flent->fe_flow_desc;

	return (l3info->l3_protocol == fd->fd_protocol);
}

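/*
 * Hash function for IP flows. As a side effect, it records in
 * l3_dst_or_src whether the match function should compare the packet's
 * destination address (inbound for local flows, outbound for remote
 * flows) or its source address.
 */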
static uint32_t
flow_ip_hash(flow_tab_t *ft, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;
	flow_mask_t	mask = ft->ft_mask;

	if ((mask & FLOW_IP_LOCAL) != 0) {
		l3info->l3_dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
	} else if ((mask & FLOW_IP_REMOTE) != 0) {
		l3info->l3_dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
	} else if ((mask & FLOW_IP_DSFIELD) != 0) {
		/*
		 * DSField flents are arranged as a single list.
		 */
		return (0);
	}
	/*
	 * IP addr flents are hashed into two lists, v4 or v6.
	 */
	ASSERT(ft->ft_size >= 2);
	return ((l3info->l3_version == IPV4_VERSION) ? 0 : 1);
}

static uint32_t
flow_ip_proto_hash(flow_tab_t *ft, flow_state_t *s)
{
	flow_l3info_t	*l3info = &s->fs_l3info;

	return (l3info->l3_protocol % ft->ft_size);
}

/* ARGSUSED */
static int
flow_ip_accept(flow_tab_t *ft, flow_state_t *s)
{
	flow_l2info_t	*l2info = &s->fs_l2info;
	flow_l3info_t	*l3info = &s->fs_l3info;
	uint16_t	sap = l2info->l2_sap;
	uchar_t		*l3_start;

	l3_start = l2info->l2_start + l2info->l2_hdrsize;

	/*
	 * Adjust start pointer if we're at the end of an mblk.
	 */
	CHECK_AND_ADJUST_START_PTR(s, l3_start);

	l3info->l3_start = l3_start;
	if (!OK_32PTR(l3_start))
		return (EINVAL);

	switch (sap) {
	case ETHERTYPE_IP: {
		ipha_t	*ipha = (ipha_t *)l3_start;

		if (IPH_HDR_VERSION(ipha) != IPV4_VERSION)
			return (EINVAL);
		if (PKT_TOO_SMALL(s, l3_start + IP_SIMPLE_HDR_LENGTH))
			return (ENOBUFS);

		l3info->l3_hdrsize = IPH_HDR_LENGTH(ipha);
		l3info->l3_protocol = ipha->ipha_protocol;
		l3info->l3_version = IPV4_VERSION;
		l3info->l3_fragmented =
		    IS_V4_FRAGMENT(ipha->ipha_fragment_offset_and_flags);
		break;
	}
	case ETHERTYPE_IPV6: {
		ip6_t		*ip6h = (ip6_t *)l3_start;
		ip6_frag_t	*frag = NULL;
		uint16_t	ip6_hdrlen;
		uint8_t		nexthdr;
		int		errno;

		errno = mac_ip_hdr_length_v6(ip6h, s->fs_mp->b_wptr,
		    &ip6_hdrlen, &nexthdr, &frag);
		/*
		 * ENOBUFS is not ENOSPC, but the semantics are the
		 * same for this caller.
		 */
		if (errno != 0)
			return (errno == ENOSPC ? ENOBUFS : errno);
		l3info->l3_hdrsize = ip6_hdrlen;
		l3info->l3_protocol = nexthdr;
		l3info->l3_version = IPV6_VERSION;
		l3info->l3_fragmented = (frag != NULL);
		break;
	}
	default:
		return (EINVAL);
	}
	return (0);
}

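/*
 * Validates an IP protocol flow entry; only TCP, UDP, SCTP, ICMP and
 * ICMPv6 flows are supported.
 */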
/* ARGSUSED */
static int
flow_ip_proto_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
{
	flow_desc_t	*fd = &flent->fe_flow_desc;

	switch (fd->fd_protocol) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_SCTP:
	case IPPROTO_ICMP:
	case IPPROTO_ICMPV6:
		flent->fe_match = flow_ip_proto_match;
		return (0);
	default:
		return (EINVAL);
	}
}

2017 /* ARGSUSED */
2018 static int
2019 flow_ip_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
2020 {
2021 flow_desc_t *fd = &flent->fe_flow_desc;
2022 flow_mask_t mask;
2023 uint8_t version;
2024 in6_addr_t *addr, *netmask;
2025
2026 /*
2027 * DSField does not require a IP version.
2028 */
2029 if (fd->fd_mask == FLOW_IP_DSFIELD) {
2030 if (fd->fd_dsfield_mask == 0)
2031 return (EINVAL);
2032
2033 flent->fe_match = flow_ip_dsfield_match;
2034 return (0);
2035 }
2036
2037 /*
2038 * IP addresses must come with a version to avoid ambiguity.
2039 */
2040 if ((fd->fd_mask & FLOW_IP_VERSION) == 0)
2041 return (EINVAL);
2042
2043 version = fd->fd_ipversion;
2044 if (version != IPV4_VERSION && version != IPV6_VERSION)
2045 return (EINVAL);
2046
2047 mask = fd->fd_mask & ~FLOW_IP_VERSION;
2048 switch (mask) {
2049 case FLOW_IP_LOCAL:
2050 addr = &fd->fd_local_addr;
2051 netmask = &fd->fd_local_netmask;
2052 break;
2053 case FLOW_IP_REMOTE:
2054 addr = &fd->fd_remote_addr;
2055 netmask = &fd->fd_remote_netmask;
2056 break;
2057 default:
2058 return (EINVAL);
2059 }
2060
2061 /*
2062 * Apply netmask onto specified address.
2063 */
2064 V6_MASK_COPY(*addr, *netmask, *addr);
2065 if (version == IPV4_VERSION) {
2066 ipaddr_t v4addr = V4_PART_OF_V6((*addr));
2067 ipaddr_t v4mask = V4_PART_OF_V6((*netmask));
2068
2069 if (v4addr == 0 || v4mask == 0)
2070 return (EINVAL);
2071 flent->fe_match = flow_ip_v4_match;
2072 } else {
2073 if (IN6_IS_ADDR_UNSPECIFIED(addr) ||
2074 IN6_IS_ADDR_UNSPECIFIED(netmask))
2075 return (EINVAL);
2076 flent->fe_match = flow_ip_v6_match;
2077 }
2078 return (0);
2079 }
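
/*
 * Example (hypothetical descriptor): a flow on the local subnet
 * 192.168.1.0/24 would carry
 *
 *	fd_mask		FLOW_IP_VERSION | FLOW_IP_LOCAL;
 *	fd_ipversion	IPV4_VERSION;
 *	fd_local_addr / fd_local_netmask
 *			192.168.1.0 / 255.255.255.0, stored in the V4
 *			part of the in6_addr_t;
 *
 * for which flow_ip_accept_fe() masks fd_local_addr in place and sets
 * fe_match to flow_ip_v4_match.
 */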
2080
2081 static uint32_t
2082 flow_ip_proto_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
2083 {
2084 flow_desc_t *fd = &flent->fe_flow_desc;
2085
2086 return (fd->fd_protocol % ft->ft_size);
2087 }
2088
2089 static uint32_t
2090 flow_ip_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
2091 {
2092 flow_desc_t *fd = &flent->fe_flow_desc;
2093
2094 /*
2095 * DSField flents are arranged as a single list.
2096 */
2097 if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
2098 return (0);
2099
	/*
	 * IP address flents are hashed into one of two lists: v4 or v6.
	 */
2103 ASSERT(ft->ft_size >= 2);
2104 return ((fd->fd_ipversion == IPV4_VERSION) ? 0 : 1);
2105 }
2106
2107 /* ARGSUSED */
2108 static boolean_t
2109 flow_ip_proto_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
2110 {
2111 flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
2112
2113 return (fd1->fd_protocol == fd2->fd_protocol);
2114 }
2115
2116 /* ARGSUSED */
2117 static boolean_t
2118 flow_ip_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
2119 {
2120 flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
2121 in6_addr_t *a1, *m1, *a2, *m2;
2122
2123 ASSERT(fd1->fd_mask == fd2->fd_mask);
2124 if (fd1->fd_mask == FLOW_IP_DSFIELD) {
2125 return (fd1->fd_dsfield == fd2->fd_dsfield &&
2126 fd1->fd_dsfield_mask == fd2->fd_dsfield_mask);
2127 }
2128
2129 /*
2130 * flow_ip_accept_fe() already validated the version.
2131 */
2132 ASSERT((fd1->fd_mask & FLOW_IP_VERSION) != 0);
2133 if (fd1->fd_ipversion != fd2->fd_ipversion)
2134 return (B_FALSE);
2135
2136 switch (fd1->fd_mask & ~FLOW_IP_VERSION) {
2137 case FLOW_IP_LOCAL:
2138 a1 = &fd1->fd_local_addr;
2139 m1 = &fd1->fd_local_netmask;
2140 a2 = &fd2->fd_local_addr;
2141 m2 = &fd2->fd_local_netmask;
2142 break;
2143 case FLOW_IP_REMOTE:
2144 a1 = &fd1->fd_remote_addr;
2145 m1 = &fd1->fd_remote_netmask;
2146 a2 = &fd2->fd_remote_addr;
2147 m2 = &fd2->fd_remote_netmask;
2148 break;
2149 default:
2150 /*
2151 * This is unreachable given the checks in
2152 * flow_ip_accept_fe().
2153 */
2154 return (B_FALSE);
2155 }
2156
2157 if (fd1->fd_ipversion == IPV4_VERSION) {
2158 return (V4_PART_OF_V6((*a1)) == V4_PART_OF_V6((*a2)) &&
2159 V4_PART_OF_V6((*m1)) == V4_PART_OF_V6((*m2)));
2160
2161 } else {
2162 return (IN6_ARE_ADDR_EQUAL(a1, a2) &&
2163 IN6_ARE_ADDR_EQUAL(m1, m2));
2164 }
2165 }
2166
2167 static int
2168 flow_ip_mask2plen(in6_addr_t *v6mask)
2169 {
2170 int bits;
2171 int plen = IPV6_ABITS;
2172 int i;
2173
2174 for (i = 3; i >= 0; i--) {
2175 if (v6mask->s6_addr32[i] == 0) {
2176 plen -= 32;
2177 continue;
2178 }
2179 bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1;
2180 if (bits == 0)
2181 break;
2182 plen -= bits;
2183 }
2184 return (plen);
2185 }
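
/*
 * Worked example (hypothetical /24 IPv4 netmask, stored in the V4 part
 * of an otherwise-zero in6_addr_t, i.e. s6_addr32[3] == htonl(0xffffff00)):
 *
 *	i == 3: ffs(0xffffff00) == 9, so 8 trailing zero bits and
 *		plen = 128 - 8 = 120;
 *	i == 2, 1, 0: each word is 0, so plen -= 32 each time;
 *
 * yielding plen == 24, the familiar v4 prefix length. A full /128 mask
 * instead breaks out of the loop on the first word examined (bits == 0)
 * and returns 128.
 */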
2186
2187 /* ARGSUSED */
2188 static int
2189 flow_ip_insert_fe(flow_tab_t *ft, flow_entry_t **headp,
2190 flow_entry_t *flent)
2191 {
2192 flow_entry_t **p = headp;
2193 flow_desc_t *fd0, *fd;
2194 in6_addr_t *m0, *m;
2195 int plen0, plen;
2196
2197 ASSERT(MAC_PERIM_HELD((mac_handle_t)ft->ft_mip));
2198
	/*
	 * No special ordering is needed for DSField flents; insert at
	 * the head of the list.
	 */
2202 fd0 = &flent->fe_flow_desc;
2203 if ((fd0->fd_mask & FLOW_IP_DSFIELD) != 0) {
2204 if (*p != NULL) {
2205 ASSERT(flent->fe_next == NULL);
2206 flent->fe_next = *p;
2207 }
2208 *p = flent;
2209 return (0);
2210 }
2211
	/*
	 * IP address flows are kept in descending prefix-length order,
	 * so the most specific (longest) prefix is encountered first
	 * when the list is walked from the head.
	 */
2215 m0 = ((fd0->fd_mask & FLOW_IP_LOCAL) != 0) ?
2216 &fd0->fd_local_netmask : &fd0->fd_remote_netmask;
2217 plen0 = flow_ip_mask2plen(m0);
2218 ASSERT(plen0 != 0);
2219
2220 for (; *p != NULL; p = &(*p)->fe_next) {
2221 fd = &(*p)->fe_flow_desc;
2222
		/*
		 * Normally a dsfield flent shouldn't end up on the same
		 * list as an IP address flent because flow tables are
		 * (for now) disjoint. If we decide to support both IP
		 * and dsfield in the same table in the future, this
		 * check will allow for that.
		 */
2230 if ((fd->fd_mask & FLOW_IP_DSFIELD) != 0)
2231 continue;
2232
2233 /*
2234 * We also allow for the mixing of local and remote address
2235 * flents within one list.
2236 */
2237 m = ((fd->fd_mask & FLOW_IP_LOCAL) != 0) ?
2238 &fd->fd_local_netmask : &fd->fd_remote_netmask;
2239 plen = flow_ip_mask2plen(m);
2240
2241 if (plen <= plen0)
2242 break;
2243 }
2244 if (*p != NULL) {
2245 ASSERT(flent->fe_next == NULL);
2246 flent->fe_next = *p;
2247 }
2248 *p = flent;
2249 return (0);
2250 }
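
/*
 * For illustration: inserting address flents with /16, /32 and /24
 * netmasks (in any order) leaves the bucket list as
 *
 *	*headp -> /32 -> /24 -> /16
 *
 * so a lookup walking the list from the head sees the most specific
 * prefix first.
 */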
2251
2252 /*
2253 * Transport layer protocol and port matching functions.
2254 */
2255
2256 /* ARGSUSED */
2257 static boolean_t
2258 flow_transport_lport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
2259 {
2260 flow_l3info_t *l3info = &s->fs_l3info;
2261 flow_l4info_t *l4info = &s->fs_l4info;
2262 flow_desc_t *fd = &flent->fe_flow_desc;
2263
2264 return (fd->fd_protocol == l3info->l3_protocol &&
2265 fd->fd_local_port == l4info->l4_hash_port);
2266 }
2267
2268 /* ARGSUSED */
2269 static boolean_t
2270 flow_transport_rport_match(flow_tab_t *ft, flow_entry_t *flent, flow_state_t *s)
2271 {
2272 flow_l3info_t *l3info = &s->fs_l3info;
2273 flow_l4info_t *l4info = &s->fs_l4info;
2274 flow_desc_t *fd = &flent->fe_flow_desc;
2275
2276 return (fd->fd_protocol == l3info->l3_protocol &&
2277 fd->fd_remote_port == l4info->l4_hash_port);
2278 }
2279
/*
 * Transport hash function.
 * Since a flow matches on either the local or the remote port (never
 * both), we only need to extract one of the ports for hashing and
 * matching.
 */
2286 static uint32_t
2287 flow_transport_hash(flow_tab_t *ft, flow_state_t *s)
2288 {
2289 flow_l3info_t *l3info = &s->fs_l3info;
2290 flow_l4info_t *l4info = &s->fs_l4info;
2291 uint8_t proto = l3info->l3_protocol;
2292 boolean_t dst_or_src;
2293
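	/*
	 * For a local-port table the local port is the destination on
	 * inbound traffic and the source on outbound; a remote-port
	 * table is the mirror image.
	 */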
2294 if ((ft->ft_mask & FLOW_ULP_PORT_LOCAL) != 0) {
2295 dst_or_src = ((s->fs_flags & FLOW_INBOUND) != 0);
2296 } else {
2297 dst_or_src = ((s->fs_flags & FLOW_OUTBOUND) != 0);
2298 }
2299
2300 l4info->l4_hash_port = dst_or_src ? l4info->l4_dst_port :
2301 l4info->l4_src_port;
2302
2303 return ((l4info->l4_hash_port ^ (proto << 4)) % ft->ft_size);
2304 }
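
/*
 * For illustration (hypothetical values, and assuming ports are kept in
 * network byte order on both sides, as the raw header loads below
 * suggest): an inbound TCP segment to local port 80 arriving at a
 * FLOW_ULP_PORT_LOCAL table of size 1024 hashes to
 *
 *	(htons(80) ^ (IPPROTO_TCP << 4)) % 1024
 *
 * and flow_transport_hash_fe() below computes the same index from
 * fd_local_port, so the packet and its flent land in the same bucket.
 */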
2305
/*
 * Unlike the accept() functions above, we do not need to compute the
 * header size because this is the highest layer we parse. If we want
 * to support other, higher-layer protocols, we would need to save
 * l4_hdrsize in the code below.
 */
2312
2313 /* ARGSUSED */
2314 static int
2315 flow_transport_accept(flow_tab_t *ft, flow_state_t *s)
2316 {
2317 flow_l3info_t *l3info = &s->fs_l3info;
2318 flow_l4info_t *l4info = &s->fs_l4info;
2319 uint8_t proto = l3info->l3_protocol;
2320 uchar_t *l4_start;
2321
2322 l4_start = l3info->l3_start + l3info->l3_hdrsize;
2323
2324 /*
2325 * Adjust start pointer if we're at the end of an mblk.
2326 */
2327 CHECK_AND_ADJUST_START_PTR(s, l4_start);
2328
2329 l4info->l4_start = l4_start;
2330 if (!OK_32PTR(l4_start))
2331 return (EINVAL);
2332
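	/*
	 * Reject fragments: only the first fragment of a datagram
	 * carries the transport header, so port-based classification
	 * of fragmented traffic would be incomplete.
	 */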
	if (l3info->l3_fragmented)
		return (EINVAL);
2335
2336 switch (proto) {
2337 case IPPROTO_TCP: {
2338 struct tcphdr *tcph = (struct tcphdr *)l4_start;
2339
2340 if (PKT_TOO_SMALL(s, l4_start + sizeof (*tcph)))
2341 return (ENOBUFS);
2342
2343 l4info->l4_src_port = tcph->th_sport;
2344 l4info->l4_dst_port = tcph->th_dport;
2345 break;
2346 }
2347 case IPPROTO_UDP: {
2348 struct udphdr *udph = (struct udphdr *)l4_start;
2349
2350 if (PKT_TOO_SMALL(s, l4_start + sizeof (*udph)))
2351 return (ENOBUFS);
2352
2353 l4info->l4_src_port = udph->uh_sport;
2354 l4info->l4_dst_port = udph->uh_dport;
2355 break;
2356 }
2357 case IPPROTO_SCTP: {
2358 sctp_hdr_t *sctph = (sctp_hdr_t *)l4_start;
2359
2360 if (PKT_TOO_SMALL(s, l4_start + sizeof (*sctph)))
2361 return (ENOBUFS);
2362
2363 l4info->l4_src_port = sctph->sh_sport;
2364 l4info->l4_dst_port = sctph->sh_dport;
2365 break;
2366 }
2367 default:
2368 return (EINVAL);
2369 }
2370
2371 return (0);
2372 }
2373
2374 /*
2375 * Validates transport flow entry.
2376 * The protocol field must be present.
2377 */
2378
2379 /* ARGSUSED */
2380 static int
2381 flow_transport_accept_fe(flow_tab_t *ft, flow_entry_t *flent)
2382 {
2383 flow_desc_t *fd = &flent->fe_flow_desc;
2384 flow_mask_t mask = fd->fd_mask;
2385
2386 if ((mask & FLOW_IP_PROTOCOL) == 0)
2387 return (EINVAL);
2388
2389 switch (fd->fd_protocol) {
2390 case IPPROTO_TCP:
2391 case IPPROTO_UDP:
2392 case IPPROTO_SCTP:
2393 break;
2394 default:
2395 return (EINVAL);
2396 }
2397
2398 switch (mask & ~FLOW_IP_PROTOCOL) {
2399 case FLOW_ULP_PORT_LOCAL:
2400 if (fd->fd_local_port == 0)
2401 return (EINVAL);
2402
2403 flent->fe_match = flow_transport_lport_match;
2404 break;
2405 case FLOW_ULP_PORT_REMOTE:
2406 if (fd->fd_remote_port == 0)
2407 return (EINVAL);
2408
2409 flent->fe_match = flow_transport_rport_match;
2410 break;
2411 case 0:
		/*
		 * Transport-only flows conflict with our table type.
		 */
2415 return (EOPNOTSUPP);
2416 default:
2417 return (EINVAL);
2418 }
2419
2420 return (0);
2421 }
2422
2423 static uint32_t
2424 flow_transport_hash_fe(flow_tab_t *ft, flow_entry_t *flent)
2425 {
2426 flow_desc_t *fd = &flent->fe_flow_desc;
	uint16_t port;
2428
2429 port = ((fd->fd_mask & FLOW_ULP_PORT_LOCAL) != 0) ?
2430 fd->fd_local_port : fd->fd_remote_port;
2431
2432 return ((port ^ (fd->fd_protocol << 4)) % ft->ft_size);
2433 }
2434
2435 /* ARGSUSED */
2436 static boolean_t
2437 flow_transport_match_fe(flow_tab_t *ft, flow_entry_t *f1, flow_entry_t *f2)
2438 {
2439 flow_desc_t *fd1 = &f1->fe_flow_desc, *fd2 = &f2->fe_flow_desc;
2440
2441 if (fd1->fd_protocol != fd2->fd_protocol)
2442 return (B_FALSE);
2443
2444 if ((fd1->fd_mask & FLOW_ULP_PORT_LOCAL) != 0)
2445 return (fd1->fd_local_port == fd2->fd_local_port);
2446
2447 if ((fd1->fd_mask & FLOW_ULP_PORT_REMOTE) != 0)
2448 return (fd1->fd_remote_port == fd2->fd_remote_port);
2449
2450 return (B_TRUE);
2451 }
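
/*
 * Ops vectors for each table type. The initialized fields are assumed
 * (from their use in this file; the flow_ops_t definition itself is not
 * shown here) to be, in order: accept_fe, hash_fe, match_fe, insert_fe,
 * the per-packet hash function, and the layered packet-accept chain.
 */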
2452
2453 static flow_ops_t flow_l2_ops = {
2454 flow_l2_accept_fe,
2455 flow_l2_hash_fe,
2456 flow_l2_match_fe,
2457 flow_generic_insert_fe,
2458 flow_l2_hash,
2459 {flow_l2_accept}
2460 };
2461
2462 static flow_ops_t flow_ip_ops = {
2463 flow_ip_accept_fe,
2464 flow_ip_hash_fe,
2465 flow_ip_match_fe,
2466 flow_ip_insert_fe,
2467 flow_ip_hash,
2468 {flow_l2_accept, flow_ip_accept}
2469 };
2470
2471 static flow_ops_t flow_ip_proto_ops = {
2472 flow_ip_proto_accept_fe,
2473 flow_ip_proto_hash_fe,
2474 flow_ip_proto_match_fe,
2475 flow_generic_insert_fe,
2476 flow_ip_proto_hash,
2477 {flow_l2_accept, flow_ip_accept}
2478 };
2479
2480 static flow_ops_t flow_transport_ops = {
2481 flow_transport_accept_fe,
2482 flow_transport_hash_fe,
2483 flow_transport_match_fe,
2484 flow_generic_insert_fe,
2485 flow_transport_hash,
2486 {flow_l2_accept, flow_ip_accept, flow_transport_accept}
2487 };
2488
2489 static flow_tab_info_t flow_tab_info_list[] = {
2490 {&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_LOCAL, 2},
2491 {&flow_ip_ops, FLOW_IP_VERSION | FLOW_IP_REMOTE, 2},
2492 {&flow_ip_ops, FLOW_IP_DSFIELD, 1},
2493 {&flow_ip_proto_ops, FLOW_IP_PROTOCOL, 256},
2494 {&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_LOCAL, 1024},
2495 {&flow_transport_ops, FLOW_IP_PROTOCOL | FLOW_ULP_PORT_REMOTE, 1024}
2496 };
2497
2498 #define FLOW_MAX_TAB_INFO \
2499 ((sizeof (flow_tab_info_list)) / sizeof (flow_tab_info_t))
2500
2501 static flow_tab_info_t *
2502 mac_flow_tab_info_get(flow_mask_t mask)
2503 {
2504 int i;
2505
2506 for (i = 0; i < FLOW_MAX_TAB_INFO; i++) {
2507 if (mask == flow_tab_info_list[i].fti_mask)
2508 return (&flow_tab_info_list[i]);
2509 }
2510 return (NULL);
2511 }
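
/*
 * For illustration: a flow whose descriptor mask is
 * (FLOW_IP_PROTOCOL | FLOW_ULP_PORT_LOCAL) resolves to the
 * flow_tab_info_list entry using flow_transport_ops with 1024 hash
 * buckets, whose accept chain (flow_l2_accept, flow_ip_accept,
 * flow_transport_accept) parses a packet up through the transport
 * header before matching. A mask with no exact entry in the list
 * yields NULL, i.e. an unsupported flow mask.
 */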