/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 1990 Mentat Inc.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 * Copyright (c) 2014, OmniTI Computer Consulting, Inc. All rights reserved.
 */

/*
 * This file contains the interface control functions for IP.
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/dlpi.h>
#include <sys/stropts.h>
#include <sys/strsun.h>
#include <sys/sysmacros.h>
#include <sys/strsubr.h>
#include <sys/strlog.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/debug.h>
#include <sys/zone.h>
#include <sys/sunldi.h>
#include <sys/file.h>
#include <sys/bitmap.h>
#include <sys/cpuvar.h>
#include <sys/time.h>
#include <sys/ctype.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/isa_defs.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <sys/sockio.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/igmp_var.h>
#include <sys/policy.h>
#include <sys/ethernet.h>
#include <sys/callb.h>
#include <sys/md5.h>

#include <inet/common.h>	/* for various inet/mi.h and inet/nd.h needs */
#include <inet/mi.h>
#include <inet/nd.h>
#include <inet/tunables.h>
#include <inet/arp.h>
#include <inet/ip_arp.h>
#include <inet/mib2.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/tcp.h>
#include <inet/ip_multi.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ip_if.h>
#include <inet/ip_impl.h>
#include <inet/sctp_ip.h>
#include <inet/ip_netinfo.h>
#include <inet/ilb_ip.h>

#include <netinet/igmp.h>
#include <inet/ip_listutils.h>
#include <inet/ipclassifier.h>
#include <sys/mac_client.h>
#include <sys/dld.h>
#include <sys/mac_flow.h>

#include <sys/systeminfo.h>
#include <sys/bootconf.h>

#include <sys/tsol/tndb.h>
#include <sys/tsol/tnet.h>

#include <inet/rawip_impl.h>	/* needed for icmp_stack_t */
#include <inet/udp_impl.h>	/* needed for udp_stack_t */

/* The character which tells where the ill_name ends */
#define	IPIF_SEPARATOR_CHAR	':'
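/* For example, in "hme0:1", "hme0" is the ill_name and 1 is the ipif id. */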

/* IP ioctl function table entry */
typedef struct ipft_s {
	int	ipft_cmd;
	pfi_t	ipft_pfi;
	int	ipft_min_size;
	int	ipft_flags;
} ipft_t;
#define	IPFT_F_NO_REPLY		0x1	/* IP ioctl does not expect any reply */
#define	IPFT_F_SELF_REPLY	0x2	/* ioctl callee does the ioctl reply */

static int	nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int	nd_ill_forward_set(queue_t *q, mblk_t *mp,
    char *value, caddr_t cp, cred_t *ioc_cr);

static boolean_t ill_is_quiescent(ill_t *);
static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask);
static ip_m_t	*ip_m_lookup(t_uscalar_t mac_type);
static int	ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q,
    mblk_t *mp);
static int	ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
    mblk_t *mp);
static int	ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static int	ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp,
    int ioccmd, struct linkblk *li);
static ipaddr_t	ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *);
static void	ip_wput_ioctl(queue_t *q, mblk_t *mp);
static void	ipsq_flush(ill_t *ill);

static int	ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
    queue_t *q, mblk_t *mp, boolean_t need_up);
static void	ipsq_delete(ipsq_t *);

static ipif_t	*ipif_allocate(ill_t *ill, int id, uint_t ire_type,
    boolean_t initialize, boolean_t insert, int *errorp);
static ire_t	**ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep);
static void	ipif_delete_bcast_ires(ipif_t *ipif);
static int	ipif_add_ires_v4(ipif_t *, boolean_t);
static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif,
    boolean_t isv6);
static int	ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
static void	ipif_free(ipif_t *ipif);
static void	ipif_free_tail(ipif_t *ipif);
static void	ipif_set_default(ipif_t *ipif);
static int	ipif_set_values(queue_t *q, mblk_t *mp,
    char *interf_name, uint_t *ppa);
static int	ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
    queue_t *q);
static ipif_t	*ipif_lookup_on_name(char *name, size_t namelen,
    boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid,
    ip_stack_t *);
static ipif_t	*ipif_lookup_on_name_async(char *name, size_t namelen,
    boolean_t isv6, zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func,
    int *error, ip_stack_t *);

static int	ill_alloc_ppa(ill_if_t *, ill_t *);
static void	ill_delete_interface_type(ill_if_t *);
static int	ill_dl_up(ill_t *ill, ipif_t *ipif);
static void	ill_dl_down(ill_t *ill);
static void	ill_down(ill_t *ill);
static void	ill_down_ipifs(ill_t *, boolean_t);
static void	ill_free_mib(ill_t *ill);
static void	ill_glist_delete(ill_t *);
static void	ill_phyint_reinit(ill_t *ill);
static void	ill_set_nce_router_flags(ill_t *, boolean_t);
static void	ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *);
static void	ill_replumb_tail(ipsq_t *, queue_t *, mblk_t *, void *);

static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid;
static ip_v6intfid_func_t ip_ipv4_v6intfid, ip_ipv6_v6intfid;
static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid;
static ip_v6intfid_func_t ip_ipv4_v6destintfid, ip_ipv6_v6destintfid;
static ip_v4mapinfo_func_t ip_ether_v4_mapping;
static ip_v6mapinfo_func_t ip_ether_v6_mapping;
static ip_v4mapinfo_func_t ip_ib_v4_mapping;
static ip_v6mapinfo_func_t ip_ib_v6_mapping;
static ip_v4mapinfo_func_t ip_mbcast_mapping;
static void	ip_cgtp_bcast_add(ire_t *, ip_stack_t *);
static void	ip_cgtp_bcast_delete(ire_t *, ip_stack_t *);
static void	phyint_free(phyint_t *);

static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_vrrp_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static void ill_capability_hcksum_reset_fill(ill_t *, mblk_t *);
static void ill_capability_zerocopy_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void ill_capability_zerocopy_reset_fill(ill_t *, mblk_t *);
static void ill_capability_dld_reset_fill(ill_t *, mblk_t *);
static void ill_capability_dld_ack(ill_t *, mblk_t *,
    dl_capability_sub_t *);
static void ill_capability_dld_enable(ill_t *);
static void ill_capability_ack_thr(void *);
static void ill_capability_lso_enable(ill_t *);

static ill_t	*ill_prev_usesrc(ill_t *);
static int	ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
static void	ill_disband_usesrc_group(ill_t *);
static void	ip_sioctl_garp_reply(mblk_t *, ill_t *, void *, int);

#ifdef DEBUG
static void	ill_trace_cleanup(const ill_t *);
static void	ipif_trace_cleanup(const ipif_t *);
#endif

static void	ill_dlpi_clear_deferred(ill_t *ill);

static void	phyint_flags_init(phyint_t *, t_uscalar_t);

/*
 * If we go over the memory footprint limit more than once in this msec
 * interval, we'll start pruning aggressively.
 */
int	ip_min_frag_prune_time = 0;

static ipft_t ip_ioctl_ftbl[] = {
	{ IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 },
	{ IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t),
	    IPFT_F_NO_REPLY },
	{ IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY },
	{ 0 }
};

/* Simple ICMP IP Header Template */
static ipha_t icmp_ipha = {
	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};

static uchar_t ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };

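/*
 * Media-specific parameters and handlers, indexed by DLPI mac type via
 * ip_m_lookup(): the IPv4/IPv6 saps, the functions mapping multicast IP
 * addresses to link-layer addresses, and the functions deriving IPv6
 * interface ids for each media type.
 */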
static ip_m_t	ip_m_tbl[] = {
	{ DL_ETHER, IFT_ETHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_CSMACD, IFT_ISO88023, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_TPB, IFT_ISO88024, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_TPR, IFT_ISO88025, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_FDDI, IFT_FDDI, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_IB, IFT_IB, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ib_v4_mapping, ip_ib_v6_mapping, ip_ib_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_IPV4, IFT_IPV4, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
	    ip_ipv4_v6destintfid },
	{ DL_IPV6, IFT_IPV6, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv6_v6intfid,
	    ip_ipv6_v6destintfid },
	{ DL_6TO4, IFT_6TO4, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
	    ip_nodef_v6intfid },
	{ SUNW_DL_VNI, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    NULL, NULL, ip_nodef_v6intfid, ip_nodef_v6intfid },
	{ SUNW_DL_IPMP, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    NULL, NULL, ip_ipmp_v6intfid, ip_nodef_v6intfid },
	{ DL_OTHER, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid }
};

char	ipif_loopback_name[] = "lo0";

/* These are used by all IP network modules. */
sin6_t	sin6_null;	/* Zero address for quick clears */
sin_t	sin_null;	/* Zero address for quick clears */

/* When set, search for an unused ipif_seqid */
static ipif_t	ipif_zero;

/*
 * The ppa arena is created once this many interfaces have been plumbed.
 */
uint_t	ill_no_arena = 12;	/* Settable in /etc/system */
/*
 * Allocate per-interface mibs.
 * Returns B_TRUE if ok, B_FALSE otherwise.
 * The ipsq may not yet be allocated (loopback case).
 */
static boolean_t
ill_allocate_mibs(ill_t *ill)
{
	/* Already allocated? */
	if (ill->ill_ip_mib != NULL) {
		if (ill->ill_isv6)
			ASSERT(ill->ill_icmp6_mib != NULL);
		return (B_TRUE);
	}

	ill->ill_ip_mib = kmem_zalloc(sizeof (*ill->ill_ip_mib),
	    KM_NOSLEEP);
	if (ill->ill_ip_mib == NULL) {
		return (B_FALSE);
	}

	/* Setup static information */
	SET_MIB(ill->ill_ip_mib->ipIfStatsEntrySize,
	    sizeof (mib2_ipIfStatsEntry_t));
	if (ill->ill_isv6) {
		ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6;
		SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
		    sizeof (mib2_ipv6AddrEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
		    sizeof (mib2_ipv6RouteEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
		    sizeof (mib2_ipv6NetToMediaEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
		    sizeof (ipv6_member_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
		    sizeof (ipv6_grpsrc_t));
	} else {
		ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4;
		SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
		    sizeof (mib2_ipAddrEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
		    sizeof (mib2_ipRouteEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
		    sizeof (mib2_ipNetToMediaEntry_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
		    sizeof (ip_member_t));
		SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
		    sizeof (ip_grpsrc_t));

		/*
		 * For a v4 ill, we are done at this point, because per ill
		 * icmp mibs are only used for v6.
		 */
		return (B_TRUE);
	}

	ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib),
	    KM_NOSLEEP);
	if (ill->ill_icmp6_mib == NULL) {
		kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
		ill->ill_ip_mib = NULL;
		return (B_FALSE);
	}
	/* static icmp info */
	ill->ill_icmp6_mib->ipv6IfIcmpEntrySize =
	    sizeof (mib2_ipv6IfIcmpEntry_t);
	/*
	 * The ipIfStatsIfindex and ipv6IfIcmpIndex will be assigned later
	 * after the phyint merge occurs in ipif_set_values -> ill_glist_insert
	 * -> ill_phyint_reinit
	 */
	return (B_TRUE);
}

/*
 * Completely vaporize a lower level tap and all associated interfaces.
 * ill_delete is called only out of ip_close when the device control
 * stream is being closed.
 */
void
ill_delete(ill_t *ill)
{
	ipif_t	*ipif;
	ill_t	*prev_ill;
	ip_stack_t	*ipst = ill->ill_ipst;

	/*
	 * ill_delete may be forcibly entering the ipsq. The previous
	 * ioctl may not have completed and may need to be aborted.
	 * ipsq_flush takes care of it. If we don't need to enter the
	 * ipsq forcibly, the 2nd invocation of ipsq_flush in
	 * ill_delete_tail is sufficient.
	 */
	ipsq_flush(ill);

	/*
	 * Nuke all interfaces. ipif_free will take down the interface,
	 * remove it from the list, and free the data structure.
	 * Walk down the ipif list and remove the logical interfaces
	 * first before removing the main ipif. We can't unplumb the
	 * zeroth interface first in the case of IPv6, as update_conn_ill
	 * -> ip_ll_multireq dereferences ill_ipif when checking for
	 * POINTOPOINT.
	 *
	 * If ill_ipif was not properly initialized (i.e., low on memory),
	 * then there are no interfaces to clean up. In that case just
	 * clean up the ill.
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		ipif_free(ipif);

	/*
	 * Clean out all the nce_t entries that depend on this
	 * ill for the ill_phys_addr.
	 */
	nce_flush(ill, B_TRUE);

	/* Clean up msgs on pending upcalls for mrouted */
	reset_mrt_ill(ill);

	update_conn_ill(ill, ipst);

	/*
	 * Remove multicast references added as a result of calls to
	 * ip_join_allmulti().
	 */
	ip_purge_allmulti(ill);

	/*
	 * If the ill being deleted is under IPMP, boot it out of the illgrp.
	 */
	if (IS_UNDER_IPMP(ill))
		ipmp_ill_leave_illgrp(ill);

	/*
	 * ill_down will arrange to blow off any IRE's dependent on this
	 * ILL, and shut down fragmentation reassembly.
	 */
	ill_down(ill);

	/* Let SCTP know, so that it can remove this from its list. */
	sctp_update_ill(ill, SCTP_ILL_REMOVE);

	/*
	 * Walk all CONNs that can have a reference on an ire or nce for this
	 * ill (we actually walk all that now have stale references).
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst);

	/* With IPv6 we have dce_ifindex. Cleanup for neatness */
	if (ill->ill_isv6)
		dce_cleanup(ill->ill_phyint->phyint_ifindex, ipst);

	/*
	 * If an address on this ILL is being used as a source address then
	 * clear out the pointers in other ILLs that point to this ILL.
	 */
	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
	if (ill->ill_usesrc_grp_next != NULL) {
		if (ill->ill_usesrc_ifindex == 0) {	/* usesrc ILL? */
			ill_disband_usesrc_group(ill);
		} else {	/* consumer of the usesrc ILL */
			prev_ill = ill_prev_usesrc(ill);
			prev_ill->ill_usesrc_grp_next =
			    ill->ill_usesrc_grp_next;
		}
	}
	rw_exit(&ipst->ips_ill_g_usesrc_lock);
}

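/*
 * Clear the IPIF_DUPLICATE flag (set when duplicate address detection
 * found the ipif's address already in use) and decrement the ill's count
 * of duplicate ipifs to match.
 */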
static void
ipif_non_duplicate(ipif_t *ipif)
{
	ill_t *ill = ipif->ipif_ill;
	mutex_enter(&ill->ill_lock);
	if (ipif->ipif_flags & IPIF_DUPLICATE) {
		ipif->ipif_flags &= ~IPIF_DUPLICATE;
		ASSERT(ill->ill_ipif_dup_count > 0);
		ill->ill_ipif_dup_count--;
	}
	mutex_exit(&ill->ill_lock);
}

/*
 * ill_delete_tail is called from ip_modclose after all references
 * to the closing ill are gone. The wait is done in ip_modclose.
 */
void
ill_delete_tail(ill_t *ill)
{
	mblk_t	**mpp;
	ipif_t	*ipif;
	ip_stack_t *ipst = ill->ill_ipst;

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		(void) ipif_down_tail(ipif);
	}

	ASSERT(ill->ill_ipif_dup_count == 0);

	/*
	 * If the polling capability is enabled (which signifies a direct
	 * upcall into IP and that the driver has the ill saved as a handle),
	 * we need to make sure that the unbind has completed before we let
	 * the ill disappear, so that the driver no longer has any reference
	 * to this ill.
	 */
	mutex_enter(&ill->ill_lock);
	while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS)
		cv_wait(&ill->ill_cv, &ill->ill_lock);
	mutex_exit(&ill->ill_lock);
	ASSERT(!(ill->ill_capabilities &
	    (ILL_CAPAB_DLD | ILL_CAPAB_DLD_POLL | ILL_CAPAB_DLD_DIRECT)));

	if (ill->ill_net_type != IRE_LOOPBACK)
		qprocsoff(ill->ill_rq);

	/*
	 * We do an ipsq_flush once again now. New messages could have
	 * arrived from below (M_ERROR or M_HANGUP). Similarly ioctls could
	 * also have arrived if an ioctl thread had looked up the ill before
	 * we set the ILL_CONDEMNED flag, but had not yet enqueued the ioctl
	 * when we did the ipsq_flush last time.
	 */
	ipsq_flush(ill);

	/*
	 * Free capabilities.
	 */
	if (ill->ill_hcksum_capab != NULL) {
		kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t));
		ill->ill_hcksum_capab = NULL;
	}

	if (ill->ill_zerocopy_capab != NULL) {
		kmem_free(ill->ill_zerocopy_capab,
		    sizeof (ill_zerocopy_capab_t));
		ill->ill_zerocopy_capab = NULL;
	}

	if (ill->ill_lso_capab != NULL) {
		kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
		ill->ill_lso_capab = NULL;
	}

	if (ill->ill_dld_capab != NULL) {
		kmem_free(ill->ill_dld_capab, sizeof (ill_dld_capab_t));
		ill->ill_dld_capab = NULL;
	}

	/* Clean up ill_allowed_ips* related state */
	if (ill->ill_allowed_ips != NULL) {
		ASSERT(ill->ill_allowed_ips_cnt > 0);
		kmem_free(ill->ill_allowed_ips,
		    ill->ill_allowed_ips_cnt * sizeof (in6_addr_t));
		ill->ill_allowed_ips = NULL;
		ill->ill_allowed_ips_cnt = 0;
	}

	while (ill->ill_ipif != NULL)
		ipif_free_tail(ill->ill_ipif);

	/*
	 * We have removed all references to ilm from conn and the ones joined
	 * within the kernel.
	 *
	 * We don't walk conns, mrts and ires because
	 *
	 * 1) update_conn_ill and reset_mrt_ill clean up conns and mrts.
	 * 2) ill_down -> ill_downi walks all the ires and cleans up
	 *    ill references.
	 */

	/*
	 * If this ill is an IPMP meta-interface, blow away the illgrp. This
	 * is safe to do because the illgrp has already been unlinked from the
	 * group by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find it.
	 */
	if (IS_IPMP(ill)) {
		ipmp_illgrp_destroy(ill->ill_grp);
		ill->ill_grp = NULL;
	}

	if (ill->ill_mphysaddr_list != NULL) {
		multiphysaddr_t *mpa, *tmpa;

		mpa = ill->ill_mphysaddr_list;
		ill->ill_mphysaddr_list = NULL;
		while (mpa) {
			tmpa = mpa->mpa_next;
			kmem_free(mpa, sizeof (*mpa));
			mpa = tmpa;
		}
	}
	/*
	 * Take us out of the list of ILLs. ill_glist_delete -> phyint_free
	 * could free the phyint. No more reference to the phyint after this
	 * point.
	 */
	(void) ill_glist_delete(ill);

	if (ill->ill_frag_ptr != NULL) {
		uint_t count;

		for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
			mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock);
		}
		mi_free(ill->ill_frag_ptr);
		ill->ill_frag_ptr = NULL;
		ill->ill_frag_hash_tbl = NULL;
	}

	freemsg(ill->ill_nd_lla_mp);
	/* Free all retained control messages. */
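	/*
	 * ill_first_mp_to_free through ill_last_mp_to_free are adjacent
	 * mblk_t * members of the ill_t, so the loop below walks them as
	 * an array, draining each member's b_next chain in turn.
	 */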
	mpp = &ill->ill_first_mp_to_free;
	do {
		while (mpp[0]) {
			mblk_t	*mp;
			mblk_t	*mp1;

			mp = mpp[0];
			mpp[0] = mp->b_next;
			for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
				mp1->b_next = NULL;
				mp1->b_prev = NULL;
			}
			freemsg(mp);
		}
	} while (mpp++ != &ill->ill_last_mp_to_free);

	ill_free_mib(ill);

#ifdef DEBUG
	ill_trace_cleanup(ill);
#endif

	/* The default multicast interface might have changed */
	ire_increment_multicast_generation(ipst, ill->ill_isv6);

	/* Drop refcnt here */
	netstack_rele(ill->ill_ipst->ips_netstack);
	ill->ill_ipst = NULL;
}

static void
ill_free_mib(ill_t *ill)
{
	ip_stack_t *ipst = ill->ill_ipst;

	/*
	 * MIB statistics must not be lost, so when an interface
	 * goes away the counter values will be added to the global
	 * MIBs.
	 */
	if (ill->ill_ip_mib != NULL) {
		if (ill->ill_isv6) {
			ip_mib2_add_ip_stats(&ipst->ips_ip6_mib,
			    ill->ill_ip_mib);
		} else {
			ip_mib2_add_ip_stats(&ipst->ips_ip_mib,
			    ill->ill_ip_mib);
		}

		kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
		ill->ill_ip_mib = NULL;
	}
	if (ill->ill_icmp6_mib != NULL) {
		ip_mib2_add_icmp6_stats(&ipst->ips_icmp6_mib,
		    ill->ill_icmp6_mib);
		kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib));
		ill->ill_icmp6_mib = NULL;
	}
}

/*
 * Concatenate together a physical address and a sap.
 *
 * Sap_lengths are interpreted as follows:
 *	sap_length == 0	==>	no sap
 *	sap_length > 0	==>	sap is at the head of the dlpi address
 *	sap_length < 0	==>	sap is at the tail of the dlpi address
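 *
 * For example, an Ethernet-style driver advertising sap_length == -2
 * yields a dlpi address laid out as <6-byte phys addr><2-byte sap>;
 * a positive sap_length of 2 would instead yield <2-byte sap><phys addr>.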
 */
static void
ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length,
    t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst)
{
	uint16_t sap_addr = (uint16_t)sap_src;

	if (sap_length == 0) {
		if (phys_src == NULL)
			bzero(dst, phys_length);
		else
			bcopy(phys_src, dst, phys_length);
	} else if (sap_length < 0) {
		if (phys_src == NULL)
			bzero(dst, phys_length);
		else
			bcopy(phys_src, dst, phys_length);
		bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr));
	} else {
		bcopy(&sap_addr, dst, sizeof (sap_addr));
		if (phys_src == NULL)
			bzero((char *)dst + sap_length, phys_length);
		else
			bcopy(phys_src, (char *)dst + sap_length, phys_length);
	}
}

/*
 * Generate a dl_unitdata_req mblk for the device and address given.
 * addr_length is the length of the physical portion of the address.
 * If addr is NULL include an all zero address of the specified length.
 * The total dlpi address length is addr_length plus the absolute value
 * of sap_length.
 */
mblk_t *
ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap,
    t_scalar_t sap_length)
{
	dl_unitdata_req_t *dlur;
	mblk_t	*mp;
	t_scalar_t	abs_sap_length;		/* absolute value */

	abs_sap_length = ABS(sap_length);
	mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length,
	    DL_UNITDATA_REQ);
	if (mp == NULL)
		return (NULL);
	dlur = (dl_unitdata_req_t *)mp->b_rptr;
	/* HACK: accommodate incompatible DLPI drivers */
	if (addr_length == 8)
		addr_length = 6;
	dlur->dl_dest_addr_length = addr_length + abs_sap_length;
	dlur->dl_dest_addr_offset = sizeof (*dlur);
	dlur->dl_priority.dl_min = 0;
	dlur->dl_priority.dl_max = 0;
	ill_dlur_copy_address(addr, addr_length, sap, sap_length,
	    (uchar_t *)&dlur[1]);
	return (mp);
}

/*
 * Add the pending mp to the list. There can be only 1 pending mp
 * in the list. Any exclusive ioctl that needs to wait for a response
 * from another module or driver needs to use this function to set
 * the ipx_pending_mp to the ioctl mblk and wait for the response from
 * the other module/driver. This is also used while waiting for the
 * ipif/ill/ire refcnts to drop to zero in bringing down an ipif.
 */
boolean_t
ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
    int waitfor)
{
	ipxop_t	*ipx = ipif->ipif_ill->ill_phyint->phyint_ipsq->ipsq_xop;

	ASSERT(IAM_WRITER_IPIF(ipif));
	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
	ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
	ASSERT(ipx->ipx_pending_mp == NULL);
	/*
	 * The caller may be using a different ipif than the one passed into
	 * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4
	 * ill needs to wait for the V6 ill to quiesce). So we can't ASSERT
	 * that `ipx_current_ipif == ipif'.
	 */
	ASSERT(ipx->ipx_current_ipif != NULL);

	/*
	 * M_IOCDATA from ioctls, M_ERROR/M_HANGUP/M_PROTO/M_PCPROTO from the
	 * driver.
	 */
	ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_ERROR) ||
	    (DB_TYPE(add_mp) == M_HANGUP) || (DB_TYPE(add_mp) == M_PROTO) ||
	    (DB_TYPE(add_mp) == M_PCPROTO));

	if (connp != NULL) {
		ASSERT(MUTEX_HELD(&connp->conn_lock));
		/*
		 * Return error if the conn has started closing. The conn
		 * could have finished cleaning up the pending mp list.
		 * If so we should not add another mp to the list, negating
		 * the cleanup.
		 */
		if (connp->conn_state_flags & CONN_CLOSING)
			return (B_FALSE);
	}
	mutex_enter(&ipx->ipx_lock);
	ipx->ipx_pending_ipif = ipif;
	/*
	 * Note down the queue in b_queue. This will be returned by
	 * ipsq_pending_mp_get. The caller will then use these values
	 * to restart the processing.
	 */
	add_mp->b_next = NULL;
	add_mp->b_queue = q;
	ipx->ipx_pending_mp = add_mp;
	ipx->ipx_waitfor = waitfor;
	mutex_exit(&ipx->ipx_lock);

	if (connp != NULL)
		connp->conn_oper_pending_ill = ipif->ipif_ill;

	return (B_TRUE);
}

/*
 * Retrieve the ipx_pending_mp and return it. There can be only 1 mp
 * queued in the list.
 */
mblk_t *
ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp)
{
	mblk_t	*curr = NULL;
	ipxop_t	*ipx = ipsq->ipsq_xop;

	*connpp = NULL;
	mutex_enter(&ipx->ipx_lock);
	if (ipx->ipx_pending_mp == NULL) {
		mutex_exit(&ipx->ipx_lock);
		return (NULL);
	}

	/* There can be only 1 such excl message */
	curr = ipx->ipx_pending_mp;
	ASSERT(curr->b_next == NULL);
	ipx->ipx_pending_ipif = NULL;
	ipx->ipx_pending_mp = NULL;
	ipx->ipx_waitfor = 0;
	mutex_exit(&ipx->ipx_lock);

	if (CONN_Q(curr->b_queue)) {
		/*
		 * This mp did a refhold on the conn, at the start of the ioctl.
		 * So we can safely return a pointer to the conn to the caller.
		 */
		*connpp = Q_TO_CONN(curr->b_queue);
	} else {
		*connpp = NULL;
	}
	curr->b_next = NULL;
	curr->b_prev = NULL;
	return (curr);
}

/*
 * Clean up the ioctl mp queued in ipx_pending_mp:
 * - Called in the ill_delete path.
 * - Called in the M_ERROR or M_HANGUP path on the ill.
 * - Called in the conn close path.
 *
 * Returns success on finding the pending mblk associated with the ioctl or
 * exclusive operation in progress, failure otherwise.
 */
boolean_t
ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
{
	mblk_t	*mp;
	ipxop_t	*ipx;
	queue_t	*q;
	ipif_t	*ipif;
	int	cmd;

	ASSERT(IAM_WRITER_ILL(ill));
	ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;

	mutex_enter(&ipx->ipx_lock);
	mp = ipx->ipx_pending_mp;
	if (connp != NULL) {
		if (mp == NULL || mp->b_queue != CONNP_TO_WQ(connp)) {
			/*
			 * Nothing to clean since the conn that is closing
			 * does not have a matching pending mblk in
			 * ipx_pending_mp.
			 */
			mutex_exit(&ipx->ipx_lock);
			return (B_FALSE);
		}
	} else {
		/*
		 * A non-zero ill_error signifies we are called in the
		 * M_ERROR or M_HANGUP path and we need to unconditionally
		 * abort any current ioctl and do the corresponding cleanup.
		 * A zero ill_error means we are in the ill_delete path and
		 * we do the cleanup only if there is a pending mp.
		 */
		if (mp == NULL && ill->ill_error == 0) {
			mutex_exit(&ipx->ipx_lock);
			return (B_FALSE);
		}
	}

	/* Now remove from the ipx_pending_mp */
	ipx->ipx_pending_mp = NULL;
	ipif = ipx->ipx_pending_ipif;
	ipx->ipx_pending_ipif = NULL;
	ipx->ipx_waitfor = 0;
	ipx->ipx_current_ipif = NULL;
	cmd = ipx->ipx_current_ioctl;
	ipx->ipx_current_ioctl = 0;
	ipx->ipx_current_done = B_TRUE;
	mutex_exit(&ipx->ipx_lock);

	if (mp == NULL)
		return (B_FALSE);

	q = mp->b_queue;
	mp->b_next = NULL;
	mp->b_prev = NULL;
	mp->b_queue = NULL;

	if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) {
		DTRACE_PROBE4(ipif__ioctl,
		    char *, "ipsq_pending_mp_cleanup",
		    int, cmd, ill_t *, ipif == NULL ? NULL : ipif->ipif_ill,
		    ipif_t *, ipif);
		if (connp == NULL) {
			ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL);
		} else {
			ip_ioctl_finish(q, mp, ENXIO, CONN_CLOSE, NULL);
			mutex_enter(&ipif->ipif_ill->ill_lock);
			ipif->ipif_state_flags &= ~IPIF_CHANGING;
			mutex_exit(&ipif->ipif_ill->ill_lock);
		}
	} else {
		inet_freemsg(mp);
	}
	return (B_TRUE);
}

/*
 * Called in the conn close path and ill delete path
 */
static void
ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp)
{
	ipsq_t	*ipsq;
	mblk_t	*prev;
	mblk_t	*curr;
	mblk_t	*next;
	queue_t	*wq, *rq = NULL;
	mblk_t	*tmp_list = NULL;

	ASSERT(IAM_WRITER_ILL(ill));
	if (connp != NULL)
		wq = CONNP_TO_WQ(connp);
	else
		wq = ill->ill_wq;

	/*
	 * In the case of lo0 being unplumbed, ill_wq will be NULL. Guard
	 * against this here.
	 */
	if (wq != NULL)
		rq = RD(wq);

	ipsq = ill->ill_phyint->phyint_ipsq;
	/*
	 * Clean up the ioctl mps queued in ipsq_xopq_pending_mp, if any.
	 * In the case of an ioctl from a conn, there can be only 1 mp
	 * queued on the ipsq. If an ill is being unplumbed, flush all
	 * the messages.
	 */
	mutex_enter(&ipsq->ipsq_lock);
	for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL;
	    curr = next) {
		next = curr->b_next;
		if (connp == NULL ||
		    (curr->b_queue == wq || curr->b_queue == rq)) {
			/* Unlink the mblk from the pending mp list */
			if (prev != NULL) {
				prev->b_next = curr->b_next;
			} else {
				ASSERT(ipsq->ipsq_xopq_mphead == curr);
				ipsq->ipsq_xopq_mphead = curr->b_next;
			}
			if (ipsq->ipsq_xopq_mptail == curr)
				ipsq->ipsq_xopq_mptail = prev;
			/*
			 * Create a temporary list and release the ipsq lock.
			 * New elements are added to the head of the tmp_list.
			 */
			curr->b_next = tmp_list;
			tmp_list = curr;
		} else {
			prev = curr;
		}
	}
	mutex_exit(&ipsq->ipsq_lock);

	while (tmp_list != NULL) {
		curr = tmp_list;
		tmp_list = curr->b_next;
		curr->b_next = NULL;
		curr->b_prev = NULL;
		wq = curr->b_queue;
		curr->b_queue = NULL;
		if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) {
			DTRACE_PROBE4(ipif__ioctl,
			    char *, "ipsq_xopq_mp_cleanup",
			    int, 0, ill_t *, NULL, ipif_t *, NULL);
			ip_ioctl_finish(wq, curr, ENXIO, connp != NULL ?
			    CONN_CLOSE : NO_COPYOUT, NULL);
		} else {
			/*
			 * IP-MT XXX In the case of TLI/XTI bind / optmgmt
			 * this can't be just inet_freemsg. We have to
			 * restart it; otherwise the thread will be stuck.
			 */
			inet_freemsg(curr);
		}
	}
}

/*
 * This conn has started closing. Clean up any pending ioctl from this conn.
 * STREAMS ensures that there can be at most 1 active ioctl on a stream.
 */
void
conn_ioctl_cleanup(conn_t *connp)
{
	ipsq_t	*ipsq;
	ill_t	*ill;
	boolean_t refheld;

	/*
	 * Check for a queued ioctl. If the ioctl has not yet started, the mp
	 * is pending in the list headed by ipsq_xopq_head. If the ioctl has
	 * started the mp could be present in ipx_pending_mp. Note that if
	 * conn_oper_pending_ill is NULL, the ioctl may still be in flight and
	 * not yet queued anywhere. In this case, the conn close code will wait
	 * until the conn_ref is dropped. If the stream was a tcp stream, then
	 * tcp_close will wait first until all ioctls have completed for this
	 * conn.
	 */
	mutex_enter(&connp->conn_lock);
	ill = connp->conn_oper_pending_ill;
	if (ill == NULL) {
		mutex_exit(&connp->conn_lock);
		return;
	}

	/*
	 * We may not be able to refhold the ill if the ill/ipif
	 * is changing. But we need to make sure that the ill will
	 * not vanish. So we just bump up the ill_waiter count.
	 */
	refheld = ill_waiter_inc(ill);
	mutex_exit(&connp->conn_lock);
	if (refheld) {
		if (ipsq_enter(ill, B_TRUE, NEW_OP)) {
			ill_waiter_dcr(ill);
			/*
			 * Check whether this ioctl has started and is
			 * pending. If it is not found there then check
			 * whether this ioctl has not even started and is in
			 * the ipsq_xopq list.
			 */
			if (!ipsq_pending_mp_cleanup(ill, connp))
				ipsq_xopq_mp_cleanup(ill, connp);
			ipsq = ill->ill_phyint->phyint_ipsq;
			ipsq_exit(ipsq);
			return;
		}
	}

	/*
	 * The ill is also closing and we could not bump up the
	 * ill_waiter_count or we could not enter the ipsq. Leave
	 * the cleanup to ill_delete
	 */
	mutex_enter(&connp->conn_lock);
	while (connp->conn_oper_pending_ill != NULL)
		cv_wait(&connp->conn_refcv, &connp->conn_lock);
	mutex_exit(&connp->conn_lock);
	if (refheld)
		ill_waiter_dcr(ill);
}

/*
 * ipcl_walk function for cleaning up conn_*_ill fields.
 * Note that we leave ixa_multicast_ifindex, conn_incoming_ifindex, and
 * conn_bound_if in place. We prefer dropping
 * packets instead of sending them out the wrong interface, or accepting
 * packets from the wrong ifindex.
 */
static void
conn_cleanup_ill(conn_t *connp, caddr_t arg)
{
	ill_t	*ill = (ill_t *)arg;

	mutex_enter(&connp->conn_lock);
	if (connp->conn_dhcpinit_ill == ill) {
		connp->conn_dhcpinit_ill = NULL;
		ASSERT(ill->ill_dhcpinit != 0);
		atomic_dec_32(&ill->ill_dhcpinit);
		ill_set_inputfn(ill);
	}
	mutex_exit(&connp->conn_lock);
}

static int
ill_down_ipifs_tail(ill_t *ill)
{
	ipif_t	*ipif;
	int	err;

	ASSERT(IAM_WRITER_ILL(ill));
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		/*
		 * ipif_down_tail will call arp_ll_down on the last ipif
		 * and typically return EINPROGRESS when the DL_UNBIND is sent.
		 */
		if ((err = ipif_down_tail(ipif)) != 0)
			return (err);
	}
	return (0);
}

/* ARGSUSED */
void
ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	ASSERT(IAM_WRITER_IPSQ(ipsq));
	(void) ill_down_ipifs_tail(q->q_ptr);
	freemsg(mp);
	ipsq_current_finish(ipsq);
}

/*
 * ill_down_start is called when we want to down this ill and bring it up
 * again. It is called when we receive an M_ERROR / M_HANGUP, in which case
 * we shut down all interfaces, but don't tear down any plumbing.
 */
boolean_t
ill_down_start(queue_t *q, mblk_t *mp)
{
	ill_t	*ill = q->q_ptr;
	ipif_t	*ipif;

	ASSERT(IAM_WRITER_ILL(ill));
	/*
	 * It is possible that some ioctl is already in progress while we
	 * received the M_ERROR / M_HANGUP in which case, we need to abort
	 * the ioctl. ill_down_start() is being processed as CUR_OP rather
	 * than as NEW_OP since the cause of the M_ERROR / M_HANGUP may prevent
	 * the in progress ioctl from ever completing.
	 *
	 * The thread that started the ioctl (if any) must have returned,
	 * since we are now executing as writer. After the 2 calls below,
	 * the state of the ipsq and the ill would reflect no trace of any
	 * pending operation. Subsequently if there is any response to the
	 * original ioctl from the driver, it would be discarded as an
	 * unsolicited message from the driver.
	 */
	(void) ipsq_pending_mp_cleanup(ill, NULL);
	ill_dlpi_clear_deferred(ill);

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		(void) ipif_down(ipif, NULL, NULL);

	ill_down(ill);

	/*
	 * Walk all CONNs that can have a reference on an ire or nce for this
	 * ill (we actually walk all that now have stale references).
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ill->ill_ipst);

	/* With IPv6 we have dce_ifindex. Cleanup for neatness */
	if (ill->ill_isv6)
		dce_cleanup(ill->ill_phyint->phyint_ifindex, ill->ill_ipst);

	ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0);

	/*
	 * Atomically test and add the pending mp if references are active.
	 */
	mutex_enter(&ill->ill_lock);
	if (!ill_is_quiescent(ill)) {
		/* call cannot fail since `conn_t *' argument is NULL */
		(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
		    mp, ILL_DOWN);
		mutex_exit(&ill->ill_lock);
		return (B_FALSE);
	}
	mutex_exit(&ill->ill_lock);
	return (B_TRUE);
}

static void
ill_down(ill_t *ill)
{
	mblk_t	*mp;
	ip_stack_t	*ipst = ill->ill_ipst;

	/*
	 * Blow off any IREs dependent on this ILL.
	 * The caller needs to handle conn_ixa_cleanup
	 */
	ill_delete_ires(ill);

	ire_walk_ill(0, 0, ill_downi, ill, ill);

	/* Remove any conn_*_ill depending on this ill */
	ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst);

	/*
	 * Free state for additional IREs.
	 */
	mutex_enter(&ill->ill_saved_ire_lock);
	mp = ill->ill_saved_ire_mp;
	ill->ill_saved_ire_mp = NULL;
	ill->ill_saved_ire_cnt = 0;
	mutex_exit(&ill->ill_saved_ire_lock);
	freemsg(mp);
}

/*
 * ire_walk routine used to delete every IRE that depends on
 * 'ill'. (Always called as writer, and may only be called from ire_walk.)
 *
 * Note: since the routes added by the kernel are deleted separately,
 * this will only be 1) IRE_IF_CLONE and 2) manually added IRE_INTERFACE.
 *
 * We also remove references on ire_nce_cache entries that refer to the ill.
 */
void
ill_downi(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;
	nce_t	*nce;

	mutex_enter(&ire->ire_lock);
	nce = ire->ire_nce_cache;
	if (nce != NULL && nce->nce_ill == ill)
		ire->ire_nce_cache = NULL;
	else
		nce = NULL;
	mutex_exit(&ire->ire_lock);
	if (nce != NULL)
		nce_refrele(nce);
	if (ire->ire_ill == ill) {
		/*
		 * The existing interface binding for ire must be
		 * deleted before trying to bind the route to another
		 * interface. However, since we are using the contents of the
		 * ire after ire_delete, the caller has to ensure that
		 * CONDEMNED (deleted) ire's are not removed from the list
		 * when ire_delete() returns. Currently ill_downi() is
		 * only called as part of ire_walk*() routines, so that
		 * the irb_refhold() done by ire_walk*() will ensure that
		 * ire_delete() does not lead to ire_inactive().
		 */
		ASSERT(ire->ire_bucket->irb_refcnt > 0);
		ire_delete(ire);
		if (ire->ire_unbound)
			ire_rebind(ire);
	}
}

/* Remove IRE_IF_CLONE on this ill */
void
ill_downi_if_clone(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;

	ASSERT(ire->ire_type & IRE_IF_CLONE);
	if (ire->ire_ill == ill)
		ire_delete(ire);
}

/* Consume an M_IOCACK of the fastpath probe. */
void
ill_fastpath_ack(ill_t *ill, mblk_t *mp)
{
	mblk_t	*mp1 = mp;

	/*
	 * If this was the first attempt, turn on the fastpath probing.
	 */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS)
		ill->ill_dlpi_fastpath_state = IDS_OK;
	mutex_exit(&ill->ill_lock);

	/* Free the M_IOCACK mblk, hold on to the data */
	mp = mp->b_cont;
	freeb(mp1);
	if (mp == NULL)
		return;
	if (mp->b_cont != NULL)
		nce_fastpath_update(ill, mp);
	else
		ip0dbg(("ill_fastpath_ack: no b_cont\n"));
	freemsg(mp);
}

/*
 * Throw an M_IOCTL message downstream asking "do you know fastpath?"
 * The data portion of the request is a dl_unitdata_req_t template for
 * what we would send downstream in the absence of a fastpath confirmation.
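 *
 * The driver's M_IOCACK reply (consumed by ill_fastpath_ack() above)
 * carries back the filled-in fastpath header, which is handed to
 * nce_fastpath_update().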
 */
int
ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp)
{
	struct iocblk	*ioc;
	mblk_t	*mp;

	if (dlur_mp == NULL)
		return (EINVAL);

	mutex_enter(&ill->ill_lock);
	switch (ill->ill_dlpi_fastpath_state) {
	case IDS_FAILED:
		/*
		 * Driver NAKed the first fastpath ioctl - assume it doesn't
		 * support it.
		 */
		mutex_exit(&ill->ill_lock);
		return (ENOTSUP);
	case IDS_UNKNOWN:
		/* This is the first probe */
		ill->ill_dlpi_fastpath_state = IDS_INPROGRESS;
		break;
	default:
		break;
	}
	mutex_exit(&ill->ill_lock);

	if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL)
		return (EAGAIN);

	mp->b_cont = copyb(dlur_mp);
	if (mp->b_cont == NULL) {
		freeb(mp);
		return (EAGAIN);
	}

	ioc = (struct iocblk *)mp->b_rptr;
	ioc->ioc_count = msgdsize(mp->b_cont);

	DTRACE_PROBE3(ill__dlpi, char *, "ill_fastpath_probe",
	    char *, "DL_IOC_HDR_INFO", ill_t *, ill);
	putnext(ill->ill_wq, mp);
	return (0);
}

void
ill_capability_probe(ill_t *ill)
{
	mblk_t	*mp;

	ASSERT(IAM_WRITER_ILL(ill));

	if (ill->ill_dlpi_capab_state != IDCS_UNKNOWN &&
	    ill->ill_dlpi_capab_state != IDCS_FAILED)
		return;

	/*
	 * We are starting a new cycle of capability negotiation.
	 * Free up the capab reset messages of any previous incarnation.
	 * We will do a fresh allocation when we get the response to our probe.
	 */
	if (ill->ill_capab_reset_mp != NULL) {
		freemsg(ill->ill_capab_reset_mp);
		ill->ill_capab_reset_mp = NULL;
	}

	ip1dbg(("ill_capability_probe: starting capability negotiation\n"));

	mp = ip_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ);
	if (mp == NULL)
		return;

	ill_capability_send(ill, mp);
	ill->ill_dlpi_capab_state = IDCS_PROBE_SENT;
}

static boolean_t
ill_capability_wait(ill_t *ill)
{
	/*
	 * I'm in this ill's squeue, aka a writer. The ILL_CONDEMNED flag can
	 * only be set by someone who is the writer. Since we
	 * drop-and-reacquire the squeue in this loop, we need to check for
	 * ILL_CONDEMNED, which if set means nothing can signal our capability
	 * condition variable.
	 */
	ASSERT(IAM_WRITER_ILL(ill));

	while (ill->ill_capab_pending_cnt != 0 &&
	    (ill->ill_state_flags & ILL_CONDEMNED) == 0) {
		/* This may enable blocked callers of ill_capability_done(). */
		ipsq_exit(ill->ill_phyint->phyint_ipsq);
		/* Pause a bit (1msec) before we re-enter the squeue. */
		delay(drv_usectohz(1000));

		/*
		 * If ipsq_enter() fails, someone set ILL_CONDEMNED
		 * while we dropped the squeue. Indicate such to the caller.
		 */
		if (!ipsq_enter(ill, B_FALSE, CUR_OP))
			return (B_FALSE);
	}

	return ((ill->ill_state_flags & ILL_CONDEMNED) == 0);
}

void
ill_capability_reset(ill_t *ill, boolean_t reneg)
{
	ASSERT(IAM_WRITER_ILL(ill));

	if (ill->ill_dlpi_capab_state != IDCS_OK)
		return;

	ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT;

	ASSERT(ill->ill_capab_reset_mp != NULL);

	ill_capability_send(ill, ill->ill_capab_reset_mp);
	ill->ill_capab_reset_mp = NULL;
	/*
	 * We turn off all capabilities except the direct function call
	 * capabilities (ILL_CAPAB_DLD*), which will be turned off by the
	 * corresponding reset functions.
	 */
	ill->ill_capabilities &= ~(ILL_CAPAB_HCKSUM | ILL_CAPAB_ZEROCOPY);
}

static void
ill_capability_reset_alloc(ill_t *ill)
{
	mblk_t	*mp;
	size_t	size = 0;
	int	err;
	dl_capability_req_t	*capb;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(ill->ill_capab_reset_mp == NULL);

	if (ILL_HCKSUM_CAPABLE(ill)) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_hcksum_t);
	}

	if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_zerocopy_t);
	}

	if (ill->ill_capabilities & ILL_CAPAB_DLD) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_dld_t);
	}

	mp = allocb_wait(size + sizeof (dl_capability_req_t), BPRI_MED,
	    STR_NOSIG, &err);
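	/*
	 * allocb_wait() sleeps until the allocation succeeds, and with
	 * STR_NOSIG it is not interruptible by signals, so mp is
	 * guaranteed to be non-NULL here.
	 */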

	mp->b_datap->db_type = M_PROTO;
	bzero(mp->b_rptr, size + sizeof (dl_capability_req_t));

	capb = (dl_capability_req_t *)mp->b_rptr;
	capb->dl_primitive = DL_CAPABILITY_REQ;
	capb->dl_sub_offset = sizeof (dl_capability_req_t);
	capb->dl_sub_length = size;

	mp->b_wptr += sizeof (dl_capability_req_t);

	/*
	 * Each handler fills in the corresponding dl_capability_sub_t
	 * inside the mblk.
	 */
	ill_capability_hcksum_reset_fill(ill, mp);
	ill_capability_zerocopy_reset_fill(ill, mp);
	ill_capability_dld_reset_fill(ill, mp);

	ill->ill_capab_reset_mp = mp;
}

static void
ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers)
{
	dl_capab_id_t *id_ic;
	uint_t sub_dl_cap = outers->dl_cap;
	dl_capability_sub_t *inners;
	uint8_t *capend;

	ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER);

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */

	capend = (uint8_t *)(outers + 1) + outers->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_id_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}

	id_ic = (dl_capab_id_t *)(outers + 1);

	if (outers->dl_length < sizeof (*id_ic) ||
	    (inners = &id_ic->id_subcap,
	    inners->dl_length > (outers->dl_length - sizeof (*inners)))) {
		cmn_err(CE_WARN, "ill_capability_id_ack: malformed "
		    "encapsulated capab type %d too long for mblk",
		    inners->dl_cap);
		return;
	}

	if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) {
		ip1dbg(("ill_capability_id_ack: mid token for capab type %d "
		    "isn't as expected; pass-thru module(s) detected, "
		    "discarding capability\n", inners->dl_cap));
		return;
	}

	/* Process the encapsulated sub-capability */
	ill_capability_dispatch(ill, mp, inners);
}

static void
ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp)
{
	dl_capability_sub_t *dl_subcap;

	if (!(ill->ill_capabilities & ILL_CAPAB_DLD))
		return;

	/*
	 * The dl_capab_dld_t that follows the dl_capability_sub_t is not
	 * initialized below since it is not used by DLD.
	 */
	dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
	dl_subcap->dl_cap = DL_CAPAB_DLD;
	dl_subcap->dl_length = sizeof (dl_capab_dld_t);

	mp->b_wptr += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t);
}

static void
ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp)
{
	/*
	 * If no ipif was brought up over this ill, this DL_CAPABILITY_REQ/ACK
	 * is only to get the VRRP capability.
	 *
	 * Note that we cannot check ill_ipif_up_count here since
	 * ill_ipif_up_count is only incremented when the resolver is setup.
	 * That is done asynchronously, and can race with this function.
	 */
	if (!ill->ill_dl_up) {
		if (subp->dl_cap == DL_CAPAB_VRRP)
			ill_capability_vrrp_ack(ill, mp, subp);
		return;
	}

	switch (subp->dl_cap) {
	case DL_CAPAB_HCKSUM:
		ill_capability_hcksum_ack(ill, mp, subp);
		break;
	case DL_CAPAB_ZEROCOPY:
		ill_capability_zerocopy_ack(ill, mp, subp);
		break;
	case DL_CAPAB_DLD:
		ill_capability_dld_ack(ill, mp, subp);
		break;
	case DL_CAPAB_VRRP:
		break;
	default:
		ip1dbg(("ill_capability_dispatch: unknown capab type %d\n",
		    subp->dl_cap));
	}
}

/*
 * Process the vrrp capability received from a DLS Provider. isub must point
 * to the sub-capability (DL_CAPAB_VRRP) of a DL_CAPABILITY_ACK message.
 */
static void
ill_capability_vrrp_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	dl_capab_vrrp_t	*vrrp;
	uint_t		sub_dl_cap = isub->dl_cap;
	uint8_t		*capend;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(sub_dl_cap == DL_CAPAB_VRRP);

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(isub + 1) + isub->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_vrrp_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}
	vrrp = (dl_capab_vrrp_t *)(isub + 1);

	/*
	 * Compare the IP address family and set ILLF_VRRP for the right ill.
	 */
	if ((vrrp->vrrp_af == AF_INET6 && ill->ill_isv6) ||
	    (vrrp->vrrp_af == AF_INET && !ill->ill_isv6)) {
		ill->ill_flags |= ILLF_VRRP;
	}
}

/*
 * Process a hardware checksum offload capability negotiation ack received
 * from a DLS Provider. isub must point to the sub-capability
 * (DL_CAPAB_HCKSUM) of a DL_CAPABILITY_ACK message.
 */
static void
ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	dl_capability_req_t	*ocap;
	dl_capab_hcksum_t	*ihck, *ohck;
	ill_hcksum_capab_t	**ill_hcksum;
	mblk_t			*nmp = NULL;
	uint_t			sub_dl_cap = isub->dl_cap;
	uint8_t			*capend;

	ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM);

	ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab;

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(isub + 1) + isub->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}

	/*
	 * There are two types of acks we process here:
	 * 1. acks in reply to a (first form) generic capability req
	 *    (no ENABLE flag set)
	 * 2. acks in reply to an ENABLE capability req.
	 *    (ENABLE flag set)
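	 *
	 * The full exchange is thus: generic probe -> ack without ENABLE
	 * (we respond below by sending an ENABLE request) -> ack with
	 * ENABLE (we record the flags in ill_hcksum_capab and set
	 * ILL_CAPAB_HCKSUM).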
	 */
	ihck = (dl_capab_hcksum_t *)(isub + 1);

	if (ihck->hcksum_version != HCKSUM_VERSION_1) {
		cmn_err(CE_CONT, "ill_capability_hcksum_ack: "
		    "unsupported hardware checksum "
		    "sub-capability (version %d, expected %d)",
		    ihck->hcksum_version, HCKSUM_VERSION_1);
		return;
	}

	if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) {
		ip1dbg(("ill_capability_hcksum_ack: mid token for hardware "
		    "checksum capability isn't as expected; pass-thru "
		    "module(s) detected, discarding capability\n"));
		return;
	}

#define	CURR_HCKSUM_CAPAB				\
	(HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 |	\
	HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM)

	if ((ihck->hcksum_txflags & HCKSUM_ENABLE) &&
	    (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) {
		/* do ENABLE processing */
		if (*ill_hcksum == NULL) {
			*ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t),
			    KM_NOSLEEP);

			if (*ill_hcksum == NULL) {
				cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
				    "could not enable hcksum version %d "
				    "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION,
				    ill->ill_name);
				return;
			}
		}

		(*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version;
		(*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags;
		ill->ill_capabilities |= ILL_CAPAB_HCKSUM;
		ip1dbg(("ill_capability_hcksum_ack: interface %s "
		    "has enabled hardware checksumming\n",
		    ill->ill_name));
	} else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) {
		/*
		 * Enabling hardware checksum offload. Currently IP
		 * supports {TCP,UDP}/IPv4 partial and full cksum offload
		 * and IPv4 header checksum offload. Allocate a new mblk
		 * which will contain a new capability request to enable
		 * hardware checksum offload.
		 */
		uint_t	size;
		uchar_t	*rptr;

		size = sizeof (dl_capability_req_t) +
		    sizeof (dl_capability_sub_t) + isub->dl_length;

		if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
			cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
			    "could not enable hardware cksum for %s (ENOMEM)\n",
			    ill->ill_name);
			return;
		}

		rptr = nmp->b_rptr;
		/* initialize dl_capability_req_t */
		ocap = (dl_capability_req_t *)nmp->b_rptr;
		ocap->dl_sub_offset =
		    sizeof (dl_capability_req_t);
		ocap->dl_sub_length =
		    sizeof (dl_capability_sub_t) +
		    isub->dl_length;
		nmp->b_rptr += sizeof (dl_capability_req_t);

		/* initialize dl_capability_sub_t */
		bcopy(isub, nmp->b_rptr, sizeof (*isub));
		nmp->b_rptr += sizeof (*isub);

		/* initialize dl_capab_hcksum_t */
		ohck = (dl_capab_hcksum_t *)nmp->b_rptr;
		bcopy(ihck, ohck, sizeof (*ihck));

		nmp->b_rptr = rptr;
		ASSERT(nmp->b_wptr == (nmp->b_rptr + size));

		/* Set ENABLE flag */
		ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB;
		ohck->hcksum_txflags |= HCKSUM_ENABLE;

		/*
		 * nmp points to a DL_CAPABILITY_REQ message to enable
		 * hardware checksum acceleration.
		 */
		ill_capability_send(ill, nmp);
	} else {
		ip1dbg(("ill_capability_hcksum_ack: interface %s has "
		    "advertised %x hardware checksum capability flags\n",
		    ill->ill_name, ihck->hcksum_txflags));
	}
}

static void
ill_capability_hcksum_reset_fill(ill_t *ill, mblk_t *mp)
{
	dl_capab_hcksum_t *hck_subcap;
	dl_capability_sub_t *dl_subcap;

	if (!ILL_HCKSUM_CAPABLE(ill))
		return;

	ASSERT(ill->ill_hcksum_capab != NULL);

	dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
	dl_subcap->dl_cap = DL_CAPAB_HCKSUM;
	dl_subcap->dl_length = sizeof (*hck_subcap);

	hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1);
	hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version;
	hck_subcap->hcksum_txflags = 0;

	mp->b_wptr += sizeof (*dl_subcap) + sizeof (*hck_subcap);
}

static void
ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	mblk_t *nmp = NULL;
	dl_capability_req_t *oc;
	dl_capab_zerocopy_t *zc_ic, *zc_oc;
	ill_zerocopy_capab_t **ill_zerocopy_capab;
	uint_t sub_dl_cap = isub->dl_cap;
	uint8_t *capend;

	ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY);

	ill_zerocopy_capab = (ill_zerocopy_capab_t **)&ill->ill_zerocopy_capab;

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(isub + 1) + isub->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}

	zc_ic = (dl_capab_zerocopy_t *)(isub + 1);
	if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) {
		cmn_err(CE_CONT, "ill_capability_zerocopy_ack: "
		    "unsupported ZEROCOPY sub-capability (version %d, "
		    "expected %d)", zc_ic->zerocopy_version,
		    ZEROCOPY_VERSION_1);
		return;
	}

	if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) {
		ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy "
		    "capability isn't as expected; pass-thru module(s) "
		    "detected, discarding capability\n"));
		return;
	}

	if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) {
		if (*ill_zerocopy_capab == NULL) {
			*ill_zerocopy_capab =
			    kmem_zalloc(sizeof (ill_zerocopy_capab_t),
			    KM_NOSLEEP);

			if (*ill_zerocopy_capab == NULL) {
				cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
				    "could not enable Zero-copy version %d "
				    "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1,
				    ill->ill_name);
				return;
			}
		}

		ip1dbg(("ill_capability_zerocopy_ack: interface %s "
		    "supports Zero-copy version %d\n", ill->ill_name,
		    ZEROCOPY_VERSION_1));

		(*ill_zerocopy_capab)->ill_zerocopy_version =
		    zc_ic->zerocopy_version;
		(*ill_zerocopy_capab)->ill_zerocopy_flags =
		    zc_ic->zerocopy_flags;

		ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY;
	} else {
		uint_t size;
		uchar_t *rptr;

		size = sizeof (dl_capability_req_t) +
		    sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_zerocopy_t);

		if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
			cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
			    "could not enable zerocopy for %s (ENOMEM)\n",
			    ill->ill_name);
			return;
		}

		rptr = nmp->b_rptr;
		/* initialize dl_capability_req_t */
		oc = (dl_capability_req_t *)rptr;
		oc->dl_sub_offset = sizeof (dl_capability_req_t);
		oc->dl_sub_length = sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_zerocopy_t);
		rptr += sizeof (dl_capability_req_t);

		/* initialize dl_capability_sub_t */
1882 bcopy(isub, rptr, sizeof (*isub));
1883 rptr += sizeof (*isub);
1884
1885 /* initialize dl_capab_zerocopy_t */
1886 zc_oc = (dl_capab_zerocopy_t *)rptr;
1887 *zc_oc = *zc_ic;
1888
1889 ip1dbg(("ill_capability_zerocopy_ack: asking interface %s "
1890 "to enable zero-copy version %d\n", ill->ill_name,
1891 ZEROCOPY_VERSION_1));
1892
1893 /* set VMSAFE_MEM flag */
1894 zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM;
1895
1896 /* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */
1897 ill_capability_send(ill, nmp);
1898 }
1899 }
1900
1901 static void
1902 ill_capability_zerocopy_reset_fill(ill_t *ill, mblk_t *mp)
1903 {
1904 dl_capab_zerocopy_t *zerocopy_subcap;
1905 dl_capability_sub_t *dl_subcap;
1906
1907 if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY))
1908 return;
1909
1910 ASSERT(ill->ill_zerocopy_capab != NULL);
1911
1912 dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
1913 dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY;
1914 dl_subcap->dl_length = sizeof (*zerocopy_subcap);
1915
1916 zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1);
1917 zerocopy_subcap->zerocopy_version =
1918 ill->ill_zerocopy_capab->ill_zerocopy_version;
1919 zerocopy_subcap->zerocopy_flags = 0;
1920
1921 mp->b_wptr += sizeof (*dl_subcap) + sizeof (*zerocopy_subcap);
1922 }
1923
1924 /*
1925 * DLD capability
1926 * Refer to dld.h for more information regarding the purpose and usage
1927 * of this capability.
1928 */
1929 static void
1930 ill_capability_dld_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
1931 {
1932 dl_capab_dld_t *dld_ic, dld;
1933 uint_t sub_dl_cap = isub->dl_cap;
1934 uint8_t *capend;
1935 ill_dld_capab_t *idc;
1936
1937 ASSERT(IAM_WRITER_ILL(ill));
1938 ASSERT(sub_dl_cap == DL_CAPAB_DLD);
1939
1940 /*
1941 * Note: range checks here are not absolutely sufficient to
1942 * make us robust against malformed messages sent by drivers;
1943 * this is in keeping with the rest of IP's dlpi handling.
1944 * (Remember, it's coming from something else in the kernel
1945 * address space)
1946 */
1947 capend = (uint8_t *)(isub + 1) + isub->dl_length;
1948 if (capend > mp->b_wptr) {
1949 cmn_err(CE_WARN, "ill_capability_dld_ack: "
1950 "malformed sub-capability too long for mblk");
1951 return;
1952 }
1953 dld_ic = (dl_capab_dld_t *)(isub + 1);
1954 if (dld_ic->dld_version != DLD_CURRENT_VERSION) {
1955 cmn_err(CE_CONT, "ill_capability_dld_ack: "
1956 "unsupported DLD sub-capability (version %d, "
1957 "expected %d)", dld_ic->dld_version,
1958 DLD_CURRENT_VERSION);
1959 return;
1960 }
1961 if (!dlcapabcheckqid(&dld_ic->dld_mid, ill->ill_lmod_rq)) {
1962 ip1dbg(("ill_capability_dld_ack: mid token for dld "
1963 "capability isn't as expected; pass-thru module(s) "
1964 "detected, discarding capability\n"));
1965 return;
1966 }
1967
1968 /*
1969 * Copy locally to ensure alignment.
1970 */
1971 bcopy(dld_ic, &dld, sizeof (dl_capab_dld_t));
1972
1973 if ((idc = ill->ill_dld_capab) == NULL) {
1974 idc = kmem_zalloc(sizeof (ill_dld_capab_t), KM_NOSLEEP);
1975 if (idc == NULL) {
1976 cmn_err(CE_WARN, "ill_capability_dld_ack: "
1977 "could not enable DLD version %d "
1978 "for %s (ENOMEM)\n", DLD_CURRENT_VERSION,
1979 ill->ill_name);
1980 return;
1981 }
1982 ill->ill_dld_capab = idc;
1983 }
1984 idc->idc_capab_df = (ip_capab_func_t)dld.dld_capab;
1985 idc->idc_capab_dh = (void *)dld.dld_capab_handle;
1986 ip1dbg(("ill_capability_dld_ack: interface %s "
1987 "supports DLD version %d\n", ill->ill_name, DLD_CURRENT_VERSION));
1988
1989 ill_capability_dld_enable(ill);
1990 }
1991
1992 /*
1993 * Typically capability negotiation between IP and the driver happens via
 * DLPI message exchange. However, GLD also offers a direct function call
 * mechanism to exchange the DLD_DIRECT_CAPAB and DLD_POLL_CAPAB capabilities.
 * But arbitrary function calls into IP or GLD are not permitted, since both
1997 * of them are protected by their own perimeter mechanism. The perimeter can
1998 * be viewed as a coarse lock or serialization mechanism. The hierarchy of
1999 * these perimeters is IP -> MAC. Thus for example to enable the squeue
2000 * polling, IP needs to enter its perimeter, then call ill_mac_perim_enter
2001 * to enter the mac perimeter and then do the direct function calls into
2002 * GLD to enable squeue polling. The ring related callbacks from the mac into
2003 * the stack to add, bind, quiesce, restart or cleanup a ring are all
2004 * protected by the mac perimeter.
2005 */
2006 static void
2007 ill_mac_perim_enter(ill_t *ill, mac_perim_handle_t *mphp)
2008 {
2009 ill_dld_capab_t *idc = ill->ill_dld_capab;
2010 int err;
2011
2012 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mphp,
2013 DLD_ENABLE);
2014 ASSERT(err == 0);
2015 }
2016
2017 static void
2018 ill_mac_perim_exit(ill_t *ill, mac_perim_handle_t mph)
2019 {
2020 ill_dld_capab_t *idc = ill->ill_dld_capab;
2021 int err;
2022
2023 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mph,
2024 DLD_DISABLE);
2025 ASSERT(err == 0);
2026 }
2027
2028 boolean_t
2029 ill_mac_perim_held(ill_t *ill)
2030 {
2031 ill_dld_capab_t *idc = ill->ill_dld_capab;
2032
2033 return (idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, NULL,
2034 DLD_QUERY));
2035 }
2036
2037 static void
2038 ill_capability_direct_enable(ill_t *ill)
2039 {
2040 ill_dld_capab_t *idc = ill->ill_dld_capab;
2041 ill_dld_direct_t *idd = &idc->idc_direct;
2042 dld_capab_direct_t direct;
2043 int rc;
2044
2045 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
2046
2047 bzero(&direct, sizeof (direct));
2048 direct.di_rx_cf = (uintptr_t)ip_input;
2049 direct.di_rx_ch = ill;
2050
2051 rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, &direct,
2052 DLD_ENABLE);
2053 if (rc == 0) {
2054 idd->idd_tx_df = (ip_dld_tx_t)direct.di_tx_df;
2055 idd->idd_tx_dh = direct.di_tx_dh;
2056 idd->idd_tx_cb_df = (ip_dld_callb_t)direct.di_tx_cb_df;
2057 idd->idd_tx_cb_dh = direct.di_tx_cb_dh;
2058 idd->idd_tx_fctl_df = (ip_dld_fctl_t)direct.di_tx_fctl_df;
2059 idd->idd_tx_fctl_dh = direct.di_tx_fctl_dh;
2060 ASSERT(idd->idd_tx_cb_df != NULL);
2061 ASSERT(idd->idd_tx_fctl_df != NULL);
2062 ASSERT(idd->idd_tx_df != NULL);
2063 /*
2064 * One time registration of flow enable callback function
2065 */
2066 ill->ill_flownotify_mh = idd->idd_tx_cb_df(idd->idd_tx_cb_dh,
2067 ill_flow_enable, ill);
2068 ill->ill_capabilities |= ILL_CAPAB_DLD_DIRECT;
2069 DTRACE_PROBE1(direct_on, (ill_t *), ill);
2070 } else {
2071 cmn_err(CE_WARN, "warning: could not enable DIRECT "
2072 "capability, rc = %d\n", rc);
2073 DTRACE_PROBE2(direct_off, (ill_t *), ill, (int), rc);
2074 }
2075 }
2076
2077 static void
2078 ill_capability_poll_enable(ill_t *ill)
2079 {
2080 ill_dld_capab_t *idc = ill->ill_dld_capab;
2081 dld_capab_poll_t poll;
2082 int rc;
2083
2084 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
2085
2086 bzero(&poll, sizeof (poll));
2087 poll.poll_ring_add_cf = (uintptr_t)ip_squeue_add_ring;
2088 poll.poll_ring_remove_cf = (uintptr_t)ip_squeue_clean_ring;
2089 poll.poll_ring_quiesce_cf = (uintptr_t)ip_squeue_quiesce_ring;
2090 poll.poll_ring_restart_cf = (uintptr_t)ip_squeue_restart_ring;
2091 poll.poll_ring_bind_cf = (uintptr_t)ip_squeue_bind_ring;
2092 poll.poll_ring_ch = ill;
2093 rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, &poll,
2094 DLD_ENABLE);
2095 if (rc == 0) {
2096 ill->ill_capabilities |= ILL_CAPAB_DLD_POLL;
2097 DTRACE_PROBE1(poll_on, (ill_t *), ill);
2098 } else {
2099 ip1dbg(("warning: could not enable POLL "
2100 "capability, rc = %d\n", rc));
2101 DTRACE_PROBE2(poll_off, (ill_t *), ill, (int), rc);
2102 }
2103 }
2104
2105 /*
2106 * Enable the LSO capability.
2107 */
2108 static void
2109 ill_capability_lso_enable(ill_t *ill)
2110 {
2111 ill_dld_capab_t *idc = ill->ill_dld_capab;
2112 dld_capab_lso_t lso;
2113 int rc;
2114
2115 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
2116
2117 if (ill->ill_lso_capab == NULL) {
2118 ill->ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t),
2119 KM_NOSLEEP);
2120 if (ill->ill_lso_capab == NULL) {
2121 cmn_err(CE_WARN, "ill_capability_lso_enable: "
2122 "could not enable LSO for %s (ENOMEM)\n",
2123 ill->ill_name);
2124 return;
2125 }
2126 }
2127
2128 bzero(&lso, sizeof (lso));
2129 if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, &lso,
2130 DLD_ENABLE)) == 0) {
2131 ill->ill_lso_capab->ill_lso_flags = lso.lso_flags;
2132 ill->ill_lso_capab->ill_lso_max = lso.lso_max;
2133 ill->ill_capabilities |= ILL_CAPAB_LSO;
2134 ip1dbg(("ill_capability_lso_enable: interface %s "
2135 "has enabled LSO\n ", ill->ill_name));
2136 } else {
2137 kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
2138 ill->ill_lso_capab = NULL;
2139 DTRACE_PROBE2(lso_off, (ill_t *), ill, (int), rc);
2140 }
2141 }
2142
2143 /*
2144 * Check whether or not mac will prevent us from sending with a given IP
2145 * address. This requires having the IPCHECK capability, which we should
2146 * always be able to successfully negotiate, but if it's somehow missing
2147 * then we just permit the caller to use the address, since mac does the
2148 * actual enforcement and ip is just performing a courtesy check to help
2149 * prevent users from unwittingly setting and attempting to use blocked
2150 * addresses.
2151 */
2152 static boolean_t
2153 ill_ipcheck_addr(ill_t *ill, in6_addr_t *v6addr)
2154 {
2155 if ((ill->ill_capabilities & ILL_CAPAB_DLD_IPCHECK) == 0)
2156 return (B_TRUE);
2157
2158 ill_dld_ipcheck_t *idi = &ill->ill_dld_capab->idc_ipcheck;
2159 ip_mac_ipcheck_t ipcheck = idi->idi_allowed_df;
2160 return (ipcheck(idi->idi_allowed_dh, ill->ill_isv6, v6addr));
2161 }
2162
2163 static void
2164 ill_capability_ipcheck_enable(ill_t *ill)
2165 {
2166 ill_dld_capab_t *idc = ill->ill_dld_capab;
2167 ill_dld_ipcheck_t *idi = &idc->idc_ipcheck;
2168 dld_capab_ipcheck_t spoof;
2169 int rc;
2170
2171 ASSERT(IAM_WRITER_ILL(ill));
2172
2173 bzero(&spoof, sizeof (spoof));
2174 if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_IPCHECK,
2175 &spoof, DLD_ENABLE)) == 0) {
2176 idi->idi_allowed_df = (ip_mac_ipcheck_t)spoof.ipc_allowed_df;
2177 idi->idi_allowed_dh = spoof.ipc_allowed_dh;
2178 ill->ill_capabilities |= ILL_CAPAB_DLD_IPCHECK;
2179 } else {
2180 cmn_err(CE_WARN, "warning: could not enable IPCHECK "
2181 "capability, rc = %d\n", rc);
2182 DTRACE_PROBE2(ipcheck__off, (ill_t *), ill, (int), rc);
2183 }
2184 }
2185
2186 static void
2187 ill_capability_dld_enable(ill_t *ill)
2188 {
2189 mac_perim_handle_t mph;
2190
2191 ASSERT(IAM_WRITER_ILL(ill));
2192
2193 ill_mac_perim_enter(ill, &mph);
2194 if (!ill->ill_isv6) {
2195 ill_capability_direct_enable(ill);
2196 ill_capability_poll_enable(ill);
2197 ill_capability_lso_enable(ill);
2198 }
2199
2200 ill_capability_ipcheck_enable(ill);
2201
2202 ill->ill_capabilities |= ILL_CAPAB_DLD;
2203 ill_mac_perim_exit(ill, mph);
2204 }
2205
2206 static void
2207 ill_capability_dld_disable(ill_t *ill)
2208 {
2209 ill_dld_capab_t *idc;
2210 ill_dld_direct_t *idd;
2211 mac_perim_handle_t mph;
2212
2213 ASSERT(IAM_WRITER_ILL(ill));
2214
2215 if (!(ill->ill_capabilities & ILL_CAPAB_DLD))
2216 return;
2217
2218 ill_mac_perim_enter(ill, &mph);
2219
2220 idc = ill->ill_dld_capab;
2221 if ((ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) {
2222 /*
2223 * For performance we avoid locks in the transmit data path
2224 * and don't maintain a count of the number of threads using
2225 * direct calls. Thus some threads could be using direct
2226 * transmit calls to GLD, even after the capability mechanism
2227 * turns it off. This is still safe since the handles used in
2228 * the direct calls continue to be valid until the unplumb is
2229 * completed. Remove the callback that was added (1-time) at
2230 * capab enable time.
2231 */
2232 mutex_enter(&ill->ill_lock);
2233 ill->ill_capabilities &= ~ILL_CAPAB_DLD_DIRECT;
2234 mutex_exit(&ill->ill_lock);
2235 if (ill->ill_flownotify_mh != NULL) {
2236 idd = &idc->idc_direct;
2237 idd->idd_tx_cb_df(idd->idd_tx_cb_dh, NULL,
2238 ill->ill_flownotify_mh);
2239 ill->ill_flownotify_mh = NULL;
2240 }
2241 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT,
2242 NULL, DLD_DISABLE);
2243 }
2244
2245 if ((ill->ill_capabilities & ILL_CAPAB_DLD_POLL) != 0) {
2246 ill->ill_capabilities &= ~ILL_CAPAB_DLD_POLL;
2247 ip_squeue_clean_all(ill);
2248 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL,
2249 NULL, DLD_DISABLE);
2250 }
2251
2252 if ((ill->ill_capabilities & ILL_CAPAB_LSO) != 0) {
2253 ASSERT(ill->ill_lso_capab != NULL);
2254 /*
2255 * Clear the capability flag for LSO but retain the
2256 * ill_lso_capab structure since it's possible that another
2257 * thread is still referring to it. The structure only gets
2258 * deallocated when we destroy the ill.
2259 */
2260
2261 ill->ill_capabilities &= ~ILL_CAPAB_LSO;
2262 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO,
2263 NULL, DLD_DISABLE);
2264 }
2265
2266 if ((ill->ill_capabilities & ILL_CAPAB_DLD_IPCHECK) != 0) {
2267 ASSERT(ill->ill_dld_capab->idc_ipcheck.idi_allowed_df != NULL);
2268 ASSERT(ill->ill_dld_capab->idc_ipcheck.idi_allowed_dh != NULL);
2269
2270 ill->ill_capabilities &= ~ILL_CAPAB_DLD_IPCHECK;
2271 (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_IPCHECK,
2272 NULL, DLD_DISABLE);
2273 }
2274
2275 ill->ill_capabilities &= ~ILL_CAPAB_DLD;
2276 ill_mac_perim_exit(ill, mph);
2277 }
2278
2279 /*
2280 * Capability Negotiation protocol
2281 *
2282 * We don't wait for DLPI capability operations to finish during interface
2283 * bringup or teardown. Doing so would introduce more asynchrony and the
2284 * interface up/down operations will need multiple return and restarts.
2285 * Instead the 'ipsq_current_ipif' of the ipsq is not cleared as long as
2286 * the 'ill_dlpi_deferred' chain is non-empty. This ensures that the next
2287 * exclusive operation won't start until the DLPI operations of the previous
2288 * exclusive operation complete.
2289 *
2290 * The capability state machine is shown below.
2291 *
2292 * state next state event, action
2293 *
2294 * IDCS_UNKNOWN IDCS_PROBE_SENT ill_capability_probe
2295 * IDCS_PROBE_SENT IDCS_OK ill_capability_ack
2296 * IDCS_PROBE_SENT IDCS_FAILED ip_rput_dlpi_writer (nack)
2297 * IDCS_OK IDCS_RENEG Receipt of DL_NOTE_CAPAB_RENEG
2298 * IDCS_OK IDCS_RESET_SENT ill_capability_reset
2299 * IDCS_RESET_SENT IDCS_UNKNOWN ill_capability_ack_thr
2300 * IDCS_RENEG IDCS_PROBE_SENT ill_capability_ack_thr ->
2301 * ill_capability_probe.
2302 */
2303
2304 /*
2305 * Dedicated thread started from ip_stack_init that handles capability
2306 * disable. This thread ensures the taskq dispatch does not fail by waiting
2307 * for resources using TQ_SLEEP. The taskq mechanism is used to ensure
2308 * that direct calls to DLD are done in a cv_waitable context.
2309 */
2310 void
2311 ill_taskq_dispatch(ip_stack_t *ipst)
2312 {
2313 callb_cpr_t cprinfo;
2314 char name[64];
2315 mblk_t *mp;
2316
2317 (void) snprintf(name, sizeof (name), "ill_taskq_dispatch_%d",
2318 ipst->ips_netstack->netstack_stackid);
2319 CALLB_CPR_INIT(&cprinfo, &ipst->ips_capab_taskq_lock, callb_generic_cpr,
2320 name);
2321 mutex_enter(&ipst->ips_capab_taskq_lock);
2322
2323 for (;;) {
2324 mp = ipst->ips_capab_taskq_head;
2325 while (mp != NULL) {
2326 ipst->ips_capab_taskq_head = mp->b_next;
2327 if (ipst->ips_capab_taskq_head == NULL)
2328 ipst->ips_capab_taskq_tail = NULL;
2329 mutex_exit(&ipst->ips_capab_taskq_lock);
2330 mp->b_next = NULL;
2331
2332 VERIFY(taskq_dispatch(system_taskq,
2333 ill_capability_ack_thr, mp, TQ_SLEEP) !=
2334 TASKQID_INVALID);
2335 mutex_enter(&ipst->ips_capab_taskq_lock);
2336 mp = ipst->ips_capab_taskq_head;
2337 }
2338
2339 if (ipst->ips_capab_taskq_quit)
2340 break;
2341 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2342 cv_wait(&ipst->ips_capab_taskq_cv, &ipst->ips_capab_taskq_lock);
2343 CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_capab_taskq_lock);
2344 }
2345 VERIFY(ipst->ips_capab_taskq_head == NULL);
2346 VERIFY(ipst->ips_capab_taskq_tail == NULL);
2347 CALLB_CPR_EXIT(&cprinfo);
2348 thread_exit();
2349 }
2350
2351 /*
2352 * Consume a new-style hardware capabilities negotiation ack.
2353 * Called via taskq on receipt of DL_CAPABILITY_ACK.
2354 */
2355 static void
2356 ill_capability_ack_thr(void *arg)
2357 {
2358 mblk_t *mp = arg;
2359 dl_capability_ack_t *capp;
2360 dl_capability_sub_t *subp, *endp;
2361 ill_t *ill;
2362 boolean_t reneg;
2363
2364 ill = (ill_t *)mp->b_prev;
2365 mp->b_prev = NULL;
2366
2367 VERIFY(ipsq_enter(ill, B_FALSE, CUR_OP) == B_TRUE);
2368
2369 if (ill->ill_dlpi_capab_state == IDCS_RESET_SENT ||
2370 ill->ill_dlpi_capab_state == IDCS_RENEG) {
2371 /*
2372 * We have received the ack for our DL_CAPAB reset request.
		 * There isn't anything in the message that needs processing.
2374 * All message based capabilities have been disabled, now
2375 * do the function call based capability disable.
2376 */
2377 reneg = ill->ill_dlpi_capab_state == IDCS_RENEG;
2378 ill_capability_dld_disable(ill);
2379 ill->ill_dlpi_capab_state = IDCS_UNKNOWN;
2380 if (reneg)
2381 ill_capability_probe(ill);
2382 goto done;
2383 }
2384
2385 if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT)
2386 ill->ill_dlpi_capab_state = IDCS_OK;
2387
2388 capp = (dl_capability_ack_t *)mp->b_rptr;
2389
2390 if (capp->dl_sub_length == 0) {
2391 /* no new-style capabilities */
2392 goto done;
2393 }
2394
2395 /* make sure the driver supplied correct dl_sub_length */
2396 if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) {
2397 ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, "
2398 "invalid dl_sub_length (%d)\n", capp->dl_sub_length));
2399 goto done;
2400 }
2401
2402 #define SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset))
2403 /*
2404 * There are sub-capabilities. Process the ones we know about.
	 * Loop until we don't have room for another sub-cap header.
2406 */
2407 for (subp = SC(capp, capp->dl_sub_offset),
2408 endp = SC(subp, capp->dl_sub_length - sizeof (*subp));
2409 subp <= endp;
2410 subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) {
2411
2412 switch (subp->dl_cap) {
2413 case DL_CAPAB_ID_WRAPPER:
2414 ill_capability_id_ack(ill, mp, subp);
2415 break;
2416 default:
2417 ill_capability_dispatch(ill, mp, subp);
2418 break;
2419 }
2420 }
2421 #undef SC
2422 done:
2423 inet_freemsg(mp);
2424 ill_capability_done(ill);
2425 ipsq_exit(ill->ill_phyint->phyint_ipsq);
2426 }
2427
2428 /*
2429 * This needs to be started in a taskq thread to provide a cv_waitable
2430 * context.
2431 */
2432 void
2433 ill_capability_ack(ill_t *ill, mblk_t *mp)
2434 {
2435 ip_stack_t *ipst = ill->ill_ipst;
2436
2437 mp->b_prev = (mblk_t *)ill;
2438 ASSERT(mp->b_next == NULL);
2439
2440 if (taskq_dispatch(system_taskq, ill_capability_ack_thr, mp,
2441 TQ_NOSLEEP) != TASKQID_INVALID)
2442 return;
2443
2444 /*
2445 * The taskq dispatch failed. Signal the ill_taskq_dispatch thread
2446 * which will do the dispatch using TQ_SLEEP to guarantee success.
2447 */
2448 mutex_enter(&ipst->ips_capab_taskq_lock);
2449 if (ipst->ips_capab_taskq_head == NULL) {
2450 ASSERT(ipst->ips_capab_taskq_tail == NULL);
2451 ipst->ips_capab_taskq_head = mp;
2452 } else {
2453 ipst->ips_capab_taskq_tail->b_next = mp;
2454 }
2455 ipst->ips_capab_taskq_tail = mp;
2456
2457 cv_signal(&ipst->ips_capab_taskq_cv);
2458 mutex_exit(&ipst->ips_capab_taskq_lock);
2459 }
2460
2461 /*
2462 * This routine is called to scan the fragmentation reassembly table for
2463 * the specified ILL for any packets that are starting to smell.
2464 * dead_interval is the maximum time in seconds that will be tolerated. It
2465 * will either be the value specified in ip_g_frag_timeout, or zero if the
2466 * ILL is shutting down and it is time to blow everything off.
2467 *
2468 * It returns the number of seconds (as a time_t) that the next frag timer
2469 * should be scheduled for, 0 meaning that the timer doesn't need to be
2470 * re-started. Note that the method of calculating next_timeout isn't
2471 * entirely accurate since time will flow between the time we grab
2472 * current_time and the time we schedule the next timeout. This isn't a
 * big problem since this is the timer for sending ICMP reassembly time
 * exceeded messages, and it doesn't have to be exactly accurate.
 *
 * This function is sometimes called as writer, although this is not
 * required.
2478 */
2479 time_t
2480 ill_frag_timeout(ill_t *ill, time_t dead_interval)
2481 {
2482 ipfb_t *ipfb;
2483 ipfb_t *endp;
2484 ipf_t *ipf;
2485 ipf_t *ipfnext;
2486 mblk_t *mp;
2487 time_t current_time = gethrestime_sec();
2488 time_t next_timeout = 0;
2489 uint32_t hdr_length;
2490 mblk_t *send_icmp_head;
2491 mblk_t *send_icmp_head_v6;
2492 ip_stack_t *ipst = ill->ill_ipst;
2493 ip_recv_attr_t iras;
2494
2495 bzero(&iras, sizeof (iras));
2496 iras.ira_flags = 0;
2497 iras.ira_ill = iras.ira_rill = ill;
2498 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
2499 iras.ira_rifindex = iras.ira_ruifindex;
2500
2501 ipfb = ill->ill_frag_hash_tbl;
2502 if (ipfb == NULL)
		return (0);
2504 endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT];
2505 /* Walk the frag hash table. */
2506 for (; ipfb < endp; ipfb++) {
2507 send_icmp_head = NULL;
2508 send_icmp_head_v6 = NULL;
2509 mutex_enter(&ipfb->ipfb_lock);
		while ((ipf = ipfb->ipfb_ipf) != NULL) {
2511 time_t frag_time = current_time - ipf->ipf_timestamp;
2512 time_t frag_timeout;
2513
2514 if (frag_time < dead_interval) {
2515 /*
2516 * There are some outstanding fragments
2517 * that will timeout later. Make note of
2518 * the time so that we can reschedule the
2519 * next timeout appropriately.
2520 */
2521 frag_timeout = dead_interval - frag_time;
2522 if (next_timeout == 0 ||
2523 frag_timeout < next_timeout) {
2524 next_timeout = frag_timeout;
2525 }
2526 break;
2527 }
2528 /* Time's up. Get it out of here. */
2529 hdr_length = ipf->ipf_nf_hdr_len;
2530 ipfnext = ipf->ipf_hash_next;
2531 if (ipfnext)
2532 ipfnext->ipf_ptphn = ipf->ipf_ptphn;
2533 *ipf->ipf_ptphn = ipfnext;
2534 mp = ipf->ipf_mp->b_cont;
2535 for (; mp; mp = mp->b_cont) {
2536 /* Extra points for neatness. */
2537 IP_REASS_SET_START(mp, 0);
2538 IP_REASS_SET_END(mp, 0);
2539 }
2540 mp = ipf->ipf_mp->b_cont;
2541 atomic_add_32(&ill->ill_frag_count, -ipf->ipf_count);
2542 ASSERT(ipfb->ipfb_count >= ipf->ipf_count);
2543 ipfb->ipfb_count -= ipf->ipf_count;
2544 ASSERT(ipfb->ipfb_frag_pkts > 0);
2545 ipfb->ipfb_frag_pkts--;
2546 /*
2547 * We do not send any icmp message from here because
2548 * we currently are holding the ipfb_lock for this
2549 * hash chain. If we try and send any icmp messages
2550 * from here we may end up via a put back into ip
2551 * trying to get the same lock, causing a recursive
2552 * mutex panic. Instead we build a list and send all
2553 * the icmp messages after we have dropped the lock.
2554 */
2555 if (ill->ill_isv6) {
2556 if (hdr_length != 0) {
2557 mp->b_next = send_icmp_head_v6;
2558 send_icmp_head_v6 = mp;
2559 } else {
2560 freemsg(mp);
2561 }
2562 } else {
2563 if (hdr_length != 0) {
2564 mp->b_next = send_icmp_head;
2565 send_icmp_head = mp;
2566 } else {
2567 freemsg(mp);
2568 }
2569 }
2570 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails);
2571 ip_drop_input("ipIfStatsReasmFails", ipf->ipf_mp, ill);
2572 freeb(ipf->ipf_mp);
2573 }
2574 mutex_exit(&ipfb->ipfb_lock);
2575 /*
2576 * Now need to send any icmp messages that we delayed from
2577 * above.
2578 */
2579 while (send_icmp_head_v6 != NULL) {
2580 ip6_t *ip6h;
2581
2582 mp = send_icmp_head_v6;
2583 send_icmp_head_v6 = send_icmp_head_v6->b_next;
2584 mp->b_next = NULL;
2585 ip6h = (ip6_t *)mp->b_rptr;
2586 iras.ira_flags = 0;
2587 /*
2588 * This will result in an incorrect ALL_ZONES zoneid
2589 * for multicast packets, but we
2590 * don't send ICMP errors for those in any case.
2591 */
2592 iras.ira_zoneid =
2593 ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst,
2594 ill, ipst);
2595 ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill);
2596 icmp_time_exceeded_v6(mp,
2597 ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE,
2598 &iras);
2599 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2600 }
2601 while (send_icmp_head != NULL) {
2602 ipaddr_t dst;
2603
2604 mp = send_icmp_head;
2605 send_icmp_head = send_icmp_head->b_next;
2606 mp->b_next = NULL;
2607
2608 dst = ((ipha_t *)mp->b_rptr)->ipha_dst;
2609
2610 iras.ira_flags = IRAF_IS_IPV4;
2611 /*
2612 * This will result in an incorrect ALL_ZONES zoneid
2613 * for broadcast and multicast packets, but we
2614 * don't send ICMP errors for those in any case.
2615 */
2616 iras.ira_zoneid = ipif_lookup_addr_zoneid(dst,
2617 ill, ipst);
2618 ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill);
2619 icmp_time_exceeded(mp,
2620 ICMP_REASSEMBLY_TIME_EXCEEDED, &iras);
2621 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2622 }
2623 }
2624 /*
2625 * A non-dying ILL will use the return value to decide whether to
2626 * restart the frag timer, and for how long.
2627 */
2628 return (next_timeout);
2629 }
2630
2631 /*
2632 * This routine is called when the approximate count of mblk memory used
2633 * for the specified ILL has exceeded max_count.
2634 */
2635 void
2636 ill_frag_prune(ill_t *ill, uint_t max_count)
2637 {
2638 ipfb_t *ipfb;
2639 ipf_t *ipf;
2640 size_t count;
2641 clock_t now;
2642
2643 /*
	 * If we are here within ip_min_frag_prune_time msecs of the last
	 * prune, increment ill_frag_free_num_pkts; that many of the oldest
	 * packets are then removed from each bucket below.
2647 */
2648 mutex_enter(&ill->ill_lock);
2649 now = ddi_get_lbolt();
2650 if (TICK_TO_MSEC(now - ill->ill_last_frag_clean_time) <=
2651 (ip_min_frag_prune_time != 0 ?
2652 ip_min_frag_prune_time : msec_per_tick)) {
2653
2654 ill->ill_frag_free_num_pkts++;
2655
2656 } else {
2657 ill->ill_frag_free_num_pkts = 0;
2658 }
2659 ill->ill_last_frag_clean_time = now;
2660 mutex_exit(&ill->ill_lock);
2661
2662 /*
2663 * free ill_frag_free_num_pkts oldest packets from each bucket.
2664 */
2665 if (ill->ill_frag_free_num_pkts != 0) {
2666 int ix;
2667
2668 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) {
2669 ipfb = &ill->ill_frag_hash_tbl[ix];
2670 mutex_enter(&ipfb->ipfb_lock);
2671 if (ipfb->ipfb_ipf != NULL) {
2672 ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf,
2673 ill->ill_frag_free_num_pkts);
2674 }
2675 mutex_exit(&ipfb->ipfb_lock);
2676 }
2677 }
2678 /*
2679 * While the reassembly list for this ILL is too big, prune a fragment
2680 * queue by age, oldest first.
2681 */
2682 while (ill->ill_frag_count > max_count) {
2683 int ix;
2684 ipfb_t *oipfb = NULL;
2685 uint_t oldest = UINT_MAX;
2686
2687 count = 0;
2688 for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) {
2689 ipfb = &ill->ill_frag_hash_tbl[ix];
2690 mutex_enter(&ipfb->ipfb_lock);
2691 ipf = ipfb->ipfb_ipf;
2692 if (ipf != NULL && ipf->ipf_gen < oldest) {
2693 oldest = ipf->ipf_gen;
2694 oipfb = ipfb;
2695 }
2696 count += ipfb->ipfb_count;
2697 mutex_exit(&ipfb->ipfb_lock);
2698 }
2699 if (oipfb == NULL)
2700 break;
2701
2702 if (count <= max_count)
2703 return; /* Somebody beat us to it, nothing to do */
2704 mutex_enter(&oipfb->ipfb_lock);
2705 ipf = oipfb->ipfb_ipf;
2706 if (ipf != NULL) {
2707 ill_frag_free_pkts(ill, oipfb, ipf, 1);
2708 }
2709 mutex_exit(&oipfb->ipfb_lock);
2710 }
2711 }
2712
2713 /*
2714 * free 'free_cnt' fragmented packets starting at ipf.
2715 */
2716 void
2717 ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt)
2718 {
2719 size_t count;
2720 mblk_t *mp;
2721 mblk_t *tmp;
2722 ipf_t **ipfp = ipf->ipf_ptphn;
2723
2724 ASSERT(MUTEX_HELD(&ipfb->ipfb_lock));
2725 ASSERT(ipfp != NULL);
2726 ASSERT(ipf != NULL);
2727
2728 while (ipf != NULL && free_cnt-- > 0) {
2729 count = ipf->ipf_count;
2730 mp = ipf->ipf_mp;
2731 ipf = ipf->ipf_hash_next;
2732 for (tmp = mp; tmp; tmp = tmp->b_cont) {
2733 IP_REASS_SET_START(tmp, 0);
2734 IP_REASS_SET_END(tmp, 0);
2735 }
2736 atomic_add_32(&ill->ill_frag_count, -count);
2737 ASSERT(ipfb->ipfb_count >= count);
2738 ipfb->ipfb_count -= count;
2739 ASSERT(ipfb->ipfb_frag_pkts > 0);
2740 ipfb->ipfb_frag_pkts--;
2741 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails);
2742 ip_drop_input("ipIfStatsReasmFails", mp, ill);
2743 freemsg(mp);
2744 }
2745
2746 if (ipf)
2747 ipf->ipf_ptphn = ipfp;
2748 ipfp[0] = ipf;
2749 }
2750
2751 /*
2752 * Helper function for ill_forward_set().
2753 */
2754 static void
2755 ill_forward_set_on_ill(ill_t *ill, boolean_t enable)
2756 {
2757 ip_stack_t *ipst = ill->ill_ipst;
2758
2759 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock));
2760
2761 ip1dbg(("ill_forward_set: %s %s forwarding on %s",
2762 (enable ? "Enabling" : "Disabling"),
2763 (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name));
2764 mutex_enter(&ill->ill_lock);
2765 if (enable)
2766 ill->ill_flags |= ILLF_ROUTER;
2767 else
2768 ill->ill_flags &= ~ILLF_ROUTER;
2769 mutex_exit(&ill->ill_lock);
2770 if (ill->ill_isv6)
2771 ill_set_nce_router_flags(ill, enable);
2772 /* Notify routing socket listeners of this change. */
2773 if (ill->ill_ipif != NULL)
2774 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
2775 }
2776
2777 /*
2778 * Set an ill's ILLF_ROUTER flag appropriately. Send up RTS_IFINFO routing
2779 * socket messages for each interface whose flags we change.
2780 */
2781 int
2782 ill_forward_set(ill_t *ill, boolean_t enable)
2783 {
2784 ipmp_illgrp_t *illg;
2785 ip_stack_t *ipst = ill->ill_ipst;
2786
2787 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock));
2788
2789 if ((enable && (ill->ill_flags & ILLF_ROUTER)) ||
2790 (!enable && !(ill->ill_flags & ILLF_ROUTER)))
2791 return (0);
2792
2793 if (IS_LOOPBACK(ill))
2794 return (EINVAL);
2795
2796 if (enable && ill->ill_allowed_ips_cnt > 0)
2797 return (EPERM);
2798
2799 if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) {
2800 /*
2801 * Update all of the interfaces in the group.
2802 */
2803 illg = ill->ill_grp;
2804 ill = list_head(&illg->ig_if);
2805 for (; ill != NULL; ill = list_next(&illg->ig_if, ill))
2806 ill_forward_set_on_ill(ill, enable);
2807
2808 /*
2809 * Update the IPMP meta-interface.
2810 */
2811 ill_forward_set_on_ill(ipmp_illgrp_ipmp_ill(illg), enable);
2812 return (0);
2813 }
2814
2815 ill_forward_set_on_ill(ill, enable);
2816 return (0);
2817 }
2818
2819 /*
2820 * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for
2821 * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately
2822 * set or clear.
2823 */
2824 static void
2825 ill_set_nce_router_flags(ill_t *ill, boolean_t enable)
2826 {
2827 ipif_t *ipif;
2828 ncec_t *ncec;
2829 nce_t *nce;
2830
2831 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
2832 /*
2833 * NOTE: we match across the illgrp because nce's for
2834 * addresses on IPMP interfaces have an nce_ill that points to
2835 * the bound underlying ill.
2836 */
2837 nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr);
2838 if (nce != NULL) {
2839 ncec = nce->nce_common;
2840 mutex_enter(&ncec->ncec_lock);
2841 if (enable)
2842 ncec->ncec_flags |= NCE_F_ISROUTER;
2843 else
2844 ncec->ncec_flags &= ~NCE_F_ISROUTER;
2845 mutex_exit(&ncec->ncec_lock);
2846 nce_refrele(nce);
2847 }
2848 }
2849 }
2850
2851 /*
 * Initializes the context structure and returns the first ill in the list.
 * Currently start_list and end_list can have the values:
2854 * MAX_G_HEADS Traverse both IPV4 and IPV6 lists.
2855 * IP_V4_G_HEAD Traverse IPV4 list only.
2856 * IP_V6_G_HEAD Traverse IPV6 list only.
2857 */
2858
2859 /*
2860 * We don't check for CONDEMNED ills here. Caller must do that if
2861 * necessary under the ill lock.
2862 */
2863 ill_t *
2864 ill_first(int start_list, int end_list, ill_walk_context_t *ctx,
2865 ip_stack_t *ipst)
2866 {
2867 ill_if_t *ifp;
2868 ill_t *ill;
2869 avl_tree_t *avl_tree;
2870
2871 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
2872 ASSERT(end_list <= MAX_G_HEADS && start_list >= 0);
2873
2874 /*
2875 * setup the lists to search
2876 */
2877 if (end_list != MAX_G_HEADS) {
2878 ctx->ctx_current_list = start_list;
2879 ctx->ctx_last_list = end_list;
2880 } else {
2881 ctx->ctx_last_list = MAX_G_HEADS - 1;
2882 ctx->ctx_current_list = 0;
2883 }
2884
2885 while (ctx->ctx_current_list <= ctx->ctx_last_list) {
2886 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst);
2887 if (ifp != (ill_if_t *)
2888 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) {
2889 avl_tree = &ifp->illif_avl_by_ppa;
2890 ill = avl_first(avl_tree);
2891 /*
			 * ill is guaranteed to be non-NULL, otherwise ifp
			 * would not exist.
2894 */
2895 ASSERT(ill != NULL);
2896 return (ill);
2897 }
2898 ctx->ctx_current_list++;
2899 }
2900
2901 return (NULL);
2902 }
2903
2904 /*
2905 * returns the next ill in the list. ill_first() must have been called
2906 * before calling ill_next() or bad things will happen.
2907 */
2908
2909 /*
2910 * We don't check for CONDEMNED ills here. Caller must do that if
2911 * necessary under the ill lock.
2912 */
2913 ill_t *
2914 ill_next(ill_walk_context_t *ctx, ill_t *lastill)
2915 {
2916 ill_if_t *ifp;
2917 ill_t *ill;
2918 ip_stack_t *ipst = lastill->ill_ipst;
2919
2920 ASSERT(lastill->ill_ifptr != (ill_if_t *)
2921 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst));
2922 if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill,
2923 AVL_AFTER)) != NULL) {
2924 return (ill);
2925 }
2926
2927 /* goto next ill_ifp in the list. */
2928 ifp = lastill->ill_ifptr->illif_next;
2929
2930 /* make sure not at end of circular list */
2931 while (ifp ==
2932 (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) {
2933 if (++ctx->ctx_current_list > ctx->ctx_last_list)
2934 return (NULL);
2935 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst);
2936 }
2937
2938 return (avl_first(&ifp->illif_avl_by_ppa));
2939 }
2940
2941 /*
2942 * Check interface name for correct format: [a-zA-Z]+[a-zA-Z0-9._]*[0-9]+
2943 * The final number (PPA) must not have any leading zeros. Upon success, a
2944 * pointer to the start of the PPA is returned; otherwise NULL is returned.
2945 */
2946 static char *
2947 ill_get_ppa_ptr(char *name)
2948 {
2949 int namelen = strlen(name);
2950 int end_ndx = namelen - 1;
2951 int ppa_ndx, i;
2952
2953 /*
2954 * Check that the first character is [a-zA-Z], and that the last
2955 * character is [0-9].
2956 */
2957 if (namelen == 0 || !isalpha(name[0]) || !isdigit(name[end_ndx]))
2958 return (NULL);
2959
2960 /*
2961 * Set `ppa_ndx' to the PPA start, and check for leading zeroes.
2962 */
2963 for (ppa_ndx = end_ndx; ppa_ndx > 0; ppa_ndx--)
2964 if (!isdigit(name[ppa_ndx - 1]))
2965 break;
2966
2967 if (name[ppa_ndx] == '0' && ppa_ndx < end_ndx)
2968 return (NULL);
2969
2970 /*
	 * Check that the intermediate characters are [a-zA-Z0-9._]
2972 */
2973 for (i = 1; i < ppa_ndx; i++) {
2974 if (!isalpha(name[i]) && !isdigit(name[i]) &&
2975 name[i] != '.' && name[i] != '_') {
2976 return (NULL);
2977 }
2978 }
2979
2980 return (name + ppa_ndx);
2981 }
2982
2983 /*
2984 * use avl tree to locate the ill.
2985 */
2986 static ill_t *
2987 ill_find_by_name(char *name, boolean_t isv6, ip_stack_t *ipst)
2988 {
2989 char *ppa_ptr = NULL;
2990 int len;
2991 uint_t ppa;
2992 ill_t *ill = NULL;
2993 ill_if_t *ifp;
2994 int list;
2995
2996 /*
2997 * get ppa ptr
2998 */
2999 if (isv6)
3000 list = IP_V6_G_HEAD;
3001 else
3002 list = IP_V4_G_HEAD;
3003
3004 if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) {
3005 return (NULL);
3006 }
3007
3008 len = ppa_ptr - name + 1;
3009
3010 ppa = stoi(&ppa_ptr);
3011
3012 ifp = IP_VX_ILL_G_LIST(list, ipst);
3013
3014 while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) {
3015 /*
		 * The match is done on len - 1 as the name is not null
		 * terminated; it contains the ppa in addition to the
		 * interface name.
3019 */
3020 if ((ifp->illif_name_len == len) &&
3021 bcmp(ifp->illif_name, name, len - 1) == 0) {
3022 break;
3023 } else {
3024 ifp = ifp->illif_next;
3025 }
3026 }
3027
3028 if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) {
3029 /*
		 * The interface type itself does not even exist.
3031 */
3032 return (NULL);
3033 }
3034
3035 ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL);
3036 if (ill != NULL) {
3037 mutex_enter(&ill->ill_lock);
3038 if (ILL_CAN_LOOKUP(ill)) {
3039 ill_refhold_locked(ill);
3040 mutex_exit(&ill->ill_lock);
3041 return (ill);
3042 }
3043 mutex_exit(&ill->ill_lock);
3044 }
3045 return (NULL);
3046 }
3047
3048 /*
3049 * comparison function for use with avl.
3050 */
3051 static int
3052 ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr)
3053 {
3054 uint_t ppa;
3055 uint_t ill_ppa;
3056
3057 ASSERT(ppa_ptr != NULL && ill_ptr != NULL);
3058
3059 ppa = *((uint_t *)ppa_ptr);
3060 ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa;
3061 /*
3062 * We want the ill with the lowest ppa to be on the
3063 * top.
3064 */
3065 if (ill_ppa < ppa)
3066 return (1);
3067 if (ill_ppa > ppa)
3068 return (-1);
3069 return (0);
3070 }
3071
3072 /*
3073 * remove an interface type from the global list.
3074 */
3075 static void
3076 ill_delete_interface_type(ill_if_t *interface)
3077 {
3078 ASSERT(interface != NULL);
3079 ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0);
3080
3081 avl_destroy(&interface->illif_avl_by_ppa);
3082 if (interface->illif_ppa_arena != NULL)
3083 vmem_destroy(interface->illif_ppa_arena);
3084
3085 remque(interface);
3086
3087 mi_free(interface);
3088 }
3089
3090 /*
3091 * remove ill from the global list.
3092 */
3093 static void
3094 ill_glist_delete(ill_t *ill)
3095 {
3096 ip_stack_t *ipst;
3097 phyint_t *phyi;
3098
3099 if (ill == NULL)
3100 return;
3101 ipst = ill->ill_ipst;
3102 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
3103
3104 /*
3105 * If the ill was never inserted into the AVL tree
3106 * we skip the if branch.
3107 */
3108 if (ill->ill_ifptr != NULL) {
3109 /*
3110 * remove from AVL tree and free ppa number
3111 */
3112 avl_remove(&ill->ill_ifptr->illif_avl_by_ppa, ill);
3113
3114 if (ill->ill_ifptr->illif_ppa_arena != NULL) {
3115 vmem_free(ill->ill_ifptr->illif_ppa_arena,
3116 (void *)(uintptr_t)(ill->ill_ppa+1), 1);
3117 }
3118 if (avl_numnodes(&ill->ill_ifptr->illif_avl_by_ppa) == 0) {
3119 ill_delete_interface_type(ill->ill_ifptr);
3120 }
3121
3122 /*
3123 * Indicate ill is no longer in the list.
3124 */
3125 ill->ill_ifptr = NULL;
3126 ill->ill_name_length = 0;
3127 ill->ill_name[0] = '\0';
3128 ill->ill_ppa = UINT_MAX;
3129 }
3130
3131 /* Generate one last event for this ill. */
3132 ill_nic_event_dispatch(ill, 0, NE_UNPLUMB, ill->ill_name,
3133 ill->ill_name_length);
3134
3135 ASSERT(ill->ill_phyint != NULL);
3136 phyi = ill->ill_phyint;
3137 ill->ill_phyint = NULL;
3138
3139 /*
	 * ill_init always allocates a phyint to store the copy
3141 * of flags relevant to phyint. At that point in time, we could
3142 * not assign the name and hence phyint_illv4/v6 could not be
3143 * initialized. Later in ipif_set_values, we assign the name to
3144 * the ill, at which point in time we assign phyint_illv4/v6.
3145 * Thus we don't rely on phyint_illv6 to be initialized always.
3146 */
3147 if (ill->ill_flags & ILLF_IPV6)
3148 phyi->phyint_illv6 = NULL;
3149 else
3150 phyi->phyint_illv4 = NULL;
3151
3152 if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) {
3153 rw_exit(&ipst->ips_ill_g_lock);
3154 return;
3155 }
3156
3157 /*
3158 * There are no ills left on this phyint; pull it out of the phyint
3159 * avl trees, and free it.
3160 */
3161 if (phyi->phyint_ifindex > 0) {
3162 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
3163 phyi);
3164 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
3165 phyi);
3166 }
3167 rw_exit(&ipst->ips_ill_g_lock);
3168
3169 phyint_free(phyi);
3170 }
3171
3172 /*
 * Allocate a ppa. If the number of plumbed interfaces of this type is
 * less than ill_no_arena, do a linear search to find an unused ppa.
 * When the number goes beyond ill_no_arena, switch to using an arena.
 * Note: a ppa value of zero cannot be allocated from the vmem arena as
 * zero is the return value for an error condition, so allocation starts
 * at one and the result is decremented by one before use.
3179 */
3180 static int
3181 ill_alloc_ppa(ill_if_t *ifp, ill_t *ill)
3182 {
3183 ill_t *tmp_ill;
3184 uint_t start, end;
3185 int ppa;
3186
3187 if (ifp->illif_ppa_arena == NULL &&
3188 (avl_numnodes(&ifp->illif_avl_by_ppa) + 1 > ill_no_arena)) {
3189 /*
3190 * Create an arena.
3191 */
3192 ifp->illif_ppa_arena = vmem_create(ifp->illif_name,
3193 (void *)1, UINT_MAX - 1, 1, NULL, NULL,
3194 NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
3195 /* allocate what has already been assigned */
3196 for (tmp_ill = avl_first(&ifp->illif_avl_by_ppa);
3197 tmp_ill != NULL; tmp_ill = avl_walk(&ifp->illif_avl_by_ppa,
3198 tmp_ill, AVL_AFTER)) {
3199 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena,
3200 1, /* size */
3201 1, /* align/quantum */
3202 0, /* phase */
3203 0, /* nocross */
3204 /* minaddr */
3205 (void *)((uintptr_t)tmp_ill->ill_ppa + 1),
3206 /* maxaddr */
3207 (void *)((uintptr_t)tmp_ill->ill_ppa + 2),
3208 VM_NOSLEEP|VM_FIRSTFIT);
3209 if (ppa == 0) {
3210 ip1dbg(("ill_alloc_ppa: ppa allocation"
3211 " failed while switching"));
3212 vmem_destroy(ifp->illif_ppa_arena);
3213 ifp->illif_ppa_arena = NULL;
3214 break;
3215 }
3216 }
3217 }
3218
3219 if (ifp->illif_ppa_arena != NULL) {
3220 if (ill->ill_ppa == UINT_MAX) {
3221 ppa = (int)(uintptr_t)vmem_alloc(ifp->illif_ppa_arena,
3222 1, VM_NOSLEEP|VM_FIRSTFIT);
3223 if (ppa == 0)
3224 return (EAGAIN);
3225 ill->ill_ppa = --ppa;
3226 } else {
3227 ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena,
3228 1, /* size */
3229 1, /* align/quantum */
3230 0, /* phase */
3231 0, /* nocross */
3232 (void *)(uintptr_t)(ill->ill_ppa + 1), /* minaddr */
3233 (void *)(uintptr_t)(ill->ill_ppa + 2), /* maxaddr */
3234 VM_NOSLEEP|VM_FIRSTFIT);
3235 /*
3236 * Most likely the allocation failed because
3237 * the requested ppa was in use.
3238 */
3239 if (ppa == 0)
3240 return (EEXIST);
3241 }
3242 return (0);
3243 }
3244
3245 /*
3246 * No arena is in use and not enough (>ill_no_arena) interfaces have
	 * been plumbed to create one. Do a linear search to get an unused ppa.
3248 */
3249 if (ill->ill_ppa == UINT_MAX) {
3250 end = UINT_MAX - 1;
3251 start = 0;
3252 } else {
3253 end = start = ill->ill_ppa;
3254 }
3255
3256 tmp_ill = avl_find(&ifp->illif_avl_by_ppa, (void *)&start, NULL);
3257 while (tmp_ill != NULL && tmp_ill->ill_ppa == start) {
3258 if (start++ >= end) {
3259 if (ill->ill_ppa == UINT_MAX)
3260 return (EAGAIN);
3261 else
3262 return (EEXIST);
3263 }
3264 tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, tmp_ill, AVL_AFTER);
3265 }
3266 ill->ill_ppa = start;
3267 return (0);
3268 }
3269
3270 /*
3271 * Insert ill into the list of configured ill's. Once this function completes,
3272 * the ill is globally visible and is available through lookups. More precisely
3273 * this happens after the caller drops the ill_g_lock.
3274 */
3275 static int
3276 ill_glist_insert(ill_t *ill, char *name, boolean_t isv6)
3277 {
3278 ill_if_t *ill_interface;
3279 avl_index_t where = 0;
3280 int error;
3281 int name_length;
3282 int index;
3283 boolean_t check_length = B_FALSE;
3284 ip_stack_t *ipst = ill->ill_ipst;
3285
3286 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
3287
3288 name_length = mi_strlen(name) + 1;
3289
3290 if (isv6)
3291 index = IP_V6_G_HEAD;
3292 else
3293 index = IP_V4_G_HEAD;
3294
3295 ill_interface = IP_VX_ILL_G_LIST(index, ipst);
3296 /*
3297 * Search for interface type based on name
3298 */
3299 while (ill_interface != (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) {
3300 if ((ill_interface->illif_name_len == name_length) &&
3301 (strcmp(ill_interface->illif_name, name) == 0)) {
3302 break;
3303 }
3304 ill_interface = ill_interface->illif_next;
3305 }
3306
3307 /*
3308 * Interface type not found, create one.
3309 */
3310 if (ill_interface == (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) {
3311 ill_g_head_t ghead;
3312
3313 /*
3314 * allocate ill_if_t structure
3315 */
3316 ill_interface = (ill_if_t *)mi_zalloc(sizeof (ill_if_t));
3317 if (ill_interface == NULL) {
3318 return (ENOMEM);
3319 }
3320
3321 (void) strcpy(ill_interface->illif_name, name);
3322 ill_interface->illif_name_len = name_length;
3323
3324 avl_create(&ill_interface->illif_avl_by_ppa,
3325 ill_compare_ppa, sizeof (ill_t),
3326 offsetof(struct ill_s, ill_avl_byppa));
3327
3328 /*
3329 * link the structure in the back to maintain order
3330 * of configuration for ifconfig output.
3331 */
3332 ghead = ipst->ips_ill_g_heads[index];
3333 insque(ill_interface, ghead.ill_g_list_tail);
3334 }
3335
3336 if (ill->ill_ppa == UINT_MAX)
3337 check_length = B_TRUE;
3338
3339 error = ill_alloc_ppa(ill_interface, ill);
3340 if (error != 0) {
3341 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0)
3342 ill_delete_interface_type(ill->ill_ifptr);
3343 return (error);
3344 }
3345
3346 /*
	 * When the ppa is chosen by the system, check that there is
	 * enough space to insert the ppa. If a specific ppa was passed in,
	 * this check is not required as the interface name passed in will
	 * have the right ppa in it.
3351 */
3352 if (check_length) {
3353 /*
3354 * UINT_MAX - 1 should fit in 10 chars, alloc 12 chars.
3355 */
3356 char buf[sizeof (uint_t) * 3];
3357
3358 /*
3359 * convert ppa to string to calculate the amount of space
3360 * required for it in the name.
3361 */
3362 numtos(ill->ill_ppa, buf);
3363
3364 /* Do we have enough space to insert ppa ? */
3365
3366 if ((mi_strlen(name) + mi_strlen(buf) + 1) > LIFNAMSIZ) {
3367 /* Free ppa and interface type struct */
3368 if (ill_interface->illif_ppa_arena != NULL) {
3369 vmem_free(ill_interface->illif_ppa_arena,
3370 (void *)(uintptr_t)(ill->ill_ppa+1), 1);
3371 }
3372 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0)
3373 ill_delete_interface_type(ill->ill_ifptr);
3374
3375 return (EINVAL);
3376 }
3377 }
3378
3379 (void) sprintf(ill->ill_name, "%s%u", name, ill->ill_ppa);
3380 ill->ill_name_length = mi_strlen(ill->ill_name) + 1;
3381
3382 (void) avl_find(&ill_interface->illif_avl_by_ppa, &ill->ill_ppa,
3383 &where);
3384 ill->ill_ifptr = ill_interface;
3385 avl_insert(&ill_interface->illif_avl_by_ppa, ill, where);
3386
3387 ill_phyint_reinit(ill);
3388 return (0);
3389 }
3390
3391 /* Initialize the per phyint ipsq used for serialization */
3392 static boolean_t
3393 ipsq_init(ill_t *ill, boolean_t enter)
3394 {
3395 ipsq_t *ipsq;
3396 ipxop_t *ipx;
3397
3398 if ((ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP)) == NULL)
3399 return (B_FALSE);
3400
3401 ill->ill_phyint->phyint_ipsq = ipsq;
3402 ipx = ipsq->ipsq_xop = &ipsq->ipsq_ownxop;
3403 ipx->ipx_ipsq = ipsq;
3404 ipsq->ipsq_next = ipsq;
3405 ipsq->ipsq_phyint = ill->ill_phyint;
3406 mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0);
3407 mutex_init(&ipx->ipx_lock, NULL, MUTEX_DEFAULT, 0);
3408 ipsq->ipsq_ipst = ill->ill_ipst; /* No netstack_hold */
3409 if (enter) {
3410 ipx->ipx_writer = curthread;
3411 ipx->ipx_forced = B_FALSE;
3412 ipx->ipx_reentry_cnt = 1;
3413 #ifdef DEBUG
3414 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
3415 #endif
3416 }
3417 return (B_TRUE);
3418 }
3419
3420 /*
3421 * Here we perform initialisation of the ill_t common to both regular
3422 * interface ILLs and the special loopback ILL created by ill_lookup_on_name.
3423 */
3424 static int
3425 ill_init_common(ill_t *ill, queue_t *q, boolean_t isv6, boolean_t is_loopback,
3426 boolean_t ipsq_enter)
3427 {
3428 int count;
3429 uchar_t *frag_ptr;
3430
3431 mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0);
3432 mutex_init(&ill->ill_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL);
3433 ill->ill_saved_ire_cnt = 0;
3434
3435 if (is_loopback) {
3436 ill->ill_max_frag = isv6 ? ip_loopback_mtu_v6plus :
3437 ip_loopback_mtuplus;
3438 /*
3439 * No resolver here.
3440 */
3441 ill->ill_net_type = IRE_LOOPBACK;
3442 } else {
3443 ill->ill_rq = q;
3444 ill->ill_wq = WR(q);
3445 ill->ill_ppa = UINT_MAX;
3446 }
3447
3448 ill->ill_isv6 = isv6;
3449
3450 /*
3451 * Allocate sufficient space to contain our fragment hash table and
3452 * the device name.
3453 */
3454 frag_ptr = (uchar_t *)mi_zalloc(ILL_FRAG_HASH_TBL_SIZE + 2 * LIFNAMSIZ);
3455 if (frag_ptr == NULL)
3456 return (ENOMEM);
3457 ill->ill_frag_ptr = frag_ptr;
3458 ill->ill_frag_free_num_pkts = 0;
3459 ill->ill_last_frag_clean_time = 0;
3460 ill->ill_frag_hash_tbl = (ipfb_t *)frag_ptr;
3461 ill->ill_name = (char *)(frag_ptr + ILL_FRAG_HASH_TBL_SIZE);
3462 for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
3463 mutex_init(&ill->ill_frag_hash_tbl[count].ipfb_lock,
3464 NULL, MUTEX_DEFAULT, NULL);
3465 }
3466
3467 ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t));
3468 if (ill->ill_phyint == NULL) {
3469 mi_free(frag_ptr);
3470 return (ENOMEM);
3471 }
3472
3473 mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0);
3474 if (isv6) {
3475 ill->ill_phyint->phyint_illv6 = ill;
3476 } else {
3477 ill->ill_phyint->phyint_illv4 = ill;
3478 }
3479 if (is_loopback) {
3480 phyint_flags_init(ill->ill_phyint, DL_LOOP);
3481 }
3482
3483 list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node));
3484
3485 ill_set_inputfn(ill);
3486
3487 if (!ipsq_init(ill, ipsq_enter)) {
3488 mi_free(frag_ptr);
3489 mi_free(ill->ill_phyint);
3490 return (ENOMEM);
3491 }
3492
3493 /* Frag queue limit stuff */
3494 ill->ill_frag_count = 0;
3495 ill->ill_ipf_gen = 0;
3496
3497 rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL);
3498 mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL);
3499 ill->ill_global_timer = INFINITY;
3500 ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0;
3501 ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0;
3502 ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
3503 ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL;
3504
3505 /*
3506 * Initialize IPv6 configuration variables. The IP module is always
	 * opened as an IPv4 module. Instead of tracking down the cases
	 * where it switches to do IPv6, we'll just initialize the IPv6
	 * configuration here for convenience; this has no effect until
	 * the ill is set to do IPv6.
3511 */
3512 ill->ill_reachable_time = ND_REACHABLE_TIME;
3513 ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT;
3514 ill->ill_max_buf = ND_MAX_Q;
3515 ill->ill_refcnt = 0;
3516
3517 return (0);
3518 }
3519
3520 /*
3521 * ill_init is called by ip_open when a device control stream is opened.
3522 * It does a few initializations, and shoots a DL_INFO_REQ message down
3523 * to the driver. The response is later picked up in ip_rput_dlpi and
3524 * used to set up default mechanisms for talking to the driver. (Always
3525 * called as writer.)
3526 *
3527 * If this function returns error, ip_open will call ip_close which in
3528 * turn will call ill_delete to clean up any memory allocated here that
3529 * is not yet freed.
3530 *
3531 * Note: ill_ipst and ill_zoneid must be set before calling ill_init.
3532 */
3533 int
3534 ill_init(queue_t *q, ill_t *ill)
3535 {
3536 int ret;
3537 dl_info_req_t *dlir;
3538 mblk_t *info_mp;
3539
3540 info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)),
3541 BPRI_HI);
3542 if (info_mp == NULL)
3543 return (ENOMEM);
3544
3545 /*
3546 * For now pretend this is a v4 ill. We need to set phyint_ill*
3547 * at this point because of the following reason. If we can't
3548 * enter the ipsq at some point and cv_wait, the writer that
3549 * wakes us up tries to locate us using the list of all phyints
3550 * in an ipsq and the ills from the phyint thru the phyint_ill*.
3551 * If we don't set it now, we risk a missed wakeup.
3552 */
3553 if ((ret = ill_init_common(ill, q, B_FALSE, B_FALSE, B_TRUE)) != 0) {
3554 freemsg(info_mp);
3555 return (ret);
3556 }
3557
3558 ill->ill_state_flags |= ILL_LL_SUBNET_PENDING;
3559
3560 /* Send down the Info Request to the driver. */
3561 info_mp->b_datap->db_type = M_PCPROTO;
3562 dlir = (dl_info_req_t *)info_mp->b_rptr;
3563 info_mp->b_wptr = (uchar_t *)&dlir[1];
3564 dlir->dl_primitive = DL_INFO_REQ;
3565
3566 ill->ill_dlpi_pending = DL_PRIM_INVAL;
3567
3568 qprocson(q);
3569 ill_dlpi_send(ill, info_mp);
3570
3571 return (0);
3572 }
3573
3574 /*
3575 * ill_dls_info
3576 * creates datalink socket info from the device.
3577 */
3578 int
3579 ill_dls_info(struct sockaddr_dl *sdl, const ill_t *ill)
3580 {
3581 size_t len;
3582
3583 sdl->sdl_family = AF_LINK;
3584 sdl->sdl_index = ill_get_upper_ifindex(ill);
3585 sdl->sdl_type = ill->ill_type;
3586 ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data));
3587 len = strlen(sdl->sdl_data);
3588 ASSERT(len < 256);
3589 sdl->sdl_nlen = (uchar_t)len;
3590 sdl->sdl_alen = ill->ill_phys_addr_length;
3591 sdl->sdl_slen = 0;
3592 if (ill->ill_phys_addr_length != 0 && ill->ill_phys_addr != NULL)
3593 bcopy(ill->ill_phys_addr, &sdl->sdl_data[len], sdl->sdl_alen);
3594
3595 return (sizeof (struct sockaddr_dl));
3596 }
3597
3598 /*
3599 * ill_xarp_info
3600 * creates xarp info from the device.
3601 */
3602 static int
3603 ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill)
3604 {
3605 sdl->sdl_family = AF_LINK;
3606 sdl->sdl_index = ill->ill_phyint->phyint_ifindex;
3607 sdl->sdl_type = ill->ill_type;
3608 ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data));
3609 sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data);
3610 sdl->sdl_alen = ill->ill_phys_addr_length;
3611 sdl->sdl_slen = 0;
3612 return (sdl->sdl_nlen);
3613 }
3614
3615 static int
3616 loopback_kstat_update(kstat_t *ksp, int rw)
3617 {
3618 kstat_named_t *kn;
3619 netstackid_t stackid;
3620 netstack_t *ns;
3621 ip_stack_t *ipst;
3622
3623 if (ksp == NULL || ksp->ks_data == NULL)
3624 return (EIO);
3625
3626 if (rw == KSTAT_WRITE)
3627 return (EACCES);
3628
3629 kn = KSTAT_NAMED_PTR(ksp);
	stackid = (netstackid_t)(uintptr_t)ksp->ks_private;
3631
3632 ns = netstack_find_by_stackid(stackid);
3633 if (ns == NULL)
3634 return (-1);
3635
3636 ipst = ns->netstack_ip;
3637 if (ipst == NULL) {
3638 netstack_rele(ns);
3639 return (-1);
3640 }
3641 kn[0].value.ui32 = ipst->ips_loopback_packets;
3642 kn[1].value.ui32 = ipst->ips_loopback_packets;
3643 netstack_rele(ns);
3644 return (0);
3645 }
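
/*
 * Both named kstats are filled from the single ips_loopback_packets
 * counter, since every loopback packet is both sent and received. For
 * reference, the kstats created in ill_lookup_on_name() below can be
 * observed from userland with kstat(1M); illustrative output:
 *
 *	$ kstat -m lo
 *	module: lo	instance: 0
 *	name:	lo0	class:	net
 *		ipackets	42
 *		opackets	42
 */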
3646
3647 /*
3648 * Has ifindex been plumbed already?
3649 */
3650 static boolean_t
3651 phyint_exists(uint_t index, ip_stack_t *ipst)
3652 {
3653 ASSERT(index != 0);
3654 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
3655
3656 return (avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
3657 &index, NULL) != NULL);
3658 }
3659
3660 /*
3661 * Pick a unique ifindex.
3662 * When the index counter passes IF_INDEX_MAX for the first time, the wrap
 * flag is set so that the next time ip_assign_ifindex() is called, it
 * falls through and resets the index counter back to 1, the minimum value
 * for the interface index. The logic below assumes that ips_ill_index
 * can hold a value of IF_INDEX_MAX+1 without any loss
 * (i.e. without wrapping back to 0).
3668 */
3669 boolean_t
3670 ip_assign_ifindex(uint_t *indexp, ip_stack_t *ipst)
3671 {
3672 uint_t loops;
3673
3674 if (!ipst->ips_ill_index_wrap) {
3675 *indexp = ipst->ips_ill_index++;
3676 if (ipst->ips_ill_index > IF_INDEX_MAX) {
3677 /*
3678 * Reached the maximum ifindex value, set the wrap
3679 * flag to indicate that it is no longer possible
3680 * to assume that a given index is unallocated.
3681 */
3682 ipst->ips_ill_index_wrap = B_TRUE;
3683 }
3684 return (B_TRUE);
3685 }
3686
3687 if (ipst->ips_ill_index > IF_INDEX_MAX)
3688 ipst->ips_ill_index = 1;
3689
3690 /*
3691 * Start reusing unused indexes. Note that we hold the ill_g_lock
3692 * at this point and don't want to call any function that attempts
3693 * to get the lock again.
3694 */
3695 for (loops = IF_INDEX_MAX; loops > 0; loops--) {
3696 if (!phyint_exists(ipst->ips_ill_index, ipst)) {
3697 /* found unused index - use it */
3698 *indexp = ipst->ips_ill_index;
3699 return (B_TRUE);
3700 }
3701
3702 ipst->ips_ill_index++;
3703 if (ipst->ips_ill_index > IF_INDEX_MAX)
3704 ipst->ips_ill_index = 1;
3705 }
3706
	/*
	 * All interface indices are in use.
	 */
3710 return (B_FALSE);
3711 }
3712
3713 /*
3714 * Assign a unique interface index for the phyint.
3715 */
3716 static boolean_t
3717 phyint_assign_ifindex(phyint_t *phyi, ip_stack_t *ipst)
3718 {
3719 ASSERT(phyi->phyint_ifindex == 0);
3720 return (ip_assign_ifindex(&phyi->phyint_ifindex, ipst));
3721 }
3722
3723 /*
3724 * Initialize the flags on `phyi' as per the provided mactype.
3725 */
3726 static void
3727 phyint_flags_init(phyint_t *phyi, t_uscalar_t mactype)
3728 {
3729 uint64_t flags = 0;
3730
3731 /*
3732 * Initialize PHYI_RUNNING and PHYI_FAILED. For non-IPMP interfaces,
3733 * we always presume the underlying hardware is working and set
3734 * PHYI_RUNNING (if it's not, the driver will subsequently send a
3735 * DL_NOTE_LINK_DOWN message). For IPMP interfaces, at initialization
3736 * there are no active interfaces in the group so we set PHYI_FAILED.
3737 */
3738 if (mactype == SUNW_DL_IPMP)
3739 flags |= PHYI_FAILED;
3740 else
3741 flags |= PHYI_RUNNING;
3742
3743 switch (mactype) {
3744 case SUNW_DL_VNI:
3745 flags |= PHYI_VIRTUAL;
3746 break;
3747 case SUNW_DL_IPMP:
3748 flags |= PHYI_IPMP;
3749 break;
3750 case DL_LOOP:
3751 flags |= (PHYI_LOOPBACK | PHYI_VIRTUAL);
3752 break;
3753 }
3754
3755 mutex_enter(&phyi->phyint_lock);
3756 phyi->phyint_flags |= flags;
3757 mutex_exit(&phyi->phyint_lock);
3758 }
3759
3760 /*
3761 * Return a pointer to the ill which matches the supplied name. Note that
3762 * the ill name length includes the null termination character. (May be
3763 * called as writer.)
 * If do_alloc is set and the interface is "lo0", it will be created
 * automatically. We cannot take a reference on condemned ills, so
 * duplicate detection can't be done using this function.
3767 */
3768 ill_t *
3769 ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
3770 boolean_t *did_alloc, ip_stack_t *ipst)
3771 {
3772 ill_t *ill;
3773 ipif_t *ipif;
3774 ipsq_t *ipsq;
3775 kstat_named_t *kn;
3776 boolean_t isloopback;
3777 in6_addr_t ov6addr;
3778
3779 isloopback = mi_strcmp(name, ipif_loopback_name) == 0;
3780
3781 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3782 ill = ill_find_by_name(name, isv6, ipst);
3783 rw_exit(&ipst->ips_ill_g_lock);
3784 if (ill != NULL)
3785 return (ill);
3786
3787 /*
3788 * Couldn't find it. Does this happen to be a lookup for the
3789 * loopback device and are we allowed to allocate it?
3790 */
3791 if (!isloopback || !do_alloc)
3792 return (NULL);
3793
3794 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
3795 ill = ill_find_by_name(name, isv6, ipst);
3796 if (ill != NULL) {
3797 rw_exit(&ipst->ips_ill_g_lock);
3798 return (ill);
3799 }
3800
3801 /* Create the loopback device on demand */
3802 ill = (ill_t *)(mi_alloc(sizeof (ill_t) +
3803 sizeof (ipif_loopback_name), BPRI_MED));
3804 if (ill == NULL)
3805 goto done;
3806
3807 bzero(ill, sizeof (*ill));
3808 ill->ill_ipst = ipst;
3809 netstack_hold(ipst->ips_netstack);
3810 /*
3811 * For exclusive stacks we set the zoneid to zero
3812 * to make IP operate as if in the global zone.
3813 */
3814 ill->ill_zoneid = GLOBAL_ZONEID;
3815
3816 if (ill_init_common(ill, NULL, isv6, B_TRUE, B_FALSE) != 0)
3817 goto done;
3818
3819 if (!ill_allocate_mibs(ill))
3820 goto done;
3821
3822 ill->ill_current_frag = ill->ill_max_frag;
3823 ill->ill_mtu = ill->ill_max_frag; /* Initial value */
3824 ill->ill_mc_mtu = ill->ill_mtu;
3825 /*
	 * ipif_loopback_name can't be pointed at directly because it's used
3827 * by both the ipv4 and ipv6 interfaces. When the ill is removed
3828 * from the glist, ill_glist_delete() sets the first character of
3829 * ill_name to '\0'.
3830 */
3831 ill->ill_name = (char *)ill + sizeof (*ill);
3832 (void) strcpy(ill->ill_name, ipif_loopback_name);
3833 ill->ill_name_length = sizeof (ipif_loopback_name);
3834 /* Set ill_dlpi_pending for ipsq_current_finish() to work properly */
3835 ill->ill_dlpi_pending = DL_PRIM_INVAL;
3836
3837 ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE, B_TRUE, NULL);
3838 if (ipif == NULL)
3839 goto done;
3840
3841 ill->ill_flags = ILLF_MULTICAST;
3842
3843 ov6addr = ipif->ipif_v6lcl_addr;
3844 /* Set up default loopback address and mask. */
3845 if (!isv6) {
3846 ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK);
3847
3848 IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr);
3849 V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask);
3850 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
3851 ipif->ipif_v6subnet);
3852 ill->ill_flags |= ILLF_IPV4;
3853 } else {
3854 ipif->ipif_v6lcl_addr = ipv6_loopback;
3855 ipif->ipif_v6net_mask = ipv6_all_ones;
3856 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
3857 ipif->ipif_v6subnet);
3858 ill->ill_flags |= ILLF_IPV6;
3859 }
3860
3861 /*
	 * Chain us in at the end of the ill list. Hold the ill
	 * before we make it globally visible. The reference is for the lookup.
3864 */
3865 ill_refhold(ill);
3866
3867 ipsq = ill->ill_phyint->phyint_ipsq;
3868
3869 if (ill_glist_insert(ill, "lo", isv6) != 0)
3870 cmn_err(CE_PANIC, "cannot insert loopback interface");
3871
3872 /* Let SCTP know so that it can add this to its list */
3873 sctp_update_ill(ill, SCTP_ILL_INSERT);
3874
3875 /*
3876 * We have already assigned ipif_v6lcl_addr above, but we need to
	 * call sctp_update_ipif_addr() after SCTP_ILL_INSERT, which in
	 * turn must come after ill_glist_insert() since we need the
	 * ill_index set. Pass on ov6addr as the old address.
3880 */
3881 sctp_update_ipif_addr(ipif, ov6addr);
3882
3883 ip_rts_newaddrmsg(RTM_CHGADDR, 0, ipif, RTSQ_DEFAULT);
3884
3885 /*
3886 * ill_glist_insert() -> ill_phyint_reinit() may have merged IPSQs.
3887 * If so, free our original one.
3888 */
3889 if (ipsq != ill->ill_phyint->phyint_ipsq)
3890 ipsq_delete(ipsq);
3891
3892 if (ipst->ips_loopback_ksp == NULL) {
3893 /* Export loopback interface statistics */
3894 ipst->ips_loopback_ksp = kstat_create_netstack("lo", 0,
3895 ipif_loopback_name, "net",
3896 KSTAT_TYPE_NAMED, 2, 0,
3897 ipst->ips_netstack->netstack_stackid);
3898 if (ipst->ips_loopback_ksp != NULL) {
3899 ipst->ips_loopback_ksp->ks_update =
3900 loopback_kstat_update;
3901 kn = KSTAT_NAMED_PTR(ipst->ips_loopback_ksp);
3902 kstat_named_init(&kn[0], "ipackets", KSTAT_DATA_UINT32);
3903 kstat_named_init(&kn[1], "opackets", KSTAT_DATA_UINT32);
3904 ipst->ips_loopback_ksp->ks_private =
3905 (void *)(uintptr_t)ipst->ips_netstack->
3906 netstack_stackid;
3907 kstat_install(ipst->ips_loopback_ksp);
3908 }
3909 }
3910
3911 *did_alloc = B_TRUE;
3912 rw_exit(&ipst->ips_ill_g_lock);
3913 ill_nic_event_dispatch(ill, MAP_IPIF_ID(ill->ill_ipif->ipif_id),
3914 NE_PLUMB, ill->ill_name, ill->ill_name_length);
3915 return (ill);
3916 done:
3917 if (ill != NULL) {
3918 if (ill->ill_phyint != NULL) {
3919 ipsq = ill->ill_phyint->phyint_ipsq;
3920 if (ipsq != NULL) {
3921 ipsq->ipsq_phyint = NULL;
3922 ipsq_delete(ipsq);
3923 }
3924 mi_free(ill->ill_phyint);
3925 }
3926 ill_free_mib(ill);
3927 if (ill->ill_ipst != NULL)
3928 netstack_rele(ill->ill_ipst->ips_netstack);
3929 mi_free(ill);
3930 }
3931 rw_exit(&ipst->ips_ill_g_lock);
3932 return (NULL);
3933 }
3934
3935 /*
 * For IPP calls: use the ip_stack_t of the global stack.
3937 */
3938 ill_t *
3939 ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6)
3940 {
3941 ip_stack_t *ipst;
3942 ill_t *ill;
3943 netstack_t *ns;
3944
3945 ns = netstack_find_by_stackid(GLOBAL_NETSTACKID);
3946
3947 if ((ipst = ns->netstack_ip) == NULL) {
3948 cmn_err(CE_WARN, "No ip_stack_t for zoneid zero!\n");
3949 netstack_rele(ns);
3950 return (NULL);
3951 }
3952
3953 ill = ill_lookup_on_ifindex(index, isv6, ipst);
3954 netstack_rele(ns);
3955 return (ill);
3956 }
3957
3958 /*
3959 * Return a pointer to the ill which matches the index and IP version type.
3960 */
3961 ill_t *
3962 ill_lookup_on_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst)
3963 {
3964 ill_t *ill;
3965 phyint_t *phyi;
3966
3967 /*
3968 * Indexes are stored in the phyint - a common structure
3969 * to both IPv4 and IPv6.
3970 */
3971 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3972 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
3973 (void *) &index, NULL);
3974 if (phyi != NULL) {
3975 ill = isv6 ? phyi->phyint_illv6: phyi->phyint_illv4;
3976 if (ill != NULL) {
3977 mutex_enter(&ill->ill_lock);
3978 if (!ILL_IS_CONDEMNED(ill)) {
3979 ill_refhold_locked(ill);
3980 mutex_exit(&ill->ill_lock);
3981 rw_exit(&ipst->ips_ill_g_lock);
3982 return (ill);
3983 }
3984 mutex_exit(&ill->ill_lock);
3985 }
3986 }
3987 rw_exit(&ipst->ips_ill_g_lock);
3988 return (NULL);
3989 }
3990
3991 /*
3992 * Verify whether or not an interface index is valid for the specified zoneid
3993 * to transmit packets.
3994 * It can be zero (meaning "reset") or an interface index assigned
 * to a non-VNI interface. (We don't use VNI interfaces to send packets.)
3996 */
3997 boolean_t
3998 ip_xmit_ifindex_valid(uint_t ifindex, zoneid_t zoneid, boolean_t isv6,
3999 ip_stack_t *ipst)
4000 {
4001 ill_t *ill;
4002
4003 if (ifindex == 0)
4004 return (B_TRUE);
4005
4006 ill = ill_lookup_on_ifindex_zoneid(ifindex, zoneid, isv6, ipst);
4007 if (ill == NULL)
4008 return (B_FALSE);
4009 if (IS_VNI(ill)) {
4010 ill_refrele(ill);
4011 return (B_FALSE);
4012 }
4013 ill_refrele(ill);
4014 return (B_TRUE);
4015 }
4016
4017 /*
4018 * Return the ifindex next in sequence after the passed in ifindex.
4019 * If there is no next ifindex for the given protocol, return 0.
4020 */
4021 uint_t
4022 ill_get_next_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst)
4023 {
4024 phyint_t *phyi;
4025 phyint_t *phyi_initial;
4026 uint_t ifindex;
4027
4028 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4029
4030 if (index == 0) {
4031 phyi = avl_first(
4032 &ipst->ips_phyint_g_list->phyint_list_avl_by_index);
4033 } else {
4034 phyi = phyi_initial = avl_find(
4035 &ipst->ips_phyint_g_list->phyint_list_avl_by_index,
4036 (void *) &index, NULL);
4037 }
4038
4039 for (; phyi != NULL;
4040 phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
4041 phyi, AVL_AFTER)) {
4042 /*
4043 * If we're not returning the first interface in the tree
4044 * and we still haven't moved past the phyint_t that
		 * corresponds to index, avl_walk needs to be called again.
4046 */
4047 if (!((index != 0) && (phyi == phyi_initial))) {
4048 if (isv6) {
4049 if ((phyi->phyint_illv6) &&
4050 ILL_CAN_LOOKUP(phyi->phyint_illv6) &&
4051 (phyi->phyint_illv6->ill_isv6 == 1))
4052 break;
4053 } else {
4054 if ((phyi->phyint_illv4) &&
4055 ILL_CAN_LOOKUP(phyi->phyint_illv4) &&
4056 (phyi->phyint_illv4->ill_isv6 == 0))
4057 break;
4058 }
4059 }
4060 }
4061
4062 rw_exit(&ipst->ips_ill_g_lock);
4063
4064 if (phyi != NULL)
4065 ifindex = phyi->phyint_ifindex;
4066 else
4067 ifindex = 0;
4068
4069 return (ifindex);
4070 }
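
/*
 * Illustrative iteration pattern (a sketch): starting from index 0,
 * this visits every plumbed ifindex for the given protocol until the
 * function returns 0.
 *
 *	uint_t idx = 0;
 *
 *	while ((idx = ill_get_next_ifindex(idx, isv6, ipst)) != 0) {
 *		... process the interface with ifindex idx ...
 *	}
 */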
4071
4072 /*
4073 * Return the ifindex for the named interface.
 * If there is no such interface, return 0.
4075 */
4076 uint_t
4077 ill_get_ifindex_by_name(char *name, ip_stack_t *ipst)
4078 {
4079 phyint_t *phyi;
4080 avl_index_t where = 0;
4081 uint_t ifindex;
4082
4083 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4084
4085 if ((phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
4086 name, &where)) == NULL) {
4087 rw_exit(&ipst->ips_ill_g_lock);
4088 return (0);
4089 }
4090
4091 ifindex = phyi->phyint_ifindex;
4092
4093 rw_exit(&ipst->ips_ill_g_lock);
4094
4095 return (ifindex);
4096 }
4097
4098 /*
 * Return the ifindex to be used by upper layer protocols, for instance
 * for IPV6_RECVPKTINFO. For an interface under IPMP, this is the ifindex
 * of the IPMP meta-interface (the upper ill).
4101 */
4102 uint_t
4103 ill_get_upper_ifindex(const ill_t *ill)
4104 {
4105 if (IS_UNDER_IPMP(ill))
4106 return (ipmp_ill_get_ipmp_ifindex(ill));
4107 else
4108 return (ill->ill_phyint->phyint_ifindex);
4109 }
4110
4112 /*
4113 * Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt
4114 * that gives a running thread a reference to the ill. This reference must be
4115 * released by the thread when it is done accessing the ill and related
 * objects. ill_refcnt cannot be used to account for static references
 * such as other structures pointing to an ill. Callers must generally
 * check whether an ill can be refheld by using the ILL_CAN_LOOKUP macros
4119 * or be sure that the ill is not being deleted or changing state before
4120 * calling the refhold functions. A non-zero ill_refcnt ensures that the
4121 * ill won't change any of its critical state such as address, netmask etc.
4122 */
4123 void
4124 ill_refhold(ill_t *ill)
4125 {
4126 mutex_enter(&ill->ill_lock);
4127 ill->ill_refcnt++;
4128 ILL_TRACE_REF(ill);
4129 mutex_exit(&ill->ill_lock);
4130 }
4131
4132 void
4133 ill_refhold_locked(ill_t *ill)
4134 {
4135 ASSERT(MUTEX_HELD(&ill->ill_lock));
4136 ill->ill_refcnt++;
4137 ILL_TRACE_REF(ill);
4138 }
4139
4140 /* Returns true if we managed to get a refhold */
4141 boolean_t
4142 ill_check_and_refhold(ill_t *ill)
4143 {
4144 mutex_enter(&ill->ill_lock);
4145 if (!ILL_IS_CONDEMNED(ill)) {
4146 ill_refhold_locked(ill);
4147 mutex_exit(&ill->ill_lock);
4148 return (B_TRUE);
4149 }
4150 mutex_exit(&ill->ill_lock);
4151 return (B_FALSE);
4152 }
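
/*
 * Illustrative usage (a sketch): a thread that has located an ill but
 * does not yet hold a reference uses the check-and-hold idiom to avoid
 * racing with teardown.
 *
 *	if (ill_check_and_refhold(ill)) {
 *		... safe to use ill here ...
 *		ill_refrele(ill);
 *	}
 */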
4153
4154 /*
4155 * Must not be called while holding any locks. Otherwise if this is
4156 * the last reference to be released, there is a chance of recursive mutex
4157 * panic due to ill_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
4158 * to restart an ioctl.
4159 */
4160 void
4161 ill_refrele(ill_t *ill)
4162 {
4163 mutex_enter(&ill->ill_lock);
4164 ASSERT(ill->ill_refcnt != 0);
4165 ill->ill_refcnt--;
4166 ILL_UNTRACE_REF(ill);
4167 if (ill->ill_refcnt != 0) {
4168 /* Every ire pointing to the ill adds 1 to ill_refcnt */
4169 mutex_exit(&ill->ill_lock);
4170 return;
4171 }
4172
4173 /* Drops the ill_lock */
4174 ipif_ill_refrele_tail(ill);
4175 }
4176
4177 /*
4178 * Obtain a weak reference count on the ill. This reference ensures the
4179 * ill won't be freed, but the ill may change any of its critical state
 * such as netmask, address etc. Returns B_FALSE if the ill has started
4181 * closing.
4182 */
4183 boolean_t
4184 ill_waiter_inc(ill_t *ill)
4185 {
4186 mutex_enter(&ill->ill_lock);
4187 if (ill->ill_state_flags & ILL_CONDEMNED) {
4188 mutex_exit(&ill->ill_lock);
4189 return (B_FALSE);
4190 }
4191 ill->ill_waiters++;
4192 mutex_exit(&ill->ill_lock);
4193 return (B_TRUE);
4194 }
4195
4196 void
4197 ill_waiter_dcr(ill_t *ill)
4198 {
4199 mutex_enter(&ill->ill_lock);
4200 ill->ill_waiters--;
4201 if (ill->ill_waiters == 0)
4202 cv_broadcast(&ill->ill_cv);
4203 mutex_exit(&ill->ill_lock);
4204 }
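
/*
 * Illustrative usage of the waiter count (a sketch): a thread that only
 * needs the ill not to be freed, but can tolerate its state changing,
 * brackets its access as follows.
 *
 *	if (ill_waiter_inc(ill)) {
 *		... the ill will not be freed here, though its
 *		... addresses, netmask etc. may change
 *		ill_waiter_dcr(ill);
 *	}
 */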
4205
4206 /*
4207 * ip_ll_subnet_defaults is called when we get the DL_INFO_ACK back from the
4208 * driver. We construct best guess defaults for lower level information that
4209 * we need. If an interface is brought up without injection of any overriding
4210 * information from outside, we have to be ready to go with these defaults.
4211 * When we get the first DL_INFO_ACK (from ip_open() sending a DL_INFO_REQ)
 * we primarily want the dl_provider_style.
4213 * The subsequent DL_INFO_ACK is received after doing a DL_ATTACH and DL_BIND
4214 * at which point we assume the other part of the information is valid.
4215 */
4216 void
4217 ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp)
4218 {
4219 uchar_t *brdcst_addr;
4220 uint_t brdcst_addr_length, phys_addr_length;
4221 t_scalar_t sap_length;
4222 dl_info_ack_t *dlia;
4223 ip_m_t *ipm;
4224 dl_qos_cl_sel1_t *sel1;
4225 int min_mtu;
4226
4227 ASSERT(IAM_WRITER_ILL(ill));
4228
4229 /*
	 * Until the ill is fully up it is not globally visible,
	 * so there is no need for a lock.
4232 */
4233 dlia = (dl_info_ack_t *)mp->b_rptr;
4234 ill->ill_mactype = dlia->dl_mac_type;
4235
4236 ipm = ip_m_lookup(dlia->dl_mac_type);
4237 if (ipm == NULL) {
4238 ipm = ip_m_lookup(DL_OTHER);
4239 ASSERT(ipm != NULL);
4240 }
4241 ill->ill_media = ipm;
4242
4243 /*
4244 * When the new DLPI stuff is ready we'll pull lengths
4245 * from dlia.
4246 */
4247 if (dlia->dl_version == DL_VERSION_2) {
4248 brdcst_addr_length = dlia->dl_brdcst_addr_length;
4249 brdcst_addr = mi_offset_param(mp, dlia->dl_brdcst_addr_offset,
4250 brdcst_addr_length);
4251 if (brdcst_addr == NULL) {
4252 brdcst_addr_length = 0;
4253 }
4254 sap_length = dlia->dl_sap_length;
4255 phys_addr_length = dlia->dl_addr_length - ABS(sap_length);
4256 ip1dbg(("ip: bcast_len %d, sap_len %d, phys_len %d\n",
4257 brdcst_addr_length, sap_length, phys_addr_length));
4258 } else {
4259 brdcst_addr_length = 6;
4260 brdcst_addr = ip_six_byte_all_ones;
4261 sap_length = -2;
4262 phys_addr_length = brdcst_addr_length;
4263 }
4264
4265 ill->ill_bcast_addr_length = brdcst_addr_length;
4266 ill->ill_phys_addr_length = phys_addr_length;
4267 ill->ill_sap_length = sap_length;
4268
4269 /*
4270 * Synthetic DLPI types such as SUNW_DL_IPMP specify a zero SDU,
4271 * but we must ensure a minimum IP MTU is used since other bits of
4272 * IP will fly apart otherwise.
4273 */
4274 min_mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU;
4275 ill->ill_max_frag = MAX(min_mtu, dlia->dl_max_sdu);
4276 ill->ill_current_frag = ill->ill_max_frag;
4277 ill->ill_mtu = ill->ill_max_frag;
4278 ill->ill_mc_mtu = ill->ill_mtu; /* Overridden by DL_NOTE_SDU_SIZE2 */
4279
4280 ill->ill_type = ipm->ip_m_type;
4281
4282 if (!ill->ill_dlpi_style_set) {
4283 if (dlia->dl_provider_style == DL_STYLE2)
4284 ill->ill_needs_attach = 1;
4285
4286 phyint_flags_init(ill->ill_phyint, ill->ill_mactype);
4287
4288 /*
4289 * Allocate the first ipif on this ill. We don't delay it
4290 * further as ioctl handling assumes at least one ipif exists.
4291 *
4292 * At this point we don't know whether the ill is v4 or v6.
		 * We will know this when the SIOCSLIFNAME happens and
4294 * the correct value for ill_isv6 will be assigned in
4295 * ipif_set_values(). We need to hold the ill lock and
4296 * clear the ILL_LL_SUBNET_PENDING flag and atomically do
4297 * the wakeup.
4298 */
4299 (void) ipif_allocate(ill, 0, IRE_LOCAL,
4300 dlia->dl_provider_style != DL_STYLE2, B_TRUE, NULL);
4301 mutex_enter(&ill->ill_lock);
4302 ASSERT(ill->ill_dlpi_style_set == 0);
4303 ill->ill_dlpi_style_set = 1;
4304 ill->ill_state_flags &= ~ILL_LL_SUBNET_PENDING;
4305 cv_broadcast(&ill->ill_cv);
4306 mutex_exit(&ill->ill_lock);
4307 freemsg(mp);
4308 return;
4309 }
4310 ASSERT(ill->ill_ipif != NULL);
4311 /*
4312 * We know whether it is IPv4 or IPv6 now, as this is the
	 * second DL_INFO_ACK we are receiving in response to the
4314 * DL_INFO_REQ sent in ipif_set_values.
4315 */
4316 ill->ill_sap = (ill->ill_isv6) ? ipm->ip_m_ipv6sap : ipm->ip_m_ipv4sap;
4317 /*
4318 * Clear all the flags that were set based on ill_bcast_addr_length
4319 * and ill_phys_addr_length (in ipif_set_values) as these could have
4320 * changed now and we need to re-evaluate.
4321 */
4322 ill->ill_flags &= ~(ILLF_MULTICAST | ILLF_NONUD | ILLF_NOARP);
4323 ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT);
4324
4325 /*
4326 * Free ill_bcast_mp as things could have changed now.
4327 *
4328 * NOTE: The IPMP meta-interface is special-cased because it starts
4329 * with no underlying interfaces (and thus an unknown broadcast
4330 * address length), but we enforce that an interface is broadcast-
4331 * capable as part of allowing it to join a group.
4332 */
4333 if (ill->ill_bcast_addr_length == 0 && !IS_IPMP(ill)) {
4334 if (ill->ill_bcast_mp != NULL)
4335 freemsg(ill->ill_bcast_mp);
4336 ill->ill_net_type = IRE_IF_NORESOLVER;
4337
4338 ill->ill_bcast_mp = ill_dlur_gen(NULL,
4339 ill->ill_phys_addr_length,
4340 ill->ill_sap,
4341 ill->ill_sap_length);
4342
4343 if (ill->ill_isv6)
4344 /*
4345 * Note: xresolv interfaces will eventually need NOARP
4346 * set here as well, but that will require those
4347 * external resolvers to have some knowledge of
4348 * that flag and act appropriately. Not to be changed
4349 * at present.
4350 */
4351 ill->ill_flags |= ILLF_NONUD;
4352 else
4353 ill->ill_flags |= ILLF_NOARP;
4354
4355 if (ill->ill_mactype == SUNW_DL_VNI) {
4356 ill->ill_ipif->ipif_flags |= IPIF_NOXMIT;
4357 } else if (ill->ill_phys_addr_length == 0 ||
4358 ill->ill_mactype == DL_IPV4 ||
4359 ill->ill_mactype == DL_IPV6) {
4360 /*
			 * The underlying link is point-to-point, so mark the
4362 * interface as such. We can do IP multicast over
4363 * such a link since it transmits all network-layer
4364 * packets to the remote side the same way.
4365 */
4366 ill->ill_flags |= ILLF_MULTICAST;
4367 ill->ill_ipif->ipif_flags |= IPIF_POINTOPOINT;
4368 }
4369 } else {
4370 ill->ill_net_type = IRE_IF_RESOLVER;
4371 if (ill->ill_bcast_mp != NULL)
4372 freemsg(ill->ill_bcast_mp);
4373 ill->ill_bcast_mp = ill_dlur_gen(brdcst_addr,
4374 ill->ill_bcast_addr_length, ill->ill_sap,
4375 ill->ill_sap_length);
4376 /*
4377 * Later detect lack of DLPI driver multicast
4378 * capability by catching DL_ENABMULTI errors in
4379 * ip_rput_dlpi.
4380 */
4381 ill->ill_flags |= ILLF_MULTICAST;
4382 if (!ill->ill_isv6)
4383 ill->ill_ipif->ipif_flags |= IPIF_BROADCAST;
4384 }
4385
4386 /* For IPMP, PHYI_IPMP should already be set by phyint_flags_init() */
4387 if (ill->ill_mactype == SUNW_DL_IPMP)
4388 ASSERT(ill->ill_phyint->phyint_flags & PHYI_IPMP);
4389
4390 /* By default an interface does not support any CoS marking */
4391 ill->ill_flags &= ~ILLF_COS_ENABLED;
4392
4393 /*
	 * If we get QoS information in the DL_INFO_ACK, the device supports
	 * some form of CoS marking; set ILLF_COS_ENABLED.
4396 */
4397 sel1 = (dl_qos_cl_sel1_t *)mi_offset_param(mp, dlia->dl_qos_offset,
4398 dlia->dl_qos_length);
4399 if ((sel1 != NULL) && (sel1->dl_qos_type == DL_QOS_CL_SEL1)) {
4400 ill->ill_flags |= ILLF_COS_ENABLED;
4401 }
4402
4403 /* Clear any previous error indication. */
4404 ill->ill_error = 0;
4405 freemsg(mp);
4406 }
4407
4408 /*
4409 * Perform various checks to verify that an address would make sense as a
4410 * local, remote, or subnet interface address.
4411 */
4412 static boolean_t
4413 ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask)
4414 {
4415 ipaddr_t net_mask;
4416
4417 /*
	 * Don't allow all zeroes or all ones, but do allow
	 * an all-ones netmask.
4420 */
4421 if ((net_mask = ip_net_mask(addr)) == 0)
4422 return (B_FALSE);
4423 /* A given netmask overrides the "guess" netmask */
4424 if (subnet_mask != 0)
4425 net_mask = subnet_mask;
4426 if ((net_mask != ~(ipaddr_t)0) && ((addr == (addr & net_mask)) ||
4427 (addr == (addr | ~net_mask)))) {
4428 return (B_FALSE);
4429 }
4430
4431 /*
	 * Even if the netmask is all ones, we do not allow the address to be
	 * 255.255.255.255.
4434 */
4435 if (addr == INADDR_BROADCAST)
4436 return (B_FALSE);
4437
4438 if (CLASSD(addr))
4439 return (B_FALSE);
4440
4441 return (B_TRUE);
4442 }
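
/*
 * Worked example (informational): with addr 192.168.1.5 and subnet_mask
 * 255.255.255.0 the checks above pass. With the same mask, 192.168.1.0
 * (addr == (addr & net_mask), the subnet address) and 192.168.1.255
 * (addr == (addr | ~net_mask), the subnet broadcast) are both rejected,
 * as are 255.255.255.255 and any class D (multicast) address.
 */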
4443
4444 #define V6_IPIF_LINKLOCAL(p) \
4445 IN6_IS_ADDR_LINKLOCAL(&(p)->ipif_v6lcl_addr)
4446
4447 /*
4448 * Compare two given ipifs and check if the second one is better than
4449 * the first one using the order of preference (not taking deprecated
 * into account) specified in ipif_lookup_multicast().
4451 */
4452 static boolean_t
4453 ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6)
4454 {
4455 /* Check the least preferred first. */
4456 if (IS_LOOPBACK(old_ipif->ipif_ill)) {
		/* If both are loopback, use the first one. */
4458 if (IS_LOOPBACK(new_ipif->ipif_ill))
4459 return (B_FALSE);
4460 else
4461 return (B_TRUE);
4462 }
4463
4464 /* For IPv6, check for link local address. */
4465 if (isv6 && V6_IPIF_LINKLOCAL(old_ipif)) {
4466 if (IS_LOOPBACK(new_ipif->ipif_ill) ||
4467 V6_IPIF_LINKLOCAL(new_ipif)) {
4468 /* The second one is equal or less preferred. */
4469 return (B_FALSE);
4470 } else {
4471 return (B_TRUE);
4472 }
4473 }
4474
4475 /* Then check for point to point interface. */
4476 if (old_ipif->ipif_flags & IPIF_POINTOPOINT) {
4477 if (IS_LOOPBACK(new_ipif->ipif_ill) ||
4478 (isv6 && V6_IPIF_LINKLOCAL(new_ipif)) ||
4479 (new_ipif->ipif_flags & IPIF_POINTOPOINT)) {
4480 return (B_FALSE);
4481 } else {
4482 return (B_TRUE);
4483 }
4484 }
4485
4486 /* old_ipif is a normal interface, so no need to use the new one. */
4487 return (B_FALSE);
4488 }
4489
4490 /*
 * Find a multicast-capable ipif given an IP instance and zoneid.
 * The ipif must be up, and its ill must be multicast-capable, not
4493 * condemned, not an underlying interface in an IPMP group, and
4494 * not a VNI interface. Order of preference:
4495 *
4496 * 1a. normal
4497 * 1b. normal, but deprecated
4498 * 2a. point to point
4499 * 2b. point to point, but deprecated
4500 * 3a. link local
4501 * 3b. link local, but deprecated
4502 * 4. loopback.
4503 */
4504 static ipif_t *
4505 ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6)
4506 {
4507 ill_t *ill;
4508 ill_walk_context_t ctx;
4509 ipif_t *ipif;
4510 ipif_t *saved_ipif = NULL;
4511 ipif_t *dep_ipif = NULL;
4512
4513 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4514 if (isv6)
4515 ill = ILL_START_WALK_V6(&ctx, ipst);
4516 else
4517 ill = ILL_START_WALK_V4(&ctx, ipst);
4518
4519 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
4520 mutex_enter(&ill->ill_lock);
4521 if (IS_VNI(ill) || IS_UNDER_IPMP(ill) ||
4522 ILL_IS_CONDEMNED(ill) ||
4523 !(ill->ill_flags & ILLF_MULTICAST)) {
4524 mutex_exit(&ill->ill_lock);
4525 continue;
4526 }
4527 for (ipif = ill->ill_ipif; ipif != NULL;
4528 ipif = ipif->ipif_next) {
4529 if (zoneid != ipif->ipif_zoneid &&
4530 zoneid != ALL_ZONES &&
4531 ipif->ipif_zoneid != ALL_ZONES) {
4532 continue;
4533 }
4534 if (!(ipif->ipif_flags & IPIF_UP) ||
4535 IPIF_IS_CONDEMNED(ipif)) {
4536 continue;
4537 }
4538
4539 /*
4540 * Found one candidate. If it is deprecated,
4541 * remember it in dep_ipif. If it is not deprecated,
4542 * remember it in saved_ipif.
4543 */
4544 if (ipif->ipif_flags & IPIF_DEPRECATED) {
4545 if (dep_ipif == NULL) {
4546 dep_ipif = ipif;
4547 } else if (ipif_comp_multi(dep_ipif, ipif,
4548 isv6)) {
4549 /*
4550 * If the previous dep_ipif does not
4551 * belong to the same ill, we've done
					 * an ipif_refhold() on it. So we need
4553 * to release it.
4554 */
4555 if (dep_ipif->ipif_ill != ill)
4556 ipif_refrele(dep_ipif);
4557 dep_ipif = ipif;
4558 }
4559 continue;
4560 }
4561 if (saved_ipif == NULL) {
4562 saved_ipif = ipif;
4563 } else {
4564 if (ipif_comp_multi(saved_ipif, ipif, isv6)) {
4565 if (saved_ipif->ipif_ill != ill)
4566 ipif_refrele(saved_ipif);
4567 saved_ipif = ipif;
4568 }
4569 }
4570 }
4571 /*
		 * Before going to the next ill, do an ipif_refhold() on the
4573 * saved ones.
4574 */
4575 if (saved_ipif != NULL && saved_ipif->ipif_ill == ill)
4576 ipif_refhold_locked(saved_ipif);
4577 if (dep_ipif != NULL && dep_ipif->ipif_ill == ill)
4578 ipif_refhold_locked(dep_ipif);
4579 mutex_exit(&ill->ill_lock);
4580 }
4581 rw_exit(&ipst->ips_ill_g_lock);
4582
4583 /*
4584 * If we have only the saved_ipif, return it. But if we have both
4585 * saved_ipif and dep_ipif, check to see which one is better.
4586 */
4587 if (saved_ipif != NULL) {
4588 if (dep_ipif != NULL) {
4589 if (ipif_comp_multi(saved_ipif, dep_ipif, isv6)) {
4590 ipif_refrele(saved_ipif);
4591 return (dep_ipif);
4592 } else {
4593 ipif_refrele(dep_ipif);
4594 return (saved_ipif);
4595 }
4596 }
4597 return (saved_ipif);
4598 } else {
4599 return (dep_ipif);
4600 }
4601 }
4602
4603 ill_t *
4604 ill_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6)
4605 {
4606 ipif_t *ipif;
4607 ill_t *ill;
4608
4609 ipif = ipif_lookup_multicast(ipst, zoneid, isv6);
4610 if (ipif == NULL)
4611 return (NULL);
4612
4613 ill = ipif->ipif_ill;
4614 ill_refhold(ill);
4615 ipif_refrele(ipif);
4616 return (ill);
4617 }
4618
4619 /*
4620 * This function is called when an application does not specify an interface
4621 * to be used for multicast traffic (joining a group/sending data). It
4622 * calls ire_lookup_multi() to look for an interface route for the
4623 * specified multicast group. Doing this allows the administrator to add
4624 * prefix routes for multicast to indicate which interface to be used for
4625 * multicast traffic in the above scenario. The route could be for all
4626 * multicast (224.0/4), for a single multicast group (a /32 route) or
4627 * anything in between. If there is no such multicast route, we just find
 * any multicast capable interface and return it. The returned ill
 * is refhold'ed.
4630 *
4631 * We support MULTIRT and RTF_SETSRC on the multicast routes added to the
4632 * unicast table. This is used by CGTP.
4633 */
4634 ill_t *
4635 ill_lookup_group_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst,
4636 boolean_t *multirtp, ipaddr_t *setsrcp)
4637 {
4638 ill_t *ill;
4639
4640 ill = ire_lookup_multi_ill_v4(group, zoneid, ipst, multirtp, setsrcp);
4641 if (ill != NULL)
4642 return (ill);
4643
4644 return (ill_lookup_multicast(ipst, zoneid, B_FALSE));
4645 }
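
/*
 * Illustrative caller pattern for the lookup above (a sketch; the
 * multirtp/setsrcp out-parameters are only set when the matching route
 * carries RTF_MULTIRT/RTF_SETSRC):
 *
 *	boolean_t multirt = B_FALSE;
 *	ipaddr_t setsrc = INADDR_ANY;
 *	ill_t *ill;
 *
 *	ill = ill_lookup_group_v4(group, zoneid, ipst, &multirt, &setsrc);
 *	if (ill != NULL) {
 *		... transmit via ill, honoring multirt/setsrc ...
 *		ill_refrele(ill);
 *	}
 */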
4646
4647 /*
4648 * Look for an ipif with the specified interface address and destination.
4649 * The destination address is used only for matching point-to-point interfaces.
4650 */
4651 ipif_t *
4652 ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, ip_stack_t *ipst)
4653 {
4654 ipif_t *ipif;
4655 ill_t *ill;
4656 ill_walk_context_t ctx;
4657
4658 /*
4659 * First match all the point-to-point interfaces
4660 * before looking at non-point-to-point interfaces.
	 * This is done to avoid returning a non-point-to-point
	 * ipif instead of an unnumbered point-to-point ipif.
4663 */
4664 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4665 ill = ILL_START_WALK_V4(&ctx, ipst);
4666 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
4667 mutex_enter(&ill->ill_lock);
4668 for (ipif = ill->ill_ipif; ipif != NULL;
4669 ipif = ipif->ipif_next) {
4670 /* Allow the ipif to be down */
4671 if ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
4672 (ipif->ipif_lcl_addr == if_addr) &&
4673 (ipif->ipif_pp_dst_addr == dst)) {
4674 if (!IPIF_IS_CONDEMNED(ipif)) {
4675 ipif_refhold_locked(ipif);
4676 mutex_exit(&ill->ill_lock);
4677 rw_exit(&ipst->ips_ill_g_lock);
4678 return (ipif);
4679 }
4680 }
4681 }
4682 mutex_exit(&ill->ill_lock);
4683 }
4684 rw_exit(&ipst->ips_ill_g_lock);
4685
4686 /* lookup the ipif based on interface address */
4687 ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, ipst);
4688 ASSERT(ipif == NULL || !ipif->ipif_isv6);
4689 return (ipif);
4690 }
4691
4692 /*
4693 * Common function for ipif_lookup_addr() and ipif_lookup_addr_exact().
4694 */
4695 static ipif_t *
4696 ipif_lookup_addr_common(ipaddr_t addr, ill_t *match_ill, uint32_t match_flags,
4697 zoneid_t zoneid, ip_stack_t *ipst)
4698 {
4699 ipif_t *ipif;
4700 ill_t *ill;
4701 boolean_t ptp = B_FALSE;
4702 ill_walk_context_t ctx;
4703 boolean_t match_illgrp = (match_flags & IPIF_MATCH_ILLGRP);
4704 boolean_t no_duplicate = (match_flags & IPIF_MATCH_NONDUP);
4705
4706 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4707 /*
4708 * Repeat twice, first based on local addresses and
	 * then for point-to-point interfaces.
4710 */
4711 repeat:
4712 ill = ILL_START_WALK_V4(&ctx, ipst);
4713 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
4714 if (match_ill != NULL && ill != match_ill &&
4715 (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) {
4716 continue;
4717 }
4718 mutex_enter(&ill->ill_lock);
4719 for (ipif = ill->ill_ipif; ipif != NULL;
4720 ipif = ipif->ipif_next) {
4721 if (zoneid != ALL_ZONES &&
4722 zoneid != ipif->ipif_zoneid &&
4723 ipif->ipif_zoneid != ALL_ZONES)
4724 continue;
4725
4726 if (no_duplicate && !(ipif->ipif_flags & IPIF_UP))
4727 continue;
4728
4729 /* Allow the ipif to be down */
4730 if ((!ptp && (ipif->ipif_lcl_addr == addr) &&
4731 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
4732 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) &&
4733 (ipif->ipif_pp_dst_addr == addr))) {
4734 if (!IPIF_IS_CONDEMNED(ipif)) {
4735 ipif_refhold_locked(ipif);
4736 mutex_exit(&ill->ill_lock);
4737 rw_exit(&ipst->ips_ill_g_lock);
4738 return (ipif);
4739 }
4740 }
4741 }
4742 mutex_exit(&ill->ill_lock);
4743 }
4744
4745 /* If we already did the ptp case, then we are done */
4746 if (ptp) {
4747 rw_exit(&ipst->ips_ill_g_lock);
4748 return (NULL);
4749 }
4750 ptp = B_TRUE;
4751 goto repeat;
4752 }
4753
4754 /*
4755 * Lookup an ipif with the specified address. For point-to-point links we
4756 * look for matches on either the destination address or the local address,
4757 * but we skip the local address check if IPIF_UNNUMBERED is set. If the
4758 * `match_ill' argument is non-NULL, the lookup is restricted to that ill
4759 * (or illgrp if `match_ill' is in an IPMP group).
4760 */
4761 ipif_t *
4762 ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid,
4763 ip_stack_t *ipst)
4764 {
4765 return (ipif_lookup_addr_common(addr, match_ill, IPIF_MATCH_ILLGRP,
4766 zoneid, ipst));
4767 }
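
/*
 * Illustrative usage (a sketch, mirroring callers such as ip_rt_add()
 * below): the returned ipif is refheld and must be released.
 *
 *	ipif = ipif_lookup_addr(gw_addr, NULL, ALL_ZONES, ipst);
 *	if (ipif != NULL) {
 *		... inspect ipif ...
 *		ipif_refrele(ipif);
 *	}
 */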
4768
4769 /*
4770 * Lookup an ipif with the specified address. Similar to ipif_lookup_addr,
4771 * except that we will only return an address if it is not marked as
4772 * IPIF_DUPLICATE
4773 */
4774 ipif_t *
4775 ipif_lookup_addr_nondup(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid,
4776 ip_stack_t *ipst)
4777 {
4778 return (ipif_lookup_addr_common(addr, match_ill,
4779 (IPIF_MATCH_ILLGRP | IPIF_MATCH_NONDUP),
4780 zoneid, ipst));
4781 }
4782
4783 /*
4784 * Special abbreviated version of ipif_lookup_addr() that doesn't match
4785 * `match_ill' across the IPMP group. This function is only needed in some
4786 * corner-cases; almost everything should use ipif_lookup_addr().
4787 */
4788 ipif_t *
4789 ipif_lookup_addr_exact(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst)
4790 {
4791 ASSERT(match_ill != NULL);
4792 return (ipif_lookup_addr_common(addr, match_ill, 0, ALL_ZONES,
4793 ipst));
4794 }
4795
4796 /*
 * Look for an ipif with the specified address. For point-to-point links
 * we look for matches on either the destination address or the local
4799 * address, but we ignore the check on the local address if IPIF_UNNUMBERED
4800 * is set.
4801 * If the `match_ill' argument is non-NULL, the lookup is restricted to that
4802 * ill (or illgrp if `match_ill' is in an IPMP group).
4803 * Return the zoneid for the ipif which matches. ALL_ZONES if no match.
4804 */
4805 zoneid_t
4806 ipif_lookup_addr_zoneid(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst)
4807 {
4808 zoneid_t zoneid;
4809 ipif_t *ipif;
4810 ill_t *ill;
4811 boolean_t ptp = B_FALSE;
4812 ill_walk_context_t ctx;
4813
4814 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4815 /*
4816 * Repeat twice, first based on local addresses and
	 * then for point-to-point interfaces.
4818 */
4819 repeat:
4820 ill = ILL_START_WALK_V4(&ctx, ipst);
4821 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
4822 if (match_ill != NULL && ill != match_ill &&
4823 !IS_IN_SAME_ILLGRP(ill, match_ill)) {
4824 continue;
4825 }
4826 mutex_enter(&ill->ill_lock);
4827 for (ipif = ill->ill_ipif; ipif != NULL;
4828 ipif = ipif->ipif_next) {
4829 /* Allow the ipif to be down */
			if (((!ptp && (ipif->ipif_lcl_addr == addr) &&
			    ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
			    (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) &&
			    (ipif->ipif_pp_dst_addr == addr))) &&
			    !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
4835 zoneid = ipif->ipif_zoneid;
4836 mutex_exit(&ill->ill_lock);
4837 rw_exit(&ipst->ips_ill_g_lock);
4838 /*
4839 * If ipif_zoneid was ALL_ZONES then we have
4840 * a trusted extensions shared IP address.
4841 * In that case GLOBAL_ZONEID works to send.
4842 */
4843 if (zoneid == ALL_ZONES)
4844 zoneid = GLOBAL_ZONEID;
4845 return (zoneid);
4846 }
4847 }
4848 mutex_exit(&ill->ill_lock);
4849 }
4850
4851 /* If we already did the ptp case, then we are done */
4852 if (ptp) {
4853 rw_exit(&ipst->ips_ill_g_lock);
4854 return (ALL_ZONES);
4855 }
4856 ptp = B_TRUE;
4857 goto repeat;
4858 }
4859
4860 /*
 * Look for an ipif that matches the specified remote address, i.e. the
4862 * ipif that would receive the specified packet.
4863 * First look for directly connected interfaces and then do a recursive
4864 * IRE lookup and pick the first ipif corresponding to the source address in the
4865 * ire.
4866 * Returns: held ipif
4867 *
4868 * This is only used for ICMP_ADDRESS_MASK_REQUESTs
4869 */
4870 ipif_t *
4871 ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid)
4872 {
4873 ipif_t *ipif;
4874
4875 ASSERT(!ill->ill_isv6);
4876
4877 /*
4878 * Someone could be changing this ipif currently or change it
4879 * after we return this. Thus a few packets could use the old
	 * values. However, structure updates/creates (ire, ilg, ilm etc.)
	 * will atomically be updated or cleaned up with the new value.
4882 * Thus we don't need a lock to check the flags or other attrs below.
4883 */
4884 mutex_enter(&ill->ill_lock);
4885 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
4886 if (IPIF_IS_CONDEMNED(ipif))
4887 continue;
4888 if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid &&
4889 ipif->ipif_zoneid != ALL_ZONES)
4890 continue;
4891 /* Allow the ipif to be down */
4892 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
4893 if ((ipif->ipif_pp_dst_addr == addr) ||
4894 (!(ipif->ipif_flags & IPIF_UNNUMBERED) &&
4895 ipif->ipif_lcl_addr == addr)) {
4896 ipif_refhold_locked(ipif);
4897 mutex_exit(&ill->ill_lock);
4898 return (ipif);
4899 }
4900 } else if (ipif->ipif_subnet == (addr & ipif->ipif_net_mask)) {
4901 ipif_refhold_locked(ipif);
4902 mutex_exit(&ill->ill_lock);
4903 return (ipif);
4904 }
4905 }
4906 mutex_exit(&ill->ill_lock);
4907 /*
4908 * For a remote destination it isn't possible to nail down a particular
4909 * ipif.
4910 */
4911
4912 /* Pick the first interface */
4913 ipif = ipif_get_next_ipif(NULL, ill);
4914 return (ipif);
4915 }
4916
4917 /*
 * This function does not prevent refcnt from increasing. But if
 * the caller has taken steps to that effect, then this function
 * can be used to determine whether the ill has become quiescent.
4921 */
4922 static boolean_t
4923 ill_is_quiescent(ill_t *ill)
4924 {
4925 ipif_t *ipif;
4926
4927 ASSERT(MUTEX_HELD(&ill->ill_lock));
4928
4929 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
4930 if (ipif->ipif_refcnt != 0)
4931 return (B_FALSE);
4932 }
4933 if (!ILL_DOWN_OK(ill) || ill->ill_refcnt != 0) {
4934 return (B_FALSE);
4935 }
4936 return (B_TRUE);
4937 }
4938
4939 boolean_t
4940 ill_is_freeable(ill_t *ill)
4941 {
4942 ipif_t *ipif;
4943
4944 ASSERT(MUTEX_HELD(&ill->ill_lock));
4945
4946 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
4947 if (ipif->ipif_refcnt != 0) {
4948 return (B_FALSE);
4949 }
4950 }
4951 if (!ILL_FREE_OK(ill) || ill->ill_refcnt != 0) {
4952 return (B_FALSE);
4953 }
4954 return (B_TRUE);
4955 }
4956
4957 /*
 * This function does not prevent refcnt from increasing. But if
 * the caller has taken steps to that effect, then this function
 * can be used to determine whether the ipif has become quiescent.
4961 */
4962 static boolean_t
4963 ipif_is_quiescent(ipif_t *ipif)
4964 {
4965 ill_t *ill;
4966
4967 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
4968
4969 if (ipif->ipif_refcnt != 0)
4970 return (B_FALSE);
4971
4972 ill = ipif->ipif_ill;
4973 if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 ||
4974 ill->ill_logical_down) {
4975 return (B_TRUE);
4976 }
4977
4978 /* This is the last ipif going down or being deleted on this ill */
4979 if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0) {
4980 return (B_FALSE);
4981 }
4982
4983 return (B_TRUE);
4984 }
4985
4986 /*
 * Return B_TRUE if the ipif can be destroyed: the ipif has to be
 * quiescent, with zero references from ire/ilm to it.
4989 */
4990 static boolean_t
4991 ipif_is_freeable(ipif_t *ipif)
4992 {
4993 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
4994 ASSERT(ipif->ipif_id != 0);
4995 return (ipif->ipif_refcnt == 0);
4996 }
4997
4998 /*
4999 * The ipif/ill/ire has been refreled. Do the tail processing.
5000 * Determine if the ipif or ill in question has become quiescent and if so
 * wake up close and/or restart any queued pending ioctl that is waiting
 * for the ipif_down (or ill_down) to complete.
5003 */
5004 void
5005 ipif_ill_refrele_tail(ill_t *ill)
5006 {
5007 mblk_t *mp;
5008 conn_t *connp;
5009 ipsq_t *ipsq;
5010 ipxop_t *ipx;
5011 ipif_t *ipif;
5012 dl_notify_ind_t *dlindp;
5013
5014 ASSERT(MUTEX_HELD(&ill->ill_lock));
5015
5016 if ((ill->ill_state_flags & ILL_CONDEMNED) && ill_is_freeable(ill)) {
5017 /* ip_modclose() may be waiting */
5018 cv_broadcast(&ill->ill_cv);
5019 }
5020
5021 ipsq = ill->ill_phyint->phyint_ipsq;
5022 mutex_enter(&ipsq->ipsq_lock);
5023 ipx = ipsq->ipsq_xop;
5024 mutex_enter(&ipx->ipx_lock);
5025 if (ipx->ipx_waitfor == 0) /* no one's waiting; bail */
5026 goto unlock;
5027
5028 ASSERT(ipx->ipx_pending_mp != NULL && ipx->ipx_pending_ipif != NULL);
5029
5030 ipif = ipx->ipx_pending_ipif;
5031 if (ipif->ipif_ill != ill) /* wait is for another ill; bail */
5032 goto unlock;
5033
5034 switch (ipx->ipx_waitfor) {
5035 case IPIF_DOWN:
5036 if (!ipif_is_quiescent(ipif))
5037 goto unlock;
5038 break;
5039 case IPIF_FREE:
5040 if (!ipif_is_freeable(ipif))
5041 goto unlock;
5042 break;
5043 case ILL_DOWN:
5044 if (!ill_is_quiescent(ill))
5045 goto unlock;
5046 break;
5047 case ILL_FREE:
5048 /*
5049 * ILL_FREE is only for loopback; normal ill teardown waits
5050 * synchronously in ip_modclose() without using ipx_waitfor,
5051 * handled by the cv_broadcast() at the top of this function.
5052 */
5053 if (!ill_is_freeable(ill))
5054 goto unlock;
5055 break;
5056 default:
5057 cmn_err(CE_PANIC, "ipsq: %p unknown ipx_waitfor %d\n",
5058 (void *)ipsq, ipx->ipx_waitfor);
5059 }
5060
5061 ill_refhold_locked(ill); /* for qwriter_ip() call below */
5062 mutex_exit(&ipx->ipx_lock);
5063 mp = ipsq_pending_mp_get(ipsq, &connp);
5064 mutex_exit(&ipsq->ipsq_lock);
5065 mutex_exit(&ill->ill_lock);
5066
5067 ASSERT(mp != NULL);
5068 /*
5069 * NOTE: all of the qwriter_ip() calls below use CUR_OP since
5070 * we can only get here when the current operation decides it
	 * needs to quiesce via ipsq_pending_mp_add().
5072 */
5073 switch (mp->b_datap->db_type) {
5074 case M_PCPROTO:
5075 case M_PROTO:
5076 /*
5077 * For now, only DL_NOTIFY_IND messages can use this facility.
5078 */
5079 dlindp = (dl_notify_ind_t *)mp->b_rptr;
5080 ASSERT(dlindp->dl_primitive == DL_NOTIFY_IND);
5081
5082 switch (dlindp->dl_notification) {
5083 case DL_NOTE_PHYS_ADDR:
5084 qwriter_ip(ill, ill->ill_rq, mp,
5085 ill_set_phys_addr_tail, CUR_OP, B_TRUE);
5086 return;
5087 case DL_NOTE_REPLUMB:
5088 qwriter_ip(ill, ill->ill_rq, mp,
5089 ill_replumb_tail, CUR_OP, B_TRUE);
5090 return;
5091 default:
5092 ASSERT(0);
5093 ill_refrele(ill);
5094 }
5095 break;
5096
5097 case M_ERROR:
5098 case M_HANGUP:
5099 qwriter_ip(ill, ill->ill_rq, mp, ipif_all_down_tail, CUR_OP,
5100 B_TRUE);
5101 return;
5102
5103 case M_IOCTL:
5104 case M_IOCDATA:
5105 qwriter_ip(ill, (connp != NULL ? CONNP_TO_WQ(connp) :
5106 ill->ill_wq), mp, ip_reprocess_ioctl, CUR_OP, B_TRUE);
5107 return;
5108
5109 default:
5110 cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p "
5111 "db_type %d\n", (void *)mp, mp->b_datap->db_type);
5112 }
5113 return;
5114 unlock:
5115 mutex_exit(&ipsq->ipsq_lock);
5116 mutex_exit(&ipx->ipx_lock);
5117 mutex_exit(&ill->ill_lock);
5118 }
5119
5120 #ifdef DEBUG
/* Wrap to the start of the trace buffer if needed and record a trace */
5122 static void
5123 th_trace_rrecord(th_trace_t *th_trace)
5124 {
5125 tr_buf_t *tr_buf;
5126 uint_t lastref;
5127
5128 lastref = th_trace->th_trace_lastref;
5129 lastref++;
5130 if (lastref == TR_BUF_MAX)
5131 lastref = 0;
5132 th_trace->th_trace_lastref = lastref;
5133 tr_buf = &th_trace->th_trbuf[lastref];
5134 tr_buf->tr_time = ddi_get_lbolt();
5135 tr_buf->tr_depth = getpcstack(tr_buf->tr_stack, TR_STACK_DEPTH);
5136 }
5137
5138 static void
5139 th_trace_free(void *value)
5140 {
5141 th_trace_t *th_trace = value;
5142
5143 ASSERT(th_trace->th_refcnt == 0);
5144 kmem_free(th_trace, sizeof (*th_trace));
5145 }
5146
5147 /*
5148 * Find or create the per-thread hash table used to track object references.
5149 * The ipst argument is NULL if we shouldn't allocate.
5150 *
5151 * Accesses per-thread data, so there's no need to lock here.
5152 */
5153 static mod_hash_t *
5154 th_trace_gethash(ip_stack_t *ipst)
5155 {
5156 th_hash_t *thh;
5157
5158 if ((thh = tsd_get(ip_thread_data)) == NULL && ipst != NULL) {
5159 mod_hash_t *mh;
5160 char name[256];
5161 size_t objsize, rshift;
5162 int retv;
5163
5164 if ((thh = kmem_alloc(sizeof (*thh), KM_NOSLEEP)) == NULL)
5165 return (NULL);
5166 (void) snprintf(name, sizeof (name), "th_trace_%p",
5167 (void *)curthread);
5168
5169 /*
5170 * We use mod_hash_create_extended here rather than the more
5171 * obvious mod_hash_create_ptrhash because the latter has a
5172 * hard-coded KM_SLEEP, and we'd prefer to fail rather than
5173 * block.
5174 */
5175 objsize = MAX(MAX(sizeof (ill_t), sizeof (ipif_t)),
5176 MAX(sizeof (ire_t), sizeof (ncec_t)));
5177 rshift = highbit(objsize);
5178 mh = mod_hash_create_extended(name, 64, mod_hash_null_keydtor,
5179 th_trace_free, mod_hash_byptr, (void *)rshift,
5180 mod_hash_ptrkey_cmp, KM_NOSLEEP);
5181 if (mh == NULL) {
5182 kmem_free(thh, sizeof (*thh));
5183 return (NULL);
5184 }
5185 thh->thh_hash = mh;
5186 thh->thh_ipst = ipst;
5187 /*
5188 * We trace ills, ipifs, ires, and nces. All of these are
5189 * per-IP-stack, so the lock on the thread list is as well.
5190 */
5191 rw_enter(&ip_thread_rwlock, RW_WRITER);
5192 list_insert_tail(&ip_thread_list, thh);
5193 rw_exit(&ip_thread_rwlock);
5194 retv = tsd_set(ip_thread_data, thh);
5195 ASSERT(retv == 0);
5196 }
5197 return (thh != NULL ? thh->thh_hash : NULL);
5198 }
5199
5200 boolean_t
5201 th_trace_ref(const void *obj, ip_stack_t *ipst)
5202 {
5203 th_trace_t *th_trace;
5204 mod_hash_t *mh;
5205 mod_hash_val_t val;
5206
5207 if ((mh = th_trace_gethash(ipst)) == NULL)
5208 return (B_FALSE);
5209
5210 /*
5211 * Attempt to locate the trace buffer for this obj and thread.
5212 * If it does not exist, then allocate a new trace buffer and
5213 * insert into the hash.
5214 */
5215 if (mod_hash_find(mh, (mod_hash_key_t)obj, &val) == MH_ERR_NOTFOUND) {
5216 th_trace = kmem_zalloc(sizeof (th_trace_t), KM_NOSLEEP);
5217 if (th_trace == NULL)
5218 return (B_FALSE);
5219
5220 th_trace->th_id = curthread;
5221 if (mod_hash_insert(mh, (mod_hash_key_t)obj,
5222 (mod_hash_val_t)th_trace) != 0) {
5223 kmem_free(th_trace, sizeof (th_trace_t));
5224 return (B_FALSE);
5225 }
5226 } else {
5227 th_trace = (th_trace_t *)val;
5228 }
5229
5230 ASSERT(th_trace->th_refcnt >= 0 &&
5231 th_trace->th_refcnt < TR_BUF_MAX - 1);
5232
5233 th_trace->th_refcnt++;
5234 th_trace_rrecord(th_trace);
5235 return (B_TRUE);
5236 }
5237
5238 /*
5239 * For the purpose of tracing a reference release, we assume that global
 * tracing is always on and that the same thread that initiated the
 * reference hold is the one releasing it.
5242 */
5243 void
5244 th_trace_unref(const void *obj)
5245 {
5246 int retv;
5247 mod_hash_t *mh;
5248 th_trace_t *th_trace;
5249 mod_hash_val_t val;
5250
5251 mh = th_trace_gethash(NULL);
5252 retv = mod_hash_find(mh, (mod_hash_key_t)obj, &val);
5253 ASSERT(retv == 0);
5254 th_trace = (th_trace_t *)val;
5255
5256 ASSERT(th_trace->th_refcnt > 0);
5257 th_trace->th_refcnt--;
5258 th_trace_rrecord(th_trace);
5259 }
5260
5261 /*
5262 * If tracing has been disabled, then we assume that the reference counts are
5263 * now useless, and we clear them out before destroying the entries.
5264 */
5265 void
5266 th_trace_cleanup(const void *obj, boolean_t trace_disable)
5267 {
5268 th_hash_t *thh;
5269 mod_hash_t *mh;
5270 mod_hash_val_t val;
5271 th_trace_t *th_trace;
5272 int retv;
5273
5274 rw_enter(&ip_thread_rwlock, RW_READER);
5275 for (thh = list_head(&ip_thread_list); thh != NULL;
5276 thh = list_next(&ip_thread_list, thh)) {
5277 if (mod_hash_find(mh = thh->thh_hash, (mod_hash_key_t)obj,
5278 &val) == 0) {
5279 th_trace = (th_trace_t *)val;
5280 if (trace_disable)
5281 th_trace->th_refcnt = 0;
5282 retv = mod_hash_destroy(mh, (mod_hash_key_t)obj);
5283 ASSERT(retv == 0);
5284 }
5285 }
5286 rw_exit(&ip_thread_rwlock);
5287 }
5288
5289 void
5290 ipif_trace_ref(ipif_t *ipif)
5291 {
5292 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
5293
5294 if (ipif->ipif_trace_disable)
5295 return;
5296
5297 if (!th_trace_ref(ipif, ipif->ipif_ill->ill_ipst)) {
5298 ipif->ipif_trace_disable = B_TRUE;
5299 ipif_trace_cleanup(ipif);
5300 }
5301 }
5302
5303 void
5304 ipif_untrace_ref(ipif_t *ipif)
5305 {
5306 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
5307
5308 if (!ipif->ipif_trace_disable)
5309 th_trace_unref(ipif);
5310 }
5311
5312 void
5313 ill_trace_ref(ill_t *ill)
5314 {
5315 ASSERT(MUTEX_HELD(&ill->ill_lock));
5316
5317 if (ill->ill_trace_disable)
5318 return;
5319
5320 if (!th_trace_ref(ill, ill->ill_ipst)) {
5321 ill->ill_trace_disable = B_TRUE;
5322 ill_trace_cleanup(ill);
5323 }
5324 }
5325
5326 void
5327 ill_untrace_ref(ill_t *ill)
5328 {
5329 ASSERT(MUTEX_HELD(&ill->ill_lock));
5330
5331 if (!ill->ill_trace_disable)
5332 th_trace_unref(ill);
5333 }
5334
5335 /*
5336 * Called when ipif is unplumbed or when memory alloc fails. Note that on
5337 * failure, ipif_trace_disable is set.
5338 */
5339 static void
5340 ipif_trace_cleanup(const ipif_t *ipif)
5341 {
5342 th_trace_cleanup(ipif, ipif->ipif_trace_disable);
5343 }
5344
5345 /*
5346 * Called when ill is unplumbed or when memory alloc fails. Note that on
5347 * failure, ill_trace_disable is set.
5348 */
5349 static void
5350 ill_trace_cleanup(const ill_t *ill)
5351 {
5352 th_trace_cleanup(ill, ill->ill_trace_disable);
5353 }
5354 #endif /* DEBUG */
5355
5356 void
5357 ipif_refhold_locked(ipif_t *ipif)
5358 {
5359 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
5360 ipif->ipif_refcnt++;
5361 IPIF_TRACE_REF(ipif);
5362 }
5363
5364 void
5365 ipif_refhold(ipif_t *ipif)
5366 {
5367 ill_t *ill;
5368
5369 ill = ipif->ipif_ill;
5370 mutex_enter(&ill->ill_lock);
5371 ipif->ipif_refcnt++;
5372 IPIF_TRACE_REF(ipif);
5373 mutex_exit(&ill->ill_lock);
5374 }
5375
5376 /*
5377 * Must not be called while holding any locks. Otherwise if this is
5378 * the last reference to be released there is a chance of recursive mutex
5379 * panic due to ipif_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
5380 * to restart an ioctl.
5381 */
5382 void
5383 ipif_refrele(ipif_t *ipif)
5384 {
5385 ill_t *ill;
5386
5387 ill = ipif->ipif_ill;
5388
5389 mutex_enter(&ill->ill_lock);
5390 ASSERT(ipif->ipif_refcnt != 0);
5391 ipif->ipif_refcnt--;
5392 IPIF_UNTRACE_REF(ipif);
5393 if (ipif->ipif_refcnt != 0) {
5394 mutex_exit(&ill->ill_lock);
5395 return;
5396 }
5397
5398 /* Drops the ill_lock */
5399 ipif_ill_refrele_tail(ill);
5400 }
5401
5402 ipif_t *
5403 ipif_get_next_ipif(ipif_t *curr, ill_t *ill)
5404 {
5405 ipif_t *ipif;
5406
5407 mutex_enter(&ill->ill_lock);
5408 for (ipif = (curr == NULL ? ill->ill_ipif : curr->ipif_next);
5409 ipif != NULL; ipif = ipif->ipif_next) {
5410 if (IPIF_IS_CONDEMNED(ipif))
5411 continue;
5412 ipif_refhold_locked(ipif);
5413 mutex_exit(&ill->ill_lock);
5414 return (ipif);
5415 }
5416 mutex_exit(&ill->ill_lock);
5417 return (NULL);
5418 }
5419
5420 /*
 * TODO: make this table extensible at run time.
5422 * Return a pointer to the mac type info for 'mac_type'
5423 */
5424 static ip_m_t *
5425 ip_m_lookup(t_uscalar_t mac_type)
5426 {
5427 ip_m_t *ipm;
5428
5429 for (ipm = ip_m_tbl; ipm < A_END(ip_m_tbl); ipm++)
5430 if (ipm->ip_m_mac_type == mac_type)
5431 return (ipm);
5432 return (NULL);
5433 }
5434
5435 /*
5436 * Make a link layer address from the multicast IP address *addr.
5437 * To form the link layer address, invoke the ip_m_v*mapping function
5438 * associated with the link-layer type.
5439 */
5440 void
5441 ip_mcast_mapping(ill_t *ill, uchar_t *addr, uchar_t *hwaddr)
5442 {
5443 ip_m_t *ipm;
5444
5445 if (ill->ill_net_type == IRE_IF_NORESOLVER)
5446 return;
5447
5448 ASSERT(addr != NULL);
5449
5450 ipm = ip_m_lookup(ill->ill_mactype);
5451 if (ipm == NULL ||
5452 (ill->ill_isv6 && ipm->ip_m_v6mapping == NULL) ||
5453 (!ill->ill_isv6 && ipm->ip_m_v4mapping == NULL)) {
5454 ip0dbg(("no mapping for ill %s mactype 0x%x\n",
5455 ill->ill_name, ill->ill_mactype));
5456 return;
5457 }
5458 if (ill->ill_isv6)
5459 (*ipm->ip_m_v6mapping)(ill, addr, hwaddr);
5460 else
5461 (*ipm->ip_m_v4mapping)(ill, addr, hwaddr);
5462 }
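
/*
 * Example (informational): on Ethernet the standard mappings applied by
 * the ip_m_v*mapping hooks are 01:00:5e plus the low 23 bits of the
 * IPv4 group address (RFC 1112), and 33:33 plus the low 32 bits of the
 * IPv6 group address (RFC 2464); e.g. group 224.1.2.3 maps to
 * 01:00:5e:01:02:03.
 */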
5463
5464 /*
5465 * Returns B_FALSE if the IPv4 netmask pointed by `mask' is non-contiguous.
5466 * Otherwise returns B_TRUE.
5467 *
 * The netmask can be verified to be contiguous with 31 shift-and-OR
 * operations: take the mask (in host byte order) and compute
 *	mask | mask << 1 | mask << 2 | ... | mask << 31
 * the result will equal 'mask' if and only if the mask is contiguous.
5472 */
5473 static boolean_t
5474 ip_contiguous_mask(uint32_t mask)
5475 {
5476 uint32_t m = mask;
5477 int i;
5478
5479 for (i = 1; i < 32; i++)
5480 m |= (mask << i);
5481
5482 return (m == mask);
5483 }
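
/*
 * Worked example (informational): folding the contiguous mask 0xffffff00
 * (255.255.255.0) leaves m == 0xffffff00, so it is accepted. Folding the
 * non-contiguous 0xff00ff00 (255.0.255.0) yields m == 0xffffff00, which
 * differs from the original mask, so it is rejected.
 */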
5484
5485 /*
5486 * ip_rt_add is called to add an IPv4 route to the forwarding table.
5487 * ill is passed in to associate it with the correct interface.
5488 * If ire_arg is set, then we return the held IRE in that location.
5489 */
5490 int
5491 ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
5492 ipaddr_t src_addr, int flags, ill_t *ill, ire_t **ire_arg,
5493 boolean_t ioctl_msg, struct rtsa_s *sp, ip_stack_t *ipst, zoneid_t zoneid)
5494 {
5495 ire_t *ire, *nire;
5496 ire_t *gw_ire = NULL;
5497 ipif_t *ipif = NULL;
5498 uint_t type;
5499 int match_flags = MATCH_IRE_TYPE;
5500 tsol_gc_t *gc = NULL;
5501 tsol_gcgrp_t *gcgrp = NULL;
5502 boolean_t gcgrp_xtraref = B_FALSE;
5503 boolean_t cgtp_broadcast;
5504 boolean_t unbound = B_FALSE;
5505
5506 ip1dbg(("ip_rt_add:"));
5507
5508 if (ire_arg != NULL)
5509 *ire_arg = NULL;
5510
5511 /* disallow non-contiguous netmasks */
5512 if (!ip_contiguous_mask(ntohl(mask)))
5513 return (ENOTSUP);
5514
5515 /*
	 * If RTF_HOST is set, then we set the netmask to all ones (regardless
	 * of whether one was supplied).
5518 */
5519 if (flags & RTF_HOST)
5520 mask = IP_HOST_MASK;
5521
5522 /*
5523 * Prevent routes with a zero gateway from being created (since
	 * interfaces can currently be plumbed and brought up with no assigned
5525 * address).
5526 */
5527 if (gw_addr == 0)
5528 return (ENETUNREACH);
5529 /*
5530 * Get the ipif, if any, corresponding to the gw_addr.
5531 * If -ifp was specified we restrict ourselves to the ill, otherwise
5532 * we match on the gateway and destination to handle unnumbered pt-pt
5533 * interfaces.
5534 */
5535 if (ill != NULL)
5536 ipif = ipif_lookup_addr(gw_addr, ill, ALL_ZONES, ipst);
5537 else
5538 ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst);
5539 if (ipif != NULL) {
5540 if (IS_VNI(ipif->ipif_ill)) {
5541 ipif_refrele(ipif);
5542 return (EINVAL);
5543 }
5544 }
5545
5546 /*
5547 * GateD will attempt to create routes with a loopback interface
5548 * address as the gateway and with RTF_GATEWAY set. We allow
5549 * these routes to be added, but create them as interface routes
5550 * since the gateway is an interface address.
5551 */
5552 if ((ipif != NULL) && (ipif->ipif_ire_type == IRE_LOOPBACK)) {
5553 flags &= ~RTF_GATEWAY;
5554 if (gw_addr == INADDR_LOOPBACK && dst_addr == INADDR_LOOPBACK &&
5555 mask == IP_HOST_MASK) {
5556 ire = ire_ftable_lookup_v4(dst_addr, 0, 0, IRE_LOOPBACK,
5557 NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst,
5558 NULL);
5559 if (ire != NULL) {
5560 ire_refrele(ire);
5561 ipif_refrele(ipif);
5562 return (EEXIST);
5563 }
5564 ip1dbg(("ip_rt_add: 0x%p creating IRE 0x%x"
5565 "for 0x%x\n", (void *)ipif,
5566 ipif->ipif_ire_type,
5567 ntohl(ipif->ipif_lcl_addr)));
5568 ire = ire_create(
5569 (uchar_t *)&dst_addr, /* dest address */
5570 (uchar_t *)&mask, /* mask */
5571 NULL, /* no gateway */
5572 ipif->ipif_ire_type, /* LOOPBACK */
5573 ipif->ipif_ill,
5574 zoneid,
5575 (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE : 0,
5576 NULL,
5577 ipst);
5578
5579 if (ire == NULL) {
5580 ipif_refrele(ipif);
5581 return (ENOMEM);
5582 }
5583 /* src address assigned by the caller? */
5584 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
5585 ire->ire_setsrc_addr = src_addr;
5586
5587 nire = ire_add(ire);
5588 if (nire == NULL) {
5589 /*
5590 * In the case of failure, ire_add() will have
5591 * already deleted the ire in question, so there
5592 * is no need to do that here.
5593 */
5594 ipif_refrele(ipif);
5595 return (ENOMEM);
5596 }
5597 /*
5598 * Check if it was a duplicate entry. This handles
5599 * the case of two racing route adds for the same route
5600 */
5601 if (nire != ire) {
5602 ASSERT(nire->ire_identical_ref > 1);
5603 ire_delete(nire);
5604 ire_refrele(nire);
5605 ipif_refrele(ipif);
5606 return (EEXIST);
5607 }
5608 ire = nire;
5609 goto save_ire;
5610 }
5611 }
5612
5613 /*
5614 * The routes for multicast with CGTP are quite special in that
5615 * the gateway is the local interface address, yet RTF_GATEWAY
5616 * is set. We turn off RTF_GATEWAY to provide compatibility with
5617 * this undocumented and unusual use of multicast routes.
5618 */
5619 if ((flags & RTF_MULTIRT) && ipif != NULL)
5620 flags &= ~RTF_GATEWAY;
5621
5622 /*
5623 * Traditionally, interface routes are ones where RTF_GATEWAY isn't set
5624 * and the gateway address provided is one of the system's interface
5625 * addresses. By using the routing socket interface and supplying an
5626 * RTA_IFP sockaddr with an interface index, an alternate method of
5627 * specifying an interface route is available: the interface index
5628 * directly identifies the outgoing interface, rather than the
5629 * address of an outgoing interface (which may not be able to
5630 * uniquely identify an interface). When coupled with the RTF_GATEWAY
5631 * flag, routes can be specified which not only specify the next-hop to
5632 * be used when routing to a certain prefix, but also which outgoing
5633 * interface should be used.
5634 *
5635 * Previously, interfaces would have unique addresses assigned to them
5636 * and so the address assigned to a particular interface could be used
5637 * to identify a particular interface. One exception to this was the
5638 * case of an unnumbered interface (where IPIF_UNNUMBERED was set).
5639 *
5640 * With the advent of IPv6 and its link-local addresses, this
5641 * restriction was relaxed and interfaces could share addresses between
5642 * themselves. In fact, typically all of the link-local interfaces on
5643 * an IPv6 node or router will have the same link-local address. In
5644 * order to differentiate between these interfaces, the use of an
5645 * interface index is necessary and this index can be carried inside a
5646 * RTA_IFP sockaddr (which is actually a sockaddr_dl). One restriction
5647 * of using the interface index, however, is that all of the ipif's that
5648 * are part of an ill have the same index and so the RTA_IFP sockaddr
5649 * cannot be used to differentiate between ipif's (or logical
5650 * interfaces) that belong to the same ill (physical interface).
5651 *
5652 * For example, in the following case involving IPv4 interfaces and
5653 * logical interfaces
5654 *
5655 * 192.0.2.32 255.255.255.224 192.0.2.33 U if0
5656 * 192.0.2.32 255.255.255.224 192.0.2.34 U if0
5657 * 192.0.2.32 255.255.255.224 192.0.2.35 U if0
5658 *
5659 * the ipif's corresponding to each of these interface routes can be
5660 * uniquely identified by the "gateway" (actually interface address).
5661 *
5662 * In the following case, involving multiple IPv6 default routes to a
5663 * particular link-local gateway, RTA_IFP is necessary to specify which
5664 * default route is of interest:
5665 *
5666 * default fe80::123:4567:89ab:cdef U if0
5667 * default fe80::123:4567:89ab:cdef U if1
5668 */
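/*
 * (As an illustration, with the route(1M) command the second default
 * route above could be selected by naming the interface explicitly,
 * e.g. "route add -inet6 default fe80::123:4567:89ab:cdef -ifp if1",
 * which passes an RTA_IFP sockaddr_dl carrying if1's interface index.)
 */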
5669
5670 /* RTF_GATEWAY not set */
5671 if (!(flags & RTF_GATEWAY)) {
5672 if (sp != NULL) {
5673 ip2dbg(("ip_rt_add: gateway security attributes "
5674 "cannot be set with interface route\n"));
5675 if (ipif != NULL)
5676 ipif_refrele(ipif);
5677 return (EINVAL);
5678 }
5679
5680 /*
5681 * Whether or not ill (RTA_IFP) is set, we require that
5682 * the gateway is one of our local addresses.
5683 */
5684 if (ipif == NULL)
5685 return (ENETUNREACH);
5686
5687 /*
5688 * We use MATCH_IRE_ILL here. If the caller specified an
5689 * interface (from the RTA_IFP sockaddr) we use it, otherwise
5690 * we use the ill derived from the gateway address.
5691 * We can always match the gateway address since we record it
5692 * in ire_gateway_addr.
5693 * We don't allow RTA_IFP to specify a different ill than the
5694 * one matching the ipif to make sure we can delete the route.
5695 */
5696 match_flags |= MATCH_IRE_GW | MATCH_IRE_ILL;
5697 if (ill == NULL) {
5698 ill = ipif->ipif_ill;
5699 } else if (ill != ipif->ipif_ill) {
5700 ipif_refrele(ipif);
5701 return (EINVAL);
5702 }
5703
5704 /*
5705 * We check for an existing entry at this point.
5706 *
5707 * Since a netmask isn't passed in via the ioctl interface
5708 * (SIOCADDRT), we don't check for a matching netmask in that
5709 * case.
5710 */
5711 if (!ioctl_msg)
5712 match_flags |= MATCH_IRE_MASK;
5713 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr,
5714 IRE_INTERFACE, ill, ALL_ZONES, NULL, match_flags, 0, ipst,
5715 NULL);
5716 if (ire != NULL) {
5717 ire_refrele(ire);
5718 ipif_refrele(ipif);
5719 return (EEXIST);
5720 }
5721
5722 /*
5723 * Some software (for example, GateD and Sun Cluster) attempts
5724 * to create (what amount to) IRE_PREFIX routes with the
5725 * loopback address as the gateway. This is primarily done to
5726 * set up prefixes with the RTF_REJECT flag set (for example,
5727 * when generating aggregate routes.)
5728 *
5729 * If the IRE type (as defined by ill->ill_net_type) would be
5730 * IRE_LOOPBACK, then we map the request into a
5731 * IRE_IF_NORESOLVER. We also OR in the RTF_BLACKHOLE flag as
5732 * these interface routes, by definition, can only be that.
5733 *
5734 * Needless to say, the real IRE_LOOPBACK is NOT created by this
5735 * routine, but rather using ire_create() directly.
5737 */
5738 type = ill->ill_net_type;
5739 if (type == IRE_LOOPBACK) {
5740 type = IRE_IF_NORESOLVER;
5741 flags |= RTF_BLACKHOLE;
5742 }
5743
5744 /*
5745 * Create a copy of the IRE_IF_NORESOLVER or
5746 * IRE_IF_RESOLVER with the modified address, netmask, and
5747 * gateway.
5748 */
5749 ire = ire_create(
5750 (uchar_t *)&dst_addr,
5751 (uint8_t *)&mask,
5752 (uint8_t *)&gw_addr,
5753 type,
5754 ill,
5755 zoneid,
5756 flags,
5757 NULL,
5758 ipst);
5759 if (ire == NULL) {
5760 ipif_refrele(ipif);
5761 return (ENOMEM);
5762 }
5763
5764 /* src address assigned by the caller? */
5765 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
5766 ire->ire_setsrc_addr = src_addr;
5767
5768 nire = ire_add(ire);
5769 if (nire == NULL) {
5770 /*
5771 * In the case of failure, ire_add() will have
5772 * already deleted the ire in question, so there
5773 * is no need to do that here.
5774 */
5775 ipif_refrele(ipif);
5776 return (ENOMEM);
5777 }
5778 /*
5779 * Check if it was a duplicate entry. This handles
5780 * the case of two racing route adds for the same route
5781 */
5782 if (nire != ire) {
5783 ire_delete(nire);
5784 ire_refrele(nire);
5785 ipif_refrele(ipif);
5786 return (EEXIST);
5787 }
5788 ire = nire;
5789 goto save_ire;
5790 }
5791
5792 /*
5793 * Get an interface IRE for the specified gateway.
5794 * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the
5795 * gateway, it is currently unreachable and we fail the request
5796 * accordingly. We reject any RTF_GATEWAY routes where the gateway
5797 * is an IRE_LOCAL or IRE_LOOPBACK.
5798 * If RTA_IFP was specified we look on that particular ill.
5799 */
5800 if (ill != NULL)
5801 match_flags |= MATCH_IRE_ILL;
5802
5803 /* Check whether the gateway is reachable. */
5804 again:
5805 type = IRE_INTERFACE | IRE_LOCAL | IRE_LOOPBACK;
5806 if (flags & RTF_INDIRECT)
5807 type |= IRE_OFFLINK;
5808
5809 gw_ire = ire_ftable_lookup_v4(gw_addr, 0, 0, type, ill,
5810 ALL_ZONES, NULL, match_flags, 0, ipst, NULL);
5811 if (gw_ire == NULL) {
5812 /*
5813 * With IPMP, we allow host routes to influence in.mpathd's
5814 * target selection. However, if the test addresses are on
5815 * their own network, the above lookup will fail since the
5816 * underlying IRE_INTERFACEs are marked hidden. So allow
5817 * hidden test IREs to be found and try again.
5818 */
5819 if (!(match_flags & MATCH_IRE_TESTHIDDEN)) {
5820 match_flags |= MATCH_IRE_TESTHIDDEN;
5821 goto again;
5822 }
5823 if (ipif != NULL)
5824 ipif_refrele(ipif);
5825 return (ENETUNREACH);
5826 }
5827 if (gw_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) {
5828 ire_refrele(gw_ire);
5829 if (ipif != NULL)
5830 ipif_refrele(ipif);
5831 return (ENETUNREACH);
5832 }
5833
5834 if (ill == NULL && !(flags & RTF_INDIRECT)) {
5835 unbound = B_TRUE;
5836 if (ipst->ips_ip_strict_src_multihoming > 0)
5837 ill = gw_ire->ire_ill;
5838 }
5839
5840 /*
5841 * We create one of three types of IREs as a result of this request
5842 * based on the netmask. A netmask of all ones (which is automatically
5843 * assumed when RTF_HOST is set) results in an IRE_HOST being created.
5844 * An all zeroes netmask implies a default route so an IRE_DEFAULT is
5845 * created. Otherwise, an IRE_PREFIX route is created for the
5846 * destination prefix.
5847 */
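/*
 * For example (illustrative values): 192.0.2.7 with RTF_HOST (mask
 * 255.255.255.255) yields IRE_HOST, 0.0.0.0/0 yields IRE_DEFAULT, and
 * 192.0.2.0/255.255.255.0 yields IRE_PREFIX.
 */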
5848 if (mask == IP_HOST_MASK)
5849 type = IRE_HOST;
5850 else if (mask == 0)
5851 type = IRE_DEFAULT;
5852 else
5853 type = IRE_PREFIX;
5854
5855 /* check for a duplicate entry */
5856 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill,
5857 ALL_ZONES, NULL, match_flags | MATCH_IRE_MASK | MATCH_IRE_GW,
5858 0, ipst, NULL);
5859 if (ire != NULL) {
5860 if (ipif != NULL)
5861 ipif_refrele(ipif);
5862 ire_refrele(gw_ire);
5863 ire_refrele(ire);
5864 return (EEXIST);
5865 }
5866
5867 /* Security attribute exists */
5868 if (sp != NULL) {
5869 tsol_gcgrp_addr_t ga;
5870
5871 /* find or create the gateway credentials group */
5872 ga.ga_af = AF_INET;
5873 IN6_IPADDR_TO_V4MAPPED(gw_addr, &ga.ga_addr);
5874
5875 /* we hold a reference to it upon success */
5876 gcgrp = gcgrp_lookup(&ga, B_TRUE);
5877 if (gcgrp == NULL) {
5878 if (ipif != NULL)
5879 ipif_refrele(ipif);
5880 ire_refrele(gw_ire);
5881 return (ENOMEM);
5882 }
5883
5884 /*
5885 * Create and add the security attribute to the group; a
5886 * reference to the group is made upon allocating a new
5887 * entry successfully. If it finds an already-existing
5888 * entry for the security attribute in the group, it simply
5889 * returns it and no new reference is made to the group.
5890 */
5891 gc = gc_create(sp, gcgrp, &gcgrp_xtraref);
5892 if (gc == NULL) {
5893 if (ipif != NULL)
5894 ipif_refrele(ipif);
5895 /* release reference held by gcgrp_lookup */
5896 GCGRP_REFRELE(gcgrp);
5897 ire_refrele(gw_ire);
5898 return (ENOMEM);
5899 }
5900 }
5901
5902 /* Create the IRE. */
5903 ire = ire_create(
5904 (uchar_t *)&dst_addr, /* dest address */
5905 (uchar_t *)&mask, /* mask */
5906 (uchar_t *)&gw_addr, /* gateway address */
5907 (ushort_t)type, /* IRE type */
5908 ill,
5909 zoneid,
5910 flags,
5911 gc, /* security attribute */
5912 ipst);
5913
5914 /*
5915 * The ire holds a reference to the 'gc' and the 'gc' holds a
5916 * reference to the 'gcgrp'. We can now release the extra reference
5917 * the 'gcgrp' acquired in the gcgrp_lookup, if it was not used.
5918 */
5919 if (gcgrp_xtraref)
5920 GCGRP_REFRELE(gcgrp);
5921 if (ire == NULL) {
5922 if (gc != NULL)
5923 GC_REFRELE(gc);
5924 if (ipif != NULL)
5925 ipif_refrele(ipif);
5926 ire_refrele(gw_ire);
5927 return (ENOMEM);
5928 }
5929
5930 /* Before we add, check if an extra CGTP broadcast is needed */
5931 cgtp_broadcast = ((flags & RTF_MULTIRT) &&
5932 ip_type_v4(ire->ire_addr, ipst) == IRE_BROADCAST);
5933
5934 /* src address assigned by the caller? */
5935 if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
5936 ire->ire_setsrc_addr = src_addr;
5937
5938 ire->ire_unbound = unbound;
5939
5940 /*
5941 * POLICY: should we allow an RTF_HOST with address INADDR_ANY?
5942 * SunOS socket code does, but do we really want to allow 0.0.0.0?
5943 */
5944
5945 /* Add the new IRE. */
5946 nire = ire_add(ire);
5947 if (nire == NULL) {
5948 /*
5949 * In the case of failure, ire_add() will have
5950 * already deleted the ire in question, so there
5951 * is no need to do that here.
5952 */
5953 if (ipif != NULL)
5954 ipif_refrele(ipif);
5955 ire_refrele(gw_ire);
5956 return (ENOMEM);
5957 }
5958 /*
5959 * Check if it was a duplicate entry. This handles
5960 * the case of two racing route adds for the same route
5961 */
5962 if (nire != ire) {
5963 ire_delete(nire);
5964 ire_refrele(nire);
5965 if (ipif != NULL)
5966 ipif_refrele(ipif);
5967 ire_refrele(gw_ire);
5968 return (EEXIST);
5969 }
5970 ire = nire;
5971
5972 if (flags & RTF_MULTIRT) {
5973 /*
5974 * Invoke the CGTP (multirouting) filtering module
5975 * to add the dst address in the filtering database.
5976 * Replicated inbound packets coming from that address
5977 * will be filtered to discard the duplicates.
5978 * It is not necessary to call the CGTP filter hook
5979 * when the dst address is a broadcast or multicast,
5980 * because an IP source address cannot be a broadcast
5981 * or a multicast.
5982 */
5983 if (cgtp_broadcast) {
5984 ip_cgtp_bcast_add(ire, ipst);
5985 goto save_ire;
5986 }
5987 if (ipst->ips_ip_cgtp_filter_ops != NULL &&
5988 !CLASSD(ire->ire_addr)) {
5989 int res;
5990 ipif_t *src_ipif;
5991
5992 /* Find the source address corresponding to gw_ire */
5993 src_ipif = ipif_lookup_addr(gw_ire->ire_gateway_addr,
5994 NULL, zoneid, ipst);
5995 if (src_ipif != NULL) {
5996 res = ipst->ips_ip_cgtp_filter_ops->
5997 cfo_add_dest_v4(
5998 ipst->ips_netstack->netstack_stackid,
5999 ire->ire_addr,
6000 ire->ire_gateway_addr,
6001 ire->ire_setsrc_addr,
6002 src_ipif->ipif_lcl_addr);
6003 ipif_refrele(src_ipif);
6004 } else {
6005 res = EADDRNOTAVAIL;
6006 }
6007 if (res != 0) {
6008 if (ipif != NULL)
6009 ipif_refrele(ipif);
6010 ire_refrele(gw_ire);
6011 ire_delete(ire);
6012 ire_refrele(ire); /* Held in ire_add */
6013 return (res);
6014 }
6015 }
6016 }
6017
6018 save_ire:
6019 if (gw_ire != NULL) {
6020 ire_refrele(gw_ire);
6021 gw_ire = NULL;
6022 }
6023 if (ill != NULL) {
6024 /*
6025 * Save enough information so that we can recreate the IRE if
6026 * the interface goes down and then up. The metrics associated
6027 * with the route will be saved as well when rts_setmetrics() is
6028 * called after the IRE has been created. In the case where
6029 * memory cannot be allocated, none of this information will be
6030 * saved.
6031 */
6032 ill_save_ire(ill, ire);
6033 }
6034 if (ioctl_msg)
6035 ip_rts_rtmsg(RTM_OLDADD, ire, 0, ipst);
6036 if (ire_arg != NULL) {
6037 /*
6038 * Store the ire that was successfully added into where ire_arg
6039 * points to so that callers don't have to look it up
6040 * themselves (but they are responsible for ire_refrele()ing
6041 * the ire when they are finished with it).
6042 */
6043 *ire_arg = ire;
6044 } else {
6045 ire_refrele(ire); /* Held in ire_add */
6046 }
6047 if (ipif != NULL)
6048 ipif_refrele(ipif);
6049 return (0);
6050 }
6051
6052 /*
6053 * ip_rt_delete is called to delete an IPv4 route.
6054 * ill is passed in to associate it with the correct interface.
6055 */
6056 /* ARGSUSED4 */
6057 int
6058 ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
6059 uint_t rtm_addrs, int flags, ill_t *ill, boolean_t ioctl_msg,
6060 ip_stack_t *ipst, zoneid_t zoneid)
6061 {
6062 ire_t *ire = NULL;
6063 ipif_t *ipif;
6064 uint_t type;
6065 uint_t match_flags = MATCH_IRE_TYPE;
6066 int err = 0;
6067
6068 ip1dbg(("ip_rt_delete:"));
6069 /*
6070 * If RTF_HOST is set, then we set the netmask to all ones.
6071 * Otherwise, we use the netmask if one was supplied.
6072 */
6073 if (flags & RTF_HOST) {
6074 mask = IP_HOST_MASK;
6075 match_flags |= MATCH_IRE_MASK;
6076 } else if (rtm_addrs & RTA_NETMASK) {
6077 match_flags |= MATCH_IRE_MASK;
6078 }
6079
6080 /*
6081 * Note that RTF_GATEWAY is never set on a delete, therefore
6082 * we check if the gateway address is one of our interfaces first,
6083 * and fall back on RTF_GATEWAY routes.
6084 *
6085 * This makes it possible to delete an original
6086 * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1.
6087 * However, we have RTF_KERNEL set on the ones created by ipif_up
6088 * and those cannot be deleted here.
6089 *
6090 * We use MATCH_IRE_ILL if we know the interface. If the caller
6091 * specified an interface (from the RTA_IFP sockaddr) we use it,
6092 * otherwise we use the ill derived from the gateway address.
6093 * We can always match the gateway address since we record it
6094 * in ire_gateway_addr.
6095 *
6096 * For more detail on specifying routes by gateway address and by
6097 * interface index, see the comments in ip_rt_add().
6098 */
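/*
 * For example (illustrative addresses): deleting 192.0.2.0 with
 * gateway 10.0.0.1 first tries to match an interface route if
 * 10.0.0.1 is a local address, and only falls back to an RTF_GATEWAY
 * route through 10.0.0.1 if no deletable interface route is found.
 */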
6099 ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst);
6100 if (ipif != NULL) {
6101 ill_t *ill_match;
6102
6103 if (ill != NULL)
6104 ill_match = ill;
6105 else
6106 ill_match = ipif->ipif_ill;
6107
6108 match_flags |= MATCH_IRE_ILL;
6109 if (ipif->ipif_ire_type == IRE_LOOPBACK) {
6110 ire = ire_ftable_lookup_v4(dst_addr, mask, 0,
6111 IRE_LOOPBACK, ill_match, ALL_ZONES, NULL,
6112 match_flags, 0, ipst, NULL);
6113 }
6114 if (ire == NULL) {
6115 match_flags |= MATCH_IRE_GW;
6116 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr,
6117 IRE_INTERFACE, ill_match, ALL_ZONES, NULL,
6118 match_flags, 0, ipst, NULL);
6119 }
6120 /* Avoid deleting routes created by kernel from an ipif */
6121 if (ire != NULL && (ire->ire_flags & RTF_KERNEL)) {
6122 ire_refrele(ire);
6123 ire = NULL;
6124 }
6125
6126 /* Restore in case we didn't find a match */
6127 match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_ILL);
6128 }
6129
6130 if (ire == NULL) {
6131 /*
6132 * At this point, the gateway address is not one of our own
6133 * addresses or a matching interface route was not found. We
6134 * set the IRE type to lookup based on whether
6135 * this is a host route, a default route or just a prefix.
6136 *
6137 * If an ill was passed in, then the lookup is based on an
6138 * interface index so MATCH_IRE_ILL is added to match_flags.
6139 */
6140 match_flags |= MATCH_IRE_GW;
6141 if (ill != NULL)
6142 match_flags |= MATCH_IRE_ILL;
6143 if (mask == IP_HOST_MASK)
6144 type = IRE_HOST;
6145 else if (mask == 0)
6146 type = IRE_DEFAULT;
6147 else
6148 type = IRE_PREFIX;
6149 ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill,
6150 ALL_ZONES, NULL, match_flags, 0, ipst, NULL);
6151 }
6152
6153 if (ipif != NULL) {
6154 ipif_refrele(ipif);
6155 ipif = NULL;
6156 }
6157
6158 if (ire == NULL)
6159 return (ESRCH);
6160
6161 if (ire->ire_flags & RTF_MULTIRT) {
6162 /*
6163 * Invoke the CGTP (multirouting) filtering module
6164 * to remove the dst address from the filtering database.
6165 * Packets coming from that address will no longer be
6166 * filtered to remove duplicates.
6167 */
6168 if (ipst->ips_ip_cgtp_filter_ops != NULL) {
6169 err = ipst->ips_ip_cgtp_filter_ops->cfo_del_dest_v4(
6170 ipst->ips_netstack->netstack_stackid,
6171 ire->ire_addr, ire->ire_gateway_addr);
6172 }
6173 ip_cgtp_bcast_delete(ire, ipst);
6174 }
6175
6176 ill = ire->ire_ill;
6177 if (ill != NULL)
6178 ill_remove_saved_ire(ill, ire);
6179 if (ioctl_msg)
6180 ip_rts_rtmsg(RTM_OLDDEL, ire, 0, ipst);
6181 ire_delete(ire);
6182 ire_refrele(ire);
6183 return (err);
6184 }
6185
6186 /*
6187 * ip_siocaddrt is called to complete processing of an SIOCADDRT IOCTL.
6188 */
6189 /* ARGSUSED */
6190 int
6191 ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
6192 ip_ioctl_cmd_t *ipip, void *dummy_if_req)
6193 {
6194 ipaddr_t dst_addr;
6195 ipaddr_t gw_addr;
6196 ipaddr_t mask;
6197 int error = 0;
6198 mblk_t *mp1;
6199 struct rtentry *rt;
6200 ipif_t *ipif = NULL;
6201 ip_stack_t *ipst;
6202
6203 ASSERT(q->q_next == NULL);
6204 ipst = CONNQ_TO_IPST(q);
6205
6206 ip1dbg(("ip_siocaddrt:"));
6207 /* Existence of mp1 verified in ip_wput_nondata */
6208 mp1 = mp->b_cont->b_cont;
6209 rt = (struct rtentry *)mp1->b_rptr;
6210
6211 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr;
6212 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr;
6213
6214 /*
6215 * If the RTF_HOST flag is on, this is a request to assign a gateway
6216 * to a particular host address. In this case, we set the netmask to
6217 * all ones for the particular destination address. Otherwise,
6218 * determine the netmask to be used based on dst_addr and the interfaces
6219 * in use.
6220 */
6221 if (rt->rt_flags & RTF_HOST) {
6222 mask = IP_HOST_MASK;
6223 } else {
6224 /*
6225 * Note that ip_subnet_mask returns a zero mask in the case of
6226 * default (an all-zeroes address).
6227 */
6228 mask = ip_subnet_mask(dst_addr, &ipif, ipst);
6229 }
6230
6231 error = ip_rt_add(dst_addr, mask, gw_addr, 0, rt->rt_flags, NULL, NULL,
6232 B_TRUE, NULL, ipst, ALL_ZONES);
6233 if (ipif != NULL)
6234 ipif_refrele(ipif);
6235 return (error);
6236 }
6237
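/*
 * For reference, a minimal user-level sketch (illustrative addresses,
 * error handling omitted) of the ioctl that reaches ip_siocaddrt();
 * SIOCDELRT takes the same structure and reaches ip_siocdelrt():
 *
 *	struct rtentry rt;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	bzero(&rt, sizeof (rt));
 *	((struct sockaddr_in *)&rt.rt_dst)->sin_family = AF_INET;
 *	((struct sockaddr_in *)&rt.rt_dst)->sin_addr.s_addr =
 *	    inet_addr("192.0.2.7");
 *	((struct sockaddr_in *)&rt.rt_gateway)->sin_family = AF_INET;
 *	((struct sockaddr_in *)&rt.rt_gateway)->sin_addr.s_addr =
 *	    inet_addr("10.0.0.1");
 *	rt.rt_flags = RTF_UP | RTF_GATEWAY | RTF_HOST;
 *	(void) ioctl(s, SIOCADDRT, &rt);
 */
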
6238 /*
6239 * ip_siocdelrt is called to complete processing of an SIOCDELRT IOCTL.
6240 */
6241 /* ARGSUSED */
6242 int
6243 ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
6244 ip_ioctl_cmd_t *ipip, void *dummy_if_req)
6245 {
6246 ipaddr_t dst_addr;
6247 ipaddr_t gw_addr;
6248 ipaddr_t mask;
6249 int error;
6250 mblk_t *mp1;
6251 struct rtentry *rt;
6252 ipif_t *ipif = NULL;
6253 ip_stack_t *ipst;
6254
6255 ASSERT(q->q_next == NULL);
6256 ipst = CONNQ_TO_IPST(q);
6257
6258 ip1dbg(("ip_siocdelrt:"));
6259 /* Existence of mp1 verified in ip_wput_nondata */
6260 mp1 = mp->b_cont->b_cont;
6261 rt = (struct rtentry *)mp1->b_rptr;
6262
6263 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr;
6264 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr;
6265
6266 /*
6267 * If the RTF_HOST flag is on, this is a request to delete a gateway
6268 * to a particular host address. In this case, we set the netmask to
6269 * all ones for the particular destination address. Otherwise,
6270 * determine the netmask to be used based on dst_addr and the interfaces
6271 * in use.
6272 */
6273 if (rt->rt_flags & RTF_HOST) {
6274 mask = IP_HOST_MASK;
6275 } else {
6276 /*
6277 * Note that ip_subnet_mask returns a zero mask in the case of
6278 * default (an all-zeroes address).
6279 */
6280 mask = ip_subnet_mask(dst_addr, &ipif, ipst);
6281 }
6282
6283 error = ip_rt_delete(dst_addr, mask, gw_addr,
6284 RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, B_TRUE,
6285 ipst, ALL_ZONES);
6286 if (ipif != NULL)
6287 ipif_refrele(ipif);
6288 return (error);
6289 }
6290
6291 /*
6292 * Enqueue the mp onto the ipsq, chained by b_next.
6293 * b_prev stores the function to be executed later, and b_queue the queue
6294 * where this mp originated.
6295 */
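/*
 * `type' selects the queue: CUR_OP appends to the current operation's
 * message list (ipx_mphead/ipx_mptail), NEW_OP enqueues a new exclusive
 * operation (ipsq_xopq_mphead/ipsq_xopq_mptail), and SWITCH_OP stashes
 * the single pending xop switch request (ipsq_switch_mp). See ipsq_dq()
 * below for how each is drained.
 */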
6296 void
6297 ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type,
6298 ill_t *pending_ill)
6299 {
6300 conn_t *connp;
6301 ipxop_t *ipx = ipsq->ipsq_xop;
6302
6303 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock));
6304 ASSERT(MUTEX_HELD(&ipx->ipx_lock));
6305 ASSERT(func != NULL);
6306
6307 mp->b_queue = q;
6308 mp->b_prev = (void *)func;
6309 mp->b_next = NULL;
6310
6311 switch (type) {
6312 case CUR_OP:
6313 if (ipx->ipx_mptail != NULL) {
6314 ASSERT(ipx->ipx_mphead != NULL);
6315 ipx->ipx_mptail->b_next = mp;
6316 } else {
6317 ASSERT(ipx->ipx_mphead == NULL);
6318 ipx->ipx_mphead = mp;
6319 }
6320 ipx->ipx_mptail = mp;
6321 break;
6322
6323 case NEW_OP:
6324 if (ipsq->ipsq_xopq_mptail != NULL) {
6325 ASSERT(ipsq->ipsq_xopq_mphead != NULL);
6326 ipsq->ipsq_xopq_mptail->b_next = mp;
6327 } else {
6328 ASSERT(ipsq->ipsq_xopq_mphead == NULL);
6329 ipsq->ipsq_xopq_mphead = mp;
6330 }
6331 ipsq->ipsq_xopq_mptail = mp;
6332 ipx->ipx_ipsq_queued = B_TRUE;
6333 break;
6334
6335 case SWITCH_OP:
6336 ASSERT(ipsq->ipsq_swxop != NULL);
6337 /* only one switch operation is currently allowed */
6338 ASSERT(ipsq->ipsq_switch_mp == NULL);
6339 ipsq->ipsq_switch_mp = mp;
6340 ipx->ipx_ipsq_queued = B_TRUE;
6341 break;
6342 default:
6343 cmn_err(CE_PANIC, "ipsq_enq: unknown type %d\n", type);
6344 }
6345
6346 if (CONN_Q(q) && pending_ill != NULL) {
6347 connp = Q_TO_CONN(q);
6348 ASSERT(MUTEX_HELD(&connp->conn_lock));
6349 connp->conn_oper_pending_ill = pending_ill;
6350 }
6351 }
6352
6353 /*
6354 * Dequeue the next message that requested exclusive access to this IPSQ's
6355 * xop. Specifically:
6356 *
6357 * 1. If we're still processing the current operation on `ipsq', then
6358 * dequeue the next message for the operation (from ipx_mphead), or
6359 * return NULL if there are no queued messages for the operation.
6360 * These messages are queued via CUR_OP to qwriter_ip() and friends.
6361 *
6362 * 2. If the current operation on `ipsq' has completed (ipx_current_ipif is
6363 * not set), see if the ipsq has requested an xop switch. If so, switch
6364 * `ipsq' to a different xop. Xop switches only happen when joining or
6365 * leaving IPMP groups and require a careful dance -- see the comments
6366 * in-line below for details. If we're leaving a group xop or if we're
6367 * joining a group xop and become writer on it, then we proceed to (3).
6368 * Otherwise, we return NULL and exit the xop.
6369 *
6370 * 3. For each IPSQ in the xop, return any switch operation stored on
6371 * ipsq_switch_mp (set via SWITCH_OP); these must be processed before
6372 * any other messages queued on the IPSQ. Otherwise, dequeue the next
6373 * exclusive operation (queued via NEW_OP) stored on ipsq_xopq_mphead.
6374 * Note that if the phyint tied to `ipsq' is not using IPMP there will
6375 * only be one IPSQ in the xop. Otherwise, there will be one IPSQ for
6376 * each phyint in the group, including the IPMP meta-interface phyint.
6377 */
6378 static mblk_t *
6379 ipsq_dq(ipsq_t *ipsq)
6380 {
6381 ill_t *illv4, *illv6;
6382 mblk_t *mp;
6383 ipsq_t *xopipsq;
6384 ipsq_t *leftipsq = NULL;
6385 ipxop_t *ipx;
6386 phyint_t *phyi = ipsq->ipsq_phyint;
6387 ip_stack_t *ipst = ipsq->ipsq_ipst;
6388 boolean_t emptied = B_FALSE;
6389
6390 /*
6391 * Grab all the locks we need in the defined order (ill_g_lock ->
6392 * ipsq_lock -> ipx_lock); ill_g_lock is needed to use ipsq_next.
6393 */
6394 rw_enter(&ipst->ips_ill_g_lock,
6395 ipsq->ipsq_swxop != NULL ? RW_WRITER : RW_READER);
6396 mutex_enter(&ipsq->ipsq_lock);
6397 ipx = ipsq->ipsq_xop;
6398 mutex_enter(&ipx->ipx_lock);
6399
6400 /*
6401 * Dequeue the next message associated with the current exclusive
6402 * operation, if any.
6403 */
6404 if ((mp = ipx->ipx_mphead) != NULL) {
6405 ipx->ipx_mphead = mp->b_next;
6406 if (ipx->ipx_mphead == NULL)
6407 ipx->ipx_mptail = NULL;
6408 mp->b_next = (void *)ipsq;
6409 goto out;
6410 }
6411
6412 if (ipx->ipx_current_ipif != NULL)
6413 goto empty;
6414
6415 if (ipsq->ipsq_swxop != NULL) {
6416 /*
6417 * The exclusive operation that is now being completed has
6418 * requested a switch to a different xop. This happens
6419 * when an interface joins or leaves an IPMP group. Joins
6420 * happen through SIOCSLIFGROUPNAME (ip_sioctl_groupname()).
6421 * Leaves happen via SIOCSLIFGROUPNAME, interface unplumb
6422 * (phyint_free()), or interface plumb for an ill type
6423 * not in the IPMP group (ip_rput_dlpi_writer()).
6424 *
6425 * Xop switches are not allowed on the IPMP meta-interface.
6426 */
6427 ASSERT(phyi == NULL || !(phyi->phyint_flags & PHYI_IPMP));
6428 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
6429 DTRACE_PROBE1(ipsq__switch, (ipsq_t *), ipsq);
6430
6431 if (ipsq->ipsq_swxop == &ipsq->ipsq_ownxop) {
6432 /*
6433 * We're switching back to our own xop, so we have two
6434 * xop's to drain/exit: our own, and the group xop
6435 * that we are leaving.
6436 *
6437 * First, pull ourselves out of the group ipsq list.
6438 * This is safe since we're writer on ill_g_lock.
6439 */
6440 ASSERT(ipsq->ipsq_xop != &ipsq->ipsq_ownxop);
6441
6442 xopipsq = ipx->ipx_ipsq;
6443 while (xopipsq->ipsq_next != ipsq)
6444 xopipsq = xopipsq->ipsq_next;
6445
6446 xopipsq->ipsq_next = ipsq->ipsq_next;
6447 ipsq->ipsq_next = ipsq;
6448 ipsq->ipsq_xop = ipsq->ipsq_swxop;
6449 ipsq->ipsq_swxop = NULL;
6450
6451 /*
6452 * Second, prepare to exit the group xop. The actual
6453 * ipsq_exit() is done at the end of this function
6454 * since we cannot hold any locks across ipsq_exit().
6455 * Note that although we drop the group's ipx_lock, no
6456 * threads can proceed since we're still ipx_writer.
6457 */
6458 leftipsq = xopipsq;
6459 mutex_exit(&ipx->ipx_lock);
6460
6461 /*
6462 * Third, set ipx to point to our own xop (which was
6463 * inactive and therefore can be entered).
6464 */
6465 ipx = ipsq->ipsq_xop;
6466 mutex_enter(&ipx->ipx_lock);
6467 ASSERT(ipx->ipx_writer == NULL);
6468 ASSERT(ipx->ipx_current_ipif == NULL);
6469 } else {
6470 /*
6471 * We're switching from our own xop to a group xop.
6472 * The requestor of the switch must ensure that the
6473 * group xop cannot go away (e.g. by ensuring the
6474 * phyint associated with the xop cannot go away).
6475 *
6476 * If we can become writer on our new xop, then we'll
6477 * do the drain. Otherwise, the current writer of our
6478 * new xop will do the drain when it exits.
6479 *
6480 * First, splice ourselves into the group IPSQ list.
6481 * This is safe since we're writer on ill_g_lock.
6482 */
6483 ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop);
6484
6485 xopipsq = ipsq->ipsq_swxop->ipx_ipsq;
6486 while (xopipsq->ipsq_next != ipsq->ipsq_swxop->ipx_ipsq)
6487 xopipsq = xopipsq->ipsq_next;
6488
6489 xopipsq->ipsq_next = ipsq;
6490 ipsq->ipsq_next = ipsq->ipsq_swxop->ipx_ipsq;
6491 ipsq->ipsq_xop = ipsq->ipsq_swxop;
6492 ipsq->ipsq_swxop = NULL;
6493
6494 /*
6495 * Second, exit our own xop, since it's now unused.
6496 * This is safe since we've got the only reference.
6497 */
6498 ASSERT(ipx->ipx_writer == curthread);
6499 ipx->ipx_writer = NULL;
6500 VERIFY(--ipx->ipx_reentry_cnt == 0);
6501 ipx->ipx_ipsq_queued = B_FALSE;
6502 mutex_exit(&ipx->ipx_lock);
6503
6504 /*
6505 * Third, set ipx to point to our new xop, and check
6506 * if we can become writer on it. If we cannot, then
6507 * the current writer will drain the IPSQ group when
6508 * it exits. Our ipsq_xop is guaranteed to be stable
6509 * because we're still holding ipsq_lock.
6510 */
6511 ipx = ipsq->ipsq_xop;
6512 mutex_enter(&ipx->ipx_lock);
6513 if (ipx->ipx_writer != NULL ||
6514 ipx->ipx_current_ipif != NULL) {
6515 goto out;
6516 }
6517 }
6518
6519 /*
6520 * Fourth, become writer on our new ipx before we continue
6521 * with the drain. Note that we never dropped ipsq_lock
6522 * above, so no other thread could've raced with us to
6523 * become writer first. Also, we're holding ipx_lock, so
6524 * no other thread can examine the ipx right now.
6525 */
6526 ASSERT(ipx->ipx_current_ipif == NULL);
6527 ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL);
6528 VERIFY(ipx->ipx_reentry_cnt++ == 0);
6529 ipx->ipx_writer = curthread;
6530 ipx->ipx_forced = B_FALSE;
6531 #ifdef DEBUG
6532 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
6533 #endif
6534 }
6535
6536 xopipsq = ipsq;
6537 do {
6538 /*
6539 * So that other operations operate on a consistent and
6540 * complete phyint, a switch message on an IPSQ must be
6541 * handled prior to any other operations on that IPSQ.
6542 */
6543 if ((mp = xopipsq->ipsq_switch_mp) != NULL) {
6544 xopipsq->ipsq_switch_mp = NULL;
6545 ASSERT(mp->b_next == NULL);
6546 mp->b_next = (void *)xopipsq;
6547 goto out;
6548 }
6549
6550 if ((mp = xopipsq->ipsq_xopq_mphead) != NULL) {
6551 xopipsq->ipsq_xopq_mphead = mp->b_next;
6552 if (xopipsq->ipsq_xopq_mphead == NULL)
6553 xopipsq->ipsq_xopq_mptail = NULL;
6554 mp->b_next = (void *)xopipsq;
6555 goto out;
6556 }
6557 } while ((xopipsq = xopipsq->ipsq_next) != ipsq);
6558 empty:
6559 /*
6560 * There are no messages. Further, we are holding ipx_lock, hence no
6561 * new messages can end up on any IPSQ in the xop.
6562 */
6563 ipx->ipx_writer = NULL;
6564 ipx->ipx_forced = B_FALSE;
6565 VERIFY(--ipx->ipx_reentry_cnt == 0);
6566 ipx->ipx_ipsq_queued = B_FALSE;
6567 emptied = B_TRUE;
6568 #ifdef DEBUG
6569 ipx->ipx_depth = 0;
6570 #endif
6571 out:
6572 mutex_exit(&ipx->ipx_lock);
6573 mutex_exit(&ipsq->ipsq_lock);
6574
6575 /*
6576 * If we completely emptied the xop, then wake up any threads waiting
6577 * to enter any of the IPSQ's associated with it.
6578 */
6579 if (emptied) {
6580 xopipsq = ipsq;
6581 do {
6582 if ((phyi = xopipsq->ipsq_phyint) == NULL)
6583 continue;
6584
6585 illv4 = phyi->phyint_illv4;
6586 illv6 = phyi->phyint_illv6;
6587
6588 GRAB_ILL_LOCKS(illv4, illv6);
6589 if (illv4 != NULL)
6590 cv_broadcast(&illv4->ill_cv);
6591 if (illv6 != NULL)
6592 cv_broadcast(&illv6->ill_cv);
6593 RELEASE_ILL_LOCKS(illv4, illv6);
6594 } while ((xopipsq = xopipsq->ipsq_next) != ipsq);
6595 }
6596 rw_exit(&ipst->ips_ill_g_lock);
6597
6598 /*
6599 * Now that all locks are dropped, exit the IPSQ we left.
6600 */
6601 if (leftipsq != NULL)
6602 ipsq_exit(leftipsq);
6603
6604 return (mp);
6605 }
6606
6607 /*
6608 * Return completion status of previously initiated DLPI operations on
6609 * ills in the purview of an ipsq.
6610 */
6611 static boolean_t
6612 ipsq_dlpi_done(ipsq_t *ipsq)
6613 {
6614 ipsq_t *ipsq_start;
6615 phyint_t *phyi;
6616 ill_t *ill;
6617
6618 ASSERT(RW_LOCK_HELD(&ipsq->ipsq_ipst->ips_ill_g_lock));
6619 ipsq_start = ipsq;
6620
6621 do {
6622 /*
6623 * The only current users of this function are ipsq_try_enter
6624 * and ipsq_enter, which have made sure that ipx_writer is
6625 * NULL before we reach here. ill_dlpi_pending is modified
6626 * only by an ipsq writer.
6627 */
6628 ASSERT(ipsq->ipsq_xop->ipx_writer == NULL);
6629 phyi = ipsq->ipsq_phyint;
6630 /*
6631 * phyi could be NULL if a phyint that is part of an
6632 * IPMP group is being unplumbed. A more detailed
6633 * comment is in ipmp_grp_update_kstats()
6634 */
6635 if (phyi != NULL) {
6636 ill = phyi->phyint_illv4;
6637 if (ill != NULL &&
6638 (ill->ill_dlpi_pending != DL_PRIM_INVAL ||
6639 ill->ill_arl_dlpi_pending))
6640 return (B_FALSE);
6641
6642 ill = phyi->phyint_illv6;
6643 if (ill != NULL &&
6644 ill->ill_dlpi_pending != DL_PRIM_INVAL)
6645 return (B_FALSE);
6646 }
6647
6648 } while ((ipsq = ipsq->ipsq_next) != ipsq_start);
6649
6650 return (B_TRUE);
6651 }
6652
6653 /*
6654 * Enter the ipsq corresponding to ill, by waiting synchronously till
6655 * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq
6656 * will have to drain completely before ipsq_enter returns success.
6657 * ipx_current_ipif will be set if some exclusive op is in progress,
6658 * and the ipsq_exit logic will start the next enqueued op after
6659 * completion of the current op. If 'force' is used, we don't wait
6660 * for the enqueued ops. This is needed when a conn_close wants to
6661 * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb
6662 * of an ill can also use this option, but we don't use it currently.
6663 */
6664 #define ENTER_SQ_WAIT_TICKS 100
6665 boolean_t
6666 ipsq_enter(ill_t *ill, boolean_t force, int type)
6667 {
6668 ipsq_t *ipsq;
6669 ipxop_t *ipx;
6670 boolean_t waited_enough = B_FALSE;
6671 ip_stack_t *ipst = ill->ill_ipst;
6672
6673 /*
6674 * Note that the relationship between ill and ipsq is fixed as long as
6675 * the ill is not ILL_CONDEMNED. Holding ipsq_lock ensures the
6676 * relationship between the IPSQ and xop cannot change. However,
6677 * since we cannot hold ipsq_lock across the cv_wait(), it may change
6678 * while we're waiting. We wait on ill_cv and rely on ipsq_exit()
6679 * waking up all ills in the xop when it becomes available.
6680 */
6681 for (;;) {
6682 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
6683 mutex_enter(&ill->ill_lock);
6684 if (ill->ill_state_flags & ILL_CONDEMNED) {
6685 mutex_exit(&ill->ill_lock);
6686 rw_exit(&ipst->ips_ill_g_lock);
6687 return (B_FALSE);
6688 }
6689
6690 ipsq = ill->ill_phyint->phyint_ipsq;
6691 mutex_enter(&ipsq->ipsq_lock);
6692 ipx = ipsq->ipsq_xop;
6693 mutex_enter(&ipx->ipx_lock);
6694
6695 if (ipx->ipx_writer == NULL && (type == CUR_OP ||
6696 (ipx->ipx_current_ipif == NULL && ipsq_dlpi_done(ipsq)) ||
6697 waited_enough))
6698 break;
6699
6700 rw_exit(&ipst->ips_ill_g_lock);
6701
6702 if (!force || ipx->ipx_writer != NULL) {
6703 mutex_exit(&ipx->ipx_lock);
6704 mutex_exit(&ipsq->ipsq_lock);
6705 cv_wait(&ill->ill_cv, &ill->ill_lock);
6706 } else {
6707 mutex_exit(&ipx->ipx_lock);
6708 mutex_exit(&ipsq->ipsq_lock);
6709 (void) cv_reltimedwait(&ill->ill_cv,
6710 &ill->ill_lock, ENTER_SQ_WAIT_TICKS, TR_CLOCK_TICK);
6711 waited_enough = B_TRUE;
6712 }
6713 mutex_exit(&ill->ill_lock);
6714 }
6715
6716 ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL);
6717 ASSERT(ipx->ipx_reentry_cnt == 0);
6718 ipx->ipx_writer = curthread;
6719 ipx->ipx_forced = (ipx->ipx_current_ipif != NULL);
6720 ipx->ipx_reentry_cnt++;
6721 #ifdef DEBUG
6722 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
6723 #endif
6724 mutex_exit(&ipx->ipx_lock);
6725 mutex_exit(&ipsq->ipsq_lock);
6726 mutex_exit(&ill->ill_lock);
6727 rw_exit(&ipst->ips_ill_g_lock);
6728
6729 return (B_TRUE);
6730 }
6731
6732 /*
6733 * ipif_set_values() has a constraint that it cannot drop the ips_ill_g_lock
6734 * across the call to the core interface ipsq_try_enter() and hence calls this
6735 * function directly. This is explained more fully in ipif_set_values().
6736 * In order to support the above constraint, ipsq_try_enter is implemented as
6737 * a wrapper that grabs the ips_ill_g_lock and then calls this function.
6738 */
6739 static ipsq_t *
6740 ipsq_try_enter_internal(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func,
6741 int type, boolean_t reentry_ok)
6742 {
6743 ipsq_t *ipsq;
6744 ipxop_t *ipx;
6745 ip_stack_t *ipst = ill->ill_ipst;
6746
6747 /*
6748 * lock ordering:
6749 * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock.
6750 *
6751 * ipx of an ipsq can't change when ipsq_lock is held.
6752 */
6753 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
6754 GRAB_CONN_LOCK(q);
6755 mutex_enter(&ill->ill_lock);
6756 ipsq = ill->ill_phyint->phyint_ipsq;
6757 mutex_enter(&ipsq->ipsq_lock);
6758 ipx = ipsq->ipsq_xop;
6759 mutex_enter(&ipx->ipx_lock);
6760
6761 /*
6762 * 1. Enter the ipsq if we are already writer and reentry is ok.
6763 * (Note: if the caller does not specify reentry_ok, then neither
6764 * 'func' nor any of its callees may ever attempt to enter the ipsq
6765 * again; otherwise it can lead to an infinite loop.)
6766 * 2. Enter the ipsq if there is no current writer and this attempted
6767 * entry is part of the current operation.
6768 * 3. Enter the ipsq if there is no current writer, this is a new
6769 * operation, the operation queue is empty, no operation is
6770 * currently in progress, and all previously initiated
6771 * DLPI operations have completed.
6772 */
6773 if ((ipx->ipx_writer == curthread && reentry_ok) ||
6774 (ipx->ipx_writer == NULL && (type == CUR_OP || (type == NEW_OP &&
6775 !ipx->ipx_ipsq_queued && ipx->ipx_current_ipif == NULL &&
6776 ipsq_dlpi_done(ipsq))))) {
6777 /* Success. */
6778 ipx->ipx_reentry_cnt++;
6779 ipx->ipx_writer = curthread;
6780 ipx->ipx_forced = B_FALSE;
6781 mutex_exit(&ipx->ipx_lock);
6782 mutex_exit(&ipsq->ipsq_lock);
6783 mutex_exit(&ill->ill_lock);
6784 RELEASE_CONN_LOCK(q);
6785 #ifdef DEBUG
6786 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
6787 #endif
6788 return (ipsq);
6789 }
6790
6791 if (func != NULL)
6792 ipsq_enq(ipsq, q, mp, func, type, ill);
6793
6794 mutex_exit(&ipx->ipx_lock);
6795 mutex_exit(&ipsq->ipsq_lock);
6796 mutex_exit(&ill->ill_lock);
6797 RELEASE_CONN_LOCK(q);
6798 return (NULL);
6799 }
6800
6801 /*
6802 * The ipsq_t (ipsq) is the synchronization data structure used to serialize
6803 * certain critical operations like plumbing (i.e. most set ioctls), etc.
6804 * There is one ipsq per phyint. The ipsq
6805 * serializes exclusive ioctls issued by applications on a per ipsq basis in
6806 * ipsq_xopq_mphead. It also protects against multiple threads executing in
6807 * the ipsq. Responses from the driver pertain to the current ioctl (say a
6808 * DL_BIND_ACK in response to a DL_BIND_REQ initiated as part of bringing
6809 * up the interface) and are enqueued in ipx_mphead.
6810 *
6811 * If a thread does not want to reenter the ipsq when it is already writer,
6812 * it must ensure that neither the specified reentry point (to be called
6813 * later when the ipsq is empty) nor any code path starting from that
6814 * reentry point ever tries to enter the ipsq again. Otherwise it can lead
6815 * to an infinite loop. The reentry point ip_rput_dlpi_writer is an example.
6816 * When the thread that is currently exclusive finishes, it (ipsq_exit)
6817 * dequeues the requests waiting to become exclusive in ipx_mphead and calls
6818 * the reentry point. When the list at ipx_mphead becomes empty ipsq_exit
6819 * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next
6820 * ioctl if the current ioctl has completed. If the current ioctl is still
6821 * in progress it simply returns. The current ioctl could be waiting for
6822 * a response from another module (the driver) or could be waiting for
6823 * the ipif/ill/ire refcnts to drop to zero. In such a case the ipx_pending_mp
6824 * and ipx_pending_ipif are set. ipx_current_ipif is set throughout the
6825 * execution of the ioctl and ipsq_exit does not start the next ioctl unless
6826 * ipx_current_ipif is NULL which happens only once the ioctl is complete and
6827 * all associated DLPI operations have completed.
6828 */
6829
6830 /*
6831 * Try to enter the IPSQ corresponding to `ipif' or `ill' exclusively (`ipif'
6832 * and `ill' cannot both be specified). Returns a pointer to the entered IPSQ
6833 * on success, or NULL on failure. The caller ensures ipif/ill is valid by
6834 * refholding it as necessary. If the IPSQ cannot be entered and `func' is
6835 * non-NULL, then `func' will be called back with `q' and `mp' once the IPSQ
6836 * can be entered. If `func' is NULL, then `q' and `mp' are ignored.
6837 */
6838 ipsq_t *
6839 ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp,
6840 ipsq_func_t func, int type, boolean_t reentry_ok)
6841 {
6842 ip_stack_t *ipst;
6843 ipsq_t *ipsq;
6844
6845 /* Only 1 of ipif or ill can be specified */
6846 ASSERT((ipif != NULL) ^ (ill != NULL));
6847
6848 if (ipif != NULL)
6849 ill = ipif->ipif_ill;
6850 ipst = ill->ill_ipst;
6851
6852 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
6853 ipsq = ipsq_try_enter_internal(ill, q, mp, func, type, reentry_ok);
6854 rw_exit(&ipst->ips_ill_g_lock);
6855
6856 return (ipsq);
6857 }
6858
6859 /*
6860 * Try to enter the IPSQ corresponding to `ill' as writer. The caller ensures
6861 * ill is valid by refholding it if necessary; we will refrele. If the IPSQ
6862 * cannot be entered, the mp is queued for completion.
6863 */
6864 void
6865 qwriter_ip(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type,
6866 boolean_t reentry_ok)
6867 {
6868 ipsq_t *ipsq;
6869
6870 ipsq = ipsq_try_enter(NULL, ill, q, mp, func, type, reentry_ok);
6871
6872 /*
6873 * Drop the caller's refhold on the ill. This is safe since we either
6874 * entered the IPSQ (and thus are exclusive), or failed to enter the
6875 * IPSQ, in which case we return without accessing ill anymore. This
6876 * is needed because func needs to see the correct refcount;
6877 * e.g., removeif can work only then.
6878 */
6879 ill_refrele(ill);
6880 if (ipsq != NULL) {
6881 (*func)(ipsq, q, mp, NULL);
6882 ipsq_exit(ipsq);
6883 }
6884 }
6885
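/*
 * A typical calling pattern (sketch; actual callers vary):
 *
 *	ill_refhold(ill);
 *	qwriter_ip(ill, q, mp, some_writer_func, NEW_OP, B_TRUE);
 *
 * where some_writer_func is a placeholder for an ipsq_func_t that runs
 * exclusively; qwriter_ip drops the caller's refhold whether or not
 * entry succeeds.
 */
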
6886 /*
6887 * Exit the specified IPSQ. If this is the final exit on it then drain it
6888 * prior to exiting. Caller must be writer on the specified IPSQ.
6889 */
6890 void
6891 ipsq_exit(ipsq_t *ipsq)
6892 {
6893 mblk_t *mp;
6894 ipsq_t *mp_ipsq;
6895 queue_t *q;
6896 phyint_t *phyi;
6897 ipsq_func_t func;
6898
6899 ASSERT(IAM_WRITER_IPSQ(ipsq));
6900
6901 ASSERT(ipsq->ipsq_xop->ipx_reentry_cnt >= 1);
6902 if (ipsq->ipsq_xop->ipx_reentry_cnt != 1) {
6903 ipsq->ipsq_xop->ipx_reentry_cnt--;
6904 return;
6905 }
6906
6907 for (;;) {
6908 phyi = ipsq->ipsq_phyint;
6909 mp = ipsq_dq(ipsq);
6910 mp_ipsq = (mp == NULL) ? NULL : (ipsq_t *)mp->b_next;
6911
6912 /*
6913 * If we've changed to a new IPSQ, and the phyint associated
6914 * with the old one has gone away, free the old IPSQ. Note
6915 * that this cannot happen while the IPSQ is in a group.
6916 */
6917 if (mp_ipsq != ipsq && phyi == NULL) {
6918 ASSERT(ipsq->ipsq_next == ipsq);
6919 ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop);
6920 ipsq_delete(ipsq);
6921 }
6922
6923 if (mp == NULL)
6924 break;
6925
6926 q = mp->b_queue;
6927 func = (ipsq_func_t)mp->b_prev;
6928 ipsq = mp_ipsq;
6929 mp->b_next = mp->b_prev = NULL;
6930 mp->b_queue = NULL;
6931
6932 /*
6933 * If 'q' is a conn queue, it is valid, since we did a
6934 * refhold on the conn at the start of the ioctl.
6935 * If 'q' is an ill queue, it is valid, since close of an
6936 * ill will clean up its IPSQ.
6937 */
6938 (*func)(ipsq, q, mp, NULL);
6939 }
6940 }
6941
6942 /*
6943 * Used to start any igmp or mld timers that could not be started
6944 * while holding ill_mcast_lock. The timers can't be started while holding
6945 * the lock, since mld/igmp_start_timers may need to call untimeout(),
6946 * which can't be done while holding a lock that the timeout handler
6947 * acquires. Otherwise there could be a deadlock, since the
6948 * timeout handlers mld_timeout_handler_per_ill and
6949 * igmp_timeout_handler_per_ill also acquire
6950 * ill_mcast_lock.
6951 */
6952 void
6953 ill_mcast_timer_start(ip_stack_t *ipst)
6954 {
6955 int next;
6956
6957 mutex_enter(&ipst->ips_igmp_timer_lock);
6958 next = ipst->ips_igmp_deferred_next;
6959 ipst->ips_igmp_deferred_next = INFINITY;
6960 mutex_exit(&ipst->ips_igmp_timer_lock);
6961
6962 if (next != INFINITY)
6963 igmp_start_timers(next, ipst);
6964
6965 mutex_enter(&ipst->ips_mld_timer_lock);
6966 next = ipst->ips_mld_deferred_next;
6967 ipst->ips_mld_deferred_next = INFINITY;
6968 mutex_exit(&ipst->ips_mld_timer_lock);
6969
6970 if (next != INFINITY)
6971 mld_start_timers(next, ipst);
6972 }
6973
6974 /*
6975 * Start the current exclusive operation on `ipsq'; associate it with `ipif'
6976 * and `ioccmd'.
6977 */
6978 void
6979 ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd)
6980 {
6981 ill_t *ill = ipif->ipif_ill;
6982 ipxop_t *ipx = ipsq->ipsq_xop;
6983
6984 ASSERT(IAM_WRITER_IPSQ(ipsq));
6985 ASSERT(ipx->ipx_current_ipif == NULL);
6986 ASSERT(ipx->ipx_current_ioctl == 0);
6987
6988 ipx->ipx_current_done = B_FALSE;
6989 ipx->ipx_current_ioctl = ioccmd;
6990 mutex_enter(&ipx->ipx_lock);
6991 ipx->ipx_current_ipif = ipif;
6992 mutex_exit(&ipx->ipx_lock);
6993
6994 /*
6995 * Set IPIF_CHANGING on one or more ipifs associated with the
6996 * current exclusive operation. IPIF_CHANGING prevents any new
6997 * references to the ipif (so that the references will eventually
6998 * drop to zero) and also prevents any "get" operations (e.g.,
6999 * SIOCGLIFFLAGS) from being able to access the ipif until the
7000 * operation has completed and the ipif is again in a stable state.
7001 *
7002 * For ioctls, IPIF_CHANGING is set on the ipif associated with the
7003 * ioctl. For internal operations (where ioccmd is zero), all ipifs
7004 * on the ill are marked with IPIF_CHANGING since it's unclear which
7005 * ipifs will be affected.
7006 *
7007 * Note that SIOCLIFREMOVEIF is a special case as it sets
7008 * IPIF_CONDEMNED internally after identifying the right ipif to
7009 * operate on.
7010 */
7011 switch (ioccmd) {
7012 case SIOCLIFREMOVEIF:
7013 break;
7014 case 0:
7015 mutex_enter(&ill->ill_lock);
7016 ipif = ipif->ipif_ill->ill_ipif;
7017 for (; ipif != NULL; ipif = ipif->ipif_next)
7018 ipif->ipif_state_flags |= IPIF_CHANGING;
7019 mutex_exit(&ill->ill_lock);
7020 break;
7021 default:
7022 mutex_enter(&ill->ill_lock);
7023 ipif->ipif_state_flags |= IPIF_CHANGING;
7024 mutex_exit(&ill->ill_lock);
7025 }
7026 }
7027
7028 /*
7029 * Finish the current exclusive operation on `ipsq'. Usually, this will allow
7030 * the next exclusive operation to begin once we ipsq_exit(). However, if
7031 * pending DLPI operations remain, then we will wait for the queue to drain
7032 * before allowing the next exclusive operation to begin. This ensures that
7033 * DLPI operations from one exclusive operation are never improperly processed
7034 * as part of a subsequent exclusive operation.
7035 */
7036 void
7037 ipsq_current_finish(ipsq_t *ipsq)
7038 {
7039 ipxop_t *ipx = ipsq->ipsq_xop;
7040 t_uscalar_t dlpi_pending = DL_PRIM_INVAL;
7041 ipif_t *ipif = ipx->ipx_current_ipif;
7042
7043 ASSERT(IAM_WRITER_IPSQ(ipsq));
7044
7045 /*
7046 * For SIOCLIFREMOVEIF, the ipif has already been blown away
7047 * (but in that case, IPIF_CHANGING will already be clear and no
7048 * pending DLPI messages can remain).
7049 */
7050 if (ipx->ipx_current_ioctl != SIOCLIFREMOVEIF) {
7051 ill_t *ill = ipif->ipif_ill;
7052
7053 mutex_enter(&ill->ill_lock);
7054 dlpi_pending = ill->ill_dlpi_pending;
7055 if (ipx->ipx_current_ioctl == 0) {
7056 ipif = ill->ill_ipif;
7057 for (; ipif != NULL; ipif = ipif->ipif_next)
7058 ipif->ipif_state_flags &= ~IPIF_CHANGING;
7059 } else {
7060 ipif->ipif_state_flags &= ~IPIF_CHANGING;
7061 }
7062 mutex_exit(&ill->ill_lock);
7063 }
7064
7065 ASSERT(!ipx->ipx_current_done);
7066 ipx->ipx_current_done = B_TRUE;
7067 ipx->ipx_current_ioctl = 0;
7068 if (dlpi_pending == DL_PRIM_INVAL) {
7069 mutex_enter(&ipx->ipx_lock);
7070 ipx->ipx_current_ipif = NULL;
7071 mutex_exit(&ipx->ipx_lock);
7072 }
7073 }
7074
7075 /*
7076 * The ill is closing. Flush all messages on the ipsq that originated
7077 * from this ill. Usually there won't be any messages on the ipsq_xopq_mphead
7078 * for this ill since ipsq_enter could not have entered until then.
7079 * New messages can't be queued since the CONDEMNED flag is set.
7080 */
7081 static void
7082 ipsq_flush(ill_t *ill)
7083 {
7084 queue_t *q;
7085 mblk_t *prev;
7086 mblk_t *mp;
7087 mblk_t *mp_next;
7088 ipxop_t *ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;
7089
7090 ASSERT(IAM_WRITER_ILL(ill));
7091
7092 /*
7093 * Flush any messages sent up by the driver.
7094 */
7095 mutex_enter(&ipx->ipx_lock);
7096 for (prev = NULL, mp = ipx->ipx_mphead; mp != NULL; mp = mp_next) {
7097 mp_next = mp->b_next;
7098 q = mp->b_queue;
7099 if (q == ill->ill_rq || q == ill->ill_wq) {
7100 /* dequeue mp */
7101 if (prev == NULL)
7102 ipx->ipx_mphead = mp->b_next;
7103 else
7104 prev->b_next = mp->b_next;
7105 if (ipx->ipx_mptail == mp) {
7106 ASSERT(mp_next == NULL);
7107 ipx->ipx_mptail = prev;
7108 }
7109 inet_freemsg(mp);
7110 } else {
7111 prev = mp;
7112 }
7113 }
7114 mutex_exit(&ipx->ipx_lock);
7115 (void) ipsq_pending_mp_cleanup(ill, NULL);
7116 ipsq_xopq_mp_cleanup(ill, NULL);
7117 }
7118
7119 /*
7120 * Parse an ifreq or lifreq struct coming down in an ioctl, and refhold
7121 * and return the associated ipif.
7122 * Return value:
7123 * Nonzero: An error has occurred; ci may not be filled out.
7124 * Zero: ci is filled out with the ioctl cmd in ci.ci_name, and
7125 * a held ipif in ci.ci_ipif.
7126 */
7127 int
7128 ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
7129 cmd_info_t *ci)
7130 {
7131 char *name;
7132 struct ifreq *ifr;
7133 struct lifreq *lifr;
7134 ipif_t *ipif = NULL;
7135 ill_t *ill;
7136 conn_t *connp;
7137 boolean_t isv6;
7138 int err;
7139 mblk_t *mp1;
7140 zoneid_t zoneid;
7141 ip_stack_t *ipst;
7142
7143 if (q->q_next != NULL) {
7144 ill = (ill_t *)q->q_ptr;
7145 isv6 = ill->ill_isv6;
7146 connp = NULL;
7147 zoneid = ALL_ZONES;
7148 ipst = ill->ill_ipst;
7149 } else {
7150 ill = NULL;
7151 connp = Q_TO_CONN(q);
7152 isv6 = (connp->conn_family == AF_INET6);
7153 zoneid = connp->conn_zoneid;
7154 if (zoneid == GLOBAL_ZONEID) {
7155 /* global zone can access ipifs in all zones */
7156 zoneid = ALL_ZONES;
7157 }
7158 ipst = connp->conn_netstack->netstack_ip;
7159 }
7160
7161 /* Has been checked in ip_wput_nondata */
7162 mp1 = mp->b_cont->b_cont;
7163
7164 if (ipip->ipi_cmd_type == IF_CMD) {
7165 /* This is an old-style SIOC[GS]IF* command */
7166 ifr = (struct ifreq *)mp1->b_rptr;
7167 /*
7168 * Null terminate the string to protect against buffer
7169 * overrun. String was generated by user code and may not
7170 * be trusted.
7171 */
7172 ifr->ifr_name[IFNAMSIZ - 1] = '\0';
7173 name = ifr->ifr_name;
7174 ci->ci_sin = (sin_t *)&ifr->ifr_addr;
7175 ci->ci_sin6 = NULL;
7176 ci->ci_lifr = (struct lifreq *)ifr;
7177 } else {
7178 /* This is a new-style SIOC[GS]LIF* command */
7179 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
7180 lifr = (struct lifreq *)mp1->b_rptr;
7181 /*
7182 * Null terminate the string to protect against buffer
7183 * overrun. String was generated by user code and may not
7184 * be trusted.
7185 */
7186 lifr->lifr_name[LIFNAMSIZ - 1] = '\0';
7187 name = lifr->lifr_name;
7188 ci->ci_sin = (sin_t *)&lifr->lifr_addr;
7189 ci->ci_sin6 = (sin6_t *)&lifr->lifr_addr;
7190 ci->ci_lifr = lifr;
7191 }
7192
7193 if (ipip->ipi_cmd == SIOCSLIFNAME) {
7194 /*
7195 * The ioctl will fail if it comes down
7196 * a conn stream.
7197 */
7198 if (ill == NULL) {
7199 /*
7200 * Not an ill queue, so fail the ioctl
7201 * with ENXIO.
7202 */
7203 return (ENXIO);
7204 }
7205 ipif = ill->ill_ipif;
7206 ipif_refhold(ipif);
7207 } else {
7208 /*
7209 * Ensure that ioctls don't see any internal state changes
7210 * caused by set ioctls by deferring them if IPIF_CHANGING is
7211 * set.
7212 */
7213 ipif = ipif_lookup_on_name_async(name, mi_strlen(name),
7214 isv6, zoneid, q, mp, ip_process_ioctl, &err, ipst);
7215 if (ipif == NULL) {
7216 if (err == EINPROGRESS)
7217 return (err);
7218 err = 0; /* Ensure we don't use it below */
7219 }
7220 }
7221
7222 /*
7223 * Old style [GS]IFCMD does not admit IPv6 ipif
7224 */
7225 if (ipif != NULL && ipif->ipif_isv6 && ipip->ipi_cmd_type == IF_CMD) {
7226 ipif_refrele(ipif);
7227 return (ENXIO);
7228 }
7229
7230 if (ipif == NULL && ill != NULL && ill->ill_ipif != NULL &&
7231 name[0] == '\0') {
7232 /*
7233 * Handle a SIOC?IF* ioctl with a null name
7234 * during plumb (on the ill queue before the I_PLINK).
7235 */
7236 ipif = ill->ill_ipif;
7237 ipif_refhold(ipif);
7238 }
7239
7240 if (ipif == NULL)
7241 return (ENXIO);
7242
7243 DTRACE_PROBE4(ipif__ioctl, char *, "ip_extract_lifreq",
7244 int, ipip->ipi_cmd, ill_t *, ipif->ipif_ill, ipif_t *, ipif);
7245
7246 ci->ci_ipif = ipif;
7247 return (0);
7248 }
7249
7250 /*
7251 * Return the total number of IPv4 ipifs visible to `zoneid'.
7252 */
7253 static uint_t
7254 ip_get_numifs(zoneid_t zoneid, ip_stack_t *ipst)
7255 {
7256 uint_t numifs = 0;
7257 ill_t *ill;
7258 ill_walk_context_t ctx;
7259 ipif_t *ipif;
7260
7261 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
7262 ill = ILL_START_WALK_V4(&ctx, ipst);
7263 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
7264 if (IS_UNDER_IPMP(ill))
7265 continue;
7266 for (ipif = ill->ill_ipif; ipif != NULL;
7267 ipif = ipif->ipif_next) {
7268 if (ipif->ipif_zoneid == zoneid ||
7269 ipif->ipif_zoneid == ALL_ZONES)
7270 numifs++;
7271 }
7272 }
7273 rw_exit(&ipst->ips_ill_g_lock);
7274 return (numifs);
7275 }
7276
/*
 * Return the number of logical interfaces matching the given address
 * family, LIFC_* flags, and zone.
 */
7280 static uint_t
7281 ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid, ip_stack_t *ipst)
7282 {
7283 uint_t numifs = 0;
7284 ill_t *ill;
7285 ipif_t *ipif;
7286 ill_walk_context_t ctx;
7287
7288 ip1dbg(("ip_get_numlifs(%d %u %d)\n", family, lifn_flags, (int)zoneid));
7289
7290 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
7291 if (family == AF_INET)
7292 ill = ILL_START_WALK_V4(&ctx, ipst);
7293 else if (family == AF_INET6)
7294 ill = ILL_START_WALK_V6(&ctx, ipst);
7295 else
7296 ill = ILL_START_WALK_ALL(&ctx, ipst);
7297
7298 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
7299 if (IS_UNDER_IPMP(ill) && !(lifn_flags & LIFC_UNDER_IPMP))
7300 continue;
7301
7302 for (ipif = ill->ill_ipif; ipif != NULL;
7303 ipif = ipif->ipif_next) {
7304 if ((ipif->ipif_flags & IPIF_NOXMIT) &&
7305 !(lifn_flags & LIFC_NOXMIT))
7306 continue;
7307 if ((ipif->ipif_flags & IPIF_TEMPORARY) &&
7308 !(lifn_flags & LIFC_TEMPORARY))
7309 continue;
7310 if (((ipif->ipif_flags &
7311 (IPIF_NOXMIT|IPIF_NOLOCAL|
7312 IPIF_DEPRECATED)) ||
7313 IS_LOOPBACK(ill) ||
7314 !(ipif->ipif_flags & IPIF_UP)) &&
7315 (lifn_flags & LIFC_EXTERNAL_SOURCE))
7316 continue;
7317
7318 if (zoneid != ipif->ipif_zoneid &&
7319 ipif->ipif_zoneid != ALL_ZONES &&
7320 (zoneid != GLOBAL_ZONEID ||
7321 !(lifn_flags & LIFC_ALLZONES)))
7322 continue;
7323
7324 numifs++;
7325 }
7326 }
7327 rw_exit(&ipst->ips_ill_g_lock);
7328 return (numifs);
7329 }
7330
7331 uint_t
7332 ip_get_lifsrcofnum(ill_t *ill)
7333 {
7334 uint_t numifs = 0;
7335 ill_t *ill_head = ill;
7336 ip_stack_t *ipst = ill->ill_ipst;
7337
/*
 * ill_g_usesrc_lock protects ill_usesrc_grp_next; without it, some
 * other thread could be relinking the ILLs in this usesrc group
 * and adjusting the ill_usesrc_grp_next pointers under us.
 */
7343 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER);
7344 if ((ill->ill_usesrc_ifindex == 0) &&
7345 (ill->ill_usesrc_grp_next != NULL)) {
7346 for (; (ill != NULL) && (ill->ill_usesrc_grp_next != ill_head);
7347 ill = ill->ill_usesrc_grp_next)
7348 numifs++;
7349 }
7350 rw_exit(&ipst->ips_ill_g_usesrc_lock);
7351
7352 return (numifs);
7353 }
7354
7355 /* Null values are passed in for ipif, sin, and ifreq */
7356 /* ARGSUSED */
7357 int
7358 ip_sioctl_get_ifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
7359 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
7360 {
7361 int *nump;
7362 conn_t *connp = Q_TO_CONN(q);
7363
7364 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */
7365
7366 /* Existence of b_cont->b_cont checked in ip_wput_nondata */
7367 nump = (int *)mp->b_cont->b_cont->b_rptr;
7368
7369 *nump = ip_get_numifs(connp->conn_zoneid,
7370 connp->conn_netstack->netstack_ip);
7371 ip1dbg(("ip_sioctl_get_ifnum numifs %d", *nump));
7372 return (0);
7373 }
7374
7375 /* Null values are passed in for ipif, sin, and ifreq */
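/*
 * A minimal userland sketch of the SIOCGLIFNUM call handled below; the
 * socket descriptor `s' is illustrative:
 *
 *	struct lifnum lifn;
 *
 *	lifn.lifn_family = AF_UNSPEC;
 *	lifn.lifn_flags = 0;
 *	if (ioctl(s, SIOCGLIFNUM, &lifn) == 0)
 *		lifn.lifn_count is the number of logical interfaces
 */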
7376 /* ARGSUSED */
7377 int
7378 ip_sioctl_get_lifnum(ipif_t *dummy_ipif, sin_t *dummy_sin,
7379 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
7380 {
7381 struct lifnum *lifn;
7382 mblk_t *mp1;
7383 conn_t *connp = Q_TO_CONN(q);
7384
7385 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */
7386
7387 /* Existence checked in ip_wput_nondata */
7388 mp1 = mp->b_cont->b_cont;
7389
7390 lifn = (struct lifnum *)mp1->b_rptr;
7391 switch (lifn->lifn_family) {
7392 case AF_UNSPEC:
7393 case AF_INET:
7394 case AF_INET6:
7395 break;
7396 default:
7397 return (EAFNOSUPPORT);
7398 }
7399
7400 lifn->lifn_count = ip_get_numlifs(lifn->lifn_family, lifn->lifn_flags,
7401 connp->conn_zoneid, connp->conn_netstack->netstack_ip);
7402 ip1dbg(("ip_sioctl_get_lifnum numifs %d", lifn->lifn_count));
7403 return (0);
7404 }
7405
7406 /* ARGSUSED */
7407 int
7408 ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
7409 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
7410 {
7411 STRUCT_HANDLE(ifconf, ifc);
7412 mblk_t *mp1;
7413 struct iocblk *iocp;
7414 struct ifreq *ifr;
7415 ill_walk_context_t ctx;
7416 ill_t *ill;
7417 ipif_t *ipif;
7418 struct sockaddr_in *sin;
7419 int32_t ifclen;
7420 zoneid_t zoneid;
7421 ip_stack_t *ipst = CONNQ_TO_IPST(q);
7422
7423 ASSERT(q->q_next == NULL); /* not valid ioctls for ip as a module */
7424
7425 ip1dbg(("ip_sioctl_get_ifconf"));
7426 /* Existence verified in ip_wput_nondata */
7427 mp1 = mp->b_cont->b_cont;
7428 iocp = (struct iocblk *)mp->b_rptr;
7429 zoneid = Q_TO_CONN(q)->conn_zoneid;
7430
7431 /*
7432 * The original SIOCGIFCONF passed in a struct ifconf which specified
7433 * the user buffer address and length into which the list of struct
7434 * ifreqs was to be copied. Since AT&T Streams does not seem to
7435 * allow M_COPYOUT to be used in conjunction with I_STR IOCTLS,
7436 * the SIOCGIFCONF operation was redefined to simply provide
7437 * a large output buffer into which we are supposed to jam the ifreq
7438 * array. The same ioctl command code was used, despite the fact that
7439 * both the applications and the kernel code had to change, thus making
7440 * it impossible to support both interfaces.
7441 *
7442 * For reasons not good enough to try to explain, the following
7443 * algorithm is used for deciding what to do with one of these:
7444 * If the IOCTL comes in as an I_STR, it is assumed to be of the new
7445 * form with the output buffer coming down as the continuation message.
7446 * If it arrives as a TRANSPARENT IOCTL, it is assumed to be old style,
7447 * and we have to copy in the ifconf structure to find out how big the
7448 * output buffer is and where to copy out to. Sure no problem...
7449 *
7450 */
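/*
 * A minimal userland sketch of the transparent (old-style) form described
 * above; the socket descriptor `s' and buffer size are illustrative:
 *
 *	struct ifconf ifc;
 *	char buf[8192];
 *
 *	ifc.ifc_len = sizeof (buf);
 *	ifc.ifc_buf = buf;
 *	if (ioctl(s, SIOCGIFCONF, &ifc) == 0) {
 *		int i, n = ifc.ifc_len / sizeof (struct ifreq);
 *
 *		for (i = 0; i < n; i++)
 *			use ifc.ifc_req[i].ifr_name and ifr_addr
 *	}
 */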
7451 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, NULL);
7452 if ((mp1->b_wptr - mp1->b_rptr) == STRUCT_SIZE(ifc)) {
7453 int numifs = 0;
7454 size_t ifc_bufsize;
7455
7456 /*
7457 * Must be (better be!) continuation of a TRANSPARENT
7458 * IOCTL. We just copied in the ifconf structure.
7459 */
7460 STRUCT_SET_HANDLE(ifc, iocp->ioc_flag,
7461 (struct ifconf *)mp1->b_rptr);
7462
7463 /*
7464 * Allocate a buffer to hold requested information.
7465 *
7466 * If ifc_len is larger than what is needed, we only
7467 * allocate what we will use.
7468 *
7469 * If ifc_len is smaller than what is needed, return
7470 * EINVAL.
7471 *
 * XXX: the ill_t structure could have two counters, for
 * v4 and v6 (not just ill_ipif_up_count), to store the
 * number of interfaces for a device, so we wouldn't need
 * to count them here...
7476 */
7477 numifs = ip_get_numifs(zoneid, ipst);
7478
7479 ifclen = STRUCT_FGET(ifc, ifc_len);
7480 ifc_bufsize = numifs * sizeof (struct ifreq);
7481 if (ifc_bufsize > ifclen) {
7482 if (iocp->ioc_cmd == O_SIOCGIFCONF) {
7483 /* old behaviour */
7484 return (EINVAL);
7485 } else {
7486 ifc_bufsize = ifclen;
7487 }
7488 }
7489
7490 mp1 = mi_copyout_alloc(q, mp,
7491 STRUCT_FGETP(ifc, ifc_buf), ifc_bufsize, B_FALSE);
7492 if (mp1 == NULL)
7493 return (ENOMEM);
7494
7495 mp1->b_wptr = mp1->b_rptr + ifc_bufsize;
7496 }
7497 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr);
7498 /*
7499 * the SIOCGIFCONF ioctl only knows about
7500 * IPv4 addresses, so don't try to tell
7501 * it about interfaces with IPv6-only
7502 * addresses. (Last parm 'isv6' is B_FALSE)
7503 */
7504
7505 ifr = (struct ifreq *)mp1->b_rptr;
7506
7507 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
7508 ill = ILL_START_WALK_V4(&ctx, ipst);
7509 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
7510 if (IS_UNDER_IPMP(ill))
7511 continue;
7512 for (ipif = ill->ill_ipif; ipif != NULL;
7513 ipif = ipif->ipif_next) {
7514 if (zoneid != ipif->ipif_zoneid &&
7515 ipif->ipif_zoneid != ALL_ZONES)
7516 continue;
7517 if ((uchar_t *)&ifr[1] > mp1->b_wptr) {
7518 if (iocp->ioc_cmd == O_SIOCGIFCONF) {
7519 /* old behaviour */
7520 rw_exit(&ipst->ips_ill_g_lock);
7521 return (EINVAL);
7522 } else {
7523 goto if_copydone;
7524 }
7525 }
7526 ipif_get_name(ipif, ifr->ifr_name,
7527 sizeof (ifr->ifr_name));
7528 sin = (sin_t *)&ifr->ifr_addr;
7529 *sin = sin_null;
7530 sin->sin_family = AF_INET;
7531 sin->sin_addr.s_addr = ipif->ipif_lcl_addr;
7532 ifr++;
7533 }
7534 }
7535 if_copydone:
7536 rw_exit(&ipst->ips_ill_g_lock);
7537 mp1->b_wptr = (uchar_t *)ifr;
7538
7539 if (STRUCT_BUF(ifc) != NULL) {
7540 STRUCT_FSET(ifc, ifc_len,
7541 (int)((uchar_t *)ifr - mp1->b_rptr));
7542 }
7543 return (0);
7544 }
7545
7546 /*
 * Get the interfaces that use the address hosted on the interface passed
 * in as a source address.
7549 */
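/*
 * A minimal userland sketch of the SIOCGLIFSRCOF call handled below; the
 * socket descriptor `s', interface name, and buffer size are illustrative:
 *
 *	struct lifsrcof lifs;
 *	char buf[8192];
 *
 *	lifs.lifs_ifindex = if_nametoindex("hme0");
 *	lifs.lifs_maxlen = sizeof (buf);
 *	lifs.lifs_buf = buf;
 *	if (ioctl(s, SIOCGLIFSRCOF, &lifs) == 0)
 *		lifs.lifs_len bytes of struct lifreq entries are returned
 */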
7550 /* ARGSUSED */
7551 int
7552 ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
7553 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
7554 {
7555 mblk_t *mp1;
7556 ill_t *ill, *ill_head;
7557 ipif_t *ipif, *orig_ipif;
7558 int numlifs = 0;
7559 size_t lifs_bufsize, lifsmaxlen;
7560 struct lifreq *lifr;
7561 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
7562 uint_t ifindex;
7563 zoneid_t zoneid;
7564 boolean_t isv6 = B_FALSE;
7565 struct sockaddr_in *sin;
7566 struct sockaddr_in6 *sin6;
7567 STRUCT_HANDLE(lifsrcof, lifs);
7568 ip_stack_t *ipst;
7569
7570 ipst = CONNQ_TO_IPST(q);
7571
7572 ASSERT(q->q_next == NULL);
7573
7574 zoneid = Q_TO_CONN(q)->conn_zoneid;
7575
7576 /* Existence verified in ip_wput_nondata */
7577 mp1 = mp->b_cont->b_cont;
7578
7579 /*
7580 * Must be (better be!) continuation of a TRANSPARENT
7581 * IOCTL. We just copied in the lifsrcof structure.
7582 */
7583 STRUCT_SET_HANDLE(lifs, iocp->ioc_flag,
7584 (struct lifsrcof *)mp1->b_rptr);
7585
7586 if (MBLKL(mp1) != STRUCT_SIZE(lifs))
7587 return (EINVAL);
7588
7589 ifindex = STRUCT_FGET(lifs, lifs_ifindex);
7590 isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6;
7591 ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, ipst);
7592 if (ipif == NULL) {
7593 ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n",
7594 ifindex));
7595 return (ENXIO);
7596 }
7597
7598 /* Allocate a buffer to hold requested information */
7599 numlifs = ip_get_lifsrcofnum(ipif->ipif_ill);
7600 lifs_bufsize = numlifs * sizeof (struct lifreq);
7601 lifsmaxlen = STRUCT_FGET(lifs, lifs_maxlen);
7602 /* The actual size needed is always returned in lifs_len */
7603 STRUCT_FSET(lifs, lifs_len, lifs_bufsize);
7604
/* If we need more than was passed in, just report the size in lifs_len */
7606 if (lifs_bufsize > lifsmaxlen || lifs_bufsize == 0) {
7607 ipif_refrele(ipif);
7608 return (0);
7609 }
7610
7611 mp1 = mi_copyout_alloc(q, mp,
7612 STRUCT_FGETP(lifs, lifs_buf), lifs_bufsize, B_FALSE);
7613 if (mp1 == NULL) {
7614 ipif_refrele(ipif);
7615 return (ENOMEM);
7616 }
7617
7618 mp1->b_wptr = mp1->b_rptr + lifs_bufsize;
7619 bzero(mp1->b_rptr, lifs_bufsize);
7620
7621 lifr = (struct lifreq *)mp1->b_rptr;
7622
7623 ill = ill_head = ipif->ipif_ill;
7624 orig_ipif = ipif;
7625
7626 /* ill_g_usesrc_lock protects ill_usesrc_grp_next */
7627 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER);
7628 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
7629
7630 ill = ill->ill_usesrc_grp_next; /* start from next ill */
7631 for (; (ill != NULL) && (ill != ill_head);
7632 ill = ill->ill_usesrc_grp_next) {
7633
7634 if ((uchar_t *)&lifr[1] > mp1->b_wptr)
7635 break;
7636
7637 ipif = ill->ill_ipif;
7638 ipif_get_name(ipif, lifr->lifr_name, sizeof (lifr->lifr_name));
7639 if (ipif->ipif_isv6) {
7640 sin6 = (sin6_t *)&lifr->lifr_addr;
7641 *sin6 = sin6_null;
7642 sin6->sin6_family = AF_INET6;
7643 sin6->sin6_addr = ipif->ipif_v6lcl_addr;
7644 lifr->lifr_addrlen = ip_mask_to_plen_v6(
7645 &ipif->ipif_v6net_mask);
7646 } else {
7647 sin = (sin_t *)&lifr->lifr_addr;
7648 *sin = sin_null;
7649 sin->sin_family = AF_INET;
7650 sin->sin_addr.s_addr = ipif->ipif_lcl_addr;
7651 lifr->lifr_addrlen = ip_mask_to_plen(
7652 ipif->ipif_net_mask);
7653 }
7654 lifr++;
7655 }
7656 rw_exit(&ipst->ips_ill_g_lock);
7657 rw_exit(&ipst->ips_ill_g_usesrc_lock);
7658 ipif_refrele(orig_ipif);
7659 mp1->b_wptr = (uchar_t *)lifr;
7660 STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr));
7661
7662 return (0);
7663 }
7664
7665 /* ARGSUSED */
7666 int
7667 ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
7668 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
7669 {
7670 mblk_t *mp1;
7671 int list;
7672 ill_t *ill;
7673 ipif_t *ipif;
7674 int flags;
7675 int numlifs = 0;
7676 size_t lifc_bufsize;
7677 struct lifreq *lifr;
7678 sa_family_t family;
7679 struct sockaddr_in *sin;
7680 struct sockaddr_in6 *sin6;
7681 ill_walk_context_t ctx;
7682 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
7683 int32_t lifclen;
7684 zoneid_t zoneid;
7685 STRUCT_HANDLE(lifconf, lifc);
7686 ip_stack_t *ipst = CONNQ_TO_IPST(q);
7687
7688 ip1dbg(("ip_sioctl_get_lifconf"));
7689
7690 ASSERT(q->q_next == NULL);
7691
7692 zoneid = Q_TO_CONN(q)->conn_zoneid;
7693
7694 /* Existence verified in ip_wput_nondata */
7695 mp1 = mp->b_cont->b_cont;
7696
7697 /*
7698 * An extended version of SIOCGIFCONF that takes an
7699 * additional address family and flags field.
 * AF_UNSPEC retrieves both IPv4 and IPv6 interfaces.
 * Unless LIFC_NOXMIT is specified, IPIF_NOXMIT
 * interfaces are omitted.
 * Similarly, IPIF_TEMPORARY interfaces are omitted
 * unless LIFC_TEMPORARY is specified.
 * If LIFC_EXTERNAL_SOURCE is specified, interfaces that are
 * IPIF_NOXMIT, IPIF_NOLOCAL, PHYI_LOOPBACK, IPIF_DEPRECATED,
 * or not IPIF_UP are omitted. LIFC_EXTERNAL_SOURCE
 * has priority over LIFC_NOXMIT.
7709 */
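/*
 * A minimal userland sketch of this call; the socket descriptor `s' is
 * illustrative, and the buffer is sized from a preceding SIOCGLIFNUM:
 *
 *	struct lifnum lifn;
 *	struct lifconf lifc;
 *
 *	lifn.lifn_family = AF_UNSPEC;
 *	lifn.lifn_flags = 0;
 *	(void) ioctl(s, SIOCGLIFNUM, &lifn);
 *	lifc.lifc_family = AF_UNSPEC;
 *	lifc.lifc_flags = 0;
 *	lifc.lifc_len = lifn.lifn_count * sizeof (struct lifreq);
 *	lifc.lifc_buf = malloc(lifc.lifc_len);
 *	if (ioctl(s, SIOCGLIFCONF, &lifc) == 0)
 *		lifc.lifc_req[] holds lifc.lifc_len bytes of lifreq entries
 */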
7710 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, NULL);
7711
7712 if ((mp1->b_wptr - mp1->b_rptr) != STRUCT_SIZE(lifc))
7713 return (EINVAL);
7714
7715 /*
7716 * Must be (better be!) continuation of a TRANSPARENT
7717 * IOCTL. We just copied in the lifconf structure.
7718 */
7719 STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, (struct lifconf *)mp1->b_rptr);
7720
7721 family = STRUCT_FGET(lifc, lifc_family);
7722 flags = STRUCT_FGET(lifc, lifc_flags);
7723
7724 switch (family) {
7725 case AF_UNSPEC:
7726 /*
 * walk all ILLs.
7728 */
7729 list = MAX_G_HEADS;
7730 break;
7731 case AF_INET:
7732 /*
 * walk only IPv4 ILLs.
7734 */
7735 list = IP_V4_G_HEAD;
7736 break;
7737 case AF_INET6:
7738 /*
 * walk only IPv6 ILLs.
7740 */
7741 list = IP_V6_G_HEAD;
7742 break;
7743 default:
7744 return (EAFNOSUPPORT);
7745 }
7746
7747 /*
7748 * Allocate a buffer to hold requested information.
7749 *
7750 * If lifc_len is larger than what is needed, we only
7751 * allocate what we will use.
7752 *
7753 * If lifc_len is smaller than what is needed, return
7754 * EINVAL.
7755 */
7756 numlifs = ip_get_numlifs(family, flags, zoneid, ipst);
7757 lifc_bufsize = numlifs * sizeof (struct lifreq);
7758 lifclen = STRUCT_FGET(lifc, lifc_len);
7759 if (lifc_bufsize > lifclen) {
7760 if (iocp->ioc_cmd == O_SIOCGLIFCONF)
7761 return (EINVAL);
7762 else
7763 lifc_bufsize = lifclen;
7764 }
7765
7766 mp1 = mi_copyout_alloc(q, mp,
7767 STRUCT_FGETP(lifc, lifc_buf), lifc_bufsize, B_FALSE);
7768 if (mp1 == NULL)
7769 return (ENOMEM);
7770
7771 mp1->b_wptr = mp1->b_rptr + lifc_bufsize;
7772 bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr);
7773
7774 lifr = (struct lifreq *)mp1->b_rptr;
7775
7776 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
7777 ill = ill_first(list, list, &ctx, ipst);
7778 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
7779 if (IS_UNDER_IPMP(ill) && !(flags & LIFC_UNDER_IPMP))
7780 continue;
7781
7782 for (ipif = ill->ill_ipif; ipif != NULL;
7783 ipif = ipif->ipif_next) {
7784 if ((ipif->ipif_flags & IPIF_NOXMIT) &&
7785 !(flags & LIFC_NOXMIT))
7786 continue;
7787
7788 if ((ipif->ipif_flags & IPIF_TEMPORARY) &&
7789 !(flags & LIFC_TEMPORARY))
7790 continue;
7791
7792 if (((ipif->ipif_flags &
7793 (IPIF_NOXMIT|IPIF_NOLOCAL|
7794 IPIF_DEPRECATED)) ||
7795 IS_LOOPBACK(ill) ||
7796 !(ipif->ipif_flags & IPIF_UP)) &&
7797 (flags & LIFC_EXTERNAL_SOURCE))
7798 continue;
7799
7800 if (zoneid != ipif->ipif_zoneid &&
7801 ipif->ipif_zoneid != ALL_ZONES &&
7802 (zoneid != GLOBAL_ZONEID ||
7803 !(flags & LIFC_ALLZONES)))
7804 continue;
7805
7806 if ((uchar_t *)&lifr[1] > mp1->b_wptr) {
7807 if (iocp->ioc_cmd == O_SIOCGLIFCONF) {
7808 rw_exit(&ipst->ips_ill_g_lock);
7809 return (EINVAL);
7810 } else {
7811 goto lif_copydone;
7812 }
7813 }
7814
7815 ipif_get_name(ipif, lifr->lifr_name,
7816 sizeof (lifr->lifr_name));
7817 lifr->lifr_type = ill->ill_type;
7818 if (ipif->ipif_isv6) {
7819 sin6 = (sin6_t *)&lifr->lifr_addr;
7820 *sin6 = sin6_null;
7821 sin6->sin6_family = AF_INET6;
7822 sin6->sin6_addr =
7823 ipif->ipif_v6lcl_addr;
7824 lifr->lifr_addrlen =
7825 ip_mask_to_plen_v6(
7826 &ipif->ipif_v6net_mask);
7827 } else {
7828 sin = (sin_t *)&lifr->lifr_addr;
7829 *sin = sin_null;
7830 sin->sin_family = AF_INET;
7831 sin->sin_addr.s_addr =
7832 ipif->ipif_lcl_addr;
7833 lifr->lifr_addrlen =
7834 ip_mask_to_plen(
7835 ipif->ipif_net_mask);
7836 }
7837 lifr++;
7838 }
7839 }
7840 lif_copydone:
7841 rw_exit(&ipst->ips_ill_g_lock);
7842
7843 mp1->b_wptr = (uchar_t *)lifr;
7844 if (STRUCT_BUF(lifc) != NULL) {
7845 STRUCT_FSET(lifc, lifc_len,
7846 (int)((uchar_t *)lifr - mp1->b_rptr));
7847 }
7848 return (0);
7849 }
7850
7851 static void
7852 ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp)
7853 {
7854 ip6_asp_t *table;
7855 size_t table_size;
7856 mblk_t *data_mp;
7857 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
7858 ip_stack_t *ipst;
7859
7860 if (q->q_next == NULL)
7861 ipst = CONNQ_TO_IPST(q);
7862 else
7863 ipst = ILLQ_TO_IPST(q);
7864
7865 /* These two ioctls are I_STR only */
7866 if (iocp->ioc_count == TRANSPARENT) {
7867 miocnak(q, mp, 0, EINVAL);
7868 return;
7869 }
7870
7871 data_mp = mp->b_cont;
7872 if (data_mp == NULL) {
7873 /* The user passed us a NULL argument */
7874 table = NULL;
7875 table_size = iocp->ioc_count;
7876 } else {
7877 /*
7878 * The user provided a table. The stream head
7879 * may have copied in the user data in chunks,
7880 * so make sure everything is pulled up
7881 * properly.
7882 */
7883 if (MBLKL(data_mp) < iocp->ioc_count) {
7884 mblk_t *new_data_mp;
7885 if ((new_data_mp = msgpullup(data_mp, -1)) ==
7886 NULL) {
7887 miocnak(q, mp, 0, ENOMEM);
7888 return;
7889 }
7890 freemsg(data_mp);
7891 data_mp = new_data_mp;
7892 mp->b_cont = data_mp;
7893 }
7894 table = (ip6_asp_t *)data_mp->b_rptr;
7895 table_size = iocp->ioc_count;
7896 }
7897
7898 switch (iocp->ioc_cmd) {
7899 case SIOCGIP6ADDRPOLICY:
7900 iocp->ioc_rval = ip6_asp_get(table, table_size, ipst);
7901 if (iocp->ioc_rval == -1)
7902 iocp->ioc_error = EINVAL;
7903 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4
7904 else if (table != NULL &&
7905 (iocp->ioc_flag & IOC_MODELS) == IOC_ILP32) {
7906 ip6_asp_t *src = table;
7907 ip6_asp32_t *dst = (void *)table;
7908 int count = table_size / sizeof (ip6_asp_t);
7909 int i;
7910
/*
 * We need to do an in-place shrink of the array
 * to match the alignment attributes that the
 * 32-bit ABI expects of it.
 */
7916 /* LINTED: logical expression always true: op "||" */
7917 ASSERT(sizeof (*src) > sizeof (*dst));
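/*
 * Entry 0 has the same offset in both layouts, so the
 * copy starts at entry 1; each later entry is slid
 * down to its 32-bit offset.
 */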
7918 for (i = 1; i < count; i++)
7919 bcopy(src + i, dst + i, sizeof (*dst));
7920 }
7921 #endif
7922 break;
7923
7924 case SIOCSIP6ADDRPOLICY:
7925 ASSERT(mp->b_prev == NULL);
7926 mp->b_prev = (void *)q;
7927 #if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4
7928 /*
7929 * We pass in the datamodel here so that the ip6_asp_replace()
7930 * routine can handle converting from 32-bit to native formats
7931 * where necessary.
7932 *
7933 * A better way to handle this might be to convert the inbound
7934 * data structure here, and hang it off a new 'mp'; thus the
7935 * ip6_asp_replace() logic would always be dealing with native
7936 * format data structures..
7937 *
7938 * (An even simpler way to handle these ioctls is to just
7939 * add a 32-bit trailing 'pad' field to the ip6_asp_t structure
7940 * and just recompile everything that depends on it.)
7941 */
7942 #endif
7943 ip6_asp_replace(mp, table, table_size, B_FALSE, ipst,
7944 iocp->ioc_flag & IOC_MODELS);
7945 return;
7946 }
7947
7948 DB_TYPE(mp) = (iocp->ioc_error == 0) ? M_IOCACK : M_IOCNAK;
7949 qreply(q, mp);
7950 }
7951
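/*
 * A minimal userland sketch of the I_STR form the handler below expects;
 * the socket descriptor `s' and destinations are illustrative:
 *
 *	struct dstinforeq reqs[2];
 *	struct strioctl str;
 *
 *	bzero(reqs, sizeof (reqs));
 *	reqs[0].dir_daddr = dst0;	(in6_addr_t destinations;
 *	reqs[1].dir_daddr = dst1;	 v4-mapped for IPv4)
 *	str.ic_cmd = SIOCGDSTINFO;
 *	str.ic_timout = -1;
 *	str.ic_len = sizeof (reqs);
 *	str.ic_dp = (char *)reqs;
 *	if (ioctl(s, I_STR, &str) == 0)
 *		each reqs[i] has dir_dreachable, dir_saddr, etc. filled in
 */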
7952 static void
7953 ip_sioctl_dstinfo(queue_t *q, mblk_t *mp)
7954 {
7955 mblk_t *data_mp;
7956 struct dstinforeq *dir;
7957 uint8_t *end, *cur;
7958 in6_addr_t *daddr, *saddr;
7959 ipaddr_t v4daddr;
7960 ire_t *ire;
7961 ipaddr_t v4setsrc;
7962 in6_addr_t v6setsrc;
7963 char *slabel, *dlabel;
7964 boolean_t isipv4;
7965 int match_ire;
7966 ill_t *dst_ill;
7967 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
7968 conn_t *connp = Q_TO_CONN(q);
7969 zoneid_t zoneid = IPCL_ZONEID(connp);
7970 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
7971 uint64_t ipif_flags;
7972
7973 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */
7974
7975 /*
7976 * This ioctl is I_STR only, and must have a
7977 * data mblk following the M_IOCTL mblk.
7978 */
7979 data_mp = mp->b_cont;
7980 if (iocp->ioc_count == TRANSPARENT || data_mp == NULL) {
7981 miocnak(q, mp, 0, EINVAL);
7982 return;
7983 }
7984
7985 if (MBLKL(data_mp) < iocp->ioc_count) {
7986 mblk_t *new_data_mp;
7987
7988 if ((new_data_mp = msgpullup(data_mp, -1)) == NULL) {
7989 miocnak(q, mp, 0, ENOMEM);
7990 return;
7991 }
7992 freemsg(data_mp);
7993 data_mp = new_data_mp;
7994 mp->b_cont = data_mp;
7995 }
7996 match_ire = MATCH_IRE_DSTONLY;
7997
7998 for (cur = data_mp->b_rptr, end = data_mp->b_wptr;
7999 end - cur >= sizeof (struct dstinforeq);
8000 cur += sizeof (struct dstinforeq)) {
8001 dir = (struct dstinforeq *)cur;
8002 daddr = &dir->dir_daddr;
8003 saddr = &dir->dir_saddr;
8004
8005 /*
8006 * ip_addr_scope_v6() and ip6_asp_lookup() handle
8007 * v4 mapped addresses; ire_ftable_lookup_v6()
8008 * and ip_select_source_v6() do not.
8009 */
8010 dir->dir_dscope = ip_addr_scope_v6(daddr);
8011 dlabel = ip6_asp_lookup(daddr, &dir->dir_precedence, ipst);
8012
8013 isipv4 = IN6_IS_ADDR_V4MAPPED(daddr);
8014 if (isipv4) {
8015 IN6_V4MAPPED_TO_IPADDR(daddr, v4daddr);
8016 v4setsrc = INADDR_ANY;
8017 ire = ire_route_recursive_v4(v4daddr, 0, NULL, zoneid,
8018 NULL, match_ire, IRR_ALLOCATE, 0, ipst, &v4setsrc,
8019 NULL, NULL);
8020 } else {
8021 v6setsrc = ipv6_all_zeros;
8022 ire = ire_route_recursive_v6(daddr, 0, NULL, zoneid,
8023 NULL, match_ire, IRR_ALLOCATE, 0, ipst, &v6setsrc,
8024 NULL, NULL);
8025 }
8026 ASSERT(ire != NULL);
8027 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
8028 ire_refrele(ire);
8029 dir->dir_dreachable = 0;
8030
8031 /* move on to next dst addr */
8032 continue;
8033 }
8034 dir->dir_dreachable = 1;
8035
8036 dst_ill = ire_nexthop_ill(ire);
8037 if (dst_ill == NULL) {
8038 ire_refrele(ire);
8039 continue;
8040 }
8041
8042 /* With ipmp we most likely look at the ipmp ill here */
8043 dir->dir_dmactype = dst_ill->ill_mactype;
8044
8045 if (isipv4) {
8046 ipaddr_t v4saddr;
8047
8048 if (ip_select_source_v4(dst_ill, v4setsrc, v4daddr,
8049 connp->conn_ixa->ixa_multicast_ifaddr, zoneid, ipst,
8050 &v4saddr, NULL, &ipif_flags) != 0) {
8051 v4saddr = INADDR_ANY;
8052 ipif_flags = 0;
8053 }
8054 IN6_IPADDR_TO_V4MAPPED(v4saddr, saddr);
8055 } else {
8056 if (ip_select_source_v6(dst_ill, &v6setsrc, daddr,
8057 zoneid, ipst, B_FALSE, IPV6_PREFER_SRC_DEFAULT,
8058 saddr, NULL, &ipif_flags) != 0) {
8059 *saddr = ipv6_all_zeros;
8060 ipif_flags = 0;
8061 }
8062 }
8063
8064 dir->dir_sscope = ip_addr_scope_v6(saddr);
8065 slabel = ip6_asp_lookup(saddr, NULL, ipst);
8066 dir->dir_labelmatch = ip6_asp_labelcmp(dlabel, slabel);
8067 dir->dir_sdeprecated = (ipif_flags & IPIF_DEPRECATED) ? 1 : 0;
8068 ire_refrele(ire);
8069 ill_refrele(dst_ill);
8070 }
8071 miocack(q, mp, iocp->ioc_count, 0);
8072 }
8073
8074 /*
8075 * Check if this is an address assigned to this machine.
8076 * Skips interfaces that are down by using ire checks.
8077 * Translates mapped addresses to v4 addresses and then
8078 * treats them as such, returning true if the v4 address
8079 * associated with this mapped address is configured.
8080 * Note: Applications will have to be careful what they do
8081 * with the response; use of mapped addresses limits
8082 * what can be done with the socket, especially with
8083 * respect to socket options and ioctls - neither IPv4
8084 * options nor IPv6 sticky options/ancillary data options
8085 * may be used.
8086 */
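/*
 * A minimal userland sketch of the SIOCTMYADDR call handled below; the
 * socket descriptor `s' and address are illustrative:
 *
 *	struct sioc_addrreq sar;
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&sar.sa_addr;
 *
 *	bzero(&sar, sizeof (sar));
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("127.0.0.1");
 *	if (ioctl(s, SIOCTMYADDR, &sar) == 0)
 *		sar.sa_res is 1 if the address is assigned here, else 0
 */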
8087 /* ARGSUSED */
8088 int
8089 ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
8090 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
8091 {
8092 struct sioc_addrreq *sia;
8093 sin_t *sin;
8094 ire_t *ire;
8095 mblk_t *mp1;
8096 zoneid_t zoneid;
8097 ip_stack_t *ipst;
8098
8099 ip1dbg(("ip_sioctl_tmyaddr"));
8100
8101 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */
8102 zoneid = Q_TO_CONN(q)->conn_zoneid;
8103 ipst = CONNQ_TO_IPST(q);
8104
8105 /* Existence verified in ip_wput_nondata */
8106 mp1 = mp->b_cont->b_cont;
8107 sia = (struct sioc_addrreq *)mp1->b_rptr;
8108 sin = (sin_t *)&sia->sa_addr;
8109 switch (sin->sin_family) {
8110 case AF_INET6: {
8111 sin6_t *sin6 = (sin6_t *)sin;
8112
8113 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
8114 ipaddr_t v4_addr;
8115
8116 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr,
8117 v4_addr);
8118 ire = ire_ftable_lookup_v4(v4_addr, 0, 0,
8119 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, NULL,
8120 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL);
8121 } else {
8122 in6_addr_t v6addr;
8123
8124 v6addr = sin6->sin6_addr;
8125 ire = ire_ftable_lookup_v6(&v6addr, 0, 0,
8126 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, NULL,
8127 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL);
8128 }
8129 break;
8130 }
8131 case AF_INET: {
8132 ipaddr_t v4addr;
8133
8134 v4addr = sin->sin_addr.s_addr;
8135 ire = ire_ftable_lookup_v4(v4addr, 0, 0,
8136 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid,
8137 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL);
8138 break;
8139 }
8140 default:
8141 return (EAFNOSUPPORT);
8142 }
8143 if (ire != NULL) {
8144 sia->sa_res = 1;
8145 ire_refrele(ire);
8146 } else {
8147 sia->sa_res = 0;
8148 }
8149 return (0);
8150 }
8151
8152 /*
 * Check if this is an address assigned on-link, i.e., to a neighbor,
 * and make sure it's reachable from the current zone.
8155 * Returns true for my addresses as well.
8156 * Translates mapped addresses to v4 addresses and then
8157 * treats them as such, returning true if the v4 address
8158 * associated with this mapped address is configured.
8159 * Note: Applications will have to be careful what they do
8160 * with the response; use of mapped addresses limits
8161 * what can be done with the socket, especially with
8162 * respect to socket options and ioctls - neither IPv4
8163 * options nor IPv6 sticky options/ancillary data options
8164 * may be used.
8165 */
8166 /* ARGSUSED */
8167 int
8168 ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
8170 {
8171 struct sioc_addrreq *sia;
8172 sin_t *sin;
8173 mblk_t *mp1;
8174 ire_t *ire = NULL;
8175 zoneid_t zoneid;
8176 ip_stack_t *ipst;
8177
8178 ip1dbg(("ip_sioctl_tonlink"));
8179
8180 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */
8181 zoneid = Q_TO_CONN(q)->conn_zoneid;
8182 ipst = CONNQ_TO_IPST(q);
8183
8184 /* Existence verified in ip_wput_nondata */
8185 mp1 = mp->b_cont->b_cont;
8186 sia = (struct sioc_addrreq *)mp1->b_rptr;
8187 sin = (sin_t *)&sia->sa_addr;
8188
8189 /*
8190 * We check for IRE_ONLINK and exclude IRE_BROADCAST|IRE_MULTICAST
 * to make sure we only look at on-link unicast addresses.
8192 */
8193 switch (sin->sin_family) {
8194 case AF_INET6: {
8195 sin6_t *sin6 = (sin6_t *)sin;
8196
8197 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
8198 ipaddr_t v4_addr;
8199
8200 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr,
8201 v4_addr);
8202 if (!CLASSD(v4_addr)) {
8203 ire = ire_ftable_lookup_v4(v4_addr, 0, 0, 0,
8204 NULL, zoneid, NULL, MATCH_IRE_DSTONLY,
8205 0, ipst, NULL);
8206 }
8207 } else {
8208 in6_addr_t v6addr;
8209
8210 v6addr = sin6->sin6_addr;
8211 if (!IN6_IS_ADDR_MULTICAST(&v6addr)) {
8212 ire = ire_ftable_lookup_v6(&v6addr, 0, 0, 0,
8213 NULL, zoneid, NULL, MATCH_IRE_DSTONLY, 0,
8214 ipst, NULL);
8215 }
8216 }
8217 break;
8218 }
8219 case AF_INET: {
8220 ipaddr_t v4addr;
8221
8222 v4addr = sin->sin_addr.s_addr;
8223 if (!CLASSD(v4addr)) {
8224 ire = ire_ftable_lookup_v4(v4addr, 0, 0, 0, NULL,
8225 zoneid, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
8226 }
8227 break;
8228 }
8229 default:
8230 return (EAFNOSUPPORT);
8231 }
8232 sia->sa_res = 0;
8233 if (ire != NULL) {
8234 ASSERT(!(ire->ire_type & IRE_MULTICAST));
8235
8236 if ((ire->ire_type & IRE_ONLINK) &&
8237 !(ire->ire_type & IRE_BROADCAST))
8238 sia->sa_res = 1;
8239 ire_refrele(ire);
8240 }
8241 return (0);
8242 }
8243
8244 /*
 * TBD: implement when the kernel maintains a list of site prefixes.
8246 */
8247 /* ARGSUSED */
8248 int
8249 ip_sioctl_tmysite(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
8250 ip_ioctl_cmd_t *ipip, void *ifreq)
8251 {
8252 return (ENXIO);
8253 }
8254
8255 /* ARP IOCTLs. */
8256 /* ARGSUSED */
8257 int
8258 ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
8259 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
8260 {
8261 int err;
8262 ipaddr_t ipaddr;
8263 struct iocblk *iocp;
8264 conn_t *connp;
8265 struct arpreq *ar;
8266 struct xarpreq *xar;
8267 int arp_flags, flags, alength;
8268 uchar_t *lladdr;
8269 ip_stack_t *ipst;
8270 ill_t *ill = ipif->ipif_ill;
8271 ill_t *proxy_ill = NULL;
8272 ipmp_arpent_t *entp = NULL;
8273 boolean_t proxyarp = B_FALSE;
8274 boolean_t if_arp_ioctl = B_FALSE;
8275 ncec_t *ncec = NULL;
8276 nce_t *nce;
8277
8278 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
8279 connp = Q_TO_CONN(q);
8280 ipst = connp->conn_netstack->netstack_ip;
8281 iocp = (struct iocblk *)mp->b_rptr;
8282
8283 if (ipip->ipi_cmd_type == XARP_CMD) {
8284 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->XARPREQ_MBLK */
8285 xar = (struct xarpreq *)mp->b_cont->b_cont->b_rptr;
8286 ar = NULL;
8287
8288 arp_flags = xar->xarp_flags;
8289 lladdr = (uchar_t *)LLADDR(&xar->xarp_ha);
8290 if_arp_ioctl = (xar->xarp_ha.sdl_nlen != 0);
8291 /*
8292 * Validate against user's link layer address length
8293 * input and name and addr length limits.
8294 */
8295 alength = ill->ill_phys_addr_length;
8296 if (ipip->ipi_cmd == SIOCSXARP) {
8297 if (alength != xar->xarp_ha.sdl_alen ||
8298 (alength + xar->xarp_ha.sdl_nlen >
8299 sizeof (xar->xarp_ha.sdl_data)))
8300 return (EINVAL);
8301 }
8302 } else {
8303 /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->ARPREQ_MBLK */
8304 ar = (struct arpreq *)mp->b_cont->b_cont->b_rptr;
8305 xar = NULL;
8306
8307 arp_flags = ar->arp_flags;
8308 lladdr = (uchar_t *)ar->arp_ha.sa_data;
8309 /*
8310 * Theoretically, the sa_family could tell us what link
8311 * layer type this operation is trying to deal with. By
8312 * common usage AF_UNSPEC means ethernet. We'll assume
8313 * any attempt to use the SIOC?ARP ioctls is for ethernet,
8314 * for now. Our new SIOC*XARP ioctls can be used more
8315 * generally.
8316 *
 * If the underlying media happens to have a non-6-byte
 * address, the arp module will fail the set/get, but the
 * del operation will succeed.
8320 */
8321 alength = 6;
8322 if ((ipip->ipi_cmd != SIOCDARP) &&
8323 (alength != ill->ill_phys_addr_length)) {
8324 return (EINVAL);
8325 }
8326 }
8327
8328 /* Translate ATF* flags to NCE* flags */
8329 flags = 0;
8330 if (arp_flags & ATF_AUTHORITY)
8331 flags |= NCE_F_AUTHORITY;
8332 if (arp_flags & ATF_PERM)
8333 flags |= NCE_F_NONUD; /* not subject to aging */
8334 if (arp_flags & ATF_PUBL)
8335 flags |= NCE_F_PUBLISH;
8336
8337 /*
8338 * IPMP ARP special handling:
8339 *
8340 * 1. Since ARP mappings must appear consistent across the group,
8341 * prohibit changing ARP mappings on the underlying interfaces.
8342 *
8343 * 2. Since ARP mappings for IPMP data addresses are maintained by
8344 * IP itself, prohibit changing them.
8345 *
8346 * 3. For proxy ARP, use a functioning hardware address in the group,
8347 * provided one exists. If one doesn't, just add the entry as-is;
8348 * ipmp_illgrp_refresh_arpent() will refresh it if things change.
8349 */
8350 if (IS_UNDER_IPMP(ill)) {
8351 if (ipip->ipi_cmd != SIOCGARP && ipip->ipi_cmd != SIOCGXARP)
8352 return (EPERM);
8353 }
8354 if (IS_IPMP(ill)) {
8355 ipmp_illgrp_t *illg = ill->ill_grp;
8356
8357 switch (ipip->ipi_cmd) {
8358 case SIOCSARP:
8359 case SIOCSXARP:
8360 proxy_ill = ipmp_illgrp_find_ill(illg, lladdr, alength);
8361 if (proxy_ill != NULL) {
8362 proxyarp = B_TRUE;
8363 if (!ipmp_ill_is_active(proxy_ill))
8364 proxy_ill = ipmp_illgrp_next_ill(illg);
8365 if (proxy_ill != NULL)
8366 lladdr = proxy_ill->ill_phys_addr;
8367 }
8368 /* FALLTHRU */
8369 }
8370 }
8371
8372 ipaddr = sin->sin_addr.s_addr;
8373 /*
8374 * don't match across illgrp per case (1) and (2).
8375 * XXX use IS_IPMP(ill) like ndp_sioc_update?
8376 */
8377 nce = nce_lookup_v4(ill, &ipaddr);
8378 if (nce != NULL)
8379 ncec = nce->nce_common;
8380
8381 switch (iocp->ioc_cmd) {
8382 case SIOCDARP:
8383 case SIOCDXARP: {
8384 /*
8385 * Delete the NCE if any.
8386 */
8387 if (ncec == NULL) {
8388 iocp->ioc_error = ENXIO;
8389 break;
8390 }
8391 /* Don't allow changes to arp mappings of local addresses. */
8392 if (NCE_MYADDR(ncec)) {
8393 nce_refrele(nce);
8394 return (ENOTSUP);
8395 }
8396 iocp->ioc_error = 0;
8397
8398 /*
8399 * Delete the nce_common which has ncec_ill set to ipmp_ill.
8400 * This will delete all the nce entries on the under_ills.
8401 */
8402 ncec_delete(ncec);
8403 /*
8404 * Once the NCE has been deleted, then the ire_dep* consistency
8405 * mechanism will find any IRE which depended on the now
8406 * condemned NCE (as part of sending packets).
8407 * That mechanism handles redirects by deleting redirects
8408 * that refer to UNREACHABLE nces.
8409 */
8410 break;
8411 }
8412 case SIOCGARP:
8413 case SIOCGXARP:
8414 if (ncec != NULL) {
8415 lladdr = ncec->ncec_lladdr;
8416 flags = ncec->ncec_flags;
8417 iocp->ioc_error = 0;
8418 ip_sioctl_garp_reply(mp, ncec->ncec_ill, lladdr, flags);
8419 } else {
8420 iocp->ioc_error = ENXIO;
8421 }
8422 break;
8423 case SIOCSARP:
8424 case SIOCSXARP:
8425 /* Don't allow changes to arp mappings of local addresses. */
8426 if (ncec != NULL && NCE_MYADDR(ncec)) {
8427 nce_refrele(nce);
8428 return (ENOTSUP);
8429 }
8430
8431 /* static arp entries will undergo NUD if ATF_PERM is not set */
8432 flags |= NCE_F_STATIC;
8433 if (!if_arp_ioctl) {
8434 ip_nce_lookup_and_update(&ipaddr, NULL, ipst,
8435 lladdr, alength, flags);
8436 } else {
8437 ipif_t *ipif = ipif_get_next_ipif(NULL, ill);
8438 if (ipif != NULL) {
8439 ip_nce_lookup_and_update(&ipaddr, ipif, ipst,
8440 lladdr, alength, flags);
8441 ipif_refrele(ipif);
8442 }
8443 }
8444 if (nce != NULL) {
8445 nce_refrele(nce);
8446 nce = NULL;
8447 }
8448 /*
8449 * NCE_F_STATIC entries will be added in state ND_REACHABLE
8450 * by nce_add_common()
8451 */
8452 err = nce_lookup_then_add_v4(ill, lladdr,
8453 ill->ill_phys_addr_length, &ipaddr, flags, ND_UNCHANGED,
8454 &nce);
8455 if (err == EEXIST) {
8456 ncec = nce->nce_common;
8457 mutex_enter(&ncec->ncec_lock);
8458 ncec->ncec_state = ND_REACHABLE;
8459 ncec->ncec_flags = flags;
8460 nce_update(ncec, ND_UNCHANGED, lladdr);
8461 mutex_exit(&ncec->ncec_lock);
8462 err = 0;
8463 }
8464 if (nce != NULL) {
8465 nce_refrele(nce);
8466 nce = NULL;
8467 }
8468 if (IS_IPMP(ill) && err == 0) {
8469 entp = ipmp_illgrp_create_arpent(ill->ill_grp,
8470 proxyarp, ipaddr, lladdr, ill->ill_phys_addr_length,
8471 flags);
8472 if (entp == NULL || (proxyarp && proxy_ill == NULL)) {
8473 iocp->ioc_error = (entp == NULL ? ENOMEM : 0);
8474 break;
8475 }
8476 }
8477 iocp->ioc_error = err;
8478 }
8479
8480 if (nce != NULL) {
8481 nce_refrele(nce);
8482 }
8483
8484 /*
8485 * If we created an IPMP ARP entry, mark that we've notified ARP.
8486 */
8487 if (entp != NULL)
8488 ipmp_illgrp_mark_arpent(ill->ill_grp, entp);
8489
8490 return (iocp->ioc_error);
8491 }
8492
8493 /*
8494 * Parse an [x]arpreq structure coming down SIOC[GSD][X]ARP ioctls, identify
8495 * the associated sin and refhold and return the associated ipif via `ci'.
8496 */
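/*
 * A minimal userland sketch of the SIOCGXARP form parsed below; the
 * socket descriptor `s' and address are illustrative:
 *
 *	struct xarpreq xar;
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&xar.xarp_pa;
 *
 *	bzero(&xar, sizeof (xar));
 *	sin->sin_family = AF_INET;
 *	sin->sin_addr.s_addr = inet_addr("192.168.10.1");
 *	xar.xarp_ha.sdl_family = AF_LINK;	(sdl_nlen == 0 selects
 *						 lookup by IP address)
 *	if (ioctl(s, SIOCGXARP, &xar) == 0)
 *		xar.xarp_ha holds the link-layer address
 */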
8497 int
8498 ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
8499 cmd_info_t *ci)
8500 {
8501 mblk_t *mp1;
8502 sin_t *sin;
8503 conn_t *connp;
8504 ipif_t *ipif;
8505 ire_t *ire = NULL;
8506 ill_t *ill = NULL;
8507 boolean_t exists;
8508 ip_stack_t *ipst;
8509 struct arpreq *ar;
8510 struct xarpreq *xar;
8511 struct sockaddr_dl *sdl;
8512
8513 /* ioctl comes down on a conn */
8514 ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
8515 connp = Q_TO_CONN(q);
8516 if (connp->conn_family == AF_INET6)
8517 return (ENXIO);
8518
8519 ipst = connp->conn_netstack->netstack_ip;
8520
8521 /* Verified in ip_wput_nondata */
8522 mp1 = mp->b_cont->b_cont;
8523
8524 if (ipip->ipi_cmd_type == XARP_CMD) {
8525 ASSERT(MBLKL(mp1) >= sizeof (struct xarpreq));
8526 xar = (struct xarpreq *)mp1->b_rptr;
8527 sin = (sin_t *)&xar->xarp_pa;
8528 sdl = &xar->xarp_ha;
8529
8530 if (sdl->sdl_family != AF_LINK || sin->sin_family != AF_INET)
8531 return (ENXIO);
8532 if (sdl->sdl_nlen >= LIFNAMSIZ)
8533 return (EINVAL);
8534 } else {
8535 ASSERT(ipip->ipi_cmd_type == ARP_CMD);
8536 ASSERT(MBLKL(mp1) >= sizeof (struct arpreq));
8537 ar = (struct arpreq *)mp1->b_rptr;
8538 sin = (sin_t *)&ar->arp_pa;
8539 }
8540
8541 if (ipip->ipi_cmd_type == XARP_CMD && sdl->sdl_nlen != 0) {
8542 ipif = ipif_lookup_on_name(sdl->sdl_data, sdl->sdl_nlen,
8543 B_FALSE, &exists, B_FALSE, ALL_ZONES, ipst);
8544 if (ipif == NULL)
8545 return (ENXIO);
8546 if (ipif->ipif_id != 0) {
8547 ipif_refrele(ipif);
8548 return (ENXIO);
8549 }
8550 } else {
8551 /*
8552 * Either an SIOC[DGS]ARP or an SIOC[DGS]XARP with an sdl_nlen
8553 * of 0: use the IP address to find the ipif. If the IP
8554 * address is an IPMP test address, ire_ftable_lookup() will
8555 * find the wrong ill, so we first do an ipif_lookup_addr().
8556 */
8557 ipif = ipif_lookup_addr(sin->sin_addr.s_addr, NULL, ALL_ZONES,
8558 ipst);
8559 if (ipif == NULL) {
8560 ire = ire_ftable_lookup_v4(sin->sin_addr.s_addr,
8561 0, 0, IRE_IF_RESOLVER, NULL, ALL_ZONES,
8562 NULL, MATCH_IRE_TYPE, 0, ipst, NULL);
8563 if (ire == NULL || ((ill = ire->ire_ill) == NULL)) {
8564 if (ire != NULL)
8565 ire_refrele(ire);
8566 return (ENXIO);
8567 }
8568 ASSERT(ire != NULL && ill != NULL);
8569 ipif = ill->ill_ipif;
8570 ipif_refhold(ipif);
8571 ire_refrele(ire);
8572 }
8573 }
8574
8575 if (ipif->ipif_ill->ill_net_type != IRE_IF_RESOLVER) {
8576 ipif_refrele(ipif);
8577 return (ENXIO);
8578 }
8579
8580 ci->ci_sin = sin;
8581 ci->ci_ipif = ipif;
8582 return (0);
8583 }
8584
8585 /*
8586 * Link or unlink the illgrp on IPMP meta-interface `ill' depending on the
8587 * value of `ioccmd'. While an illgrp is linked to an ipmp_grp_t, it is
8588 * accessible from that ipmp_grp_t, which means SIOCSLIFGROUPNAME can look it
8589 * up and thus an ill can join that illgrp.
8590 *
8591 * We use I_PLINK/I_PUNLINK to do the link/unlink operations rather than
8592 * open()/close() primarily because close() is not allowed to fail or block
8593 * forever. On the other hand, I_PUNLINK *can* fail, and there's no reason
8594 * why anyone should ever need to I_PUNLINK an in-use IPMP stream. To ensure
 * symmetric behavior (e.g., doing an I_PLINK after an I_PUNLINK undoes the
8596 * I_PUNLINK) we defer linking to I_PLINK. Separately, we also fail attempts
8597 * to I_LINK since I_UNLINK is optional and we'd end up in an inconsistent
8598 * state if I_UNLINK didn't occur.
8599 *
8600 * Note that for each plumb/unplumb operation, we may end up here more than
8601 * once because of the way ifconfig works. However, it's OK to link the same
8602 * illgrp more than once, or unlink an illgrp that's already unlinked.
8603 */
8604 static int
8605 ip_sioctl_plink_ipmp(ill_t *ill, int ioccmd)
8606 {
8607 int err;
8608 ip_stack_t *ipst = ill->ill_ipst;
8609
8610 ASSERT(IS_IPMP(ill));
8611 ASSERT(IAM_WRITER_ILL(ill));
8612
8613 switch (ioccmd) {
8614 case I_LINK:
8615 return (ENOTSUP);
8616
8617 case I_PLINK:
8618 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
8619 ipmp_illgrp_link_grp(ill->ill_grp, ill->ill_phyint->phyint_grp);
8620 rw_exit(&ipst->ips_ipmp_lock);
8621 break;
8622
8623 case I_PUNLINK:
8624 /*
8625 * Require all UP ipifs be brought down prior to unlinking the
8626 * illgrp so any associated IREs (and other state) is torched.
8627 */
8628 if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0)
8629 return (EBUSY);
8630
8631 /*
8632 * NOTE: We hold ipmp_lock across the unlink to prevent a race
8633 * with an SIOCSLIFGROUPNAME request from an ill trying to
8634 * join this group. Specifically: ills trying to join grab
8635 * ipmp_lock and bump a "pending join" counter checked by
8636 * ipmp_illgrp_unlink_grp(). During the unlink no new pending
8637 * joins can occur (since we have ipmp_lock). Once we drop
8638 * ipmp_lock, subsequent SIOCSLIFGROUPNAME requests will not
8639 * find the illgrp (since we unlinked it) and will return
8640 * EAFNOSUPPORT. This will then take them back through the
8641 * IPMP meta-interface plumbing logic in ifconfig, and thus
8642 * back through I_PLINK above.
8643 */
8644 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
8645 err = ipmp_illgrp_unlink_grp(ill->ill_grp);
8646 rw_exit(&ipst->ips_ipmp_lock);
8647 return (err);
8648 default:
8649 break;
8650 }
8651 return (0);
8652 }
8653
8654 /*
8655 * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also
8656 * atomically set/clear the muxids. Also complete the ioctl by acking or
8657 * naking it. Note that the code is structured such that the link type,
8658 * whether it's persistent or not, is treated equally. ifconfig(1M) and
8659 * its clones use the persistent link, while pppd(1M) and perhaps many
 * other daemons may use a non-persistent link. When combined with some
8661 * ill_t states, linking and unlinking lower streams may be used as
8662 * indicators of dynamic re-plumbing events [see PSARC/1999/348].
8663 */
8664 /* ARGSUSED */
8665 void
8666 ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
8667 {
8668 mblk_t *mp1;
8669 struct linkblk *li;
8670 int ioccmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
8671 int err = 0;
8672
8673 ASSERT(ioccmd == I_PLINK || ioccmd == I_PUNLINK ||
8674 ioccmd == I_LINK || ioccmd == I_UNLINK);
8675
8676 mp1 = mp->b_cont; /* This is the linkblk info */
8677 li = (struct linkblk *)mp1->b_rptr;
8678
8679 err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li);
8680 if (err == EINPROGRESS)
8681 return;
8682 if (err == 0)
8683 miocack(q, mp, 0, 0);
8684 else
8685 miocnak(q, mp, 0, err);
8686
8687 /* Conn was refheld in ip_sioctl_copyin_setup */
8688 if (CONN_Q(q)) {
8689 CONN_DEC_IOCTLREF(Q_TO_CONN(q));
8690 CONN_OPER_PENDING_DONE(Q_TO_CONN(q));
8691 }
8692 }
8693
8694 /*
8695 * Process I_{P}LINK and I_{P}UNLINK requests named by `ioccmd' and pointed to
8696 * by `mp' and `li' for the IP module stream (if li->q_bot is in fact an IP
8697 * module stream).
8698 * Returns zero on success, EINPROGRESS if the operation is still pending, or
8699 * an error code on failure.
8700 */
8701 static int
8702 ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd,
8703 struct linkblk *li)
8704 {
8705 int err = 0;
8706 ill_t *ill;
8707 queue_t *ipwq, *dwq;
8708 const char *name;
8709 struct qinit *qinfo;
8710 boolean_t islink = (ioccmd == I_PLINK || ioccmd == I_LINK);
8711 boolean_t entered_ipsq = B_FALSE;
8712 boolean_t is_ip = B_FALSE;
8713 arl_t *arl;
8714
8715 /*
8716 * Walk the lower stream to verify it's the IP module stream.
8717 * The IP module is identified by its name, wput function,
8718 * and non-NULL q_next. STREAMS ensures that the lower stream
8719 * (li->l_qbot) will not vanish until this ioctl completes.
8720 */
8721 for (ipwq = li->l_qbot; ipwq != NULL; ipwq = ipwq->q_next) {
8722 qinfo = ipwq->q_qinfo;
8723 name = qinfo->qi_minfo->mi_idname;
8724 if (name != NULL && strcmp(name, ip_mod_info.mi_idname) == 0 &&
8725 qinfo->qi_putp != ip_lwput && ipwq->q_next != NULL) {
8726 is_ip = B_TRUE;
8727 break;
8728 }
8729 if (name != NULL && strcmp(name, arp_mod_info.mi_idname) == 0 &&
8730 qinfo->qi_putp != ip_lwput && ipwq->q_next != NULL) {
8731 break;
8732 }
8733 }
8734
8735 /*
8736 * If this isn't an IP module stream, bail.
8737 */
8738 if (ipwq == NULL)
8739 return (0);
8740
8741 if (!is_ip) {
8742 arl = (arl_t *)ipwq->q_ptr;
8743 ill = arl_to_ill(arl);
8744 if (ill == NULL)
8745 return (0);
8746 } else {
8747 ill = ipwq->q_ptr;
8748 }
8749 ASSERT(ill != NULL);
8750
8751 if (ipsq == NULL) {
8752 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink,
8753 NEW_OP, B_FALSE);
8754 if (ipsq == NULL) {
8755 if (!is_ip)
8756 ill_refrele(ill);
8757 return (EINPROGRESS);
8758 }
8759 entered_ipsq = B_TRUE;
8760 }
8761 ASSERT(IAM_WRITER_ILL(ill));
8762 mutex_enter(&ill->ill_lock);
8763 if (!is_ip) {
8764 if (islink && ill->ill_muxid == 0) {
8765 /*
8766 * Plumbing has to be done with IP plumbed first, arp
8767 * second, but here we have arp being plumbed first.
8768 */
8769 mutex_exit(&ill->ill_lock);
8770 if (entered_ipsq)
8771 ipsq_exit(ipsq);
8772 ill_refrele(ill);
8773 return (EINVAL);
8774 }
8775 }
8776 mutex_exit(&ill->ill_lock);
8777 if (!is_ip) {
8778 arl->arl_muxid = islink ? li->l_index : 0;
8779 ill_refrele(ill);
8780 goto done;
8781 }
8782
8783 if (IS_IPMP(ill) && (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0)
8784 goto done;
8785
8786 /*
8787 * As part of I_{P}LINKing, stash the number of downstream modules and
8788 * the read queue of the module immediately below IP in the ill.
8789 * These are used during the capability negotiation below.
8790 */
8791 ill->ill_lmod_rq = NULL;
8792 ill->ill_lmod_cnt = 0;
8793 if (islink && ((dwq = ipwq->q_next) != NULL)) {
8794 ill->ill_lmod_rq = RD(dwq);
8795 for (; dwq != NULL; dwq = dwq->q_next)
8796 ill->ill_lmod_cnt++;
8797 }
8798
8799 ill->ill_muxid = islink ? li->l_index : 0;
8800
8801 /*
8802 * Mark the ipsq busy until the capability operations initiated below
8803 * complete. The PLINK/UNLINK ioctl itself completes when our caller
8804 * returns, but the capability operation may complete asynchronously
8805 * much later.
8806 */
8807 ipsq_current_start(ipsq, ill->ill_ipif, ioccmd);
8808 /*
8809 * If there's at least one up ipif on this ill, then we're bound to
8810 * the underlying driver via DLPI. In that case, renegotiate
8811 * capabilities to account for any possible change in modules
8812 * interposed between IP and the driver.
8813 */
8814 if (ill->ill_ipif_up_count > 0) {
8815 if (islink)
8816 ill_capability_probe(ill);
8817 else
8818 ill_capability_reset(ill, B_FALSE);
8819 }
8820 ipsq_current_finish(ipsq);
8821 done:
8822 if (entered_ipsq)
8823 ipsq_exit(ipsq);
8824
8825 return (err);
8826 }
8827
8828 /*
8829 * Search the ioctl command in the ioctl tables and return a pointer
8830 * to the ioctl command information. The ioctl command tables are
8831 * static and fully populated at compile time.
8832 */
8833 ip_ioctl_cmd_t *
8834 ip_sioctl_lookup(int ioc_cmd)
8835 {
8836 int index;
8837 ip_ioctl_cmd_t *ipip;
8838 ip_ioctl_cmd_t *ipip_end;
8839
8840 if (ioc_cmd == IPI_DONTCARE)
8841 return (NULL);
8842
8843 /*
8844 * Do a 2 step search. First search the indexed table
8845 * based on the least significant byte of the ioctl cmd.
8846 * If we don't find a match, then search the misc table
8847 * serially.
8848 */
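/*
 * For example, a command whose low byte is, say, 0x14 is first
 * checked against ip_ndx_ioctl_table[0x14]; that entry is used
 * only if its ipi_cmd matches the full command word. Otherwise
 * the misc table is scanned serially.
 */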
8849 index = ioc_cmd & 0xFF;
8850 if (index < ip_ndx_ioctl_count) {
8851 ipip = &ip_ndx_ioctl_table[index];
8852 if (ipip->ipi_cmd == ioc_cmd) {
8853 /* Found a match in the ndx table */
8854 return (ipip);
8855 }
8856 }
8857
8858 /* Search the misc table */
8859 ipip_end = &ip_misc_ioctl_table[ip_misc_ioctl_count];
8860 for (ipip = ip_misc_ioctl_table; ipip < ipip_end; ipip++) {
8861 if (ipip->ipi_cmd == ioc_cmd)
8862 /* Found a match in the misc table */
8863 return (ipip);
8864 }
8865
8866 return (NULL);
8867 }
8868
8869 /*
 * Helper function for ip_sioctl_getsetprop(); it does some sanity checks
8871 */
8872 static boolean_t
8873 getset_ioctl_checks(mblk_t *mp)
8874 {
8875 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
8876 mblk_t *mp1 = mp->b_cont;
8877 mod_ioc_prop_t *pioc;
8878 uint_t flags;
8879 uint_t pioc_size;
8880
8881 /* do sanity checks on various arguments */
8882 if (mp1 == NULL || iocp->ioc_count == 0 ||
8883 iocp->ioc_count == TRANSPARENT) {
8884 return (B_FALSE);
8885 }
8886 if (msgdsize(mp1) < iocp->ioc_count) {
8887 if (!pullupmsg(mp1, iocp->ioc_count))
8888 return (B_FALSE);
8889 }
8890
8891 pioc = (mod_ioc_prop_t *)mp1->b_rptr;
8892
8893 /* sanity checks on mpr_valsize */
8894 pioc_size = sizeof (mod_ioc_prop_t);
8895 if (pioc->mpr_valsize != 0)
8896 pioc_size += pioc->mpr_valsize - 1;
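/*
 * The arithmetic above reflects mpr_val being a one-byte
 * placeholder array at the tail of mod_ioc_prop_t: the full
 * message is the fixed structure plus (mpr_valsize - 1)
 * additional value bytes.
 */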
8897
8898 if (iocp->ioc_count != pioc_size)
8899 return (B_FALSE);
8900
8901 flags = pioc->mpr_flags;
8902 if (iocp->ioc_cmd == SIOCSETPROP) {
/*
 * One can either reset the value to its default, change
 * the current value, or append/remove a value from a
 * multi-valued property.
 */
8908 if ((flags & MOD_PROP_DEFAULT) != MOD_PROP_DEFAULT &&
8909 flags != MOD_PROP_ACTIVE &&
8910 flags != (MOD_PROP_ACTIVE|MOD_PROP_APPEND) &&
8911 flags != (MOD_PROP_ACTIVE|MOD_PROP_REMOVE))
8912 return (B_FALSE);
8913 } else {
8914 ASSERT(iocp->ioc_cmd == SIOCGETPROP);
8915
8916 /*
8917 * One can retrieve only one kind of property information
8918 * at a time.
8919 */
8920 if ((flags & MOD_PROP_ACTIVE) != MOD_PROP_ACTIVE &&
8921 (flags & MOD_PROP_DEFAULT) != MOD_PROP_DEFAULT &&
8922 (flags & MOD_PROP_POSSIBLE) != MOD_PROP_POSSIBLE &&
8923 (flags & MOD_PROP_PERM) != MOD_PROP_PERM)
8924 return (B_FALSE);
8925 }
8926
8927 return (B_TRUE);
8928 }
8929
8930 /*
 * Process the SIOC{SET|GET}PROP ioctls.
8932 */
8933 /* ARGSUSED */
8934 static void
8935 ip_sioctl_getsetprop(queue_t *q, mblk_t *mp)
8936 {
8937 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
8938 mblk_t *mp1 = mp->b_cont;
8939 mod_ioc_prop_t *pioc;
8940 mod_prop_info_t *ptbl = NULL, *pinfo = NULL;
8941 ip_stack_t *ipst;
8942 netstack_t *stack;
8943 cred_t *cr;
8944 boolean_t set;
8945 int err;
8946
8947 ASSERT(q->q_next == NULL);
8948 ASSERT(CONN_Q(q));
8949
8950 if (!getset_ioctl_checks(mp)) {
8951 miocnak(q, mp, 0, EINVAL);
8952 return;
8953 }
8954 ipst = CONNQ_TO_IPST(q);
8955 stack = ipst->ips_netstack;
8956 pioc = (mod_ioc_prop_t *)mp1->b_rptr;
8957
8958 switch (pioc->mpr_proto) {
8959 case MOD_PROTO_IP:
8960 case MOD_PROTO_IPV4:
8961 case MOD_PROTO_IPV6:
8962 ptbl = ipst->ips_propinfo_tbl;
8963 break;
8964 case MOD_PROTO_RAWIP:
8965 ptbl = stack->netstack_icmp->is_propinfo_tbl;
8966 break;
8967 case MOD_PROTO_TCP:
8968 ptbl = stack->netstack_tcp->tcps_propinfo_tbl;
8969 break;
8970 case MOD_PROTO_UDP:
8971 ptbl = stack->netstack_udp->us_propinfo_tbl;
8972 break;
8973 case MOD_PROTO_SCTP:
8974 ptbl = stack->netstack_sctp->sctps_propinfo_tbl;
8975 break;
8976 default:
8977 miocnak(q, mp, 0, EINVAL);
8978 return;
8979 }
8980
8981 pinfo = mod_prop_lookup(ptbl, pioc->mpr_name, pioc->mpr_proto);
8982 if (pinfo == NULL) {
8983 miocnak(q, mp, 0, ENOENT);
8984 return;
8985 }
8986
8987 set = (iocp->ioc_cmd == SIOCSETPROP) ? B_TRUE : B_FALSE;
8988 if (set && pinfo->mpi_setf != NULL) {
8989 cr = msg_getcred(mp, NULL);
8990 if (cr == NULL)
8991 cr = iocp->ioc_cr;
8992 err = pinfo->mpi_setf(stack, cr, pinfo, pioc->mpr_ifname,
8993 pioc->mpr_val, pioc->mpr_flags);
8994 } else if (!set && pinfo->mpi_getf != NULL) {
8995 err = pinfo->mpi_getf(stack, pinfo, pioc->mpr_ifname,
8996 pioc->mpr_val, pioc->mpr_valsize, pioc->mpr_flags);
8997 } else {
8998 err = EPERM;
8999 }
9000
9001 if (err != 0) {
9002 miocnak(q, mp, 0, err);
9003 } else {
9004 if (set)
9005 miocack(q, mp, 0, 0);
9006 else /* For get, we need to return back the data */
9007 miocack(q, mp, iocp->ioc_count, 0);
9008 }
9009 }
9010
9011 /*
 * Process the legacy ND_GET and ND_SET ioctls, just for {ip|ip6}_forwarding,
 * as several routing daemons have unfortunately used these 'unpublished'
 * but well-known ioctls.
9015 */
9016 /* ARGSUSED */
9017 static void
9018 ip_process_legacy_nddprop(queue_t *q, mblk_t *mp)
9019 {
9020 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
9021 mblk_t *mp1 = mp->b_cont;
9022 char *pname, *pval, *buf;
9023 uint_t bufsize, proto;
9024 mod_prop_info_t *pinfo = NULL;
9025 ip_stack_t *ipst;
9026 int err = 0;
9027
9028 ASSERT(CONN_Q(q));
9029 ipst = CONNQ_TO_IPST(q);
9030
9031 if (iocp->ioc_count == 0 || mp1 == NULL) {
9032 miocnak(q, mp, 0, EINVAL);
9033 return;
9034 }
9035
9036 mp1->b_datap->db_lim[-1] = '\0'; /* Force null termination */
9037 pval = buf = pname = (char *)mp1->b_rptr;
9038 bufsize = MBLKL(mp1);
9039
9040 if (strcmp(pname, "ip_forwarding") == 0) {
9041 pname = "forwarding";
9042 proto = MOD_PROTO_IPV4;
9043 } else if (strcmp(pname, "ip6_forwarding") == 0) {
9044 pname = "forwarding";
9045 proto = MOD_PROTO_IPV6;
9046 } else {
9047 miocnak(q, mp, 0, EINVAL);
9048 return;
9049 }
9050
9051 pinfo = mod_prop_lookup(ipst->ips_propinfo_tbl, pname, proto);
9052
9053 switch (iocp->ioc_cmd) {
9054 case ND_GET:
9055 if ((err = pinfo->mpi_getf(ipst->ips_netstack, pinfo, NULL, buf,
9056 bufsize, 0)) == 0) {
9057 miocack(q, mp, iocp->ioc_count, 0);
9058 return;
9059 }
9060 break;
9061 case ND_SET:
9062 /*
 * The buffer will have the property name and value in the
 * following format:
 * <property name>'\0'<property value>'\0'; extract them.
9066 */
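/*
 * For example, setting IPv4 forwarding arrives as the
 * bytes "ip_forwarding\0" immediately followed by "1\0".
 */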
9067 while (*pval++)
9068 noop;
9069
9070 if (!*pval || pval >= (char *)mp1->b_wptr) {
9071 err = EINVAL;
9072 } else if ((err = pinfo->mpi_setf(ipst->ips_netstack, NULL,
9073 pinfo, NULL, pval, 0)) == 0) {
9074 miocack(q, mp, 0, 0);
9075 return;
9076 }
9077 break;
9078 default:
9079 err = EINVAL;
9080 break;
9081 }
9082 miocnak(q, mp, 0, err);
9083 }
9084
9085 /*
9086 * Wrapper function for resuming deferred ioctl processing
9087 * Used for SIOCGDSTINFO, SIOCGIP6ADDRPOLICY, SIOCGMSFILTER,
9088 * SIOCSMSFILTER, SIOCGIPMSFILTER, and SIOCSIPMSFILTER currently.
9089 */
9090 /* ARGSUSED */
9091 void
9092 ip_sioctl_copyin_resume(ipsq_t *dummy_ipsq, queue_t *q, mblk_t *mp,
9093 void *dummy_arg)
9094 {
9095 ip_sioctl_copyin_setup(q, mp);
9096 }
9097
9098 /*
9099 * ip_sioctl_copyin_setup is called by ip_wput_nondata with any M_IOCTL message
9100 * that arrives. Most of the IOCTLs are "socket" IOCTLs which we handle
9101 * in either I_STR or TRANSPARENT form, using the mi_copy facility.
9102 * We establish here the size of the block to be copied in. mi_copyin
 * arranges for this to happen, and processing continues in ip_wput_nondata
 * with an M_IOCDATA message.
9105 */
9106 void
9107 ip_sioctl_copyin_setup(queue_t *q, mblk_t *mp)
9108 {
9109 int copyin_size;
9110 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
9111 ip_ioctl_cmd_t *ipip;
9112 cred_t *cr;
9113 ip_stack_t *ipst;
9114
9115 if (CONN_Q(q))
9116 ipst = CONNQ_TO_IPST(q);
9117 else
9118 ipst = ILLQ_TO_IPST(q);
9119
9120 ipip = ip_sioctl_lookup(iocp->ioc_cmd);
9121 if (ipip == NULL) {
9122 /*
9123 * The ioctl is not one we understand or own.
9124 * Pass it along to be processed down stream,
9125 * if this is a module instance of IP, else nak
9126 * the ioctl.
9127 */
9128 if (q->q_next == NULL) {
9129 goto nak;
9130 } else {
9131 putnext(q, mp);
9132 return;
9133 }
9134 }
9135
9136 /*
9137 * If this is deferred, then we will do all the checks when we
9138 * come back.
9139 */
9140 if ((iocp->ioc_cmd == SIOCGDSTINFO ||
9141 iocp->ioc_cmd == SIOCGIP6ADDRPOLICY) && !ip6_asp_can_lookup(ipst)) {
9142 ip6_asp_pending_op(q, mp, ip_sioctl_copyin_resume);
9143 return;
9144 }
9145
9146 /*
9147 * Only allow a very small subset of IP ioctls on this stream if
9148 * IP is a module and not a driver. Allowing ioctls to be processed
9149 * in this case may cause assert failures or data corruption.
 * Typically G[L]IFFLAGS and SLIFNAME/IF_UNITSEL are among the few
 * ioctls allowed on an IP module stream, after which this stream
9152 * normally becomes a multiplexor (at which time the stream head
9153 * will fail all ioctls).
9154 */
9155 if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) {
9156 goto nak;
9157 }
9158
9159 /* Make sure we have ioctl data to process. */
9160 if (mp->b_cont == NULL && !(ipip->ipi_flags & IPI_NULL_BCONT))
9161 goto nak;
9162
9163 /*
9164 * Prefer dblk credential over ioctl credential; some synthesized
9165 * ioctls have kcred set because there's no way to crhold()
 * a credential in some contexts. (ioc_cr is not crfree()d by
9167 * the framework; the caller of ioctl needs to hold the reference
9168 * for the duration of the call).
9169 */
9170 cr = msg_getcred(mp, NULL);
9171 if (cr == NULL)
9172 cr = iocp->ioc_cr;
9173
9174 /* Make sure normal users don't send down privileged ioctls */
9175 if ((ipip->ipi_flags & IPI_PRIV) &&
9176 (cr != NULL) && secpolicy_ip_config(cr, B_TRUE) != 0) {
9177 /* We checked the privilege earlier but log it here */
9178 miocnak(q, mp, 0, secpolicy_ip_config(cr, B_FALSE));
9179 return;
9180 }
9181
9182 /*
9183 * The ioctl command tables can only encode fixed length
9184 * ioctl data. If the length is variable, the table will
9185 * encode the length as zero. Such special cases are handled
9186 * below in the switch.
9187 */
9188 if (ipip->ipi_copyin_size != 0) {
9189 mi_copyin(q, mp, NULL, ipip->ipi_copyin_size);
9190 return;
9191 }
9192
9193 switch (iocp->ioc_cmd) {
9194 case O_SIOCGIFCONF:
9195 case SIOCGIFCONF:
9196 /*
9197 * This IOCTL is hilarious. See comments in
9198 * ip_sioctl_get_ifconf for the story.
9199 */
9200 if (iocp->ioc_count == TRANSPARENT)
9201 copyin_size = SIZEOF_STRUCT(ifconf,
9202 iocp->ioc_flag);
9203 else
9204 copyin_size = iocp->ioc_count;
9205 mi_copyin(q, mp, NULL, copyin_size);
9206 return;
9207
9208 case O_SIOCGLIFCONF:
9209 case SIOCGLIFCONF:
9210 copyin_size = SIZEOF_STRUCT(lifconf, iocp->ioc_flag);
9211 mi_copyin(q, mp, NULL, copyin_size);
9212 return;
9213
9214 case SIOCGLIFSRCOF:
9215 copyin_size = SIZEOF_STRUCT(lifsrcof, iocp->ioc_flag);
9216 mi_copyin(q, mp, NULL, copyin_size);
9217 return;
9218
9219 case SIOCGIP6ADDRPOLICY:
9220 ip_sioctl_ip6addrpolicy(q, mp);
9221 ip6_asp_table_refrele(ipst);
9222 return;
9223
9224 case SIOCSIP6ADDRPOLICY:
9225 ip_sioctl_ip6addrpolicy(q, mp);
9226 return;
9227
9228 case SIOCGDSTINFO:
9229 ip_sioctl_dstinfo(q, mp);
9230 ip6_asp_table_refrele(ipst);
9231 return;
9232
9233 case ND_SET:
9234 case ND_GET:
9235 ip_process_legacy_nddprop(q, mp);
9236 return;
9237
9238 case SIOCSETPROP:
9239 case SIOCGETPROP:
9240 ip_sioctl_getsetprop(q, mp);
9241 return;
9242
9243 case I_PLINK:
9244 case I_PUNLINK:
9245 case I_LINK:
9246 case I_UNLINK:
9247 /*
9248 * We treat non-persistent link similarly as the persistent
9249 * link case, in terms of plumbing/unplumbing, as well as
9250 * dynamic re-plumbing events indicator. See comments
9251 * in ip_sioctl_plink() for more.
9252 *
9253 * Request can be enqueued in the 'ipsq' while waiting
9254 * to become exclusive. So bump up the conn ref.
9255 */
9256 if (CONN_Q(q)) {
9257 CONN_INC_REF(Q_TO_CONN(q));
9258 CONN_INC_IOCTLREF(Q_TO_CONN(q))
9259 }
9260 ip_sioctl_plink(NULL, q, mp, NULL);
9261 return;
9262
9263 case IP_IOCTL:
9264 ip_wput_ioctl(q, mp);
9265 return;
9266
9267 case SIOCILB:
9268 /* The ioctl length varies depending on the ILB command. */
9269 copyin_size = iocp->ioc_count;
9270 if (copyin_size < sizeof (ilb_cmd_t))
9271 goto nak;
9272 mi_copyin(q, mp, NULL, copyin_size);
9273 return;
9274
9275 default:
9276 cmn_err(CE_WARN, "Unknown ioctl %d/0x%x slipped through.",
9277 iocp->ioc_cmd, iocp->ioc_cmd);
9278 /* FALLTHRU */
9279 }
9280 nak:
9281 if (mp->b_cont != NULL) {
9282 freemsg(mp->b_cont);
9283 mp->b_cont = NULL;
9284 }
9285 iocp->ioc_error = EINVAL;
9286 mp->b_datap->db_type = M_IOCNAK;
9287 iocp->ioc_count = 0;
9288 qreply(q, mp);
9289 }
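/*
 * Illustrative sketch (not part of the kernel path above): userland
 * reaches ip_sioctl_copyin_setup either via a plain ioctl() on a
 * socket, which arrives here in TRANSPARENT form, or via an I_STR
 * STREAMS ioctl with the data already attached. The interface name
 * below is hypothetical, and "ip_fd" is assumed to be a descriptor
 * for a stream with IP on it.
 *
 *	#include <sys/types.h>
 *	#include <sys/socket.h>
 *	#include <sys/sockio.h>
 *	#include <net/if.h>
 *	#include <stropts.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	void
 *	get_flags_two_ways(int s, int ip_fd)
 *	{
 *		struct lifreq lifr;
 *		struct strioctl sioc;
 *
 *		(void) memset(&lifr, 0, sizeof (lifr));
 *		(void) strlcpy(lifr.lifr_name, "net0",
 *		    sizeof (lifr.lifr_name));
 *
 *		The plain socket ioctl arrives as TRANSPARENT:
 *		(void) ioctl(s, SIOCGLIFFLAGS, &lifr);
 *
 *		The same request in I_STR form carries its data in ic_dp:
 *		sioc.ic_cmd = SIOCGLIFFLAGS;
 *		sioc.ic_timout = 0;
 *		sioc.ic_len = sizeof (lifr);
 *		sioc.ic_dp = (char *)&lifr;
 *		(void) ioctl(ip_fd, I_STR, &sioc);
 *	}
 */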
9290
9291 static void
9292 ip_sioctl_garp_reply(mblk_t *mp, ill_t *ill, void *hwaddr, int flags)
9293 {
9294 struct arpreq *ar;
9295 struct xarpreq *xar;
9296 mblk_t *tmp;
9297 struct iocblk *iocp;
	boolean_t x_arp_ioctl = B_FALSE;
9299 int *flagsp;
9300 char *storage = NULL;
9301
9302 ASSERT(ill != NULL);
9303
9304 iocp = (struct iocblk *)mp->b_rptr;
9305 ASSERT(iocp->ioc_cmd == SIOCGXARP || iocp->ioc_cmd == SIOCGARP);
9306
9307 tmp = (mp->b_cont)->b_cont; /* xarpreq/arpreq */
9308 if ((iocp->ioc_cmd == SIOCGXARP) ||
9309 (iocp->ioc_cmd == SIOCSXARP)) {
9310 x_arp_ioctl = B_TRUE;
9311 xar = (struct xarpreq *)tmp->b_rptr;
9312 flagsp = &xar->xarp_flags;
9313 storage = xar->xarp_ha.sdl_data;
9314 } else {
9315 ar = (struct arpreq *)tmp->b_rptr;
9316 flagsp = &ar->arp_flags;
9317 storage = ar->arp_ha.sa_data;
9318 }
9319
	/*
	 * For an SIOCGXARP request, fill in the sockaddr_dl via
	 * ill_xarp_info() and verify that the link-layer address and
	 * interface name together fit in sdl_data.
	 */
9323 if (x_arp_ioctl) {
9324 storage += ill_xarp_info(&xar->xarp_ha, ill);
9325 if ((ill->ill_phys_addr_length + ill->ill_name_length) >
9326 sizeof (xar->xarp_ha.sdl_data)) {
9327 iocp->ioc_error = EINVAL;
9328 return;
9329 }
9330 }
9331 *flagsp = ATF_INUSE;
9332 /*
	 * If /sbin/arp told us we are the authority using the "permanent"
	 * flag, or if this is one of my addresses, print "permanent"
	 * in the /sbin/arp output.
9336 */
9337 if ((flags & NCE_F_MYADDR) || (flags & NCE_F_AUTHORITY))
9338 *flagsp |= ATF_AUTHORITY;
9339 if (flags & NCE_F_NONUD)
9340 *flagsp |= ATF_PERM; /* not subject to aging */
9341 if (flags & NCE_F_PUBLISH)
9342 *flagsp |= ATF_PUBL;
9343 if (hwaddr != NULL) {
9344 *flagsp |= ATF_COM;
9345 bcopy((char *)hwaddr, storage, ill->ill_phys_addr_length);
9346 }
9347 }
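/*
 * Illustrative sketch of the consumer side of the reply filled in
 * above: a hypothetical userland caller issuing SIOCGARP on a socket
 * and decoding the ATF_* flags.
 *
 *	#include <sys/types.h>
 *	#include <sys/socket.h>
 *	#include <sys/sockio.h>
 *	#include <net/if_arp.h>
 *	#include <netinet/in.h>
 *	#include <string.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	void
 *	print_arp_entry(int s, in_addr_t addr)
 *	{
 *		struct arpreq ar;
 *		struct sockaddr_in *sin;
 *
 *		(void) memset(&ar, 0, sizeof (ar));
 *		sin = (struct sockaddr_in *)&ar.arp_pa;
 *		sin->sin_family = AF_INET;
 *		sin->sin_addr.s_addr = addr;
 *		if (ioctl(s, SIOCGARP, &ar) == 0) {
 *			if (ar.arp_flags & ATF_PERM)
 *				(void) printf("permanent\n");
 *			if (ar.arp_flags & ATF_PUBL)
 *				(void) printf("published\n");
 *		}
 *	}
 */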
9348
9349 /*
9350 * Create a new logical interface. If ipif_id is zero (i.e. not a logical
9351 * interface) create the next available logical interface for this
9352 * physical interface.
9353 * If ipif is NULL (i.e. the lookup didn't find one) attempt to create an
9354 * ipif with the specified name.
9355 *
9356 * If the address family is not AF_UNSPEC then set the address as well.
9357 *
 * If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout)
 * is completed when the DL_BIND_ACK arrives in ip_rput_dlpi_writer.
9360 *
9361 * Executed as a writer on the ill.
9362 * So no lock is needed to traverse the ipif chain, or examine the
9363 * phyint flags.
9364 */
9365 /* ARGSUSED */
9366 int
9367 ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
9368 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq)
9369 {
9370 mblk_t *mp1;
9371 struct lifreq *lifr;
9372 boolean_t isv6;
9373 boolean_t exists;
9374 char *name;
9375 char *endp;
9376 char *cp;
9377 int namelen;
9378 ipif_t *ipif;
9379 long id;
9380 ipsq_t *ipsq;
9381 ill_t *ill;
9382 sin_t *sin;
9383 int err = 0;
9384 boolean_t found_sep = B_FALSE;
9385 conn_t *connp;
9386 zoneid_t zoneid;
9387 ip_stack_t *ipst = CONNQ_TO_IPST(q);
9388
9389 ASSERT(q->q_next == NULL);
9390 ip1dbg(("ip_sioctl_addif\n"));
9391 /* Existence of mp1 has been checked in ip_wput_nondata */
9392 mp1 = mp->b_cont->b_cont;
9393 /*
9394 * Null terminate the string to protect against buffer
9395 * overrun. String was generated by user code and may not
9396 * be trusted.
9397 */
9398 lifr = (struct lifreq *)mp1->b_rptr;
9399 lifr->lifr_name[LIFNAMSIZ - 1] = '\0';
9400 name = lifr->lifr_name;
9401 ASSERT(CONN_Q(q));
9402 connp = Q_TO_CONN(q);
9403 isv6 = (connp->conn_family == AF_INET6);
9404 zoneid = connp->conn_zoneid;
9405 namelen = mi_strlen(name);
9406 if (namelen == 0)
9407 return (EINVAL);
9408
9409 exists = B_FALSE;
9410 if ((namelen + 1 == sizeof (ipif_loopback_name)) &&
9411 (mi_strcmp(name, ipif_loopback_name) == 0)) {
		/*
		 * Allow creating lo0 using SIOCLIFADDIF; there can't be
		 * any other writer thread for lo0, so the lookup (and
		 * possible creation) below is safe.
		 */
9417 ipif = ipif_lookup_on_name(lifr->lifr_name, namelen, B_TRUE,
9418 &exists, isv6, zoneid, ipst);
9419 /* Prevent any further action */
9420 if (ipif == NULL) {
9421 return (ENOBUFS);
9422 } else if (!exists) {
9423 /* We created the ipif now and as writer */
9424 ipif_refrele(ipif);
9425 return (0);
9426 } else {
9427 ill = ipif->ipif_ill;
9428 ill_refhold(ill);
9429 ipif_refrele(ipif);
9430 }
9431 } else {
9432 /* Look for a colon in the name. */
9433 endp = &name[namelen];
9434 for (cp = endp; --cp > name; ) {
9435 if (*cp == IPIF_SEPARATOR_CHAR) {
9436 found_sep = B_TRUE;
9437 /*
9438 * Reject any non-decimal aliases for plumbing
9439 * of logical interfaces. Aliases with leading
9440 * zeroes are also rejected as they introduce
9441 * ambiguity in the naming of the interfaces.
9442 * Comparing with "0" takes care of all such
9443 * cases.
9444 */
				if (strncmp("0", cp + 1, 1) == 0)
9446 return (EINVAL);
9447
9448 if (ddi_strtol(cp+1, &endp, 10, &id) != 0 ||
9449 id <= 0 || *endp != '\0') {
9450 return (EINVAL);
9451 }
9452 *cp = '\0';
9453 break;
9454 }
9455 }
9456 ill = ill_lookup_on_name(name, B_FALSE, isv6, NULL, ipst);
9457 if (found_sep)
9458 *cp = IPIF_SEPARATOR_CHAR;
9459 if (ill == NULL)
9460 return (ENXIO);
9461 }
9462
9463 ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_process_ioctl, NEW_OP,
9464 B_TRUE);
9465
	/*
	 * Release the refhold from the lookup, now that we are either
	 * exclusive or about to return.
	 */
9470 ill_refrele(ill);
9471
9472 if (ipsq == NULL)
9473 return (EINPROGRESS);
9474
9475 /* We are now exclusive on the IPSQ */
9476 ASSERT(IAM_WRITER_ILL(ill));
9477
9478 if (found_sep) {
9479 /* Now see if there is an IPIF with this unit number. */
9480 for (ipif = ill->ill_ipif; ipif != NULL;
9481 ipif = ipif->ipif_next) {
9482 if (ipif->ipif_id == id) {
9483 err = EEXIST;
9484 goto done;
9485 }
9486 }
9487 }
9488
9489 /*
9490 * We use IRE_LOCAL for lo0:1 etc. for "receive only" use
9491 * of lo0. Plumbing for lo0:0 happens in ipif_lookup_on_name()
9492 * instead.
9493 */
9494 if ((ipif = ipif_allocate(ill, found_sep ? id : -1, IRE_LOCAL,
9495 B_TRUE, B_TRUE, &err)) == NULL) {
9496 goto done;
9497 }
9498
9499 /* Return created name with ioctl */
9500 (void) sprintf(lifr->lifr_name, "%s%c%d", ill->ill_name,
9501 IPIF_SEPARATOR_CHAR, ipif->ipif_id);
9502 ip1dbg(("created %s\n", lifr->lifr_name));
9503
9504 /* Set address */
9505 sin = (sin_t *)&lifr->lifr_addr;
9506 if (sin->sin_family != AF_UNSPEC) {
9507 err = ip_sioctl_addr(ipif, sin, q, mp,
9508 &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr);
9509 }
9510
9511 done:
9512 ipsq_exit(ipsq);
9513 return (err);
9514 }
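/*
 * Illustrative sketch of how the addif ioctl above is driven from
 * userland (roughly what "ifconfig <if> addif <addr>" does; the
 * interface name is hypothetical). On success the kernel writes the
 * name of the newly created logical interface, e.g. "net0:1", back
 * into lifr_name.
 *
 *	#include <sys/socket.h>
 *	#include <sys/sockio.h>
 *	#include <net/if.h>
 *	#include <netinet/in.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int
 *	add_logical_if(int s, in_addr_t addr, char *name, size_t len)
 *	{
 *		struct lifreq lifr;
 *		struct sockaddr_in *sin;
 *
 *		(void) memset(&lifr, 0, sizeof (lifr));
 *		(void) strlcpy(lifr.lifr_name, "net0",
 *		    sizeof (lifr.lifr_name));
 *		sin = (struct sockaddr_in *)&lifr.lifr_addr;
 *		sin->sin_family = AF_INET;
 *		sin->sin_addr.s_addr = addr;
 *		if (ioctl(s, SIOCLIFADDIF, &lifr) == -1)
 *			return (-1);
 *		(void) strlcpy(name, lifr.lifr_name, len);
 *		return (0);
 *	}
 */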
9515
9516 /*
9517 * Remove an existing logical interface. If ipif_id is zero (i.e. not a logical
9518 * interface) delete it based on the IP address (on this physical interface).
9519 * Otherwise delete it based on the ipif_id.
9520 * Also, special handling to allow a removeif of lo0.
9521 */
9522 /* ARGSUSED */
9523 int
9524 ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9525 ip_ioctl_cmd_t *ipip, void *dummy_if_req)
9526 {
9527 conn_t *connp;
9528 ill_t *ill = ipif->ipif_ill;
9529 boolean_t success;
9530 ip_stack_t *ipst;
9531
9532 ipst = CONNQ_TO_IPST(q);
9533
9534 ASSERT(q->q_next == NULL);
9535 ip1dbg(("ip_sioctl_remove_if(%s:%u %p)\n",
9536 ill->ill_name, ipif->ipif_id, (void *)ipif));
9537 ASSERT(IAM_WRITER_IPIF(ipif));
9538
9539 connp = Q_TO_CONN(q);
9540 /*
9541 * Special case for unplumbing lo0 (the loopback physical interface).
9542 * If unplumbing lo0, the incoming address structure has been
9543 * initialized to all zeros. When unplumbing lo0, all its logical
9544 * interfaces must be removed too.
9545 *
9546 * Note that this interface may be called to remove a specific
9547 * loopback logical interface (eg, lo0:1). But in that case
9548 * ipif->ipif_id != 0 so that the code path for that case is the
9549 * same as any other interface (meaning it skips the code directly
9550 * below).
9551 */
9552 if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) {
9553 if (sin->sin_family == AF_UNSPEC &&
9554 (IN6_IS_ADDR_UNSPECIFIED(&((sin6_t *)sin)->sin6_addr))) {
9555 /*
9556 * Mark it condemned. No new ref. will be made to ill.
9557 */
9558 mutex_enter(&ill->ill_lock);
9559 ill->ill_state_flags |= ILL_CONDEMNED;
9560 for (ipif = ill->ill_ipif; ipif != NULL;
9561 ipif = ipif->ipif_next) {
9562 ipif->ipif_state_flags |= IPIF_CONDEMNED;
9563 }
9564 mutex_exit(&ill->ill_lock);
9565
9566 ipif = ill->ill_ipif;
9567 /* unplumb the loopback interface */
9568 ill_delete(ill);
9569 mutex_enter(&connp->conn_lock);
9570 mutex_enter(&ill->ill_lock);
9571
9572 /* Are any references to this ill active */
9573 if (ill_is_freeable(ill)) {
9574 mutex_exit(&ill->ill_lock);
9575 mutex_exit(&connp->conn_lock);
9576 ill_delete_tail(ill);
9577 mi_free(ill);
9578 return (0);
9579 }
9580 success = ipsq_pending_mp_add(connp, ipif,
9581 CONNP_TO_WQ(connp), mp, ILL_FREE);
9582 mutex_exit(&connp->conn_lock);
9583 mutex_exit(&ill->ill_lock);
9584 if (success)
9585 return (EINPROGRESS);
9586 else
9587 return (EINTR);
9588 }
9589 }
9590
9591 if (ipif->ipif_id == 0) {
9592 ipsq_t *ipsq;
9593
9594 /* Find based on address */
9595 if (ipif->ipif_isv6) {
9596 sin6_t *sin6;
9597
9598 if (sin->sin_family != AF_INET6)
9599 return (EAFNOSUPPORT);
9600
9601 sin6 = (sin6_t *)sin;
9602 /* We are a writer, so we should be able to lookup */
9603 ipif = ipif_lookup_addr_exact_v6(&sin6->sin6_addr, ill,
9604 ipst);
9605 } else {
9606 if (sin->sin_family != AF_INET)
9607 return (EAFNOSUPPORT);
9608
9609 /* We are a writer, so we should be able to lookup */
9610 ipif = ipif_lookup_addr_exact(sin->sin_addr.s_addr, ill,
9611 ipst);
9612 }
9613 if (ipif == NULL) {
9614 return (EADDRNOTAVAIL);
9615 }
9616
9617 /*
9618 * It is possible for a user to send an SIOCLIFREMOVEIF with
9619 * lifr_name of the physical interface but with an ip address
9620 * lifr_addr of a logical interface plumbed over it.
9621 * So update ipx_current_ipif now that ipif points to the
9622 * correct one.
9623 */
9624 ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq;
9625 ipsq->ipsq_xop->ipx_current_ipif = ipif;
9626
9627 /* This is a writer */
9628 ipif_refrele(ipif);
9629 }
9630
9631 /*
	 * Cannot delete instance zero, since it is tied to the ill.
9633 */
9634 if (ipif->ipif_id == 0)
9635 return (EBUSY);
9636
9637 mutex_enter(&ill->ill_lock);
9638 ipif->ipif_state_flags |= IPIF_CONDEMNED;
9639 mutex_exit(&ill->ill_lock);
9640
9641 ipif_free(ipif);
9642
9643 mutex_enter(&connp->conn_lock);
9644 mutex_enter(&ill->ill_lock);
9645
9646 /* Are any references to this ipif active */
9647 if (ipif_is_freeable(ipif)) {
9648 mutex_exit(&ill->ill_lock);
9649 mutex_exit(&connp->conn_lock);
9650 ipif_non_duplicate(ipif);
9651 (void) ipif_down_tail(ipif);
9652 ipif_free_tail(ipif); /* frees ipif */
9653 return (0);
9654 }
9655 success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), mp,
9656 IPIF_FREE);
9657 mutex_exit(&ill->ill_lock);
9658 mutex_exit(&connp->conn_lock);
9659 if (success)
9660 return (EINPROGRESS);
9661 else
9662 return (EINTR);
9663 }
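/*
 * Illustrative sketch of the two ways userland can drive the removeif
 * ioctl above: by logical interface name, or by passing the physical
 * name together with the address of the logical interface to remove.
 * Names and the address are hypothetical.
 *
 *	#include <sys/socket.h>
 *	#include <sys/sockio.h>
 *	#include <net/if.h>
 *	#include <netinet/in.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int
 *	remove_by_name(int s)
 *	{
 *		struct lifreq lifr;
 *
 *		(void) memset(&lifr, 0, sizeof (lifr));
 *		(void) strlcpy(lifr.lifr_name, "net0:1",
 *		    sizeof (lifr.lifr_name));
 *		return (ioctl(s, SIOCLIFREMOVEIF, &lifr));
 *	}
 *
 *	int
 *	remove_by_addr(int s, in_addr_t addr)
 *	{
 *		struct lifreq lifr;
 *		struct sockaddr_in *sin;
 *
 *		(void) memset(&lifr, 0, sizeof (lifr));
 *		(void) strlcpy(lifr.lifr_name, "net0",
 *		    sizeof (lifr.lifr_name));
 *		sin = (struct sockaddr_in *)&lifr.lifr_addr;
 *		sin->sin_family = AF_INET;
 *		sin->sin_addr.s_addr = addr;
 *		return (ioctl(s, SIOCLIFREMOVEIF, &lifr));
 *	}
 */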
9664
9665 /*
 * Restart the removeif ioctl. The refcnt has gone down to 0.
 * The ipif is already condemned, so it can't be found via lookups.
9668 */
9669 /* ARGSUSED */
9670 int
9671 ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q,
9672 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_if_req)
9673 {
9674 ill_t *ill = ipif->ipif_ill;
9675
9676 ASSERT(IAM_WRITER_IPIF(ipif));
9677 ASSERT(ipif->ipif_state_flags & IPIF_CONDEMNED);
9678
9679 ip1dbg(("ip_sioctl_removeif_restart(%s:%u %p)\n",
9680 ill->ill_name, ipif->ipif_id, (void *)ipif));
9681
9682 if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) {
9683 ASSERT(ill->ill_state_flags & ILL_CONDEMNED);
9684 ill_delete_tail(ill);
9685 mi_free(ill);
9686 return (0);
9687 }
9688
9689 ipif_non_duplicate(ipif);
9690 (void) ipif_down_tail(ipif);
9691 ipif_free_tail(ipif);
9692
9693 return (0);
9694 }
9695
9696 /*
9697 * Set the local interface address using the given prefix and ill_token.
9698 */
9699 /* ARGSUSED */
9700 int
9701 ip_sioctl_prefix(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9702 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq)
9703 {
9704 int err;
9705 in6_addr_t v6addr;
9706 sin6_t *sin6;
9707 ill_t *ill;
9708 int i;
9709
9710 ip1dbg(("ip_sioctl_prefix(%s:%u %p)\n",
9711 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
9712
9713 ASSERT(IAM_WRITER_IPIF(ipif));
9714
9715 if (!ipif->ipif_isv6)
9716 return (EINVAL);
9717
9718 if (sin->sin_family != AF_INET6)
9719 return (EAFNOSUPPORT);
9720
9721 sin6 = (sin6_t *)sin;
9722 v6addr = sin6->sin6_addr;
9723 ill = ipif->ipif_ill;
9724
9725 if (IN6_IS_ADDR_UNSPECIFIED(&v6addr) ||
9726 IN6_IS_ADDR_UNSPECIFIED(&ill->ill_token))
9727 return (EADDRNOTAVAIL);
9728
9729 for (i = 0; i < 4; i++)
9730 sin6->sin6_addr.s6_addr32[i] |= ill->ill_token.s6_addr32[i];
9731
9732 err = ip_sioctl_addr(ipif, sin, q, mp,
9733 &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], dummy_ifreq);
9734 return (err);
9735 }
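/*
 * Worked example of the prefix/token combination above (addresses are
 * illustrative): with ill_token ::8:9:a:b and an incoming prefix of
 * 2001:db8:a:b::, the per-word OR
 *
 *	sin6->sin6_addr.s6_addr32[i] |= ill->ill_token.s6_addr32[i];
 *
 * yields 2001:db8:a:b:8:9:a:b, i.e. the high-order prefix bits come
 * from the caller and the low-order interface-ID bits come from the
 * token. The bits are simply OR'd, so the caller is expected to pass
 * a prefix whose interface-ID portion is zero.
 */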
9736
9737 /*
9738 * Restart entry point to restart the address set operation after the
9739 * refcounts have dropped to zero.
9740 */
9741 /* ARGSUSED */
9742 int
9743 ip_sioctl_prefix_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9744 ip_ioctl_cmd_t *ipip, void *ifreq)
9745 {
9746 ip1dbg(("ip_sioctl_prefix_restart(%s:%u %p)\n",
9747 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
9748 return (ip_sioctl_addr_restart(ipif, sin, q, mp, ipip, ifreq));
9749 }
9750
9751 /*
9752 * Set the local interface address.
9753 * Allow an address of all zero when the interface is down.
9754 */
9755 /* ARGSUSED */
9756 int
9757 ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9758 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq)
9759 {
9760 int err = 0;
9761 in6_addr_t v6addr;
9762 boolean_t need_up = B_FALSE;
9763 ill_t *ill;
9764
9765 ip1dbg(("ip_sioctl_addr(%s:%u %p)\n",
9766 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
9767
9768 ASSERT(IAM_WRITER_IPIF(ipif));
9769
9770 ill = ipif->ipif_ill;
9771 if (ipif->ipif_isv6) {
9772 sin6_t *sin6;
9773 phyint_t *phyi;
9774
9775 if (sin->sin_family != AF_INET6)
9776 return (EAFNOSUPPORT);
9777
9778 sin6 = (sin6_t *)sin;
9779 v6addr = sin6->sin6_addr;
9780 phyi = ill->ill_phyint;
9781
9782 /*
9783 * Enforce that true multicast interfaces have a link-local
9784 * address for logical unit 0.
9785 *
		 * However, for those ipifs for which a link-local address
		 * was not created by default, also allow setting :: as the
		 * address. This scenario arises when we delete an address
		 * on the ipif with logical unit 0 and want to set :: in
		 * its place.
9790 */
9791 if (ipif->ipif_id == 0 &&
9792 (ill->ill_flags & ILLF_MULTICAST) &&
9793 !(ipif->ipif_flags & (IPIF_POINTOPOINT)) &&
9794 !(phyi->phyint_flags & (PHYI_LOOPBACK)) &&
9795 !IN6_IS_ADDR_LINKLOCAL(&v6addr)) {
9796
9797 /*
9798 * if default link-local was not created by kernel for
9799 * this ill, allow setting :: as the address on ipif:0.
9800 */
9801 if (ill->ill_flags & ILLF_NOLINKLOCAL) {
9802 if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr))
9803 return (EADDRNOTAVAIL);
9804 } else {
9805 return (EADDRNOTAVAIL);
9806 }
9807 }
9808
9809 /*
		 * Up interfaces shouldn't have the unspecified address
		 * unless they also have the IPIF_NOLOCAL flag set and
		 * have a subnet assigned.
9813 */
9814 if ((ipif->ipif_flags & IPIF_UP) &&
9815 IN6_IS_ADDR_UNSPECIFIED(&v6addr) &&
9816 (!(ipif->ipif_flags & IPIF_NOLOCAL) ||
9817 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) {
9818 return (EADDRNOTAVAIL);
9819 }
9820
9821 if (!ip_local_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask))
9822 return (EADDRNOTAVAIL);
9823 } else {
9824 ipaddr_t addr;
9825
9826 if (sin->sin_family != AF_INET)
9827 return (EAFNOSUPPORT);
9828
9829 addr = sin->sin_addr.s_addr;
9830
9831 /* Allow INADDR_ANY as the local address. */
9832 if (addr != INADDR_ANY &&
9833 !ip_addr_ok_v4(addr, ipif->ipif_net_mask))
9834 return (EADDRNOTAVAIL);
9835
9836 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
9837 }
9838 /* verify that the address being configured is permitted by mac */
9839 if (!ill_ipcheck_addr(ill, &v6addr)) {
9840 return (EPERM);
9841 }
9842 /*
9843 * Even if there is no change we redo things just to rerun
9844 * ipif_set_default.
9845 */
9846 if (ipif->ipif_flags & IPIF_UP) {
		/*
		 * We are setting a new local address. Since the interface
		 * is already marked up, call ipif_down, which will take
		 * care of ditching any IREs that have been set up based
		 * on the old interface address, while keeping net and
		 * subnet bcast ire's for the old address if they are
		 * still needed.
		 */
9858 err = ipif_logical_down(ipif, q, mp);
9859 if (err == EINPROGRESS)
9860 return (err);
9861 (void) ipif_down_tail(ipif);
		need_up = B_TRUE;
9863 }
9864
9865 err = ip_sioctl_addr_tail(ipif, sin, q, mp, need_up);
9866 return (err);
9867 }
9868
9869 int
9870 ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9871 boolean_t need_up)
9872 {
9873 in6_addr_t v6addr;
9874 in6_addr_t ov6addr;
9875 ipaddr_t addr;
9876 sin6_t *sin6;
9877 int sinlen;
9878 int err = 0;
9879 ill_t *ill = ipif->ipif_ill;
9880 boolean_t need_dl_down;
9881 boolean_t need_arp_down;
9882 struct iocblk *iocp;
9883
9884 iocp = (mp != NULL) ? (struct iocblk *)mp->b_rptr : NULL;
9885
9886 ip1dbg(("ip_sioctl_addr_tail(%s:%u %p)\n",
9887 ill->ill_name, ipif->ipif_id, (void *)ipif));
9888 ASSERT(IAM_WRITER_IPIF(ipif));
9889
9890 /* Must cancel any pending timer before taking the ill_lock */
9891 if (ipif->ipif_recovery_id != 0)
9892 (void) untimeout(ipif->ipif_recovery_id);
9893 ipif->ipif_recovery_id = 0;
9894
9895 if (ipif->ipif_isv6) {
9896 sin6 = (sin6_t *)sin;
9897 v6addr = sin6->sin6_addr;
9898 sinlen = sizeof (struct sockaddr_in6);
9899 } else {
9900 addr = sin->sin_addr.s_addr;
9901 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
9902 sinlen = sizeof (struct sockaddr_in);
9903 }
9904 mutex_enter(&ill->ill_lock);
9905 ov6addr = ipif->ipif_v6lcl_addr;
9906 ipif->ipif_v6lcl_addr = v6addr;
9907 sctp_update_ipif_addr(ipif, ov6addr);
9908 ipif->ipif_addr_ready = 0;
9909
9910 ip_rts_newaddrmsg(RTM_CHGADDR, 0, ipif, RTSQ_DEFAULT);
9911
9912 /*
9913 * If the interface was previously marked as a duplicate, then since
9914 * we've now got a "new" address, it should no longer be considered a
9915 * duplicate -- even if the "new" address is the same as the old one.
9916 * Note that if all ipifs are down, we may have a pending ARP down
9917 * event to handle. This is because we want to recover from duplicates
9918 * and thus delay tearing down ARP until the duplicates have been
9919 * removed or disabled.
9920 */
9921 need_dl_down = need_arp_down = B_FALSE;
9922 if (ipif->ipif_flags & IPIF_DUPLICATE) {
9923 need_arp_down = !need_up;
9924 ipif->ipif_flags &= ~IPIF_DUPLICATE;
9925 if (--ill->ill_ipif_dup_count == 0 && !need_up &&
9926 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) {
9927 need_dl_down = B_TRUE;
9928 }
9929 }
9930
9931 ipif_set_default(ipif);
9932
9933 /*
9934 * If we've just manually set the IPv6 link-local address (0th ipif),
9935 * tag the ill so that future updates to the interface ID don't result
9936 * in this address getting automatically reconfigured from under the
9937 * administrator.
9938 */
9939 if (ipif->ipif_isv6 && ipif->ipif_id == 0) {
9940 if (iocp == NULL || (iocp->ioc_cmd == SIOCSLIFADDR &&
9941 !IN6_IS_ADDR_UNSPECIFIED(&v6addr)))
9942 ill->ill_manual_linklocal = 1;
9943 }
9944
9945 /*
9946 * When publishing an interface address change event, we only notify
9947 * the event listeners of the new address. It is assumed that if they
9948 * actively care about the addresses assigned that they will have
9949 * already discovered the previous address assigned (if there was one.)
9950 *
9951 * Don't attach nic event message for SIOCLIFADDIF ioctl.
9952 */
9953 if (iocp != NULL && iocp->ioc_cmd != SIOCLIFADDIF) {
9954 ill_nic_event_dispatch(ill, MAP_IPIF_ID(ipif->ipif_id),
9955 NE_ADDRESS_CHANGE, sin, sinlen);
9956 }
9957
9958 mutex_exit(&ill->ill_lock);
9959
9960 if (need_up) {
9961 /*
9962 * Now bring the interface back up. If this
9963 * is the only IPIF for the ILL, ipif_up
9964 * will have to re-bind to the device, so
9965 * we may get back EINPROGRESS, in which
9966 * case, this IOCTL will get completed in
9967 * ip_rput_dlpi when we see the DL_BIND_ACK.
9968 */
9969 err = ipif_up(ipif, q, mp);
9970 } else {
9971 /* Perhaps ilgs should use this ill */
9972 update_conn_ill(NULL, ill->ill_ipst);
9973 }
9974
9975 if (need_dl_down)
9976 ill_dl_down(ill);
9977
9978 if (need_arp_down && !ill->ill_isv6)
9979 (void) ipif_arp_down(ipif);
9980
9981 /*
9982 * The default multicast interface might have changed (for
9983 * instance if the IPv6 scope of the address changed)
9984 */
9985 ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6);
9986
9987 return (err);
9988 }
9989
9990 /*
9991 * Restart entry point to restart the address set operation after the
9992 * refcounts have dropped to zero.
9993 */
9994 /* ARGSUSED */
9995 int
9996 ip_sioctl_addr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9997 ip_ioctl_cmd_t *ipip, void *ifreq)
9998 {
9999 ip1dbg(("ip_sioctl_addr_restart(%s:%u %p)\n",
10000 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10001 ASSERT(IAM_WRITER_IPIF(ipif));
10002 (void) ipif_down_tail(ipif);
10003 return (ip_sioctl_addr_tail(ipif, sin, q, mp, B_TRUE));
10004 }
10005
10006 /* ARGSUSED */
10007 int
10008 ip_sioctl_get_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10009 ip_ioctl_cmd_t *ipip, void *if_req)
10010 {
10011 sin6_t *sin6 = (struct sockaddr_in6 *)sin;
10012 struct lifreq *lifr = (struct lifreq *)if_req;
10013
10014 ip1dbg(("ip_sioctl_get_addr(%s:%u %p)\n",
10015 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10016 /*
10017 * The net mask and address can't change since we have a
10018 * reference to the ipif. So no lock is necessary.
10019 */
10020 if (ipif->ipif_isv6) {
10021 *sin6 = sin6_null;
10022 sin6->sin6_family = AF_INET6;
10023 sin6->sin6_addr = ipif->ipif_v6lcl_addr;
10024 if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) {
10025 sin6->sin6_scope_id =
10026 ipif->ipif_ill->ill_phyint->phyint_ifindex;
10027 }
10028 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
10029 lifr->lifr_addrlen =
10030 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
10031 } else {
10032 *sin = sin_null;
10033 sin->sin_family = AF_INET;
10034 sin->sin_addr.s_addr = ipif->ipif_lcl_addr;
10035 if (ipip->ipi_cmd_type == LIF_CMD) {
10036 lifr->lifr_addrlen =
10037 ip_mask_to_plen(ipif->ipif_net_mask);
10038 }
10039 }
10040 return (0);
10041 }
10042
10043 /*
10044 * Set the destination address for a pt-pt interface.
10045 */
10046 /* ARGSUSED */
10047 int
10048 ip_sioctl_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10049 ip_ioctl_cmd_t *ipip, void *if_req)
10050 {
10051 int err = 0;
10052 in6_addr_t v6addr;
10053 boolean_t need_up = B_FALSE;
10054
10055 ip1dbg(("ip_sioctl_dstaddr(%s:%u %p)\n",
10056 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10057 ASSERT(IAM_WRITER_IPIF(ipif));
10058
10059 if (ipif->ipif_isv6) {
10060 sin6_t *sin6;
10061
10062 if (sin->sin_family != AF_INET6)
10063 return (EAFNOSUPPORT);
10064
10065 sin6 = (sin6_t *)sin;
10066 v6addr = sin6->sin6_addr;
10067
10068 if (!ip_remote_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask))
10069 return (EADDRNOTAVAIL);
10070 } else {
10071 ipaddr_t addr;
10072
10073 if (sin->sin_family != AF_INET)
10074 return (EAFNOSUPPORT);
10075
10076 addr = sin->sin_addr.s_addr;
10077 if (addr != INADDR_ANY &&
10078 !ip_addr_ok_v4(addr, ipif->ipif_net_mask)) {
10079 return (EADDRNOTAVAIL);
10080 }
10081
10082 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
10083 }
10084
10085 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, &v6addr))
10086 return (0); /* No change */
10087
10088 if (ipif->ipif_flags & IPIF_UP) {
10089 /*
10090 * If the interface is already marked up,
10091 * we call ipif_down which will take care
10092 * of ditching any IREs that have been set
10093 * up based on the old pp dst address.
10094 */
10095 err = ipif_logical_down(ipif, q, mp);
10096 if (err == EINPROGRESS)
10097 return (err);
10098 (void) ipif_down_tail(ipif);
10099 need_up = B_TRUE;
10100 }
10101 /*
10102 * could return EINPROGRESS. If so ioctl will complete in
10103 * ip_rput_dlpi_writer
10104 */
10105 err = ip_sioctl_dstaddr_tail(ipif, sin, q, mp, need_up);
10106 return (err);
10107 }
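/*
 * Illustrative sketch of driving the dstaddr ioctl above from
 * userland (roughly what "ifconfig <if> destination <addr>" does;
 * the interface name is hypothetical).
 *
 *	#include <sys/socket.h>
 *	#include <sys/sockio.h>
 *	#include <net/if.h>
 *	#include <netinet/in.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int
 *	set_dstaddr(int s, in_addr_t dst)
 *	{
 *		struct lifreq lifr;
 *		struct sockaddr_in *sin;
 *
 *		(void) memset(&lifr, 0, sizeof (lifr));
 *		(void) strlcpy(lifr.lifr_name, "ip.tun0",
 *		    sizeof (lifr.lifr_name));
 *		sin = (struct sockaddr_in *)&lifr.lifr_dstaddr;
 *		sin->sin_family = AF_INET;
 *		sin->sin_addr.s_addr = dst;
 *		return (ioctl(s, SIOCSLIFDSTADDR, &lifr));
 *	}
 */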
10108
10109 static int
10110 ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10111 boolean_t need_up)
10112 {
10113 in6_addr_t v6addr;
10114 ill_t *ill = ipif->ipif_ill;
10115 int err = 0;
10116 boolean_t need_dl_down;
10117 boolean_t need_arp_down;
10118
10119 ip1dbg(("ip_sioctl_dstaddr_tail(%s:%u %p)\n", ill->ill_name,
10120 ipif->ipif_id, (void *)ipif));
10121
10122 /* Must cancel any pending timer before taking the ill_lock */
10123 if (ipif->ipif_recovery_id != 0)
10124 (void) untimeout(ipif->ipif_recovery_id);
10125 ipif->ipif_recovery_id = 0;
10126
10127 if (ipif->ipif_isv6) {
10128 sin6_t *sin6;
10129
10130 sin6 = (sin6_t *)sin;
10131 v6addr = sin6->sin6_addr;
10132 } else {
10133 ipaddr_t addr;
10134
10135 addr = sin->sin_addr.s_addr;
10136 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
10137 }
10138 mutex_enter(&ill->ill_lock);
10139 /* Set point to point destination address. */
10140 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
10141 /*
10142 * Allow this as a means of creating logical
10143 * pt-pt interfaces on top of e.g. an Ethernet.
10144 * XXX Undocumented HACK for testing.
10145 * pt-pt interfaces are created with NUD disabled.
10146 */
10147 ipif->ipif_flags |= IPIF_POINTOPOINT;
10148 ipif->ipif_flags &= ~IPIF_BROADCAST;
10149 if (ipif->ipif_isv6)
10150 ill->ill_flags |= ILLF_NONUD;
10151 }
10152
10153 /*
10154 * If the interface was previously marked as a duplicate, then since
10155 * we've now got a "new" address, it should no longer be considered a
10156 * duplicate -- even if the "new" address is the same as the old one.
10157 * Note that if all ipifs are down, we may have a pending ARP down
10158 * event to handle.
10159 */
10160 need_dl_down = need_arp_down = B_FALSE;
10161 if (ipif->ipif_flags & IPIF_DUPLICATE) {
10162 need_arp_down = !need_up;
10163 ipif->ipif_flags &= ~IPIF_DUPLICATE;
10164 if (--ill->ill_ipif_dup_count == 0 && !need_up &&
10165 ill->ill_ipif_up_count == 0 && ill->ill_dl_up) {
10166 need_dl_down = B_TRUE;
10167 }
10168 }
10169
10170 /*
10171 * If we've just manually set the IPv6 destination link-local address
10172 * (0th ipif), tag the ill so that future updates to the destination
10173 * interface ID (as can happen with interfaces over IP tunnels) don't
10174 * result in this address getting automatically reconfigured from
10175 * under the administrator.
10176 */
10177 if (ipif->ipif_isv6 && ipif->ipif_id == 0)
10178 ill->ill_manual_dst_linklocal = 1;
10179
10180 /* Set the new address. */
10181 ipif->ipif_v6pp_dst_addr = v6addr;
10182 /* Make sure subnet tracks pp_dst */
10183 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
10184 mutex_exit(&ill->ill_lock);
10185
10186 if (need_up) {
10187 /*
10188 * Now bring the interface back up. If this
10189 * is the only IPIF for the ILL, ipif_up
10190 * will have to re-bind to the device, so
10191 * we may get back EINPROGRESS, in which
10192 * case, this IOCTL will get completed in
10193 * ip_rput_dlpi when we see the DL_BIND_ACK.
10194 */
10195 err = ipif_up(ipif, q, mp);
10196 }
10197
10198 if (need_dl_down)
10199 ill_dl_down(ill);
10200 if (need_arp_down && !ipif->ipif_isv6)
10201 (void) ipif_arp_down(ipif);
10202
10203 return (err);
10204 }
10205
10206 /*
10207 * Restart entry point to restart the dstaddress set operation after the
10208 * refcounts have dropped to zero.
10209 */
10210 /* ARGSUSED */
10211 int
10212 ip_sioctl_dstaddr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10213 ip_ioctl_cmd_t *ipip, void *ifreq)
10214 {
10215 ip1dbg(("ip_sioctl_dstaddr_restart(%s:%u %p)\n",
10216 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10217 (void) ipif_down_tail(ipif);
10218 return (ip_sioctl_dstaddr_tail(ipif, sin, q, mp, B_TRUE));
10219 }
10220
10221 /* ARGSUSED */
10222 int
10223 ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10224 ip_ioctl_cmd_t *ipip, void *if_req)
10225 {
10226 sin6_t *sin6 = (struct sockaddr_in6 *)sin;
10227
10228 ip1dbg(("ip_sioctl_get_dstaddr(%s:%u %p)\n",
10229 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10230 /*
10231 * Get point to point destination address. The addresses can't
10232 * change since we hold a reference to the ipif.
10233 */
10234 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0)
10235 return (EADDRNOTAVAIL);
10236
10237 if (ipif->ipif_isv6) {
10238 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
10239 *sin6 = sin6_null;
10240 sin6->sin6_family = AF_INET6;
10241 sin6->sin6_addr = ipif->ipif_v6pp_dst_addr;
10242 } else {
10243 *sin = sin_null;
10244 sin->sin_family = AF_INET;
10245 sin->sin_addr.s_addr = ipif->ipif_pp_dst_addr;
10246 }
10247 return (0);
10248 }
10249
10250 /*
 * Check which flags will change when the given flags are set;
 * silently ignore flags which userland is not allowed to control.
10253 * (Because these flags may change between SIOCGLIFFLAGS and
10254 * SIOCSLIFFLAGS, and that's outside of userland's control,
10255 * we need to silently ignore them rather than fail.)
10256 */
10257 static void
10258 ip_sioctl_flags_onoff(ipif_t *ipif, uint64_t flags, uint64_t *onp,
10259 uint64_t *offp)
10260 {
10261 ill_t *ill = ipif->ipif_ill;
10262 phyint_t *phyi = ill->ill_phyint;
10263 uint64_t cantchange_flags, intf_flags;
10264 uint64_t turn_on, turn_off;
10265
10266 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags;
10267 cantchange_flags = IFF_CANTCHANGE;
10268 if (IS_IPMP(ill))
10269 cantchange_flags |= IFF_IPMP_CANTCHANGE;
10270 turn_on = (flags ^ intf_flags) & ~cantchange_flags;
10271 turn_off = intf_flags & turn_on;
10272 turn_on ^= turn_off;
10273 *onp = turn_on;
10274 *offp = turn_off;
10275 }
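/*
 * Worked example of the partition above, using illustrative 4-bit
 * patterns and assuming no cantchange bits are involved: with
 * intf_flags = 1100 (binary) and a requested flags = 1010,
 *
 *	turn_on  = flags ^ intf_flags;     turn_on  = 0110 (changing)
 *	turn_off = intf_flags & turn_on;   turn_off = 0100 (changing, on)
 *	turn_on ^= turn_off;               turn_on  = 0010 (changing, off)
 *
 * so *onp receives the flags going on, *offp receives the flags going
 * off, and bits the caller left unchanged appear in neither.
 */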
10276
10277 /*
10278 * Set interface flags. Many flags require special handling (e.g.,
10279 * bringing the interface down); see below for details.
10280 *
 * NOTE : We really don't enforce that ipif_id zero should be used
 *	  for setting any flags other than IFF_LOGINT_FLAGS. This
 *	  is because applications generally do a SIOCGLIFFLAGS,
 *	  OR in the new flags (which affect the logical interface)
 *	  and then do a SIOCSLIFFLAGS. Thus, "flags" below could
 *	  contain bits other than IFF_LOGINT_FLAGS. One could check
 *	  whether "turn_on" (the flags that will be turned on) is
 *	  correct with respect to ipif_id 0, but for backward
 *	  compatibility reasons that is not done.
10289 */
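/*
 * Illustrative sketch of the get/OR/set sequence described in the
 * NOTE above (the interface name and flag choice are hypothetical):
 *
 *	#include <sys/socket.h>
 *	#include <sys/sockio.h>
 *	#include <net/if.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int
 *	bring_if_up(int s)
 *	{
 *		struct lifreq lifr;
 *
 *		(void) memset(&lifr, 0, sizeof (lifr));
 *		(void) strlcpy(lifr.lifr_name, "net0",
 *		    sizeof (lifr.lifr_name));
 *		if (ioctl(s, SIOCGLIFFLAGS, &lifr) == -1)
 *			return (-1);
 *		lifr.lifr_flags |= IFF_UP;
 *		return (ioctl(s, SIOCSLIFFLAGS, &lifr));
 *	}
 */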
10290 /* ARGSUSED */
10291 int
10292 ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10293 ip_ioctl_cmd_t *ipip, void *if_req)
10294 {
10295 uint64_t turn_on;
10296 uint64_t turn_off;
10297 int err = 0;
10298 phyint_t *phyi;
10299 ill_t *ill;
10300 conn_t *connp;
10301 uint64_t intf_flags;
10302 boolean_t phyint_flags_modified = B_FALSE;
10303 uint64_t flags;
10304 struct ifreq *ifr;
10305 struct lifreq *lifr;
10306 boolean_t set_linklocal = B_FALSE;
10307
10308 ip1dbg(("ip_sioctl_flags(%s:%u %p)\n",
10309 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10310
10311 ASSERT(IAM_WRITER_IPIF(ipif));
10312
10313 ill = ipif->ipif_ill;
10314 phyi = ill->ill_phyint;
10315
10316 if (ipip->ipi_cmd_type == IF_CMD) {
10317 ifr = (struct ifreq *)if_req;
10318 flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff);
10319 } else {
10320 lifr = (struct lifreq *)if_req;
10321 flags = lifr->lifr_flags;
10322 }
10323
10324 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags;
10325
10326 /*
10327 * Have the flags been set correctly until now?
10328 */
10329 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0);
10330 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0);
10331 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0);
10332 /*
10333 * Compare the new flags to the old, and partition
10334 * into those coming on and those going off.
10335 * For the 16 bit command keep the bits above bit 16 unchanged.
10336 */
10337 if (ipip->ipi_cmd == SIOCSIFFLAGS)
10338 flags |= intf_flags & ~0xFFFF;
10339
10340 /*
10341 * Explicitly fail attempts to change flags that are always invalid on
10342 * an IPMP meta-interface.
10343 */
10344 if (IS_IPMP(ill) && ((flags ^ intf_flags) & IFF_IPMP_INVALID))
10345 return (EINVAL);
10346
10347 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off);
10348 if ((turn_on|turn_off) == 0)
10349 return (0); /* No change */
10350
10351 /*
10352 * All test addresses must be IFF_DEPRECATED (to ensure source address
10353 * selection avoids them) -- so force IFF_DEPRECATED on, and do not
10354 * allow it to be turned off.
10355 */
10356 if ((turn_off & (IFF_DEPRECATED|IFF_NOFAILOVER)) == IFF_DEPRECATED &&
10357 (turn_on|intf_flags) & IFF_NOFAILOVER)
10358 return (EINVAL);
10359
10360 if ((connp = Q_TO_CONN(q)) == NULL)
10361 return (EINVAL);
10362
	/*
	 * Only the VRRP control socket is allowed to change IFF_UP
	 * (checked here) and IFF_NOACCEPT (checked below) when
	 * IFF_VRRP is set.
	 */
10367 if ((intf_flags & IFF_VRRP) && ((turn_off | turn_on) & IFF_UP)) {
10368 if (!connp->conn_isvrrp)
10369 return (EINVAL);
10370 }
10371
10372 /*
10373 * The IFF_NOACCEPT flag can only be set on an IFF_VRRP IP address by
10374 * VRRP control socket.
10375 */
10376 if ((turn_off | turn_on) & IFF_NOACCEPT) {
10377 if (!connp->conn_isvrrp || !(intf_flags & IFF_VRRP))
10378 return (EINVAL);
10379 }
10380
10381 if (turn_on & IFF_NOFAILOVER) {
10382 turn_on |= IFF_DEPRECATED;
10383 flags |= IFF_DEPRECATED;
10384 }
10385
10386 /*
10387 * On underlying interfaces, only allow applications to manage test
10388 * addresses -- otherwise, they may get confused when the address
10389 * moves as part of being brought up. Likewise, prevent an
10390 * application-managed test address from being converted to a data
10391 * address. To prevent migration of administratively up addresses in
10392 * the kernel, we don't allow them to be converted either.
10393 */
10394 if (IS_UNDER_IPMP(ill)) {
10395 const uint64_t appflags = IFF_DHCPRUNNING | IFF_ADDRCONF;
10396
10397 if ((turn_on & appflags) && !(flags & IFF_NOFAILOVER))
10398 return (EINVAL);
10399
10400 if ((turn_off & IFF_NOFAILOVER) &&
10401 (flags & (appflags | IFF_UP | IFF_DUPLICATE)))
10402 return (EINVAL);
10403 }
10404
10405 /*
10406 * Only allow IFF_TEMPORARY flag to be set on
10407 * IPv6 interfaces.
10408 */
10409 if ((turn_on & IFF_TEMPORARY) && !(ipif->ipif_isv6))
10410 return (EINVAL);
10411
10412 /*
10413 * cannot turn off IFF_NOXMIT on VNI interfaces.
10414 */
10415 if ((turn_off & IFF_NOXMIT) && IS_VNI(ipif->ipif_ill))
10416 return (EINVAL);
10417
10418 /*
10419 * Don't allow the IFF_ROUTER flag to be turned on on loopback
10420 * interfaces. It makes no sense in that context.
10421 */
10422 if ((turn_on & IFF_ROUTER) && (phyi->phyint_flags & PHYI_LOOPBACK))
10423 return (EINVAL);
10424
10425 /*
10426 * For IPv6 ipif_id 0, don't allow the interface to be up without
10427 * a link local address if IFF_NOLOCAL or IFF_ANYCAST are not set.
10428 * If the link local address isn't set, and can be set, it will get
10429 * set later on in this function.
10430 */
10431 if (ipif->ipif_id == 0 && ipif->ipif_isv6 &&
10432 (flags & IFF_UP) && !(flags & (IFF_NOLOCAL|IFF_ANYCAST)) &&
10433 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) {
10434 if (ipif_cant_setlinklocal(ipif))
10435 return (EINVAL);
10436 set_linklocal = B_TRUE;
10437 }
10438
10439 /*
10440 * If we modify physical interface flags, we'll potentially need to
10441 * send up two routing socket messages for the changes (one for the
10442 * IPv4 ill, and another for the IPv6 ill). Note that here.
10443 */
10444 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS)
10445 phyint_flags_modified = B_TRUE;
10446
10447 /*
10448 * All functioning PHYI_STANDBY interfaces start life PHYI_INACTIVE
10449 * (otherwise, we'd immediately use them, defeating standby). Also,
10450 * since PHYI_INACTIVE has a separate meaning when PHYI_STANDBY is not
10451 * set, don't allow PHYI_STANDBY to be set if PHYI_INACTIVE is already
10452 * set, and clear PHYI_INACTIVE if PHYI_STANDBY is being cleared. We
10453 * also don't allow PHYI_STANDBY if VNI is enabled since its semantics
10454 * will not be honored.
10455 */
10456 if (turn_on & PHYI_STANDBY) {
10457 /*
10458 * No need to grab ill_g_usesrc_lock here; see the
10459 * synchronization notes in ip.c.
10460 */
10461 if (ill->ill_usesrc_grp_next != NULL ||
10462 intf_flags & PHYI_INACTIVE)
10463 return (EINVAL);
10464 if (!(flags & PHYI_FAILED)) {
10465 flags |= PHYI_INACTIVE;
10466 turn_on |= PHYI_INACTIVE;
10467 }
10468 }
10469
10470 if (turn_off & PHYI_STANDBY) {
10471 flags &= ~PHYI_INACTIVE;
10472 turn_off |= PHYI_INACTIVE;
10473 }
10474
10475 /*
10476 * PHYI_FAILED and PHYI_INACTIVE are mutually exclusive; fail if both
10477 * would end up on.
10478 */
10479 if ((flags & (PHYI_FAILED | PHYI_INACTIVE)) ==
10480 (PHYI_FAILED | PHYI_INACTIVE))
10481 return (EINVAL);
10482
10483 /*
10484 * If ILLF_ROUTER changes, we need to change the ip forwarding
10485 * status of the interface.
10486 */
10487 if ((turn_on | turn_off) & ILLF_ROUTER) {
10488 err = ill_forward_set(ill, ((turn_on & ILLF_ROUTER) != 0));
10489 if (err != 0)
10490 return (err);
10491 }
10492
10493 /*
10494 * If the interface is not UP and we are not going to
10495 * bring it UP, record the flags and return. When the
10496 * interface comes UP later, the right actions will be
10497 * taken.
10498 */
10499 if (!(ipif->ipif_flags & IPIF_UP) &&
10500 !(turn_on & IPIF_UP)) {
10501 /* Record new flags in their respective places. */
10502 mutex_enter(&ill->ill_lock);
10503 mutex_enter(&ill->ill_phyint->phyint_lock);
10504 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS);
10505 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS);
10506 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS);
10507 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS);
10508 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS);
10509 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS);
10510 mutex_exit(&ill->ill_lock);
10511 mutex_exit(&ill->ill_phyint->phyint_lock);
10512
10513 /*
10514 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the
10515 * same to the kernel: if any of them has been set by
10516 * userland, the interface cannot be used for data traffic.
10517 */
10518 if ((turn_on|turn_off) &
10519 (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) {
10520 ASSERT(!IS_IPMP(ill));
10521 /*
10522 * It's possible the ill is part of an "anonymous"
10523 * IPMP group rather than a real group. In that case,
10524 * there are no other interfaces in the group and thus
10525 * no need to call ipmp_phyint_refresh_active().
10526 */
10527 if (IS_UNDER_IPMP(ill))
10528 ipmp_phyint_refresh_active(phyi);
10529 }
10530
10531 if (phyint_flags_modified) {
10532 if (phyi->phyint_illv4 != NULL) {
10533 ip_rts_ifmsg(phyi->phyint_illv4->
10534 ill_ipif, RTSQ_DEFAULT);
10535 }
10536 if (phyi->phyint_illv6 != NULL) {
10537 ip_rts_ifmsg(phyi->phyint_illv6->
10538 ill_ipif, RTSQ_DEFAULT);
10539 }
10540 }
10541 /* The default multicast interface might have changed */
10542 ire_increment_multicast_generation(ill->ill_ipst,
10543 ill->ill_isv6);
10544
10545 return (0);
	} else if (set_linklocal) {
		mutex_enter(&ill->ill_lock);
		ipif->ipif_state_flags |= IPIF_SET_LINKLOCAL;
		mutex_exit(&ill->ill_lock);
	}
10552
10553 /*
10554 * Disallow IPv6 interfaces coming up that have the unspecified address,
10555 * or point-to-point interfaces with an unspecified destination. We do
10556 * allow the address to be unspecified for IPIF_NOLOCAL interfaces that
10557 * have a subnet assigned, which is how in.ndpd currently manages its
10558 * onlink prefix list when no addresses are configured with those
10559 * prefixes.
10560 */
10561 if (ipif->ipif_isv6 &&
10562 ((IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
	    ((!(ipif->ipif_flags & IPIF_NOLOCAL) &&
	    !(turn_on & IPIF_NOLOCAL)) ||
10564 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) ||
10565 ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
10566 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)))) {
10567 return (EINVAL);
10568 }
10569
10570 /*
10571 * Prevent IPv4 point-to-point interfaces with a 0.0.0.0 destination
10572 * from being brought up.
10573 */
10574 if (!ipif->ipif_isv6 &&
10575 ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
10576 ipif->ipif_pp_dst_addr == INADDR_ANY)) {
10577 return (EINVAL);
10578 }
10579
10580 /*
10581 * If we are going to change one or more of the flags that are
10582 * IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, ILLF_NOARP,
10583 * ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, IPIF_PREFERRED, and
10584 * IPIF_NOFAILOVER, we will take special action. This is
10585 * done by bring the ipif down, changing the flags and bringing
10586 * it back up again. For IPIF_NOFAILOVER, the act of bringing it
10587 * back up will trigger the address to be moved.
10588 *
10589 * If we are going to change IFF_NOACCEPT, we need to bring
10590 * all the ipifs down then bring them up again. The act of
10591 * bringing all the ipifs back up will trigger the local
10592 * ires being recreated with "no_accept" set/cleared.
10593 *
10594 * Note that ILLF_NOACCEPT is always set separately from the
10595 * other flags.
10596 */
10597 if ((turn_on|turn_off) &
10598 (IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP|
10599 ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED|
10600 IPIF_NOFAILOVER)) {
10601 /*
10602 * ipif_down() will ire_delete bcast ire's for the subnet,
10603 * while the ire_identical_ref tracks the case of IRE_BROADCAST
10604 * entries shared between multiple ipifs on the same subnet.
10605 */
10606 if (((ipif->ipif_flags | turn_on) & IPIF_UP) &&
10607 !(turn_off & IPIF_UP)) {
10608 if (ipif->ipif_flags & IPIF_UP)
10609 ill->ill_logical_down = 1;
10610 turn_on &= ~IPIF_UP;
10611 }
10612 err = ipif_down(ipif, q, mp);
		ip1dbg(("ipif_down returns err %d\n", err));
10614 if (err == EINPROGRESS)
10615 return (err);
10616 (void) ipif_down_tail(ipif);
10617 } else if ((turn_on|turn_off) & ILLF_NOACCEPT) {
10618 /*
10619 * If we can quiesce the ill, then continue. If not, then
10620 * ip_sioctl_flags_tail() will be called from
10621 * ipif_ill_refrele_tail().
10622 */
10623 ill_down_ipifs(ill, B_TRUE);
10624
10625 mutex_enter(&connp->conn_lock);
10626 mutex_enter(&ill->ill_lock);
10627 if (!ill_is_quiescent(ill)) {
10628 boolean_t success;
10629
10630 success = ipsq_pending_mp_add(connp, ill->ill_ipif,
10631 q, mp, ILL_DOWN);
10632 mutex_exit(&ill->ill_lock);
10633 mutex_exit(&connp->conn_lock);
10634 return (success ? EINPROGRESS : EINTR);
10635 }
10636 mutex_exit(&ill->ill_lock);
10637 mutex_exit(&connp->conn_lock);
10638 }
10639 return (ip_sioctl_flags_tail(ipif, flags, q, mp));
10640 }
10641
10642 static int
10643 ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp)
10644 {
10645 ill_t *ill;
10646 phyint_t *phyi;
10647 uint64_t turn_on, turn_off;
10648 boolean_t phyint_flags_modified = B_FALSE;
10649 int err = 0;
10650 boolean_t set_linklocal = B_FALSE;
10651
10652 ip1dbg(("ip_sioctl_flags_tail(%s:%u)\n",
10653 ipif->ipif_ill->ill_name, ipif->ipif_id));
10654
10655 ASSERT(IAM_WRITER_IPIF(ipif));
10656
10657 ill = ipif->ipif_ill;
10658 phyi = ill->ill_phyint;
10659
10660 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off);
10661
10662 /*
10663 * IFF_UP is handled separately.
10664 */
10665 turn_on &= ~IFF_UP;
10666 turn_off &= ~IFF_UP;
10667
10668 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS)
10669 phyint_flags_modified = B_TRUE;
10670
10671 /*
10672 * Now we change the flags. Track current value of
10673 * other flags in their respective places.
10674 */
10675 mutex_enter(&ill->ill_lock);
10676 mutex_enter(&phyi->phyint_lock);
10677 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS);
10678 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS);
10679 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS);
10680 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS);
10681 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS);
10682 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS);
10683 if (ipif->ipif_state_flags & IPIF_SET_LINKLOCAL) {
10684 set_linklocal = B_TRUE;
10685 ipif->ipif_state_flags &= ~IPIF_SET_LINKLOCAL;
10686 }
10687
10688 mutex_exit(&ill->ill_lock);
10689 mutex_exit(&phyi->phyint_lock);
10690
10691 if (set_linklocal)
10692 (void) ipif_setlinklocal(ipif);
10693
10694 /*
10695 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the same to
10696 * the kernel: if any of them has been set by userland, the interface
10697 * cannot be used for data traffic.
10698 */
10699 if ((turn_on|turn_off) & (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) {
10700 ASSERT(!IS_IPMP(ill));
10701 /*
10702 * It's possible the ill is part of an "anonymous" IPMP group
10703 * rather than a real group. In that case, there are no other
10704 * interfaces in the group and thus no need for us to call
10705 * ipmp_phyint_refresh_active().
10706 */
10707 if (IS_UNDER_IPMP(ill))
10708 ipmp_phyint_refresh_active(phyi);
10709 }
10710
10711 if ((turn_on|turn_off) & ILLF_NOACCEPT) {
10712 /*
10713 * If the ILLF_NOACCEPT flag is changed, bring up all the
10714 * ipifs that were brought down.
10715 *
		 * The routing socket messages are sent as a result of
		 * ill_up_ipifs(), which updates SCTP's IPIF list as well.
10719 */
10720 err = ill_up_ipifs(ill, q, mp);
10721 } else if ((flags & IFF_UP) && !(ipif->ipif_flags & IPIF_UP)) {
10722 /*
		 * XXX ipif_up really does not know whether any phyint
		 * flags were modified, so it sends up information in
		 * only one routing socket message. Since we don't bring
		 * the interface up and set PHYI_ flags simultaneously,
		 * this should be okay.
10728 */
10729 err = ipif_up(ipif, q, mp);
10730 } else {
10731 /*
10732 * Make sure routing socket sees all changes to the flags.
10733 * ipif_up_done* handles this when we use ipif_up.
10734 */
10735 if (phyint_flags_modified) {
10736 if (phyi->phyint_illv4 != NULL) {
10737 ip_rts_ifmsg(phyi->phyint_illv4->
10738 ill_ipif, RTSQ_DEFAULT);
10739 }
10740 if (phyi->phyint_illv6 != NULL) {
10741 ip_rts_ifmsg(phyi->phyint_illv6->
10742 ill_ipif, RTSQ_DEFAULT);
10743 }
10744 } else {
10745 ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
10746 }
10747 /*
		 * Update the flags in SCTP's IPIF list; ipif_up() does
		 * this itself in the need_up case.
10750 */
10751 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);
10752 }
10753
10754 /* The default multicast interface might have changed */
10755 ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6);
10756 return (err);
10757 }
10758
10759 /*
10760 * Restart the flags operation now that the refcounts have dropped to zero.
10761 */
10762 /* ARGSUSED */
10763 int
10764 ip_sioctl_flags_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10765 ip_ioctl_cmd_t *ipip, void *if_req)
10766 {
10767 uint64_t flags;
10768 struct ifreq *ifr = if_req;
10769 struct lifreq *lifr = if_req;
10770 uint64_t turn_on, turn_off;
10771
10772 ip1dbg(("ip_sioctl_flags_restart(%s:%u %p)\n",
10773 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10774
10775 if (ipip->ipi_cmd_type == IF_CMD) {
10776 /* cast to uint16_t prevents unwanted sign extension */
10777 flags = (uint16_t)ifr->ifr_flags;
10778 } else {
10779 flags = lifr->lifr_flags;
10780 }
10781
10782 /*
10783 * If this function call is a result of the ILLF_NOACCEPT flag
10784 * change, do not call ipif_down_tail(). See ip_sioctl_flags().
10785 */
10786 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off);
10787 if (!((turn_on|turn_off) & ILLF_NOACCEPT))
10788 (void) ipif_down_tail(ipif);
10789
10790 return (ip_sioctl_flags_tail(ipif, flags, q, mp));
10791 }
10792
10793 /*
10794 * Can operate on either a module or a driver queue.
10795 */
10796 /* ARGSUSED */
10797 int
10798 ip_sioctl_get_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10799 ip_ioctl_cmd_t *ipip, void *if_req)
10800 {
10801 /*
	 * Have the flags been set correctly until now?
10803 */
10804 ill_t *ill = ipif->ipif_ill;
10805 phyint_t *phyi = ill->ill_phyint;
10806
10807 ip1dbg(("ip_sioctl_get_flags(%s:%u %p)\n",
10808 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10809 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0);
10810 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0);
10811 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0);
10812
10813 /*
10814 * Need a lock since some flags can be set even when there are
10815 * references to the ipif.
10816 */
10817 mutex_enter(&ill->ill_lock);
10818 if (ipip->ipi_cmd_type == IF_CMD) {
10819 struct ifreq *ifr = (struct ifreq *)if_req;
10820
10821 /* Get interface flags (low 16 only). */
10822 ifr->ifr_flags = ((ipif->ipif_flags |
10823 ill->ill_flags | phyi->phyint_flags) & 0xffff);
10824 } else {
10825 struct lifreq *lifr = (struct lifreq *)if_req;
10826
10827 /* Get interface flags. */
10828 lifr->lifr_flags = ipif->ipif_flags |
10829 ill->ill_flags | phyi->phyint_flags;
10830 }
10831 mutex_exit(&ill->ill_lock);
10832 return (0);
10833 }
10834
10835 /*
 * We allow the MTU to be set on an ILL, but not to differ across
 * IPIFs, since we don't actually send packets on IPIFs.
10838 */
10839 /* ARGSUSED */
10840 int
10841 ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10842 ip_ioctl_cmd_t *ipip, void *if_req)
10843 {
10844 int mtu;
10845 int ip_min_mtu;
10846 struct ifreq *ifr;
10847 struct lifreq *lifr;
10848 ill_t *ill;
10849
10850 ip1dbg(("ip_sioctl_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name,
10851 ipif->ipif_id, (void *)ipif));
10852 if (ipip->ipi_cmd_type == IF_CMD) {
10853 ifr = (struct ifreq *)if_req;
10854 mtu = ifr->ifr_metric;
10855 } else {
10856 lifr = (struct lifreq *)if_req;
10857 mtu = lifr->lifr_mtu;
10858 }
10859 /* Only allow for logical unit zero i.e. not on "bge0:17" */
10860 if (ipif->ipif_id != 0)
10861 return (EINVAL);
10862
10863 ill = ipif->ipif_ill;
10864 if (ipif->ipif_isv6)
10865 ip_min_mtu = IPV6_MIN_MTU;
10866 else
10867 ip_min_mtu = IP_MIN_MTU;
10868
10869 mutex_enter(&ill->ill_lock);
10870 if (mtu > ill->ill_max_frag || mtu < ip_min_mtu) {
10871 mutex_exit(&ill->ill_lock);
10872 return (EINVAL);
10873 }
10874 /* Avoid increasing ill_mc_mtu */
10875 if (ill->ill_mc_mtu > mtu)
10876 ill->ill_mc_mtu = mtu;
10877
10878 /*
10879 * The dce and fragmentation code can handle changes to ill_mtu
10880 * concurrent with sending/fragmenting packets.
10881 */
10882 ill->ill_mtu = mtu;
10883 ill->ill_flags |= ILLF_FIXEDMTU;
10884 mutex_exit(&ill->ill_lock);
10885
10886 /*
10887 * Make sure all dce_generation checks find out
10888 * that ill_mtu/ill_mc_mtu has changed.
10889 */
10890 dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst);
10891
10892 /*
10893 * Refresh IPMP meta-interface MTU if necessary.
10894 */
10895 if (IS_UNDER_IPMP(ill))
10896 ipmp_illgrp_refresh_mtu(ill->ill_grp);
10897
10898 /* Update the MTU in SCTP's list */
10899 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);
10900 return (0);
10901 }
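/*
 * Illustrative sketch of setting the MTU from userland, subject to
 * the checks above (the interface name is hypothetical; the request
 * must name logical unit zero and the value must lie between the IP
 * minimum MTU and ill_max_frag):
 *
 *	#include <sys/types.h>
 *	#include <sys/socket.h>
 *	#include <sys/sockio.h>
 *	#include <net/if.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int
 *	set_mtu(int s, uint_t mtu)
 *	{
 *		struct lifreq lifr;
 *
 *		(void) memset(&lifr, 0, sizeof (lifr));
 *		(void) strlcpy(lifr.lifr_name, "net0",
 *		    sizeof (lifr.lifr_name));
 *		lifr.lifr_mtu = mtu;
 *		return (ioctl(s, SIOCSLIFMTU, &lifr));
 *	}
 */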
10902
10903 /* Get interface MTU. */
10904 /* ARGSUSED */
10905 int
10906 ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10907 ip_ioctl_cmd_t *ipip, void *if_req)
10908 {
10909 struct ifreq *ifr;
10910 struct lifreq *lifr;
10911
10912 ip1dbg(("ip_sioctl_get_mtu(%s:%u %p)\n",
10913 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10914
10915 /*
10916 * We allow a get on any logical interface even though the set
10917 * can only be done on logical unit 0.
10918 */
10919 if (ipip->ipi_cmd_type == IF_CMD) {
10920 ifr = (struct ifreq *)if_req;
10921 ifr->ifr_metric = ipif->ipif_ill->ill_mtu;
10922 } else {
10923 lifr = (struct lifreq *)if_req;
10924 lifr->lifr_mtu = ipif->ipif_ill->ill_mtu;
10925 }
10926 return (0);
10927 }
10928
10929 /* Set interface broadcast address. */
10930 /* ARGSUSED2 */
10931 int
10932 ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10933 ip_ioctl_cmd_t *ipip, void *if_req)
10934 {
10935 ipaddr_t addr;
10936 ire_t *ire;
10937 ill_t *ill = ipif->ipif_ill;
10938 ip_stack_t *ipst = ill->ill_ipst;
10939
10940 ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ill->ill_name,
10941 ipif->ipif_id));
10942
10943 ASSERT(IAM_WRITER_IPIF(ipif));
10944 if (!(ipif->ipif_flags & IPIF_BROADCAST))
10945 return (EADDRNOTAVAIL);
10946
10947 ASSERT(!(ipif->ipif_isv6)); /* No IPv6 broadcast */
10948
10949 if (sin->sin_family != AF_INET)
10950 return (EAFNOSUPPORT);
10951
10952 addr = sin->sin_addr.s_addr;
10953
10954 if (ipif->ipif_flags & IPIF_UP) {
10955 /*
10956 * If we are already up, make sure the new
10957 * broadcast address makes sense. If it does,
10958 * there should be an IRE for it already.
10959 */
10960 ire = ire_ftable_lookup_v4(addr, 0, 0, IRE_BROADCAST,
10961 ill, ipif->ipif_zoneid, NULL,
10962 (MATCH_IRE_ILL | MATCH_IRE_TYPE), 0, ipst, NULL);
10963 if (ire == NULL) {
10964 return (EINVAL);
10965 } else {
10966 ire_refrele(ire);
10967 }
10968 }
10969 /*
10970 * Changing the broadcast addr for this ipif. Since the IRE_BROADCAST
10971 * needs to already exist we never need to change the set of
10972 * IRE_BROADCASTs when we are UP.
10973 */
10974 if (addr != ipif->ipif_brd_addr)
10975 IN6_IPADDR_TO_V4MAPPED(addr, &ipif->ipif_v6brd_addr);
10976
10977 return (0);
10978 }
10979
10980 /* Get interface broadcast address. */
10981 /* ARGSUSED */
10982 int
10983 ip_sioctl_get_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10984 ip_ioctl_cmd_t *ipip, void *if_req)
10985 {
10986 ip1dbg(("ip_sioctl_get_brdaddr(%s:%u %p)\n",
10987 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10988 if (!(ipif->ipif_flags & IPIF_BROADCAST))
10989 return (EADDRNOTAVAIL);
10990
10991 /* IPIF_BROADCAST not possible with IPv6 */
10992 ASSERT(!ipif->ipif_isv6);
10993 *sin = sin_null;
10994 sin->sin_family = AF_INET;
10995 sin->sin_addr.s_addr = ipif->ipif_brd_addr;
10996 return (0);
10997 }
10998
10999 /*
11000 * This routine is called to handle the SIOCS*IFNETMASK IOCTL.
11001 */
11002 /* ARGSUSED */
11003 int
11004 ip_sioctl_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11005 ip_ioctl_cmd_t *ipip, void *if_req)
11006 {
11007 int err = 0;
11008 in6_addr_t v6mask;
11009
11010 ip1dbg(("ip_sioctl_netmask(%s:%u %p)\n",
11011 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11012
11013 ASSERT(IAM_WRITER_IPIF(ipif));
11014
11015 if (ipif->ipif_isv6) {
11016 sin6_t *sin6;
11017
11018 if (sin->sin_family != AF_INET6)
11019 return (EAFNOSUPPORT);
11020
11021 sin6 = (sin6_t *)sin;
11022 v6mask = sin6->sin6_addr;
11023 } else {
11024 ipaddr_t mask;
11025
11026 if (sin->sin_family != AF_INET)
11027 return (EAFNOSUPPORT);
11028
11029 mask = sin->sin_addr.s_addr;
11030 if (!ip_contiguous_mask(ntohl(mask)))
11031 return (ENOTSUP);
11032 V4MASK_TO_V6(mask, v6mask);
11033 }
11034
11035 /*
11036 * No big deal if the interface isn't already up, or the mask
11037 * isn't really changing, or this is pt-pt.
11038 */
11039 if (!(ipif->ipif_flags & IPIF_UP) ||
11040 IN6_ARE_ADDR_EQUAL(&v6mask, &ipif->ipif_v6net_mask) ||
11041 (ipif->ipif_flags & IPIF_POINTOPOINT)) {
11042 ipif->ipif_v6net_mask = v6mask;
11043 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
11044 V6_MASK_COPY(ipif->ipif_v6lcl_addr,
11045 ipif->ipif_v6net_mask,
11046 ipif->ipif_v6subnet);
11047 }
11048 return (0);
11049 }
11050 /*
	 * Make sure we have valid net and subnet broadcast IREs for the
	 * old netmask, if they are needed by other logical interfaces.
11053 */
11054 err = ipif_logical_down(ipif, q, mp);
11055 if (err == EINPROGRESS)
11056 return (err);
11057 (void) ipif_down_tail(ipif);
11058 err = ip_sioctl_netmask_tail(ipif, sin, q, mp);
11059 return (err);
11060 }
11061
11062 static int
11063 ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp)
11064 {
11065 in6_addr_t v6mask;
11066 int err = 0;
11067
11068 ip1dbg(("ip_sioctl_netmask_tail(%s:%u %p)\n",
11069 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11070
11071 if (ipif->ipif_isv6) {
11072 sin6_t *sin6;
11073
11074 sin6 = (sin6_t *)sin;
11075 v6mask = sin6->sin6_addr;
11076 } else {
11077 ipaddr_t mask;
11078
11079 mask = sin->sin_addr.s_addr;
11080 V4MASK_TO_V6(mask, v6mask);
11081 }
11082
11083 ipif->ipif_v6net_mask = v6mask;
11084 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
11085 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
11086 ipif->ipif_v6subnet);
11087 }
11088 err = ipif_up(ipif, q, mp);
11089
11090 if (err == 0 || err == EINPROGRESS) {
11091 /*
11092 * The interface must be DL_BOUND if this packet has to
		 * go out on the wire. Since we only go through a logical
		 * down and remain bound to the driver across an internal
		 * down/up, that condition is satisfied.
11096 */
11097 if (!ipif->ipif_isv6 && ipif->ipif_ill->ill_wq != NULL) {
11098 /* Potentially broadcast an address mask reply. */
11099 ipif_mask_reply(ipif);
11100 }
11101 }
11102 return (err);
11103 }
11104
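/*
 * Restart entry point to finish a netmask change once the ipif has been
 * brought down: redo the ipif_down_tail() work and then apply the new
 * mask via ip_sioctl_netmask_tail().
 */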
11105 /* ARGSUSED */
11106 int
11107 ip_sioctl_netmask_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11108 ip_ioctl_cmd_t *ipip, void *if_req)
11109 {
11110 ip1dbg(("ip_sioctl_netmask_restart(%s:%u %p)\n",
11111 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11112 (void) ipif_down_tail(ipif);
11113 return (ip_sioctl_netmask_tail(ipif, sin, q, mp));
11114 }
11115
11116 /* Get interface net mask. */
11117 /* ARGSUSED */
11118 int
11119 ip_sioctl_get_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11120 ip_ioctl_cmd_t *ipip, void *if_req)
11121 {
11122 struct lifreq *lifr = (struct lifreq *)if_req;
11123 struct sockaddr_in6 *sin6 = (sin6_t *)sin;
11124
11125 ip1dbg(("ip_sioctl_get_netmask(%s:%u %p)\n",
11126 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11127
11128 /*
11129 * net mask can't change since we have a reference to the ipif.
11130 */
11131 if (ipif->ipif_isv6) {
11132 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
11133 *sin6 = sin6_null;
11134 sin6->sin6_family = AF_INET6;
11135 sin6->sin6_addr = ipif->ipif_v6net_mask;
11136 lifr->lifr_addrlen =
11137 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
11138 } else {
11139 *sin = sin_null;
11140 sin->sin_family = AF_INET;
11141 sin->sin_addr.s_addr = ipif->ipif_net_mask;
11142 if (ipip->ipi_cmd_type == LIF_CMD) {
11143 lifr->lifr_addrlen =
11144 ip_mask_to_plen(ipif->ipif_net_mask);
11145 }
11146 }
11147 return (0);
11148 }
11149
11150 /* ARGSUSED */
11151 int
11152 ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11153 ip_ioctl_cmd_t *ipip, void *if_req)
11154 {
11155 ip1dbg(("ip_sioctl_metric(%s:%u %p)\n",
11156 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11157
11158 /*
11159 * Since no applications should ever be setting metrics on underlying
	 * interfaces, we explicitly fail such requests to smoke 'em out.
11161 */
11162 if (IS_UNDER_IPMP(ipif->ipif_ill))
11163 return (EINVAL);
11164
11165 /*
11166 * Set interface metric. We don't use this for
11167 * anything but we keep track of it in case it is
11168 * important to routing applications or such.
11169 */
11170 if (ipip->ipi_cmd_type == IF_CMD) {
11171 struct ifreq *ifr;
11172
11173 ifr = (struct ifreq *)if_req;
11174 ipif->ipif_ill->ill_metric = ifr->ifr_metric;
11175 } else {
11176 struct lifreq *lifr;
11177
11178 lifr = (struct lifreq *)if_req;
11179 ipif->ipif_ill->ill_metric = lifr->lifr_metric;
11180 }
11181 return (0);
11182 }
11183
11184 /* ARGSUSED */
11185 int
11186 ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11187 ip_ioctl_cmd_t *ipip, void *if_req)
11188 {
11189 /* Get interface metric. */
11190 ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n",
11191 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11192
11193 if (ipip->ipi_cmd_type == IF_CMD) {
11194 struct ifreq *ifr;
11195
11196 ifr = (struct ifreq *)if_req;
11197 ifr->ifr_metric = ipif->ipif_ill->ill_metric;
11198 } else {
11199 struct lifreq *lifr;
11200
11201 lifr = (struct lifreq *)if_req;
11202 lifr->lifr_metric = ipif->ipif_ill->ill_metric;
11203 }
11204
11205 return (0);
11206 }
11207
11208 /* ARGSUSED */
11209 int
11210 ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11211 ip_ioctl_cmd_t *ipip, void *if_req)
11212 {
11213 int arp_muxid;
11214
11215 ip1dbg(("ip_sioctl_muxid(%s:%u %p)\n",
11216 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11217 /*
11218 * Set the muxid returned from I_PLINK.
11219 */
11220 if (ipip->ipi_cmd_type == IF_CMD) {
11221 struct ifreq *ifr = (struct ifreq *)if_req;
11222
11223 ipif->ipif_ill->ill_muxid = ifr->ifr_ip_muxid;
11224 arp_muxid = ifr->ifr_arp_muxid;
11225 } else {
11226 struct lifreq *lifr = (struct lifreq *)if_req;
11227
11228 ipif->ipif_ill->ill_muxid = lifr->lifr_ip_muxid;
11229 arp_muxid = lifr->lifr_arp_muxid;
11230 }
11231 arl_set_muxid(ipif->ipif_ill, arp_muxid);
11232 return (0);
11233 }
11234
11235 /* ARGSUSED */
11236 int
11237 ip_sioctl_get_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11238 ip_ioctl_cmd_t *ipip, void *if_req)
11239 {
11240 int arp_muxid = 0;
11241
11242 ip1dbg(("ip_sioctl_get_muxid(%s:%u %p)\n",
11243 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11244 /*
11245 * Get the muxid saved in ill for I_PUNLINK.
11246 */
11247 arp_muxid = arl_get_muxid(ipif->ipif_ill);
11248 if (ipip->ipi_cmd_type == IF_CMD) {
11249 struct ifreq *ifr = (struct ifreq *)if_req;
11250
11251 ifr->ifr_ip_muxid = ipif->ipif_ill->ill_muxid;
11252 ifr->ifr_arp_muxid = arp_muxid;
11253 } else {
11254 struct lifreq *lifr = (struct lifreq *)if_req;
11255
11256 lifr->lifr_ip_muxid = ipif->ipif_ill->ill_muxid;
11257 lifr->lifr_arp_muxid = arp_muxid;
11258 }
11259 return (0);
11260 }
11261
11262 /*
11263 * Set the subnet prefix. Does not modify the broadcast address.
11264 */
11265 /* ARGSUSED */
11266 int
11267 ip_sioctl_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11268 ip_ioctl_cmd_t *ipip, void *if_req)
11269 {
11270 int err = 0;
11271 in6_addr_t v6addr;
11272 in6_addr_t v6mask;
11273 boolean_t need_up = B_FALSE;
11274 int addrlen;
11275
11276 ip1dbg(("ip_sioctl_subnet(%s:%u %p)\n",
11277 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11278
11279 ASSERT(IAM_WRITER_IPIF(ipif));
11280 addrlen = ((struct lifreq *)if_req)->lifr_addrlen;
11281
11282 if (ipif->ipif_isv6) {
11283 sin6_t *sin6;
11284
11285 if (sin->sin_family != AF_INET6)
11286 return (EAFNOSUPPORT);
11287
11288 sin6 = (sin6_t *)sin;
11289 v6addr = sin6->sin6_addr;
11290 if (!ip_remote_addr_ok_v6(&v6addr, &ipv6_all_ones))
11291 return (EADDRNOTAVAIL);
11292 } else {
11293 ipaddr_t addr;
11294
11295 if (sin->sin_family != AF_INET)
11296 return (EAFNOSUPPORT);
11297
11298 addr = sin->sin_addr.s_addr;
11299 if (!ip_addr_ok_v4(addr, 0xFFFFFFFF))
11300 return (EADDRNOTAVAIL);
11301 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
11302 /* Add 96 bits */
11303 addrlen += IPV6_ABITS - IP_ABITS;
11304 }
11305
11306 if (ip_plen_to_mask_v6(addrlen, &v6mask) == NULL)
11307 return (EINVAL);
11308
	/* Check if any bits in the address are set past the mask. */
11310 if (!V6_MASK_EQ(v6addr, v6mask, v6addr))
11311 return (EINVAL);
11312
11313 if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6subnet, &v6addr) &&
11314 IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6net_mask, &v6mask))
11315 return (0); /* No change */
11316
11317 if (ipif->ipif_flags & IPIF_UP) {
11318 /*
11319 * If the interface is already marked up,
11320 * we call ipif_down which will take care
11321 * of ditching any IREs that have been set
11322 * up based on the old interface address.
11323 */
11324 err = ipif_logical_down(ipif, q, mp);
11325 if (err == EINPROGRESS)
11326 return (err);
11327 (void) ipif_down_tail(ipif);
11328 need_up = B_TRUE;
11329 }
11330
11331 err = ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, need_up);
11332 return (err);
11333 }
11334
11335 static int
11336 ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t v6addr, in6_addr_t v6mask,
11337 queue_t *q, mblk_t *mp, boolean_t need_up)
11338 {
11339 ill_t *ill = ipif->ipif_ill;
11340 int err = 0;
11341
11342 ip1dbg(("ip_sioctl_subnet_tail(%s:%u %p)\n",
11343 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11344
11345 /* Set the new address. */
11346 mutex_enter(&ill->ill_lock);
11347 ipif->ipif_v6net_mask = v6mask;
11348 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
11349 V6_MASK_COPY(v6addr, ipif->ipif_v6net_mask,
11350 ipif->ipif_v6subnet);
11351 }
11352 mutex_exit(&ill->ill_lock);
11353
11354 if (need_up) {
11355 /*
11356 * Now bring the interface back up. If this
11357 * is the only IPIF for the ILL, ipif_up
11358 * will have to re-bind to the device, so
11359 * we may get back EINPROGRESS, in which
11360 * case, this IOCTL will get completed in
11361 * ip_rput_dlpi when we see the DL_BIND_ACK.
11362 */
11363 err = ipif_up(ipif, q, mp);
11364 if (err == EINPROGRESS)
11365 return (err);
11366 }
11367 return (err);
11368 }
11369
11370 /* ARGSUSED */
11371 int
11372 ip_sioctl_subnet_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11373 ip_ioctl_cmd_t *ipip, void *if_req)
11374 {
11375 int addrlen;
11376 in6_addr_t v6addr;
11377 in6_addr_t v6mask;
11378 struct lifreq *lifr = (struct lifreq *)if_req;
11379
11380 ip1dbg(("ip_sioctl_subnet_restart(%s:%u %p)\n",
11381 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11382 (void) ipif_down_tail(ipif);
11383
11384 addrlen = lifr->lifr_addrlen;
11385 if (ipif->ipif_isv6) {
11386 sin6_t *sin6;
11387
11388 sin6 = (sin6_t *)sin;
11389 v6addr = sin6->sin6_addr;
11390 } else {
11391 ipaddr_t addr;
11392
11393 addr = sin->sin_addr.s_addr;
11394 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
11395 addrlen += IPV6_ABITS - IP_ABITS;
11396 }
11397 (void) ip_plen_to_mask_v6(addrlen, &v6mask);
11398
11399 return (ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, B_TRUE));
11400 }
11401
11402 /* ARGSUSED */
11403 int
11404 ip_sioctl_get_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11405 ip_ioctl_cmd_t *ipip, void *if_req)
11406 {
11407 struct lifreq *lifr = (struct lifreq *)if_req;
11408 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin;
11409
11410 ip1dbg(("ip_sioctl_get_subnet(%s:%u %p)\n",
11411 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11412 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
11413
11414 if (ipif->ipif_isv6) {
11415 *sin6 = sin6_null;
11416 sin6->sin6_family = AF_INET6;
11417 sin6->sin6_addr = ipif->ipif_v6subnet;
11418 lifr->lifr_addrlen =
11419 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
11420 } else {
11421 *sin = sin_null;
11422 sin->sin_family = AF_INET;
11423 sin->sin_addr.s_addr = ipif->ipif_subnet;
11424 lifr->lifr_addrlen = ip_mask_to_plen(ipif->ipif_net_mask);
11425 }
11426 return (0);
11427 }
11428
11429 /*
11430 * Set the IPv6 address token.
11431 */
11432 /* ARGSUSED */
11433 int
11434 ip_sioctl_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11435 ip_ioctl_cmd_t *ipi, void *if_req)
11436 {
11437 ill_t *ill = ipif->ipif_ill;
11438 int err;
11439 in6_addr_t v6addr;
11440 in6_addr_t v6mask;
11441 boolean_t need_up = B_FALSE;
11442 int i;
11443 sin6_t *sin6 = (sin6_t *)sin;
11444 struct lifreq *lifr = (struct lifreq *)if_req;
11445 int addrlen;
11446
11447 ip1dbg(("ip_sioctl_token(%s:%u %p)\n",
11448 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11449 ASSERT(IAM_WRITER_IPIF(ipif));
11450
11451 addrlen = lifr->lifr_addrlen;
	/* Only allow on logical unit zero, i.e., not on "le0:17" */
11453 if (ipif->ipif_id != 0)
11454 return (EINVAL);
11455
11456 if (!ipif->ipif_isv6)
11457 return (EINVAL);
11458
11459 if (addrlen > IPV6_ABITS)
11460 return (EINVAL);
11461
11462 v6addr = sin6->sin6_addr;
11463
11464 /*
	 * The length of the token is measured from the low-order end. To
	 * get the proper mask for it, compute the mask of the bits not in
	 * the token (i.e., the prefix), and then xor to invert it.
11468 */
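	/*
	 * For example, with a 64-bit token, ip_plen_to_mask_v6(64) yields
	 * the prefix mask ffff:ffff:ffff:ffff::, and the xor below inverts
	 * it to ::ffff:ffff:ffff:ffff, covering just the token bits.
	 */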
11469 if (ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask) == NULL)
11470 return (EINVAL);
11471 for (i = 0; i < 4; i++) {
11472 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff;
11473 }
11474
11475 if (V6_MASK_EQ(v6addr, v6mask, ill->ill_token) &&
11476 ill->ill_token_length == addrlen)
11477 return (0); /* No change */
11478
11479 if (ipif->ipif_flags & IPIF_UP) {
11480 err = ipif_logical_down(ipif, q, mp);
11481 if (err == EINPROGRESS)
11482 return (err);
11483 (void) ipif_down_tail(ipif);
11484 need_up = B_TRUE;
11485 }
11486 err = ip_sioctl_token_tail(ipif, sin6, addrlen, q, mp, need_up);
11487 return (err);
11488 }
11489
11490 static int
11491 ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, queue_t *q,
11492 mblk_t *mp, boolean_t need_up)
11493 {
11494 in6_addr_t v6addr;
11495 in6_addr_t v6mask;
11496 ill_t *ill = ipif->ipif_ill;
11497 int i;
11498 int err = 0;
11499
11500 ip1dbg(("ip_sioctl_token_tail(%s:%u %p)\n",
11501 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11502 v6addr = sin6->sin6_addr;
11503 /*
	 * The length of the token is measured from the low-order end. To
	 * get the proper mask for it, compute the mask of the bits not in
	 * the token (i.e., the prefix), and then xor to invert it.
11507 */
11508 (void) ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask);
11509 for (i = 0; i < 4; i++)
11510 v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff;
11511
11512 mutex_enter(&ill->ill_lock);
11513 V6_MASK_COPY(v6addr, v6mask, ill->ill_token);
11514 ill->ill_token_length = addrlen;
11515 ill->ill_manual_token = 1;
11516
11517 /* Reconfigure the link-local address based on this new token */
11518 ipif_setlinklocal(ill->ill_ipif);
11519
11520 mutex_exit(&ill->ill_lock);
11521
11522 if (need_up) {
11523 /*
11524 * Now bring the interface back up. If this
11525 * is the only IPIF for the ILL, ipif_up
11526 * will have to re-bind to the device, so
11527 * we may get back EINPROGRESS, in which
11528 * case, this IOCTL will get completed in
11529 * ip_rput_dlpi when we see the DL_BIND_ACK.
11530 */
11531 err = ipif_up(ipif, q, mp);
11532 if (err == EINPROGRESS)
11533 return (err);
11534 }
11535 return (err);
11536 }
11537
11538 /* ARGSUSED */
11539 int
11540 ip_sioctl_get_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11541 ip_ioctl_cmd_t *ipi, void *if_req)
11542 {
11543 ill_t *ill;
11544 sin6_t *sin6 = (sin6_t *)sin;
11545 struct lifreq *lifr = (struct lifreq *)if_req;
11546
11547 ip1dbg(("ip_sioctl_get_token(%s:%u %p)\n",
11548 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11549 if (ipif->ipif_id != 0)
11550 return (EINVAL);
11551
11552 ill = ipif->ipif_ill;
11553 if (!ill->ill_isv6)
11554 return (ENXIO);
11555
11556 *sin6 = sin6_null;
11557 sin6->sin6_family = AF_INET6;
11558 ASSERT(!IN6_IS_ADDR_V4MAPPED(&ill->ill_token));
11559 sin6->sin6_addr = ill->ill_token;
11560 lifr->lifr_addrlen = ill->ill_token_length;
11561 return (0);
11562 }
11563
11564 /*
11565 * Set (hardware) link specific information that might override
11566 * what was acquired through the DL_INFO_ACK.
11567 */
11568 /* ARGSUSED */
11569 int
11570 ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11571 ip_ioctl_cmd_t *ipi, void *if_req)
11572 {
11573 ill_t *ill = ipif->ipif_ill;
11574 int ip_min_mtu;
11575 struct lifreq *lifr = (struct lifreq *)if_req;
11576 lif_ifinfo_req_t *lir;
11577
11578 ip1dbg(("ip_sioctl_lnkinfo(%s:%u %p)\n",
11579 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11580 lir = &lifr->lifr_ifinfo;
11581 ASSERT(IAM_WRITER_IPIF(ipif));
11582
	/* Only allow on logical unit zero, i.e., not on "bge0:17" */
11584 if (ipif->ipif_id != 0)
11585 return (EINVAL);
11586
11587 /* Set interface MTU. */
11588 if (ipif->ipif_isv6)
11589 ip_min_mtu = IPV6_MIN_MTU;
11590 else
11591 ip_min_mtu = IP_MIN_MTU;
11592
11593 /*
11594 * Verify values before we set anything. Allow zero to
11595 * mean unspecified.
11596 *
11597 * XXX We should be able to set the user-defined lir_mtu to some value
	 * that is greater than ill_current_frag but less than ill_max_frag; the
11599 * ill_max_frag value tells us the max MTU that can be handled by the
11600 * datalink, whereas the ill_current_frag is dynamically computed for
11601 * some link-types like tunnels, based on the tunnel PMTU. However,
11602 * since there is currently no way of distinguishing between
11603 * administratively fixed link mtu values (e.g., those set via
11604 * /sbin/dladm) and dynamically discovered MTUs (e.g., those discovered
11605 * for tunnels) we conservatively choose the ill_current_frag as the
11606 * upper-bound.
11607 */
11608 if (lir->lir_maxmtu != 0 &&
11609 (lir->lir_maxmtu > ill->ill_current_frag ||
11610 lir->lir_maxmtu < ip_min_mtu))
11611 return (EINVAL);
11612 if (lir->lir_reachtime != 0 &&
11613 lir->lir_reachtime > ND_MAX_REACHTIME)
11614 return (EINVAL);
11615 if (lir->lir_reachretrans != 0 &&
11616 lir->lir_reachretrans > ND_MAX_REACHRETRANSTIME)
11617 return (EINVAL);
11618
11619 mutex_enter(&ill->ill_lock);
11620 /*
11621 * The dce and fragmentation code can handle changes to ill_mtu
11622 * concurrent with sending/fragmenting packets.
11623 */
11624 if (lir->lir_maxmtu != 0)
11625 ill->ill_user_mtu = lir->lir_maxmtu;
11626
11627 if (lir->lir_reachtime != 0)
11628 ill->ill_reachable_time = lir->lir_reachtime;
11629
11630 if (lir->lir_reachretrans != 0)
11631 ill->ill_reachable_retrans_time = lir->lir_reachretrans;
11632
11633 ill->ill_max_hops = lir->lir_maxhops;
11634 ill->ill_max_buf = ND_MAX_Q;
11635 if (!(ill->ill_flags & ILLF_FIXEDMTU) && ill->ill_user_mtu != 0) {
11636 /*
11637 * ill_mtu is the actual interface MTU, obtained as the min
11638 * of user-configured mtu and the value announced by the
11639 * driver (via DL_NOTE_SDU_SIZE/DL_INFO_ACK). Note that since
	 * we have already required that
	 * ill_user_mtu <= ill_current_frag by the time we get here,
11642 * the ill_mtu effectively gets assigned to the ill_user_mtu
11643 * here.
11644 */
11645 ill->ill_mtu = MIN(ill->ill_current_frag, ill->ill_user_mtu);
11646 ill->ill_mc_mtu = MIN(ill->ill_mc_mtu, ill->ill_user_mtu);
11647 }
11648 mutex_exit(&ill->ill_lock);
11649
11650 /*
11651 * Make sure all dce_generation checks find out
11652 * that ill_mtu/ill_mc_mtu has changed.
11653 */
11654 if (!(ill->ill_flags & ILLF_FIXEDMTU) && (lir->lir_maxmtu != 0))
11655 dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst);
11656
11657 /*
11658 * Refresh IPMP meta-interface MTU if necessary.
11659 */
11660 if (IS_UNDER_IPMP(ill))
11661 ipmp_illgrp_refresh_mtu(ill->ill_grp);
11662
11663 return (0);
11664 }
11665
11666 /* ARGSUSED */
11667 int
11668 ip_sioctl_get_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11669 ip_ioctl_cmd_t *ipi, void *if_req)
11670 {
11671 struct lif_ifinfo_req *lir;
11672 ill_t *ill = ipif->ipif_ill;
11673
11674 ip1dbg(("ip_sioctl_get_lnkinfo(%s:%u %p)\n",
11675 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11676 if (ipif->ipif_id != 0)
11677 return (EINVAL);
11678
11679 lir = &((struct lifreq *)if_req)->lifr_ifinfo;
11680 lir->lir_maxhops = ill->ill_max_hops;
11681 lir->lir_reachtime = ill->ill_reachable_time;
11682 lir->lir_reachretrans = ill->ill_reachable_retrans_time;
11683 lir->lir_maxmtu = ill->ill_mtu;
11684
11685 return (0);
11686 }
11687
11688 /*
11689 * Return best guess as to the subnet mask for the specified address.
11690 * Based on the subnet masks for all the configured interfaces.
11691 *
 * We end up returning a zero mask in the case of default, multicast or
 * experimental addresses.
11694 */
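/*
 * For example, for 192.168.5.7, ip_net_mask() supplies a classful
 * 255.255.255.0 first guess; if an UP IPv4 ipif's subnet matches
 * 192.168.5.0 under that mask, its configured netmask is returned
 * instead.
 */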
11695 static ipaddr_t
11696 ip_subnet_mask(ipaddr_t addr, ipif_t **ipifp, ip_stack_t *ipst)
11697 {
11698 ipaddr_t net_mask;
11699 ill_t *ill;
11700 ipif_t *ipif;
11701 ill_walk_context_t ctx;
11702 ipif_t *fallback_ipif = NULL;
11703
11704 net_mask = ip_net_mask(addr);
11705 if (net_mask == 0) {
11706 *ipifp = NULL;
11707 return (0);
11708 }
11709
	/*
	 * Check to see if this is maybe a local subnet route. Note that
	 * this function only applies to IPv4 interfaces.
	 */
11712 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
11713 ill = ILL_START_WALK_V4(&ctx, ipst);
11714 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
11715 mutex_enter(&ill->ill_lock);
11716 for (ipif = ill->ill_ipif; ipif != NULL;
11717 ipif = ipif->ipif_next) {
11718 if (IPIF_IS_CONDEMNED(ipif))
11719 continue;
11720 if (!(ipif->ipif_flags & IPIF_UP))
11721 continue;
11722 if ((ipif->ipif_subnet & net_mask) ==
11723 (addr & net_mask)) {
11724 /*
11725 * Don't trust pt-pt interfaces if there are
11726 * other interfaces.
11727 */
11728 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
11729 if (fallback_ipif == NULL) {
11730 ipif_refhold_locked(ipif);
11731 fallback_ipif = ipif;
11732 }
11733 continue;
11734 }
11735
11736 /*
11737 * Fine. Just assume the same net mask as the
11738 * directly attached subnet interface is using.
11739 */
11740 ipif_refhold_locked(ipif);
11741 mutex_exit(&ill->ill_lock);
11742 rw_exit(&ipst->ips_ill_g_lock);
11743 if (fallback_ipif != NULL)
11744 ipif_refrele(fallback_ipif);
11745 *ipifp = ipif;
11746 return (ipif->ipif_net_mask);
11747 }
11748 }
11749 mutex_exit(&ill->ill_lock);
11750 }
11751 rw_exit(&ipst->ips_ill_g_lock);
11752
11753 *ipifp = fallback_ipif;
11754 return ((fallback_ipif != NULL) ?
11755 fallback_ipif->ipif_net_mask : net_mask);
11756 }
11757
11758 /*
11759 * ip_sioctl_copyin_setup calls ip_wput_ioctl to process the IP_IOCTL ioctl.
11760 */
11761 static void
11762 ip_wput_ioctl(queue_t *q, mblk_t *mp)
11763 {
11764 IOCP iocp;
11765 ipft_t *ipft;
11766 ipllc_t *ipllc;
11767 mblk_t *mp1;
11768 cred_t *cr;
11769 int error = 0;
11770 conn_t *connp;
11771
11772 ip1dbg(("ip_wput_ioctl"));
11773 iocp = (IOCP)mp->b_rptr;
11774 mp1 = mp->b_cont;
11775 if (mp1 == NULL) {
11776 iocp->ioc_error = EINVAL;
11777 mp->b_datap->db_type = M_IOCNAK;
11778 iocp->ioc_count = 0;
11779 qreply(q, mp);
11780 return;
11781 }
11782
11783 /*
11784 * These IOCTLs provide various control capabilities to
11785 * upstream agents such as ULPs and processes. There
11786 * are currently two such IOCTLs implemented. They
11787 * are used by TCP to provide update information for
11788 * existing IREs and to forcibly delete an IRE for a
11789 * host that is not responding, thereby forcing an
11790 * attempt at a new route.
11791 */
11792 iocp->ioc_error = EINVAL;
11793 if (!pullupmsg(mp1, sizeof (ipllc->ipllc_cmd)))
11794 goto done;
11795
11796 ipllc = (ipllc_t *)mp1->b_rptr;
11797 for (ipft = ip_ioctl_ftbl; ipft->ipft_pfi; ipft++) {
11798 if (ipllc->ipllc_cmd == ipft->ipft_cmd)
11799 break;
11800 }
11801 /*
11802 * prefer credential from mblk over ioctl;
11803 * see ip_sioctl_copyin_setup
11804 */
11805 cr = msg_getcred(mp, NULL);
11806 if (cr == NULL)
11807 cr = iocp->ioc_cr;
11808
11809 /*
	 * Refhold the conn in case the request gets queued up in some lookup.
11811 */
11812 ASSERT(CONN_Q(q));
11813 connp = Q_TO_CONN(q);
11814 CONN_INC_REF(connp);
11815 CONN_INC_IOCTLREF(connp);
11816 if (ipft->ipft_pfi &&
11817 ((mp1->b_wptr - mp1->b_rptr) >= ipft->ipft_min_size ||
11818 pullupmsg(mp1, ipft->ipft_min_size))) {
11819 error = (*ipft->ipft_pfi)(q,
11820 (ipft->ipft_flags & IPFT_F_SELF_REPLY) ? mp : mp1, cr);
11821 }
11822 if (ipft->ipft_flags & IPFT_F_SELF_REPLY) {
11823 /*
11824 * CONN_OPER_PENDING_DONE happens in the function called
11825 * through ipft_pfi above.
11826 */
11827 return;
11828 }
11829
11830 CONN_DEC_IOCTLREF(connp);
11831 CONN_OPER_PENDING_DONE(connp);
11832 if (ipft->ipft_flags & IPFT_F_NO_REPLY) {
11833 freemsg(mp);
11834 return;
11835 }
11836 iocp->ioc_error = error;
11837
11838 done:
11839 mp->b_datap->db_type = M_IOCACK;
11840 if (iocp->ioc_error)
11841 iocp->ioc_count = 0;
11842 qreply(q, mp);
11843 }
11844
11845 /*
 * Assign a unique id for the ipif. This is used by sctp_addr.c.
11847 * Note: remove if sctp_addr.c is redone to not shadow ill/ipif data structures.
11848 */
11849 static void
11850 ipif_assign_seqid(ipif_t *ipif)
11851 {
11852 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
11853
11854 ipif->ipif_seqid = atomic_inc_64_nv(&ipst->ips_ipif_g_seqid);
11855 }
11856
11857 /*
11858 * Clone the contents of `sipif' to `dipif'. Requires that both ipifs are
11859 * administratively down (i.e., no DAD), of the same type, and locked. Note
11860 * that the clone is complete -- including the seqid -- and the expectation is
11861 * that the caller will either free or overwrite `sipif' before it's unlocked.
11862 */
11863 static void
11864 ipif_clone(const ipif_t *sipif, ipif_t *dipif)
11865 {
11866 ASSERT(MUTEX_HELD(&sipif->ipif_ill->ill_lock));
11867 ASSERT(MUTEX_HELD(&dipif->ipif_ill->ill_lock));
11868 ASSERT(!(sipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE)));
11869 ASSERT(!(dipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE)));
11870 ASSERT(sipif->ipif_ire_type == dipif->ipif_ire_type);
11871
11872 dipif->ipif_flags = sipif->ipif_flags;
11873 dipif->ipif_zoneid = sipif->ipif_zoneid;
11874 dipif->ipif_v6subnet = sipif->ipif_v6subnet;
11875 dipif->ipif_v6lcl_addr = sipif->ipif_v6lcl_addr;
11876 dipif->ipif_v6net_mask = sipif->ipif_v6net_mask;
11877 dipif->ipif_v6brd_addr = sipif->ipif_v6brd_addr;
11878 dipif->ipif_v6pp_dst_addr = sipif->ipif_v6pp_dst_addr;
11879
11880 /*
11881 * As per the comment atop the function, we assume that these sipif
11882 * fields will be changed before sipif is unlocked.
11883 */
11884 dipif->ipif_seqid = sipif->ipif_seqid;
11885 dipif->ipif_state_flags = sipif->ipif_state_flags;
11886 }
11887
11888 /*
11889 * Transfer the contents of `sipif' to `dipif', and then free (if `virgipif'
11890 * is NULL) or overwrite `sipif' with `virgipif', which must be a virgin
11891 * (unreferenced) ipif. Also, if `sipif' is used by the current xop, then
11892 * transfer the xop to `dipif'. Requires that all ipifs are administratively
11893 * down (i.e., no DAD), of the same type, and unlocked.
11894 */
11895 static void
11896 ipif_transfer(ipif_t *sipif, ipif_t *dipif, ipif_t *virgipif)
11897 {
11898 ipsq_t *ipsq = sipif->ipif_ill->ill_phyint->phyint_ipsq;
11899 ipxop_t *ipx = ipsq->ipsq_xop;
11900
11901 ASSERT(sipif != dipif);
11902 ASSERT(sipif != virgipif);
11903
11904 /*
11905 * Grab all of the locks that protect the ipif in a defined order.
11906 */
11907 GRAB_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill);
11908
11909 ipif_clone(sipif, dipif);
11910 if (virgipif != NULL) {
11911 ipif_clone(virgipif, sipif);
11912 mi_free(virgipif);
11913 }
11914
11915 RELEASE_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill);
11916
11917 /*
11918 * Transfer ownership of the current xop, if necessary.
11919 */
11920 if (ipx->ipx_current_ipif == sipif) {
11921 ASSERT(ipx->ipx_pending_ipif == NULL);
11922 mutex_enter(&ipx->ipx_lock);
11923 ipx->ipx_current_ipif = dipif;
11924 mutex_exit(&ipx->ipx_lock);
11925 }
11926
11927 if (virgipif == NULL)
11928 mi_free(sipif);
11929 }
11930
11931 /*
 * Checks that:
 * - the name <ill_name>:<ipif_id> is at most LIFNAMSIZ - 1 characters, and
 * - the logical interface id is within the allowed range.
11935 */
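/*
 * For example, for an ill named "bge0" and ipif_id 17, this checks that
 * "bge0:17" fits within LIFNAMSIZ - 1 bytes and that 17 is below the
 * ips_ip_addrs_per_if limit.
 */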
11936 static int
11937 is_lifname_valid(ill_t *ill, unsigned int ipif_id)
11938 {
11939 if (snprintf(NULL, 0, "%s:%d", ill->ill_name, ipif_id) >= LIFNAMSIZ)
11940 return (ENAMETOOLONG);
11941
11942 if (ipif_id >= ill->ill_ipst->ips_ip_addrs_per_if)
11943 return (ERANGE);
11944 return (0);
11945 }
11946
11947 /*
11948 * Insert the ipif, so that the list of ipifs on the ill will be sorted
11949 * with respect to ipif_id. Note that an ipif with an ipif_id of -1 will
11950 * be inserted into the first space available in the list. The value of
11951 * ipif_id will then be set to the appropriate value for its position.
11952 */
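/*
 * For example, if an ill already has ipifs with ids 0, 1 and 3, an ipif
 * passed in with ipif_id == -1 is linked in between 1 and 3 and is
 * assigned ipif_id 2.
 */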
11953 static int
11954 ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock)
11955 {
11956 ill_t *ill;
11957 ipif_t *tipif;
11958 ipif_t **tipifp;
11959 int id, err;
11960 ip_stack_t *ipst;
11961
11962 ASSERT(ipif->ipif_ill->ill_net_type == IRE_LOOPBACK ||
11963 IAM_WRITER_IPIF(ipif));
11964
11965 ill = ipif->ipif_ill;
11966 ASSERT(ill != NULL);
11967 ipst = ill->ill_ipst;
11968
11969 /*
11970 * In the case of lo0:0 we already hold the ill_g_lock.
11971 * ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate ->
11972 * ipif_insert.
11973 */
11974 if (acquire_g_lock)
11975 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
11976 mutex_enter(&ill->ill_lock);
11977 id = ipif->ipif_id;
11978 tipifp = &(ill->ill_ipif);
11979 if (id == -1) { /* need to find a real id */
11980 id = 0;
11981 while ((tipif = *tipifp) != NULL) {
11982 ASSERT(tipif->ipif_id >= id);
11983 if (tipif->ipif_id != id)
11984 break; /* non-consecutive id */
11985 id++;
11986 tipifp = &(tipif->ipif_next);
11987 }
11988 if ((err = is_lifname_valid(ill, id)) != 0) {
11989 mutex_exit(&ill->ill_lock);
11990 if (acquire_g_lock)
11991 rw_exit(&ipst->ips_ill_g_lock);
11992 return (err);
11993 }
11994 ipif->ipif_id = id; /* assign new id */
11995 } else if ((err = is_lifname_valid(ill, id)) == 0) {
11996 /* we have a real id; insert ipif in the right place */
11997 while ((tipif = *tipifp) != NULL) {
11998 ASSERT(tipif->ipif_id != id);
11999 if (tipif->ipif_id > id)
12000 break; /* found correct location */
12001 tipifp = &(tipif->ipif_next);
12002 }
12003 } else {
12004 mutex_exit(&ill->ill_lock);
12005 if (acquire_g_lock)
12006 rw_exit(&ipst->ips_ill_g_lock);
12007 return (err);
12008 }
12009
12010 ASSERT(tipifp != &(ill->ill_ipif) || id == 0);
12011
12012 ipif->ipif_next = tipif;
12013 *tipifp = ipif;
12014 mutex_exit(&ill->ill_lock);
12015 if (acquire_g_lock)
12016 rw_exit(&ipst->ips_ill_g_lock);
12017
12018 return (0);
12019 }
12020
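/*
 * Unlink `ipif' from its ill's list of ipifs. The caller must hold
 * ill_g_lock as writer (asserted below).
 */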
12021 static void
12022 ipif_remove(ipif_t *ipif)
12023 {
12024 ipif_t **ipifp;
12025 ill_t *ill = ipif->ipif_ill;
12026
12027 ASSERT(RW_WRITE_HELD(&ill->ill_ipst->ips_ill_g_lock));
12028
12029 mutex_enter(&ill->ill_lock);
12030 ipifp = &ill->ill_ipif;
12031 for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) {
12032 if (*ipifp == ipif) {
12033 *ipifp = ipif->ipif_next;
12034 break;
12035 }
12036 }
12037 mutex_exit(&ill->ill_lock);
12038 }
12039
12040 /*
12041 * Allocate and initialize a new interface control structure. (Always
12042 * called as writer.)
12043 * When ipif_allocate() is called from ip_ll_subnet_defaults, the ill
12044 * is not part of the global linked list of ills. ipif_seqid is unique
12045 * in the system and to preserve the uniqueness, it is assigned only
12046 * when ill becomes part of the global list. At that point ill will
12047 * have a name. If it doesn't get assigned here, it will get assigned
12048 * in ipif_set_values() as part of SIOCSLIFNAME processing.
 * Additionally, if we come here from ip_ll_subnet_defaults, we don't set
12050 * the interface flags or any other information from the DL_INFO_ACK for
12051 * DL_STYLE2 drivers (initialize == B_FALSE), since we won't have them at
12052 * this point. The flags etc. will be set in ip_ll_subnet_defaults when the
12053 * second DL_INFO_ACK comes in from the driver.
12054 */
12055 static ipif_t *
12056 ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize,
12057 boolean_t insert, int *errorp)
12058 {
12059 int err;
12060 ipif_t *ipif;
12061 ip_stack_t *ipst = ill->ill_ipst;
12062
12063 ip1dbg(("ipif_allocate(%s:%d ill %p)\n",
12064 ill->ill_name, id, (void *)ill));
12065 ASSERT(ire_type == IRE_LOOPBACK || IAM_WRITER_ILL(ill));
12066
12067 if (errorp != NULL)
12068 *errorp = 0;
12069
12070 if ((ipif = mi_alloc(sizeof (ipif_t), BPRI_MED)) == NULL) {
12071 if (errorp != NULL)
12072 *errorp = ENOMEM;
12073 return (NULL);
12074 }
12075 *ipif = ipif_zero; /* start clean */
12076
12077 ipif->ipif_ill = ill;
12078 ipif->ipif_id = id; /* could be -1 */
12079 /*
12080 * Inherit the zoneid from the ill; for the shared stack instance
12081 * this is always the global zone
12082 */
12083 ipif->ipif_zoneid = ill->ill_zoneid;
12084
12085 ipif->ipif_refcnt = 0;
12086
12087 if (insert) {
12088 if ((err = ipif_insert(ipif, ire_type != IRE_LOOPBACK)) != 0) {
12089 mi_free(ipif);
12090 if (errorp != NULL)
12091 *errorp = err;
12092 return (NULL);
12093 }
12094 /* -1 id should have been replaced by real id */
12095 id = ipif->ipif_id;
12096 ASSERT(id >= 0);
12097 }
12098
12099 if (ill->ill_name[0] != '\0')
12100 ipif_assign_seqid(ipif);
12101
12102 /*
12103 * If this is the zeroth ipif on the IPMP ill, create the illgrp
12104 * (which must not exist yet because the zeroth ipif is created once
 * per ill). However, do not link it to the ipmp_grp_t until
12106 * I_PLINK is called; see ip_sioctl_plink_ipmp() for details.
12107 */
12108 if (id == 0 && IS_IPMP(ill)) {
12109 if (ipmp_illgrp_create(ill) == NULL) {
12110 if (insert) {
12111 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
12112 ipif_remove(ipif);
12113 rw_exit(&ipst->ips_ill_g_lock);
12114 }
12115 mi_free(ipif);
12116 if (errorp != NULL)
12117 *errorp = ENOMEM;
12118 return (NULL);
12119 }
12120 }
12121
12122 /*
12123 * We grab ill_lock to protect the flag changes. The ipif is still
12124 * not up and can't be looked up until the ioctl completes and the
12125 * IPIF_CHANGING flag is cleared.
12126 */
12127 mutex_enter(&ill->ill_lock);
12128
12129 ipif->ipif_ire_type = ire_type;
12130
12131 if (ipif->ipif_isv6) {
12132 ill->ill_flags |= ILLF_IPV6;
12133 } else {
12134 ipaddr_t inaddr_any = INADDR_ANY;
12135
12136 ill->ill_flags |= ILLF_IPV4;
12137
12138 /* Keep the IN6_IS_ADDR_V4MAPPED assertions happy */
12139 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
12140 &ipif->ipif_v6lcl_addr);
12141 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
12142 &ipif->ipif_v6subnet);
12143 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
12144 &ipif->ipif_v6net_mask);
12145 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
12146 &ipif->ipif_v6brd_addr);
12147 IN6_IPADDR_TO_V4MAPPED(inaddr_any,
12148 &ipif->ipif_v6pp_dst_addr);
12149 }
12150
12151 /*
12152 * Don't set the interface flags etc. now, will do it in
12153 * ip_ll_subnet_defaults.
12154 */
12155 if (!initialize)
12156 goto out;
12157
12158 /*
12159 * NOTE: The IPMP meta-interface is special-cased because it starts
12160 * with no underlying interfaces (and thus an unknown broadcast
12161 * address length), but all interfaces that can be placed into an IPMP
12162 * group are required to be broadcast-capable.
12163 */
12164 if (ill->ill_bcast_addr_length != 0 || IS_IPMP(ill)) {
12165 /*
12166 * Later detect lack of DLPI driver multicast capability by
12167 * catching DL_ENABMULTI_REQ errors in ip_rput_dlpi().
12168 */
12169 ill->ill_flags |= ILLF_MULTICAST;
12170 if (!ipif->ipif_isv6)
12171 ipif->ipif_flags |= IPIF_BROADCAST;
12172 } else {
12173 if (ill->ill_net_type != IRE_LOOPBACK) {
12174 if (ipif->ipif_isv6)
12175 /*
12176 * Note: xresolv interfaces will eventually need
12177 * NOARP set here as well, but that will require
12178 * those external resolvers to have some
12179 * knowledge of that flag and act appropriately.
12180 * Not to be changed at present.
12181 */
12182 ill->ill_flags |= ILLF_NONUD;
12183 else
12184 ill->ill_flags |= ILLF_NOARP;
12185 }
12186 if (ill->ill_phys_addr_length == 0) {
12187 if (IS_VNI(ill)) {
12188 ipif->ipif_flags |= IPIF_NOXMIT;
12189 } else {
12190 /* pt-pt supports multicast. */
12191 ill->ill_flags |= ILLF_MULTICAST;
12192 if (ill->ill_net_type != IRE_LOOPBACK)
12193 ipif->ipif_flags |= IPIF_POINTOPOINT;
12194 }
12195 }
12196 }
12197 out:
12198 mutex_exit(&ill->ill_lock);
12199 return (ipif);
12200 }
12201
12202 /*
12203 * Remove the neighbor cache entries associated with this logical
12204 * interface.
12205 */
12206 int
12207 ipif_arp_down(ipif_t *ipif)
12208 {
12209 ill_t *ill = ipif->ipif_ill;
12210 int err = 0;
12211
12212 ip1dbg(("ipif_arp_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
12213 ASSERT(IAM_WRITER_IPIF(ipif));
12214
12215 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_down",
12216 ill_t *, ill, ipif_t *, ipif);
12217 ipif_nce_down(ipif);
12218
12219 /*
12220 * If this is the last ipif that is going down and there are no
12221 * duplicate addresses we may yet attempt to re-probe, then we need to
12222 * clean up ARP completely.
12223 */
12224 if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 &&
12225 !ill->ill_logical_down && ill->ill_net_type == IRE_IF_RESOLVER) {
12226 /*
12227 * If this was the last ipif on an IPMP interface, purge any
12228 * static ARP entries associated with it.
12229 */
12230 if (IS_IPMP(ill))
12231 ipmp_illgrp_refresh_arpent(ill->ill_grp);
12232
12233 /* UNBIND, DETACH */
12234 err = arp_ll_down(ill);
12235 }
12236
12237 return (err);
12238 }
12239
12240 /*
12241 * Get the resolver set up for a new IP address. (Always called as writer.)
12242 * Called both for IPv4 and IPv6 interfaces, though it only does some
12243 * basic DAD related initialization for IPv6. Honors ILLF_NOARP.
12244 *
12245 * The enumerated value res_act tunes the behavior:
12246 * * Res_act_initial: set up all the resolver structures for a new
12247 * IP address.
12248 * * Res_act_defend: tell ARP that it needs to send a single gratuitous
12249 * ARP message in defense of the address.
12250 * * Res_act_rebind: tell ARP to change the hardware address for an IP
12251 * address (and issue gratuitous ARPs). Used by ipmp_ill_bind_ipif().
12252 *
12253 * Returns zero on success, or an errno upon failure.
12254 */
12255 int
12256 ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act)
12257 {
12258 ill_t *ill = ipif->ipif_ill;
12259 int err;
12260 boolean_t was_dup;
12261
12262 ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n",
12263 ill->ill_name, ipif->ipif_id, (uint_t)ipif->ipif_flags));
12264 ASSERT(IAM_WRITER_IPIF(ipif));
12265
12266 was_dup = B_FALSE;
12267 if (res_act == Res_act_initial) {
12268 ipif->ipif_addr_ready = 0;
12269 /*
12270 * We're bringing an interface up here. There's no way that we
12271 * should need to shut down ARP now.
12272 */
12273 mutex_enter(&ill->ill_lock);
12274 if (ipif->ipif_flags & IPIF_DUPLICATE) {
12275 ipif->ipif_flags &= ~IPIF_DUPLICATE;
12276 ill->ill_ipif_dup_count--;
12277 was_dup = B_TRUE;
12278 }
12279 mutex_exit(&ill->ill_lock);
12280 }
12281 if (ipif->ipif_recovery_id != 0)
12282 (void) untimeout(ipif->ipif_recovery_id);
12283 ipif->ipif_recovery_id = 0;
12284 if (ill->ill_net_type != IRE_IF_RESOLVER) {
12285 ipif->ipif_addr_ready = 1;
12286 return (0);
12287 }
12288 /* NDP will set the ipif_addr_ready flag when it's ready */
12289 if (ill->ill_isv6)
12290 return (0);
12291
12292 err = ipif_arp_up(ipif, res_act, was_dup);
12293 return (err);
12294 }
12295
12296 /*
12297 * This routine restarts IPv4/IPv6 duplicate address detection (DAD)
12298 * when a link has just gone back up.
12299 */
12300 static void
12301 ipif_nce_start_dad(ipif_t *ipif)
12302 {
12303 ncec_t *ncec;
12304 ill_t *ill = ipif->ipif_ill;
12305 boolean_t isv6 = ill->ill_isv6;
12306
12307 if (isv6) {
12308 ncec = ncec_lookup_illgrp_v6(ipif->ipif_ill,
12309 &ipif->ipif_v6lcl_addr);
12310 } else {
12311 ipaddr_t v4addr;
12312
12313 if (ill->ill_net_type != IRE_IF_RESOLVER ||
12314 (ipif->ipif_flags & IPIF_UNNUMBERED) ||
12315 ipif->ipif_lcl_addr == INADDR_ANY) {
12316 /*
12317 * If we can't contact ARP for some reason,
12318 * that's not really a problem. Just send
12319 * out the routing socket notification that
12320 * DAD completion would have done, and continue.
12321 */
12322 ipif_mask_reply(ipif);
12323 ipif_up_notify(ipif);
12324 ipif->ipif_addr_ready = 1;
12325 return;
12326 }
12327
12328 IN6_V4MAPPED_TO_IPADDR(&ipif->ipif_v6lcl_addr, v4addr);
12329 ncec = ncec_lookup_illgrp_v4(ipif->ipif_ill, &v4addr);
12330 }
12331
12332 if (ncec == NULL) {
12333 ip1dbg(("couldn't find ncec for ipif %p leaving !ready\n",
12334 (void *)ipif));
12335 return;
12336 }
12337 if (!nce_restart_dad(ncec)) {
12338 /*
12339 * If we can't restart DAD for some reason, that's not really a
12340 * problem. Just send out the routing socket notification that
12341 * DAD completion would have done, and continue.
12342 */
12343 ipif_up_notify(ipif);
12344 ipif->ipif_addr_ready = 1;
12345 }
12346 ncec_refrele(ncec);
12347 }
12348
12349 /*
12350 * Restart duplicate address detection on all interfaces on the given ill.
12351 *
12352 * This is called when an interface transitions from down to up
12353 * (DL_NOTE_LINK_UP) or up to down (DL_NOTE_LINK_DOWN).
12354 *
12355 * Note that since the underlying physical link has transitioned, we must cause
12356 * at least one routing socket message to be sent here, either via DAD
12357 * completion or just by default on the first ipif. (If we don't do this, then
12358 * in.mpathd will see long delays when doing link-based failure recovery.)
12359 */
12360 void
12361 ill_restart_dad(ill_t *ill, boolean_t went_up)
12362 {
12363 ipif_t *ipif;
12364
12365 if (ill == NULL)
12366 return;
12367
12368 /*
12369 * If layer two doesn't support duplicate address detection, then just
12370 * send the routing socket message now and be done with it.
12371 */
12372 if (!ill->ill_isv6 && arp_no_defense) {
12373 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
12374 return;
12375 }
12376
12377 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
12378 if (went_up) {
12379
12380 if (ipif->ipif_flags & IPIF_UP) {
12381 ipif_nce_start_dad(ipif);
12382 } else if (ipif->ipif_flags & IPIF_DUPLICATE) {
12383 /*
12384 * kick off the bring-up process now.
12385 */
12386 ipif_do_recovery(ipif);
12387 } else {
12388 /*
12389 * Unfortunately, the first ipif is "special"
12390 * and represents the underlying ill in the
12391 * routing socket messages. Thus, when this
12392 * one ipif is down, we must still notify so
12393 * that the user knows the IFF_RUNNING status
12394 * change. (If the first ipif is up, then
12395 * we'll handle eventual routing socket
12396 * notification via DAD completion.)
12397 */
12398 if (ipif == ill->ill_ipif) {
12399 ip_rts_ifmsg(ill->ill_ipif,
12400 RTSQ_DEFAULT);
12401 }
12402 }
12403 } else {
12404 /*
12405 * After link down, we'll need to send a new routing
12406 * message when the link comes back, so clear
12407 * ipif_addr_ready.
12408 */
12409 ipif->ipif_addr_ready = 0;
12410 }
12411 }
12412
12413 /*
12414 * If we've torn down links, then notify the user right away.
12415 */
12416 if (!went_up)
12417 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
12418 }
12419
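/*
 * Free an ipsq that is no longer referenced by any phyint. The ASSERTs
 * verify that no messages or operations remain queued on it or its xop.
 */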
12420 static void
12421 ipsq_delete(ipsq_t *ipsq)
12422 {
12423 ipxop_t *ipx = ipsq->ipsq_xop;
12424
12425 ipsq->ipsq_ipst = NULL;
12426 ASSERT(ipsq->ipsq_phyint == NULL);
12427 ASSERT(ipsq->ipsq_xop != NULL);
12428 ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipx->ipx_mphead == NULL);
12429 ASSERT(ipx->ipx_pending_mp == NULL);
12430 kmem_free(ipsq, sizeof (ipsq_t));
12431 }
12432
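/*
 * Bring up, on a single ill, the ipifs that ill_down_ipifs() marked with
 * ipif_was_up. Returns EINPROGRESS if an ipif_up() must wait for a DLPI
 * acknowledgement, and zero once all such ipifs have been processed.
 */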
12433 static int
12434 ill_up_ipifs_on_ill(ill_t *ill, queue_t *q, mblk_t *mp)
12435 {
12436 int err = 0;
12437 ipif_t *ipif;
12438
12439 if (ill == NULL)
12440 return (0);
12441
12442 ASSERT(IAM_WRITER_ILL(ill));
12443 ill->ill_up_ipifs = B_TRUE;
12444 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
12445 if (ipif->ipif_was_up) {
12446 if (!(ipif->ipif_flags & IPIF_UP))
12447 err = ipif_up(ipif, q, mp);
12448 ipif->ipif_was_up = B_FALSE;
12449 if (err != 0) {
12450 ASSERT(err == EINPROGRESS);
12451 return (err);
12452 }
12453 }
12454 }
12455 ill->ill_up_ipifs = B_FALSE;
12456 return (0);
12457 }
12458
12459 /*
12460 * This function is called to bring up all the ipifs that were up before
12461 * bringing the ill down via ill_down_ipifs().
12462 */
12463 int
12464 ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp)
12465 {
12466 int err;
12467
12468 ASSERT(IAM_WRITER_ILL(ill));
12469
12470 if (ill->ill_replumbing) {
12471 ill->ill_replumbing = 0;
12472 /*
12473 * Send down REPLUMB_DONE notification followed by the
12474 * BIND_REQ on the arp stream.
12475 */
12476 if (!ill->ill_isv6)
12477 arp_send_replumb_conf(ill);
12478 }
12479 err = ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv4, q, mp);
12480 if (err != 0)
12481 return (err);
12482
12483 return (ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv6, q, mp));
12484 }
12485
12486 /*
12487 * Bring down any IPIF_UP ipifs on ill. If "logical" is B_TRUE, we bring
12488 * down the ipifs without sending DL_UNBIND_REQ to the driver.
12489 */
12490 static void
12491 ill_down_ipifs(ill_t *ill, boolean_t logical)
12492 {
12493 ipif_t *ipif;
12494
12495 ASSERT(IAM_WRITER_ILL(ill));
12496
12497 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
12498 /*
12499 * We go through the ipif_down logic even if the ipif
12500 * is already down, since routes can be added based
12501 * on down ipifs. Going through ipif_down once again
12502 * will delete any IREs created based on these routes.
12503 */
12504 if (ipif->ipif_flags & IPIF_UP)
12505 ipif->ipif_was_up = B_TRUE;
12506
12507 if (logical) {
12508 (void) ipif_logical_down(ipif, NULL, NULL);
12509 ipif_non_duplicate(ipif);
12510 (void) ipif_down_tail(ipif);
12511 } else {
12512 (void) ipif_down(ipif, NULL, NULL);
12513 }
12514 }
12515 }
12516
12517 /*
 * Redo source address selection. This makes IXAF_VERIFY_SOURCE take
 * another look at the set of valid source addresses.
12520 * This should be called each time after the set of source addresses has been
12521 * changed.
12522 */
12523 void
12524 ip_update_source_selection(ip_stack_t *ipst)
12525 {
12526 /* We skip past SRC_GENERATION_VERIFY */
12527 if (atomic_inc_32_nv(&ipst->ips_src_generation) ==
12528 SRC_GENERATION_VERIFY)
12529 atomic_inc_32(&ipst->ips_src_generation);
12530 }
12531
12532 /*
12533 * Finish the group join started in ip_sioctl_groupname().
12534 */
12535 /* ARGSUSED */
12536 static void
12537 ip_join_illgrps(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy)
12538 {
12539 ill_t *ill = q->q_ptr;
12540 phyint_t *phyi = ill->ill_phyint;
12541 ipmp_grp_t *grp = phyi->phyint_grp;
12542 ip_stack_t *ipst = ill->ill_ipst;
12543
12544 /* IS_UNDER_IPMP() won't work until ipmp_ill_join_illgrp() is called */
12545 ASSERT(!IS_IPMP(ill) && grp != NULL);
12546 ASSERT(IAM_WRITER_IPSQ(ipsq));
12547
12548 if (phyi->phyint_illv4 != NULL) {
12549 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
12550 VERIFY(grp->gr_pendv4-- > 0);
12551 rw_exit(&ipst->ips_ipmp_lock);
12552 ipmp_ill_join_illgrp(phyi->phyint_illv4, grp->gr_v4);
12553 }
12554 if (phyi->phyint_illv6 != NULL) {
12555 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
12556 VERIFY(grp->gr_pendv6-- > 0);
12557 rw_exit(&ipst->ips_ipmp_lock);
12558 ipmp_ill_join_illgrp(phyi->phyint_illv6, grp->gr_v6);
12559 }
12560 freemsg(mp);
12561 }
12562
12563 /*
12564 * Process an SIOCSLIFGROUPNAME request.
12565 */
12566 /* ARGSUSED */
12567 int
12568 ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12569 ip_ioctl_cmd_t *ipip, void *ifreq)
12570 {
12571 struct lifreq *lifr = ifreq;
12572 ill_t *ill = ipif->ipif_ill;
12573 ip_stack_t *ipst = ill->ill_ipst;
12574 phyint_t *phyi = ill->ill_phyint;
12575 ipmp_grp_t *grp = phyi->phyint_grp;
12576 mblk_t *ipsq_mp;
12577 int err = 0;
12578
12579 /*
12580 * Note that phyint_grp can only change here, where we're exclusive.
12581 */
12582 ASSERT(IAM_WRITER_ILL(ill));
12583
12584 if (ipif->ipif_id != 0 || ill->ill_usesrc_grp_next != NULL ||
12585 (phyi->phyint_flags & PHYI_VIRTUAL))
12586 return (EINVAL);
12587
12588 lifr->lifr_groupname[LIFGRNAMSIZ - 1] = '\0';
12589
12590 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
12591
12592 /*
12593 * If the name hasn't changed, there's nothing to do.
12594 */
12595 if (grp != NULL && strcmp(grp->gr_name, lifr->lifr_groupname) == 0)
12596 goto unlock;
12597
12598 /*
12599 * Handle requests to rename an IPMP meta-interface.
12600 *
12601 * Note that creation of the IPMP meta-interface is handled in
12602 * userland through the standard plumbing sequence. As part of the
12603 * plumbing the IPMP meta-interface, its initial groupname is set to
12604 * the name of the interface (see ipif_set_values_tail()).
12605 */
12606 if (IS_IPMP(ill)) {
12607 err = ipmp_grp_rename(grp, lifr->lifr_groupname);
12608 goto unlock;
12609 }
12610
12611 /*
12612 * Handle requests to add or remove an IP interface from a group.
12613 */
12614 if (lifr->lifr_groupname[0] != '\0') { /* add */
12615 /*
12616 * Moves are handled by first removing the interface from
12617 * its existing group, and then adding it to another group.
12618 * So, fail if it's already in a group.
12619 */
12620 if (IS_UNDER_IPMP(ill)) {
12621 err = EALREADY;
12622 goto unlock;
12623 }
12624
12625 grp = ipmp_grp_lookup(lifr->lifr_groupname, ipst);
12626 if (grp == NULL) {
12627 err = ENOENT;
12628 goto unlock;
12629 }
12630
12631 /*
12632 * Check if the phyint and its ills are suitable for
12633 * inclusion into the group.
12634 */
12635 if ((err = ipmp_grp_vet_phyint(grp, phyi)) != 0)
12636 goto unlock;
12637
12638 /*
12639 * Checks pass; join the group, and enqueue the remaining
12640 * illgrp joins for when we've become part of the group xop
12641 * and are exclusive across its IPSQs. Since qwriter_ip()
12642 * requires an mblk_t to scribble on, and since `mp' will be
12643 * freed as part of completing the ioctl, allocate another.
12644 */
12645 if ((ipsq_mp = allocb(0, BPRI_MED)) == NULL) {
12646 err = ENOMEM;
12647 goto unlock;
12648 }
12649
12650 /*
12651 * Before we drop ipmp_lock, bump gr_pend* to ensure that the
12652 * IPMP meta-interface ills needed by `phyi' cannot go away
12653 * before ip_join_illgrps() is called back. See the comments
12654 * in ip_sioctl_plink_ipmp() for more.
12655 */
12656 if (phyi->phyint_illv4 != NULL)
12657 grp->gr_pendv4++;
12658 if (phyi->phyint_illv6 != NULL)
12659 grp->gr_pendv6++;
12660
12661 rw_exit(&ipst->ips_ipmp_lock);
12662
12663 ipmp_phyint_join_grp(phyi, grp);
12664 ill_refhold(ill);
12665 qwriter_ip(ill, ill->ill_rq, ipsq_mp, ip_join_illgrps,
12666 SWITCH_OP, B_FALSE);
12667 return (0);
12668 } else {
12669 /*
12670 * Request to remove the interface from a group. If the
12671 * interface is not in a group, this trivially succeeds.
12672 */
12673 rw_exit(&ipst->ips_ipmp_lock);
12674 if (IS_UNDER_IPMP(ill))
12675 ipmp_phyint_leave_grp(phyi);
12676 return (0);
12677 }
12678 unlock:
12679 rw_exit(&ipst->ips_ipmp_lock);
12680 return (err);
12681 }
12682
12683 /*
12684 * Process an SIOCGLIFBINDING request.
12685 */
12686 /* ARGSUSED */
12687 int
12688 ip_sioctl_get_binding(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12689 ip_ioctl_cmd_t *ipip, void *ifreq)
12690 {
12691 ill_t *ill;
12692 struct lifreq *lifr = ifreq;
12693 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
12694
12695 if (!IS_IPMP(ipif->ipif_ill))
12696 return (EINVAL);
12697
12698 rw_enter(&ipst->ips_ipmp_lock, RW_READER);
12699 if ((ill = ipif->ipif_bound_ill) == NULL)
12700 lifr->lifr_binding[0] = '\0';
12701 else
12702 (void) strlcpy(lifr->lifr_binding, ill->ill_name, LIFNAMSIZ);
12703 rw_exit(&ipst->ips_ipmp_lock);
12704 return (0);
12705 }
12706
12707 /*
12708 * Process an SIOCGLIFGROUPNAME request.
12709 */
12710 /* ARGSUSED */
12711 int
12712 ip_sioctl_get_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12713 ip_ioctl_cmd_t *ipip, void *ifreq)
12714 {
12715 ipmp_grp_t *grp;
12716 struct lifreq *lifr = ifreq;
12717 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
12718
12719 rw_enter(&ipst->ips_ipmp_lock, RW_READER);
12720 if ((grp = ipif->ipif_ill->ill_phyint->phyint_grp) == NULL)
12721 lifr->lifr_groupname[0] = '\0';
12722 else
12723 (void) strlcpy(lifr->lifr_groupname, grp->gr_name, LIFGRNAMSIZ);
12724 rw_exit(&ipst->ips_ipmp_lock);
12725 return (0);
12726 }
12727
12728 /*
12729 * Process an SIOCGLIFGROUPINFO request.
12730 */
12731 /* ARGSUSED */
12732 int
12733 ip_sioctl_groupinfo(ipif_t *dummy_ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12734 ip_ioctl_cmd_t *ipip, void *dummy)
12735 {
12736 ipmp_grp_t *grp;
12737 lifgroupinfo_t *lifgr;
12738 ip_stack_t *ipst = CONNQ_TO_IPST(q);
12739
12740 /* ip_wput_nondata() verified mp->b_cont->b_cont */
12741 lifgr = (lifgroupinfo_t *)mp->b_cont->b_cont->b_rptr;
12742 lifgr->gi_grname[LIFGRNAMSIZ - 1] = '\0';
12743
12744 rw_enter(&ipst->ips_ipmp_lock, RW_READER);
12745 if ((grp = ipmp_grp_lookup(lifgr->gi_grname, ipst)) == NULL) {
12746 rw_exit(&ipst->ips_ipmp_lock);
12747 return (ENOENT);
12748 }
12749 ipmp_grp_info(grp, lifgr);
12750 rw_exit(&ipst->ips_ipmp_lock);
12751 return (0);
12752 }
12753
12754 static void
12755 ill_dl_down(ill_t *ill)
12756 {
12757 DTRACE_PROBE2(ill__downup, char *, "ill_dl_down", ill_t *, ill);
12758
12759 /*
12760 * The ill is down; unbind but stay attached since we're still
	 * associated with a PPA. If we have negotiated DLPI capabilities
	 * with the data link service provider (IDCS_OK) then reset them.
12763 * The interval between unbinding and rebinding is potentially
12764 * unbounded hence we cannot assume things will be the same.
12765 * The DLPI capabilities will be probed again when the data link
12766 * is brought up.
12767 */
12768 mblk_t *mp = ill->ill_unbind_mp;
12769
12770 ip1dbg(("ill_dl_down(%s)\n", ill->ill_name));
12771
12772 if (!ill->ill_replumbing) {
12773 /* Free all ilms for this ill */
12774 update_conn_ill(ill, ill->ill_ipst);
12775 } else {
12776 ill_leave_multicast(ill);
12777 }
12778
12779 ill->ill_unbind_mp = NULL;
12780
12781 mutex_enter(&ill->ill_lock);
12782 ill->ill_dl_up = 0;
12783 ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0);
12784 mutex_exit(&ill->ill_lock);
12785
12786 if (mp != NULL) {
12787 ip1dbg(("ill_dl_down: %s (%u) for %s\n",
12788 dl_primstr(*(int *)mp->b_rptr), *(int *)mp->b_rptr,
12789 ill->ill_name));
12790 mutex_enter(&ill->ill_lock);
12791 ill->ill_state_flags |= ILL_DL_UNBIND_IN_PROGRESS;
12792 mutex_exit(&ill->ill_lock);
12793 /*
12794 * ip_rput does not pass up normal (M_PROTO) DLPI messages
12795 * after ILL_CONDEMNED is set. So in the unplumb case, we call
	 * ill_capability_dld_disable() right away. If this is not
12797 * an unplumb operation then the disable happens on receipt of
12798 * the capab ack via ip_rput_dlpi_writer ->
12799 * ill_capability_ack_thr. In both cases the order of
12800 * the operations seen by DLD is capability disable followed
12801 * by DL_UNBIND. Also the DLD capability disable needs a
12802 * cv_wait'able context.
12803 */
12804 if (ill->ill_state_flags & ILL_CONDEMNED)
12805 ill_capability_dld_disable(ill);
12806 ill_capability_reset(ill, B_FALSE);
12807 ill_dlpi_send(ill, mp);
12808
12809 /*
12810 * Wait for the capability reset to finish.
12811 * In this case, it doesn't matter WHY or HOW it finished.
12812 */
12813 (void) ill_capability_wait(ill);
12814 }
12815 }
12816
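/*
 * Send a DLPI control message straight to the driver, recording the
 * primitive in ill_dlpi_pending so that ill_dlpi_done() can dispatch the
 * next deferred message when the ACK or NAK arrives. On a condemned ill
 * only DL_UNBIND_REQ is tracked, since ip_rput() drops the other ACKs.
 */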
12817 void
12818 ill_dlpi_dispatch(ill_t *ill, mblk_t *mp)
12819 {
12820 union DL_primitives *dlp;
12821 t_uscalar_t prim;
12822 boolean_t waitack = B_FALSE;
12823
12824 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
12825
12826 dlp = (union DL_primitives *)mp->b_rptr;
12827 prim = dlp->dl_primitive;
12828
12829 ip1dbg(("ill_dlpi_dispatch: sending %s (%u) to %s\n",
12830 dl_primstr(prim), prim, ill->ill_name));
12831
12832 switch (prim) {
12833 case DL_PHYS_ADDR_REQ:
12834 {
12835 dl_phys_addr_req_t *dlpap = (dl_phys_addr_req_t *)mp->b_rptr;
12836 ill->ill_phys_addr_pend = dlpap->dl_addr_type;
12837 break;
12838 }
12839 case DL_BIND_REQ:
12840 mutex_enter(&ill->ill_lock);
12841 ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS;
12842 mutex_exit(&ill->ill_lock);
12843 break;
12844 }
12845
12846 /*
12847 * Except for the ACKs for the M_PCPROTO messages, all other ACKs
12848 * are dropped by ip_rput() if ILL_CONDEMNED is set. Therefore
12849 * we only wait for the ACK of the DL_UNBIND_REQ.
12850 */
12851 mutex_enter(&ill->ill_lock);
12852 if (!(ill->ill_state_flags & ILL_CONDEMNED) ||
12853 (prim == DL_UNBIND_REQ)) {
12854 ill->ill_dlpi_pending = prim;
12855 waitack = B_TRUE;
12856 }
12857
12858 mutex_exit(&ill->ill_lock);
12859 DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_dispatch",
12860 char *, dl_primstr(prim), ill_t *, ill);
12861 putnext(ill->ill_wq, mp);
12862
12863 /*
12864 * There is no ack for DL_NOTIFY_CONF messages; mark them done at once.
12865 */
12866 if (waitack && prim == DL_NOTIFY_CONF)
12867 ill_dlpi_done(ill, prim);
12868 }
12869
12870 /*
12871 * Helper function for ill_dlpi_send().
12872 */
12873 /* ARGSUSED */
12874 static void
12875 ill_dlpi_send_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
12876 {
12877 ill_dlpi_send(q->q_ptr, mp);
12878 }
12879
12880 /*
12881 * Send a DLPI control message to the driver but make sure there
12882 * is only one outstanding message. Uses ill_dlpi_pending to tell
12883 * when it must queue. ip_rput_dlpi_writer calls ill_dlpi_done()
12884 * when an ACK or a NAK is received to process the next queued message.
12885 */
12886 void
12887 ill_dlpi_send(ill_t *ill, mblk_t *mp)
12888 {
12889 mblk_t **mpp;
12890
12891 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
12892
12893 /*
12894 * To ensure that any DLPI requests for current exclusive operation
12895 * are always completely sent before any DLPI messages for other
12896 * operations, require writer access before enqueuing.
12897 */
12898 if (!IAM_WRITER_ILL(ill)) {
12899 ill_refhold(ill);
12900 /* qwriter_ip() does the ill_refrele() */
12901 qwriter_ip(ill, ill->ill_wq, mp, ill_dlpi_send_writer,
12902 NEW_OP, B_TRUE);
12903 return;
12904 }
12905
12906 mutex_enter(&ill->ill_lock);
12907 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) {
12908 /* Must queue message. Tail insertion */
12909 mpp = &ill->ill_dlpi_deferred;
12910 while (*mpp != NULL)
12911 mpp = &((*mpp)->b_next);
12912
12913 ip1dbg(("ill_dlpi_send: deferring request for %s "
12914 "while %s pending\n", ill->ill_name,
12915 dl_primstr(ill->ill_dlpi_pending)));
12916
12917 *mpp = mp;
12918 mutex_exit(&ill->ill_lock);
12919 return;
12920 }
12921 mutex_exit(&ill->ill_lock);
12922 ill_dlpi_dispatch(ill, mp);
12923 }
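/*
 * Illustrative sketch (comment only, not compiled): a typical caller
 * allocates a DLPI request and hands it to ill_dlpi_send(), letting the
 * pending/deferred machinery above take care of ordering.  This assumes
 * the ip_dlpi_alloc() helper used elsewhere in this file:
 *
 *	mblk_t *mp = ip_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ);
 *	if (mp != NULL)
 *		ill_dlpi_send(ill, mp);	(deferred if another prim is pending)
 */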
12924
12925 void
12926 ill_capability_send(ill_t *ill, mblk_t *mp)
12927 {
12928 ill->ill_capab_pending_cnt++;
12929 ill_dlpi_send(ill, mp);
12930 }
12931
12932 void
12933 ill_capability_done(ill_t *ill)
12934 {
12935 ASSERT(ill->ill_capab_pending_cnt != 0);
12936 ASSERT(IAM_WRITER_ILL(ill));
12937
12938 ill_dlpi_done(ill, DL_CAPABILITY_REQ);
12939
12940 ill->ill_capab_pending_cnt--;
12941 if (ill->ill_capab_pending_cnt == 0 &&
12942 ill->ill_dlpi_capab_state == IDCS_OK)
12943 ill_capability_reset_alloc(ill);
12944 }
12945
12946 /*
12947 * Send all deferred DLPI messages without waiting for their ACKs.
12948 */
12949 void
12950 ill_dlpi_send_deferred(ill_t *ill)
12951 {
12952 mblk_t *mp, *nextmp;
12953
12954 /*
12955 * Clear ill_dlpi_pending so that the message is not queued in
12956 * ill_dlpi_send().
12957 */
12958 mutex_enter(&ill->ill_lock);
12959 ill->ill_dlpi_pending = DL_PRIM_INVAL;
12960 mp = ill->ill_dlpi_deferred;
12961 ill->ill_dlpi_deferred = NULL;
12962 mutex_exit(&ill->ill_lock);
12963
12964 for (; mp != NULL; mp = nextmp) {
12965 nextmp = mp->b_next;
12966 mp->b_next = NULL;
12967 ill_dlpi_send(ill, mp);
12968 }
12969 }
12970
12971 /*
12972 * Clear all the deferred DLPI messages. Called on receiving an M_ERROR
12973 * or M_HANGUP
12974 */
12975 static void
12976 ill_dlpi_clear_deferred(ill_t *ill)
12977 {
12978 mblk_t *mp, *nextmp;
12979
12980 mutex_enter(&ill->ill_lock);
12981 ill->ill_dlpi_pending = DL_PRIM_INVAL;
12982 mp = ill->ill_dlpi_deferred;
12983 ill->ill_dlpi_deferred = NULL;
12984 mutex_exit(&ill->ill_lock);
12985
12986 for (; mp != NULL; mp = nextmp) {
12987 nextmp = mp->b_next;
12988 inet_freemsg(mp);
12989 }
12990 }
12991
12992 /*
12993 * Check if the DLPI primitive `prim' is pending; print a warning if not.
12994 */
12995 boolean_t
12996 ill_dlpi_pending(ill_t *ill, t_uscalar_t prim)
12997 {
12998 t_uscalar_t pending;
12999
13000 mutex_enter(&ill->ill_lock);
13001 if (ill->ill_dlpi_pending == prim) {
13002 mutex_exit(&ill->ill_lock);
13003 return (B_TRUE);
13004 }
13005
13006 /*
13007 * During teardown, ill_dlpi_dispatch() will send DLPI requests
13008 * without waiting, so don't print any warnings in that case.
13009 */
13010 if (ill->ill_state_flags & ILL_CONDEMNED) {
13011 mutex_exit(&ill->ill_lock);
13012 return (B_FALSE);
13013 }
13014 pending = ill->ill_dlpi_pending;
13015 mutex_exit(&ill->ill_lock);
13016
13017 if (pending == DL_PRIM_INVAL) {
13018 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE,
13019 "received unsolicited ack for %s on %s\n",
13020 dl_primstr(prim), ill->ill_name);
13021 } else {
13022 (void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE,
13023 "received unexpected ack for %s on %s (expecting %s)\n",
13024 dl_primstr(prim), ill->ill_name, dl_primstr(pending));
13025 }
13026 return (B_FALSE);
13027 }
13028
13029 /*
13030 * Complete the current DLPI operation associated with `prim' on `ill' and
13031 * start the next queued DLPI operation (if any). If there are no queued DLPI
13032 * operations and the ill's current exclusive IPSQ operation has finished
13033 * (i.e., ipsq_current_finish() was called), then clear ipsq_current_ipif to
13034 * allow the next exclusive IPSQ operation to begin upon ipsq_exit(). See
13035 * the comments above ipsq_current_finish() for details.
13036 */
13037 void
13038 ill_dlpi_done(ill_t *ill, t_uscalar_t prim)
13039 {
13040 mblk_t *mp;
13041 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
13042 ipxop_t *ipx = ipsq->ipsq_xop;
13043
13044 ASSERT(IAM_WRITER_IPSQ(ipsq));
13045 mutex_enter(&ill->ill_lock);
13046
13047 ASSERT(prim != DL_PRIM_INVAL);
13048 ASSERT(ill->ill_dlpi_pending == prim);
13049
13050 ip1dbg(("ill_dlpi_done: %s has completed %s (%u)\n", ill->ill_name,
13051 dl_primstr(ill->ill_dlpi_pending), ill->ill_dlpi_pending));
13052
13053 if ((mp = ill->ill_dlpi_deferred) == NULL) {
13054 ill->ill_dlpi_pending = DL_PRIM_INVAL;
13055 if (ipx->ipx_current_done) {
13056 mutex_enter(&ipx->ipx_lock);
13057 ipx->ipx_current_ipif = NULL;
13058 mutex_exit(&ipx->ipx_lock);
13059 }
13060 cv_signal(&ill->ill_cv);
13061 mutex_exit(&ill->ill_lock);
13062 return;
13063 }
13064
13065 ill->ill_dlpi_deferred = mp->b_next;
13066 mp->b_next = NULL;
13067 mutex_exit(&ill->ill_lock);
13068
13069 ill_dlpi_dispatch(ill, mp);
13070 }
13071
13072 /*
13073 * Queue a (multicast) DLPI control message to be sent to the driver by
13074 * later calling ill_dlpi_send_queued.
13075 * We queue them while holding a lock (ill_mcast_lock) to ensure that they
13076 * are sent in order, i.e., prevent a DL_DISABMULTI_REQ and a
13077 * DL_ENABMULTI_REQ for the same group from racing.
13078 * We send DLPI control messages in order using ill_lock.
13079 * For IPMP we should be called on the cast_ill.
13080 */
13081 void
13082 ill_dlpi_queue(ill_t *ill, mblk_t *mp)
13083 {
13084 mblk_t **mpp;
13085
13086 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
13087
13088 mutex_enter(&ill->ill_lock);
13089 /* Must queue message. Tail insertion */
13090 mpp = &ill->ill_dlpi_deferred;
13091 while (*mpp != NULL)
13092 mpp = &((*mpp)->b_next);
13093
13094 *mpp = mp;
13095 mutex_exit(&ill->ill_lock);
13096 }
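/*
 * Illustrative sketch (comment only, not compiled) of the intended
 * pairing with ill_dlpi_send_queued() below, for a caller that builds
 * its multicast DLPI requests under ill_mcast_lock:
 *
 *	rw_enter(&ill->ill_mcast_lock, RW_WRITER);
 *	ill_dlpi_queue(ill, mp);	(ordering fixed under the lock)
 *	rw_exit(&ill->ill_mcast_lock);
 *	ill_dlpi_send_queued(ill);	(one outstanding prim at a time)
 */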
13097
13098 /*
13099 * Send the messages that were queued. Make sure there is only
13100 * one outstanding message. ip_rput_dlpi_writer calls ill_dlpi_done()
13101 * when an ACK or a NAK is received to process the next queued message.
13102 * For IPMP we are called on the upper ill, but we send what is queued
13103 * on the cast_ill.
13104 */
13105 void
13106 ill_dlpi_send_queued(ill_t *ill)
13107 {
13108 mblk_t *mp;
13109 union DL_primitives *dlp;
13110 t_uscalar_t prim;
13111 ill_t *release_ill = NULL;
13112
13113 if (IS_IPMP(ill)) {
13114 /* On the upper IPMP ill. */
13115 release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
13116 if (release_ill == NULL) {
13117 /* Avoid ever sending anything down to the ipmpstub */
13118 return;
13119 }
13120 ill = release_ill;
13121 }
13122 mutex_enter(&ill->ill_lock);
13123 while ((mp = ill->ill_dlpi_deferred) != NULL) {
13124 if (ill->ill_dlpi_pending != DL_PRIM_INVAL) {
13125 /* Can't send. Somebody else will send it */
13126 mutex_exit(&ill->ill_lock);
13127 goto done;
13128 }
13129 ill->ill_dlpi_deferred = mp->b_next;
13130 mp->b_next = NULL;
13131 if (!ill->ill_dl_up) {
13132 /*
13133 * Nobody there. All multicast addresses will be
13134 * re-joined when we get the DL_BIND_ACK bringing the
13135 * interface up.
13136 */
13137 freemsg(mp);
13138 continue;
13139 }
13140 dlp = (union DL_primitives *)mp->b_rptr;
13141 prim = dlp->dl_primitive;
13142
13143 if (!(ill->ill_state_flags & ILL_CONDEMNED) ||
13144 (prim == DL_UNBIND_REQ)) {
13145 ill->ill_dlpi_pending = prim;
13146 }
13147 mutex_exit(&ill->ill_lock);
13148
13149 DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_send_queued",
13150 char *, dl_primstr(prim), ill_t *, ill);
13151 putnext(ill->ill_wq, mp);
13152 mutex_enter(&ill->ill_lock);
13153 }
13154 mutex_exit(&ill->ill_lock);
13155 done:
13156 if (release_ill != NULL)
13157 ill_refrele(release_ill);
13158 }
13159
13160 /*
13161 * Queue an IP (IGMP/MLD) message to be sent by IP from
13162 * ill_mcast_send_queued.
13163 * We queue them while holding a lock (ill_mcast_lock) to ensure that they
13164 * are sent in order, i.e., prevent an IGMP leave and an IGMP join for the
13165 * same group from racing.
13166 * We send them in order using ill_lock.
13167 * For IPMP we are called on the upper ill, but we queue on the cast_ill.
13168 */
13169 void
13170 ill_mcast_queue(ill_t *ill, mblk_t *mp)
13171 {
13172 mblk_t **mpp;
13173 ill_t *release_ill = NULL;
13174
13175 ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
13176
13177 if (IS_IPMP(ill)) {
13178 /* On the upper IPMP ill. */
13179 release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
13180 if (release_ill == NULL) {
13181 /* Discard instead of queuing for the ipmp interface */
13182 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
13183 ip_drop_output("ipIfStatsOutDiscards - no cast_ill",
13184 mp, ill);
13185 freemsg(mp);
13186 return;
13187 }
13188 ill = release_ill;
13189 }
13190
13191 mutex_enter(&ill->ill_lock);
13192 /* Must queue message. Tail insertion */
13193 mpp = &ill->ill_mcast_deferred;
13194 while (*mpp != NULL)
13195 mpp = &((*mpp)->b_next);
13196
13197 *mpp = mp;
13198 mutex_exit(&ill->ill_lock);
13199 if (release_ill != NULL)
13200 ill_refrele(release_ill);
13201 }
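/*
 * Illustrative sketch (comment only, not compiled): IGMP/MLD callers
 * queue the report under ill_mcast_lock and transmit after dropping it:
 *
 *	rw_enter(&ill->ill_mcast_lock, RW_WRITER);
 *	ill_mcast_queue(ill, mp);	(keeps leave/join ordered)
 *	rw_exit(&ill->ill_mcast_lock);
 *	ill_mcast_send_queued(ill);	(sends via ip_output_simple)
 */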
13202
13203 /*
13204 * Send the IP packets that were queued by ill_mcast_queue.
13205 * These are IGMP/MLD packets.
13206 *
13207 * For IPMP we are called on the upper ill, but we send what is queued
13208 * on the cast_ill.
13209 *
13210 * Request loopback of the report if we are acting as a multicast
13211 * router, so that the process-level routing daemon can hear it.
13212 * This will run multiple times for the same group if there are members
13213 * on the same group for multiple ipif's on the same ill. The
13214 * igmp_input/mld_input code will suppress the duplicates caused by the
13215 * loopback; thus we always loop back the membership report.
13216 *
13217 * We also need to make sure that this does not get load balanced
13218 * by IPMP. We do this by passing an ill to ip_output_simple.
13219 */
13220 void
13221 ill_mcast_send_queued(ill_t *ill)
13222 {
13223 mblk_t *mp;
13224 ip_xmit_attr_t ixas;
13225 ill_t *release_ill = NULL;
13226
13227 if (IS_IPMP(ill)) {
13228 /* On the upper IPMP ill. */
13229 release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
13230 if (release_ill == NULL) {
13231 /*
13232 * We should have no messages on the ipmp interface;
13233 * in any case, there is no point in trying to send them.
13234 */
13235 return;
13236 }
13237 ill = release_ill;
13238 }
13239 bzero(&ixas, sizeof (ixas));
13240 ixas.ixa_zoneid = ALL_ZONES;
13241 ixas.ixa_cred = kcred;
13242 ixas.ixa_cpid = NOPID;
13243 ixas.ixa_tsl = NULL;
13244 /*
13245 * Here we set ixa_ifindex. If IPMP it will be the lower ill which
13246 * makes ip_select_route pick the IRE_MULTICAST for the cast_ill.
13247 * That is necessary to handle IGMP/MLD snooping switches.
13248 */
13249 ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
13250 ixas.ixa_ipst = ill->ill_ipst;
13251
13252 mutex_enter(&ill->ill_lock);
13253 while ((mp = ill->ill_mcast_deferred) != NULL) {
13254 ill->ill_mcast_deferred = mp->b_next;
13255 mp->b_next = NULL;
13256 if (!ill->ill_dl_up) {
13257 /*
13258 * Nobody there. Just drop the ip packets.
13259 * IGMP/MLD will resend later, if this is a replumb.
13260 */
13261 freemsg(mp);
13262 continue;
13263 }
13264 mutex_enter(&ill->ill_phyint->phyint_lock);
13265 if (IS_UNDER_IPMP(ill) && !ipmp_ill_is_active(ill)) {
13266 /*
13267 * When the ill is getting deactivated, we only want to
13268 * send the DLPI messages, so drop IGMP/MLD packets.
13269 * DLPI messages are handled by ill_dlpi_send_queued()
13270 */
13271 mutex_exit(&ill->ill_phyint->phyint_lock);
13272 freemsg(mp);
13273 continue;
13274 }
13275 mutex_exit(&ill->ill_phyint->phyint_lock);
13276 mutex_exit(&ill->ill_lock);
13277
13278 /* Check whether we are sending IPv4 or IPv6. */
13279 if (ill->ill_isv6) {
13280 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
13281
13282 ixas.ixa_multicast_ttl = ip6h->ip6_hops;
13283 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6;
13284 } else {
13285 ipha_t *ipha = (ipha_t *)mp->b_rptr;
13286
13287 ixas.ixa_multicast_ttl = ipha->ipha_ttl;
13288 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
13289 ixas.ixa_flags &= ~IXAF_SET_ULP_CKSUM;
13290 }
13291 ixas.ixa_flags &= ~IXAF_VERIFY_SOURCE;
13292 ixas.ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_SOURCE;
13293 (void) ip_output_simple(mp, &ixas);
13294 ixa_cleanup(&ixas);
13295
13296 mutex_enter(&ill->ill_lock);
13297 }
13298 mutex_exit(&ill->ill_lock);
13299
13300 done:
13301 if (release_ill != NULL)
13302 ill_refrele(release_ill);
13303 }
13304
13305 /*
13306 * Take down a specific interface, but don't lose any information about it.
13307 * (Always called as writer.)
13308 * This function goes through the down sequence even if the interface is
13309 * already down. There are 2 reasons.
13310 * a. Currently we permit interface routes that depend on down interfaces
13311 * to be added. This behaviour itself is questionable. However it appears
13312 * that both Solaris and 4.3 BSD have exhibited this behaviour for a long
13313 * time. We go thru the cleanup in order to remove these routes.
13314 * b. The bringup of the interface could fail in ill_dl_up i.e. we get
13315 * DL_ERROR_ACK in response to the DL_BIND request. The interface is
13316 * down, but we need to clean up, i.e. do ill_dl_down and
13317 * ip_rput_dlpi_writer (DL_ERROR_ACK) -> ipif_down.
13318 *
13319 * IP-MT notes:
13320 *
13321 * Model of reference to interfaces.
13322 *
13323 * The following members in ipif_t track references to the ipif.
13324 * int ipif_refcnt; Active reference count
13325 *
13326 * The following members in ill_t track references to the ill.
13327 * int ill_refcnt; active refcnt
13328 * uint_t ill_ire_cnt; Number of ires referencing ill
13329 * uint_t ill_ncec_cnt; Number of ncecs referencing ill
13330 * uint_t ill_nce_cnt; Number of nces referencing ill
13331 * uint_t ill_ilm_cnt; Number of ilms referencing ill
13332 *
13333 * Reference to an ipif or ill can be obtained in any of the following ways.
13334 *
13335 * Through the lookup functions ipif_lookup_* / ill_lookup_* functions
13336 * Pointers to ipif / ill from other data structures viz ire and conn.
13337 * Implicit reference to the ipif / ill by holding a reference to the ire.
13338 *
13339 * The ipif/ill lookup functions return a reference held ipif / ill.
13340 * ipif_refcnt and ill_refcnt track the reference counts respectively.
13341 * This is a purely dynamic reference count associated with threads holding
13342 * references to the ipif / ill. Pointers from other structures do not
13343 * count towards this reference count.
13344 *
13345 * ill_ire_cnt is the number of ire's associated with the
13346 * ill. This is incremented whenever a new ire is created referencing the
13347 * ill. This is done atomically inside ire_add_v[46] where the ire is
13348 * actually added to the ire hash table. The count is decremented in
13349 * ire_inactive where the ire is destroyed.
13350 *
13351 * ill_ncec_cnt is the number of ncec's referencing the ill thru ncec_ill.
13352 * This is incremented atomically in
13353 * ndp_add_v4()/ndp_add_v6() where the nce is actually added to the
13354 * table. Similarly it is decremented in ncec_inactive() where the ncec
13355 * is destroyed.
13356 *
13357 * ill_nce_cnt is the number of nce's referencing the ill thru nce_ill. This is
13358 * incremented atomically in nce_add() where the nce is actually added to the
13359 * ill_nce. Similarly it is decremented in nce_inactive() where the nce
13360 * is destroyed.
13361 *
13362 * ill_ilm_cnt is the ilm's reference to the ill. It is incremented in
13363 * ilm_add() and decremented before the ilm is freed in ilm_delete().
13364 *
13365 * Flow of ioctls involving interface down/up
13366 *
13367 * The following is the sequence of an attempt to set some critical flags on an
13368 * up interface.
13369 * ip_sioctl_flags
13370 * ipif_down
13371 * wait for ipif to be quiescent
13372 * ipif_down_tail
13373 * ip_sioctl_flags_tail
13374 *
13375 * All set ioctls that involve down/up sequence would have a skeleton similar
13376 * to the above. All the *tail functions are called after the refcounts have
13377 * dropped to the appropriate values.
13378 *
13379 * SIOC ioctls during the IPIF_CHANGING interval.
13380 *
13381 * Threads handling SIOC set ioctls serialize on the squeue, but this
13382 * is not done for SIOC get ioctls. Since a set ioctl can cause several
13383 * steps of internal changes to the state, some of which are visible in
13384 * ipif_flags (such as IFF_UP being cleared and later set), and we want
13385 * the set ioctl to be atomic relative to the get ioctls, the SIOC get code
13386 * will wait and restart ioctls if IPIF_CHANGING is set. The mblk is then
13387 * enqueued in the ipsq and the operation is restarted by ipsq_exit() when
13388 * the current exclusive operation completes. The IPIF_CHANGING check
13389 * and enqueue is atomic using the ill_lock and ipsq_lock. The
13390 * lookup is done holding the ill_lock. Hence the ill/ipif state flags can't
13391 * change while the ill_lock is held. Before dropping the ill_lock we acquire
13392 * the ipsq_lock and call ipsq_enq. This ensures that ipsq_exit can't finish
13393 * until we release the ipsq_lock, even though the ill/ipif state flags
13394 * can change after we drop the ill_lock.
13395 */
13396 int
13397 ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
13398 {
13399 ill_t *ill = ipif->ipif_ill;
13400 conn_t *connp;
13401 boolean_t success;
13402 boolean_t ipif_was_up = B_FALSE;
13403 ip_stack_t *ipst = ill->ill_ipst;
13404
13405 ASSERT(IAM_WRITER_IPIF(ipif));
13406
13407 ip1dbg(("ipif_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
13408
13409 DTRACE_PROBE3(ipif__downup, char *, "ipif_down",
13410 ill_t *, ill, ipif_t *, ipif);
13411
13412 if (ipif->ipif_flags & IPIF_UP) {
13413 mutex_enter(&ill->ill_lock);
13414 ipif->ipif_flags &= ~IPIF_UP;
13415 ASSERT(ill->ill_ipif_up_count > 0);
13416 --ill->ill_ipif_up_count;
13417 mutex_exit(&ill->ill_lock);
13418 ipif_was_up = B_TRUE;
13419 /* Update status in SCTP's list */
13420 sctp_update_ipif(ipif, SCTP_IPIF_DOWN);
13421 ill_nic_event_dispatch(ipif->ipif_ill,
13422 MAP_IPIF_ID(ipif->ipif_id), NE_LIF_DOWN, NULL, 0);
13423 }
13424
13425 /*
13426 * Removal of the last ipif from an ill may result in a DL_UNBIND
13427 * being sent to the driver, and we must not send any data packets to
13428 * the driver after the DL_UNBIND_REQ. To ensure this, all the
13429 * ire and nce entries used in the data path will be cleaned
13430 * up, and we also set the ILL_DOWN_IN_PROGRESS bit to make
13431 * sure no new entries will be added until the ill is bound
13432 * again. The ILL_DOWN_IN_PROGRESS bit is turned off upon
13433 * receipt of a DL_BIND_ACK.
13434 */
13435 if (ill->ill_wq != NULL && !ill->ill_logical_down &&
13436 ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 &&
13437 ill->ill_dl_up) {
13438 ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS;
13439 }
13440
13441 /*
13442 * Blow away memberships we established in ipif_multicast_up().
13443 */
13444 ipif_multicast_down(ipif);
13445
13446 /*
13447 * Remove from the mapping for __sin6_src_id. We insert only
13448 * when the address is not INADDR_ANY. As IPv4 addresses are
13449 * stored as mapped addresses, we need to check for mapped
13450 * INADDR_ANY also.
13451 */
13452 if (ipif_was_up && !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
13453 !IN6_IS_ADDR_V4MAPPED_ANY(&ipif->ipif_v6lcl_addr) &&
13454 !(ipif->ipif_flags & IPIF_NOLOCAL)) {
13455 int err;
13456
13457 err = ip_srcid_remove(&ipif->ipif_v6lcl_addr,
13458 ipif->ipif_zoneid, ipst);
13459 if (err != 0) {
13460 ip0dbg(("ipif_down: srcid_remove %d\n", err));
13461 }
13462 }
13463
13464 if (ipif_was_up) {
13465 /* only delete if we'd added ire's before */
13466 if (ipif->ipif_isv6)
13467 ipif_delete_ires_v6(ipif);
13468 else
13469 ipif_delete_ires_v4(ipif);
13470 }
13471
13472 if (ipif_was_up && ill->ill_ipif_up_count == 0) {
13473 /*
13474 * Since the interface is now down, it may have just become
13475 * inactive. Note that this needs to be done even for a
13476 * ipif_logical_down(), or ARP entries will not get correctly
13477 * restored when the interface comes back up.
13478 */
13479 if (IS_UNDER_IPMP(ill))
13480 ipmp_ill_refresh_active(ill);
13481 }
13482
13483 /*
13484 * Delete any neighbor-discovery or ARP entries for this interface.
13485 * The ipif has to be quiesced, so we walk all the nce's and delete
13486 * those that point at the ipif->ipif_ill. At the same time, we also
13487 * update IPMP so that ipifs for data addresses are unbound. We don't
13488 * call ipif_arp_down to DL_UNBIND the arp stream itself here; we defer
13489 * that to ipif_down_tail().
13490 */
13491 ipif_nce_down(ipif);
13492
13493 /*
13494 * If this is the last ipif on the ill, we also need to remove
13495 * any IREs with ire_ill set. Otherwise ipif_is_quiescent() will
13496 * never succeed.
13497 */
13498 if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0)
13499 ire_walk_ill(0, 0, ill_downi, ill, ill);
13500
13501 /*
13502 * Walk all CONNs that can have a reference on an ire for this
13503 * ipif (we actually walk all that now have stale references).
13504 */
13505 ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst);
13506
13507 /*
13508 * If mp is NULL the caller will wait for the appropriate refcnt.
13509 * Eg. ip_sioctl_removeif -> ipif_free -> ipif_down
13510 * and ill_delete -> ipif_free -> ipif_down
13511 */
13512 if (mp == NULL) {
13513 ASSERT(q == NULL);
13514 return (0);
13515 }
13516
13517 if (CONN_Q(q)) {
13518 connp = Q_TO_CONN(q);
13519 mutex_enter(&connp->conn_lock);
13520 } else {
13521 connp = NULL;
13522 }
13523 mutex_enter(&ill->ill_lock);
13524 /*
13525 * Are there any ire's pointing to this ipif that are still active ?
13526 * If this is the last ipif going down, are there any ire's pointing
13527 * to this ill that are still active ?
13528 */
13529 if (ipif_is_quiescent(ipif)) {
13530 mutex_exit(&ill->ill_lock);
13531 if (connp != NULL)
13532 mutex_exit(&connp->conn_lock);
13533 return (0);
13534 }
13535
13536 ip1dbg(("ipif_down: need to wait, adding pending mp %s ill %p",
13537 ill->ill_name, (void *)ill));
13538 /*
13539 * Enqueue the mp atomically in ipsq_pending_mp. When the refcount
13540 * drops down, the operation will be restarted by ipif_ill_refrele_tail
13541 * which in turn is called by the last refrele on the ipif/ill/ire.
13542 */
13543 success = ipsq_pending_mp_add(connp, ipif, q, mp, IPIF_DOWN);
13544 if (!success) {
13545 /* The conn is closing. So just return */
13546 ASSERT(connp != NULL);
13547 mutex_exit(&ill->ill_lock);
13548 mutex_exit(&connp->conn_lock);
13549 return (EINTR);
13550 }
13551
13552 mutex_exit(&ill->ill_lock);
13553 if (connp != NULL)
13554 mutex_exit(&connp->conn_lock);
13555 return (EINPROGRESS);
13556 }
13557
13558 int
13559 ipif_down_tail(ipif_t *ipif)
13560 {
13561 ill_t *ill = ipif->ipif_ill;
13562 int err = 0;
13563
13564 DTRACE_PROBE3(ipif__downup, char *, "ipif_down_tail",
13565 ill_t *, ill, ipif_t *, ipif);
13566
13567 /*
13568 * Skip any loopback interface (null wq).
13569 * If this is the last logical interface on the ill,
13570 * have ill_dl_down tell the driver we are gone (unbind).
13571 * Note that lun 0 can ipif_down even though
13572 * there are other logical units that are up.
13573 * This occurs e.g. when we change a "significant" IFF_ flag.
13574 */
13575 if (ill->ill_wq != NULL && !ill->ill_logical_down &&
13576 ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 &&
13577 ill->ill_dl_up) {
13578 ill_dl_down(ill);
13579 }
13580 if (!ipif->ipif_isv6)
13581 err = ipif_arp_down(ipif);
13582
13583 ill->ill_logical_down = 0;
13584
13585 ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
13586 ip_rts_newaddrmsg(RTM_DELETE, 0, ipif, RTSQ_DEFAULT);
13587 return (err);
13588 }
13589
13590 /*
13591 * Bring interface logically down without bringing the physical interface
13592 * down, e.g. when the netmask is changed. This avoids long-lasting link
13593 * negotiations between an ethernet interface and certain switches.
13594 */
13595 static int
13596 ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
13597 {
13598 DTRACE_PROBE3(ipif__downup, char *, "ipif_logical_down",
13599 ill_t *, ipif->ipif_ill, ipif_t *, ipif);
13600
13601 /*
13602 * The ill_logical_down flag is a transient flag. It is set here
13603 * and is cleared once the down has completed in ipif_down_tail.
13604 * This flag does not indicate whether the ill stream is in the
13605 * DL_BOUND state with the driver. Instead this flag is used by
13606 * ipif_down_tail to determine whether to DL_UNBIND the stream with
13607 * the driver. The state of the ill stream i.e. whether it is
13608 * DL_BOUND with the driver or not is indicated by the ill_dl_up flag.
13609 */
13610 ipif->ipif_ill->ill_logical_down = 1;
13611 return (ipif_down(ipif, q, mp));
13612 }
13613
13614 /*
13615 * Initiate deallocation of an IPIF. Always called as writer. Called by
13616 * ill_delete or ip_sioctl_removeif.
13617 */
13618 static void
13619 ipif_free(ipif_t *ipif)
13620 {
13621 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
13622
13623 ASSERT(IAM_WRITER_IPIF(ipif));
13624
13625 if (ipif->ipif_recovery_id != 0)
13626 (void) untimeout(ipif->ipif_recovery_id);
13627 ipif->ipif_recovery_id = 0;
13628
13629 /*
13630 * Take down the interface. We can be called either from ill_delete
13631 * or from ip_sioctl_removeif.
13632 */
13633 (void) ipif_down(ipif, NULL, NULL);
13634
13635 /*
13636 * Now that the interface is down, there's no chance it can still
13637 * become a duplicate. Cancel any timer that may have been set while
13638 * tearing down.
13639 */
13640 if (ipif->ipif_recovery_id != 0)
13641 (void) untimeout(ipif->ipif_recovery_id);
13642 ipif->ipif_recovery_id = 0;
13643
13644 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
13645 /* Remove pointers to this ill in the multicast routing tables */
13646 reset_mrt_vif_ipif(ipif);
13647 /* If necessary, clear the cached source ipif rotor. */
13648 if (ipif->ipif_ill->ill_src_ipif == ipif)
13649 ipif->ipif_ill->ill_src_ipif = NULL;
13650 rw_exit(&ipst->ips_ill_g_lock);
13651 }
13652
13653 static void
13654 ipif_free_tail(ipif_t *ipif)
13655 {
13656 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
13657
13658 /*
13659 * Need to hold both ill_g_lock and ill_lock while
13660 * inserting or removing an ipif from the linked list
13661 * of ipifs hanging off the ill.
13662 */
13663 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
13664
13665 #ifdef DEBUG
13666 ipif_trace_cleanup(ipif);
13667 #endif
13668
13669 /* Ask SCTP to take it out of its list */
13670 sctp_update_ipif(ipif, SCTP_IPIF_REMOVE);
13671 ip_rts_newaddrmsg(RTM_FREEADDR, 0, ipif, RTSQ_DEFAULT);
13672
13673 /* Get it out of the ILL interface list. */
13674 ipif_remove(ipif);
13675 rw_exit(&ipst->ips_ill_g_lock);
13676
13677 ASSERT(!(ipif->ipif_flags & (IPIF_UP | IPIF_DUPLICATE)));
13678 ASSERT(ipif->ipif_recovery_id == 0);
13679 ASSERT(ipif->ipif_ire_local == NULL);
13680 ASSERT(ipif->ipif_ire_if == NULL);
13681
13682 /* Free the memory. */
13683 mi_free(ipif);
13684 }
13685
13686 /*
13687 * Sets `buf' to an ipif name of the form "ill_name:id", or "ill_name" if "id"
13688 * is zero.
13689 */
13690 void
13691 ipif_get_name(const ipif_t *ipif, char *buf, int len)
13692 {
13693 char lbuf[LIFNAMSIZ];
13694 char *name;
13695 size_t name_len;
13696
13697 buf[0] = '\0';
13698 name = ipif->ipif_ill->ill_name;
13699 name_len = ipif->ipif_ill->ill_name_length;
13700 if (ipif->ipif_id != 0) {
13701 (void) sprintf(lbuf, "%s%c%d", name, IPIF_SEPARATOR_CHAR,
13702 ipif->ipif_id);
13703 name = lbuf;
13704 name_len = mi_strlen(name) + 1;
13705 }
13706 len -= 1;
13707 buf[len] = '\0';
13708 len = MIN(len, name_len);
13709 bcopy(name, buf, len);
13710 }
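/*
 * For example (hypothetical names): an ipif with id 0 on ill "hme0"
 * yields "hme0", while id 1 yields "hme0:1"; the result is always
 * NUL-terminated and truncated to fit `len'.
 */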
13711
13712 /*
13713 * Sets `buf' to an ill name.
13714 */
13715 void
13716 ill_get_name(const ill_t *ill, char *buf, int len)
13717 {
13718 char *name;
13719 size_t name_len;
13720
13721 name = ill->ill_name;
13722 name_len = ill->ill_name_length;
13723 len -= 1;
13724 buf[len] = '\0';
13725 len = MIN(len, name_len);
13726 bcopy(name, buf, len);
13727 }
13728
13729 /*
13730 * Find an IPIF based on the name passed in. Names can be of the form <phys>
13731 * (e.g., le0) or <phys>:<#> (e.g., le0:1). When there is no colon, the
13732 * implied unit id is zero. <phys> must correspond to the name of an ILL.
13733 * (May be called as writer.)
13734 */
13735 static ipif_t *
13736 ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc,
13737 boolean_t *exists, boolean_t isv6, zoneid_t zoneid, ip_stack_t *ipst)
13738 {
13739 char *cp;
13740 char *endp;
13741 long id;
13742 ill_t *ill;
13743 ipif_t *ipif;
13744 uint_t ire_type;
13745 boolean_t did_alloc = B_FALSE;
13746 char last;
13747
13748 /*
13749 * If the caller wants us to create the ipif, make sure we have a
13750 * valid zoneid.
13751 */
13752 ASSERT(!do_alloc || zoneid != ALL_ZONES);
13753
13754 if (namelen == 0) {
13755 return (NULL);
13756 }
13757
13758 *exists = B_FALSE;
13759 /* Look for a colon in the name. */
13760 endp = &name[namelen];
13761 for (cp = endp; --cp > name; ) {
13762 if (*cp == IPIF_SEPARATOR_CHAR)
13763 break;
13764 }
13765
13766 if (*cp == IPIF_SEPARATOR_CHAR) {
13767 /*
13768 * Reject any non-decimal aliases for logical
13769 * interfaces. Aliases with leading zeroes
13770 * are also rejected as they introduce ambiguity
13771 * in the naming of the interfaces.
13772 * In order to conform to existing semantics,
13773 * and to not break any programs/scripts relying
13774 * on that behaviour, if<0>:0 is considered to be
13775 * a valid interface.
13776 *
13777 * If alias has two or more digits and the first
13778 * is zero, fail.
13779 */
13780 if (&cp[2] < endp && cp[1] == '0') {
13781 return (NULL);
13782 }
13783 }
13784
13785 if (cp <= name) {
13786 cp = endp;
13787 }
13788 last = *cp;
13789 *cp = '\0';
13790
13791 /*
13792 * Look up the ILL, based on the portion of the name
13793 * before the colon. ill_lookup_on_name returns a held ill.
13794 * did_alloc is used to check whether the ill already exists; if so,
13795 * ill_lookup_on_name will leave it cleared.
13796 */
13797 ill = ill_lookup_on_name(name, do_alloc, isv6,
13798 &did_alloc, ipst);
13799 *cp = last;
13800 if (ill == NULL)
13801 return (NULL);
13802
13803 /* Establish the unit number in the name. */
13804 id = 0;
13805 if (cp < endp && *endp == '\0') {
13806 /* If there was a colon, the unit number follows. */
13807 cp++;
13808 if (ddi_strtol(cp, NULL, 0, &id) != 0) {
13809 ill_refrele(ill);
13810 return (NULL);
13811 }
13812 }
13813
13814 mutex_enter(&ill->ill_lock);
13815 /* Now see if there is an IPIF with this unit number. */
13816 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
13817 if (ipif->ipif_id == id) {
13818 if (zoneid != ALL_ZONES &&
13819 zoneid != ipif->ipif_zoneid &&
13820 ipif->ipif_zoneid != ALL_ZONES) {
13821 mutex_exit(&ill->ill_lock);
13822 ill_refrele(ill);
13823 return (NULL);
13824 }
13825 if (IPIF_CAN_LOOKUP(ipif)) {
13826 ipif_refhold_locked(ipif);
13827 mutex_exit(&ill->ill_lock);
13828 if (!did_alloc)
13829 *exists = B_TRUE;
13830 /*
13831 * Drop locks before calling ill_refrele
13832 * since it can potentially call into
13833 * ipif_ill_refrele_tail which can end up
13834 * trying to acquire any lock.
13835 */
13836 ill_refrele(ill);
13837 return (ipif);
13838 }
13839 }
13840 }
13841
13842 if (!do_alloc) {
13843 mutex_exit(&ill->ill_lock);
13844 ill_refrele(ill);
13845 return (NULL);
13846 }
13847
13848 /*
13849 * If none found, atomically allocate and return a new one.
13850 * Historically, we used IRE_LOOPBACK only for lun 0, and IRE_LOCAL
13851 * to support "receive only" use of lo0:1 etc. as is still done
13852 * below as an initial guess.
13853 * However, this is now likely to be overridden later in ipif_up_done()
13854 * when we know for sure what address has been configured on the
13855 * interface, since we might have more than one loopback interface
13856 * with a loopback address, e.g. in the case of zones, and all the
13857 * interfaces with loopback addresses need to be marked IRE_LOOPBACK.
13858 */
13859 if (ill->ill_net_type == IRE_LOOPBACK && id == 0)
13860 ire_type = IRE_LOOPBACK;
13861 else
13862 ire_type = IRE_LOCAL;
13863 ipif = ipif_allocate(ill, id, ire_type, B_TRUE, B_TRUE, NULL);
13864 if (ipif != NULL)
13865 ipif_refhold_locked(ipif);
13866 mutex_exit(&ill->ill_lock);
13867 ill_refrele(ill);
13868 return (ipif);
13869 }
13870
13871 /*
13872 * Variant of the above that queues the request on the ipsq when
13873 * IPIF_CHANGING is set.
13874 */
13875 static ipif_t *
13876 ipif_lookup_on_name_async(char *name, size_t namelen, boolean_t isv6,
13877 zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func, int *error,
13878 ip_stack_t *ipst)
13879 {
13880 char *cp;
13881 char *endp;
13882 long id;
13883 ill_t *ill;
13884 ipif_t *ipif;
13885 boolean_t did_alloc = B_FALSE;
13886 ipsq_t *ipsq;
13887
13888 if (error != NULL)
13889 *error = 0;
13890
13891 if (namelen == 0) {
13892 if (error != NULL)
13893 *error = ENXIO;
13894 return (NULL);
13895 }
13896
13897 /* Look for a colon in the name. */
13898 endp = &name[namelen];
13899 for (cp = endp; --cp > name; ) {
13900 if (*cp == IPIF_SEPARATOR_CHAR)
13901 break;
13902 }
13903
13904 if (*cp == IPIF_SEPARATOR_CHAR) {
13905 /*
13906 * Reject any non-decimal aliases for logical
13907 * interfaces. Aliases with leading zeroes
13908 * are also rejected as they introduce ambiguity
13909 * in the naming of the interfaces.
13910 * In order to conform to existing semantics,
13911 * and to not break any programs/scripts relying
13912 * on that behaviour, if<0>:0 is considered to be
13913 * a valid interface.
13914 *
13915 * If alias has two or more digits and the first
13916 * is zero, fail.
13917 */
13918 if (&cp[2] < endp && cp[1] == '0') {
13919 if (error != NULL)
13920 *error = EINVAL;
13921 return (NULL);
13922 }
13923 }
13924
13925 if (cp <= name) {
13926 cp = endp;
13927 } else {
13928 *cp = '\0';
13929 }
13930
13931 /*
13932 * Look up the ILL, based on the portion of the name
13933 * before the colon. ill_lookup_on_name returns a held ill.
13934 * did_alloc is used to check whether the ill already exists; if so,
13935 * ill_lookup_on_name will leave it cleared.
13936 */
13937 ill = ill_lookup_on_name(name, B_FALSE, isv6, &did_alloc, ipst);
13938 if (cp != endp)
13939 *cp = IPIF_SEPARATOR_CHAR;
13940 if (ill == NULL)
13941 return (NULL);
13942
13943 /* Establish the unit number in the name. */
13944 id = 0;
13945 if (cp < endp && *endp == '\0') {
13946 /* If there was a colon, the unit number follows. */
13947 cp++;
13948 if (ddi_strtol(cp, NULL, 0, &id) != 0) {
13949 ill_refrele(ill);
13950 if (error != NULL)
13951 *error = ENXIO;
13952 return (NULL);
13953 }
13954 }
13955
13956 GRAB_CONN_LOCK(q);
13957 mutex_enter(&ill->ill_lock);
13958 /* Now see if there is an IPIF with this unit number. */
13959 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
13960 if (ipif->ipif_id == id) {
13961 if (zoneid != ALL_ZONES &&
13962 zoneid != ipif->ipif_zoneid &&
13963 ipif->ipif_zoneid != ALL_ZONES) {
13964 mutex_exit(&ill->ill_lock);
13965 RELEASE_CONN_LOCK(q);
13966 ill_refrele(ill);
13967 if (error != NULL)
13968 *error = ENXIO;
13969 return (NULL);
13970 }
13971
13972 if (!(IPIF_IS_CHANGING(ipif) ||
13973 IPIF_IS_CONDEMNED(ipif)) ||
13974 IAM_WRITER_IPIF(ipif)) {
13975 ipif_refhold_locked(ipif);
13976 mutex_exit(&ill->ill_lock);
13977 /*
13978 * Drop locks before calling ill_refrele
13979 * since it can potentially call into
13980 * ipif_ill_refrele_tail which can end up
13981 * trying to acquire any lock.
13982 */
13983 RELEASE_CONN_LOCK(q);
13984 ill_refrele(ill);
13985 return (ipif);
13986 } else if (q != NULL && !IPIF_IS_CONDEMNED(ipif)) {
13987 ipsq = ill->ill_phyint->phyint_ipsq;
13988 mutex_enter(&ipsq->ipsq_lock);
13989 mutex_enter(&ipsq->ipsq_xop->ipx_lock);
13990 mutex_exit(&ill->ill_lock);
13991 ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
13992 mutex_exit(&ipsq->ipsq_xop->ipx_lock);
13993 mutex_exit(&ipsq->ipsq_lock);
13994 RELEASE_CONN_LOCK(q);
13995 ill_refrele(ill);
13996 if (error != NULL)
13997 *error = EINPROGRESS;
13998 return (NULL);
13999 }
14000 }
14001 }
14002 RELEASE_CONN_LOCK(q);
14003 mutex_exit(&ill->ill_lock);
14004 ill_refrele(ill);
14005 if (error != NULL)
14006 *error = ENXIO;
14007 return (NULL);
14008 }
14009
14010 /*
14011 * This routine is called whenever a new address comes up on an ipif. If
14012 * we are configured to respond to address mask requests, then we are supposed
14013 * to broadcast an address mask reply at this time. This routine is also
14014 * called if we are already up, but a netmask change is made. This is legal
14015 * but might not make the system manager very popular. (May be called
14016 * as writer.)
14017 */
14018 void
14019 ipif_mask_reply(ipif_t *ipif)
14020 {
14021 icmph_t *icmph;
14022 ipha_t *ipha;
14023 mblk_t *mp;
14024 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
14025 ip_xmit_attr_t ixas;
14026
14027 #define REPLY_LEN (sizeof (icmp_ipha) + sizeof (icmph_t) + IP_ADDR_LEN)
14028
14029 if (!ipst->ips_ip_respond_to_address_mask_broadcast)
14030 return;
14031
14032 /* ICMP mask reply is IPv4 only */
14033 ASSERT(!ipif->ipif_isv6);
14034 /* ICMP mask reply is not for a loopback interface */
14035 ASSERT(ipif->ipif_ill->ill_wq != NULL);
14036
14037 if (ipif->ipif_lcl_addr == INADDR_ANY)
14038 return;
14039
14040 mp = allocb(REPLY_LEN, BPRI_HI);
14041 if (mp == NULL)
14042 return;
14043 mp->b_wptr = mp->b_rptr + REPLY_LEN;
14044
14045 ipha = (ipha_t *)mp->b_rptr;
14046 bzero(ipha, REPLY_LEN);
14047 *ipha = icmp_ipha;
14048 ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl;
14049 ipha->ipha_src = ipif->ipif_lcl_addr;
14050 ipha->ipha_dst = ipif->ipif_brd_addr;
14051 ipha->ipha_length = htons(REPLY_LEN);
14052 ipha->ipha_ident = 0;
14053
14054 icmph = (icmph_t *)&ipha[1];
14055 icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
14056 bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN);
14057 icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0);
14058
14059 bzero(&ixas, sizeof (ixas));
14060 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
14061 ixas.ixa_zoneid = ALL_ZONES;
14062 ixas.ixa_ifindex = 0;
14063 ixas.ixa_ipst = ipst;
14064 ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
14065 (void) ip_output_simple(mp, &ixas);
14066 ixa_cleanup(&ixas);
14067 #undef REPLY_LEN
14068 }
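/*
 * Sketch of the reply built above (REPLY_LEN = 32 bytes on the wire):
 *
 *	+------------------+-------------------------+-------------+
 *	| IPv4 header (20) | ICMP header (8)         | netmask (4) |
 *	+------------------+-------------------------+-------------+
 *	  icmp_ipha copy     ICMP_ADDRESS_MASK_REPLY   ipif_net_mask
 *
 * sizeof (icmp_ipha) == 20, sizeof (icmph_t) == 8, IP_ADDR_LEN == 4.
 */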
14069
14070 /*
14071 * Join the ipif specific multicast groups.
14072 * Must be called after a mapping has been set up in the resolver. (Always
14073 * called as writer.)
14074 */
14075 void
14076 ipif_multicast_up(ipif_t *ipif)
14077 {
14078 int err;
14079 ill_t *ill;
14080 ilm_t *ilm;
14081
14082 ASSERT(IAM_WRITER_IPIF(ipif));
14083
14084 ill = ipif->ipif_ill;
14085
14086 ip1dbg(("ipif_multicast_up\n"));
14087 if (!(ill->ill_flags & ILLF_MULTICAST) ||
14088 ipif->ipif_allhosts_ilm != NULL)
14089 return;
14090
14091 if (ipif->ipif_isv6) {
14092 in6_addr_t v6allmc = ipv6_all_hosts_mcast;
14093 in6_addr_t v6solmc = ipv6_solicited_node_mcast;
14094
14095 v6solmc.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3];
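/*
 * E.g. (illustrative): for the address fe80::a00:20ff:fe12:3456
 * this yields the solicited-node group ff02::1:ff12:3456 (the
 * template ff02::1:ff00:0 OR'ed with the low-order address bits).
 */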
14096
14097 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr))
14098 return;
14099
14100 ip1dbg(("ipif_multicast_up - addmulti\n"));
14101
14102 /*
14103 * Join the all hosts multicast address. We skip this for
14104 * underlying IPMP interfaces since they should be invisible.
14105 */
14106 if (!IS_UNDER_IPMP(ill)) {
14107 ilm = ip_addmulti(&v6allmc, ill, ipif->ipif_zoneid,
14108 &err);
14109 if (ilm == NULL) {
14110 ASSERT(err != 0);
14111 ip0dbg(("ipif_multicast_up: "
14112 "all_hosts_mcast failed %d\n", err));
14113 return;
14114 }
14115 ipif->ipif_allhosts_ilm = ilm;
14116 }
14117
14118 /*
14119 * Enable multicast for the solicited node multicast address.
14120 * If IPMP we need to put the membership on the upper ill.
14121 */
14122 if (!(ipif->ipif_flags & IPIF_NOLOCAL)) {
14123 ill_t *mcast_ill = NULL;
14124 boolean_t need_refrele;
14125
14126 if (IS_UNDER_IPMP(ill) &&
14127 (mcast_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) {
14128 need_refrele = B_TRUE;
14129 } else {
14130 mcast_ill = ill;
14131 need_refrele = B_FALSE;
14132 }
14133
14134 ilm = ip_addmulti(&v6solmc, mcast_ill,
14135 ipif->ipif_zoneid, &err);
14136 if (need_refrele)
14137 ill_refrele(mcast_ill);
14138
14139 if (ilm == NULL) {
14140 ASSERT(err != 0);
14141 ip0dbg(("ipif_multicast_up: solicited MC"
14142 " failed %d\n", err));
14143 if ((ilm = ipif->ipif_allhosts_ilm) != NULL) {
14144 ipif->ipif_allhosts_ilm = NULL;
14145 (void) ip_delmulti(ilm);
14146 }
14147 return;
14148 }
14149 ipif->ipif_solmulti_ilm = ilm;
14150 }
14151 } else {
14152 in6_addr_t v6group;
14153
14154 if (ipif->ipif_lcl_addr == INADDR_ANY || IS_UNDER_IPMP(ill))
14155 return;
14156
14157 /* Join the all hosts multicast address */
14158 ip1dbg(("ipif_multicast_up - addmulti\n"));
14159 IN6_IPADDR_TO_V4MAPPED(htonl(INADDR_ALLHOSTS_GROUP), &v6group);
14160
14161 ilm = ip_addmulti(&v6group, ill, ipif->ipif_zoneid, &err);
14162 if (ilm == NULL) {
14163 ASSERT(err != 0);
14164 ip0dbg(("ipif_multicast_up: failed %d\n", err));
14165 return;
14166 }
14167 ipif->ipif_allhosts_ilm = ilm;
14168 }
14169 }
14170
14171 /*
14172 * Blow away any multicast groups that we joined in ipif_multicast_up().
14173 * (ilms from explicit memberships are handled in conn_update_ill.)
14174 */
14175 void
14176 ipif_multicast_down(ipif_t *ipif)
14177 {
14178 ASSERT(IAM_WRITER_IPIF(ipif));
14179
14180 ip1dbg(("ipif_multicast_down\n"));
14181
14182 if (ipif->ipif_allhosts_ilm != NULL) {
14183 (void) ip_delmulti(ipif->ipif_allhosts_ilm);
14184 ipif->ipif_allhosts_ilm = NULL;
14185 }
14186 if (ipif->ipif_solmulti_ilm != NULL) {
14187 (void) ip_delmulti(ipif->ipif_solmulti_ilm);
14188 ipif->ipif_solmulti_ilm = NULL;
14189 }
14190 }
14191
14192 /*
14193 * Used when an interface comes up to recreate any extra routes on this
14194 * interface.
14195 */
14196 int
14197 ill_recover_saved_ire(ill_t *ill)
14198 {
14199 mblk_t *mp;
14200 ip_stack_t *ipst = ill->ill_ipst;
14201
14202 ip1dbg(("ill_recover_saved_ire(%s)", ill->ill_name));
14203
14204 mutex_enter(&ill->ill_saved_ire_lock);
14205 for (mp = ill->ill_saved_ire_mp; mp != NULL; mp = mp->b_cont) {
14206 ire_t *ire, *nire;
14207 ifrt_t *ifrt;
14208
14209 ifrt = (ifrt_t *)mp->b_rptr;
14210 /*
14211 * Create a copy of the IRE with the saved address and netmask.
14212 */
14213 if (ill->ill_isv6) {
14214 ire = ire_create_v6(
14215 &ifrt->ifrt_v6addr,
14216 &ifrt->ifrt_v6mask,
14217 &ifrt->ifrt_v6gateway_addr,
14218 ifrt->ifrt_type,
14219 ill,
14220 ifrt->ifrt_zoneid,
14221 ifrt->ifrt_flags,
14222 NULL,
14223 ipst);
14224 } else {
14225 ire = ire_create(
14226 (uint8_t *)&ifrt->ifrt_addr,
14227 (uint8_t *)&ifrt->ifrt_mask,
14228 (uint8_t *)&ifrt->ifrt_gateway_addr,
14229 ifrt->ifrt_type,
14230 ill,
14231 ifrt->ifrt_zoneid,
14232 ifrt->ifrt_flags,
14233 NULL,
14234 ipst);
14235 }
14236 if (ire == NULL) {
14237 mutex_exit(&ill->ill_saved_ire_lock);
14238 return (ENOMEM);
14239 }
14240
14241 if (ifrt->ifrt_flags & RTF_SETSRC) {
14242 if (ill->ill_isv6) {
14243 ire->ire_setsrc_addr_v6 =
14244 ifrt->ifrt_v6setsrc_addr;
14245 } else {
14246 ire->ire_setsrc_addr = ifrt->ifrt_setsrc_addr;
14247 }
14248 }
14249
14250 /*
14251 * Some software (for example, GateD and Sun Cluster) attempts
14252 * to create (what amount to) IRE_PREFIX routes with the
14253 * loopback address as the gateway. This is primarily done to
14254 * set up prefixes with the RTF_REJECT flag set (for example,
14255 * when generating aggregate routes.)
14256 *
14257 * If the IRE type (as defined by ill->ill_net_type) is
14258 * IRE_LOOPBACK, then we map the request into a
14259 * IRE_IF_NORESOLVER.
14260 */
14261 if (ill->ill_net_type == IRE_LOOPBACK)
14262 ire->ire_type = IRE_IF_NORESOLVER;
14263
14264 /*
14265 * The ire is held by ire_add; it will be refrele'd towards
14266 * the end of ipif_up_done.
14267 */
14268 nire = ire_add(ire);
14269 /*
14270 * Check if it was a duplicate entry. This handles
14271 * the case of two racing route adds for the same route
14272 */
14273 if (nire == NULL) {
14274 ip1dbg(("ill_recover_saved_ire: FAILED\n"));
14275 } else if (nire != ire) {
14276 ip1dbg(("ill_recover_saved_ire: duplicate ire %p\n",
14277 (void *)nire));
14278 ire_delete(nire);
14279 } else {
14280 ip1dbg(("ill_recover_saved_ire: added ire %p\n",
14281 (void *)nire));
14282 }
14283 if (nire != NULL)
14284 ire_refrele(nire);
14285 }
14286 mutex_exit(&ill->ill_saved_ire_lock);
14287 return (0);
14288 }
14289
14290 /*
14291 * Used to set the netmask and broadcast address to default values when the
14292 * interface is brought up. (Always called as writer.)
14293 */
14294 static void
14295 ipif_set_default(ipif_t *ipif)
14296 {
14297 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
14298
14299 if (!ipif->ipif_isv6) {
14300 /*
14301 * Interface holds an IPv4 address. Default
14302 * mask is the natural netmask.
14303 */
14304 if (!ipif->ipif_net_mask) {
14305 ipaddr_t v4mask;
14306
14307 v4mask = ip_net_mask(ipif->ipif_lcl_addr);
14308 V4MASK_TO_V6(v4mask, ipif->ipif_v6net_mask);
14309 }
14310 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
14311 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */
14312 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
14313 } else {
14314 V6_MASK_COPY(ipif->ipif_v6lcl_addr,
14315 ipif->ipif_v6net_mask, ipif->ipif_v6subnet);
14316 }
14317 /*
14318 * NOTE: SunOS 4.X does this even if the broadcast address
14319 * has already been set; thus we do the same here.
14320 */
14321 if (ipif->ipif_flags & IPIF_BROADCAST) {
14322 ipaddr_t v4addr;
14323
14324 v4addr = ipif->ipif_subnet | ~ipif->ipif_net_mask;
14325 IN6_IPADDR_TO_V4MAPPED(v4addr, &ipif->ipif_v6brd_addr);
14326 }
14327 } else {
14328 /*
14329 * Interface holds an IPv6-only address. Default
14330 * mask is all-ones.
14331 */
14332 if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask))
14333 ipif->ipif_v6net_mask = ipv6_all_ones;
14334 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
14335 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */
14336 ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
14337 } else {
14338 V6_MASK_COPY(ipif->ipif_v6lcl_addr,
14339 ipif->ipif_v6net_mask, ipif->ipif_v6subnet);
14340 }
14341 }
14342 }
14343
14344 /*
14345 * Return 0 if this address can be used as local address without causing
14346 * duplicate address problems. Otherwise, return EADDRNOTAVAIL if the address
14347 * is already up on a different ill, and EADDRINUSE if it's up on the same ill.
14348 * Note that the same IPv6 link-local address is allowed as long as the ills
14349 * are not on the same link.
14350 */
14351 int
14352 ip_addr_availability_check(ipif_t *new_ipif)
14353 {
14354 in6_addr_t our_v6addr;
14355 ill_t *ill;
14356 ipif_t *ipif;
14357 ill_walk_context_t ctx;
14358 ip_stack_t *ipst = new_ipif->ipif_ill->ill_ipst;
14359
14360 ASSERT(IAM_WRITER_IPIF(new_ipif));
14361 ASSERT(MUTEX_HELD(&ipst->ips_ip_addr_avail_lock));
14362 ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock));
14363
14364 new_ipif->ipif_flags &= ~IPIF_UNNUMBERED;
14365 if (IN6_IS_ADDR_UNSPECIFIED(&new_ipif->ipif_v6lcl_addr) ||
14366 IN6_IS_ADDR_V4MAPPED_ANY(&new_ipif->ipif_v6lcl_addr))
14367 return (0);
14368
14369 our_v6addr = new_ipif->ipif_v6lcl_addr;
14370
14371 if (new_ipif->ipif_isv6)
14372 ill = ILL_START_WALK_V6(&ctx, ipst);
14373 else
14374 ill = ILL_START_WALK_V4(&ctx, ipst);
14375
14376 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
14377 for (ipif = ill->ill_ipif; ipif != NULL;
14378 ipif = ipif->ipif_next) {
14379 if ((ipif == new_ipif) ||
14380 !(ipif->ipif_flags & IPIF_UP) ||
14381 (ipif->ipif_flags & IPIF_UNNUMBERED) ||
14382 !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
14383 &our_v6addr))
14384 continue;
14385
14386 if (new_ipif->ipif_flags & IPIF_POINTOPOINT)
14387 new_ipif->ipif_flags |= IPIF_UNNUMBERED;
14388 else if (ipif->ipif_flags & IPIF_POINTOPOINT)
14389 ipif->ipif_flags |= IPIF_UNNUMBERED;
14390 else if ((IN6_IS_ADDR_LINKLOCAL(&our_v6addr) ||
14391 IN6_IS_ADDR_SITELOCAL(&our_v6addr)) &&
14392 !IS_ON_SAME_LAN(ill, new_ipif->ipif_ill))
14393 continue;
14394 else if (new_ipif->ipif_zoneid != ipif->ipif_zoneid &&
14395 ipif->ipif_zoneid != ALL_ZONES && IS_LOOPBACK(ill))
14396 continue;
14397 else if (new_ipif->ipif_ill == ill)
14398 return (EADDRINUSE);
14399 else
14400 return (EADDRNOTAVAIL);
14401 }
14402 }
14403
14404 return (0);
14405 }
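/*
 * Concrete example (hypothetical interface names): if 10.0.0.1 is
 * already up on hme0 and we try to bring up 10.0.0.1 on hme0:1, the
 * walk above returns EADDRINUSE; if the address is instead up on qfe0
 * (and no unnumbered/link-local/loopback exception applies), it
 * returns EADDRNOTAVAIL.
 */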
14406
14407 /*
14408 * Bring up an ipif: bring up arp/ndp, bring up the DLPI stream, and add
14409 * IREs for the ipif.
14410 * When the routine returns EINPROGRESS then mp has been consumed and
14411 * the ioctl will be acked from ip_rput_dlpi.
14412 */
14413 int
14414 ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp)
14415 {
14416 ill_t *ill = ipif->ipif_ill;
14417 boolean_t isv6 = ipif->ipif_isv6;
14418 int err = 0;
14419 boolean_t success;
14420 uint_t ipif_orig_id;
14421 ip_stack_t *ipst = ill->ill_ipst;
14422
14423 ASSERT(IAM_WRITER_IPIF(ipif));
14424
14425 ip1dbg(("ipif_up(%s:%u)\n", ill->ill_name, ipif->ipif_id));
14426 DTRACE_PROBE3(ipif__downup, char *, "ipif_up",
14427 ill_t *, ill, ipif_t *, ipif);
14428
14429 /* Shouldn't get here if it is already up. */
14430 if (ipif->ipif_flags & IPIF_UP)
14431 return (EALREADY);
14432
14433 /*
14434 * If this is a request to bring up a data address on an interface
14435 * under IPMP, then move the address to its IPMP meta-interface and
14436 * try to bring it up. One complication is that the zeroth ipif for
14437 * an ill is special, in that every ill always has one, and that code
14438 * throughout IP dereferences ill->ill_ipif without holding any locks.
14439 */
14440 if (IS_UNDER_IPMP(ill) && ipmp_ipif_is_dataaddr(ipif) &&
14441 (!ipif->ipif_isv6 || !V6_IPIF_LINKLOCAL(ipif))) {
14442 ipif_t *stubipif = NULL, *moveipif = NULL;
14443 ill_t *ipmp_ill = ipmp_illgrp_ipmp_ill(ill->ill_grp);
14444
14445 /*
14446 * The ipif being brought up should be quiesced. If it's not,
14447 * something has gone amiss and we need to bail out. (If it's
14448 * quiesced, we know it will remain so via IPIF_CONDEMNED.)
14449 */
14450 mutex_enter(&ill->ill_lock);
14451 if (!ipif_is_quiescent(ipif)) {
14452 mutex_exit(&ill->ill_lock);
14453 return (EINVAL);
14454 }
14455 mutex_exit(&ill->ill_lock);
14456
14457 /*
14458 * If we're going to need to allocate ipifs, do it prior
14459 * to starting the move (and grabbing locks).
14460 */
14461 if (ipif->ipif_id == 0) {
14462 if ((moveipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE,
14463 B_FALSE, &err)) == NULL) {
14464 return (err);
14465 }
14466 if ((stubipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE,
14467 B_FALSE, &err)) == NULL) {
14468 mi_free(moveipif);
14469 return (err);
14470 }
14471 }
14472
14473 /*
14474 * Grab or transfer the ipif to move. During the move, keep
14475 * ill_g_lock held to prevent any ill walker threads from
14476 * seeing things in an inconsistent state.
14477 */
14478 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
14479 if (ipif->ipif_id != 0) {
14480 ipif_remove(ipif);
14481 } else {
14482 ipif_transfer(ipif, moveipif, stubipif);
14483 ipif = moveipif;
14484 }
14485
14486 /*
14487 * Place the ipif on the IPMP ill. If the zeroth ipif on
14488 * the IPMP ill is a stub (0.0.0.0 down address) then we
14489 * replace that one. Otherwise, pick the next available slot.
14490 */
14491 ipif->ipif_ill = ipmp_ill;
14492 ipif_orig_id = ipif->ipif_id;
14493
14494 if (ipmp_ipif_is_stubaddr(ipmp_ill->ill_ipif)) {
14495 ipif_transfer(ipif, ipmp_ill->ill_ipif, NULL);
14496 ipif = ipmp_ill->ill_ipif;
14497 } else {
14498 ipif->ipif_id = -1;
14499 if ((err = ipif_insert(ipif, B_FALSE)) != 0) {
14500 /*
14501 * No more available ipif_id's -- put it back
14502 * on the original ill and fail the operation.
14503 * Since we're writer on the ill, we can be
14504 * sure our old slot is still available.
14505 */
14506 ipif->ipif_id = ipif_orig_id;
14507 ipif->ipif_ill = ill;
14508 if (ipif_orig_id == 0) {
14509 ipif_transfer(ipif, ill->ill_ipif,
14510 NULL);
14511 } else {
14512 VERIFY(ipif_insert(ipif, B_FALSE) == 0);
14513 }
14514 rw_exit(&ipst->ips_ill_g_lock);
14515 return (err);
14516 }
14517 }
14518 rw_exit(&ipst->ips_ill_g_lock);
14519
14520 /*
14521 * Tell SCTP that the ipif has moved. Note that even if we
14522 * had to allocate a new ipif, the original sequence id was
14523 * preserved and therefore SCTP won't know.
14524 */
14525 sctp_move_ipif(ipif, ill, ipmp_ill);
14526
14527 /*
14528 * If the ipif being brought up was on slot zero, then we
14529 * first need to bring up the placeholder we stuck there. In
14530 * ip_rput_dlpi_writer(), arp_bringup_done(), or the recursive
14531 * call to ipif_up() itself, if we successfully bring up the
14532 * placeholder, we'll check ill_move_ipif and bring it up too.
14533 */
14534 if (ipif_orig_id == 0) {
14535 ASSERT(ill->ill_move_ipif == NULL);
14536 ill->ill_move_ipif = ipif;
14537 if ((err = ipif_up(ill->ill_ipif, q, mp)) == 0)
14538 ASSERT(ill->ill_move_ipif == NULL);
14539 if (err != EINPROGRESS)
14540 ill->ill_move_ipif = NULL;
14541 return (err);
14542 }
14543
14544 /*
14545 * Bring it up on the IPMP ill.
14546 */
14547 return (ipif_up(ipif, q, mp));
14548 }
14549
14550 /* Skip arp/ndp for any loopback interface. */
14551 if (ill->ill_wq != NULL) {
14552 conn_t *connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL;
14553 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
14554
14555 if (!ill->ill_dl_up) {
14556 /*
14557 * ill_dl_up is not yet set, i.e. we have yet to
14558 * DL_BIND with the driver and this is the first
14559 * logical interface on the ill to become "up".
14560 * Tell the driver to get going (via DL_BIND_REQ).
14561 * Note that changing "significant" IFF_ flags,
14562 * address/netmask etc. causes a down/up dance, but
14563 * does not cause an unbind (DL_UNBIND) with the driver.
14564 */
14565 if ((err = ill_dl_up(ill, ipif)) != 0) {
14566 return (err);
14567 }
14568 }
14569
14570 /* Reject bringing up interfaces with unusable IP addresses */
14571 if (!ill_ipcheck_addr(ill, &ipif->ipif_v6lcl_addr)) {
14572 return (EPERM);
14573 }
14574
14575 /*
14576 * ipif_resolver_up may end up needing to bind/attach
14577 * the ARP stream, which in turn necessitates a
14578 * DLPI message exchange with the driver. ioctls are
14579 * serialized and so we cannot send more than one
14580 * interface up message at a time. If ipif_resolver_up
14581 * does need to wait for the DLPI handshake for the ARP stream,
14582 * we get EINPROGRESS and we will complete in arp_bringup_done.
14583 */
14584
14585 ASSERT(connp != NULL || !CONN_Q(q));
14586 if (connp != NULL)
14587 mutex_enter(&connp->conn_lock);
14588 mutex_enter(&ill->ill_lock);
14589 success = ipsq_pending_mp_add(connp, ipif, q, mp, 0);
14590 mutex_exit(&ill->ill_lock);
14591 if (connp != NULL)
14592 mutex_exit(&connp->conn_lock);
14593 if (!success)
14594 return (EINTR);
14595
14596 /*
		 * Crank up the resolver. Note that unlike ARP, IPv6
		 * neighbor discovery should complete by the time
		 * ipif_ndp_up returns.
14599 */
14600 err = ipif_resolver_up(ipif, Res_act_initial);
14601 if (err == EINPROGRESS) {
14602 /* We will complete it in arp_bringup_done() */
14603 return (err);
14604 }
14605
14606 if (isv6 && err == 0)
14607 err = ipif_ndp_up(ipif, B_TRUE);
14608
14609 ASSERT(err != EINPROGRESS);
14610 mp = ipsq_pending_mp_get(ipsq, &connp);
14611 ASSERT(mp != NULL);
14612 if (err != 0)
14613 return (err);
14614 } else {
14615 /*
14616 * Interfaces without underlying hardware don't do duplicate
14617 * address detection.
14618 */
14619 ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
14620 ipif->ipif_addr_ready = 1;
14621 err = ill_add_ires(ill);
14622 /* allocation failure? */
14623 if (err != 0)
14624 return (err);
14625 }
14626
14627 err = (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif));
14628 if (err == 0 && ill->ill_move_ipif != NULL) {
14629 ipif = ill->ill_move_ipif;
14630 ill->ill_move_ipif = NULL;
14631 return (ipif_up(ipif, q, mp));
14632 }
14633 return (err);
14634 }
14635
14636 /*
14637 * Add any IREs tied to the ill. For now this is just an IRE_MULTICAST.
 * The identical set of IREs needs to be removed in ill_delete_ires().
14639 */
14640 int
14641 ill_add_ires(ill_t *ill)
14642 {
14643 ire_t *ire;
14644 in6_addr_t dummy6 = {(uint32_t)V6_MCAST, 0, 0, 1};
14645 in_addr_t dummy4 = htonl(INADDR_ALLHOSTS_GROUP);
14646
14647 if (ill->ill_ire_multicast != NULL)
14648 return (0);
14649
14650 /*
	 * Provide a dummy ire_addr for creating the IRE.
14652 */
14653 if (ill->ill_isv6) {
14654 ire = ire_create_v6(&dummy6, 0, 0, IRE_MULTICAST, ill,
14655 ALL_ZONES, RTF_UP, NULL, ill->ill_ipst);
14656 } else {
14657 ire = ire_create((uchar_t *)&dummy4, 0, 0, IRE_MULTICAST, ill,
14658 ALL_ZONES, RTF_UP, NULL, ill->ill_ipst);
14659 }
14660 if (ire == NULL)
14661 return (ENOMEM);
14662
14663 ill->ill_ire_multicast = ire;
14664 return (0);
14665 }
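
/*
 * A minimal caller sketch (illustrative only; ill_dl_up() below is a real
 * caller). ill_add_ires() is idempotent, so it may be called
 * unconditionally; it fails only with ENOMEM, and a successful bring-up
 * is paired with ill_delete_ires() when the ill goes away:
 *
 *	if ((err = ill_add_ires(ill)) != 0)
 *		return (err);
 *	...
 *	ill_delete_ires(ill);
 */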
14666
14667 void
14668 ill_delete_ires(ill_t *ill)
14669 {
14670 if (ill->ill_ire_multicast != NULL) {
14671 /*
		 * BIND/ATTACH completed; release the ref for ill_ire_multicast
		 * which was taken without any th_tracing enabled.
		 * We also mark it as condemned (note that it was never added)
		 * so that caching conns can move off of it.
14676 */
14677 ire_make_condemned(ill->ill_ire_multicast);
14678 ire_refrele_notr(ill->ill_ire_multicast);
14679 ill->ill_ire_multicast = NULL;
14680 }
14681 }
14682
14683 /*
14684 * Perform a bind for the physical device.
14685 *
 * When the routine returns successfully, DLPI has been bound and
 * capabilities have been negotiated. An unbind message will have
 * been allocated for later use in ipif_down.
14689 */
14690 static int
14691 ill_dl_up(ill_t *ill, ipif_t *ipif)
14692 {
14693 mblk_t *bind_mp = NULL;
14694 mblk_t *unbind_mp = NULL;
14695 int err;
14696
14697 DTRACE_PROBE2(ill__downup, char *, "ill_dl_up", ill_t *, ill);
14698
14699 ip1dbg(("ill_dl_up(%s)\n", ill->ill_name));
14700 ASSERT(IAM_WRITER_ILL(ill));
14701
14702 /*
14703 * Make sure we have an IRE_MULTICAST in case we immediately
14704 * start receiving packets.
14705 */
14706 err = ill_add_ires(ill);
14707 if (err != 0)
14708 goto bad;
14709
14710 bind_mp = ip_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long),
14711 DL_BIND_REQ);
14712 if (bind_mp == NULL)
14713 goto bad;
14714 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ill->ill_sap;
14715 ((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS;
14716
14717 /*
14718 * ill_unbind_mp would be non-null if the following sequence had
14719 * happened:
14720 * - send DL_BIND_REQ to driver, wait for response
14721 * - multiple ioctls that need to bring the ipif up are encountered,
14722 * but they cannot enter the ipsq due to the outstanding DL_BIND_REQ.
14723 * These ioctls will then be enqueued on the ipsq
14724 * - a DL_ERROR_ACK is returned for the DL_BIND_REQ
14725 * At this point, the pending ioctls in the ipsq will be drained, and
14726 * since ill->ill_dl_up was not set, ill_dl_up would be invoked with
14727 * a non-null ill->ill_unbind_mp
14728 */
14729 if (ill->ill_unbind_mp == NULL) {
14730 unbind_mp = ip_dlpi_alloc(sizeof (dl_unbind_req_t),
14731 DL_UNBIND_REQ);
14732 if (unbind_mp == NULL)
14733 goto bad;
14734 }
14735
14736 /*
14737 * Save the unbind message for ill_dl_down(); it will be consumed when
14738 * the interface goes down.
14739 */
14740 if (ill->ill_unbind_mp == NULL)
14741 ill->ill_unbind_mp = unbind_mp;
14742
14743 ill_dlpi_send(ill, bind_mp);
14744 /* Send down link-layer capabilities probe if not already done. */
14745 ill_capability_probe(ill);
14746 /*
14747 * Wait for DLPI to be bound and the capability probe to finish.
	 * The call drops and reacquires the squeue; if it couldn't reacquire
	 * it because ILL_CONDEMNED got set, bail.
14750 */
14751 if (!ill_capability_wait(ill))
14752 return (ENXIO);
14753
14754 /* DLPI failed to bind. Return the saved error */
14755 if (!ill->ill_dl_up) {
14756 return (ill->ill_dl_bind_err);
14757 }
14758
14759 /*
14760 * Sysid used to rely on the fact that netboots set domainname
14761 * and the like. Now that miniroot boots aren't strictly netboots
	 * and miniroot network configuration is driven from userland,
	 * these things still need to be set. This situation can be detected
14764 * by comparing the interface being configured here to the one
14765 * dhcifname was set to reference by the boot loader. Once sysid is
14766 * converted to use dhcp_ipc_getinfo() this call can go away.
14767 */
14768 if ((ipif->ipif_flags & IPIF_DHCPRUNNING) &&
14769 (strcmp(ill->ill_name, dhcifname) == 0) &&
14770 (strlen(srpc_domain) == 0)) {
14771 if (dhcpinit() != 0)
14772 cmn_err(CE_WARN, "no cached dhcp response");
14773 }
14774
14775 return (0);
14776 bad:
14777 ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name));
14778
14779 freemsg(bind_mp);
14780 freemsg(unbind_mp);
14781 return (ENOMEM);
14782 }
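
/*
 * A sketch of the DLPI exchange driven by ill_dl_up() (informational;
 * the ack processing lives in ip_rput_dlpi_writer() and friends):
 *
 *	IP				driver
 *	DL_BIND_REQ (ill_sap)	-->
 *				<--	DL_BIND_ACK (sets ill_dl_up)
 *	capability probes	-->
 *				<--	capability acks
 *
 * ill_capability_wait() blocks until both the bind and the capability
 * negotiation have completed, or fails if the ill is condemned.
 */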
14783
14784 /* Add room for tcp+ip headers */
14785 uint_t ip_loopback_mtuplus = IP_LOOPBACK_MTU + IP_SIMPLE_HDR_LENGTH + 20;
14786
14787 /*
 * DLPI and ARP are up.
14789 * Create all the IREs associated with an interface. Bring up multicast.
14790 * Set the interface flag and finish other initialization
14791 * that potentially had to be deferred to after DL_BIND_ACK.
14792 */
14793 int
14794 ipif_up_done(ipif_t *ipif)
14795 {
14796 ill_t *ill = ipif->ipif_ill;
14797 int err = 0;
14798 boolean_t loopback = B_FALSE;
14799 boolean_t update_src_selection = B_TRUE;
14800 ipif_t *tmp_ipif;
14801
14802 ip1dbg(("ipif_up_done(%s:%u)\n",
14803 ipif->ipif_ill->ill_name, ipif->ipif_id));
14804 DTRACE_PROBE3(ipif__downup, char *, "ipif_up_done",
14805 ill_t *, ill, ipif_t *, ipif);
14806
14807 /* Check if this is a loopback interface */
14808 if (ipif->ipif_ill->ill_wq == NULL)
14809 loopback = B_TRUE;
14810
14811 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
14812
14813 /*
14814 * If all other interfaces for this ill are down or DEPRECATED,
14815 * or otherwise unsuitable for source address selection,
14816 * reset the src generation numbers to make sure source
14817 * address selection gets to take this new ipif into account.
14818 * No need to hold ill_lock while traversing the ipif list since
	 * we are the writer.
14820 */
14821 for (tmp_ipif = ill->ill_ipif; tmp_ipif;
14822 tmp_ipif = tmp_ipif->ipif_next) {
14823 if (((tmp_ipif->ipif_flags &
14824 (IPIF_NOXMIT|IPIF_ANYCAST|IPIF_NOLOCAL|IPIF_DEPRECATED)) ||
14825 !(tmp_ipif->ipif_flags & IPIF_UP)) ||
14826 (tmp_ipif == ipif))
14827 continue;
		/* first usable pre-existing interface */
14829 update_src_selection = B_FALSE;
14830 break;
14831 }
14832 if (update_src_selection)
14833 ip_update_source_selection(ill->ill_ipst);
14834
14835 if (IS_LOOPBACK(ill) || ill->ill_net_type == IRE_IF_NORESOLVER) {
14836 nce_t *loop_nce = NULL;
14837 uint16_t flags = (NCE_F_MYADDR | NCE_F_AUTHORITY | NCE_F_NONUD);
14838
14839 /*
14840 * lo0:1 and subsequent ipifs were marked IRE_LOCAL in
14841 * ipif_lookup_on_name(), but in the case of zones we can have
14842 * several loopback addresses on lo0. So all the interfaces with
14843 * loopback addresses need to be marked IRE_LOOPBACK.
14844 */
14845 if (V4_PART_OF_V6(ipif->ipif_v6lcl_addr) ==
14846 htonl(INADDR_LOOPBACK))
14847 ipif->ipif_ire_type = IRE_LOOPBACK;
14848 else
14849 ipif->ipif_ire_type = IRE_LOCAL;
14850 if (ill->ill_net_type != IRE_LOOPBACK)
14851 flags |= NCE_F_PUBLISH;
14852
14853 /* add unicast nce for the local addr */
14854 err = nce_lookup_then_add_v4(ill, NULL,
14855 ill->ill_phys_addr_length, &ipif->ipif_lcl_addr, flags,
14856 ND_REACHABLE, &loop_nce);
14857 /* A shared-IP zone sees EEXIST for lo0:N */
14858 if (err == 0 || err == EEXIST) {
14859 ipif->ipif_added_nce = 1;
14860 loop_nce->nce_ipif_cnt++;
14861 nce_refrele(loop_nce);
14862 err = 0;
14863 } else {
14864 ASSERT(loop_nce == NULL);
14865 return (err);
14866 }
14867 }
14868
14869 /* Create all the IREs associated with this interface */
14870 err = ipif_add_ires_v4(ipif, loopback);
14871 if (err != 0) {
14872 /*
14873 * see comments about return value from
14874 * ip_addr_availability_check() in ipif_add_ires_v4().
14875 */
14876 if (err != EADDRINUSE) {
14877 (void) ipif_arp_down(ipif);
14878 } else {
14879 /*
14880 * Make IPMP aware of the deleted ipif so that
14881 * the needed ipmp cleanup (e.g., of ipif_bound_ill)
14882 * can be completed. Note that we do not want to
14883 * destroy the nce that was created on the ipmp_ill
14884 * for the active copy of the duplicate address in
14885 * use.
14886 */
14887 if (IS_IPMP(ill))
14888 ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
14889 err = EADDRNOTAVAIL;
14890 }
14891 return (err);
14892 }
14893
14894 if (ill->ill_ipif_up_count == 1 && !loopback) {
		/* Recover any additional IRE entries for this ill */
14896 (void) ill_recover_saved_ire(ill);
14897 }
14898
14899 if (ill->ill_need_recover_multicast) {
14900 /*
14901 * Need to recover all multicast memberships in the driver.
14902 * This had to be deferred until we had attached. The same
14903 * code exists in ipif_up_done_v6() to recover IPv6
14904 * memberships.
14905 *
14906 * Note that it would be preferable to unconditionally do the
14907 * ill_recover_multicast() in ill_dl_up(), but we cannot do
14908 * that since ill_join_allmulti() depends on ill_dl_up being
14909 * set, and it is not set until we receive a DL_BIND_ACK after
14910 * having called ill_dl_up().
14911 */
14912 ill_recover_multicast(ill);
14913 }
14914
14915 if (ill->ill_ipif_up_count == 1) {
14916 /*
14917 * Since the interface is now up, it may now be active.
14918 */
14919 if (IS_UNDER_IPMP(ill))
14920 ipmp_ill_refresh_active(ill);
14921
14922 /*
14923 * If this is an IPMP interface, we may now be able to
14924 * establish ARP entries.
14925 */
14926 if (IS_IPMP(ill))
14927 ipmp_illgrp_refresh_arpent(ill->ill_grp);
14928 }
14929
14930 /* Join the allhosts multicast address */
14931 ipif_multicast_up(ipif);
14932
14933 if (!loopback && !update_src_selection &&
14934 !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED)))
14935 ip_update_source_selection(ill->ill_ipst);
14936
14937 if (!loopback && ipif->ipif_addr_ready) {
14938 /* Broadcast an address mask reply. */
14939 ipif_mask_reply(ipif);
14940 }
14941 /* Perhaps ilgs should use this ill */
14942 update_conn_ill(NULL, ill->ill_ipst);
14943
14944 /*
14945 * This had to be deferred until we had bound. Tell routing sockets and
14946 * others that this interface is up if it looks like the address has
14947 * been validated. Otherwise, if it isn't ready yet, wait for
14948 * duplicate address detection to do its thing.
14949 */
14950 if (ipif->ipif_addr_ready)
14951 ipif_up_notify(ipif);
14952 return (0);
14953 }
14954
14955 /*
14956 * Add the IREs associated with the ipif.
14957 * Those MUST be explicitly removed in ipif_delete_ires_v4.
14958 */
14959 static int
14960 ipif_add_ires_v4(ipif_t *ipif, boolean_t loopback)
14961 {
14962 ill_t *ill = ipif->ipif_ill;
14963 ip_stack_t *ipst = ill->ill_ipst;
14964 ire_t *ire_array[20];
14965 ire_t **irep = ire_array;
14966 ire_t **irep1;
14967 ipaddr_t net_mask = 0;
14968 ipaddr_t subnet_mask, route_mask;
14969 int err;
14970 ire_t *ire_local = NULL; /* LOCAL or LOOPBACK */
14971 ire_t *ire_if = NULL;
14972 uchar_t *gw;
14973
14974 if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
14975 !(ipif->ipif_flags & IPIF_NOLOCAL)) {
14976 /*
14977 * If we're on a labeled system then make sure that zone-
14978 * private addresses have proper remote host database entries.
14979 */
14980 if (is_system_labeled() &&
14981 ipif->ipif_ire_type != IRE_LOOPBACK &&
14982 !tsol_check_interface_address(ipif))
14983 return (EINVAL);
14984
14985 /* Register the source address for __sin6_src_id */
14986 err = ip_srcid_insert(&ipif->ipif_v6lcl_addr,
14987 ipif->ipif_zoneid, ipst);
14988 if (err != 0) {
14989 ip0dbg(("ipif_add_ires: srcid_insert %d\n", err));
14990 return (err);
14991 }
14992
14993 if (loopback)
14994 gw = (uchar_t *)&ipif->ipif_lcl_addr;
14995 else
14996 gw = NULL;
14997
14998 /* If the interface address is set, create the local IRE. */
14999 ire_local = ire_create(
15000 (uchar_t *)&ipif->ipif_lcl_addr, /* dest address */
15001 (uchar_t *)&ip_g_all_ones, /* mask */
15002 gw, /* gateway */
15003 ipif->ipif_ire_type, /* LOCAL or LOOPBACK */
15004 ipif->ipif_ill,
15005 ipif->ipif_zoneid,
15006 ((ipif->ipif_flags & IPIF_PRIVATE) ?
15007 RTF_PRIVATE : 0) | RTF_KERNEL,
15008 NULL,
15009 ipst);
15010 ip1dbg(("ipif_add_ires: 0x%p creating IRE %p type 0x%x"
15011 " for 0x%x\n", (void *)ipif, (void *)ire_local,
15012 ipif->ipif_ire_type,
15013 ntohl(ipif->ipif_lcl_addr)));
15014 if (ire_local == NULL) {
15015 ip1dbg(("ipif_up_done: NULL ire_local\n"));
15016 err = ENOMEM;
15017 goto bad;
15018 }
15019 } else {
15020 ip1dbg((
15021 "ipif_add_ires: not creating IRE %d for 0x%x: flags 0x%x\n",
15022 ipif->ipif_ire_type,
15023 ntohl(ipif->ipif_lcl_addr),
15024 (uint_t)ipif->ipif_flags));
15025 }
15026 if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
15027 !(ipif->ipif_flags & IPIF_NOLOCAL)) {
15028 net_mask = ip_net_mask(ipif->ipif_lcl_addr);
15029 } else {
15030 net_mask = htonl(IN_CLASSA_NET); /* fallback */
15031 }
15032
15033 subnet_mask = ipif->ipif_net_mask;
15034
15035 /*
15036 * If mask was not specified, use natural netmask of
15037 * interface address. Also, store this mask back into the
15038 * ipif struct.
15039 */
15040 if (subnet_mask == 0) {
15041 subnet_mask = net_mask;
15042 V4MASK_TO_V6(subnet_mask, ipif->ipif_v6net_mask);
15043 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
15044 ipif->ipif_v6subnet);
15045 }
15046
15047 /* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. */
15048 if (!loopback && !(ipif->ipif_flags & IPIF_NOXMIT) &&
15049 ipif->ipif_subnet != INADDR_ANY) {
15050 /* ipif_subnet is ipif_pp_dst_addr for pt-pt */
15051
15052 if (ipif->ipif_flags & IPIF_POINTOPOINT) {
15053 route_mask = IP_HOST_MASK;
15054 } else {
15055 route_mask = subnet_mask;
15056 }
15057
15058 ip1dbg(("ipif_add_ires: ipif 0x%p ill 0x%p "
15059 "creating if IRE ill_net_type 0x%x for 0x%x\n",
15060 (void *)ipif, (void *)ill, ill->ill_net_type,
15061 ntohl(ipif->ipif_subnet)));
15062 ire_if = ire_create(
15063 (uchar_t *)&ipif->ipif_subnet,
15064 (uchar_t *)&route_mask,
15065 (uchar_t *)&ipif->ipif_lcl_addr,
15066 ill->ill_net_type,
15067 ill,
15068 ipif->ipif_zoneid,
15069 ((ipif->ipif_flags & IPIF_PRIVATE) ?
15070 RTF_PRIVATE: 0) | RTF_KERNEL,
15071 NULL,
15072 ipst);
15073 if (ire_if == NULL) {
15074 ip1dbg(("ipif_up_done: NULL ire_if\n"));
15075 err = ENOMEM;
15076 goto bad;
15077 }
15078 }
15079
15080 /*
15081 * Create any necessary broadcast IREs.
15082 */
15083 if ((ipif->ipif_flags & IPIF_BROADCAST) &&
15084 !(ipif->ipif_flags & IPIF_NOXMIT))
15085 irep = ipif_create_bcast_ires(ipif, irep);
15086
15087 /* If an earlier ire_create failed, get out now */
15088 for (irep1 = irep; irep1 > ire_array; ) {
15089 irep1--;
15090 if (*irep1 == NULL) {
15091 ip1dbg(("ipif_up_done: NULL ire found in ire_array\n"));
15092 err = ENOMEM;
15093 goto bad;
15094 }
15095 }
15096
15097 /*
15098 * Need to atomically check for IP address availability under
15099 * ip_addr_avail_lock. ill_g_lock is held as reader to ensure no new
15100 * ills or new ipifs can be added while we are checking availability.
15101 */
15102 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
15103 mutex_enter(&ipst->ips_ip_addr_avail_lock);
15104 /* Mark it up, and increment counters. */
15105 ipif->ipif_flags |= IPIF_UP;
15106 ill->ill_ipif_up_count++;
15107 err = ip_addr_availability_check(ipif);
15108 mutex_exit(&ipst->ips_ip_addr_avail_lock);
15109 rw_exit(&ipst->ips_ill_g_lock);
15110
15111 if (err != 0) {
15112 /*
15113 * Our address may already be up on the same ill. In this case,
15114 * the ARP entry for our ipif replaced the one for the other
15115 * ipif. So we don't want to delete it (otherwise the other ipif
15116 * would be unable to send packets).
15117 * ip_addr_availability_check() identifies this case for us and
		 * returns EADDRINUSE; the caller should turn it into
		 * EADDRNOTAVAIL, which is the expected error code.
15120 */
15121 ill->ill_ipif_up_count--;
15122 ipif->ipif_flags &= ~IPIF_UP;
15123 goto bad;
15124 }
15125
15126 /*
15127 * Add in all newly created IREs. ire_create_bcast() has
15128 * already checked for duplicates of the IRE_BROADCAST type.
15129 * We add the IRE_INTERFACE before the IRE_LOCAL to ensure
15130 * that lookups find the IRE_LOCAL even if the IRE_INTERFACE is
15131 * a /32 route.
15132 */
15133 if (ire_if != NULL) {
15134 ire_if = ire_add(ire_if);
15135 if (ire_if == NULL) {
15136 err = ENOMEM;
15137 goto bad2;
15138 }
15139 #ifdef DEBUG
15140 ire_refhold_notr(ire_if);
15141 ire_refrele(ire_if);
15142 #endif
15143 }
15144 if (ire_local != NULL) {
15145 ire_local = ire_add(ire_local);
15146 if (ire_local == NULL) {
15147 err = ENOMEM;
15148 goto bad2;
15149 }
15150 #ifdef DEBUG
15151 ire_refhold_notr(ire_local);
15152 ire_refrele(ire_local);
15153 #endif
15154 }
15155 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
15156 if (ire_local != NULL)
15157 ipif->ipif_ire_local = ire_local;
15158 if (ire_if != NULL)
15159 ipif->ipif_ire_if = ire_if;
15160 rw_exit(&ipst->ips_ill_g_lock);
15161 ire_local = NULL;
15162 ire_if = NULL;
15163
15164 /*
15165 * We first add all of them, and if that succeeds we refrele the
15166 * bunch. That enables us to delete all of them should any of the
15167 * ire_adds fail.
15168 */
15169 for (irep1 = irep; irep1 > ire_array; ) {
15170 irep1--;
15171 ASSERT(!MUTEX_HELD(&((*irep1)->ire_ill->ill_lock)));
15172 *irep1 = ire_add(*irep1);
15173 if (*irep1 == NULL) {
15174 err = ENOMEM;
15175 goto bad2;
15176 }
15177 }
15178
15179 for (irep1 = irep; irep1 > ire_array; ) {
15180 irep1--;
15181 /* refheld by ire_add. */
15182 if (*irep1 != NULL) {
15183 ire_refrele(*irep1);
15184 *irep1 = NULL;
15185 }
15186 }
15187
15188 if (!loopback) {
15189 /*
15190 * If the broadcast address has been set, make sure it makes
15191 * sense based on the interface address.
15192 * Only match on ill since we are sharing broadcast addresses.
15193 */
15194 if ((ipif->ipif_brd_addr != INADDR_ANY) &&
15195 (ipif->ipif_flags & IPIF_BROADCAST)) {
15196 ire_t *ire;
15197
15198 ire = ire_ftable_lookup_v4(ipif->ipif_brd_addr, 0, 0,
15199 IRE_BROADCAST, ipif->ipif_ill, ALL_ZONES, NULL,
15200 (MATCH_IRE_TYPE | MATCH_IRE_ILL), 0, ipst, NULL);
15201
15202 if (ire == NULL) {
15203 /*
15204 * If there isn't a matching broadcast IRE,
15205 * revert to the default for this netmask.
15206 */
15207 ipif->ipif_v6brd_addr = ipv6_all_zeros;
15208 mutex_enter(&ipif->ipif_ill->ill_lock);
15209 ipif_set_default(ipif);
15210 mutex_exit(&ipif->ipif_ill->ill_lock);
15211 } else {
15212 ire_refrele(ire);
15213 }
15214 }
15215
15216 }
15217 return (0);
15218
15219 bad2:
15220 ill->ill_ipif_up_count--;
15221 ipif->ipif_flags &= ~IPIF_UP;
15222
15223 bad:
15224 ip1dbg(("ipif_add_ires: FAILED \n"));
15225 if (ire_local != NULL)
15226 ire_delete(ire_local);
15227 if (ire_if != NULL)
15228 ire_delete(ire_if);
15229
15230 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
15231 ire_local = ipif->ipif_ire_local;
15232 ipif->ipif_ire_local = NULL;
15233 ire_if = ipif->ipif_ire_if;
15234 ipif->ipif_ire_if = NULL;
15235 rw_exit(&ipst->ips_ill_g_lock);
15236 if (ire_local != NULL) {
15237 ire_delete(ire_local);
15238 ire_refrele_notr(ire_local);
15239 }
15240 if (ire_if != NULL) {
15241 ire_delete(ire_if);
15242 ire_refrele_notr(ire_if);
15243 }
15244
15245 while (irep > ire_array) {
15246 irep--;
15247 if (*irep != NULL) {
15248 ire_delete(*irep);
15249 }
15250 }
15251 (void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst);
15252
15253 return (err);
15254 }
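
/*
 * A worked example of the mask handling above (a sketch; the addresses
 * are arbitrary): plumbing 10.1.2.3 with no explicit netmask leaves
 * subnet_mask at zero, so the natural (class A) mask 255.0.0.0 from
 * ip_net_mask() is stored back into the ipif and the IRE_IF_* route
 * covers 10.0.0.0/8. With an explicit 255.255.255.0 netmask the route
 * covers 10.1.2.0/24, and for IPIF_POINTOPOINT a host (/32) route to
 * ipif_pp_dst_addr is created instead.
 */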
15255
15256 /* Remove all the IREs created by ipif_add_ires_v4 */
15257 void
15258 ipif_delete_ires_v4(ipif_t *ipif)
15259 {
15260 ill_t *ill = ipif->ipif_ill;
15261 ip_stack_t *ipst = ill->ill_ipst;
15262 ire_t *ire;
15263
15264 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
15265 ire = ipif->ipif_ire_local;
15266 ipif->ipif_ire_local = NULL;
15267 rw_exit(&ipst->ips_ill_g_lock);
15268 if (ire != NULL) {
15269 /*
		 * Move the count to the ipif so we don't lose it due to
15271 * a down/up dance.
15272 */
15273 atomic_add_32(&ipif->ipif_ib_pkt_count, ire->ire_ib_pkt_count);
15274
15275 ire_delete(ire);
15276 ire_refrele_notr(ire);
15277 }
15278 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
15279 ire = ipif->ipif_ire_if;
15280 ipif->ipif_ire_if = NULL;
15281 rw_exit(&ipst->ips_ill_g_lock);
15282 if (ire != NULL) {
15283 ire_delete(ire);
15284 ire_refrele_notr(ire);
15285 }
15286
15287 /*
15288 * Delete the broadcast IREs.
15289 */
15290 if ((ipif->ipif_flags & IPIF_BROADCAST) &&
15291 !(ipif->ipif_flags & IPIF_NOXMIT))
15292 ipif_delete_bcast_ires(ipif);
15293 }
15294
15295 /*
 * Checks for availability of a usable source address (if there is one) when the
15297 * destination ILL has the ill_usesrc_ifindex pointing to another ILL. Note
15298 * this selection is done regardless of the destination.
15299 */
15300 boolean_t
15301 ipif_zone_avail(uint_t ifindex, boolean_t isv6, zoneid_t zoneid,
15302 ip_stack_t *ipst)
15303 {
15304 ipif_t *ipif = NULL;
15305 ill_t *uill;
15306
15307 ASSERT(ifindex != 0);
15308
15309 uill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
15310 if (uill == NULL)
15311 return (B_FALSE);
15312
15313 mutex_enter(&uill->ill_lock);
15314 for (ipif = uill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
15315 if (IPIF_IS_CONDEMNED(ipif))
15316 continue;
15317 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
15318 continue;
15319 if (!(ipif->ipif_flags & IPIF_UP))
15320 continue;
15321 if (ipif->ipif_zoneid != zoneid)
15322 continue;
15323 if (isv6 ? IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) :
15324 ipif->ipif_lcl_addr == INADDR_ANY)
15325 continue;
15326 mutex_exit(&uill->ill_lock);
15327 ill_refrele(uill);
15328 return (B_TRUE);
15329 }
15330 mutex_exit(&uill->ill_lock);
15331 ill_refrele(uill);
15332 return (B_FALSE);
15333 }
15334
15335 /*
15336 * Find an ipif with a good local address on the ill+zoneid.
15337 */
15338 ipif_t *
15339 ipif_good_addr(ill_t *ill, zoneid_t zoneid)
15340 {
15341 ipif_t *ipif;
15342
15343 mutex_enter(&ill->ill_lock);
15344 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
15345 if (IPIF_IS_CONDEMNED(ipif))
15346 continue;
15347 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
15348 continue;
15349 if (!(ipif->ipif_flags & IPIF_UP))
15350 continue;
15351 if (ipif->ipif_zoneid != zoneid &&
15352 ipif->ipif_zoneid != ALL_ZONES && zoneid != ALL_ZONES)
15353 continue;
15354 if (ill->ill_isv6 ?
15355 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) :
15356 ipif->ipif_lcl_addr == INADDR_ANY)
15357 continue;
15358 ipif_refhold_locked(ipif);
15359 mutex_exit(&ill->ill_lock);
15360 return (ipif);
15361 }
15362 mutex_exit(&ill->ill_lock);
15363 return (NULL);
15364 }
15365
15366 /*
15367 * IP source address type, sorted from worst to best. For a given type,
15368 * always prefer IP addresses on the same subnet. All-zones addresses are
15369 * suboptimal because they pose problems with unlabeled destinations.
15370 */
15371 typedef enum {
15372 IPIF_NONE,
15373 IPIF_DIFFNET_DEPRECATED, /* deprecated and different subnet */
15374 IPIF_SAMENET_DEPRECATED, /* deprecated and same subnet */
15375 IPIF_DIFFNET_ALLZONES, /* allzones and different subnet */
15376 IPIF_SAMENET_ALLZONES, /* allzones and same subnet */
15377 IPIF_DIFFNET, /* normal and different subnet */
15378 IPIF_SAMENET, /* normal and same subnet */
15379 IPIF_LOCALADDR /* local loopback */
15380 } ipif_type_t;
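
/*
 * For example (an illustrative rating, not taken from the code): with
 * a destination in 192.168.1.0/24, a deprecated 192.168.1.5 rates
 * IPIF_SAMENET_DEPRECATED and loses to a non-deprecated 10.0.0.5
 * rating IPIF_DIFFNET, since every deprecated type ranks below every
 * non-deprecated one.
 */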
15381
15382 /*
15383 * Pick the optimal ipif on `ill' for sending to destination `dst' from zone
15384 * `zoneid'. We rate usable ipifs from low -> high as per the ipif_type_t
15385 * enumeration, and return the highest-rated ipif. If there's a tie, we pick
15386 * the first one, unless IPMP is used in which case we round-robin among them;
15387 * see below for more.
15388 *
 * Returns NULL only when there is no valid source address for the ill.
15391 */
15392 ipif_t *
15393 ipif_select_source_v4(ill_t *ill, ipaddr_t dst, zoneid_t zoneid,
15394 boolean_t allow_usesrc, boolean_t *notreadyp)
15395 {
15396 ill_t *usill = NULL;
15397 ill_t *ipmp_ill = NULL;
15398 ipif_t *start_ipif, *next_ipif, *ipif, *best_ipif;
15399 ipif_type_t type, best_type;
15400 tsol_tpc_t *src_rhtp, *dst_rhtp;
15401 ip_stack_t *ipst = ill->ill_ipst;
15402 boolean_t samenet;
15403
15404 if (ill->ill_usesrc_ifindex != 0 && allow_usesrc) {
15405 usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex,
15406 B_FALSE, ipst);
15407 if (usill != NULL)
15408 ill = usill; /* Select source from usesrc ILL */
15409 else
15410 return (NULL);
15411 }
15412
15413 /*
15414 * Test addresses should never be used for source address selection,
	 * so if we were passed an underlying ill, switch to the IPMP
	 * meta-interface.
15416 */
15417 if (IS_UNDER_IPMP(ill)) {
15418 if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL)
15419 ill = ipmp_ill; /* Select source from IPMP ill */
15420 else
15421 return (NULL);
15422 }
15423
15424 /*
15425 * If we're dealing with an unlabeled destination on a labeled system,
15426 * make sure that we ignore source addresses that are incompatible with
15427 * the destination's default label. That destination's default label
15428 * must dominate the minimum label on the source address.
15429 */
15430 dst_rhtp = NULL;
15431 if (is_system_labeled()) {
15432 dst_rhtp = find_tpc(&dst, IPV4_VERSION, B_FALSE);
15433 if (dst_rhtp == NULL)
15434 return (NULL);
15435 if (dst_rhtp->tpc_tp.host_type != UNLABELED) {
15436 TPC_RELE(dst_rhtp);
15437 dst_rhtp = NULL;
15438 }
15439 }
15440
15441 /*
15442 * Hold the ill_g_lock as reader. This makes sure that no ipif/ill
15443 * can be deleted. But an ipif/ill can get CONDEMNED any time.
15444 * After selecting the right ipif, under ill_lock make sure ipif is
15445 * not condemned, and increment refcnt. If ipif is CONDEMNED,
15446 * we retry. Inside the loop we still need to check for CONDEMNED,
15447 * but not under a lock.
15448 */
15449 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
15450 retry:
15451 /*
15452 * For source address selection, we treat the ipif list as circular
15453 * and continue until we get back to where we started. This allows
15454 * IPMP to vary source address selection (which improves inbound load
15455 * spreading) by caching its last ending point and starting from
15456 * there. NOTE: we don't have to worry about ill_src_ipif changing
15457 * ills since that can't happen on the IPMP ill.
15458 */
15459 start_ipif = ill->ill_ipif;
15460 if (IS_IPMP(ill) && ill->ill_src_ipif != NULL)
15461 start_ipif = ill->ill_src_ipif;
15462
15463 ipif = start_ipif;
15464 best_ipif = NULL;
15465 best_type = IPIF_NONE;
15466 do {
15467 if ((next_ipif = ipif->ipif_next) == NULL)
15468 next_ipif = ill->ill_ipif;
15469
15470 if (IPIF_IS_CONDEMNED(ipif))
15471 continue;
15472 /* Always skip NOLOCAL and ANYCAST interfaces */
15473 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
15474 continue;
15475 /* Always skip NOACCEPT interfaces */
15476 if (ipif->ipif_ill->ill_flags & ILLF_NOACCEPT)
15477 continue;
15478 if (!(ipif->ipif_flags & IPIF_UP))
15479 continue;
15480
15481 if (!ipif->ipif_addr_ready) {
15482 if (notreadyp != NULL)
15483 *notreadyp = B_TRUE;
15484 continue;
15485 }
15486
15487 if (zoneid != ALL_ZONES &&
15488 ipif->ipif_zoneid != zoneid &&
15489 ipif->ipif_zoneid != ALL_ZONES)
15490 continue;
15491
15492 /*
15493 * Interfaces with 0.0.0.0 address are allowed to be UP, but
15494 * are not valid as source addresses.
15495 */
15496 if (ipif->ipif_lcl_addr == INADDR_ANY)
15497 continue;
15498
15499 /*
15500 * Check compatibility of local address for destination's
15501 * default label if we're on a labeled system. Incompatible
15502 * addresses can't be used at all.
15503 */
15504 if (dst_rhtp != NULL) {
15505 boolean_t incompat;
15506
15507 src_rhtp = find_tpc(&ipif->ipif_lcl_addr,
15508 IPV4_VERSION, B_FALSE);
15509 if (src_rhtp == NULL)
15510 continue;
15511 incompat = src_rhtp->tpc_tp.host_type != SUN_CIPSO ||
15512 src_rhtp->tpc_tp.tp_doi !=
15513 dst_rhtp->tpc_tp.tp_doi ||
15514 (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label,
15515 &src_rhtp->tpc_tp.tp_sl_range_cipso) &&
15516 !blinlset(&dst_rhtp->tpc_tp.tp_def_label,
15517 src_rhtp->tpc_tp.tp_sl_set_cipso));
15518 TPC_RELE(src_rhtp);
15519 if (incompat)
15520 continue;
15521 }
15522
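		/*
		 * e.g. (illustrative): dst 192.168.1.50 is samenet for an
		 * ipif with address 192.168.1.10 and netmask 255.255.255.0.
		 */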
15523 samenet = ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet);
15524
15525 if (ipif->ipif_lcl_addr == dst) {
15526 type = IPIF_LOCALADDR;
15527 } else if (ipif->ipif_flags & IPIF_DEPRECATED) {
15528 type = samenet ? IPIF_SAMENET_DEPRECATED :
15529 IPIF_DIFFNET_DEPRECATED;
15530 } else if (ipif->ipif_zoneid == ALL_ZONES) {
15531 type = samenet ? IPIF_SAMENET_ALLZONES :
15532 IPIF_DIFFNET_ALLZONES;
15533 } else {
15534 type = samenet ? IPIF_SAMENET : IPIF_DIFFNET;
15535 }
15536
15537 if (type > best_type) {
15538 best_type = type;
15539 best_ipif = ipif;
15540 if (best_type == IPIF_LOCALADDR)
15541 break; /* can't get better */
15542 }
15543 } while ((ipif = next_ipif) != start_ipif);
15544
15545 if ((ipif = best_ipif) != NULL) {
15546 mutex_enter(&ipif->ipif_ill->ill_lock);
15547 if (IPIF_IS_CONDEMNED(ipif)) {
15548 mutex_exit(&ipif->ipif_ill->ill_lock);
15549 goto retry;
15550 }
15551 ipif_refhold_locked(ipif);
15552
15553 /*
15554 * For IPMP, update the source ipif rotor to the next ipif,
15555 * provided we can look it up. (We must not use it if it's
15556 * IPIF_CONDEMNED since we may have grabbed ill_g_lock after
15557 * ipif_free() checked ill_src_ipif.)
15558 */
15559 if (IS_IPMP(ill) && ipif != NULL) {
15560 next_ipif = ipif->ipif_next;
15561 if (next_ipif != NULL && !IPIF_IS_CONDEMNED(next_ipif))
15562 ill->ill_src_ipif = next_ipif;
15563 else
15564 ill->ill_src_ipif = NULL;
15565 }
15566 mutex_exit(&ipif->ipif_ill->ill_lock);
15567 }
15568
15569 rw_exit(&ipst->ips_ill_g_lock);
15570 if (usill != NULL)
15571 ill_refrele(usill);
15572 if (ipmp_ill != NULL)
15573 ill_refrele(ipmp_ill);
15574 if (dst_rhtp != NULL)
15575 TPC_RELE(dst_rhtp);
15576
15577 #ifdef DEBUG
15578 if (ipif == NULL) {
15579 char buf1[INET6_ADDRSTRLEN];
15580
15581 ip1dbg(("ipif_select_source_v4(%s, %s) -> NULL\n",
15582 ill->ill_name,
15583 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1))));
15584 } else {
15585 char buf1[INET6_ADDRSTRLEN];
15586 char buf2[INET6_ADDRSTRLEN];
15587
15588 ip1dbg(("ipif_select_source_v4(%s, %s) -> %s\n",
15589 ipif->ipif_ill->ill_name,
15590 inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)),
15591 inet_ntop(AF_INET, &ipif->ipif_lcl_addr,
15592 buf2, sizeof (buf2))));
15593 }
15594 #endif /* DEBUG */
15595 return (ipif);
15596 }
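
/*
 * A minimal caller sketch (hypothetical; ip_select_source_v4() below
 * is the canonical caller). The returned ipif is refheld and must be
 * released by the caller:
 *
 *	boolean_t notready = B_FALSE;
 *	ipif_t *ipif;
 *
 *	ipif = ipif_select_source_v4(ill, dst, zoneid, B_TRUE, &notready);
 *	if (ipif == NULL)
 *		return (notready ? ENETDOWN : EADDRNOTAVAIL);
 *	src = ipif->ipif_lcl_addr;
 *	ipif_refrele(ipif);
 */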
15597
15598 /*
15599 * Pick a source address based on the destination ill and an optional setsrc
15600 * address.
15601 * The result is stored in srcp. If generation is set, then put the source
15602 * generation number there before we look for the source address (to avoid
 * missing changes in the set of source addresses).
 * If flagsp is set, then use it to pass back ipif_flags.
15605 *
15606 * If the caller wants to cache the returned source address and detect when
15607 * that might be stale, the caller should pass in a generation argument,
 * which the caller can later compare against ips_src_generation.
15609 *
15610 * The precedence order for selecting an IPv4 source address is:
15611 * - RTF_SETSRC on the offlink ire always wins.
 * - If usesrc is set, swap the ill to be the usesrc one.
 * - If IPMP is used on the ill, round-robin among the most
 *   preferred addresses below:
15615 * 1. If onlink destination, same subnet and not deprecated, not ALL_ZONES
15616 * 2. Not deprecated, not ALL_ZONES
15617 * 3. If onlink destination, same subnet and not deprecated, ALL_ZONES
15618 * 4. Not deprecated, ALL_ZONES
15619 * 5. If onlink destination, same subnet and deprecated
15620 * 6. Deprecated.
15621 *
15622 * We have lower preference for ALL_ZONES IP addresses,
15623 * as they pose problems with unlabeled destinations.
15624 *
 * Note that when multiple IP addresses match, e.g., #1, we pick
 * the first one if IPMP is not in use. With IPMP we round-robin.
15627 */
15628 int
15629 ip_select_source_v4(ill_t *ill, ipaddr_t setsrc, ipaddr_t dst,
15630 ipaddr_t multicast_ifaddr,
15631 zoneid_t zoneid, ip_stack_t *ipst, ipaddr_t *srcp,
15632 uint32_t *generation, uint64_t *flagsp)
15633 {
15634 ipif_t *ipif;
15635 boolean_t notready = B_FALSE; /* Set if !ipif_addr_ready found */
15636
15637 if (flagsp != NULL)
15638 *flagsp = 0;
15639
15640 /*
15641 * Need to grab the generation number before we check to
15642 * avoid a race with a change to the set of local addresses.
15643 * No lock needed since the thread which updates the set of local
	 * addresses uses ipif/ill locks and exits those (hence a store memory
	 * barrier) before doing the atomic increment of ips_src_generation.
15646 */
15647 if (generation != NULL) {
15648 *generation = ipst->ips_src_generation;
15649 }
15650
15651 if (CLASSD(dst) && multicast_ifaddr != INADDR_ANY) {
15652 *srcp = multicast_ifaddr;
15653 return (0);
15654 }
15655
15656 /* Was RTF_SETSRC set on the first IRE in the recursive lookup? */
15657 if (setsrc != INADDR_ANY) {
15658 *srcp = setsrc;
15659 return (0);
15660 }
15661 ipif = ipif_select_source_v4(ill, dst, zoneid, B_TRUE, ¬ready);
15662 if (ipif == NULL) {
15663 if (notready)
15664 return (ENETDOWN);
15665 else
15666 return (EADDRNOTAVAIL);
15667 }
15668 *srcp = ipif->ipif_lcl_addr;
15669 if (flagsp != NULL)
15670 *flagsp = ipif->ipif_flags;
15671 ipif_refrele(ipif);
15672 return (0);
15673 }
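
/*
 * A caching sketch (hypothetical caller): capture the generation when
 * selecting the source and later compare it against ips_src_generation
 * to detect that the cached address may be stale:
 *
 *	uint32_t gen;
 *	ipaddr_t src;
 *
 *	if (ip_select_source_v4(ill, INADDR_ANY, dst, INADDR_ANY,
 *	    zoneid, ipst, &src, &gen, NULL) != 0)
 *		return;
 *	...
 *	if (gen != ipst->ips_src_generation) {
 *		... the cached source may be stale; reselect ...
 *	}
 */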
15674
15675 /* ARGSUSED */
15676 int
15677 if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
15678 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
15679 {
15680 /*
15681 * ill_phyint_reinit merged the v4 and v6 into a single
15682 * ipsq. We might not have been able to complete the
	 * operation in ipif_set_values if we could not become
	 * exclusive. If so, restart it here.
15685 */
15686 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q));
15687 }
15688
15689 /*
 * The ioctl can arrive on either a module or a driver queue;
 * an error is returned if this is not a module queue.
15692 */
15693 /* ARGSUSED */
15694 int
15695 if_unitsel(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
15696 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
15697 {
15698 queue_t *q1 = q;
15699 char *cp;
15700 char interf_name[LIFNAMSIZ];
15701 uint_t ppa = *(uint_t *)mp->b_cont->b_cont->b_rptr;
15702
15703 if (q->q_next == NULL) {
15704 ip1dbg((
15705 "if_unitsel: IF_UNITSEL: no q_next\n"));
15706 return (EINVAL);
15707 }
15708
15709 if (((ill_t *)(q->q_ptr))->ill_name[0] != '\0')
15710 return (EALREADY);
15711
15712 do {
15713 q1 = q1->q_next;
15714 } while (q1->q_next);
15715 cp = q1->q_qinfo->qi_minfo->mi_idname;
15716 (void) sprintf(interf_name, "%s%d", cp, ppa);
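	/* e.g. (illustrative): mi_idname "hme" and ppa 0 yield "hme0" */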
15717
15718 /*
	 * Here we are not going to delay the ioctl ack until after the
	 * ACKs from DL_ATTACH_REQ/DL_BIND_REQ, so there is no need to save
	 * the original ioctl message before sending the requests.
15722 */
15723 return (ipif_set_values(q, mp, interf_name, &ppa));
15724 }
15725
15726 /* ARGSUSED */
15727 int
15728 ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
15729 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
15730 {
15731 return (ENXIO);
15732 }
15733
15734 /*
15735 * Create any IRE_BROADCAST entries for `ipif', and store those entries in
 * `irep'. Returns a pointer to the next free `irep' entry.
15737 * A mirror exists in ipif_delete_bcast_ires().
15738 *
15739 * The management of any "extra" or seemingly duplicate IRE_BROADCASTs is
15740 * done in ire_add.
15741 */
15742 static ire_t **
15743 ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep)
15744 {
15745 ipaddr_t addr;
15746 ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr);
15747 ipaddr_t subnetmask = ipif->ipif_net_mask;
15748 ill_t *ill = ipif->ipif_ill;
15749 zoneid_t zoneid = ipif->ipif_zoneid;
15750
15751 ip1dbg(("ipif_create_bcast_ires: creating broadcast IREs\n"));
15752
15753 ASSERT(ipif->ipif_flags & IPIF_BROADCAST);
15754 ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT));
15755
15756 if (ipif->ipif_lcl_addr == INADDR_ANY ||
15757 (ipif->ipif_flags & IPIF_NOLOCAL))
15758 netmask = htonl(IN_CLASSA_NET); /* fallback */
15759
15760 irep = ire_create_bcast(ill, 0, zoneid, irep);
15761 irep = ire_create_bcast(ill, INADDR_BROADCAST, zoneid, irep);
15762
15763 /*
15764 * For backward compatibility, we create net broadcast IREs based on
15765 * the old "IP address class system", since some old machines only
	 * respond to these class-derived net broadcasts. However, we must not
15767 * create these net broadcast IREs if the subnetmask is shorter than
15768 * the IP address class based derived netmask. Otherwise, we may
15769 * create a net broadcast address which is the same as an IP address
15770 * on the subnet -- and then TCP will refuse to talk to that address.
15771 */
15772 if (netmask < subnetmask) {
15773 addr = netmask & ipif->ipif_subnet;
15774 irep = ire_create_bcast(ill, addr, zoneid, irep);
15775 irep = ire_create_bcast(ill, ~netmask | addr, zoneid, irep);
15776 }
15777
15778 /*
15779 * Don't create IRE_BROADCAST IREs for the interface if the subnetmask
15780 * is 0xFFFFFFFF, as an IRE_LOCAL for that interface is already
15781 * created. Creating these broadcast IREs will only create confusion
15782 * as `addr' will be the same as the IP address.
15783 */
15784 if (subnetmask != 0xFFFFFFFF) {
15785 addr = ipif->ipif_subnet;
15786 irep = ire_create_bcast(ill, addr, zoneid, irep);
15787 irep = ire_create_bcast(ill, ~subnetmask | addr, zoneid, irep);
15788 }
15789
15790 return (irep);
15791 }
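
/*
 * A worked example (illustrative addresses): for 10.1.2.3 with a
 * 255.255.255.0 subnet mask, the class A netmask 255.0.0.0 is shorter
 * than the subnet mask, so the IREs created above cover:
 *
 *	0.0.0.0 and 255.255.255.255	(always)
 *	10.0.0.0 and 10.255.255.255	(classful net broadcasts)
 *	10.1.2.0 and 10.1.2.255		(subnet broadcasts, mask != /32)
 */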
15792
15793 /*
15794 * Mirror of ipif_create_bcast_ires()
15795 */
15796 static void
15797 ipif_delete_bcast_ires(ipif_t *ipif)
15798 {
15799 ipaddr_t addr;
15800 ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr);
15801 ipaddr_t subnetmask = ipif->ipif_net_mask;
15802 ill_t *ill = ipif->ipif_ill;
15803 zoneid_t zoneid = ipif->ipif_zoneid;
15804 ire_t *ire;
15805
15806 ASSERT(ipif->ipif_flags & IPIF_BROADCAST);
15807 ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT));
15808
15809 if (ipif->ipif_lcl_addr == INADDR_ANY ||
15810 (ipif->ipif_flags & IPIF_NOLOCAL))
15811 netmask = htonl(IN_CLASSA_NET); /* fallback */
15812
15813 ire = ire_lookup_bcast(ill, 0, zoneid);
15814 ASSERT(ire != NULL);
15815 ire_delete(ire); ire_refrele(ire);
15816 ire = ire_lookup_bcast(ill, INADDR_BROADCAST, zoneid);
15817 ASSERT(ire != NULL);
15818 ire_delete(ire); ire_refrele(ire);
15819
15820 /*
	 * For backward compatibility, ipif_create_bcast_ires() created net
	 * broadcast IREs based on the old "IP address class system", but only
	 * when the subnetmask is not shorter than the class-derived netmask.
	 * Delete them here under the same condition.
15828 */
15829 if (netmask < subnetmask) {
15830 addr = netmask & ipif->ipif_subnet;
15831 ire = ire_lookup_bcast(ill, addr, zoneid);
15832 ASSERT(ire != NULL);
15833 ire_delete(ire); ire_refrele(ire);
15834 ire = ire_lookup_bcast(ill, ~netmask | addr, zoneid);
15835 ASSERT(ire != NULL);
15836 ire_delete(ire); ire_refrele(ire);
15837 }
15838
15839 /*
	 * IRE_BROADCAST IREs were not created for the interface if the
	 * subnetmask is 0xFFFFFFFF, since an IRE_LOCAL for that interface
	 * already covers the address (see ipif_create_bcast_ires()), so
	 * likewise skip deleting them here.
15844 */
15845 if (subnetmask != 0xFFFFFFFF) {
15846 addr = ipif->ipif_subnet;
15847 ire = ire_lookup_bcast(ill, addr, zoneid);
15848 ASSERT(ire != NULL);
15849 ire_delete(ire); ire_refrele(ire);
15850 ire = ire_lookup_bcast(ill, ~subnetmask | addr, zoneid);
15851 ASSERT(ire != NULL);
15852 ire_delete(ire); ire_refrele(ire);
15853 }
15854 }
15855
15856 /*
 * Extract the flags (including IFF_CANTCHANGE flags such as IFF_IPV*)
15858 * from lifr_flags and the name from lifr_name.
15859 * Set IFF_IPV* and ill_isv6 prior to doing the lookup
15860 * since ipif_lookup_on_name uses the _isv6 flags when matching.
15861 * Returns EINPROGRESS when mp has been consumed by queueing it on
15862 * ipx_pending_mp and the ioctl will complete in ip_rput.
15863 *
 * The ioctl can arrive on either a module or a driver queue;
 * an error is returned if this is not a module queue.
15866 */
15867 /* ARGSUSED */
15868 int
15869 ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
15870 ip_ioctl_cmd_t *ipip, void *if_req)
15871 {
15872 ill_t *ill = q->q_ptr;
15873 phyint_t *phyi;
15874 ip_stack_t *ipst;
15875 struct lifreq *lifr = if_req;
15876 uint64_t new_flags;
15877
15878 ASSERT(ipif != NULL);
15879 ip1dbg(("ip_sioctl_slifname %s\n", lifr->lifr_name));
15880
15881 if (q->q_next == NULL) {
15882 ip1dbg(("if_sioctl_slifname: SIOCSLIFNAME: no q_next\n"));
15883 return (EINVAL);
15884 }
15885
15886 /*
15887 * If we are not writer on 'q' then this interface exists already
15888 * and previous lookups (ip_extract_lifreq()) found this ipif --
15889 * so return EALREADY.
15890 */
15891 if (ill != ipif->ipif_ill)
15892 return (EALREADY);
15893
15894 if (ill->ill_name[0] != '\0')
15895 return (EALREADY);
15896
15897 /*
15898 * If there's another ill already with the requested name, ensure
15899 * that it's of the same type. Otherwise, ill_phyint_reinit() will
15900 * fuse together two unrelated ills, which will cause chaos.
15901 */
15902 ipst = ill->ill_ipst;
15903 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
15904 lifr->lifr_name, NULL);
15905 if (phyi != NULL) {
15906 ill_t *ill_mate = phyi->phyint_illv4;
15907
15908 if (ill_mate == NULL)
15909 ill_mate = phyi->phyint_illv6;
15910 ASSERT(ill_mate != NULL);
15911
15912 if (ill_mate->ill_media->ip_m_mac_type !=
15913 ill->ill_media->ip_m_mac_type) {
15914 ip1dbg(("if_sioctl_slifname: SIOCSLIFNAME: attempt to "
15915 "use the same ill name on differing media\n"));
15916 return (EINVAL);
15917 }
15918 }
15919
15920 /*
15921 * We start off as IFF_IPV4 in ipif_allocate and become
15922 * IFF_IPV4 or IFF_IPV6 here depending on lifr_flags value.
15923 * The only flags that we read from user space are IFF_IPV4,
15924 * IFF_IPV6, and IFF_BROADCAST.
15925 *
15926 * This ill has not been inserted into the global list.
	 * So we are still single threaded and don't need any lock.
	 *
	 * Sanity check the flags.
15930 */
15931
15932 if ((lifr->lifr_flags & IFF_BROADCAST) &&
15933 ((lifr->lifr_flags & IFF_IPV6) ||
15934 (!ill->ill_needs_attach && ill->ill_bcast_addr_length == 0))) {
15935 ip1dbg(("ip_sioctl_slifname: link not broadcast capable "
15936 "or IPv6 i.e., no broadcast \n"));
15937 return (EINVAL);
15938 }
15939
15940 new_flags =
15941 lifr->lifr_flags & (IFF_IPV6|IFF_IPV4|IFF_BROADCAST);
15942
15943 if ((new_flags ^ (IFF_IPV6|IFF_IPV4)) == 0) {
15944 ip1dbg(("ip_sioctl_slifname: flags must be exactly one of "
15945 "IFF_IPV4 or IFF_IPV6\n"));
15946 return (EINVAL);
15947 }
15948
15949 /*
15950 * We always start off as IPv4, so only need to check for IPv6.
15951 */
15952 if ((new_flags & IFF_IPV6) != 0) {
15953 ill->ill_flags |= ILLF_IPV6;
15954 ill->ill_flags &= ~ILLF_IPV4;
15955
15956 if (lifr->lifr_flags & IFF_NOLINKLOCAL)
15957 ill->ill_flags |= ILLF_NOLINKLOCAL;
15958 }
15959
15960 if ((new_flags & IFF_BROADCAST) != 0)
15961 ipif->ipif_flags |= IPIF_BROADCAST;
15962 else
15963 ipif->ipif_flags &= ~IPIF_BROADCAST;
15964
15965 /* We started off as V4. */
15966 if (ill->ill_flags & ILLF_IPV6) {
15967 ill->ill_phyint->phyint_illv6 = ill;
15968 ill->ill_phyint->phyint_illv4 = NULL;
15969 }
15970
15971 return (ipif_set_values(q, mp, lifr->lifr_name, &lifr->lifr_ppa));
15972 }
15973
15974 /* ARGSUSED */
15975 int
15976 ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
15977 ip_ioctl_cmd_t *ipip, void *if_req)
15978 {
15979 /*
15980 * ill_phyint_reinit merged the v4 and v6 into a single
15981 * ipsq. We might not have been able to complete the
	 * slifname in ipif_set_values if we could not become
	 * exclusive. If so, restart it here.
15984 */
15985 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q));
15986 }
15987
15988 /*
15989 * Return a pointer to the ipif which matches the index, IP version type and
15990 * zoneid.
15991 */
15992 ipif_t *
15993 ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid,
15994 ip_stack_t *ipst)
15995 {
15996 ill_t *ill;
15997 ipif_t *ipif = NULL;
15998
15999 ill = ill_lookup_on_ifindex(index, isv6, ipst);
16000 if (ill != NULL) {
16001 mutex_enter(&ill->ill_lock);
16002 for (ipif = ill->ill_ipif; ipif != NULL;
16003 ipif = ipif->ipif_next) {
16004 if (!IPIF_IS_CONDEMNED(ipif) && (zoneid == ALL_ZONES ||
16005 zoneid == ipif->ipif_zoneid ||
16006 ipif->ipif_zoneid == ALL_ZONES)) {
16007 ipif_refhold_locked(ipif);
16008 break;
16009 }
16010 }
16011 mutex_exit(&ill->ill_lock);
16012 ill_refrele(ill);
16013 }
16014 return (ipif);
16015 }
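
/*
 * A minimal caller sketch (hypothetical): the returned ipif, if any,
 * is refheld and must be released with ipif_refrele():
 *
 *	ipif_t *ipif;
 *
 *	ipif = ipif_lookup_on_ifindex(ifindex, B_FALSE, zoneid, ipst);
 *	if (ipif != NULL) {
 *		... use ipif ...
 *		ipif_refrele(ipif);
 *	}
 */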
16016
16017 /*
16018 * Change an existing physical interface's index. If the new index
16019 * is acceptable we update the index and the phyint_list_avl_by_index tree.
16020 * Finally, we update other systems which may have a dependence on the
16021 * index value.
16022 */
16023 /* ARGSUSED */
16024 int
16025 ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16026 ip_ioctl_cmd_t *ipip, void *ifreq)
16027 {
16028 ill_t *ill;
16029 phyint_t *phyi;
16030 struct ifreq *ifr = (struct ifreq *)ifreq;
16031 struct lifreq *lifr = (struct lifreq *)ifreq;
16032 uint_t old_index, index;
16033 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
16034 avl_index_t where;
16035
16036 if (ipip->ipi_cmd_type == IF_CMD)
16037 index = ifr->ifr_index;
16038 else
16039 index = lifr->lifr_index;
16040
16041 /*
	 * Only allow this on the physical interface. Also, index zero is
	 * illegal.
16043 */
16044 ill = ipif->ipif_ill;
16045 phyi = ill->ill_phyint;
16046 if (ipif->ipif_id != 0 || index == 0 || index > IF_INDEX_MAX) {
16047 return (EINVAL);
16048 }
16049
16050 /* If the index is not changing, no work to do */
16051 if (phyi->phyint_ifindex == index)
16052 return (0);
16053
16054 /*
16055 * Use phyint_exists() to determine if the new interface index
16056 * is already in use. If the index is unused then we need to
16057 * change the phyint's position in the phyint_list_avl_by_index
16058 * tree. If we do not do this, subsequent lookups (using the new
16059 * index value) will not find the phyint.
16060 */
16061 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
16062 if (phyint_exists(index, ipst)) {
16063 rw_exit(&ipst->ips_ill_g_lock);
16064 return (EEXIST);
16065 }
16066
16067 /*
	 * The new index is unused. Set it in the phyint. However, we must not
	 * forget to trigger the NE_IFINDEX_CHANGE event before the ifindex
	 * changes. The event must be bound to the old ifindex value.
16071 */
16072 ill_nic_event_dispatch(ill, 0, NE_IFINDEX_CHANGE,
16073 &index, sizeof (index));
16074
16075 old_index = phyi->phyint_ifindex;
16076 phyi->phyint_ifindex = index;
16077
16078 avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, phyi);
16079 (void) avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
16080 &index, &where);
16081 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
16082 phyi, where);
16083 rw_exit(&ipst->ips_ill_g_lock);
16084
16085 /* Update SCTP's ILL list */
16086 sctp_ill_reindex(ill, old_index);
16087
16088 /* Send the routing sockets message */
16089 ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
16090 if (ILL_OTHER(ill))
16091 ip_rts_ifmsg(ILL_OTHER(ill)->ill_ipif, RTSQ_DEFAULT);
16092
16093 /* Perhaps ilgs should use this ill */
16094 update_conn_ill(NULL, ill->ill_ipst);
16095 return (0);
16096 }
16097
16098 /* ARGSUSED */
16099 int
16100 ip_sioctl_get_lifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16101 ip_ioctl_cmd_t *ipip, void *ifreq)
16102 {
16103 struct ifreq *ifr = (struct ifreq *)ifreq;
16104 struct lifreq *lifr = (struct lifreq *)ifreq;
16105
16106 ip1dbg(("ip_sioctl_get_lifindex(%s:%u %p)\n",
16107 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
16108 /* Get the interface index */
16109 if (ipip->ipi_cmd_type == IF_CMD) {
16110 ifr->ifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex;
16111 } else {
16112 lifr->lifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex;
16113 }
16114 return (0);
16115 }
16116
16117 /* ARGSUSED */
16118 int
16119 ip_sioctl_get_lifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16120 ip_ioctl_cmd_t *ipip, void *ifreq)
16121 {
16122 struct lifreq *lifr = (struct lifreq *)ifreq;
16123
16124 ip1dbg(("ip_sioctl_get_lifzone(%s:%u %p)\n",
16125 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
16126 /* Get the interface zone */
16127 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
16128 lifr->lifr_zoneid = ipif->ipif_zoneid;
16129 return (0);
16130 }
16131
16132 /*
16133 * Set the zoneid of an interface.
16134 */
16135 /* ARGSUSED */
16136 int
16137 ip_sioctl_slifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16138 ip_ioctl_cmd_t *ipip, void *ifreq)
16139 {
16140 struct lifreq *lifr = (struct lifreq *)ifreq;
16141 int err = 0;
16142 boolean_t need_up = B_FALSE;
16143 zone_t *zptr;
16144 zone_status_t status;
16145 zoneid_t zoneid;
16146
16147 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
16148 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) {
16149 if (!is_system_labeled())
16150 return (ENOTSUP);
16151 zoneid = GLOBAL_ZONEID;
16152 }
16153
16154 /* cannot assign instance zero to a non-global zone */
16155 if (ipif->ipif_id == 0 && zoneid != GLOBAL_ZONEID)
16156 return (ENOTSUP);
16157
16158 /*
16159 * Cannot assign to a zone that doesn't exist or is shutting down. In
16160 * the event of a race with the zone shutdown processing, since IP
16161 * serializes this ioctl and SIOCGLIFCONF/SIOCLIFREMOVEIF, we know the
16162 * interface will be cleaned up even if the zone is shut down
16163 * immediately after the status check. If the interface can't be brought
16164 * down right away, and the zone is shut down before the restart
16165 * function is called, we resolve the possible races by rechecking the
16166 * zone status in the restart function.
16167 */
16168 if ((zptr = zone_find_by_id(zoneid)) == NULL)
16169 return (EINVAL);
16170 status = zone_status_get(zptr);
16171 zone_rele(zptr);
16172
16173 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING)
16174 return (EINVAL);
16175
16176 if (ipif->ipif_flags & IPIF_UP) {
16177 /*
16178 * If the interface is already marked up,
16179 * we call ipif_down which will take care
16180 * of ditching any IREs that have been set
16181 * up based on the old interface address.
16182 */
16183 err = ipif_logical_down(ipif, q, mp);
16184 if (err == EINPROGRESS)
16185 return (err);
16186 (void) ipif_down_tail(ipif);
16187 need_up = B_TRUE;
16188 }
16189
16190 err = ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, need_up);
16191 return (err);
16192 }
16193
16194 static int
16195 ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
16196 queue_t *q, mblk_t *mp, boolean_t need_up)
16197 {
16198 int err = 0;
16199 ip_stack_t *ipst;
16200
16201 ip1dbg(("ip_sioctl_zoneid_tail(%s:%u %p)\n",
16202 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
16203
16204 if (CONN_Q(q))
16205 ipst = CONNQ_TO_IPST(q);
16206 else
16207 ipst = ILLQ_TO_IPST(q);
16208
16209 /*
16210 * For exclusive stacks we don't allow a different zoneid than
16211 * global.
16212 */
16213 if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID &&
16214 zoneid != GLOBAL_ZONEID)
16215 return (EINVAL);
16216
16217 /* Set the new zone id. */
16218 ipif->ipif_zoneid = zoneid;
16219
16220 /* Update sctp list */
16221 sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);
16222
16223 /* The default multicast interface might have changed */
16224 ire_increment_multicast_generation(ipst, ipif->ipif_ill->ill_isv6);
16225
16226 if (need_up) {
16227 /*
16228 * Now bring the interface back up. If this
16229 * is the only IPIF for the ILL, ipif_up
16230 * will have to re-bind to the device, so
16231 * we may get back EINPROGRESS, in which
16232 * case, this IOCTL will get completed in
16233 * ip_rput_dlpi when we see the DL_BIND_ACK.
16234 */
16235 err = ipif_up(ipif, q, mp);
16236 }
16237 return (err);
16238 }
16239
16240 /* ARGSUSED */
16241 int
16242 ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16243 ip_ioctl_cmd_t *ipip, void *if_req)
16244 {
16245 struct lifreq *lifr = (struct lifreq *)if_req;
16246 zoneid_t zoneid;
16247 zone_t *zptr;
16248 zone_status_t status;
16249
16250 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
16251 if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES)
16252 zoneid = GLOBAL_ZONEID;
16253
16254 ip1dbg(("ip_sioctl_slifzone_restart(%s:%u %p)\n",
16255 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
16256
16257 /*
16258 * We recheck the zone status to resolve the following race condition:
16259 * 1) process sends SIOCSLIFZONE to put hme0:1 in zone "myzone";
16260 * 2) hme0:1 is up and can't be brought down right away;
16261 * ip_sioctl_slifzone() returns EINPROGRESS and the request is queued;
16262 * 3) zone "myzone" is halted; the zone status switches to
16263 * 'shutting_down' and the zones framework sends SIOCGLIFCONF to list
16264 * the interfaces to remove - hme0:1 is not returned because it's not
16265 * yet in "myzone", so it won't be removed;
16266 * 4) the restart function for SIOCSLIFZONE is called; without the
16267 * status check here, we would have hme0:1 in "myzone" after it's been
16268 * destroyed.
16269 * Note that if the status check fails, we need to bring the interface
16270 * back to its state prior to ip_sioctl_slifzone(), hence the call to
16271 * ipif_up_done[_v6]().
16272 */
16273 status = ZONE_IS_UNINITIALIZED;
16274 if ((zptr = zone_find_by_id(zoneid)) != NULL) {
16275 status = zone_status_get(zptr);
16276 zone_rele(zptr);
16277 }
16278 if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) {
16279 if (ipif->ipif_isv6) {
16280 (void) ipif_up_done_v6(ipif);
16281 } else {
16282 (void) ipif_up_done(ipif);
16283 }
16284 return (EINVAL);
16285 }
16286
16287 (void) ipif_down_tail(ipif);
16288
16289 return (ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp,
16290 B_TRUE));
16291 }
16292
16293 /*
16294 * Return the number of addresses on `ill' with one or more of the values
16295 * in `set' set and all of the values in `clear' clear.
16296 */
16297 static uint_t
16298 ill_flagaddr_cnt(const ill_t *ill, uint64_t set, uint64_t clear)
16299 {
16300 ipif_t *ipif;
16301 uint_t cnt = 0;
16302
16303 ASSERT(IAM_WRITER_ILL(ill));
16304
16305 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
16306 if ((ipif->ipif_flags & set) && !(ipif->ipif_flags & clear))
16307 cnt++;
16308
16309 return (cnt);
16310 }
16311
16312 /*
16313 * Return the number of migratable addresses on `ill' that are under
16314 * application control.
16315 */
16316 uint_t
16317 ill_appaddr_cnt(const ill_t *ill)
16318 {
16319 return (ill_flagaddr_cnt(ill, IPIF_DHCPRUNNING | IPIF_ADDRCONF,
16320 IPIF_NOFAILOVER));
16321 }
16322
16323 /*
16324 * Return the number of point-to-point addresses on `ill'.
16325 */
16326 uint_t
16327 ill_ptpaddr_cnt(const ill_t *ill)
16328 {
16329 return (ill_flagaddr_cnt(ill, IPIF_POINTOPOINT, 0));
16330 }
16331
16332 /* ARGSUSED */
16333 int
16334 ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16335 ip_ioctl_cmd_t *ipip, void *ifreq)
16336 {
16337 struct lifreq *lifr = ifreq;
16338
16339 ASSERT(q->q_next == NULL);
16340 ASSERT(CONN_Q(q));
16341
16342 ip1dbg(("ip_sioctl_get_lifusesrc(%s:%u %p)\n",
16343 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
16344 lifr->lifr_index = ipif->ipif_ill->ill_usesrc_ifindex;
16345 ip1dbg(("ip_sioctl_get_lifusesrc:lifr_index = %d\n", lifr->lifr_index));
16346
16347 return (0);
16348 }
16349
16350 /* Find the previous ILL in this usesrc group */
16351 static ill_t *
16352 ill_prev_usesrc(ill_t *uill)
16353 {
16354 ill_t *ill;
16355
16356 for (ill = uill->ill_usesrc_grp_next;
16357 ASSERT(ill), ill->ill_usesrc_grp_next != uill;
16358 ill = ill->ill_usesrc_grp_next)
16359 /* do nothing */;
16360 return (ill);
16361 }
16362
16363 /*
16364 * Release all members of the usesrc group. This routine is called
16365 * from ill_delete when the interface being unplumbed is the
16366 * group head.
16367 *
 * This silently clears the usesrc that ifconfig set up.
 * An alternative would be to keep that ifindex and drop packets on the
 * floor, since no source address can be selected.
 * Even if we keep the current semantics, we don't need a lock and a
 * linked list: we could walk all the ills checking whether they have an
 * ill_usesrc_ifindex matching the one being removed. The issue is how we
 * return the usesrc users (SIOCGLIFSRCOF): we want to be able to find
 * the ills whose ill_usesrc_ifindex matches a target ill. We could also
 * do that with an ill walk, but the walker would need to insert into the
 * ioctl response.
16377 */
16378 static void
16379 ill_disband_usesrc_group(ill_t *uill)
16380 {
16381 ill_t *next_ill, *tmp_ill;
16382 ip_stack_t *ipst = uill->ill_ipst;
16383
16384 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock));
16385 next_ill = uill->ill_usesrc_grp_next;
16386
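	/*
	 * Walk the circular list, unlinking each client ill and clearing
	 * its usesrc ifindex; the walk stops once we come back around to
	 * the group head, the only member with ill_usesrc_ifindex == 0.
	 */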
16387 do {
16388 ASSERT(next_ill != NULL);
16389 tmp_ill = next_ill->ill_usesrc_grp_next;
16390 ASSERT(tmp_ill != NULL);
16391 next_ill->ill_usesrc_grp_next = NULL;
16392 next_ill->ill_usesrc_ifindex = 0;
16393 next_ill = tmp_ill;
16394 } while (next_ill->ill_usesrc_ifindex != 0);
16395 uill->ill_usesrc_grp_next = NULL;
16396 }
16397
16398 /*
16399 * Remove the client usesrc ILL from the list and relink to a new list
16400 */
16401 int
16402 ill_relink_usesrc_ills(ill_t *ucill, ill_t *uill, uint_t ifindex)
16403 {
16404 ill_t *ill, *tmp_ill;
16405 ip_stack_t *ipst = ucill->ill_ipst;
16406
16407 ASSERT((ucill != NULL) && (ucill->ill_usesrc_grp_next != NULL) &&
16408 (uill != NULL) && RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock));
16409
16410 /*
	 * Fail if the usesrc client ILL passed in is already in use as a
	 * usesrc ILL (i.e., one whose source addresses are in use), or if
	 * the usesrc ILL is already in use as a usesrc client ILL.
16415 */
16416 if ((ucill->ill_usesrc_ifindex == 0) ||
16417 (uill->ill_usesrc_ifindex != 0)) {
16418 return (-1);
16419 }
16420
16421 ill = ill_prev_usesrc(ucill);
16422 ASSERT(ill->ill_usesrc_grp_next != NULL);
16423
16424 /* Remove from the current list */
16425 if (ill->ill_usesrc_grp_next->ill_usesrc_grp_next == ill) {
16426 /* Only two elements in the list */
16427 ASSERT(ill->ill_usesrc_ifindex == 0);
16428 ill->ill_usesrc_grp_next = NULL;
16429 } else {
16430 ill->ill_usesrc_grp_next = ucill->ill_usesrc_grp_next;
16431 }
16432
16433 if (ifindex == 0) {
16434 ucill->ill_usesrc_ifindex = 0;
16435 ucill->ill_usesrc_grp_next = NULL;
16436 return (0);
16437 }
16438
16439 ucill->ill_usesrc_ifindex = ifindex;
16440 tmp_ill = uill->ill_usesrc_grp_next;
16441 uill->ill_usesrc_grp_next = ucill;
16442 ucill->ill_usesrc_grp_next =
16443 (tmp_ill != NULL) ? tmp_ill : uill;
16444 return (0);
16445 }
16446
16447 /*
 * Set the ill_usesrc_ifindex and ill_usesrc_grp_next fields. See
 * synchronization notes in ip.c for locking details.
16450 */
16451 /* ARGSUSED */
16452 int
16453 ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16454 ip_ioctl_cmd_t *ipip, void *ifreq)
16455 {
16456 struct lifreq *lifr = (struct lifreq *)ifreq;
16457 boolean_t isv6 = B_FALSE, reset_flg = B_FALSE;
16458 ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill;
16459 int err = 0, ret;
16460 uint_t ifindex;
16461 ipsq_t *ipsq = NULL;
16462 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
16463
16464 ASSERT(IAM_WRITER_IPIF(ipif));
16465 ASSERT(q->q_next == NULL);
16466 ASSERT(CONN_Q(q));
16467
16468 isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6;
16469
16470 ifindex = lifr->lifr_index;
16471 if (ifindex == 0) {
16472 if (usesrc_cli_ill->ill_usesrc_grp_next == NULL) {
16473 /* non usesrc group interface, nothing to reset */
16474 return (0);
16475 }
16476 ifindex = usesrc_cli_ill->ill_usesrc_ifindex;
16477 /* valid reset request */
16478 reset_flg = B_TRUE;
16479 }
16480
16481 usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
16482 if (usesrc_ill == NULL)
16483 return (ENXIO);
16484 if (usesrc_ill == ipif->ipif_ill) {
16485 ill_refrele(usesrc_ill);
16486 return (EINVAL);
16487 }
16488
16489 ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl,
16490 NEW_OP, B_TRUE);
16491 if (ipsq == NULL) {
16492 err = EINPROGRESS;
16493 /* Operation enqueued on the ipsq of the usesrc ILL */
16494 goto done;
16495 }
16496
16497 /* USESRC isn't currently supported with IPMP */
16498 if (IS_IPMP(usesrc_ill) || IS_UNDER_IPMP(usesrc_ill)) {
16499 err = ENOTSUP;
16500 goto done;
16501 }
16502
16503 /*
16504 * USESRC isn't compatible with the STANDBY flag. (STANDBY is only
16505 * used by IPMP underlying interfaces, but someone might think it's
16506 * more general and try to use it independently with VNI.)
16507 */
16508 if (usesrc_ill->ill_phyint->phyint_flags & PHYI_STANDBY) {
16509 err = ENOTSUP;
16510 goto done;
16511 }
16512
16513 /*
16514 * If the client is already in use as a usesrc_ill or a usesrc_ill is
16515 * already a client then return EINVAL
16516 */
16517 if (IS_USESRC_ILL(usesrc_cli_ill) || IS_USESRC_CLI_ILL(usesrc_ill)) {
16518 err = EINVAL;
16519 goto done;
16520 }
16521
16522 /*
16523 * If the ill_usesrc_ifindex field is already set to what it needs to
16524 * be then this is a duplicate operation.
16525 */
16526 if (!reset_flg && usesrc_cli_ill->ill_usesrc_ifindex == ifindex) {
16527 err = 0;
16528 goto done;
16529 }
16530
16531 ip1dbg(("ip_sioctl_slifusesrc: usesrc_cli_ill %s, usesrc_ill %s,"
16532 " v6 = %d", usesrc_cli_ill->ill_name, usesrc_ill->ill_name,
16533 usesrc_ill->ill_isv6));
16534
16535 /*
	 * The global ill_g_usesrc_lock protects the ill_usesrc_grp_next
	 * and ill_usesrc_ifindex fields.
16538 */
16539 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
16540
16541 if (reset_flg) {
16542 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 0);
16543 if (ret != 0) {
16544 err = EINVAL;
16545 }
16546 rw_exit(&ipst->ips_ill_g_usesrc_lock);
16547 goto done;
16548 }
16549
16550 /*
16551 * Four possibilities to consider:
16552 * 1. Both usesrc_ill and usesrc_cli_ill are not part of any usesrc grp
16553 * 2. usesrc_ill is part of a group but usesrc_cli_ill isn't
16554 * 3. usesrc_cli_ill is part of a group but usesrc_ill isn't
16555 * 4. Both are part of their respective usesrc groups
16556 */
16557 if ((usesrc_ill->ill_usesrc_grp_next == NULL) &&
16558 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) {
16559 ASSERT(usesrc_ill->ill_usesrc_ifindex == 0);
16560 usesrc_cli_ill->ill_usesrc_ifindex = ifindex;
16561 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill;
16562 usesrc_cli_ill->ill_usesrc_grp_next = usesrc_ill;
16563 } else if ((usesrc_ill->ill_usesrc_grp_next != NULL) &&
16564 (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) {
16565 usesrc_cli_ill->ill_usesrc_ifindex = ifindex;
16566 /* Insert at head of list */
16567 usesrc_cli_ill->ill_usesrc_grp_next =
16568 usesrc_ill->ill_usesrc_grp_next;
16569 usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill;
16570 } else {
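		/*
		 * Cases 3 and 4: the client is already in a group, so have
		 * ill_relink_usesrc_ills() unlink it and relink it under
		 * the new usesrc ill.
		 */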
16571 ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill,
16572 ifindex);
16573 if (ret != 0)
16574 err = EINVAL;
16575 }
16576 rw_exit(&ipst->ips_ill_g_usesrc_lock);
16577
16578 done:
16579 if (ipsq != NULL)
16580 ipsq_exit(ipsq);
16581 /* The refrele on the lifr_name ipif is done by ip_process_ioctl */
16582 ill_refrele(usesrc_ill);
16583
16584 /* Let conn_ixa caching know that source address selection changed */
16585 ip_update_source_selection(ipst);
16586
16587 return (err);
16588 }
16589
16590 /* ARGSUSED */
16591 int
16592 ip_sioctl_get_dadstate(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16593 ip_ioctl_cmd_t *ipip, void *if_req)
16594 {
16595 struct lifreq *lifr = (struct lifreq *)if_req;
16596 ill_t *ill = ipif->ipif_ill;
16597
16598 /*
16599 * Need a lock since IFF_UP can be set even when there are
16600 * references to the ipif.
16601 */
16602 mutex_enter(&ill->ill_lock);
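	/*
	 * An ipif that is up but whose address is not yet ready is still
	 * performing DAD.
	 */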
16603 if ((ipif->ipif_flags & IPIF_UP) && ipif->ipif_addr_ready == 0)
16604 lifr->lifr_dadstate = DAD_IN_PROGRESS;
16605 else
16606 lifr->lifr_dadstate = DAD_DONE;
16607 mutex_exit(&ill->ill_lock);
16608 return (0);
16609 }
16610
16611 /*
16612 * comparison function used by avl.
16613 */
16614 static int
16615 ill_phyint_compare_index(const void *index_ptr, const void *phyip)
16616 {
16618 uint_t index;
16619
16620 ASSERT(phyip != NULL && index_ptr != NULL);
16621
16622 index = *((uint_t *)index_ptr);
16623 /*
16624 * let the phyint with the lowest index be on top.
16625 */
16626 if (((phyint_t *)phyip)->phyint_ifindex < index)
16627 return (1);
16628 if (((phyint_t *)phyip)->phyint_ifindex > index)
16629 return (-1);
16630 return (0);
16631 }
16632
16633 /*
16634 * comparison function used by avl.
16635 */
16636 static int
16637 ill_phyint_compare_name(const void *name_ptr, const void *phyip)
16638 {
16639 ill_t *ill;
16640 int res = 0;
16641
16642 ASSERT(phyip != NULL && name_ptr != NULL);
16643
16644 if (((phyint_t *)phyip)->phyint_illv4)
16645 ill = ((phyint_t *)phyip)->phyint_illv4;
16646 else
16647 ill = ((phyint_t *)phyip)->phyint_illv6;
16648 ASSERT(ill != NULL);
16649
16650 res = strcmp(ill->ill_name, (char *)name_ptr);
16651 if (res > 0)
16652 return (1);
16653 else if (res < 0)
16654 return (-1);
16655 return (0);
16656 }
16657
16658 /*
16659 * This function is called on the unplumb path via ill_glist_delete() when
16660 * there are no ills left on the phyint and thus the phyint can be freed.
16661 */
16662 static void
16663 phyint_free(phyint_t *phyi)
16664 {
16665 ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
16666
16667 ASSERT(phyi->phyint_illv4 == NULL && phyi->phyint_illv6 == NULL);
16668
16669 /*
16670 * If this phyint was an IPMP meta-interface, blow away the group.
16671 * This is safe to do because all of the illgrps have already been
16672 * removed by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find us.
16673 * If we're cleaning up as a result of failed initialization,
16674 * phyint_grp may be NULL.
16675 */
16676 if ((phyi->phyint_flags & PHYI_IPMP) && (phyi->phyint_grp != NULL)) {
16677 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
16678 ipmp_grp_destroy(phyi->phyint_grp);
16679 phyi->phyint_grp = NULL;
16680 rw_exit(&ipst->ips_ipmp_lock);
16681 }
16682
16683 /*
16684 * If this interface was under IPMP, take it out of the group.
16685 */
16686 if (phyi->phyint_grp != NULL)
16687 ipmp_phyint_leave_grp(phyi);
16688
16689 /*
16690 * Delete the phyint and disassociate its ipsq. The ipsq itself
16691 * will be freed in ipsq_exit().
16692 */
16693 phyi->phyint_ipsq->ipsq_phyint = NULL;
16694 phyi->phyint_name[0] = '\0';
16695
16696 mi_free(phyi);
16697 }
16698
16699 /*
 * Attach the ill to the phyint structure, which can be shared by both
 * the IPv4 and IPv6 ill. ill_init allocates a phyint just to hold flags.
 * This function is called from ipif_set_values and ill_lookup_on_name
 * (for loopback), where we know the name of the ill. We look up the ill
 * and, if one is already present with that name, use its phyint.
 * Otherwise we reuse the one allocated by ill_init.
16706 */
16707 static void
16708 ill_phyint_reinit(ill_t *ill)
16709 {
16710 boolean_t isv6 = ill->ill_isv6;
16711 phyint_t *phyi_old;
16712 phyint_t *phyi;
16713 avl_index_t where = 0;
16714 ill_t *ill_other = NULL;
16715 ip_stack_t *ipst = ill->ill_ipst;
16716
16717 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
16718
16719 phyi_old = ill->ill_phyint;
16720 ASSERT(isv6 || (phyi_old->phyint_illv4 == ill &&
16721 phyi_old->phyint_illv6 == NULL));
16722 ASSERT(!isv6 || (phyi_old->phyint_illv6 == ill &&
16723 phyi_old->phyint_illv4 == NULL));
16724 ASSERT(phyi_old->phyint_ifindex == 0);
16725
16726 /*
16727 * Now that our ill has a name, set it in the phyint.
16728 */
16729 (void) strlcpy(ill->ill_phyint->phyint_name, ill->ill_name, LIFNAMSIZ);
16730
16731 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
16732 ill->ill_name, &where);
16733
16734 /*
16735 * 1. We grabbed the ill_g_lock before inserting this ill into
16736 * the global list of ills. So no other thread could have located
16737 * this ill and hence the ipsq of this ill is guaranteed to be empty.
16738 * 2. Now locate the other protocol instance of this ill.
16739 * 3. Now grab both ill locks in the right order, and the phyint lock of
16740 * the new ipsq. Holding ill locks + ill_g_lock ensures that the ipsq
16741 * of neither ill can change.
16742 * 4. Merge the phyint and thus the ipsq as well of this ill onto the
16743 * other ill.
16744 * 5. Release all locks.
16745 */
16746
16747 /*
16748 * Look for IPv4 if we are initializing IPv6 or look for IPv6 if
16749 * we are initializing IPv4.
16750 */
16751 if (phyi != NULL) {
16752 ill_other = (isv6) ? phyi->phyint_illv4 : phyi->phyint_illv6;
16753 ASSERT(ill_other->ill_phyint != NULL);
16754 ASSERT((isv6 && !ill_other->ill_isv6) ||
16755 (!isv6 && ill_other->ill_isv6));
16756 GRAB_ILL_LOCKS(ill, ill_other);
16757 /*
16758 * We are potentially throwing away phyint_flags which
16759 * could be different from the one that we obtain from
16760 * ill_other->ill_phyint. But it is okay as we are assuming
16761 * that the state maintained within IP is correct.
16762 */
16763 mutex_enter(&phyi->phyint_lock);
16764 if (isv6) {
16765 ASSERT(phyi->phyint_illv6 == NULL);
16766 phyi->phyint_illv6 = ill;
16767 } else {
16768 ASSERT(phyi->phyint_illv4 == NULL);
16769 phyi->phyint_illv4 = ill;
16770 }
16771
16772 /*
16773 * Delete the old phyint and make its ipsq eligible
16774 * to be freed in ipsq_exit().
16775 */
16776 phyi_old->phyint_illv4 = NULL;
16777 phyi_old->phyint_illv6 = NULL;
16778 phyi_old->phyint_ipsq->ipsq_phyint = NULL;
16779 phyi_old->phyint_name[0] = '\0';
16780 mi_free(phyi_old);
16781 } else {
16782 mutex_enter(&ill->ill_lock);
16783 /*
16784 * We don't need to acquire any lock, since
16785 * the ill is not yet visible globally and we
16786 * have not yet released the ill_g_lock.
16787 */
16788 phyi = phyi_old;
16789 mutex_enter(&phyi->phyint_lock);
16790 /* XXX We need a recovery strategy here. */
16791 if (!phyint_assign_ifindex(phyi, ipst))
16792 cmn_err(CE_PANIC, "phyint_assign_ifindex() failed");
16793
16794 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
16795 (void *)phyi, where);
16796
16797 (void) avl_find(&ipst->ips_phyint_g_list->
16798 phyint_list_avl_by_index,
16799 &phyi->phyint_ifindex, &where);
16800 avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
16801 (void *)phyi, where);
16802 }
16803
16804 /*
	 * Reassigning ill_phyint automatically reassigns the ipsq as well.
	 * The pending mp is not affected because it is kept on a per-ill
	 * basis.
16807 */
16808 ill->ill_phyint = phyi;
16809
16810 /*
	 * Now that the phyint's ifindex has been assigned, complete the
	 * remaining ifindex-dependent initialization: the MIB ifindex
	 * values and the multicast protocol version.
16813 */
16814 ill->ill_ip_mib->ipIfStatsIfIndex = ill->ill_phyint->phyint_ifindex;
16815 if (ill->ill_isv6) {
16816 ill->ill_icmp6_mib->ipv6IfIcmpIfIndex =
16817 ill->ill_phyint->phyint_ifindex;
16818 ill->ill_mcast_type = ipst->ips_mld_max_version;
16819 } else {
16820 ill->ill_mcast_type = ipst->ips_igmp_max_version;
16821 }
16822
16823 /*
16824 * Generate an event within the hooks framework to indicate that
16825 * a new interface has just been added to IP. For this event to
16826 * be generated, the network interface must, at least, have an
16827 * ifindex assigned to it. (We don't generate the event for
16828 * loopback since ill_lookup_on_name() has its own NE_PLUMB event.)
16829 *
16830 * This needs to be run inside the ill_g_lock perimeter to ensure
16831 * that the ordering of delivered events to listeners matches the
16832 * order of them in the kernel.
16833 */
16834 if (!IS_LOOPBACK(ill)) {
16835 ill_nic_event_dispatch(ill, 0, NE_PLUMB, ill->ill_name,
16836 ill->ill_name_length);
16837 }
16838 RELEASE_ILL_LOCKS(ill, ill_other);
16839 mutex_exit(&phyi->phyint_lock);
16840 }
16841
16842 /*
16843 * Notify any downstream modules of the name of this interface.
16844 * An M_IOCTL is used even though we don't expect a successful reply.
16845 * Any reply message from the driver (presumably an M_IOCNAK) will
16846 * eventually get discarded somewhere upstream. The message format is
16847 * simply an SIOCSLIFNAME ioctl just as might be sent from ifconfig
16848 * to IP.
16849 */
16850 static void
16851 ip_ifname_notify(ill_t *ill, queue_t *q)
16852 {
16853 mblk_t *mp1, *mp2;
16854 struct iocblk *iocp;
16855 struct lifreq *lifr;
16856
16857 mp1 = mkiocb(SIOCSLIFNAME);
16858 if (mp1 == NULL)
16859 return;
16860 mp2 = allocb(sizeof (struct lifreq), BPRI_HI);
16861 if (mp2 == NULL) {
16862 freeb(mp1);
16863 return;
16864 }
16865
16866 mp1->b_cont = mp2;
16867 iocp = (struct iocblk *)mp1->b_rptr;
16868 iocp->ioc_count = sizeof (struct lifreq);
16869
16870 lifr = (struct lifreq *)mp2->b_rptr;
16871 mp2->b_wptr += sizeof (struct lifreq);
16872 bzero(lifr, sizeof (struct lifreq));
16873
16874 (void) strncpy(lifr->lifr_name, ill->ill_name, LIFNAMSIZ);
16875 lifr->lifr_ppa = ill->ill_ppa;
16876 lifr->lifr_flags = (ill->ill_flags & (ILLF_IPV4|ILLF_IPV6));
16877
16878 DTRACE_PROBE3(ill__dlpi, char *, "ip_ifname_notify",
16879 char *, "SIOCSLIFNAME", ill_t *, ill);
16880 putnext(q, mp1);
16881 }
16882
16883 static int
16884 ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
16885 {
16886 int err;
16887 ip_stack_t *ipst = ill->ill_ipst;
16888 phyint_t *phyi = ill->ill_phyint;
16889
16890 /*
16891 * Now that ill_name is set, the configuration for the IPMP
16892 * meta-interface can be performed.
16893 */
16894 if (IS_IPMP(ill)) {
16895 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
16896 /*
16897 * If phyi->phyint_grp is NULL, then this is the first IPMP
16898 * meta-interface and we need to create the IPMP group.
16899 */
16900 if (phyi->phyint_grp == NULL) {
16901 /*
16902 * If someone has renamed another IPMP group to have
16903 * the same name as our interface, bail.
16904 */
16905 if (ipmp_grp_lookup(ill->ill_name, ipst) != NULL) {
16906 rw_exit(&ipst->ips_ipmp_lock);
16907 return (EEXIST);
16908 }
16909 phyi->phyint_grp = ipmp_grp_create(ill->ill_name, phyi);
16910 if (phyi->phyint_grp == NULL) {
16911 rw_exit(&ipst->ips_ipmp_lock);
16912 return (ENOMEM);
16913 }
16914 }
16915 rw_exit(&ipst->ips_ipmp_lock);
16916 }
16917
16918 /* Tell downstream modules where they are. */
16919 ip_ifname_notify(ill, q);
16920
16921 /*
16922 * ill_dl_phys returns EINPROGRESS in the usual case.
16923 * Error cases are ENOMEM ...
16924 */
16925 err = ill_dl_phys(ill, ipif, mp, q);
16926
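	/*
	 * Make sure the per-stack MLD (IPv6) or IGMP (IPv4) slow timeout
	 * is running; it drives periodic multicast protocol housekeeping.
	 */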
16927 if (ill->ill_isv6) {
16928 mutex_enter(&ipst->ips_mld_slowtimeout_lock);
16929 if (ipst->ips_mld_slowtimeout_id == 0) {
16930 ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo,
16931 (void *)ipst,
16932 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
16933 }
16934 mutex_exit(&ipst->ips_mld_slowtimeout_lock);
16935 } else {
16936 mutex_enter(&ipst->ips_igmp_slowtimeout_lock);
16937 if (ipst->ips_igmp_slowtimeout_id == 0) {
16938 ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo,
16939 (void *)ipst,
16940 MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
16941 }
16942 mutex_exit(&ipst->ips_igmp_slowtimeout_lock);
16943 }
16944
16945 return (err);
16946 }
16947
16948 /*
 * Common routine for ppa and ifname setting. Must be called exclusively
 * (as writer).
 *
 * Returns EINPROGRESS when mp has been consumed by queueing it on
 * ipx_pending_mp and the ioctl will complete in ip_rput.
 *
 * NOTE: If ppa is UINT_MAX, we assign the next valid ppa and return
 * the new name and new ppa in lifr_name and lifr_ppa respectively.
 * For SLIFNAME, we pass these values back to userland.
16957 */
16958 static int
16959 ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr)
16960 {
16961 ill_t *ill;
16962 ipif_t *ipif;
16963 ipsq_t *ipsq;
16964 char *ppa_ptr;
16965 char *old_ptr;
16966 char old_char;
16967 int error;
16968 ip_stack_t *ipst;
16969
16970 ip1dbg(("ipif_set_values: interface %s\n", interf_name));
16971 ASSERT(q->q_next != NULL);
16972 ASSERT(interf_name != NULL);
16973
16974 ill = (ill_t *)q->q_ptr;
16975 ipst = ill->ill_ipst;
16976
16977 ASSERT(ill->ill_ipst != NULL);
16978 ASSERT(ill->ill_name[0] == '\0');
16979 ASSERT(IAM_WRITER_ILL(ill));
16980 ASSERT((mi_strlen(interf_name) + 1) <= LIFNAMSIZ);
16981 ASSERT(ill->ill_ppa == UINT_MAX);
16982
16983 ill->ill_defend_start = ill->ill_defend_count = 0;
16984 /* The ppa is sent down by ifconfig or is chosen */
16985 if ((ppa_ptr = ill_get_ppa_ptr(interf_name)) == NULL) {
16986 return (EINVAL);
16987 }
16988
16989 /*
	 * Make sure the ppa passed in is the same as the ppa in the name.
	 * This check is not made when ppa == UINT_MAX; in that case the
	 * ppa in the name could be anything, and the system will choose a
	 * ppa and update new_ppa_ptr and interf_name to contain the
	 * chosen ppa.
16994 */
16995 if (*new_ppa_ptr != UINT_MAX) {
16996 /* stoi changes the pointer */
16997 old_ptr = ppa_ptr;
16998 /*
16999 * ifconfig passed in 0 for the ppa for DLPI 1 style devices
17000 * (they don't have an externally visible ppa). We assign one
17001 * here so that we can manage the interface. Note that in
17002 * the past this value was always 0 for DLPI 1 drivers.
17003 */
17004 if (*new_ppa_ptr == 0)
17005 *new_ppa_ptr = stoi(&old_ptr);
17006 else if (*new_ppa_ptr != (uint_t)stoi(&old_ptr))
17007 return (EINVAL);
17008 }
17009 /*
	 * Terminate the string before the ppa, saving the character at
	 * that location so it can be restored afterwards.
17012 */
17013 old_char = ppa_ptr[0];
17014 ppa_ptr[0] = '\0';
17015
17016 ill->ill_ppa = *new_ppa_ptr;
17017 /*
17018 * Finish as much work now as possible before calling ill_glist_insert
17019 * which makes the ill globally visible and also merges it with the
17020 * other protocol instance of this phyint. The remaining work is
17021 * done after entering the ipsq which may happen sometime later.
17022 */
17023 ipif = ill->ill_ipif;
17024
17025 /* We didn't do this when we allocated ipif in ip_ll_subnet_defaults */
17026 ipif_assign_seqid(ipif);
17027
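	/* If neither address family flag was passed down, default to IPv4. */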
17028 if (!(ill->ill_flags & (ILLF_IPV4|ILLF_IPV6)))
17029 ill->ill_flags |= ILLF_IPV4;
17030
17031 ASSERT(ipif->ipif_next == NULL); /* Only one ipif on ill */
17032 ASSERT((ipif->ipif_flags & IPIF_UP) == 0);
17033
17034 if (ill->ill_flags & ILLF_IPV6) {
17035
17036 ill->ill_isv6 = B_TRUE;
17037 ill_set_inputfn(ill);
17038 if (ill->ill_rq != NULL) {
17039 ill->ill_rq->q_qinfo = &iprinitv6;
17040 }
17041
17042 /* Keep the !IN6_IS_ADDR_V4MAPPED assertions happy */
17043 ipif->ipif_v6lcl_addr = ipv6_all_zeros;
17044 ipif->ipif_v6subnet = ipv6_all_zeros;
17045 ipif->ipif_v6net_mask = ipv6_all_zeros;
17046 ipif->ipif_v6brd_addr = ipv6_all_zeros;
17047 ipif->ipif_v6pp_dst_addr = ipv6_all_zeros;
17048 ill->ill_reachable_retrans_time = ND_RETRANS_TIMER;
17049 /*
		 * Point-to-point or non-multicast-capable
17051 * interfaces won't do NUD unless explicitly
17052 * configured to do so.
17053 */
17054 if (ipif->ipif_flags & IPIF_POINTOPOINT ||
17055 !(ill->ill_flags & ILLF_MULTICAST)) {
17056 ill->ill_flags |= ILLF_NONUD;
17057 }
17058 /* Make sure IPv4 specific flag is not set on IPv6 if */
17059 if (ill->ill_flags & ILLF_NOARP) {
17060 /*
17061 * Note: xresolv interfaces will eventually need
17062 * NOARP set here as well, but that will require
17063 * those external resolvers to have some
17064 * knowledge of that flag and act appropriately.
17065 * Not to be changed at present.
17066 */
17067 ill->ill_flags &= ~ILLF_NOARP;
17068 }
17069 /*
17070 * Set the ILLF_ROUTER flag according to the global
17071 * IPv6 forwarding policy.
17072 */
17073 if (ipst->ips_ipv6_forwarding != 0)
17074 ill->ill_flags |= ILLF_ROUTER;
17075 } else if (ill->ill_flags & ILLF_IPV4) {
17076 ill->ill_isv6 = B_FALSE;
17077 ill_set_inputfn(ill);
17078 ill->ill_reachable_retrans_time = ARP_RETRANS_TIMER;
17079 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6lcl_addr);
17080 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6subnet);
17081 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6net_mask);
17082 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6brd_addr);
17083 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6pp_dst_addr);
17084 /*
17085 * Set the ILLF_ROUTER flag according to the global
17086 * IPv4 forwarding policy.
17087 */
17088 if (ipst->ips_ip_forwarding != 0)
17089 ill->ill_flags |= ILLF_ROUTER;
17090 }
17091
17092 ASSERT(ill->ill_phyint != NULL);
17093
17094 /*
17095 * The ipIfStatsIfindex and ipv6IfIcmpIfIndex assignments will
17096 * be completed in ill_glist_insert -> ill_phyint_reinit
17097 */
17098 if (!ill_allocate_mibs(ill))
17099 return (ENOMEM);
17100
17101 /*
17102 * Pick a default sap until we get the DL_INFO_ACK back from
17103 * the driver.
17104 */
17105 ill->ill_sap = (ill->ill_isv6) ? ill->ill_media->ip_m_ipv6sap :
17106 ill->ill_media->ip_m_ipv4sap;
17107
17108 ill->ill_ifname_pending = 1;
17109 ill->ill_ifname_pending_err = 0;
17110
17111 /*
17112 * When the first ipif comes up in ipif_up_done(), multicast groups
17113 * that were joined while this ill was not bound to the DLPI link need
17114 * to be recovered by ill_recover_multicast().
17115 */
17116 ill->ill_need_recover_multicast = 1;
17117
17118 ill_refhold(ill);
17119 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
17120 if ((error = ill_glist_insert(ill, interf_name,
17121 (ill->ill_flags & ILLF_IPV6) == ILLF_IPV6)) > 0) {
17122 ill->ill_ppa = UINT_MAX;
17123 ill->ill_name[0] = '\0';
17124 /*
17125 * undo null termination done above.
17126 */
17127 ppa_ptr[0] = old_char;
17128 rw_exit(&ipst->ips_ill_g_lock);
17129 ill_refrele(ill);
17130 return (error);
17131 }
17132
17133 ASSERT(ill->ill_name_length <= LIFNAMSIZ);
17134
17135 /*
	 * When we return, the buffer pointed to by interf_name should
	 * contain the same name as in ill_name.
	 * If a ppa was chosen by the system (the ppa passed in was
	 * UINT_MAX), the buffer pointed to by new_ppa_ptr would not
	 * contain the right ppa, so copy the full name and update the ppa
	 * pointer. When the ppa passed in != UINT_MAX, all values are
	 * correct; just undo the null termination, which saves a bcopy.
17143 */
17144 if (*new_ppa_ptr == UINT_MAX) {
17145 bcopy(ill->ill_name, interf_name, ill->ill_name_length);
17146 *new_ppa_ptr = ill->ill_ppa;
17147 } else {
17148 /*
17149 * undo null termination done above.
17150 */
17151 ppa_ptr[0] = old_char;
17152 }
17153
17154 /* Let SCTP know about this ILL */
17155 sctp_update_ill(ill, SCTP_ILL_INSERT);
17156
17157 /*
17158 * ill_glist_insert has made the ill visible globally, and
17159 * ill_phyint_reinit could have changed the ipsq. At this point,
17160 * we need to hold the ips_ill_g_lock across the call to enter the
17161 * ipsq to enforce atomicity and prevent reordering. In the event
17162 * the ipsq has changed, and if the new ipsq is currently busy,
17163 * we need to make sure that this half-completed ioctl is ahead of
17164 * any subsequent ioctl. We achieve this by not dropping the
17165 * ips_ill_g_lock which prevents any ill lookup itself thereby
17166 * ensuring that new ioctls can't start.
17167 */
17168 ipsq = ipsq_try_enter_internal(ill, q, mp, ip_reprocess_ioctl, NEW_OP,
17169 B_TRUE);
17170
17171 rw_exit(&ipst->ips_ill_g_lock);
17172 ill_refrele(ill);
17173 if (ipsq == NULL)
17174 return (EINPROGRESS);
17175
17176 /*
17177 * If ill_phyint_reinit() changed our ipsq, then start on the new ipsq.
17178 */
17179 if (ipsq->ipsq_xop->ipx_current_ipif == NULL)
17180 ipsq_current_start(ipsq, ipif, SIOCSLIFNAME);
17181 else
17182 ASSERT(ipsq->ipsq_xop->ipx_current_ipif == ipif);
17183
17184 error = ipif_set_values_tail(ill, ipif, mp, q);
17185 ipsq_exit(ipsq);
17186 if (error != 0 && error != EINPROGRESS) {
17187 /*
17188 * restore previous values
17189 */
17190 ill->ill_isv6 = B_FALSE;
17191 ill_set_inputfn(ill);
17192 }
17193 return (error);
17194 }
17195
17196 void
17197 ipif_init(ip_stack_t *ipst)
17198 {
17199 int i;
17200
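	/*
	 * Initialize each ill list head as an empty circular list by
	 * pointing both head and tail back at the head structure itself.
	 */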
17201 for (i = 0; i < MAX_G_HEADS; i++) {
17202 ipst->ips_ill_g_heads[i].ill_g_list_head =
17203 (ill_if_t *)&ipst->ips_ill_g_heads[i];
17204 ipst->ips_ill_g_heads[i].ill_g_list_tail =
17205 (ill_if_t *)&ipst->ips_ill_g_heads[i];
17206 }
17207
17208 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
17209 ill_phyint_compare_index,
17210 sizeof (phyint_t),
17211 offsetof(struct phyint, phyint_avl_by_index));
17212 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
17213 ill_phyint_compare_name,
17214 sizeof (phyint_t),
17215 offsetof(struct phyint, phyint_avl_by_name));
17216 }
17217
17218 /*
17219 * Save enough information so that we can recreate the IRE if
17220 * the interface goes down and then up.
17221 */
17222 void
17223 ill_save_ire(ill_t *ill, ire_t *ire)
17224 {
17225 mblk_t *save_mp;
17226
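	/*
	 * Each saved route is stored as an ifrt_t inside an mblk; the
	 * mblks are chained off ill_saved_ire_mp via b_cont.
	 */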
17227 save_mp = allocb(sizeof (ifrt_t), BPRI_MED);
17228 if (save_mp != NULL) {
17229 ifrt_t *ifrt;
17230
17231 save_mp->b_wptr += sizeof (ifrt_t);
17232 ifrt = (ifrt_t *)save_mp->b_rptr;
17233 bzero(ifrt, sizeof (ifrt_t));
17234 ifrt->ifrt_type = ire->ire_type;
17235 if (ire->ire_ipversion == IPV4_VERSION) {
17236 ASSERT(!ill->ill_isv6);
17237 ifrt->ifrt_addr = ire->ire_addr;
17238 ifrt->ifrt_gateway_addr = ire->ire_gateway_addr;
17239 ifrt->ifrt_setsrc_addr = ire->ire_setsrc_addr;
17240 ifrt->ifrt_mask = ire->ire_mask;
17241 } else {
17242 ASSERT(ill->ill_isv6);
17243 ifrt->ifrt_v6addr = ire->ire_addr_v6;
17244 /* ire_gateway_addr_v6 can change due to RTM_CHANGE */
17245 mutex_enter(&ire->ire_lock);
17246 ifrt->ifrt_v6gateway_addr = ire->ire_gateway_addr_v6;
17247 mutex_exit(&ire->ire_lock);
17248 ifrt->ifrt_v6setsrc_addr = ire->ire_setsrc_addr_v6;
17249 ifrt->ifrt_v6mask = ire->ire_mask_v6;
17250 }
17251 ifrt->ifrt_flags = ire->ire_flags;
17252 ifrt->ifrt_zoneid = ire->ire_zoneid;
17253 mutex_enter(&ill->ill_saved_ire_lock);
17254 save_mp->b_cont = ill->ill_saved_ire_mp;
17255 ill->ill_saved_ire_mp = save_mp;
17256 ill->ill_saved_ire_cnt++;
17257 mutex_exit(&ill->ill_saved_ire_lock);
17258 }
17259 }
17260
17261 /*
17262 * Remove one entry from ill_saved_ire_mp.
17263 */
17264 void
17265 ill_remove_saved_ire(ill_t *ill, ire_t *ire)
17266 {
17267 mblk_t **mpp;
17268 mblk_t *mp;
17269 ifrt_t *ifrt;
17270
17271 /* Remove from ill_saved_ire_mp list if it is there */
17272 mutex_enter(&ill->ill_saved_ire_lock);
17273 for (mpp = &ill->ill_saved_ire_mp; *mpp != NULL;
17274 mpp = &(*mpp)->b_cont) {
17275 in6_addr_t gw_addr_v6;
17276
17277 /*
17278 * On a given ill, the tuple of address, gateway, mask,
17279 * ire_type, and zoneid is unique for each saved IRE.
17280 */
17281 mp = *mpp;
17282 ifrt = (ifrt_t *)mp->b_rptr;
17283 /* ire_gateway_addr_v6 can change - need lock */
17284 mutex_enter(&ire->ire_lock);
17285 gw_addr_v6 = ire->ire_gateway_addr_v6;
17286 mutex_exit(&ire->ire_lock);
17287
17288 if (ifrt->ifrt_zoneid != ire->ire_zoneid ||
17289 ifrt->ifrt_type != ire->ire_type)
17290 continue;
17291
17292 if (ill->ill_isv6 ?
17293 (IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6addr,
17294 &ire->ire_addr_v6) &&
17295 IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6gateway_addr,
17296 &gw_addr_v6) &&
17297 IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6mask,
17298 &ire->ire_mask_v6)) :
17299 (ifrt->ifrt_addr == ire->ire_addr &&
17300 ifrt->ifrt_gateway_addr == ire->ire_gateway_addr &&
17301 ifrt->ifrt_mask == ire->ire_mask)) {
17302 *mpp = mp->b_cont;
17303 ill->ill_saved_ire_cnt--;
17304 freeb(mp);
17305 break;
17306 }
17307 }
17308 mutex_exit(&ill->ill_saved_ire_lock);
17309 }
17310
17311 /*
17312 * IP multirouting broadcast routes handling
17313 * Append CGTP broadcast IREs to regular ones created
17314 * at ifconfig time.
17315 * The usage is a route add <cgtp_bc> <nic_bc> -multirt i.e., both
17316 * the destination and the gateway are broadcast addresses.
17317 * The caller has verified that the destination is an IRE_BROADCAST and that
17318 * RTF_MULTIRT was set. Here if the gateway is a broadcast address, then
17319 * we create a MULTIRT IRE_BROADCAST.
17320 * Note that the IRE_HOST created by ire_rt_add doesn't get found by anything
17321 * since the IRE_BROADCAST takes precedence; ire_add_v4 does head insertion.
17322 */
17323 static void
17324 ip_cgtp_bcast_add(ire_t *ire, ip_stack_t *ipst)
17325 {
17326 ire_t *ire_prim;
17327
17328 ASSERT(ire != NULL);
17329
17330 ire_prim = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0,
17331 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst,
17332 NULL);
17333 if (ire_prim != NULL) {
17334 /*
17335 * We are in the special case of broadcasts for
17336 * CGTP. We add an IRE_BROADCAST that holds
17337 * the RTF_MULTIRT flag, the destination
17338 * address and the low level
17339 * info of ire_prim. In other words, CGTP
17340 * broadcast is added to the redundant ipif.
17341 */
17342 ill_t *ill_prim;
17343 ire_t *bcast_ire;
17344
17345 ill_prim = ire_prim->ire_ill;
17346
17347 ip2dbg(("ip_cgtp_filter_bcast_add: ire_prim %p, ill_prim %p\n",
17348 (void *)ire_prim, (void *)ill_prim));
17349
17350 bcast_ire = ire_create(
17351 (uchar_t *)&ire->ire_addr,
17352 (uchar_t *)&ip_g_all_ones,
17353 (uchar_t *)&ire->ire_gateway_addr,
17354 IRE_BROADCAST,
17355 ill_prim,
17356 GLOBAL_ZONEID, /* CGTP is only for the global zone */
17357 ire->ire_flags | RTF_KERNEL,
17358 NULL,
17359 ipst);
17360
17361 /*
17362 * Here we assume that ire_add does head insertion so that
17363 * the added IRE_BROADCAST comes before the existing IRE_HOST.
17364 */
17365 if (bcast_ire != NULL) {
17366 if (ire->ire_flags & RTF_SETSRC) {
17367 bcast_ire->ire_setsrc_addr =
17368 ire->ire_setsrc_addr;
17369 }
17370 bcast_ire = ire_add(bcast_ire);
17371 if (bcast_ire != NULL) {
17372 ip2dbg(("ip_cgtp_filter_bcast_add: "
17373 "added bcast_ire %p\n",
17374 (void *)bcast_ire));
17375
17376 ill_save_ire(ill_prim, bcast_ire);
17377 ire_refrele(bcast_ire);
17378 }
17379 }
17380 ire_refrele(ire_prim);
17381 }
17382 }
17383
17384 /*
17385 * IP multirouting broadcast routes handling
17386 * Remove the broadcast ire.
17387 * The usage is a route delete <cgtp_bc> <nic_bc> -multirt i.e., both
17388 * the destination and the gateway are broadcast addresses.
17389 * The caller has only verified that RTF_MULTIRT was set. We check
17390 * that the destination is broadcast and that the gateway is a broadcast
17391 * address, and if so delete the IRE added by ip_cgtp_bcast_add().
17392 */
17393 static void
17394 ip_cgtp_bcast_delete(ire_t *ire, ip_stack_t *ipst)
17395 {
17396 ASSERT(ire != NULL);
17397
17398 if (ip_type_v4(ire->ire_addr, ipst) == IRE_BROADCAST) {
17399 ire_t *ire_prim;
17400
17401 ire_prim = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0,
17402 IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0,
17403 ipst, NULL);
17404 if (ire_prim != NULL) {
17405 ill_t *ill_prim;
17406 ire_t *bcast_ire;
17407
17408 ill_prim = ire_prim->ire_ill;
17409
17410 ip2dbg(("ip_cgtp_filter_bcast_delete: "
17411 "ire_prim %p, ill_prim %p\n",
17412 (void *)ire_prim, (void *)ill_prim));
17413
17414 bcast_ire = ire_ftable_lookup_v4(ire->ire_addr, 0,
17415 ire->ire_gateway_addr, IRE_BROADCAST,
17416 ill_prim, ALL_ZONES, NULL,
17417 MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_ILL |
17418 MATCH_IRE_MASK, 0, ipst, NULL);
17419
17420 if (bcast_ire != NULL) {
17421 ip2dbg(("ip_cgtp_filter_bcast_delete: "
17422 "looked up bcast_ire %p\n",
17423 (void *)bcast_ire));
17424 ill_remove_saved_ire(bcast_ire->ire_ill,
17425 bcast_ire);
17426 ire_delete(bcast_ire);
17427 ire_refrele(bcast_ire);
17428 }
17429 ire_refrele(ire_prim);
17430 }
17431 }
17432 }
17433
17434 /*
17435 * Derive an interface id from the link layer address.
17436 * Knows about IEEE 802 and IEEE EUI-64 mappings.
17437 */
17438 static void
17439 ip_ether_v6intfid(ill_t *ill, in6_addr_t *v6addr)
17440 {
17441 char *addr;
17442
17443 /*
17444 * Note that some IPv6 interfaces get plumbed over links that claim to
17445 * be DL_ETHER, but don't actually have Ethernet MAC addresses (e.g.
17446 * PPP links). The ETHERADDRL check here ensures that we only set the
17447 * interface ID on IPv6 interfaces above links that actually have real
17448 * Ethernet addresses.
17449 */
17450 if (ill->ill_phys_addr_length == ETHERADDRL) {
17451 /* Form EUI-64 like address */
17452 addr = (char *)&v6addr->s6_addr32[2];
17453 bcopy(ill->ill_phys_addr, addr, 3);
17454 addr[0] ^= 0x2; /* Toggle Universal/Local bit */
17455 addr[3] = (char)0xff;
17456 addr[4] = (char)0xfe;
17457 bcopy(ill->ill_phys_addr + 3, addr + 5, 3);
17458 }
17459 }
17460
17461 /* ARGSUSED */
17462 static void
17463 ip_nodef_v6intfid(ill_t *ill, in6_addr_t *v6addr)
17464 {
17465 }
17466
17467 typedef struct ipmp_ifcookie {
17468 uint32_t ic_hostid;
17469 char ic_ifname[LIFNAMSIZ];
17470 char ic_zonename[ZONENAME_MAX];
17471 } ipmp_ifcookie_t;
17472
17473 /*
17474 * Construct a pseudo-random interface ID for the IPMP interface that's both
17475 * predictable and (almost) guaranteed to be unique.
17476 */
17477 static void
17478 ip_ipmp_v6intfid(ill_t *ill, in6_addr_t *v6addr)
17479 {
17480 zone_t *zp;
17481 uint8_t *addr;
17482 uchar_t hash[16];
17483 ulong_t hostid;
17484 MD5_CTX ctx;
17485 ipmp_ifcookie_t ic = { 0 };
17486
17487 ASSERT(IS_IPMP(ill));
17488
17489 (void) ddi_strtoul(hw_serial, NULL, 10, &hostid);
17490 ic.ic_hostid = htonl((uint32_t)hostid);
17491
17492 (void) strlcpy(ic.ic_ifname, ill->ill_name, LIFNAMSIZ);
17493
17494 if ((zp = zone_find_by_id(ill->ill_zoneid)) != NULL) {
17495 (void) strlcpy(ic.ic_zonename, zp->zone_name, ZONENAME_MAX);
17496 zone_rele(zp);
17497 }
17498
17499 MD5Init(&ctx);
17500 MD5Update(&ctx, &ic, sizeof (ic));
17501 MD5Final(hash, &ctx);
17502
17503 /*
17504 * Map the hash to an interface ID per the basic approach in RFC3041.
17505 */
17506 addr = &v6addr->s6_addr8[8];
17507 bcopy(hash + 8, addr, sizeof (uint64_t));
	addr[0] &= ~0x2;	/* clear u/l bit: locally administered ID */
17509 }
17510
17511 /*
17512 * Map the multicast in6_addr_t in m_ip6addr to the physaddr for ethernet.
17513 */
17514 static void
17515 ip_ether_v6_mapping(ill_t *ill, uchar_t *m_ip6addr, uchar_t *m_physaddr)
17516 {
17517 phyint_t *phyi = ill->ill_phyint;
17518
17519 /*
17520 * Check PHYI_MULTI_BCAST and length of physical
17521 * address to determine if we use the mapping or the
17522 * broadcast address.
17523 */
17524 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 ||
17525 ill->ill_phys_addr_length != ETHERADDRL) {
17526 ip_mbcast_mapping(ill, m_ip6addr, m_physaddr);
17527 return;
17528 }
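	/* RFC 2464: 33:33 followed by the low-order 32 bits of the group */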
17529 m_physaddr[0] = 0x33;
17530 m_physaddr[1] = 0x33;
17531 m_physaddr[2] = m_ip6addr[12];
17532 m_physaddr[3] = m_ip6addr[13];
17533 m_physaddr[4] = m_ip6addr[14];
17534 m_physaddr[5] = m_ip6addr[15];
17535 }
17536
17537 /*
17538 * Map the multicast ipaddr_t in m_ipaddr to the physaddr for ethernet.
17539 */
17540 static void
17541 ip_ether_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr)
17542 {
17543 phyint_t *phyi = ill->ill_phyint;
17544
17545 /*
17546 * Check PHYI_MULTI_BCAST and length of physical
17547 * address to determine if we use the mapping or the
17548 * broadcast address.
17549 */
17550 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 ||
17551 ill->ill_phys_addr_length != ETHERADDRL) {
17552 ip_mbcast_mapping(ill, m_ipaddr, m_physaddr);
17553 return;
17554 }
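	/* RFC 1112: 01:00:5e plus the low-order 23 bits of the group */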
17555 m_physaddr[0] = 0x01;
17556 m_physaddr[1] = 0x00;
17557 m_physaddr[2] = 0x5e;
17558 m_physaddr[3] = m_ipaddr[1] & 0x7f;
17559 m_physaddr[4] = m_ipaddr[2];
17560 m_physaddr[5] = m_ipaddr[3];
17561 }
17562
17563 /* ARGSUSED */
17564 static void
17565 ip_mbcast_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr)
17566 {
17567 /*
17568 * for the MULTI_BCAST case and other cases when we want to
17569 * use the link-layer broadcast address for multicast.
17570 */
17571 uint8_t *bphys_addr;
17572 dl_unitdata_req_t *dlur;
17573
17574 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
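	/*
	 * A negative ill_sap_length means the SAP follows the physical
	 * address in the DLPI address, so the address starts right at
	 * dl_dest_addr_offset; otherwise skip past the leading SAP.
	 */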
17575 if (ill->ill_sap_length < 0) {
17576 bphys_addr = (uchar_t *)dlur +
17577 dlur->dl_dest_addr_offset;
17578 } else {
17579 bphys_addr = (uchar_t *)dlur +
17580 dlur->dl_dest_addr_offset + ill->ill_sap_length;
17581 }
17582
17583 bcopy(bphys_addr, m_physaddr, ill->ill_phys_addr_length);
17584 }
17585
17586 /*
17587 * Derive IPoIB interface id from the link layer address.
17588 */
17589 static void
17590 ip_ib_v6intfid(ill_t *ill, in6_addr_t *v6addr)
17591 {
17592 char *addr;
17593
17594 ASSERT(ill->ill_phys_addr_length == 20);
17595 addr = (char *)&v6addr->s6_addr32[2];
17596 bcopy(ill->ill_phys_addr + 12, addr, 8);
17597 /*
17598 * In IBA 1.1 timeframe, some vendors erroneously set the u/l bit
17599 * in the globally assigned EUI-64 GUID to 1, in violation of IEEE
17600 * rules. In these cases, the IBA considers these GUIDs to be in
17601 * "Modified EUI-64" format, and thus toggling the u/l bit is not
17602 * required; vendors are required not to assign global EUI-64's
17603 * that differ only in u/l bit values, thus guaranteeing uniqueness
17604 * of the interface identifier. Whether the GUID is in modified
17605 * or proper EUI-64 format, the ipv6 identifier must have the u/l
17606 * bit set to 1.
17607 */
17608 addr[0] |= 2; /* Set Universal/Local bit to 1 */
17609 }
17610
17611 /*
17612 * Map the multicast ipaddr_t in m_ipaddr to the physaddr for InfiniBand.
17613 * Note on mapping from multicast IP addresses to IPoIB multicast link
17614 * addresses. IPoIB multicast link addresses are based on IBA link addresses.
17615 * The format of an IPoIB multicast address is:
17616 *
17617 * 4 byte QPN Scope Sign. Pkey
17618 * +--------------------------------------------+
17619 * | 00FFFFFF | FF | 1X | X01B | Pkey | GroupID |
17620 * +--------------------------------------------+
17621 *
17622 * The Scope and Pkey components are properties of the IBA port and
17623 * network interface. They can be ascertained from the broadcast address.
17624 * The Sign. part is the signature, and is 401B for IPv4 and 601B for IPv6.
17625 */
17626 static void
17627 ip_ib_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr)
17628 {
17629 static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
17630 0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
17631 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
17632 uint8_t *bphys_addr;
17633 dl_unitdata_req_t *dlur;
17634
17635 bcopy(ipv4_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length);
17636
17637 /*
	 * RFC 4391: the IPv4 MGID is 28 bits long, taken from the
	 * low-order bits of the IPv4 group address.
17639 */
17640 m_physaddr[16] = m_ipaddr[0] & 0x0f;
17641 m_physaddr[17] = m_ipaddr[1];
17642 m_physaddr[18] = m_ipaddr[2];
17643 m_physaddr[19] = m_ipaddr[3];
17644
17646 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
17647 if (ill->ill_sap_length < 0) {
17648 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset;
17649 } else {
17650 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset +
17651 ill->ill_sap_length;
17652 }
17653 /*
17654 * Now fill in the IBA scope/Pkey values from the broadcast address.
17655 */
17656 m_physaddr[5] = bphys_addr[5];
17657 m_physaddr[8] = bphys_addr[8];
17658 m_physaddr[9] = bphys_addr[9];
17659 }
17660
17661 static void
17662 ip_ib_v6_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr)
17663 {
	static uint8_t ipv6_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
	    0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00,
	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
	uint8_t *bphys_addr;
	dl_unitdata_req_t *dlur;

	bcopy(ipv6_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length);

	/*
	 * RFC 4391: the IPv6 MGID is 80 bits long, taken from the
	 * low-order bytes of the IPv6 group address.
	 */
	bcopy(&m_ipaddr[6], &m_physaddr[10], 10);
17676
17677 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
17678 if (ill->ill_sap_length < 0) {
17679 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset;
17680 } else {
17681 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset +
17682 ill->ill_sap_length;
17683 }
17684 /*
17685 * Now fill in the IBA scope/Pkey values from the broadcast address.
17686 */
17687 m_physaddr[5] = bphys_addr[5];
17688 m_physaddr[8] = bphys_addr[8];
17689 m_physaddr[9] = bphys_addr[9];
17690 }
17691
17692 /*
17693 * Derive IPv6 interface id from an IPv4 link-layer address (e.g. from an IPv4
 * tunnel). The IPv4 address simply gets placed in the lower 4 bytes of the
17695 * IPv6 interface id. This is a suggested mechanism described in section 3.7
17696 * of RFC4213.
17697 */
17698 static void
17699 ip_ipv4_genv6intfid(ill_t *ill, uint8_t *physaddr, in6_addr_t *v6addr)
17700 {
17701 ASSERT(ill->ill_phys_addr_length == sizeof (ipaddr_t));
17702 v6addr->s6_addr32[2] = 0;
17703 bcopy(physaddr, &v6addr->s6_addr32[3], sizeof (ipaddr_t));
17704 }
17705
17706 /*
17707 * Derive IPv6 interface id from an IPv6 link-layer address (e.g. from an IPv6
17708 * tunnel). The lower 8 bytes of the IPv6 address simply become the interface
17709 * id.
17710 */
17711 static void
17712 ip_ipv6_genv6intfid(ill_t *ill, uint8_t *physaddr, in6_addr_t *v6addr)
17713 {
17714 in6_addr_t *v6lladdr = (in6_addr_t *)physaddr;
17715
17716 ASSERT(ill->ill_phys_addr_length == sizeof (in6_addr_t));
17717 bcopy(&v6lladdr->s6_addr32[2], &v6addr->s6_addr32[2], 8);
17718 }
17719
17720 static void
17721 ip_ipv6_v6intfid(ill_t *ill, in6_addr_t *v6addr)
17722 {
17723 ip_ipv6_genv6intfid(ill, ill->ill_phys_addr, v6addr);
17724 }
17725
17726 static void
17727 ip_ipv6_v6destintfid(ill_t *ill, in6_addr_t *v6addr)
17728 {
17729 ip_ipv6_genv6intfid(ill, ill->ill_dest_addr, v6addr);
17730 }
17731
17732 static void
17733 ip_ipv4_v6intfid(ill_t *ill, in6_addr_t *v6addr)
17734 {
17735 ip_ipv4_genv6intfid(ill, ill->ill_phys_addr, v6addr);
17736 }
17737
17738 static void
17739 ip_ipv4_v6destintfid(ill_t *ill, in6_addr_t *v6addr)
17740 {
17741 ip_ipv4_genv6intfid(ill, ill->ill_dest_addr, v6addr);
17742 }
17743
17744 /*
 * Look up an ill and verify that the zoneid has an ipif on that ill.
 * Returns a held ill, or NULL.
17747 */
17748 ill_t *
17749 ill_lookup_on_ifindex_zoneid(uint_t index, zoneid_t zoneid, boolean_t isv6,
17750 ip_stack_t *ipst)
17751 {
17752 ill_t *ill;
17753 ipif_t *ipif;
17754
17755 ill = ill_lookup_on_ifindex(index, isv6, ipst);
17756 if (ill == NULL)
17757 return (NULL);
17758
17759 mutex_enter(&ill->ill_lock);
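	/* Find at least one usable ipif on this ill visible to `zoneid'. */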
17760 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
17761 if (IPIF_IS_CONDEMNED(ipif))
17762 continue;
17763 if (zoneid != ALL_ZONES && ipif->ipif_zoneid != zoneid &&
17764 ipif->ipif_zoneid != ALL_ZONES)
17765 continue;
17766
17767 mutex_exit(&ill->ill_lock);
17768 return (ill);
17769 }
17770 mutex_exit(&ill->ill_lock);
17771 ill_refrele(ill);
17772 return (NULL);
17773 }
17774
17775 /*
 * Return a pointer to an ipif_t given a combination of (ill_idx, ipif_id).
 * If a pointer to an ipif_t is returned, the caller is responsible for
 * dropping the hold via ipif_refrele(); the internal hold on the ill is
 * released before returning.
17779 */
17780 ipif_t *
17781 ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6,
17782 ip_stack_t *ipst)
17783 {
17784 ipif_t *ipif;
17785 ill_t *ill;
17786
17787 ill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
17788 if (ill == NULL)
17789 return (NULL);
17790
17791 mutex_enter(&ill->ill_lock);
17792 if (ill->ill_state_flags & ILL_CONDEMNED) {
17793 mutex_exit(&ill->ill_lock);
17794 ill_refrele(ill);
17795 return (NULL);
17796 }
17797
17798 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
17799 if (!IPIF_CAN_LOOKUP(ipif))
17800 continue;
17801 if (lifidx == ipif->ipif_id) {
17802 ipif_refhold_locked(ipif);
17803 break;
17804 }
17805 }
17806
17807 mutex_exit(&ill->ill_lock);
17808 ill_refrele(ill);
17809 return (ipif);
17810 }
17811
17812 /*
 * Set ill_inputfn based on the currently known state.
17814 * This needs to be called when any of the factors taken into
17815 * account changes.
17816 */
17817 void
17818 ill_set_inputfn(ill_t *ill)
17819 {
17820 ip_stack_t *ipst = ill->ill_ipst;
17821
17822 if (ill->ill_isv6) {
17823 if (is_system_labeled())
17824 ill->ill_inputfn = ill_input_full_v6;
17825 else
17826 ill->ill_inputfn = ill_input_short_v6;
17827 } else {
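		/*
		 * IPv4 needs the "full" input path whenever a feature that
		 * must inspect every packet is enabled: MAC labeling,
		 * pending DHCP (ill_dhcpinit), RSVP listeners, or CGTP
		 * filtering.
		 */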
17828 if (is_system_labeled())
17829 ill->ill_inputfn = ill_input_full_v4;
17830 else if (ill->ill_dhcpinit != 0)
17831 ill->ill_inputfn = ill_input_full_v4;
17832 else if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head
17833 != NULL)
17834 ill->ill_inputfn = ill_input_full_v4;
17835 else if (ipst->ips_ip_cgtp_filter &&
17836 ipst->ips_ip_cgtp_filter_ops != NULL)
17837 ill->ill_inputfn = ill_input_full_v4;
17838 else
17839 ill->ill_inputfn = ill_input_short_v4;
17840 }
17841 }
17842
17843 /*
17844 * Re-evaluate ill_inputfn for all the IPv4 ills.
 * Used when RSVP and CGTP come and go.
17846 */
17847 void
17848 ill_set_inputfn_all(ip_stack_t *ipst)
17849 {
17850 ill_walk_context_t ctx;
17851 ill_t *ill;
17852
17853 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
17854 ill = ILL_START_WALK_V4(&ctx, ipst);
17855 for (; ill != NULL; ill = ill_next(&ctx, ill))
17856 ill_set_inputfn(ill);
17857
17858 rw_exit(&ipst->ips_ill_g_lock);
17859 }
17860
17861 /*
17862 * Set the physical address information for `ill' to the contents of the
17863 * dl_notify_ind_t pointed to by `mp'. Must be called as writer, and will be
17864 * asynchronous if `ill' cannot immediately be quiesced -- in which case
17865 * EINPROGRESS will be returned.
17866 */
17867 int
17868 ill_set_phys_addr(ill_t *ill, mblk_t *mp)
17869 {
17870 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
17871 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)mp->b_rptr;
17872
17873 ASSERT(IAM_WRITER_IPSQ(ipsq));
17874
17875 if (dlindp->dl_data != DL_IPV6_LINK_LAYER_ADDR &&
17876 dlindp->dl_data != DL_CURR_DEST_ADDR &&
17877 dlindp->dl_data != DL_CURR_PHYS_ADDR) {
17878 /* Changing DL_IPV6_TOKEN is not yet supported */
17879 return (0);
17880 }
17881
17882 /*
17883 * We need to store up to two copies of `mp' in `ill'. Due to the
17884 * design of ipsq_pending_mp_add(), we can't pass them as separate
17885 * arguments to ill_set_phys_addr_tail(). Instead, chain them
17886 * together here, then pull 'em apart in ill_set_phys_addr_tail().
17887 */
17888 if ((mp = copyb(mp)) == NULL || (mp->b_cont = copyb(mp)) == NULL) {
17889 freemsg(mp);
17890 return (ENOMEM);
17891 }
17892
17893 ipsq_current_start(ipsq, ill->ill_ipif, 0);
17894
17895 /*
17896 * Since we'll only do a logical down, we can't rely on ipif_down
17897 * to turn on ILL_DOWN_IN_PROGRESS, or for the DL_BIND_ACK to reset
17898 * ILL_DOWN_IN_PROGRESS. We instead manage this separately for this
17899 * case, to quiesce ire's and nce's for ill_is_quiescent.
17900 */
17901 mutex_enter(&ill->ill_lock);
17902 ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS;
17903 /* no more ire/nce addition allowed */
17904 mutex_exit(&ill->ill_lock);
17905
17906 /*
17907 * If we can quiesce the ill, then set the address. If not, then
17908 * ill_set_phys_addr_tail() will be called from ipif_ill_refrele_tail().
17909 */
17910 ill_down_ipifs(ill, B_TRUE);
17911 mutex_enter(&ill->ill_lock);
17912 if (!ill_is_quiescent(ill)) {
17913 /* call cannot fail since `conn_t *' argument is NULL */
17914 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
17915 mp, ILL_DOWN);
17916 mutex_exit(&ill->ill_lock);
17917 return (EINPROGRESS);
17918 }
17919 mutex_exit(&ill->ill_lock);
17920
17921 ill_set_phys_addr_tail(ipsq, ill->ill_rq, mp, NULL);
17922 return (0);
17923 }
17924
17925 /*
17926 * When the allowed-ips link property is set on the datalink, IP receives a
17927 * DL_NOTE_ALLOWED_IPS notification that is processed in ill_set_allowed_ips()
17928 * to initialize the ill_allowed_ips[] array in the ill_t. This array is then
17929 * used to vet addresses passed to ip_sioctl_addr() and to ensure that the
17930 * only IP addresses configured on the ill_t are those in the ill_allowed_ips[]
17931 * array.
17932 */
17933 void
17934 ill_set_allowed_ips(ill_t *ill, mblk_t *mp)
17935 {
17936 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
17937 dl_notify_ind_t *dlip = (dl_notify_ind_t *)mp->b_rptr;
17938 mac_protect_t *mrp;
17939 int i;
17940
17941 ASSERT(IAM_WRITER_IPSQ(ipsq));
17942 mrp = (mac_protect_t *)&dlip[1];
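	/*
	 * The mac_protect_t payload immediately follows the
	 * dl_notify_ind_t in the notification mblk.
	 */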
17943
17944 if (mrp->mp_ipaddrcnt == 0) { /* reset allowed-ips */
17945 kmem_free(ill->ill_allowed_ips,
17946 ill->ill_allowed_ips_cnt * sizeof (in6_addr_t));
17947 ill->ill_allowed_ips_cnt = 0;
17948 ill->ill_allowed_ips = NULL;
17949 mutex_enter(&ill->ill_phyint->phyint_lock);
17950 ill->ill_phyint->phyint_flags &= ~PHYI_L3PROTECT;
17951 mutex_exit(&ill->ill_phyint->phyint_lock);
17952 return;
17953 }
17954
17955 if (ill->ill_allowed_ips != NULL) {
17956 kmem_free(ill->ill_allowed_ips,
17957 ill->ill_allowed_ips_cnt * sizeof (in6_addr_t));
17958 }
17959 ill->ill_allowed_ips_cnt = mrp->mp_ipaddrcnt;
17960 ill->ill_allowed_ips = kmem_alloc(
17961 ill->ill_allowed_ips_cnt * sizeof (in6_addr_t), KM_SLEEP);
17962 for (i = 0; i < mrp->mp_ipaddrcnt; i++)
17963 ill->ill_allowed_ips[i] = mrp->mp_ipaddrs[i].ip_addr;
17964
17965 mutex_enter(&ill->ill_phyint->phyint_lock);
17966 ill->ill_phyint->phyint_flags |= PHYI_L3PROTECT;
17967 mutex_exit(&ill->ill_phyint->phyint_lock);
17968 }
17969
17970 /*
17971 * Once the ill associated with `q' has quiesced, set its physical address
17972 * information to the values in `addrmp'. Note that two copies of `addrmp'
17973 * are passed (linked by b_cont), since we sometimes need to save two distinct
17974 * copies in the ill_t, and our context doesn't permit sleeping or allocation
17975 * failure (we'll free the other copy if it's not needed). Since the ill_t
17976 * is quiesced, we know any stale nce's with the old address information have
17977 * already been removed, so we don't need to call nce_flush().
17978 */
17979 /* ARGSUSED */
17980 static void
17981 ill_set_phys_addr_tail(ipsq_t *ipsq, queue_t *q, mblk_t *addrmp, void *dummy)
17982 {
17983 ill_t *ill = q->q_ptr;
17984 mblk_t *addrmp2 = unlinkb(addrmp);
17985 dl_notify_ind_t *dlindp = (dl_notify_ind_t *)addrmp->b_rptr;
17986 uint_t addrlen, addroff;
17987 int status;
17988
17989 ASSERT(IAM_WRITER_IPSQ(ipsq));
17990
17991 addroff = dlindp->dl_addr_offset;
17992 addrlen = dlindp->dl_addr_length - ABS(ill->ill_sap_length);
17993
17994 switch (dlindp->dl_data) {
17995 case DL_IPV6_LINK_LAYER_ADDR:
17996 ill_set_ndmp(ill, addrmp, addroff, addrlen);
17997 freemsg(addrmp2);
17998 break;
17999
18000 case DL_CURR_DEST_ADDR:
18001 freemsg(ill->ill_dest_addr_mp);
18002 ill->ill_dest_addr = addrmp->b_rptr + addroff;
18003 ill->ill_dest_addr_mp = addrmp;
18004 if (ill->ill_isv6) {
18005 ill_setdesttoken(ill);
18006 ipif_setdestlinklocal(ill->ill_ipif);
18007 }
18008 freemsg(addrmp2);
18009 break;
18010
18011 case DL_CURR_PHYS_ADDR:
18012 freemsg(ill->ill_phys_addr_mp);
18013 ill->ill_phys_addr = addrmp->b_rptr + addroff;
18014 ill->ill_phys_addr_mp = addrmp;
18015 ill->ill_phys_addr_length = addrlen;
18016 if (ill->ill_isv6)
18017 ill_set_ndmp(ill, addrmp2, addroff, addrlen);
18018 else
18019 freemsg(addrmp2);
18020 if (ill->ill_isv6) {
18021 ill_setdefaulttoken(ill);
18022 ipif_setlinklocal(ill->ill_ipif);
18023 }
18024 break;
18025 default:
18026 ASSERT(0);
18027 }
18028
18029 /*
18030 * reset ILL_DOWN_IN_PROGRESS so that we can successfully add ires
18031 * as we bring the ipifs up again.
18032 */
18033 mutex_enter(&ill->ill_lock);
18034 ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS;
18035 mutex_exit(&ill->ill_lock);
18036 /*
18037 * If there are ipifs to bring up, ill_up_ipifs() will return
18038 * EINPROGRESS, and ipsq_current_finish() will be called by
18039 * ip_rput_dlpi_writer() or arp_bringup_done() when the last ipif is
18040 * brought up.
18041 */
18042 status = ill_up_ipifs(ill, q, addrmp);
18043 if (status != EINPROGRESS)
18044 ipsq_current_finish(ipsq);
18045 }
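
/*
 * Illustrative sketch (an assumption about the caller, not code from this
 * file): the dual-copy `addrmp' argument consumed above can be built up
 * front with copyb(9F) and linkb(9F), while allocation failure can still
 * be reported cleanly:
 *
 *	mblk_t *addrmp2;
 *
 *	if ((addrmp2 = copyb(addrmp)) == NULL)
 *		return (ENOMEM);
 *	linkb(addrmp, addrmp2);		(chains the spare copy via b_cont)
 */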
18046
18047 /*
18048 * Helper routine for setting the ill_nd_lla fields.
18049 */
18050 void
18051 ill_set_ndmp(ill_t *ill, mblk_t *ndmp, uint_t addroff, uint_t addrlen)
18052 {
18053 freemsg(ill->ill_nd_lla_mp);
18054 ill->ill_nd_lla = ndmp->b_rptr + addroff;
18055 ill->ill_nd_lla_mp = ndmp;
18056 ill->ill_nd_lla_len = addrlen;
18057 }
18058
18059 /*
18060 * Replumb the ill.
18061 */
18062 int
18063 ill_replumb(ill_t *ill, mblk_t *mp)
18064 {
18065 ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
18066
18067 ASSERT(IAM_WRITER_IPSQ(ipsq));
18068
18069 ipsq_current_start(ipsq, ill->ill_ipif, 0);
18070
18071 /*
18072 * If we can quiesce the ill, then continue. If not, then
18073 * ill_replumb_tail() will be called from ipif_ill_refrele_tail().
18074 */
18075 ill_down_ipifs(ill, B_FALSE);
18076
18077 mutex_enter(&ill->ill_lock);
18078 if (!ill_is_quiescent(ill)) {
18079 /* call cannot fail since `conn_t *' argument is NULL */
18080 (void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
18081 mp, ILL_DOWN);
18082 mutex_exit(&ill->ill_lock);
18083 return (EINPROGRESS);
18084 }
18085 mutex_exit(&ill->ill_lock);
18086
18087 ill_replumb_tail(ipsq, ill->ill_rq, mp, NULL);
18088 return (0);
18089 }
18090
18091 /* ARGSUSED */
18092 static void
18093 ill_replumb_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy)
18094 {
18095 ill_t *ill = q->q_ptr;
18096 int err;
18097 conn_t *connp = NULL;
18098
18099 ASSERT(IAM_WRITER_IPSQ(ipsq));
18100 freemsg(ill->ill_replumb_mp);
18101 ill->ill_replumb_mp = copyb(mp);
18102
18103 if (ill->ill_replumb_mp == NULL) {
18104 /* out of memory */
18105 ipsq_current_finish(ipsq);
18106 return;
18107 }
18108
18109 mutex_enter(&ill->ill_lock);
18110 ill->ill_up_ipifs = ipsq_pending_mp_add(NULL, ill->ill_ipif,
18111 ill->ill_rq, ill->ill_replumb_mp, 0);
18112 mutex_exit(&ill->ill_lock);
18113
18114 if (!ill->ill_up_ipifs) {
18115 /* already closing */
18116 ipsq_current_finish(ipsq);
18117 return;
18118 }
18119 ill->ill_replumbing = 1;
18120 err = ill_down_ipifs_tail(ill);
18121
18122 /*
18123 * We have successfully quiesced and brought down the interface; now we
18124 * send the DL_NOTE_REPLUMB_DONE message down to the driver, reusing the
18125 * DL_NOTE_REPLUMB message's mblk.
18126 */
18127 mp = mexchange(NULL, mp, sizeof (dl_notify_conf_t), M_PROTO,
18128 DL_NOTIFY_CONF);
18129 ASSERT(mp != NULL);
18130 ((dl_notify_conf_t *)mp->b_rptr)->dl_notification =
18131 DL_NOTE_REPLUMB_DONE;
18132 ill_dlpi_send(ill, mp);
18133
18134 /*
18135 * For IPv4, we would usually get EINPROGRESS because the ETHERTYPE_ARP
18136 * streams have to be unbound. When all the DLPI exchanges are done,
18137 * ipsq_current_finish() will be called by arp_bringup_done(). The
18138 * remainder of ipif bringup via ill_up_ipifs() will also be done in
18139 * arp_bringup_done().
18140 */
18141 ASSERT(ill->ill_replumb_mp != NULL);
18142 if (err == EINPROGRESS)
18143 return;
18144 else
18145 ill->ill_replumb_mp = ipsq_pending_mp_get(ipsq, &connp);
18146 ASSERT(connp == NULL);
18147 if (err == 0 && ill->ill_replumb_mp != NULL &&
18148 ill_up_ipifs(ill, q, ill->ill_replumb_mp) == EINPROGRESS) {
18149 return;
18150 }
18151 ipsq_current_finish(ipsq);
18152 }
18153
18154 /*
18155 * Issue ioctl `cmd' on `lh'; caller provides the initial payload in `buf'
18156 * which is `bufsize' bytes. On success, zero is returned and `buf' is
18157 * updated as per the ioctl. On failure, an errno is returned.
18158 */
18159 static int
18160 ip_ioctl(ldi_handle_t lh, int cmd, void *buf, uint_t bufsize, cred_t *cr)
18161 {
18162 int rval;
18163 struct strioctl iocb;
18164
18165 iocb.ic_cmd = cmd;
18166 iocb.ic_timout = 15;
18167 iocb.ic_len = bufsize;
18168 iocb.ic_dp = buf;
18169
18170 return (ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, cr, &rval));
18171 }
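
/*
 * Example usage (a hedged sketch, not code from this file): fetching the
 * flags of an interface over an already-opened layered handle `lh', with
 * `cr' and `err' as in the callers below:
 *
 *	struct lifreq lifr;
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, "lo0", sizeof (lifr.lifr_name));
 *	err = ip_ioctl(lh, SIOCGLIFFLAGS, &lifr, sizeof (lifr), cr);
 *
 * Note that FKIOCTL in ip_ioctl() marks the payload as residing in kernel
 * address space, so no copyin/copyout of `buf' is performed.
 */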
18172
18173 /*
18174 * Issue an SIOCGLIFCONF for address family `af'. On success, `lifcp->lifc_buf'
18175 * points to a kmem_alloc'd buffer of `*bufsizep' bytes that the caller must free.
18176 */
18177 static int
18178 ip_lifconf_ioctl(ldi_handle_t lh, int af, struct lifconf *lifcp,
18179 uint_t *bufsizep, cred_t *cr)
18180 {
18181 int err;
18182 struct lifnum lifn;
18183
18184 bzero(&lifn, sizeof (lifn));
18185 lifn.lifn_family = af;
18186 lifn.lifn_flags = LIFC_UNDER_IPMP;
18187
18188 if ((err = ip_ioctl(lh, SIOCGLIFNUM, &lifn, sizeof (lifn), cr)) != 0)
18189 return (err);
18190
18191 /*
18192 * Pad the interface count to account for additional interfaces that
18193 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
18194 */
18195 lifn.lifn_count += 4;
18196 bzero(lifcp, sizeof (*lifcp));
18197 lifcp->lifc_flags = LIFC_UNDER_IPMP;
18198 lifcp->lifc_family = af;
18199 lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
18200 lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);
18201
18202 err = ip_ioctl(lh, SIOCGLIFCONF, lifcp, sizeof (*lifcp), cr);
18203 if (err != 0) {
18204 kmem_free(lifcp->lifc_buf, *bufsizep);
18205 return (err);
18206 }
18207
18208 return (0);
18209 }
18210
18211 /*
18212 * Helper for ip_interface_cleanup() that removes the loopback interface.
18213 */
18214 static void
18215 ip_loopback_removeif(ldi_handle_t lh, boolean_t isv6, cred_t *cr)
18216 {
18217 int err;
18218 struct lifreq lifr;
18219
18220 bzero(&lifr, sizeof (lifr));
18221 (void) strcpy(lifr.lifr_name, ipif_loopback_name);
18222
18223 /*
18224 * Attempt to remove the interface. It may legitimately not exist
18225 * (e.g. the zone administrator unplumbed it), so ignore ENXIO.
18226 */
18227 err = ip_ioctl(lh, SIOCLIFREMOVEIF, &lifr, sizeof (lifr), cr);
18228 if (err != 0 && err != ENXIO) {
18229 ip0dbg(("ip_loopback_removeif: IP%s SIOCLIFREMOVEIF failed: "
18230 "error %d\n", isv6 ? "v6" : "v4", err));
18231 }
18232 }
18233
18234 /*
18235 * Helper for ip_interface_cleanup() that ensures no IP interfaces are in IPMP
18236 * groups and that IPMP data addresses are down. These conditions must be met
18237 * so that IPMP interfaces can be I_PUNLINK'd, as per ip_sioctl_plink_ipmp().
18238 */
18239 static void
18240 ip_ipmp_cleanup(ldi_handle_t lh, boolean_t isv6, cred_t *cr)
18241 {
18242 int af = isv6 ? AF_INET6 : AF_INET;
18243 int i, nifs;
18244 int err;
18245 uint_t bufsize;
18246 uint_t lifrsize = sizeof (struct lifreq);
18247 struct lifconf lifc;
18248 struct lifreq *lifrp;
18249
18250 if ((err = ip_lifconf_ioctl(lh, af, &lifc, &bufsize, cr)) != 0) {
18251 cmn_err(CE_WARN, "ip_ipmp_cleanup: cannot get interface list "
18252 "(error %d); any IPMP interfaces cannot be shutdown", err);
18253 return;
18254 }
18255
18256 nifs = lifc.lifc_len / lifrsize;
18257 for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
18258 err = ip_ioctl(lh, SIOCGLIFFLAGS, lifrp, lifrsize, cr);
18259 if (err != 0) {
18260 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot get "
18261 "flags: error %d", lifrp->lifr_name, err);
18262 continue;
18263 }
18264
18265 if (lifrp->lifr_flags & IFF_IPMP) {
18266 if ((lifrp->lifr_flags & (IFF_UP|IFF_DUPLICATE)) == 0)
18267 continue;
18268
18269 lifrp->lifr_flags &= ~IFF_UP;
18270 err = ip_ioctl(lh, SIOCSLIFFLAGS, lifrp, lifrsize, cr);
18271 if (err != 0) {
18272 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot "
18273 "bring down (error %d); IPMP interface may "
18274 "not be shutdown", lifrp->lifr_name, err);
18275 }
18276
18277 /*
18278 * Check if IFF_DUPLICATE is still set -- and if so,
18279 * reset the address to clear it.
18280 */
18281 err = ip_ioctl(lh, SIOCGLIFFLAGS, lifrp, lifrsize, cr);
18282 if (err != 0 || !(lifrp->lifr_flags & IFF_DUPLICATE))
18283 continue;
18284
18285 err = ip_ioctl(lh, SIOCGLIFADDR, lifrp, lifrsize, cr);
18286 if (err != 0 || (err = ip_ioctl(lh, SIOCSLIFADDR,
18287 lifrp, lifrsize, cr)) != 0) {
18288 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot "
18289 "reset DAD (error %d); IPMP interface may "
18290 "not be shutdown", lifrp->lifr_name, err);
18291 }
18292 continue;
18293 }
18294
18295 if (strchr(lifrp->lifr_name, IPIF_SEPARATOR_CHAR) == NULL) {
18296 lifrp->lifr_groupname[0] = '\0';
18297 if ((err = ip_ioctl(lh, SIOCSLIFGROUPNAME, lifrp,
18298 lifrsize, cr)) != 0) {
18299 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot "
18300 "leave IPMP group (error %d); associated "
18301 "IPMP interface may not be shutdown",
18302 lifrp->lifr_name, err);
18303 continue;
18304 }
18305 }
18306 }
18307
18308 kmem_free(lifc.lifc_buf, bufsize);
18309 }
18310
18311 #define UDPDEV "/devices/pseudo/udp@0:udp"
18312 #define UDP6DEV "/devices/pseudo/udp6@0:udp6"
18313
18314 /*
18315 * Remove the loopback interfaces and prep the IPMP interfaces to be torn down.
18316 * Non-loopback interfaces are either I_LINK'd or I_PLINK'd; the former go away
18317 * when the user-level processes in the zone are killed and the latter are
18318 * cleaned up by str_stack_shutdown().
18319 */
18320 void
18321 ip_interface_cleanup(ip_stack_t *ipst)
18322 {
18323 ldi_handle_t lh;
18324 ldi_ident_t li;
18325 cred_t *cr;
18326 int err;
18327 int i;
18328 char *devs[] = { UDP6DEV, UDPDEV };
18329 netstackid_t stackid = ipst->ips_netstack->netstack_stackid;
18330
18331 if ((err = ldi_ident_from_major(ddi_name_to_major("ip"), &li)) != 0) {
18332 cmn_err(CE_WARN, "ip_interface_cleanup: cannot get ldi ident:"
18333 " error %d", err);
18334 return;
18335 }
18336
18337 cr = zone_get_kcred(netstackid_to_zoneid(stackid));
18338 ASSERT(cr != NULL);
18339
18340 /*
18341 * NOTE: loop executes exactly twice and is hardcoded to know that the
18342 * first iteration is IPv6. (Unrolling yields repetitious code, hence
18343 * the loop.)
18344 */
18345 for (i = 0; i < 2; i++) {
18346 err = ldi_open_by_name(devs[i], FREAD|FWRITE, cr, &lh, li);
18347 if (err != 0) {
18348 cmn_err(CE_WARN, "ip_interface_cleanup: cannot open %s:"
18349 " error %d", devs[i], err);
18350 continue;
18351 }
18352
18353 ip_loopback_removeif(lh, i == 0, cr);
18354 ip_ipmp_cleanup(lh, i == 0, cr);
18355
18356 (void) ldi_close(lh, FREAD|FWRITE, cr);
18357 }
18358
18359 ldi_ident_release(li);
18360 crfree(cr);
18361 }
18362
18363 /*
18364 * This needs to be kept in sync with the nic_event_t definition.
18365 */
18366 static const char *
18367 ill_hook_event2str(nic_event_t event)
18368 {
18369 switch (event) {
18370 case NE_PLUMB:
18371 return ("PLUMB");
18372 case NE_UNPLUMB:
18373 return ("UNPLUMB");
18374 case NE_UP:
18375 return ("UP");
18376 case NE_DOWN:
18377 return ("DOWN");
18378 case NE_ADDRESS_CHANGE:
18379 return ("ADDRESS_CHANGE");
18380 case NE_LIF_UP:
18381 return ("LIF_UP");
18382 case NE_LIF_DOWN:
18383 return ("LIF_DOWN");
18384 case NE_IFINDEX_CHANGE:
18385 return ("IFINDEX_CHANGE");
18386 default:
18387 return ("UNKNOWN");
18388 }
18389 }
18390
18391 void
18392 ill_nic_event_dispatch(ill_t *ill, lif_if_t lif, nic_event_t event,
18393 nic_event_data_t data, size_t datalen)
18394 {
18395 ip_stack_t *ipst = ill->ill_ipst;
18396 hook_nic_event_int_t *info;
18397 const char *str = NULL;
18398
18399 /* create a new nic event info */
18400 if ((info = kmem_alloc(sizeof (*info), KM_NOSLEEP)) == NULL)
18401 goto fail;
18402
18403 info->hnei_event.hne_nic = ill->ill_phyint->phyint_ifindex;
18404 info->hnei_event.hne_lif = lif;
18405 info->hnei_event.hne_event = event;
18406 info->hnei_event.hne_protocol = ill->ill_isv6 ?
18407 ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data;
18408 info->hnei_event.hne_data = NULL;
18409 info->hnei_event.hne_datalen = 0;
18410 info->hnei_stackid = ipst->ips_netstack->netstack_stackid;
18411
18412 if (data != NULL && datalen != 0) {
18413 info->hnei_event.hne_data = kmem_alloc(datalen, KM_NOSLEEP);
18414 if (info->hnei_event.hne_data == NULL)
18415 goto fail;
18416 bcopy(data, info->hnei_event.hne_data, datalen);
18417 info->hnei_event.hne_datalen = datalen;
18418 }
18419
18420 if (ddi_taskq_dispatch(eventq_queue_nic, ip_ne_queue_func, info,
18421 DDI_NOSLEEP) == DDI_SUCCESS)
18422 return;
18423
18424 fail:
18425 if (info != NULL) {
18426 if (info->hnei_event.hne_data != NULL) {
18427 kmem_free(info->hnei_event.hne_data,
18428 info->hnei_event.hne_datalen);
18429 }
18430 kmem_free(info, sizeof (*info));
18431 }
18432 str = ill_hook_event2str(event);
18433 ip2dbg(("ill_nic_event_dispatch: could not dispatch %s nic event "
18434 "information for %s (ENOMEM)\n", str, ill->ill_name));
18435 }
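
/*
 * Example (a sketch; the exact payload varies by caller): the
 * address-change paths dispatch the new address as the event data, e.g.:
 *
 *	ill_nic_event_dispatch(ipif->ipif_ill, MAP_IPIF_ID(ipif->ipif_id),
 *	    NE_ADDRESS_CHANGE, sin, sizeof (*sin));
 *
 * where `sin' is a hypothetical sockaddr holding the new address. The
 * data is copied into the event, so the caller's buffer need not outlive
 * the call.
 */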
18436
18437 static int
18438 ipif_arp_up_done_tail(ipif_t *ipif, enum ip_resolver_action res_act)
18439 {
18440 int err = 0;
18441 const in_addr_t *addr = NULL;
18442 nce_t *nce = NULL;
18443 ill_t *ill = ipif->ipif_ill;
18444 ill_t *bound_ill;
18445 boolean_t added_ipif = B_FALSE;
18446 uint16_t state;
18447 uint16_t flags;
18448
18449 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up_done_tail",
18450 ill_t *, ill, ipif_t *, ipif);
18451 if (ipif->ipif_lcl_addr != INADDR_ANY) {
18452 addr = &ipif->ipif_lcl_addr;
18453 }
18454
18455 if ((ipif->ipif_flags & IPIF_UNNUMBERED) || addr == NULL) {
18456 if (res_act != Res_act_initial)
18457 return (EINVAL);
18458 }
18459
18460 if (addr != NULL) {
18461 ipmp_illgrp_t *illg = ill->ill_grp;
18462
18463 /* add unicast nce for the local addr */
18464
18465 if (IS_IPMP(ill)) {
18466 /*
18467 * If we're here via ipif_up(), then the ipif
18468 * won't be bound yet -- add it to the group,
18469 * which will bind it if possible. (We would
18470 * add it in ipif_up(), but deleting on failure
18471 * there is gruesome.) If we're here via
18472 * ipmp_ill_bind_ipif(), then the ipif has
18473 * already been added to the group and we
18474 * just need to use the binding.
18475 */
18476 if ((bound_ill = ipmp_ipif_bound_ill(ipif)) == NULL) {
18477 bound_ill = ipmp_illgrp_add_ipif(illg, ipif);
18478 if (bound_ill == NULL) {
18479 /*
18480 * We couldn't bind the ipif to an ill
18481 * yet, so we have nothing to publish.
18482 * Mark the address as ready and return.
18483 */
18484 ipif->ipif_addr_ready = 1;
18485 return (0);
18486 }
18487 added_ipif = B_TRUE;
18488 }
18489 } else {
18490 bound_ill = ill;
18491 }
18492
18493 flags = (NCE_F_MYADDR | NCE_F_PUBLISH | NCE_F_AUTHORITY |
18494 NCE_F_NONUD);
18495 /*
18496 * If this is an initial bring-up (or the ipif was never
18497 * completely brought up), do DAD. Otherwise, we're here
18498 * because IPMP has rebound an address to this ill: send
18499 * unsolicited advertisements (ARP announcements) to
18500 * inform others.
18501 */
18502 if (res_act == Res_act_initial || !ipif->ipif_addr_ready) {
18503 state = ND_UNCHANGED; /* compute in nce_add_common() */
18504 } else {
18505 state = ND_REACHABLE;
18506 flags |= NCE_F_UNSOL_ADV;
18507 }
18508
18509 retry:
18510 err = nce_lookup_then_add_v4(ill,
18511 bound_ill->ill_phys_addr, bound_ill->ill_phys_addr_length,
18512 addr, flags, state, &nce);
18513
18514 /*
18515 * note that we may encounter EEXIST if we are moving
18516 * the nce as a result of a rebind operation.
18517 */
18518 switch (err) {
18519 case 0:
18520 ipif->ipif_added_nce = 1;
18521 nce->nce_ipif_cnt++;
18522 break;
18523 case EEXIST:
18524 ip1dbg(("ipif_arp_up: NCE already exists for %s\n",
18525 ill->ill_name));
18526 if (!NCE_MYADDR(nce->nce_common)) {
18527 /*
18528 * A leftover nce from before this address
18529 * existed
18530 */
18531 ncec_delete(nce->nce_common);
18532 nce_refrele(nce);
18533 nce = NULL;
18534 goto retry;
18535 }
18536 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
18537 nce_refrele(nce);
18538 nce = NULL;
18539 ip1dbg(("ipif_arp_up: NCE already exists "
18540 "for %s:%u\n", ill->ill_name,
18541 ipif->ipif_id));
18542 goto arp_up_done;
18543 }
18544 /*
18545 * Duplicate local addresses are permissible for
18546 * IPIF_POINTOPOINT interfaces which will get marked
18547 * IPIF_UNNUMBERED later in
18548 * ip_addr_availability_check().
18549 *
18550 * The nce_ipif_cnt field tracks the number of
18551 * ipifs that have nce_addr as their local address.
18552 */
18553 ipif->ipif_addr_ready = 1;
18554 ipif->ipif_added_nce = 1;
18555 nce->nce_ipif_cnt++;
18556 err = 0;
18557 break;
18558 default:
18559 ASSERT(nce == NULL);
18560 goto arp_up_done;
18561 }
18562 if (arp_no_defense) {
18563 if ((ipif->ipif_flags & IPIF_UP) &&
18564 !ipif->ipif_addr_ready)
18565 ipif_up_notify(ipif);
18566 ipif->ipif_addr_ready = 1;
18567 }
18568 } else {
18569 /* zero address. nothing to publish */
18570 ipif->ipif_addr_ready = 1;
18571 }
18572 if (nce != NULL)
18573 nce_refrele(nce);
18574 arp_up_done:
18575 if (added_ipif && err != 0)
18576 ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
18577 return (err);
18578 }
18579
18580 int
18581 ipif_arp_up(ipif_t *ipif, enum ip_resolver_action res_act, boolean_t was_dup)
18582 {
18583 int err = 0;
18584 ill_t *ill = ipif->ipif_ill;
18585 boolean_t first_interface, wait_for_dlpi = B_FALSE;
18586
18587 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up",
18588 ill_t *, ill, ipif_t *, ipif);
18589
18590 /*
18591 * We only need to bring up ARP or set up multicast mapping when
18592 * the first interface is coming UP.
18593 */
18594 first_interface = (ill->ill_ipif_up_count == 0 &&
18595 ill->ill_ipif_dup_count == 0 && !was_dup);
18596
18597 if (res_act == Res_act_initial && first_interface) {
18598 /*
18599 * Send ATTACH + BIND
18600 */
18601 err = arp_ll_up(ill);
18602 if (err != EINPROGRESS && err != 0)
18603 return (err);
18604
18605 /*
18606 * Add NCE for local address. Start DAD.
18607 * We'll wait to hear that DAD has finished
18608 * before using the interface.
18609 */
18610 if (err == EINPROGRESS)
18611 wait_for_dlpi = B_TRUE;
18612 }
18613
18614 if (!wait_for_dlpi)
18615 (void) ipif_arp_up_done_tail(ipif, res_act);
18616
18617 return (!wait_for_dlpi ? 0 : EINPROGRESS);
18618 }
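
/*
 * Caller-side pattern (a sketch, following the convention in this file):
 * an EINPROGRESS return means the DLPI exchanges are still outstanding
 * and arp_bringup_done() below will complete the bring-up, so callers
 * simply propagate it:
 *
 *	if ((err = ipif_arp_up(ipif, Res_act_initial, B_FALSE)) != 0)
 *		return (err);	(EINPROGRESS included; finished via ipsq)
 */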
18619
18620 /*
18621 * Finish processing of "arp_up" after all the DLPI message
18622 * exchanges have completed between arp and the driver.
18623 */
18624 void
18625 arp_bringup_done(ill_t *ill, int err)
18626 {
18627 mblk_t *mp1;
18628 ipif_t *ipif;
18629 conn_t *connp = NULL;
18630 ipsq_t *ipsq;
18631 queue_t *q;
18632
18633 ip1dbg(("arp_bringup_done(%s)\n", ill->ill_name));
18634
18635 ASSERT(IAM_WRITER_ILL(ill));
18636
18637 ipsq = ill->ill_phyint->phyint_ipsq;
18638 ipif = ipsq->ipsq_xop->ipx_pending_ipif;
18639 mp1 = ipsq_pending_mp_get(ipsq, &connp);
18640 ASSERT(!((mp1 != NULL) ^ (ipif != NULL)));
18641 if (mp1 == NULL) /* bringup was aborted by the user */
18642 return;
18643
18644 /*
18645 * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we
18646 * must have an associated conn_t. Otherwise, we're bringing this
18647 * interface back up as part of handling an asynchronous event (e.g.,
18648 * physical address change).
18649 */
18650 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18651 ASSERT(connp != NULL);
18652 q = CONNP_TO_WQ(connp);
18653 } else {
18654 ASSERT(connp == NULL);
18655 q = ill->ill_rq;
18656 }
18657 if (err == 0) {
18658 if (ipif->ipif_isv6) {
18659 if ((err = ipif_up_done_v6(ipif)) != 0)
18660 ip0dbg(("arp_bringup_done: init failed\n"));
18661 } else {
18662 err = ipif_arp_up_done_tail(ipif, Res_act_initial);
18663 if (err != 0 ||
18664 (err = ipif_up_done(ipif)) != 0) {
18665 ip0dbg(("arp_bringup_done: "
18666 "init failed err %x\n", err));
18667 (void) ipif_arp_down(ipif);
18668 }
18669
18670 }
18671 } else {
18672 ip0dbg(("arp_bringup_done: DL_BIND_REQ failed\n"));
18673 }
18674
18675 if ((err == 0) && (ill->ill_up_ipifs)) {
18676 err = ill_up_ipifs(ill, q, mp1);
18677 if (err == EINPROGRESS)
18678 return;
18679 }
18680
18681 /*
18682 * If we have a moved ipif to bring up, and everything has succeeded
18683 * to this point, bring it up on the IPMP ill. Otherwise, leave it
18684 * down -- the admin can try to bring it up by hand if need be.
18685 */
18686 if (ill->ill_move_ipif != NULL) {
18687 ipif = ill->ill_move_ipif;
18688 ip1dbg(("bringing up ipif %p on ill %s\n", (void *)ipif,
18689 ipif->ipif_ill->ill_name));
18690 ill->ill_move_ipif = NULL;
18691 if (err == 0) {
18692 err = ipif_up(ipif, q, mp1);
18693 if (err == EINPROGRESS)
18694 return;
18695 }
18696 }
18697
18698 /*
18699 * The operation must complete without EINPROGRESS since
18700 * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp.
18701 * Otherwise, the operation will be stuck forever in the ipsq.
18702 */
18703 ASSERT(err != EINPROGRESS);
18704 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18705 DTRACE_PROBE4(ipif__ioctl, char *, "arp_bringup_done finish",
18706 int, ipsq->ipsq_xop->ipx_current_ioctl,
18707 ill_t *, ill, ipif_t *, ipif);
18708 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
18709 } else {
18710 ipsq_current_finish(ipsq);
18711 }
18712 }
18713
18714 /*
18715 * Finish processing of arp replumb after all the DLPI message
18716 * exchanges have completed between arp and the driver.
18717 */
18718 void
18719 arp_replumb_done(ill_t *ill, int err)
18720 {
18721 mblk_t *mp1;
18722 ipif_t *ipif;
18723 conn_t *connp = NULL;
18724 ipsq_t *ipsq;
18725 queue_t *q;
18726
18727 ASSERT(IAM_WRITER_ILL(ill));
18728
18729 ipsq = ill->ill_phyint->phyint_ipsq;
18730 ipif = ipsq->ipsq_xop->ipx_pending_ipif;
18731 mp1 = ipsq_pending_mp_get(ipsq, &connp);
18732 ASSERT(!((mp1 != NULL) ^ (ipif != NULL)));
18733 if (mp1 == NULL) {
18734 ip0dbg(("arp_replumb_done: bringup aborted ioctl %x\n",
18735 ipsq->ipsq_xop->ipx_current_ioctl));
18736 /* bringup was aborted by the user */
18737 return;
18738 }
18739 /*
18740 * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we
18741 * must have an associated conn_t. Otherwise, we're bringing this
18742 * interface back up as part of handling an asynchronous event (e.g.,
18743 * physical address change).
18744 */
18745 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18746 ASSERT(connp != NULL);
18747 q = CONNP_TO_WQ(connp);
18748 } else {
18749 ASSERT(connp == NULL);
18750 q = ill->ill_rq;
18751 }
18752 if ((err == 0) && (ill->ill_up_ipifs)) {
18753 err = ill_up_ipifs(ill, q, mp1);
18754 if (err == EINPROGRESS)
18755 return;
18756 }
18757 /*
18758 * The operation must complete without EINPROGRESS since
18759 * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp.
18760 * Otherwise, the operation will be stuck forever in the ipsq.
18761 */
18762 ASSERT(err != EINPROGRESS);
18763 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18764 DTRACE_PROBE4(ipif__ioctl, char *,
18765 "arp_replumb_done finish",
18766 int, ipsq->ipsq_xop->ipx_current_ioctl,
18767 ill_t *, ill, ipif_t *, ipif);
18768 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
18769 } else {
18770 ipsq_current_finish(ipsq);
18771 }
18772 }
18773
18774 void
18775 ipif_up_notify(ipif_t *ipif)
18776 {
18777 ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
18778 ip_rts_newaddrmsg(RTM_ADD, 0, ipif, RTSQ_DEFAULT);
18779 sctp_update_ipif(ipif, SCTP_IPIF_UP);
18780 ill_nic_event_dispatch(ipif->ipif_ill, MAP_IPIF_ID(ipif->ipif_id),
18781 NE_LIF_UP, NULL, 0);
18782 }
18783
18784 /*
18785 * Some ILB ioctls (such as deleting a rule or adding a server) use cv_wait
18786 * and therefore assume a cv_wait'able context. Hence they shouldn't be used
18787 * on TPI end points with STREAMS modules pushed above. This is assured by
18788 * not having the IPI_MODOK flag for the ioctl. And IP ensures the ILB ioctl
18789 * never ends up on an ipsq; otherwise we may end up processing the ioctl
18790 * while unwinding from the ipsq, and that could be a thread from the bottom.
18791 */
18792 /* ARGSUSED */
18793 int
18794 ip_sioctl_ilb_cmd(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
18795 ip_ioctl_cmd_t *ipip, void *arg)
18796 {
18797 mblk_t *cmd_mp = mp->b_cont->b_cont;
18798 ilb_cmd_t command = *((ilb_cmd_t *)cmd_mp->b_rptr);
18799 int ret = 0;
18800 int i;
18801 size_t size;
18802 ip_stack_t *ipst;
18803 zoneid_t zoneid;
18804 ilb_stack_t *ilbs;
18805
18806 ipst = CONNQ_TO_IPST(q);
18807 ilbs = ipst->ips_netstack->netstack_ilb;
18808 zoneid = Q_TO_CONN(q)->conn_zoneid;
18809
18810 switch (command) {
18811 case ILB_CREATE_RULE: {
18812 ilb_rule_cmd_t *cmd = (ilb_rule_cmd_t *)cmd_mp->b_rptr;
18813
18814 if (MBLKL(cmd_mp) != sizeof (ilb_rule_cmd_t)) {
18815 ret = EINVAL;
18816 break;
18817 }
18818
18819 ret = ilb_rule_add(ilbs, zoneid, cmd);
18820 break;
18821 }
18822 case ILB_DESTROY_RULE:
18823 case ILB_ENABLE_RULE:
18824 case ILB_DISABLE_RULE: {
18825 ilb_name_cmd_t *cmd = (ilb_name_cmd_t *)cmd_mp->b_rptr;
18826
18827 if (MBLKL(cmd_mp) != sizeof (ilb_name_cmd_t)) {
18828 ret = EINVAL;
18829 break;
18830 }
18831
18832 if (cmd->flags & ILB_RULE_ALLRULES) {
18833 if (command == ILB_DESTROY_RULE) {
18834 ilb_rule_del_all(ilbs, zoneid);
18835 break;
18836 } else if (command == ILB_ENABLE_RULE) {
18837 ilb_rule_enable_all(ilbs, zoneid);
18838 break;
18839 } else if (command == ILB_DISABLE_RULE) {
18840 ilb_rule_disable_all(ilbs, zoneid);
18841 break;
18842 }
18843 } else {
18844 if (command == ILB_DESTROY_RULE) {
18845 ret = ilb_rule_del(ilbs, zoneid, cmd->name);
18846 } else if (command == ILB_ENABLE_RULE) {
18847 ret = ilb_rule_enable(ilbs, zoneid, cmd->name,
18848 NULL);
18849 } else if (command == ILB_DISABLE_RULE) {
18850 ret = ilb_rule_disable(ilbs, zoneid, cmd->name,
18851 NULL);
18852 }
18853 }
18854 break;
18855 }
18856 case ILB_NUM_RULES: {
18857 ilb_num_rules_cmd_t *cmd;
18858
18859 if (MBLKL(cmd_mp) != sizeof (ilb_num_rules_cmd_t)) {
18860 ret = EINVAL;
18861 break;
18862 }
18863 cmd = (ilb_num_rules_cmd_t *)cmd_mp->b_rptr;
18864 ilb_get_num_rules(ilbs, zoneid, &(cmd->num));
18865 break;
18866 }
18867 case ILB_RULE_NAMES: {
18868 ilb_rule_names_cmd_t *cmd;
18869
18870 cmd = (ilb_rule_names_cmd_t *)cmd_mp->b_rptr;
18871 if (MBLKL(cmd_mp) < sizeof (ilb_rule_names_cmd_t) ||
18872 cmd->num_names == 0) {
18873 ret = EINVAL;
18874 break;
18875 }
18876 size = cmd->num_names * ILB_RULE_NAMESZ;
18877 if (cmd_mp->b_rptr + offsetof(ilb_rule_names_cmd_t, buf) +
18878 size != cmd_mp->b_wptr) {
18879 ret = EINVAL;
18880 break;
18881 }
18882 ilb_get_rulenames(ilbs, zoneid, &cmd->num_names, cmd->buf);
18883 break;
18884 }
18885 case ILB_NUM_SERVERS: {
18886 ilb_num_servers_cmd_t *cmd;
18887
18888 if (MBLKL(cmd_mp) != sizeof (ilb_num_servers_cmd_t)) {
18889 ret = EINVAL;
18890 break;
18891 }
18892 cmd = (ilb_num_servers_cmd_t *)cmd_mp->b_rptr;
18893 ret = ilb_get_num_servers(ilbs, zoneid, cmd->name,
18894 &(cmd->num));
18895 break;
18896 }
18897 case ILB_LIST_RULE: {
18898 ilb_rule_cmd_t *cmd = (ilb_rule_cmd_t *)cmd_mp->b_rptr;
18899
18900 if (MBLKL(cmd_mp) != sizeof (ilb_rule_cmd_t)) {
18901 ret = EINVAL;
18902 break;
18903 }
18904 ret = ilb_rule_list(ilbs, zoneid, cmd);
18905 break;
18906 }
18907 case ILB_LIST_SERVERS: {
18908 ilb_servers_info_cmd_t *cmd;
18909
18910 cmd = (ilb_servers_info_cmd_t *)cmd_mp->b_rptr;
18911 if (MBLKL(cmd_mp) < sizeof (ilb_servers_info_cmd_t) ||
18912 cmd->num_servers == 0) {
18913 ret = EINVAL;
18914 break;
18915 }
18916 size = cmd->num_servers * sizeof (ilb_server_info_t);
18917 if (cmd_mp->b_rptr + offsetof(ilb_servers_info_cmd_t, servers) +
18918 size != cmd_mp->b_wptr) {
18919 ret = EINVAL;
18920 break;
18921 }
18922
18923 ret = ilb_get_servers(ilbs, zoneid, cmd->name, cmd->servers,
18924 &cmd->num_servers);
18925 break;
18926 }
18927 case ILB_ADD_SERVERS: {
18928 ilb_servers_info_cmd_t *cmd;
18929 ilb_rule_t *rule;
18930
18931 cmd = (ilb_servers_info_cmd_t *)cmd_mp->b_rptr;
18932 if (MBLKL(cmd_mp) < sizeof (ilb_servers_info_cmd_t)) {
18933 ret = EINVAL;
18934 break;
18935 }
18936 size = cmd->num_servers * sizeof (ilb_server_info_t);
18937 if (cmd_mp->b_rptr + offsetof(ilb_servers_info_cmd_t, servers) +
18938 size != cmd_mp->b_wptr) {
18939 ret = EINVAL;
18940 break;
18941 }
18942 rule = ilb_find_rule(ilbs, zoneid, cmd->name, &ret);
18943 if (rule == NULL) {
18944 ASSERT(ret != 0);
18945 break;
18946 }
18947 for (i = 0; i < cmd->num_servers; i++) {
18948 ilb_server_info_t *s;
18949
18950 s = &cmd->servers[i];
18951 s->err = ilb_server_add(ilbs, rule, s);
18952 }
18953 ILB_RULE_REFRELE(rule);
18954 break;
18955 }
18956 case ILB_DEL_SERVERS:
18957 case ILB_ENABLE_SERVERS:
18958 case ILB_DISABLE_SERVERS: {
18959 ilb_servers_cmd_t *cmd;
18960 ilb_rule_t *rule;
18961 int (*f)();
18962
18963 cmd = (ilb_servers_cmd_t *)cmd_mp->b_rptr;
18964 if (MBLKL(cmd_mp) < sizeof (ilb_servers_cmd_t)) {
18965 ret = EINVAL;
18966 break;
18967 }
18968 size = cmd->num_servers * sizeof (ilb_server_arg_t);
18969 if (cmd_mp->b_rptr + offsetof(ilb_servers_cmd_t, servers) +
18970 size != cmd_mp->b_wptr) {
18971 ret = EINVAL;
18972 break;
18973 }
18974
18975 if (command == ILB_DEL_SERVERS)
18976 f = ilb_server_del;
18977 else if (command == ILB_ENABLE_SERVERS)
18978 f = ilb_server_enable;
18979 else if (command == ILB_DISABLE_SERVERS)
18980 f = ilb_server_disable;
18981
18982 rule = ilb_find_rule(ilbs, zoneid, cmd->name, &ret);
18983 if (rule == NULL) {
18984 ASSERT(ret != 0);
18985 break;
18986 }
18987
18988 for (i = 0; i < cmd->num_servers; i++) {
18989 ilb_server_arg_t *s;
18990
18991 s = &cmd->servers[i];
18992 s->err = f(ilbs, zoneid, NULL, rule, &s->addr);
18993 }
18994 ILB_RULE_REFRELE(rule);
18995 break;
18996 }
18997 case ILB_LIST_NAT_TABLE: {
18998 ilb_list_nat_cmd_t *cmd;
18999
19000 cmd = (ilb_list_nat_cmd_t *)cmd_mp->b_rptr;
19001 if (MBLKL(cmd_mp) < sizeof (ilb_list_nat_cmd_t)) {
19002 ret = EINVAL;
19003 break;
19004 }
19005 size = cmd->num_nat * sizeof (ilb_nat_entry_t);
19006 if (cmd_mp->b_rptr + offsetof(ilb_list_nat_cmd_t, entries) +
19007 size != cmd_mp->b_wptr) {
19008 ret = EINVAL;
19009 break;
19010 }
19011
19012 ret = ilb_list_nat(ilbs, zoneid, cmd->entries, &cmd->num_nat,
19013 &cmd->flags);
19014 break;
19015 }
19016 case ILB_LIST_STICKY_TABLE: {
19017 ilb_list_sticky_cmd_t *cmd;
19018
19019 cmd = (ilb_list_sticky_cmd_t *)cmd_mp->b_rptr;
19020 if (MBLKL(cmd_mp) < sizeof (ilb_list_sticky_cmd_t)) {
19021 ret = EINVAL;
19022 break;
19023 }
19024 size = cmd->num_sticky * sizeof (ilb_sticky_entry_t);
19025 if (cmd_mp->b_rptr + offsetof(ilb_list_sticky_cmd_t, entries) +
19026 size != cmd_mp->b_wptr) {
19027 ret = EINVAL;
19028 break;
19029 }
19030
19031 ret = ilb_list_sticky(ilbs, zoneid, cmd->entries,
19032 &cmd->num_sticky, &cmd->flags);
19033 break;
19034 }
19035 default:
19036 ret = EINVAL;
19037 break;
19038 }
19040 return (ret);
19041 }
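
/*
 * Note on the length checks above: each variable-length ILB command is a
 * fixed header followed by `num_*' trailing array entries, so a
 * well-formed mblk must satisfy (using ILB_ADD_SERVERS as the example):
 *
 *	b_rptr + offsetof(ilb_servers_info_cmd_t, servers) +
 *	    num_servers * sizeof (ilb_server_info_t) == b_wptr
 */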
19042
19043 /* Remove all cache entries for this logical interface */
19044 void
19045 ipif_nce_down(ipif_t *ipif)
19046 {
19047 ill_t *ill = ipif->ipif_ill;
19048 nce_t *nce;
19049
19050 DTRACE_PROBE3(ipif__downup, char *, "ipif_nce_down",
19051 ill_t *, ill, ipif_t *, ipif);
19052 if (ipif->ipif_added_nce) {
19053 if (ipif->ipif_isv6)
19054 nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr);
19055 else
19056 nce = nce_lookup_v4(ill, &ipif->ipif_lcl_addr);
19057 if (nce != NULL) {
19058 if (--nce->nce_ipif_cnt == 0)
19059 ncec_delete(nce->nce_common);
19060 ipif->ipif_added_nce = 0;
19061 nce_refrele(nce);
19062 } else {
19063 /*
19064 * nce may already be NULL because it was already
19065 * flushed, e.g., due to a call to nce_flush
19066 */
19067 ipif->ipif_added_nce = 0;
19068 }
19069 }
19070 /*
19071 * Make IPMP aware of the deleted data address.
19072 */
19073 if (IS_IPMP(ill))
19074 ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
19075
19076 /*
19077 * Remove all other nces dependent on this ill when the last ipif
19078 * is going away.
19079 */
19080 if (ill->ill_ipif_up_count == 0) {
19081 ncec_walk(ill, ncec_delete_per_ill, ill, ill->ill_ipst);
19082 if (IS_UNDER_IPMP(ill))
19083 nce_flush(ill, B_TRUE);
19084 }
19085 }
19086
19087 /*
19088 * Find the first interface that uses usill for its source address.
19089 */
19090 ill_t *
19091 ill_lookup_usesrc(ill_t *usill)
19092 {
19093 ip_stack_t *ipst = usill->ill_ipst;
19094 ill_t *ill;
19095
19096 ASSERT(usill != NULL);
19097
19098 /* ill_g_usesrc_lock protects ill_usesrc_grp_next */
19099 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
19100 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
19101 for (ill = usill->ill_usesrc_grp_next; ill != NULL && ill != usill;
19102 ill = ill->ill_usesrc_grp_next) {
19103 if (!IS_UNDER_IPMP(ill) && (ill->ill_flags & ILLF_MULTICAST) &&
19104 !ILL_IS_CONDEMNED(ill)) {
19105 ill_refhold(ill);
19106 break;
19107 }
19108 }
19109 rw_exit(&ipst->ips_ill_g_lock);
19110 rw_exit(&ipst->ips_ill_g_usesrc_lock);
19111 return (ill);
19112 }
19113
19114 /*
19115 * This comment applies to both ip_sioctl_get_ifhwaddr and
19116 * ip_sioctl_get_lifhwaddr, since the basic function of the two routines
19117 * is the same.
19118 *
19119 * The goal here is to find an IP interface that corresponds to the name
19120 * provided by the caller in the ifreq/lifreq structure held in the mblk_t
19121 * chain and to fill out a sockaddr/sockaddr_storage structure with the
19122 * mac address.
19123 *
19124 * The SIOCGIFHWADDR/SIOCGLIFHWADDR ioctl may return an error for a number
19125 * of different reasons:
19126 * ENXIO - the device name is not known to IP.
19127 * EADDRNOTAVAIL - the device has no hardware address. This is indicated
19128 * by ill_phys_addr not pointing to an actual address.
19129 * EPFNOSUPPORT - this will indicate that a request is being made for a
19130 * mac address that will not fit in the data structure supplied (struct
19131 * sockaddr).
19132 *
19133 */
19134 /* ARGSUSED */
19135 int
19136 ip_sioctl_get_ifhwaddr(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
19137 ip_ioctl_cmd_t *ipip, void *if_req)
19138 {
19139 struct sockaddr *sock;
19140 struct ifreq *ifr;
19141 mblk_t *mp1;
19142 ill_t *ill;
19143
19144 ASSERT(ipif != NULL);
19145 ill = ipif->ipif_ill;
19146
19147 if (ill->ill_phys_addr == NULL) {
19148 return (EADDRNOTAVAIL);
19149 }
19150 if (ill->ill_phys_addr_length > sizeof (sock->sa_data)) {
19151 return (EPFNOSUPPORT);
19152 }
19153
19154 ip1dbg(("ip_sioctl_get_hwaddr(%s)\n", ill->ill_name));
19155
19156 /* Existence of mp1 has been checked in ip_wput_nondata */
19157 mp1 = mp->b_cont->b_cont;
19158 ifr = (struct ifreq *)mp1->b_rptr;
19159
19160 sock = &ifr->ifr_addr;
19161 /*
19162 * The "family" field in the returned structure is set to a value
19163 * that represents the type of device to which the address belongs.
19164 * The value returned may differ from that on Linux but it will still
19165 * represent the correct symbol on Solaris.
19166 */
19167 sock->sa_family = arp_hw_type(ill->ill_mactype);
19168 bcopy(ill->ill_phys_addr, &sock->sa_data, ill->ill_phys_addr_length);
19169
19170 return (0);
19171 }
19172
19173 /*
19174 * The expectation of applications using SIOCGIFHWADDR is that data will
19175 * be returned in the sa_data field of the sockaddr structure. With
19176 * SIOCGLIFHWADDR, we're breaking new ground as there is no Linux
19177 * equivalent. In light of this, struct sockaddr_dl is used as it
19178 * offers more space for address storage in sdl_data.
19179 */
19180 /* ARGSUSED */
19181 int
19182 ip_sioctl_get_lifhwaddr(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
19183 ip_ioctl_cmd_t *ipip, void *if_req)
19184 {
19185 struct sockaddr_dl *sock;
19186 struct lifreq *lifr;
19187 mblk_t *mp1;
19188 ill_t *ill;
19189
19190 ASSERT(ipif != NULL);
19191 ill = ipif->ipif_ill;
19192
19193 if (ill->ill_phys_addr == NULL) {
19194 return (EADDRNOTAVAIL);
19195 }
19196 if (ill->ill_phys_addr_length > sizeof (sock->sdl_data)) {
19197 return (EPFNOSUPPORT);
19198 }
19199
19200 ip1dbg(("ip_sioctl_get_lifhwaddr(%s)\n", ill->ill_name));
19201
19202 /* Existence of mp1 has been checked in ip_wput_nondata */
19203 mp1 = mp->b_cont->b_cont;
19204 lifr = (struct lifreq *)mp1->b_rptr;
19205
19206 /*
19207 * sockaddr_dl is used here because it is also the structure used in
19208 * responding to the same ioctl in sockpfp. The only other choice is
19209 * sockaddr_ll, which contains fields that are not required here
19210 * because its purpose is different.
19211 */
19212 lifr->lifr_type = ill->ill_type;
19213 sock = (struct sockaddr_dl *)&lifr->lifr_addr;
19214 sock->sdl_family = AF_LINK;
19215 sock->sdl_index = ill->ill_phyint->phyint_ifindex;
19216 sock->sdl_type = ill->ill_mactype;
19217 sock->sdl_nlen = 0;
19218 sock->sdl_slen = 0;
19219 sock->sdl_alen = ill->ill_phys_addr_length;
19220 bcopy(ill->ill_phys_addr, sock->sdl_data, ill->ill_phys_addr_length);
19221
19222 return (0);
19223 }
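
/*
 * Illustrative userland usage of SIOCGLIFHWADDR (a sketch under assumed
 * names `s', an AF_INET socket descriptor, and `name', the interface
 * name; not part of this file). The MAC address comes back in sdl_data
 * with sdl_alen valid bytes:
 *
 *	struct lifreq lifr;
 *	struct sockaddr_dl *sdl;
 *
 *	bzero(&lifr, sizeof (lifr));
 *	(void) strlcpy(lifr.lifr_name, name, sizeof (lifr.lifr_name));
 *	if (ioctl(s, SIOCGLIFHWADDR, &lifr) == 0) {
 *		sdl = (struct sockaddr_dl *)&lifr.lifr_addr;
 *		(use sdl->sdl_data[0 .. sdl->sdl_alen - 1])
 *	}
 */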