Print this page
OS-7184 prototype
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/inet/ip/ip_if.c
+++ new/usr/src/uts/common/inet/ip/ip_if.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 1990 Mentat Inc.
24 24 * Copyright (c) 2013 by Delphix. All rights reserved.
25 25 * Copyright 2019 Joyent, Inc.
26 26 * Copyright (c) 2014, OmniTI Computer Consulting, Inc. All rights reserved.
27 27 */
28 28
29 29 /*
30 30 * This file contains the interface control functions for IP.
31 31 */
32 32
33 33 #include <sys/types.h>
34 34 #include <sys/stream.h>
35 35 #include <sys/dlpi.h>
36 36 #include <sys/stropts.h>
37 37 #include <sys/strsun.h>
38 38 #include <sys/sysmacros.h>
39 39 #include <sys/strsubr.h>
40 40 #include <sys/strlog.h>
41 41 #include <sys/ddi.h>
42 42 #include <sys/sunddi.h>
43 43 #include <sys/cmn_err.h>
44 44 #include <sys/kstat.h>
45 45 #include <sys/debug.h>
46 46 #include <sys/zone.h>
47 47 #include <sys/sunldi.h>
48 48 #include <sys/file.h>
49 49 #include <sys/bitmap.h>
50 50 #include <sys/cpuvar.h>
51 51 #include <sys/time.h>
52 52 #include <sys/ctype.h>
53 53 #include <sys/kmem.h>
54 54 #include <sys/systm.h>
55 55 #include <sys/param.h>
56 56 #include <sys/socket.h>
57 57 #include <sys/isa_defs.h>
58 58 #include <net/if.h>
59 59 #include <net/if_arp.h>
60 60 #include <net/if_types.h>
61 61 #include <net/if_dl.h>
62 62 #include <net/route.h>
63 63 #include <sys/sockio.h>
64 64 #include <netinet/in.h>
65 65 #include <netinet/ip6.h>
66 66 #include <netinet/icmp6.h>
67 67 #include <netinet/igmp_var.h>
68 68 #include <sys/policy.h>
69 69 #include <sys/ethernet.h>
70 70 #include <sys/callb.h>
71 71 #include <sys/md5.h>
72 72
73 73 #include <inet/common.h> /* for various inet/mi.h and inet/nd.h needs */
74 74 #include <inet/mi.h>
75 75 #include <inet/nd.h>
76 76 #include <inet/tunables.h>
77 77 #include <inet/arp.h>
78 78 #include <inet/ip_arp.h>
79 79 #include <inet/mib2.h>
80 80 #include <inet/ip.h>
81 81 #include <inet/ip6.h>
82 82 #include <inet/ip6_asp.h>
83 83 #include <inet/tcp.h>
84 84 #include <inet/ip_multi.h>
85 85 #include <inet/ip_ire.h>
86 86 #include <inet/ip_ftable.h>
87 87 #include <inet/ip_rts.h>
88 88 #include <inet/ip_ndp.h>
89 89 #include <inet/ip_if.h>
90 90 #include <inet/ip_impl.h>
91 91 #include <inet/sctp_ip.h>
92 92 #include <inet/ip_netinfo.h>
93 93 #include <inet/ilb_ip.h>
94 94
95 95 #include <netinet/igmp.h>
96 96 #include <inet/ip_listutils.h>
97 97 #include <inet/ipclassifier.h>
98 98 #include <sys/mac_client.h>
99 99 #include <sys/dld.h>
100 100 #include <sys/mac_flow.h>
101 101
102 102 #include <sys/systeminfo.h>
103 103 #include <sys/bootconf.h>
104 104
105 105 #include <sys/tsol/tndb.h>
106 106 #include <sys/tsol/tnet.h>
107 107
108 108 #include <inet/rawip_impl.h> /* needed for icmp_stack_t */
109 109 #include <inet/udp_impl.h> /* needed for udp_stack_t */
110 110
111 111 /* The character which tells where the ill_name ends */
112 112 #define IPIF_SEPARATOR_CHAR ':'
113 113
114 114 /* IP ioctl function table entry */
115 115 typedef struct ipft_s {
116 116 int ipft_cmd;
117 117 pfi_t ipft_pfi;
118 118 int ipft_min_size;
119 119 int ipft_flags;
120 120 } ipft_t;
121 121 #define IPFT_F_NO_REPLY 0x1 /* IP ioctl does not expect any reply */
122 122 #define IPFT_F_SELF_REPLY 0x2 /* ioctl callee does the ioctl reply */
123 123
124 124 static int nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *);
125 125 static int nd_ill_forward_set(queue_t *q, mblk_t *mp,
126 126 char *value, caddr_t cp, cred_t *ioc_cr);
127 127
128 128 static boolean_t ill_is_quiescent(ill_t *);
129 129 static boolean_t ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask);
130 130 static ip_m_t *ip_m_lookup(t_uscalar_t mac_type);
131 131 static int ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
132 132 mblk_t *mp, boolean_t need_up);
133 133 static int ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
134 134 mblk_t *mp, boolean_t need_up);
135 135 static int ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
136 136 queue_t *q, mblk_t *mp, boolean_t need_up);
137 137 static int ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q,
138 138 mblk_t *mp);
139 139 static int ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q,
140 140 mblk_t *mp);
141 141 static int ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t,
142 142 queue_t *q, mblk_t *mp, boolean_t need_up);
143 143 static int ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp,
144 144 int ioccmd, struct linkblk *li);
145 145 static ipaddr_t ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *);
146 146 static void ip_wput_ioctl(queue_t *q, mblk_t *mp);
147 147 static void ipsq_flush(ill_t *ill);
148 148
149 149 static int ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen,
150 150 queue_t *q, mblk_t *mp, boolean_t need_up);
151 151 static void ipsq_delete(ipsq_t *);
152 152
153 153 static ipif_t *ipif_allocate(ill_t *ill, int id, uint_t ire_type,
154 154 boolean_t initialize, boolean_t insert, int *errorp);
155 155 static ire_t **ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep);
156 156 static void ipif_delete_bcast_ires(ipif_t *ipif);
157 157 static int ipif_add_ires_v4(ipif_t *, boolean_t);
158 158 static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif,
159 159 boolean_t isv6);
160 160 static int ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp);
161 161 static void ipif_free(ipif_t *ipif);
162 162 static void ipif_free_tail(ipif_t *ipif);
163 163 static void ipif_set_default(ipif_t *ipif);
164 164 static int ipif_set_values(queue_t *q, mblk_t *mp,
165 165 char *interf_name, uint_t *ppa);
166 166 static int ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp,
167 167 queue_t *q);
168 168 static ipif_t *ipif_lookup_on_name(char *name, size_t namelen,
169 169 boolean_t do_alloc, boolean_t *exists, boolean_t isv6, zoneid_t zoneid,
170 170 ip_stack_t *);
171 171 static ipif_t *ipif_lookup_on_name_async(char *name, size_t namelen,
172 172 boolean_t isv6, zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func,
173 173 int *error, ip_stack_t *);
174 174
175 175 static int ill_alloc_ppa(ill_if_t *, ill_t *);
176 176 static void ill_delete_interface_type(ill_if_t *);
177 177 static int ill_dl_up(ill_t *ill, ipif_t *ipif);
178 178 static void ill_dl_down(ill_t *ill);
179 179 static void ill_down(ill_t *ill);
180 180 static void ill_down_ipifs(ill_t *, boolean_t);
181 181 static void ill_free_mib(ill_t *ill);
182 182 static void ill_glist_delete(ill_t *);
183 183 static void ill_phyint_reinit(ill_t *ill);
184 184 static void ill_set_nce_router_flags(ill_t *, boolean_t);
185 185 static void ill_set_phys_addr_tail(ipsq_t *, queue_t *, mblk_t *, void *);
186 186 static void ill_replumb_tail(ipsq_t *, queue_t *, mblk_t *, void *);
187 187
188 188 static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid;
189 189 static ip_v6intfid_func_t ip_ipv4_v6intfid, ip_ipv6_v6intfid;
190 190 static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid;
191 191 static ip_v6intfid_func_t ip_ipv4_v6destintfid, ip_ipv6_v6destintfid;
192 192 static ip_v4mapinfo_func_t ip_ether_v4_mapping;
193 193 static ip_v6mapinfo_func_t ip_ether_v6_mapping;
194 194 static ip_v4mapinfo_func_t ip_ib_v4_mapping;
195 195 static ip_v6mapinfo_func_t ip_ib_v6_mapping;
196 196 static ip_v4mapinfo_func_t ip_mbcast_mapping;
197 197 static void ip_cgtp_bcast_add(ire_t *, ip_stack_t *);
198 198 static void ip_cgtp_bcast_delete(ire_t *, ip_stack_t *);
199 199 static void phyint_free(phyint_t *);
200 200
201 201 static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *);
202 202 static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
203 203 static void ill_capability_vrrp_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
204 204 static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
205 205 static void ill_capability_hcksum_reset_fill(ill_t *, mblk_t *);
206 206 static void ill_capability_zerocopy_ack(ill_t *, mblk_t *,
207 207 dl_capability_sub_t *);
208 208 static void ill_capability_zerocopy_reset_fill(ill_t *, mblk_t *);
209 209 static void ill_capability_dld_reset_fill(ill_t *, mblk_t *);
210 210 static void ill_capability_dld_ack(ill_t *, mblk_t *,
211 211 dl_capability_sub_t *);
212 212 static void ill_capability_dld_enable(ill_t *);
213 213 static void ill_capability_ack_thr(void *);
214 214 static void ill_capability_lso_enable(ill_t *);
215 215
216 216 static ill_t *ill_prev_usesrc(ill_t *);
217 217 static int ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t);
218 218 static void ill_disband_usesrc_group(ill_t *);
219 219 static void ip_sioctl_garp_reply(mblk_t *, ill_t *, void *, int);
220 220
221 221 #ifdef DEBUG
222 222 static void ill_trace_cleanup(const ill_t *);
223 223 static void ipif_trace_cleanup(const ipif_t *);
224 224 #endif
225 225
226 226 static void ill_dlpi_clear_deferred(ill_t *ill);
227 227
228 228 static void phyint_flags_init(phyint_t *, t_uscalar_t);
229 229
230 230 /*
231 231 * if we go over the memory footprint limit more than once in this msec
232 232 * interval, we'll start pruning aggressively.
233 233 */
234 234 int ip_min_frag_prune_time = 0;
235 235
/*
 * Dispatch table for IP-private ioctls: command, handler, minimum payload
 * size, and IPFT_F_* flags describing who generates the reply (presumably
 * consumed by ip_wput_ioctl() — verify against its definition).  The
 * zero-filled entry terminates the table.
 */
static ipft_t ip_ioctl_ftbl[] = {
	{ IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 },
	{ IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t),
		IPFT_F_NO_REPLY },
	{ IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY },
	{ 0 }
};
243 243
244 244 /* Simple ICMP IP Header Template */
245 245 static ipha_t icmp_ipha = {
246 246 IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
247 247 };
248 248
249 249 static uchar_t ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
250 250
/*
 * Media table: one entry per DLPI mac type.  Each row gives the DLPI type,
 * the corresponding ifnet interface type (IFT_*), the v4 and v6 SAP values,
 * and the per-medium functions for v4/v6 multicast address mapping and for
 * deriving the IPv6 interface-id (and, for tunnels, the destination
 * interface-id).  Looked up by mac type (see ip_m_lookup()); the DL_OTHER
 * row serves as the catch-all default.
 */
static ip_m_t   ip_m_tbl[] = {
	{ DL_ETHER, IFT_ETHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_CSMACD, IFT_ISO88023, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_TPB, IFT_ISO88024, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_TPR, IFT_ISO88025, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_FDDI, IFT_FDDI, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_IB, IFT_IB, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ib_v4_mapping, ip_ib_v6_mapping, ip_ib_v6intfid,
	    ip_nodef_v6intfid },
	{ DL_IPV4, IFT_IPV4, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
	    ip_ipv4_v6destintfid },
	{ DL_IPV6, IFT_IPV6, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv6_v6intfid,
	    ip_ipv6_v6destintfid },
	{ DL_6TO4, IFT_6TO4, IPPROTO_ENCAP, IPPROTO_IPV6,
	    ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid,
	    ip_nodef_v6intfid },
	{ SUNW_DL_VNI, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    NULL, NULL, ip_nodef_v6intfid, ip_nodef_v6intfid },
	{ SUNW_DL_IPMP, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    NULL, NULL, ip_ipmp_v6intfid, ip_nodef_v6intfid },
	{ DL_OTHER, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6,
	    ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid,
	    ip_nodef_v6intfid }
};
287 287
288 288 char ipif_loopback_name[] = "lo0";
289 289
290 290 /* These are used by all IP network modules. */
291 291 sin6_t sin6_null; /* Zero address for quick clears */
292 292 sin_t sin_null; /* Zero address for quick clears */
293 293
294 294 /* When set search for unused ipif_seqid */
295 295 static ipif_t ipif_zero;
296 296
297 297 /*
298 298 * ppa arena is created after these many
299 299 * interfaces have been plumbed.
300 300 */
301 301 uint_t ill_no_arena = 12; /* Setable in /etc/system */
302 302
303 303 /*
304 304 * Allocate per-interface mibs.
305 305 * Returns true if ok. False otherwise.
306 306 * ipsq may not yet be allocated (loopback case ).
307 307 */
308 308 static boolean_t
309 309 ill_allocate_mibs(ill_t *ill)
310 310 {
311 311 /* Already allocated? */
312 312 if (ill->ill_ip_mib != NULL) {
313 313 if (ill->ill_isv6)
314 314 ASSERT(ill->ill_icmp6_mib != NULL);
315 315 return (B_TRUE);
316 316 }
317 317
318 318 ill->ill_ip_mib = kmem_zalloc(sizeof (*ill->ill_ip_mib),
319 319 KM_NOSLEEP);
320 320 if (ill->ill_ip_mib == NULL) {
321 321 return (B_FALSE);
322 322 }
323 323
324 324 /* Setup static information */
325 325 SET_MIB(ill->ill_ip_mib->ipIfStatsEntrySize,
326 326 sizeof (mib2_ipIfStatsEntry_t));
327 327 if (ill->ill_isv6) {
328 328 ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv6;
329 329 SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
330 330 sizeof (mib2_ipv6AddrEntry_t));
331 331 SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
332 332 sizeof (mib2_ipv6RouteEntry_t));
333 333 SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
334 334 sizeof (mib2_ipv6NetToMediaEntry_t));
335 335 SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
336 336 sizeof (ipv6_member_t));
337 337 SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
338 338 sizeof (ipv6_grpsrc_t));
339 339 } else {
340 340 ill->ill_ip_mib->ipIfStatsIPVersion = MIB2_INETADDRESSTYPE_ipv4;
341 341 SET_MIB(ill->ill_ip_mib->ipIfStatsAddrEntrySize,
342 342 sizeof (mib2_ipAddrEntry_t));
343 343 SET_MIB(ill->ill_ip_mib->ipIfStatsRouteEntrySize,
344 344 sizeof (mib2_ipRouteEntry_t));
345 345 SET_MIB(ill->ill_ip_mib->ipIfStatsNetToMediaEntrySize,
346 346 sizeof (mib2_ipNetToMediaEntry_t));
347 347 SET_MIB(ill->ill_ip_mib->ipIfStatsMemberEntrySize,
348 348 sizeof (ip_member_t));
349 349 SET_MIB(ill->ill_ip_mib->ipIfStatsGroupSourceEntrySize,
350 350 sizeof (ip_grpsrc_t));
351 351
352 352 /*
353 353 * For a v4 ill, we are done at this point, because per ill
354 354 * icmp mibs are only used for v6.
355 355 */
356 356 return (B_TRUE);
357 357 }
358 358
359 359 ill->ill_icmp6_mib = kmem_zalloc(sizeof (*ill->ill_icmp6_mib),
360 360 KM_NOSLEEP);
361 361 if (ill->ill_icmp6_mib == NULL) {
362 362 kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
363 363 ill->ill_ip_mib = NULL;
364 364 return (B_FALSE);
365 365 }
366 366 /* static icmp info */
367 367 ill->ill_icmp6_mib->ipv6IfIcmpEntrySize =
368 368 sizeof (mib2_ipv6IfIcmpEntry_t);
369 369 /*
370 370 * The ipIfStatsIfindex and ipv6IfIcmpIndex will be assigned later
371 371 * after the phyint merge occurs in ipif_set_values -> ill_glist_insert
372 372 * -> ill_phyint_reinit
373 373 */
374 374 return (B_TRUE);
375 375 }
376 376
/*
 * Completely vaporize a lower level tap and all associated interfaces.
 * ill_delete is called only out of ip_close when the device control
 * stream is being closed.
 */
void
ill_delete(ill_t *ill)
{
	ipif_t	*ipif;
	ill_t	*prev_ill;
	ip_stack_t	*ipst = ill->ill_ipst;

	/*
	 * ill_delete may be forcibly entering the ipsq. The previous
	 * ioctl may not have completed and may need to be aborted.
	 * ipsq_flush takes care of it. If we don't need to enter the
	 * the ipsq forcibly, the 2nd invocation of ipsq_flush in
	 * ill_delete_tail is sufficient.
	 */
	ipsq_flush(ill);

	/*
	 * Nuke all interfaces. ipif_free will take down the interface,
	 * remove it from the list, and free the data structure.
	 * Walk down the ipif list and remove the logical interfaces
	 * first before removing the main ipif. We can't unplumb
	 * zeroth interface first in the case of IPv6 as update_conn_ill
	 * -> ip_ll_multireq de-references ill_ipif for checking
	 * POINTOPOINT.
	 *
	 * If ill_ipif was not properly initialized (i.e low on memory),
	 * then no interfaces to clean up. In this case just clean up the
	 * ill.
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		ipif_free(ipif);

	/*
	 * clean out all the nce_t entries that depend on this
	 * ill for the ill_phys_addr.
	 */
	nce_flush(ill, B_TRUE);

	/* Clean up msgs on pending upcalls for mrouted */
	reset_mrt_ill(ill);

	/* Clean up conn-level references to this ill. */
	update_conn_ill(ill, ipst);

	/*
	 * Remove multicast references added as a result of calls to
	 * ip_join_allmulti().
	 */
	ip_purge_allmulti(ill);

	/*
	 * If the ill being deleted is under IPMP, boot it out of the illgrp.
	 */
	if (IS_UNDER_IPMP(ill))
		ipmp_ill_leave_illgrp(ill);

	/*
	 * ill_down will arrange to blow off any IRE's dependent on this
	 * ILL, and shut down fragmentation reassembly.
	 */
	ill_down(ill);

	/* Let SCTP know, so that it can remove this from its list. */
	sctp_update_ill(ill, SCTP_ILL_REMOVE);

	/*
	 * Walk all CONNs that can have a reference on an ire or nce for this
	 * ill (we actually walk all that now have stale references).
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst);

	/* With IPv6 we have dce_ifindex. Cleanup for neatness */
	if (ill->ill_isv6)
		dce_cleanup(ill->ill_phyint->phyint_ifindex, ipst);

	/*
	 * If an address on this ILL is being used as a source address then
	 * clear out the pointers in other ILLs that point to this ILL.
	 * The usesrc list is a singly linked list; unlinking requires the
	 * global usesrc lock held as writer.
	 */
	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
	if (ill->ill_usesrc_grp_next != NULL) {
		if (ill->ill_usesrc_ifindex == 0) { /* usesrc ILL ? */
			ill_disband_usesrc_group(ill);
		} else {	/* consumer of the usesrc ILL */
			/* Splice this ill out of the usesrc chain. */
			prev_ill = ill_prev_usesrc(ill);
			prev_ill->ill_usesrc_grp_next =
			    ill->ill_usesrc_grp_next;
		}
	}
	rw_exit(&ipst->ips_ill_g_usesrc_lock);
}
472 472
473 473 static void
474 474 ipif_non_duplicate(ipif_t *ipif)
475 475 {
476 476 ill_t *ill = ipif->ipif_ill;
477 477 mutex_enter(&ill->ill_lock);
478 478 if (ipif->ipif_flags & IPIF_DUPLICATE) {
479 479 ipif->ipif_flags &= ~IPIF_DUPLICATE;
480 480 ASSERT(ill->ill_ipif_dup_count > 0);
481 481 ill->ill_ipif_dup_count--;
482 482 }
483 483 mutex_exit(&ill->ill_lock);
484 484 }
485 485
/*
 * ill_delete_tail is called from ip_modclose after all references
 * to the closing ill are gone. The wait is done in ip_modclose
 */
void
ill_delete_tail(ill_t *ill)
{
	mblk_t	**mpp;
	ipif_t	*ipif;
	ip_stack_t *ipst = ill->ill_ipst;

	/* Finish taking down each ipif (ill_delete started the process). */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		(void) ipif_down_tail(ipif);
	}

	/* ipif_non_duplicate() above cleared every duplicate ipif. */
	ASSERT(ill->ill_ipif_dup_count == 0);

	/*
	 * If polling capability is enabled (which signifies direct
	 * upcall into IP and driver has ill saved as a handle),
	 * we need to make sure that unbind has completed before we
	 * let the ill disappear and driver no longer has any reference
	 * to this ill.
	 */
	mutex_enter(&ill->ill_lock);
	while (ill->ill_state_flags & ILL_DL_UNBIND_IN_PROGRESS)
		cv_wait(&ill->ill_cv, &ill->ill_lock);
	mutex_exit(&ill->ill_lock);
	ASSERT(!(ill->ill_capabilities &
	    (ILL_CAPAB_DLD | ILL_CAPAB_DLD_POLL | ILL_CAPAB_DLD_DIRECT)));

	/* Loopback ills presumably have no driver read queue to turn off. */
	if (ill->ill_net_type != IRE_LOOPBACK)
		qprocsoff(ill->ill_rq);

	/*
	 * We do an ipsq_flush once again now. New messages could have
	 * landed up from below (M_ERROR or M_HANGUP). Similarly ioctls
	 * could also have landed up if an ioctl thread had looked up
	 * the ill before we set the ILL_CONDEMNED flag, but not yet
	 * enqueued the ioctl when we did the ipsq_flush last time.
	 */
	ipsq_flush(ill);

	/*
	 * Free capabilities.
	 */
	if (ill->ill_hcksum_capab != NULL) {
		kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t));
		ill->ill_hcksum_capab = NULL;
	}

	if (ill->ill_zerocopy_capab != NULL) {
		kmem_free(ill->ill_zerocopy_capab,
		    sizeof (ill_zerocopy_capab_t));
		ill->ill_zerocopy_capab = NULL;
	}

	if (ill->ill_lso_capab != NULL) {
		kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
		ill->ill_lso_capab = NULL;
	}

	if (ill->ill_dld_capab != NULL) {
		kmem_free(ill->ill_dld_capab, sizeof (ill_dld_capab_t));
		ill->ill_dld_capab = NULL;
	}

	/* Clean up ill_allowed_ips* related state */
	if (ill->ill_allowed_ips != NULL) {
		ASSERT(ill->ill_allowed_ips_cnt > 0);
		kmem_free(ill->ill_allowed_ips,
		    ill->ill_allowed_ips_cnt * sizeof (in6_addr_t));
		ill->ill_allowed_ips = NULL;
		ill->ill_allowed_ips_cnt = 0;
	}

	/* ipif_free_tail() unlinks the ipif from ill_ipif as it frees it. */
	while (ill->ill_ipif != NULL)
		ipif_free_tail(ill->ill_ipif);

	/*
	 * We have removed all references to ilm from conn and the ones joined
	 * within the kernel.
	 *
	 * We don't walk conns, mrts and ires because
	 *
	 * 1) update_conn_ill and reset_mrt_ill cleans up conns and mrts.
	 * 2) ill_down ->ill_downi walks all the ires and cleans up
	 *    ill references.
	 */

	/*
	 * If this ill is an IPMP meta-interface, blow away the illgrp. This
	 * is safe to do because the illgrp has already been unlinked from the
	 * group by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find it.
	 */
	if (IS_IPMP(ill)) {
		ipmp_illgrp_destroy(ill->ill_grp);
		ill->ill_grp = NULL;
	}

	/* Free the chain of recorded physical addresses, if any. */
	if (ill->ill_mphysaddr_list != NULL) {
		multiphysaddr_t *mpa, *tmpa;

		mpa = ill->ill_mphysaddr_list;
		ill->ill_mphysaddr_list = NULL;
		while (mpa) {
			tmpa = mpa->mpa_next;
			kmem_free(mpa, sizeof (*mpa));
			mpa = tmpa;
		}
	}
	/*
	 * Take us out of the list of ILLs. ill_glist_delete -> phyint_free
	 * could free the phyint. No more reference to the phyint after this
	 * point.
	 */
	(void) ill_glist_delete(ill);

	/* Tear down the fragment reassembly hash table and its locks. */
	if (ill->ill_frag_ptr != NULL) {
		uint_t count;

		for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
			mutex_destroy(&ill->ill_frag_hash_tbl[count].ipfb_lock);
		}
		mi_free(ill->ill_frag_ptr);
		ill->ill_frag_ptr = NULL;
		ill->ill_frag_hash_tbl = NULL;
	}

	freemsg(ill->ill_nd_lla_mp);
	/*
	 * Free all retained control messages.  The retained mblks sit on
	 * the chains between ill_first_mp_to_free and ill_last_mp_to_free;
	 * clear b_next/b_prev on each before freemsg so no stale chain
	 * pointers survive.
	 */
	mpp = &ill->ill_first_mp_to_free;
	do {
		while (mpp[0]) {
			mblk_t  *mp;
			mblk_t  *mp1;

			mp = mpp[0];
			mpp[0] = mp->b_next;
			for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
				mp1->b_next = NULL;
				mp1->b_prev = NULL;
			}
			freemsg(mp);
		}
	} while (mpp++ != &ill->ill_last_mp_to_free);

	/* Roll the interface counters into the global MIBs, then free. */
	ill_free_mib(ill);

#ifdef DEBUG
	ill_trace_cleanup(ill);
#endif

	/* The default multicast interface might have changed */
	ire_increment_multicast_generation(ipst, ill->ill_isv6);

	/* Drop refcnt here */
	netstack_rele(ill->ill_ipst->ips_netstack);
	ill->ill_ipst = NULL;
}
647 647
648 648 static void
649 649 ill_free_mib(ill_t *ill)
650 650 {
651 651 ip_stack_t *ipst = ill->ill_ipst;
652 652
653 653 /*
654 654 * MIB statistics must not be lost, so when an interface
655 655 * goes away the counter values will be added to the global
656 656 * MIBs.
657 657 */
658 658 if (ill->ill_ip_mib != NULL) {
659 659 if (ill->ill_isv6) {
660 660 ip_mib2_add_ip_stats(&ipst->ips_ip6_mib,
661 661 ill->ill_ip_mib);
662 662 } else {
663 663 ip_mib2_add_ip_stats(&ipst->ips_ip_mib,
664 664 ill->ill_ip_mib);
665 665 }
666 666
667 667 kmem_free(ill->ill_ip_mib, sizeof (*ill->ill_ip_mib));
668 668 ill->ill_ip_mib = NULL;
669 669 }
670 670 if (ill->ill_icmp6_mib != NULL) {
671 671 ip_mib2_add_icmp6_stats(&ipst->ips_icmp6_mib,
672 672 ill->ill_icmp6_mib);
673 673 kmem_free(ill->ill_icmp6_mib, sizeof (*ill->ill_icmp6_mib));
674 674 ill->ill_icmp6_mib = NULL;
675 675 }
676 676 }
677 677
678 678 /*
679 679 * Concatenate together a physical address and a sap.
680 680 *
681 681 * Sap_lengths are interpreted as follows:
682 682 * sap_length == 0 ==> no sap
683 683 * sap_length > 0 ==> sap is at the head of the dlpi address
684 684 * sap_length < 0 ==> sap is at the tail of the dlpi address
685 685 */
686 686 static void
687 687 ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length,
688 688 t_scalar_t sap_src, t_scalar_t sap_length, uchar_t *dst)
689 689 {
690 690 uint16_t sap_addr = (uint16_t)sap_src;
691 691
692 692 if (sap_length == 0) {
693 693 if (phys_src == NULL)
694 694 bzero(dst, phys_length);
695 695 else
696 696 bcopy(phys_src, dst, phys_length);
697 697 } else if (sap_length < 0) {
698 698 if (phys_src == NULL)
699 699 bzero(dst, phys_length);
700 700 else
701 701 bcopy(phys_src, dst, phys_length);
702 702 bcopy(&sap_addr, (char *)dst + phys_length, sizeof (sap_addr));
703 703 } else {
704 704 bcopy(&sap_addr, dst, sizeof (sap_addr));
705 705 if (phys_src == NULL)
706 706 bzero((char *)dst + sap_length, phys_length);
707 707 else
708 708 bcopy(phys_src, (char *)dst + sap_length, phys_length);
709 709 }
710 710 }
711 711
712 712 /*
713 713 * Generate a dl_unitdata_req mblk for the device and address given.
714 714 * addr_length is the length of the physical portion of the address.
715 715 * If addr is NULL include an all zero address of the specified length.
716 716 * TRUE? In any case, addr_length is taken to be the entire length of the
717 717 * dlpi address, including the absolute value of sap_length.
718 718 */
719 719 mblk_t *
720 720 ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap,
721 721 t_scalar_t sap_length)
722 722 {
723 723 dl_unitdata_req_t *dlur;
724 724 mblk_t *mp;
725 725 t_scalar_t abs_sap_length; /* absolute value */
726 726
727 727 abs_sap_length = ABS(sap_length);
728 728 mp = ip_dlpi_alloc(sizeof (*dlur) + addr_length + abs_sap_length,
729 729 DL_UNITDATA_REQ);
730 730 if (mp == NULL)
731 731 return (NULL);
732 732 dlur = (dl_unitdata_req_t *)mp->b_rptr;
733 733 /* HACK: accomodate incompatible DLPI drivers */
734 734 if (addr_length == 8)
735 735 addr_length = 6;
736 736 dlur->dl_dest_addr_length = addr_length + abs_sap_length;
737 737 dlur->dl_dest_addr_offset = sizeof (*dlur);
738 738 dlur->dl_priority.dl_min = 0;
739 739 dlur->dl_priority.dl_max = 0;
740 740 ill_dlur_copy_address(addr, addr_length, sap, sap_length,
741 741 (uchar_t *)&dlur[1]);
742 742 return (mp);
743 743 }
744 744
/*
 * Add the pending mp to the list. There can be only 1 pending mp
 * in the list. Any exclusive ioctl that needs to wait for a response
 * from another module or driver needs to use this function to set
 * the ipx_pending_mp to the ioctl mblk and wait for the response from
 * the other module/driver. This is also used while waiting for the
 * ipif/ill/ire refcnts to drop to zero in bringing down an ipif.
 *
 * Returns B_TRUE if the mp was queued, B_FALSE if the conn is closing
 * (in which case the caller must not queue and should error out).
 * Caller must be the ipsq writer for `ipif' and hold ill_lock; if
 * `connp' is non-NULL the caller must also hold conn_lock.
 */
boolean_t
ipsq_pending_mp_add(conn_t *connp, ipif_t *ipif, queue_t *q, mblk_t *add_mp,
    int waitfor)
{
	ipxop_t	*ipx = ipif->ipif_ill->ill_phyint->phyint_ipsq->ipsq_xop;

	ASSERT(IAM_WRITER_IPIF(ipif));
	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
	ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL));
	ASSERT(ipx->ipx_pending_mp == NULL);
	/*
	 * The caller may be using a different ipif than the one passed into
	 * ipsq_current_start() (e.g., suppose an ioctl that came in on the V4
	 * ill needs to wait for the V6 ill to quiesce).  So we can't ASSERT
	 * that `ipx_current_ipif == ipif'.
	 */
	ASSERT(ipx->ipx_current_ipif != NULL);

	/*
	 * M_IOCDATA from ioctls, M_ERROR/M_HANGUP/M_PROTO/M_PCPROTO from the
	 * driver.
	 */
	ASSERT((DB_TYPE(add_mp) == M_IOCDATA) || (DB_TYPE(add_mp) == M_ERROR) ||
	    (DB_TYPE(add_mp) == M_HANGUP) || (DB_TYPE(add_mp) == M_PROTO) ||
	    (DB_TYPE(add_mp) == M_PCPROTO));

	if (connp != NULL) {
		ASSERT(MUTEX_HELD(&connp->conn_lock));
		/*
		 * Return error if the conn has started closing. The conn
		 * could have finished cleaning up the pending mp list,
		 * If so we should not add another mp to the list negating
		 * the cleanup.
		 */
		if (connp->conn_state_flags & CONN_CLOSING)
			return (B_FALSE);
	}
	mutex_enter(&ipx->ipx_lock);
	ipx->ipx_pending_ipif = ipif;
	/*
	 * Note down the queue in b_queue. This will be returned by
	 * ipsq_pending_mp_get. Caller will then use these values to restart
	 * the processing
	 */
	add_mp->b_next = NULL;
	add_mp->b_queue = q;
	ipx->ipx_pending_mp = add_mp;
	ipx->ipx_waitfor = waitfor;
	mutex_exit(&ipx->ipx_lock);

	/* Record which ill this conn's pending operation is tied to. */
	if (connp != NULL)
		connp->conn_oper_pending_ill = ipif->ipif_ill;

	return (B_TRUE);
}
808 808
809 809 /*
810 810 * Retrieve the ipx_pending_mp and return it. There can be only 1 mp
811 811 * queued in the list.
812 812 */
mblk_t *
ipsq_pending_mp_get(ipsq_t *ipsq, conn_t **connpp)
{
	mblk_t	*curr = NULL;
	ipxop_t	*ipx = ipsq->ipsq_xop;

	/* Default to "no conn"; overwritten below if the queue is a conn. */
	*connpp = NULL;
	mutex_enter(&ipx->ipx_lock);
	if (ipx->ipx_pending_mp == NULL) {
		mutex_exit(&ipx->ipx_lock);
		return (NULL);
	}

	/* There can be only 1 such excl message */
	curr = ipx->ipx_pending_mp;
	ASSERT(curr->b_next == NULL);
	/* Clear all pending-operation state under ipx_lock as one unit. */
	ipx->ipx_pending_ipif = NULL;
	ipx->ipx_pending_mp = NULL;
	ipx->ipx_waitfor = 0;
	mutex_exit(&ipx->ipx_lock);

	if (CONN_Q(curr->b_queue)) {
		/*
		 * This mp did a refhold on the conn, at the start of the ioctl.
		 * So we can safely return a pointer to the conn to the caller.
		 */
		*connpp = Q_TO_CONN(curr->b_queue);
	} else {
		*connpp = NULL;
	}
	/* Detach the mblk from any list linkage before handing it back. */
	curr->b_next = NULL;
	curr->b_prev = NULL;
	return (curr);
}
847 847
848 848 /*
849 849 * Cleanup the ioctl mp queued in ipx_pending_mp
850 850 * - Called in the ill_delete path
851 851 * - Called in the M_ERROR or M_HANGUP path on the ill.
852 852 * - Called in the conn close path.
853 853 *
854 854 * Returns success on finding the pending mblk associated with the ioctl or
855 855 * exclusive operation in progress, failure otherwise.
856 856 */
boolean_t
ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
{
	mblk_t	*mp;
	ipxop_t	*ipx;
	queue_t	*q;
	ipif_t	*ipif;
	int	cmd;

	ASSERT(IAM_WRITER_ILL(ill));
	ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;

	mutex_enter(&ipx->ipx_lock);
	mp = ipx->ipx_pending_mp;
	if (connp != NULL) {
		if (mp == NULL || mp->b_queue != CONNP_TO_WQ(connp)) {
			/*
			 * Nothing to clean since the conn that is closing
			 * does not have a matching pending mblk in
			 * ipx_pending_mp.
			 */
			mutex_exit(&ipx->ipx_lock);
			return (B_FALSE);
		}
	} else {
		/*
		 * A non-zero ill_error signifies we are called in the
		 * M_ERROR or M_HANGUP path and we need to unconditionally
		 * abort any current ioctl and do the corresponding cleanup.
		 * A zero ill_error means we are in the ill_delete path and
		 * we do the cleanup only if there is a pending mp.
		 */
		if (mp == NULL && ill->ill_error == 0) {
			mutex_exit(&ipx->ipx_lock);
			return (B_FALSE);
		}
	}

	/* Now remove from the ipx_pending_mp */
	ipx->ipx_pending_mp = NULL;
	ipif = ipx->ipx_pending_ipif;
	ipx->ipx_pending_ipif = NULL;
	ipx->ipx_waitfor = 0;
	ipx->ipx_current_ipif = NULL;
	/* Remember the aborted ioctl command for the DTrace probe below. */
	cmd = ipx->ipx_current_ioctl;
	ipx->ipx_current_ioctl = 0;
	ipx->ipx_current_done = B_TRUE;
	mutex_exit(&ipx->ipx_lock);

	/* M_ERROR/M_HANGUP path with no ioctl actually pending. */
	if (mp == NULL)
		return (B_FALSE);

	/* Detach the mblk from queue/list linkage before completing it. */
	q = mp->b_queue;
	mp->b_next = NULL;
	mp->b_prev = NULL;
	mp->b_queue = NULL;

	if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) {
		DTRACE_PROBE4(ipif__ioctl,
		    char *, "ipsq_pending_mp_cleanup",
		    int, cmd, ill_t *, ipif == NULL ? NULL : ipif->ipif_ill,
		    ipif_t *, ipif);
		if (connp == NULL) {
			/* ill delete / M_ERROR path: nothing to copy out. */
			ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL);
		} else {
			ip_ioctl_finish(q, mp, ENXIO, CONN_CLOSE, NULL);
			mutex_enter(&ipif->ipif_ill->ill_lock);
			ipif->ipif_state_flags &= ~IPIF_CHANGING;
			mutex_exit(&ipif->ipif_ill->ill_lock);
		}
	} else {
		inet_freemsg(mp);
	}
	return (B_TRUE);
}
932 932
933 933 /*
934 934 * Called in the conn close path and ill delete path
935 935 */
static void
ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp)
{
	ipsq_t	*ipsq;
	mblk_t	*prev;
	mblk_t	*curr;
	mblk_t	*next;
	queue_t	*wq, *rq = NULL;
	mblk_t	*tmp_list = NULL;

	ASSERT(IAM_WRITER_ILL(ill));
	/* connp != NULL: flush only that conn's mp; else flush the ill's. */
	if (connp != NULL)
		wq = CONNP_TO_WQ(connp);
	else
		wq = ill->ill_wq;

	/*
	 * In the case of lo0 being unplumbed, ill_wq will be NULL. Guard
	 * against this here.
	 */
	if (wq != NULL)
		rq = RD(wq);

	ipsq = ill->ill_phyint->phyint_ipsq;
	/*
	 * Cleanup the ioctl mp's queued in ipsq_xopq_pending_mp if any.
	 * In the case of ioctl from a conn, there can be only 1 mp
	 * queued on the ipsq. If an ill is being unplumbed flush all
	 * the messages.
	 */
	mutex_enter(&ipsq->ipsq_lock);
	for (prev = NULL, curr = ipsq->ipsq_xopq_mphead; curr != NULL;
	    curr = next) {
		next = curr->b_next;
		/* connp == NULL means unplumb: every queued mp matches. */
		if (connp == NULL ||
		    (curr->b_queue == wq || curr->b_queue == rq)) {
			/* Unlink the mblk from the pending mp list */
			if (prev != NULL) {
				prev->b_next = curr->b_next;
			} else {
				ASSERT(ipsq->ipsq_xopq_mphead == curr);
				ipsq->ipsq_xopq_mphead = curr->b_next;
			}
			if (ipsq->ipsq_xopq_mptail == curr)
				ipsq->ipsq_xopq_mptail = prev;
			/*
			 * Create a temporary list and release the ipsq lock
			 * New elements are added to the head of the tmp_list
			 */
			curr->b_next = tmp_list;
			tmp_list = curr;
		} else {
			prev = curr;
		}
	}
	mutex_exit(&ipsq->ipsq_lock);

	/* Complete/free the collected mblks without holding ipsq_lock. */
	while (tmp_list != NULL) {
		curr = tmp_list;
		tmp_list = curr->b_next;
		curr->b_next = NULL;
		curr->b_prev = NULL;
		wq = curr->b_queue;
		curr->b_queue = NULL;
		if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) {
			DTRACE_PROBE4(ipif__ioctl,
			    char *, "ipsq_xopq_mp_cleanup",
			    int, 0, ill_t *, NULL, ipif_t *, NULL);
			ip_ioctl_finish(wq, curr, ENXIO, connp != NULL ?
			    CONN_CLOSE : NO_COPYOUT, NULL);
		} else {
			/*
			 * IP-MT XXX In the case of TLI/XTI bind / optmgmt
			 * this can't be just inet_freemsg. we have to
			 * restart it otherwise the thread will be stuck.
			 */
			inet_freemsg(curr);
		}
	}
}
1016 1016
1017 1017 /*
1018 1018 * This conn has started closing. Cleanup any pending ioctl from this conn.
1019 1019 * STREAMS ensures that there can be at most 1 active ioctl on a stream.
1020 1020 */
void
conn_ioctl_cleanup(conn_t *connp)
{
	ipsq_t	*ipsq;
	ill_t	*ill;
	boolean_t refheld;

	/*
	 * Check for a queued ioctl. If the ioctl has not yet started, the mp
	 * is pending in the list headed by ipsq_xopq_head. If the ioctl has
	 * started the mp could be present in ipx_pending_mp. Note that if
	 * conn_oper_pending_ill is NULL, the ioctl may still be in flight and
	 * not yet queued anywhere. In this case, the conn close code will wait
	 * until the conn_ref is dropped. If the stream was a tcp stream, then
	 * tcp_close will wait first until all ioctls have completed for this
	 * conn.
	 */
	mutex_enter(&connp->conn_lock);
	ill = connp->conn_oper_pending_ill;
	if (ill == NULL) {
		mutex_exit(&connp->conn_lock);
		return;
	}

	/*
	 * We may not be able to refhold the ill if the ill/ipif
	 * is changing. But we need to make sure that the ill will
	 * not vanish. So we just bump up the ill_waiter count.
	 */
	refheld = ill_waiter_inc(ill);
	mutex_exit(&connp->conn_lock);
	if (refheld) {
		/* NEW_OP: this cleanup runs as its own exclusive operation. */
		if (ipsq_enter(ill, B_TRUE, NEW_OP)) {
			ill_waiter_dcr(ill);
			/*
			 * Check whether this ioctl has started and is
			 * pending. If it is not found there then check
			 * whether this ioctl has not even started and is in
			 * the ipsq_xopq list.
			 */
			if (!ipsq_pending_mp_cleanup(ill, connp))
				ipsq_xopq_mp_cleanup(ill, connp);
			ipsq = ill->ill_phyint->phyint_ipsq;
			ipsq_exit(ipsq);
			return;
		}
	}

	/*
	 * The ill is also closing and we could not bump up the
	 * ill_waiter_count or we could not enter the ipsq. Leave
	 * the cleanup to ill_delete
	 */
	mutex_enter(&connp->conn_lock);
	while (connp->conn_oper_pending_ill != NULL)
		cv_wait(&connp->conn_refcv, &connp->conn_lock);
	mutex_exit(&connp->conn_lock);
	if (refheld)
		ill_waiter_dcr(ill);
}
1081 1081
1082 1082 /*
1083 1083 * ipcl_walk function for cleaning up conn_*_ill fields.
1084 1084 * Note that we leave ixa_multicast_ifindex, conn_incoming_ifindex, and
1085 1085 * conn_bound_if in place. We prefer dropping
1086 1086 * packets instead of sending them out the wrong interface, or accepting
1087 1087 * packets from the wrong ifindex.
1088 1088 */
1089 1089 static void
1090 1090 conn_cleanup_ill(conn_t *connp, caddr_t arg)
1091 1091 {
1092 1092 ill_t *ill = (ill_t *)arg;
1093 1093
1094 1094 mutex_enter(&connp->conn_lock);
1095 1095 if (connp->conn_dhcpinit_ill == ill) {
1096 1096 connp->conn_dhcpinit_ill = NULL;
1097 1097 ASSERT(ill->ill_dhcpinit != 0);
1098 1098 atomic_dec_32(&ill->ill_dhcpinit);
1099 1099 ill_set_inputfn(ill);
1100 1100 }
1101 1101 mutex_exit(&connp->conn_lock);
1102 1102 }
1103 1103
/*
 * Run the "down tail" processing for every ipif on the ill; stops and
 * returns the error from the first ipif_down_tail() that fails.
 */
static int
ill_down_ipifs_tail(ill_t *ill)
{
	ipif_t *ipif;
	int err;

	ASSERT(IAM_WRITER_ILL(ill));
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		ipif_non_duplicate(ipif);
		/*
		 * ipif_down_tail will call arp_ll_down on the last ipif
		 * and typically return EINPROGRESS when the DL_UNBIND is sent.
		 */
		if ((err = ipif_down_tail(ipif)) != 0)
			return (err);
	}
	return (0);
}
1122 1122
/*
 * ipsq callback: finish downing all ipifs on the ill (q->q_ptr) once the
 * ill has quiesced, then consume the mblk and complete the current op.
 */
/* ARGSUSED */
void
ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
{
	ASSERT(IAM_WRITER_IPSQ(ipsq));
	(void) ill_down_ipifs_tail(q->q_ptr);
	freemsg(mp);
	ipsq_current_finish(ipsq);
}
1132 1132
1133 1133 /*
1134 1134 * ill_down_start is called when we want to down this ill and bring it up again
1135 1135 * It is called when we receive an M_ERROR / M_HANGUP. In this case we shut down
1136 1136 * all interfaces, but don't tear down any plumbing.
1137 1137 */
boolean_t
ill_down_start(queue_t *q, mblk_t *mp)
{
	ill_t	*ill = q->q_ptr;
	ipif_t	*ipif;

	ASSERT(IAM_WRITER_ILL(ill));
	/*
	 * It is possible that some ioctl is already in progress while we
	 * received the M_ERROR / M_HANGUP in which case, we need to abort
	 * the ioctl. ill_down_start() is being processed as CUR_OP rather
	 * than as NEW_OP since the cause of the M_ERROR / M_HANGUP may prevent
	 * the in progress ioctl from ever completing.
	 *
	 * The thread that started the ioctl (if any) must have returned,
	 * since we are now executing as writer. After the 2 calls below,
	 * the state of the ipsq and the ill would reflect no trace of any
	 * pending operation. Subsequently if there is any response to the
	 * original ioctl from the driver, it would be discarded as an
	 * unsolicited message from the driver.
	 */
	(void) ipsq_pending_mp_cleanup(ill, NULL);
	ill_dlpi_clear_deferred(ill);

	/* Bring down every ipif on this ill. */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		(void) ipif_down(ipif, NULL, NULL);

	ill_down(ill);

	/*
	 * Walk all CONNs that can have a reference on an ire or nce for this
	 * ill (we actually walk all that now have stale references).
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ill->ill_ipst);

	/* With IPv6 we have dce_ifindex. Cleanup for neatness */
	if (ill->ill_isv6)
		dce_cleanup(ill->ill_phyint->phyint_ifindex, ill->ill_ipst);

	ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0);

	/*
	 * Atomically test and add the pending mp if references are active.
	 */
	mutex_enter(&ill->ill_lock);
	if (!ill_is_quiescent(ill)) {
		/* call cannot fail since `conn_t *' argument is NULL */
		(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
		    mp, ILL_DOWN);
		mutex_exit(&ill->ill_lock);
		/* B_FALSE: caller must wait for the deferred completion. */
		return (B_FALSE);
	}
	mutex_exit(&ill->ill_lock);
	return (B_TRUE);
}
1193 1193
/*
 * Tear down forwarding state that hangs off this ill: its IREs, any
 * conn_*_ill back-pointers, and the saved-IRE list.
 */
static void
ill_down(ill_t *ill)
{
	mblk_t	*mp;
	ip_stack_t	*ipst = ill->ill_ipst;

	/*
	 * Blow off any IREs dependent on this ILL.
	 * The caller needs to handle conn_ixa_cleanup
	 */
	ill_delete_ires(ill);

	ire_walk_ill(0, 0, ill_downi, ill, ill);

	/* Remove any conn_*_ill depending on this ill */
	ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst);

	/*
	 * Free state for additional IREs.
	 */
	mutex_enter(&ill->ill_saved_ire_lock);
	mp = ill->ill_saved_ire_mp;
	/* Detach the list under the lock; free it after dropping the lock. */
	ill->ill_saved_ire_mp = NULL;
	ill->ill_saved_ire_cnt = 0;
	mutex_exit(&ill->ill_saved_ire_lock);
	freemsg(mp);
}
1221 1221
1222 1222 /*
1223 1223 * ire_walk routine used to delete every IRE that depends on
1224 1224 * 'ill'. (Always called as writer, and may only be called from ire_walk.)
1225 1225 *
1226 1226 * Note: since the routes added by the kernel are deleted separately,
1227 1227 * this will only be 1) IRE_IF_CLONE and 2) manually added IRE_INTERFACE.
1228 1228 *
1229 1229 * We also remove references on ire_nce_cache entries that refer to the ill.
1230 1230 */
void
ill_downi(ire_t *ire, char *ill_arg)
{
	ill_t	*ill = (ill_t *)ill_arg;
	nce_t	*nce;

	/* Drop the cached nce reference if it points at this ill. */
	mutex_enter(&ire->ire_lock);
	nce = ire->ire_nce_cache;
	if (nce != NULL && nce->nce_ill == ill)
		ire->ire_nce_cache = NULL;
	else
		nce = NULL;
	mutex_exit(&ire->ire_lock);
	/* Release outside ire_lock; nce is non-NULL only when detached. */
	if (nce != NULL)
		nce_refrele(nce);
	if (ire->ire_ill == ill) {
		/*
		 * The existing interface binding for ire must be
		 * deleted before trying to bind the route to another
		 * interface. However, since we are using the contents of the
		 * ire after ire_delete, the caller has to ensure that
		 * CONDEMNED (deleted) ire's are not removed from the list
		 * when ire_delete() returns. Currently ill_downi() is
		 * only called as part of ire_walk*() routines, so that
		 * the irb_refhold() done by ire_walk*() will ensure that
		 * ire_delete() does not lead to ire_inactive().
		 */
		ASSERT(ire->ire_bucket->irb_refcnt > 0);
		ire_delete(ire);
		if (ire->ire_unbound)
			ire_rebind(ire);
	}
}
1264 1264
1265 1265 /* Remove IRE_IF_CLONE on this ill */
1266 1266 void
1267 1267 ill_downi_if_clone(ire_t *ire, char *ill_arg)
1268 1268 {
1269 1269 ill_t *ill = (ill_t *)ill_arg;
1270 1270
1271 1271 ASSERT(ire->ire_type & IRE_IF_CLONE);
1272 1272 if (ire->ire_ill == ill)
1273 1273 ire_delete(ire);
1274 1274 }
1275 1275
1276 1276 /* Consume an M_IOCACK of the fastpath probe. */
void
ill_fastpath_ack(ill_t *ill, mblk_t *mp)
{
	mblk_t	*mp1 = mp;

	/*
	 * If this was the first attempt turn on the fastpath probing.
	 */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS)
		ill->ill_dlpi_fastpath_state = IDS_OK;
	mutex_exit(&ill->ill_lock);

	/* Free the M_IOCACK mblk, hold on to the data */
	mp = mp->b_cont;
	freeb(mp1);
	if (mp == NULL)
		return;
	/* The driver's fastpath header follows the echoed dlur template. */
	if (mp->b_cont != NULL)
		nce_fastpath_update(ill, mp);
	else
		ip0dbg(("ill_fastpath_ack: no b_cont\n"));
	freemsg(mp);
}
1301 1301
1302 1302 /*
1303 1303 * Throw an M_IOCTL message downstream asking "do you know fastpath?"
1304 1304 * The data portion of the request is a dl_unitdata_req_t template for
1305 1305 * what we would send downstream in the absence of a fastpath confirmation.
1306 1306 */
int
ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp)
{
	struct iocblk	*ioc;
	mblk_t	*mp;

	/* No dl_unitdata_req template: nothing to probe with. */
	if (dlur_mp == NULL)
		return (EINVAL);

	mutex_enter(&ill->ill_lock);
	switch (ill->ill_dlpi_fastpath_state) {
	case IDS_FAILED:
		/*
		 * Driver NAKed the first fastpath ioctl - assume it doesn't
		 * support it.
		 */
		mutex_exit(&ill->ill_lock);
		return (ENOTSUP);
	case IDS_UNKNOWN:
		/* This is the first probe */
		ill->ill_dlpi_fastpath_state = IDS_INPROGRESS;
		break;
	default:
		break;
	}
	mutex_exit(&ill->ill_lock);

	if ((mp = mkiocb(DL_IOC_HDR_INFO)) == NULL)
		return (EAGAIN);

	/* Attach a copy of the dlur template as the ioctl payload. */
	mp->b_cont = copyb(dlur_mp);
	if (mp->b_cont == NULL) {
		freeb(mp);
		return (EAGAIN);
	}

	ioc = (struct iocblk *)mp->b_rptr;
	ioc->ioc_count = msgdsize(mp->b_cont);

	DTRACE_PROBE3(ill__dlpi, char *, "ill_fastpath_probe",
	    char *, "DL_IOC_HDR_INFO", ill_t *, ill);
	putnext(ill->ill_wq, mp);
	return (0);
}
1351 1351
/*
 * Kick off DLPI capability negotiation by sending a DL_CAPABILITY_REQ
 * probe downstream. Only acts when negotiation has not started or a
 * previous attempt failed.
 */
void
ill_capability_probe(ill_t *ill)
{
	mblk_t	*mp;

	ASSERT(IAM_WRITER_ILL(ill));

	if (ill->ill_dlpi_capab_state != IDCS_UNKNOWN &&
	    ill->ill_dlpi_capab_state != IDCS_FAILED)
		return;

	/*
	 * We are starting a new cycle of capability negotiation.
	 * Free up the capab reset messages of any previous incarnation.
	 * We will do a fresh allocation when we get the response to our probe
	 */
	if (ill->ill_capab_reset_mp != NULL) {
		freemsg(ill->ill_capab_reset_mp);
		ill->ill_capab_reset_mp = NULL;
	}

	ip1dbg(("ill_capability_probe: starting capability negotiation\n"));

	mp = ip_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ);
	if (mp == NULL)
		return;

	ill_capability_send(ill, mp);
	ill->ill_dlpi_capab_state = IDCS_PROBE_SENT;
}
1382 1382
1383 1383 static boolean_t
1384 1384 ill_capability_wait(ill_t *ill)
1385 1385 {
1386 1386 /*
|
↓ open down ↓ |
1386 lines elided |
↑ open up ↑ |
1387 1387 * I'm in this ill's squeue, aka a writer. The ILL_CONDEMNED flag can
1388 1388 * only be set by someone who is the writer. Since we
1389 1389 * drop-and-reacquire the squeue in this loop, we need to check for
1390 1390 * ILL_CONDEMNED, which if set means nothing can signal our capability
1391 1391 * condition variable.
1392 1392 */
1393 1393 ASSERT(IAM_WRITER_ILL(ill));
1394 1394
1395 1395 while (ill->ill_capab_pending_cnt != 0 &&
1396 1396 (ill->ill_state_flags & ILL_CONDEMNED) == 0) {
1397 - mutex_enter(&ill->ill_dlpi_capab_lock);
1397 + /* This may enable blocked callers of ill_capability_done(). */
1398 1398 ipsq_exit(ill->ill_phyint->phyint_ipsq);
1399 - cv_wait(&ill->ill_dlpi_capab_cv, &ill->ill_dlpi_capab_lock);
1400 - mutex_exit(&ill->ill_dlpi_capab_lock);
1399 + /* Pause a bit (1msec) before we re-enter the squeue. */
1400 + delay(drv_usectohz(1000000));
1401 +
1401 1402 /*
1402 1403 * If ipsq_enter() fails, someone set ILL_CONDEMNED
1403 1404 * while we dropped the squeue. Indicate such to the caller.
1404 1405 */
1405 1406 if (!ipsq_enter(ill, B_FALSE, CUR_OP))
1406 1407 return (B_FALSE);
1407 1408 }
1408 1409
1409 1410 return ((ill->ill_state_flags & ILL_CONDEMNED) == 0);
1410 1411 }
1411 1412
/*
 * Send the pre-allocated capability reset message downstream, turning off
 * negotiated capabilities. reneg selects whether we expect to renegotiate
 * afterwards (IDCS_RENEG) or simply reset (IDCS_RESET_SENT).
 */
void
ill_capability_reset(ill_t *ill, boolean_t reneg)
{
	ASSERT(IAM_WRITER_ILL(ill));

	/* Nothing to reset unless negotiation previously completed. */
	if (ill->ill_dlpi_capab_state != IDCS_OK)
		return;

	ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT;

	ASSERT(ill->ill_capab_reset_mp != NULL);

	/* ill_capability_send consumes the mblk; drop our reference. */
	ill_capability_send(ill, ill->ill_capab_reset_mp);
	ill->ill_capab_reset_mp = NULL;
	/*
	 * We turn off all capabilities except those pertaining to
	 * direct function call capabilities viz. ILL_CAPAB_DLD*
	 * which will be turned off by the corresponding reset functions.
	 */
	ill->ill_capabilities &= ~(ILL_CAPAB_HCKSUM | ILL_CAPAB_ZEROCOPY);
}
1433 1434
/*
 * Pre-allocate and fill the DL_CAPABILITY_REQ mblk used later by
 * ill_capability_reset() to turn off the currently-enabled capabilities.
 * The message is sized from what is enabled right now and stashed in
 * ill_capab_reset_mp.
 */
static void
ill_capability_reset_alloc(ill_t *ill)
{
	mblk_t *mp;
	size_t	size = 0;
	int	err;
	dl_capability_req_t	*capb;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(ill->ill_capab_reset_mp == NULL);

	if (ILL_HCKSUM_CAPABLE(ill)) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_hcksum_t);
	}

	if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_zerocopy_t);
	}

	if (ill->ill_capabilities & ILL_CAPAB_DLD) {
		size += sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_dld_t);
	}

	/*
	 * NOTE(review): allocb_wait() with STR_NOSIG appears to block until
	 * the allocation succeeds, so mp is presumably never NULL here and
	 * err goes unexamined — confirm against strsubr semantics.
	 */
	mp = allocb_wait(size + sizeof (dl_capability_req_t), BPRI_MED,
	    STR_NOSIG, &err);

	mp->b_datap->db_type = M_PROTO;
	bzero(mp->b_rptr, size + sizeof (dl_capability_req_t));

	capb = (dl_capability_req_t *)mp->b_rptr;
	capb->dl_primitive = DL_CAPABILITY_REQ;
	capb->dl_sub_offset = sizeof (dl_capability_req_t);
	capb->dl_sub_length = size;

	mp->b_wptr += sizeof (dl_capability_req_t);

	/*
	 * Each handler fills in the corresponding dl_capability_sub_t
	 * inside the mblk,
	 */
	ill_capability_hcksum_reset_fill(ill, mp);
	ill_capability_zerocopy_reset_fill(ill, mp);
	ill_capability_dld_reset_fill(ill, mp);

	ill->ill_capab_reset_mp = mp;
}
1483 1484
1484 1485 static void
1485 1486 ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers)
1486 1487 {
1487 1488 dl_capab_id_t *id_ic;
1488 1489 uint_t sub_dl_cap = outers->dl_cap;
1489 1490 dl_capability_sub_t *inners;
1490 1491 uint8_t *capend;
1491 1492
1492 1493 ASSERT(sub_dl_cap == DL_CAPAB_ID_WRAPPER);
1493 1494
1494 1495 /*
1495 1496 * Note: range checks here are not absolutely sufficient to
1496 1497 * make us robust against malformed messages sent by drivers;
1497 1498 * this is in keeping with the rest of IP's dlpi handling.
1498 1499 * (Remember, it's coming from something else in the kernel
1499 1500 * address space)
1500 1501 */
1501 1502
1502 1503 capend = (uint8_t *)(outers + 1) + outers->dl_length;
1503 1504 if (capend > mp->b_wptr) {
1504 1505 cmn_err(CE_WARN, "ill_capability_id_ack: "
1505 1506 "malformed sub-capability too long for mblk");
1506 1507 return;
1507 1508 }
1508 1509
1509 1510 id_ic = (dl_capab_id_t *)(outers + 1);
1510 1511
1511 1512 if (outers->dl_length < sizeof (*id_ic) ||
1512 1513 (inners = &id_ic->id_subcap,
1513 1514 inners->dl_length > (outers->dl_length - sizeof (*inners)))) {
1514 1515 cmn_err(CE_WARN, "ill_capability_id_ack: malformed "
1515 1516 "encapsulated capab type %d too long for mblk",
1516 1517 inners->dl_cap);
1517 1518 return;
1518 1519 }
1519 1520
1520 1521 if (!dlcapabcheckqid(&id_ic->id_mid, ill->ill_lmod_rq)) {
1521 1522 ip1dbg(("ill_capability_id_ack: mid token for capab type %d "
1522 1523 "isn't as expected; pass-thru module(s) detected, "
1523 1524 "discarding capability\n", inners->dl_cap));
1524 1525 return;
1525 1526 }
1526 1527
1527 1528 /* Process the encapsulated sub-capability */
1528 1529 ill_capability_dispatch(ill, mp, inners);
1529 1530 }
1530 1531
1531 1532 static void
1532 1533 ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp)
1533 1534 {
1534 1535 dl_capability_sub_t *dl_subcap;
1535 1536
1536 1537 if (!(ill->ill_capabilities & ILL_CAPAB_DLD))
1537 1538 return;
1538 1539
1539 1540 /*
1540 1541 * The dl_capab_dld_t that follows the dl_capability_sub_t is not
1541 1542 * initialized below since it is not used by DLD.
1542 1543 */
1543 1544 dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
1544 1545 dl_subcap->dl_cap = DL_CAPAB_DLD;
1545 1546 dl_subcap->dl_length = sizeof (dl_capab_dld_t);
1546 1547
1547 1548 mp->b_wptr += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t);
1548 1549 }
1549 1550
/*
 * Route one sub-capability from a DL_CAPABILITY_ACK to its handler.
 */
static void
ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp)
{
	/*
	 * If no ipif was brought up over this ill, this DL_CAPABILITY_REQ/ACK
	 * is only to get the VRRP capability.
	 *
	 * Note that we cannot check ill_ipif_up_count here since
	 * ill_ipif_up_count is only incremented when the resolver is setup.
	 * That is done asynchronously, and can race with this function.
	 */
	if (!ill->ill_dl_up) {
		if (subp->dl_cap == DL_CAPAB_VRRP)
			ill_capability_vrrp_ack(ill, mp, subp);
		return;
	}

	switch (subp->dl_cap) {
	case DL_CAPAB_HCKSUM:
		ill_capability_hcksum_ack(ill, mp, subp);
		break;
	case DL_CAPAB_ZEROCOPY:
		ill_capability_zerocopy_ack(ill, mp, subp);
		break;
	case DL_CAPAB_DLD:
		ill_capability_dld_ack(ill, mp, subp);
		break;
	case DL_CAPAB_VRRP:
		/* Already handled above for the !ill_dl_up case; no-op here. */
		break;
	default:
		ip1dbg(("ill_capability_dispatch: unknown capab type %d\n",
		    subp->dl_cap));
	}
}
1584 1585
1585 1586 /*
1586 1587 * Process the vrrp capability received from a DLS Provider. isub must point
1587 1588 * to the sub-capability (DL_CAPAB_VRRP) of a DL_CAPABILITY_ACK message.
1588 1589 */
static void
ill_capability_vrrp_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	dl_capab_vrrp_t	*vrrp;
	uint_t		sub_dl_cap = isub->dl_cap;
	uint8_t		*capend;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(sub_dl_cap == DL_CAPAB_VRRP);

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(isub + 1) + isub->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_vrrp_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}
	vrrp = (dl_capab_vrrp_t *)(isub + 1);

	/*
	 * Compare the IP address family and set ILLF_VRRP for the right ill.
	 */
	if ((vrrp->vrrp_af == AF_INET6 && ill->ill_isv6) ||
	    (vrrp->vrrp_af == AF_INET && !ill->ill_isv6)) {
		ill->ill_flags |= ILLF_VRRP;
	}
}
1622 1623
1623 1624 /*
1624 1625 * Process a hardware checksum offload capability negotiation ack received
1625 1626 * from a DLS Provider.isub must point to the sub-capability (DL_CAPAB_HCKSUM)
1626 1627 * of a DL_CAPABILITY_ACK message.
1627 1628 */
static void
ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	dl_capability_req_t	*ocap;
	dl_capab_hcksum_t	*ihck, *ohck;
	ill_hcksum_capab_t	**ill_hcksum;
	mblk_t			*nmp = NULL;
	uint_t			sub_dl_cap = isub->dl_cap;
	uint8_t			*capend;

	ASSERT(sub_dl_cap == DL_CAPAB_HCKSUM);

	ill_hcksum = (ill_hcksum_capab_t **)&ill->ill_hcksum_capab;

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(isub + 1) + isub->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}

	/*
	 * There are two types of acks we process here:
	 * 1. acks in reply to a (first form) generic capability req
	 *    (no ENABLE flag set)
	 * 2. acks in reply to a ENABLE capability req.
	 *    (ENABLE flag set)
	 */
	ihck = (dl_capab_hcksum_t *)(isub + 1);

	/* Only version 1 of the hcksum sub-capability is understood. */
	if (ihck->hcksum_version != HCKSUM_VERSION_1) {
		cmn_err(CE_CONT, "ill_capability_hcksum_ack: "
		    "unsupported hardware checksum "
		    "sub-capability (version %d, expected %d)",
		    ihck->hcksum_version, HCKSUM_VERSION_1);
		return;
	}

	if (!dlcapabcheckqid(&ihck->hcksum_mid, ill->ill_lmod_rq)) {
		ip1dbg(("ill_capability_hcksum_ack: mid token for hardware "
		    "checksum capability isn't as expected; pass-thru "
		    "module(s) detected, discarding capability\n"));
		return;
	}

#define	CURR_HCKSUM_CAPAB				\
	(HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 |	\
	HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM)

	if ((ihck->hcksum_txflags & HCKSUM_ENABLE) &&
	    (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) {
		/* do ENABLE processing */
		if (*ill_hcksum == NULL) {
			*ill_hcksum = kmem_zalloc(sizeof (ill_hcksum_capab_t),
			    KM_NOSLEEP);

			if (*ill_hcksum == NULL) {
				cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
				    "could not enable hcksum version %d "
				    "for %s (ENOMEM)\n", HCKSUM_CURRENT_VERSION,
				    ill->ill_name);
				return;
			}
		}

		/* Record the negotiated flags and mark the ill capable. */
		(*ill_hcksum)->ill_hcksum_version = ihck->hcksum_version;
		(*ill_hcksum)->ill_hcksum_txflags = ihck->hcksum_txflags;
		ill->ill_capabilities |= ILL_CAPAB_HCKSUM;
		ip1dbg(("ill_capability_hcksum_ack: interface %s "
		    "has enabled hardware checksumming\n ",
		    ill->ill_name));
	} else if (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB) {
		/*
		 * Enabling hardware checksum offload
		 * Currently IP supports {TCP,UDP}/IPv4
		 * partial and full cksum offload and
		 * IPv4 header checksum offload.
		 * Allocate new mblk which will
		 * contain a new capability request
		 * to enable hardware checksum offload.
		 */
		uint_t	size;
		uchar_t	*rptr;

		size = sizeof (dl_capability_req_t) +
		    sizeof (dl_capability_sub_t) + isub->dl_length;

		if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
			cmn_err(CE_WARN, "ill_capability_hcksum_ack: "
			    "could not enable hardware cksum for %s (ENOMEM)\n",
			    ill->ill_name);
			return;
		}

		rptr = nmp->b_rptr;
		/* initialize dl_capability_req_t */
		ocap = (dl_capability_req_t *)nmp->b_rptr;
		ocap->dl_sub_offset =
		    sizeof (dl_capability_req_t);
		ocap->dl_sub_length =
		    sizeof (dl_capability_sub_t) +
		    isub->dl_length;
		nmp->b_rptr += sizeof (dl_capability_req_t);

		/* initialize dl_capability_sub_t */
		bcopy(isub, nmp->b_rptr, sizeof (*isub));
		nmp->b_rptr += sizeof (*isub);

		/* initialize dl_capab_hcksum_t */
		ohck = (dl_capab_hcksum_t *)nmp->b_rptr;
		bcopy(ihck, ohck, sizeof (*ihck));

		/* Rewind b_rptr to the start of the assembled request. */
		nmp->b_rptr = rptr;
		ASSERT(nmp->b_wptr == (nmp->b_rptr + size));

		/* Set ENABLE flag */
		ohck->hcksum_txflags &= CURR_HCKSUM_CAPAB;
		ohck->hcksum_txflags |= HCKSUM_ENABLE;

		/*
		 * nmp points to a DL_CAPABILITY_REQ message to enable
		 * hardware checksum acceleration.
		 */
		ill_capability_send(ill, nmp);
	} else {
		ip1dbg(("ill_capability_hcksum_ack: interface %s has "
		    "advertised %x hardware checksum capability flags\n",
		    ill->ill_name, ihck->hcksum_txflags));
	}
}
1765 1766
1766 1767 static void
1767 1768 ill_capability_hcksum_reset_fill(ill_t *ill, mblk_t *mp)
1768 1769 {
1769 1770 dl_capab_hcksum_t *hck_subcap;
1770 1771 dl_capability_sub_t *dl_subcap;
1771 1772
1772 1773 if (!ILL_HCKSUM_CAPABLE(ill))
1773 1774 return;
1774 1775
1775 1776 ASSERT(ill->ill_hcksum_capab != NULL);
1776 1777
1777 1778 dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
1778 1779 dl_subcap->dl_cap = DL_CAPAB_HCKSUM;
1779 1780 dl_subcap->dl_length = sizeof (*hck_subcap);
1780 1781
1781 1782 hck_subcap = (dl_capab_hcksum_t *)(dl_subcap + 1);
1782 1783 hck_subcap->hcksum_version = ill->ill_hcksum_capab->ill_hcksum_version;
1783 1784 hck_subcap->hcksum_txflags = 0;
1784 1785
1785 1786 mp->b_wptr += sizeof (*dl_subcap) + sizeof (*hck_subcap);
1786 1787 }
1787 1788
/*
 * Process a DL_CAPAB_ZEROCOPY sub-capability found in a DL_CAPABILITY_ACK.
 * If the driver's ack already carries DL_CAPAB_VMSAFE_MEM, record the
 * capability on the ill; otherwise send a DL_CAPABILITY_REQ back down
 * asking the driver to enable zero-copy with VMSAFE_MEM set.
 */
static void
ill_capability_zerocopy_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
	mblk_t *nmp = NULL;
	dl_capability_req_t *oc;
	dl_capab_zerocopy_t *zc_ic, *zc_oc;
	ill_zerocopy_capab_t **ill_zerocopy_capab;
	uint_t sub_dl_cap = isub->dl_cap;
	uint8_t *capend;

	ASSERT(sub_dl_cap == DL_CAPAB_ZEROCOPY);

	ill_zerocopy_capab = (ill_zerocopy_capab_t **)&ill->ill_zerocopy_capab;

	/*
	 * Note: range checks here are not absolutely sufficient to
	 * make us robust against malformed messages sent by drivers;
	 * this is in keeping with the rest of IP's dlpi handling.
	 * (Remember, it's coming from something else in the kernel
	 * address space)
	 */
	capend = (uint8_t *)(isub + 1) + isub->dl_length;
	if (capend > mp->b_wptr) {
		cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
		    "malformed sub-capability too long for mblk");
		return;
	}

	/* Only ZEROCOPY_VERSION_1 is understood. */
	zc_ic = (dl_capab_zerocopy_t *)(isub + 1);
	if (zc_ic->zerocopy_version != ZEROCOPY_VERSION_1) {
		cmn_err(CE_CONT, "ill_capability_zerocopy_ack: "
		    "unsupported ZEROCOPY sub-capability (version %d, "
		    "expected %d)", zc_ic->zerocopy_version,
		    ZEROCOPY_VERSION_1);
		return;
	}

	/*
	 * If a pass-thru module is interposed below us the mid token
	 * won't match; in that case the capability cannot be trusted.
	 */
	if (!dlcapabcheckqid(&zc_ic->zerocopy_mid, ill->ill_lmod_rq)) {
		ip1dbg(("ill_capability_zerocopy_ack: mid token for zerocopy "
		    "capability isn't as expected; pass-thru module(s) "
		    "detected, discarding capability\n"));
		return;
	}

	if ((zc_ic->zerocopy_flags & DL_CAPAB_VMSAFE_MEM) != 0) {
		/* Driver has enabled it: record the capability locally. */
		if (*ill_zerocopy_capab == NULL) {
			*ill_zerocopy_capab =
			    kmem_zalloc(sizeof (ill_zerocopy_capab_t),
			    KM_NOSLEEP);

			if (*ill_zerocopy_capab == NULL) {
				cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
				    "could not enable Zero-copy version %d "
				    "for %s (ENOMEM)\n", ZEROCOPY_VERSION_1,
				    ill->ill_name);
				return;
			}
		}

		ip1dbg(("ill_capability_zerocopy_ack: interface %s "
		    "supports Zero-copy version %d\n", ill->ill_name,
		    ZEROCOPY_VERSION_1));

		(*ill_zerocopy_capab)->ill_zerocopy_version =
		    zc_ic->zerocopy_version;
		(*ill_zerocopy_capab)->ill_zerocopy_flags =
		    zc_ic->zerocopy_flags;

		ill->ill_capabilities |= ILL_CAPAB_ZEROCOPY;
	} else {
		/*
		 * Not yet enabled; build a DL_CAPABILITY_REQ echoing the
		 * driver's sub-capability with DL_CAPAB_VMSAFE_MEM set.
		 */
		uint_t size;
		uchar_t *rptr;

		size = sizeof (dl_capability_req_t) +
		    sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_zerocopy_t);

		if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
			cmn_err(CE_WARN, "ill_capability_zerocopy_ack: "
			    "could not enable zerocopy for %s (ENOMEM)\n",
			    ill->ill_name);
			return;
		}

		rptr = nmp->b_rptr;
		/* initialize dl_capability_req_t */
		oc = (dl_capability_req_t *)rptr;
		oc->dl_sub_offset = sizeof (dl_capability_req_t);
		oc->dl_sub_length = sizeof (dl_capability_sub_t) +
		    sizeof (dl_capab_zerocopy_t);
		rptr += sizeof (dl_capability_req_t);

		/* initialize dl_capability_sub_t */
		bcopy(isub, rptr, sizeof (*isub));
		rptr += sizeof (*isub);

		/* initialize dl_capab_zerocopy_t */
		zc_oc = (dl_capab_zerocopy_t *)rptr;
		*zc_oc = *zc_ic;

		ip1dbg(("ill_capability_zerocopy_ack: asking interface %s "
		    "to enable zero-copy version %d\n", ill->ill_name,
		    ZEROCOPY_VERSION_1));

		/* set VMSAFE_MEM flag */
		zc_oc->zerocopy_flags |= DL_CAPAB_VMSAFE_MEM;

		/* nmp points to a DL_CAPABILITY_REQ message to enable zcopy */
		ill_capability_send(ill, nmp);
	}
}
1899 1900
1900 1901 static void
1901 1902 ill_capability_zerocopy_reset_fill(ill_t *ill, mblk_t *mp)
1902 1903 {
1903 1904 dl_capab_zerocopy_t *zerocopy_subcap;
1904 1905 dl_capability_sub_t *dl_subcap;
1905 1906
1906 1907 if (!(ill->ill_capabilities & ILL_CAPAB_ZEROCOPY))
1907 1908 return;
1908 1909
1909 1910 ASSERT(ill->ill_zerocopy_capab != NULL);
1910 1911
1911 1912 dl_subcap = (dl_capability_sub_t *)mp->b_wptr;
1912 1913 dl_subcap->dl_cap = DL_CAPAB_ZEROCOPY;
1913 1914 dl_subcap->dl_length = sizeof (*zerocopy_subcap);
1914 1915
1915 1916 zerocopy_subcap = (dl_capab_zerocopy_t *)(dl_subcap + 1);
1916 1917 zerocopy_subcap->zerocopy_version =
1917 1918 ill->ill_zerocopy_capab->ill_zerocopy_version;
1918 1919 zerocopy_subcap->zerocopy_flags = 0;
1919 1920
1920 1921 mp->b_wptr += sizeof (*dl_subcap) + sizeof (*zerocopy_subcap);
1921 1922 }
1922 1923
1923 1924 /*
1924 1925 * DLD capability
1925 1926 * Refer to dld.h for more information regarding the purpose and usage
1926 1927 * of this capability.
1927 1928 */
1928 1929 static void
1929 1930 ill_capability_dld_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
1930 1931 {
1931 1932 dl_capab_dld_t *dld_ic, dld;
1932 1933 uint_t sub_dl_cap = isub->dl_cap;
1933 1934 uint8_t *capend;
1934 1935 ill_dld_capab_t *idc;
1935 1936
1936 1937 ASSERT(IAM_WRITER_ILL(ill));
1937 1938 ASSERT(sub_dl_cap == DL_CAPAB_DLD);
1938 1939
1939 1940 /*
1940 1941 * Note: range checks here are not absolutely sufficient to
1941 1942 * make us robust against malformed messages sent by drivers;
1942 1943 * this is in keeping with the rest of IP's dlpi handling.
1943 1944 * (Remember, it's coming from something else in the kernel
1944 1945 * address space)
1945 1946 */
1946 1947 capend = (uint8_t *)(isub + 1) + isub->dl_length;
1947 1948 if (capend > mp->b_wptr) {
1948 1949 cmn_err(CE_WARN, "ill_capability_dld_ack: "
1949 1950 "malformed sub-capability too long for mblk");
1950 1951 return;
1951 1952 }
1952 1953 dld_ic = (dl_capab_dld_t *)(isub + 1);
1953 1954 if (dld_ic->dld_version != DLD_CURRENT_VERSION) {
1954 1955 cmn_err(CE_CONT, "ill_capability_dld_ack: "
1955 1956 "unsupported DLD sub-capability (version %d, "
1956 1957 "expected %d)", dld_ic->dld_version,
1957 1958 DLD_CURRENT_VERSION);
1958 1959 return;
1959 1960 }
1960 1961 if (!dlcapabcheckqid(&dld_ic->dld_mid, ill->ill_lmod_rq)) {
1961 1962 ip1dbg(("ill_capability_dld_ack: mid token for dld "
1962 1963 "capability isn't as expected; pass-thru module(s) "
1963 1964 "detected, discarding capability\n"));
1964 1965 return;
1965 1966 }
1966 1967
1967 1968 /*
1968 1969 * Copy locally to ensure alignment.
1969 1970 */
1970 1971 bcopy(dld_ic, &dld, sizeof (dl_capab_dld_t));
1971 1972
1972 1973 if ((idc = ill->ill_dld_capab) == NULL) {
1973 1974 idc = kmem_zalloc(sizeof (ill_dld_capab_t), KM_NOSLEEP);
1974 1975 if (idc == NULL) {
1975 1976 cmn_err(CE_WARN, "ill_capability_dld_ack: "
1976 1977 "could not enable DLD version %d "
1977 1978 "for %s (ENOMEM)\n", DLD_CURRENT_VERSION,
1978 1979 ill->ill_name);
1979 1980 return;
1980 1981 }
1981 1982 ill->ill_dld_capab = idc;
1982 1983 }
1983 1984 idc->idc_capab_df = (ip_capab_func_t)dld.dld_capab;
1984 1985 idc->idc_capab_dh = (void *)dld.dld_capab_handle;
1985 1986 ip1dbg(("ill_capability_dld_ack: interface %s "
1986 1987 "supports DLD version %d\n", ill->ill_name, DLD_CURRENT_VERSION));
1987 1988
1988 1989 ill_capability_dld_enable(ill);
1989 1990 }
1990 1991
1991 1992 /*
1992 1993 * Typically capability negotiation between IP and the driver happens via
1993 1994 * DLPI message exchange. However GLD also offers a direct function call
1994 1995 * mechanism to exchange the DLD_DIRECT_CAPAB and DLD_POLL_CAPAB capabilities,
1995 1996 * But arbitrary function calls into IP or GLD are not permitted, since both
1996 1997 * of them are protected by their own perimeter mechanism. The perimeter can
1997 1998 * be viewed as a coarse lock or serialization mechanism. The hierarchy of
1998 1999 * these perimeters is IP -> MAC. Thus for example to enable the squeue
1999 2000 * polling, IP needs to enter its perimeter, then call ill_mac_perim_enter
2000 2001 * to enter the mac perimeter and then do the direct function calls into
2001 2002 * GLD to enable squeue polling. The ring related callbacks from the mac into
2002 2003 * the stack to add, bind, quiesce, restart or cleanup a ring are all
2003 2004 * protected by the mac perimeter.
2004 2005 */
2005 2006 static void
2006 2007 ill_mac_perim_enter(ill_t *ill, mac_perim_handle_t *mphp)
2007 2008 {
2008 2009 ill_dld_capab_t *idc = ill->ill_dld_capab;
2009 2010 int err;
2010 2011
2011 2012 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mphp,
2012 2013 DLD_ENABLE);
2013 2014 ASSERT(err == 0);
2014 2015 }
2015 2016
2016 2017 static void
2017 2018 ill_mac_perim_exit(ill_t *ill, mac_perim_handle_t mph)
2018 2019 {
2019 2020 ill_dld_capab_t *idc = ill->ill_dld_capab;
2020 2021 int err;
2021 2022
2022 2023 err = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, mph,
2023 2024 DLD_DISABLE);
2024 2025 ASSERT(err == 0);
2025 2026 }
2026 2027
2027 2028 boolean_t
2028 2029 ill_mac_perim_held(ill_t *ill)
2029 2030 {
2030 2031 ill_dld_capab_t *idc = ill->ill_dld_capab;
2031 2032
2032 2033 return (idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_PERIM, NULL,
2033 2034 DLD_QUERY));
2034 2035 }
2035 2036
/*
 * Enable the DLD_CAPAB_DIRECT capability: direct function-call transmit
 * and receive between IP and GLD, bypassing STREAMS.  On success the
 * tx/tx-callback/flow-control entry points are recorded in idc_direct
 * and the flow enable callback is registered (once).  IPv4 only; must
 * run as writer.
 */
static void
ill_capability_direct_enable(ill_t *ill)
{
	ill_dld_capab_t *idc = ill->ill_dld_capab;
	ill_dld_direct_t *idd = &idc->idc_direct;
	dld_capab_direct_t direct;
	int rc;

	ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));

	/* Hand our receive entry point (ip_input) and handle to dld. */
	bzero(&direct, sizeof (direct));
	direct.di_rx_cf = (uintptr_t)ip_input;
	direct.di_rx_ch = ill;

	rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT, &direct,
	    DLD_ENABLE);
	if (rc == 0) {
		/* Record the transmit-side entry points dld returned. */
		idd->idd_tx_df = (ip_dld_tx_t)direct.di_tx_df;
		idd->idd_tx_dh = direct.di_tx_dh;
		idd->idd_tx_cb_df = (ip_dld_callb_t)direct.di_tx_cb_df;
		idd->idd_tx_cb_dh = direct.di_tx_cb_dh;
		idd->idd_tx_fctl_df = (ip_dld_fctl_t)direct.di_tx_fctl_df;
		idd->idd_tx_fctl_dh = direct.di_tx_fctl_dh;
		ASSERT(idd->idd_tx_cb_df != NULL);
		ASSERT(idd->idd_tx_fctl_df != NULL);
		ASSERT(idd->idd_tx_df != NULL);
		/*
		 * One time registration of flow enable callback function
		 */
		ill->ill_flownotify_mh = idd->idd_tx_cb_df(idd->idd_tx_cb_dh,
		    ill_flow_enable, ill);
		ill->ill_capabilities |= ILL_CAPAB_DLD_DIRECT;
		DTRACE_PROBE1(direct_on, (ill_t *), ill);
	} else {
		cmn_err(CE_WARN, "warning: could not enable DIRECT "
		    "capability, rc = %d\n", rc);
		DTRACE_PROBE2(direct_off, (ill_t *), ill, (int), rc);
	}
}
2075 2076
/*
 * Enable the DLD_CAPAB_POLL capability: register IP's squeue ring
 * callbacks (add/remove/quiesce/restart/bind) with dld so the mac layer
 * can drive squeue polling.  IPv4 only; must run as writer.
 */
static void
ill_capability_poll_enable(ill_t *ill)
{
	ill_dld_capab_t *idc = ill->ill_dld_capab;
	dld_capab_poll_t poll;
	int rc;

	ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));

	/* Hand dld the squeue ring management entry points. */
	bzero(&poll, sizeof (poll));
	poll.poll_ring_add_cf = (uintptr_t)ip_squeue_add_ring;
	poll.poll_ring_remove_cf = (uintptr_t)ip_squeue_clean_ring;
	poll.poll_ring_quiesce_cf = (uintptr_t)ip_squeue_quiesce_ring;
	poll.poll_ring_restart_cf = (uintptr_t)ip_squeue_restart_ring;
	poll.poll_ring_bind_cf = (uintptr_t)ip_squeue_bind_ring;
	poll.poll_ring_ch = ill;
	rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL, &poll,
	    DLD_ENABLE);
	if (rc == 0) {
		ill->ill_capabilities |= ILL_CAPAB_DLD_POLL;
		DTRACE_PROBE1(poll_on, (ill_t *), ill);
	} else {
		ip1dbg(("warning: could not enable POLL "
		    "capability, rc = %d\n", rc));
		DTRACE_PROBE2(poll_off, (ill_t *), ill, (int), rc);
	}
}
2103 2104
2104 2105 /*
2105 2106 * Enable the LSO capability.
2106 2107 */
2107 2108 static void
2108 2109 ill_capability_lso_enable(ill_t *ill)
2109 2110 {
2110 2111 ill_dld_capab_t *idc = ill->ill_dld_capab;
2111 2112 dld_capab_lso_t lso;
2112 2113 int rc;
2113 2114
2114 2115 ASSERT(!ill->ill_isv6 && IAM_WRITER_ILL(ill));
2115 2116
2116 2117 if (ill->ill_lso_capab == NULL) {
2117 2118 ill->ill_lso_capab = kmem_zalloc(sizeof (ill_lso_capab_t),
2118 2119 KM_NOSLEEP);
2119 2120 if (ill->ill_lso_capab == NULL) {
2120 2121 cmn_err(CE_WARN, "ill_capability_lso_enable: "
2121 2122 "could not enable LSO for %s (ENOMEM)\n",
2122 2123 ill->ill_name);
2123 2124 return;
2124 2125 }
2125 2126 }
2126 2127
2127 2128 bzero(&lso, sizeof (lso));
2128 2129 if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, &lso,
2129 2130 DLD_ENABLE)) == 0) {
2130 2131 ill->ill_lso_capab->ill_lso_flags = lso.lso_flags;
2131 2132 ill->ill_lso_capab->ill_lso_max = lso.lso_max;
2132 2133 ill->ill_capabilities |= ILL_CAPAB_LSO;
2133 2134 ip1dbg(("ill_capability_lso_enable: interface %s "
2134 2135 "has enabled LSO\n ", ill->ill_name));
2135 2136 } else {
2136 2137 kmem_free(ill->ill_lso_capab, sizeof (ill_lso_capab_t));
2137 2138 ill->ill_lso_capab = NULL;
2138 2139 DTRACE_PROBE2(lso_off, (ill_t *), ill, (int), rc);
2139 2140 }
2140 2141 }
2141 2142
2142 2143 /*
2143 2144 * Check whether or not mac will prevent us from sending with a given IP
2144 2145 * address. This requires having the IPCHECK capability, which we should
2145 2146 * always be able to successfully negotiate, but if it's somehow missing
2146 2147 * then we just permit the caller to use the address, since mac does the
2147 2148 * actual enforcement and ip is just performing a courtesy check to help
2148 2149 * prevent users from unwittingly setting and attempting to use blocked
2149 2150 * addresses.
2150 2151 */
2151 2152 static boolean_t
2152 2153 ill_ipcheck_addr(ill_t *ill, in6_addr_t *v6addr)
2153 2154 {
2154 2155 if ((ill->ill_capabilities & ILL_CAPAB_DLD_IPCHECK) == 0)
2155 2156 return (B_TRUE);
2156 2157
2157 2158 ill_dld_ipcheck_t *idi = &ill->ill_dld_capab->idc_ipcheck;
2158 2159 ip_mac_ipcheck_t ipcheck = idi->idi_allowed_df;
2159 2160 return (ipcheck(idi->idi_allowed_dh, ill->ill_isv6, v6addr));
2160 2161 }
2161 2162
/*
 * Enable the DLD_CAPAB_IPCHECK capability: obtain from dld the entry
 * point (and handle) used by ill_ipcheck_addr() to ask mac whether an
 * IP address may be used on this ill.  Must run as writer.
 */
static void
ill_capability_ipcheck_enable(ill_t *ill)
{
	ill_dld_capab_t *idc = ill->ill_dld_capab;
	ill_dld_ipcheck_t *idi = &idc->idc_ipcheck;
	dld_capab_ipcheck_t spoof;
	int rc;

	ASSERT(IAM_WRITER_ILL(ill));

	bzero(&spoof, sizeof (spoof));
	if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_IPCHECK,
	    &spoof, DLD_ENABLE)) == 0) {
		/* Record the address-check entry point dld returned. */
		idi->idi_allowed_df = (ip_mac_ipcheck_t)spoof.ipc_allowed_df;
		idi->idi_allowed_dh = spoof.ipc_allowed_dh;
		ill->ill_capabilities |= ILL_CAPAB_DLD_IPCHECK;
	} else {
		cmn_err(CE_WARN, "warning: could not enable IPCHECK "
		    "capability, rc = %d\n", rc);
		DTRACE_PROBE2(ipcheck__off, (ill_t *), ill, (int), rc);
	}
}
2184 2185
/*
 * Enable the function-call based DLD capabilities.  The direct, poll and
 * LSO capabilities are IPv4-only; IPCHECK is enabled for both address
 * families.  All of the dld calls are made while holding the mac
 * perimeter (see the block comment above ill_mac_perim_enter()).
 * Must run as writer.
 */
static void
ill_capability_dld_enable(ill_t *ill)
{
	mac_perim_handle_t mph;

	ASSERT(IAM_WRITER_ILL(ill));

	ill_mac_perim_enter(ill, &mph);
	if (!ill->ill_isv6) {
		ill_capability_direct_enable(ill);
		ill_capability_poll_enable(ill);
		ill_capability_lso_enable(ill);
	}

	ill_capability_ipcheck_enable(ill);

	ill->ill_capabilities |= ILL_CAPAB_DLD;
	ill_mac_perim_exit(ill, mph);
}
2204 2205
/*
 * Disable all function-call based DLD capabilities (direct, poll, LSO,
 * IPCHECK) that are currently enabled on the ill, then clear
 * ILL_CAPAB_DLD itself.  All of the dld calls are made while holding
 * the mac perimeter.  Must run as writer; no-op if ILL_CAPAB_DLD is
 * not set.
 */
static void
ill_capability_dld_disable(ill_t *ill)
{
	ill_dld_capab_t	*idc;
	ill_dld_direct_t *idd;
	mac_perim_handle_t mph;

	ASSERT(IAM_WRITER_ILL(ill));

	if (!(ill->ill_capabilities & ILL_CAPAB_DLD))
		return;

	ill_mac_perim_enter(ill, &mph);

	idc = ill->ill_dld_capab;
	if ((ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) {
		/*
		 * For performance we avoid locks in the transmit data path
		 * and don't maintain a count of the number of threads using
		 * direct calls. Thus some threads could be using direct
		 * transmit calls to GLD, even after the capability mechanism
		 * turns it off. This is still safe since the handles used in
		 * the direct calls continue to be valid until the unplumb is
		 * completed. Remove the callback that was added (1-time) at
		 * capab enable time.
		 */
		mutex_enter(&ill->ill_lock);
		ill->ill_capabilities &= ~ILL_CAPAB_DLD_DIRECT;
		mutex_exit(&ill->ill_lock);
		if (ill->ill_flownotify_mh != NULL) {
			idd = &idc->idc_direct;
			idd->idd_tx_cb_df(idd->idd_tx_cb_dh, NULL,
			    ill->ill_flownotify_mh);
			ill->ill_flownotify_mh = NULL;
		}
		(void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_DIRECT,
		    NULL, DLD_DISABLE);
	}

	if ((ill->ill_capabilities & ILL_CAPAB_DLD_POLL) != 0) {
		ill->ill_capabilities &= ~ILL_CAPAB_DLD_POLL;
		/* Quiesce all squeue rings before telling dld to stop. */
		ip_squeue_clean_all(ill);
		(void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_POLL,
		    NULL, DLD_DISABLE);
	}

	if ((ill->ill_capabilities & ILL_CAPAB_LSO) != 0) {
		ASSERT(ill->ill_lso_capab != NULL);
		/*
		 * Clear the capability flag for LSO but retain the
		 * ill_lso_capab structure since it's possible that another
		 * thread is still referring to it. The structure only gets
		 * deallocated when we destroy the ill.
		 */

		ill->ill_capabilities &= ~ILL_CAPAB_LSO;
		(void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO,
		    NULL, DLD_DISABLE);
	}

	if ((ill->ill_capabilities & ILL_CAPAB_DLD_IPCHECK) != 0) {
		ASSERT(ill->ill_dld_capab->idc_ipcheck.idi_allowed_df != NULL);
		ASSERT(ill->ill_dld_capab->idc_ipcheck.idi_allowed_dh != NULL);

		ill->ill_capabilities &= ~ILL_CAPAB_DLD_IPCHECK;
		(void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_IPCHECK,
		    NULL, DLD_DISABLE);
	}

	ill->ill_capabilities &= ~ILL_CAPAB_DLD;
	ill_mac_perim_exit(ill, mph);
}
2277 2278
2278 2279 /*
2279 2280 * Capability Negotiation protocol
2280 2281 *
2281 2282 * We don't wait for DLPI capability operations to finish during interface
2282 2283 * bringup or teardown. Doing so would introduce more asynchrony and the
2283 2284 * interface up/down operations will need multiple return and restarts.
2284 2285 * Instead the 'ipsq_current_ipif' of the ipsq is not cleared as long as
2285 2286 * the 'ill_dlpi_deferred' chain is non-empty. This ensures that the next
2286 2287 * exclusive operation won't start until the DLPI operations of the previous
2287 2288 * exclusive operation complete.
2288 2289 *
2289 2290 * The capability state machine is shown below.
2290 2291 *
2291 2292 * state next state event, action
2292 2293 *
2293 2294 * IDCS_UNKNOWN IDCS_PROBE_SENT ill_capability_probe
2294 2295 * IDCS_PROBE_SENT IDCS_OK ill_capability_ack
2295 2296 * IDCS_PROBE_SENT IDCS_FAILED ip_rput_dlpi_writer (nack)
2296 2297 * IDCS_OK IDCS_RENEG Receipt of DL_NOTE_CAPAB_RENEG
2297 2298 * IDCS_OK IDCS_RESET_SENT ill_capability_reset
2298 2299 * IDCS_RESET_SENT IDCS_UNKNOWN ill_capability_ack_thr
2299 2300 * IDCS_RENEG IDCS_PROBE_SENT ill_capability_ack_thr ->
2300 2301 * ill_capability_probe.
2301 2302 */
2302 2303
2303 2304 /*
2304 2305 * Dedicated thread started from ip_stack_init that handles capability
2305 2306 * disable. This thread ensures the taskq dispatch does not fail by waiting
2306 2307 * for resources using TQ_SLEEP. The taskq mechanism is used to ensure
2307 2308 * that direct calls to DLD are done in a cv_waitable context.
2308 2309 */
2309 2310 void
2310 2311 ill_taskq_dispatch(ip_stack_t *ipst)
2311 2312 {
2312 2313 callb_cpr_t cprinfo;
2313 2314 char name[64];
2314 2315 mblk_t *mp;
2315 2316
2316 2317 (void) snprintf(name, sizeof (name), "ill_taskq_dispatch_%d",
2317 2318 ipst->ips_netstack->netstack_stackid);
2318 2319 CALLB_CPR_INIT(&cprinfo, &ipst->ips_capab_taskq_lock, callb_generic_cpr,
2319 2320 name);
2320 2321 mutex_enter(&ipst->ips_capab_taskq_lock);
2321 2322
2322 2323 for (;;) {
2323 2324 mp = ipst->ips_capab_taskq_head;
2324 2325 while (mp != NULL) {
2325 2326 ipst->ips_capab_taskq_head = mp->b_next;
2326 2327 if (ipst->ips_capab_taskq_head == NULL)
2327 2328 ipst->ips_capab_taskq_tail = NULL;
2328 2329 mutex_exit(&ipst->ips_capab_taskq_lock);
2329 2330 mp->b_next = NULL;
2330 2331
2331 2332 VERIFY(taskq_dispatch(system_taskq,
2332 2333 ill_capability_ack_thr, mp, TQ_SLEEP) !=
2333 2334 TASKQID_INVALID);
2334 2335 mutex_enter(&ipst->ips_capab_taskq_lock);
2335 2336 mp = ipst->ips_capab_taskq_head;
2336 2337 }
2337 2338
2338 2339 if (ipst->ips_capab_taskq_quit)
2339 2340 break;
2340 2341 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2341 2342 cv_wait(&ipst->ips_capab_taskq_cv, &ipst->ips_capab_taskq_lock);
2342 2343 CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_capab_taskq_lock);
2343 2344 }
2344 2345 VERIFY(ipst->ips_capab_taskq_head == NULL);
2345 2346 VERIFY(ipst->ips_capab_taskq_tail == NULL);
2346 2347 CALLB_CPR_EXIT(&cprinfo);
2347 2348 thread_exit();
2348 2349 }
2349 2350
2350 2351 /*
2351 2352 * Consume a new-style hardware capabilities negotiation ack.
2352 2353 * Called via taskq on receipt of DL_CAPABILITY_ACK.
2353 2354 */
2354 2355 static void
2355 2356 ill_capability_ack_thr(void *arg)
2356 2357 {
2357 2358 mblk_t *mp = arg;
2358 2359 dl_capability_ack_t *capp;
2359 2360 dl_capability_sub_t *subp, *endp;
2360 2361 ill_t *ill;
2361 2362 boolean_t reneg;
2362 2363
2363 2364 ill = (ill_t *)mp->b_prev;
2364 2365 mp->b_prev = NULL;
2365 2366
2366 2367 VERIFY(ipsq_enter(ill, B_FALSE, CUR_OP) == B_TRUE);
2367 2368
2368 2369 if (ill->ill_dlpi_capab_state == IDCS_RESET_SENT ||
2369 2370 ill->ill_dlpi_capab_state == IDCS_RENEG) {
2370 2371 /*
2371 2372 * We have received the ack for our DL_CAPAB reset request.
2372 2373 * There isnt' anything in the message that needs processing.
2373 2374 * All message based capabilities have been disabled, now
2374 2375 * do the function call based capability disable.
2375 2376 */
2376 2377 reneg = ill->ill_dlpi_capab_state == IDCS_RENEG;
2377 2378 ill_capability_dld_disable(ill);
2378 2379 ill->ill_dlpi_capab_state = IDCS_UNKNOWN;
2379 2380 if (reneg)
2380 2381 ill_capability_probe(ill);
2381 2382 goto done;
2382 2383 }
2383 2384
2384 2385 if (ill->ill_dlpi_capab_state == IDCS_PROBE_SENT)
2385 2386 ill->ill_dlpi_capab_state = IDCS_OK;
2386 2387
2387 2388 capp = (dl_capability_ack_t *)mp->b_rptr;
2388 2389
2389 2390 if (capp->dl_sub_length == 0) {
2390 2391 /* no new-style capabilities */
2391 2392 goto done;
2392 2393 }
2393 2394
2394 2395 /* make sure the driver supplied correct dl_sub_length */
2395 2396 if ((sizeof (*capp) + capp->dl_sub_length) > MBLKL(mp)) {
2396 2397 ip0dbg(("ill_capability_ack: bad DL_CAPABILITY_ACK, "
2397 2398 "invalid dl_sub_length (%d)\n", capp->dl_sub_length));
2398 2399 goto done;
2399 2400 }
2400 2401
2401 2402 #define SC(base, offset) (dl_capability_sub_t *)(((uchar_t *)(base))+(offset))
2402 2403 /*
2403 2404 * There are sub-capabilities. Process the ones we know about.
2404 2405 * Loop until we don't have room for another sub-cap header..
2405 2406 */
2406 2407 for (subp = SC(capp, capp->dl_sub_offset),
2407 2408 endp = SC(subp, capp->dl_sub_length - sizeof (*subp));
2408 2409 subp <= endp;
2409 2410 subp = SC(subp, sizeof (dl_capability_sub_t) + subp->dl_length)) {
2410 2411
2411 2412 switch (subp->dl_cap) {
2412 2413 case DL_CAPAB_ID_WRAPPER:
2413 2414 ill_capability_id_ack(ill, mp, subp);
2414 2415 break;
2415 2416 default:
2416 2417 ill_capability_dispatch(ill, mp, subp);
2417 2418 break;
2418 2419 }
2419 2420 }
2420 2421 #undef SC
2421 2422 done:
2422 2423 inet_freemsg(mp);
2423 2424 ill_capability_done(ill);
2424 2425 ipsq_exit(ill->ill_phyint->phyint_ipsq);
2425 2426 }
2426 2427
2427 2428 /*
2428 2429 * This needs to be started in a taskq thread to provide a cv_waitable
2429 2430 * context.
2430 2431 */
2431 2432 void
2432 2433 ill_capability_ack(ill_t *ill, mblk_t *mp)
2433 2434 {
2434 2435 ip_stack_t *ipst = ill->ill_ipst;
2435 2436
2436 2437 mp->b_prev = (mblk_t *)ill;
2437 2438 ASSERT(mp->b_next == NULL);
2438 2439
2439 2440 if (taskq_dispatch(system_taskq, ill_capability_ack_thr, mp,
2440 2441 TQ_NOSLEEP) != TASKQID_INVALID)
2441 2442 return;
2442 2443
2443 2444 /*
2444 2445 * The taskq dispatch failed. Signal the ill_taskq_dispatch thread
2445 2446 * which will do the dispatch using TQ_SLEEP to guarantee success.
2446 2447 */
2447 2448 mutex_enter(&ipst->ips_capab_taskq_lock);
2448 2449 if (ipst->ips_capab_taskq_head == NULL) {
2449 2450 ASSERT(ipst->ips_capab_taskq_tail == NULL);
2450 2451 ipst->ips_capab_taskq_head = mp;
2451 2452 } else {
2452 2453 ipst->ips_capab_taskq_tail->b_next = mp;
2453 2454 }
2454 2455 ipst->ips_capab_taskq_tail = mp;
2455 2456
2456 2457 cv_signal(&ipst->ips_capab_taskq_cv);
2457 2458 mutex_exit(&ipst->ips_capab_taskq_lock);
2458 2459 }
2459 2460
2460 2461 /*
2461 2462 * This routine is called to scan the fragmentation reassembly table for
2462 2463 * the specified ILL for any packets that are starting to smell.
2463 2464 * dead_interval is the maximum time in seconds that will be tolerated. It
2464 2465 * will either be the value specified in ip_g_frag_timeout, or zero if the
2465 2466 * ILL is shutting down and it is time to blow everything off.
2466 2467 *
2467 2468 * It returns the number of seconds (as a time_t) that the next frag timer
2468 2469 * should be scheduled for, 0 meaning that the timer doesn't need to be
2469 2470 * re-started. Note that the method of calculating next_timeout isn't
2470 2471 * entirely accurate since time will flow between the time we grab
2471 2472 * current_time and the time we schedule the next timeout. This isn't a
2472 2473 * big problem since this is the timer for sending an ICMP reassembly time
2473 2474 * exceeded messages, and it doesn't have to be exactly accurate.
2474 2475 *
2475 2476 * This function is
2476 2477 * sometimes called as writer, although this is not required.
2477 2478 */
2478 2479 time_t
2479 2480 ill_frag_timeout(ill_t *ill, time_t dead_interval)
2480 2481 {
2481 2482 ipfb_t *ipfb;
2482 2483 ipfb_t *endp;
2483 2484 ipf_t *ipf;
2484 2485 ipf_t *ipfnext;
2485 2486 mblk_t *mp;
2486 2487 time_t current_time = gethrestime_sec();
2487 2488 time_t next_timeout = 0;
2488 2489 uint32_t hdr_length;
2489 2490 mblk_t *send_icmp_head;
2490 2491 mblk_t *send_icmp_head_v6;
2491 2492 ip_stack_t *ipst = ill->ill_ipst;
2492 2493 ip_recv_attr_t iras;
2493 2494
2494 2495 bzero(&iras, sizeof (iras));
2495 2496 iras.ira_flags = 0;
2496 2497 iras.ira_ill = iras.ira_rill = ill;
2497 2498 iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
2498 2499 iras.ira_rifindex = iras.ira_ruifindex;
2499 2500
2500 2501 ipfb = ill->ill_frag_hash_tbl;
2501 2502 if (ipfb == NULL)
2502 2503 return (B_FALSE);
2503 2504 endp = &ipfb[ILL_FRAG_HASH_TBL_COUNT];
2504 2505 /* Walk the frag hash table. */
2505 2506 for (; ipfb < endp; ipfb++) {
2506 2507 send_icmp_head = NULL;
2507 2508 send_icmp_head_v6 = NULL;
2508 2509 mutex_enter(&ipfb->ipfb_lock);
2509 2510 while ((ipf = ipfb->ipfb_ipf) != 0) {
2510 2511 time_t frag_time = current_time - ipf->ipf_timestamp;
2511 2512 time_t frag_timeout;
2512 2513
2513 2514 if (frag_time < dead_interval) {
2514 2515 /*
2515 2516 * There are some outstanding fragments
2516 2517 * that will timeout later. Make note of
2517 2518 * the time so that we can reschedule the
2518 2519 * next timeout appropriately.
2519 2520 */
2520 2521 frag_timeout = dead_interval - frag_time;
2521 2522 if (next_timeout == 0 ||
2522 2523 frag_timeout < next_timeout) {
2523 2524 next_timeout = frag_timeout;
2524 2525 }
2525 2526 break;
2526 2527 }
2527 2528 /* Time's up. Get it out of here. */
2528 2529 hdr_length = ipf->ipf_nf_hdr_len;
2529 2530 ipfnext = ipf->ipf_hash_next;
2530 2531 if (ipfnext)
2531 2532 ipfnext->ipf_ptphn = ipf->ipf_ptphn;
2532 2533 *ipf->ipf_ptphn = ipfnext;
2533 2534 mp = ipf->ipf_mp->b_cont;
2534 2535 for (; mp; mp = mp->b_cont) {
2535 2536 /* Extra points for neatness. */
2536 2537 IP_REASS_SET_START(mp, 0);
2537 2538 IP_REASS_SET_END(mp, 0);
2538 2539 }
2539 2540 mp = ipf->ipf_mp->b_cont;
2540 2541 atomic_add_32(&ill->ill_frag_count, -ipf->ipf_count);
2541 2542 ASSERT(ipfb->ipfb_count >= ipf->ipf_count);
2542 2543 ipfb->ipfb_count -= ipf->ipf_count;
2543 2544 ASSERT(ipfb->ipfb_frag_pkts > 0);
2544 2545 ipfb->ipfb_frag_pkts--;
2545 2546 /*
2546 2547 * We do not send any icmp message from here because
2547 2548 * we currently are holding the ipfb_lock for this
2548 2549 * hash chain. If we try and send any icmp messages
2549 2550 * from here we may end up via a put back into ip
2550 2551 * trying to get the same lock, causing a recursive
2551 2552 * mutex panic. Instead we build a list and send all
2552 2553 * the icmp messages after we have dropped the lock.
2553 2554 */
2554 2555 if (ill->ill_isv6) {
2555 2556 if (hdr_length != 0) {
2556 2557 mp->b_next = send_icmp_head_v6;
2557 2558 send_icmp_head_v6 = mp;
2558 2559 } else {
2559 2560 freemsg(mp);
2560 2561 }
2561 2562 } else {
2562 2563 if (hdr_length != 0) {
2563 2564 mp->b_next = send_icmp_head;
2564 2565 send_icmp_head = mp;
2565 2566 } else {
2566 2567 freemsg(mp);
2567 2568 }
2568 2569 }
2569 2570 BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails);
2570 2571 ip_drop_input("ipIfStatsReasmFails", ipf->ipf_mp, ill);
2571 2572 freeb(ipf->ipf_mp);
2572 2573 }
2573 2574 mutex_exit(&ipfb->ipfb_lock);
2574 2575 /*
2575 2576 * Now need to send any icmp messages that we delayed from
2576 2577 * above.
2577 2578 */
2578 2579 while (send_icmp_head_v6 != NULL) {
2579 2580 ip6_t *ip6h;
2580 2581
2581 2582 mp = send_icmp_head_v6;
2582 2583 send_icmp_head_v6 = send_icmp_head_v6->b_next;
2583 2584 mp->b_next = NULL;
2584 2585 ip6h = (ip6_t *)mp->b_rptr;
2585 2586 iras.ira_flags = 0;
2586 2587 /*
2587 2588 * This will result in an incorrect ALL_ZONES zoneid
2588 2589 * for multicast packets, but we
2589 2590 * don't send ICMP errors for those in any case.
2590 2591 */
2591 2592 iras.ira_zoneid =
2592 2593 ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst,
2593 2594 ill, ipst);
2594 2595 ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill);
2595 2596 icmp_time_exceeded_v6(mp,
2596 2597 ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE,
2597 2598 &iras);
2598 2599 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2599 2600 }
2600 2601 while (send_icmp_head != NULL) {
2601 2602 ipaddr_t dst;
2602 2603
2603 2604 mp = send_icmp_head;
2604 2605 send_icmp_head = send_icmp_head->b_next;
2605 2606 mp->b_next = NULL;
2606 2607
2607 2608 dst = ((ipha_t *)mp->b_rptr)->ipha_dst;
2608 2609
2609 2610 iras.ira_flags = IRAF_IS_IPV4;
2610 2611 /*
2611 2612 * This will result in an incorrect ALL_ZONES zoneid
2612 2613 * for broadcast and multicast packets, but we
2613 2614 * don't send ICMP errors for those in any case.
2614 2615 */
2615 2616 iras.ira_zoneid = ipif_lookup_addr_zoneid(dst,
2616 2617 ill, ipst);
2617 2618 ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill);
2618 2619 icmp_time_exceeded(mp,
2619 2620 ICMP_REASSEMBLY_TIME_EXCEEDED, &iras);
2620 2621 ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2621 2622 }
2622 2623 }
2623 2624 /*
2624 2625 * A non-dying ILL will use the return value to decide whether to
2625 2626 * restart the frag timer, and for how long.
2626 2627 */
2627 2628 return (next_timeout);
2628 2629 }
2629 2630
/*
 * This routine is called when the approximate count of mblk memory used
 * for the specified ILL has exceeded max_count.  It prunes the reassembly
 * queues in two passes: first a fixed number of the oldest packets per
 * hash bucket (scaled up if we are called again too soon), then the
 * globally-oldest queue, one packet at a time, until under max_count.
 */
void
ill_frag_prune(ill_t *ill, uint_t max_count)
{
	ipfb_t *ipfb;
	ipf_t *ipf;
	size_t count;
	clock_t now;

	/*
	 * If we are here within ip_min_frag_prune_time msecs remove
	 * ill_frag_free_num_pkts oldest packets from each bucket and increment
	 * ill_frag_free_num_pkts.
	 */
	mutex_enter(&ill->ill_lock);
	now = ddi_get_lbolt();
	if (TICK_TO_MSEC(now - ill->ill_last_frag_clean_time) <=
	    (ip_min_frag_prune_time != 0 ?
	    ip_min_frag_prune_time : msec_per_tick)) {

		ill->ill_frag_free_num_pkts++;

	} else {
		ill->ill_frag_free_num_pkts = 0;
	}
	ill->ill_last_frag_clean_time = now;
	mutex_exit(&ill->ill_lock);

	/*
	 * free ill_frag_free_num_pkts oldest packets from each bucket.
	 * NOTE(review): ill_frag_free_num_pkts is read here without
	 * ill_lock; presumably a stale value is acceptable since this is
	 * only a pruning heuristic -- confirm.
	 */
	if (ill->ill_frag_free_num_pkts != 0) {
		int ix;

		for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) {
			ipfb = &ill->ill_frag_hash_tbl[ix];
			mutex_enter(&ipfb->ipfb_lock);
			if (ipfb->ipfb_ipf != NULL) {
				ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf,
				    ill->ill_frag_free_num_pkts);
			}
			mutex_exit(&ipfb->ipfb_lock);
		}
	}
	/*
	 * While the reassembly list for this ILL is too big, prune a fragment
	 * queue by age, oldest first.
	 */
	while (ill->ill_frag_count > max_count) {
		int ix;
		ipfb_t *oipfb = NULL;
		uint_t oldest = UINT_MAX;

		/* Scan all buckets for the queue with the lowest ipf_gen. */
		count = 0;
		for (ix = 0; ix < ILL_FRAG_HASH_TBL_COUNT; ix++) {
			ipfb = &ill->ill_frag_hash_tbl[ix];
			mutex_enter(&ipfb->ipfb_lock);
			ipf = ipfb->ipfb_ipf;
			if (ipf != NULL && ipf->ipf_gen < oldest) {
				oldest = ipf->ipf_gen;
				oipfb = ipfb;
			}
			count += ipfb->ipfb_count;
			mutex_exit(&ipfb->ipfb_lock);
		}
		/* Nothing queued anywhere; we are done. */
		if (oipfb == NULL)
			break;

		if (count <= max_count)
			return;	/* Somebody beat us to it, nothing to do */
		/*
		 * Re-check under the bucket lock: the oldest queue may have
		 * been freed by a concurrent pruner since the scan above.
		 */
		mutex_enter(&oipfb->ipfb_lock);
		ipf = oipfb->ipfb_ipf;
		if (ipf != NULL) {
			ill_frag_free_pkts(ill, oipfb, ipf, 1);
		}
		mutex_exit(&oipfb->ipfb_lock);
	}
}
2711 2712
/*
 * free 'free_cnt' fragmented packets starting at ipf.
 *
 * Caller must hold ipfb->ipfb_lock; ipf must be on ipfb's hash chain.
 * Bucket and ILL accounting (ipfb_count, ipfb_frag_pkts, ill_frag_count)
 * is adjusted and ipIfStatsReasmFails bumped for each packet dropped.
 */
void
ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt)
{
	size_t count;
	mblk_t *mp;
	mblk_t *tmp;
	ipf_t **ipfp = ipf->ipf_ptphn;	/* pointer that points at ipf */

	ASSERT(MUTEX_HELD(&ipfb->ipfb_lock));
	ASSERT(ipfp != NULL);
	ASSERT(ipf != NULL);

	while (ipf != NULL && free_cnt-- > 0) {
		count = ipf->ipf_count;
		mp = ipf->ipf_mp;
		/* Advance before freeing; mp chain includes ipf itself. */
		ipf = ipf->ipf_hash_next;
		/* Clear reassembly metadata stashed in each fragment. */
		for (tmp = mp; tmp; tmp = tmp->b_cont) {
			IP_REASS_SET_START(tmp, 0);
			IP_REASS_SET_END(tmp, 0);
		}
		atomic_add_32(&ill->ill_frag_count, -count);
		ASSERT(ipfb->ipfb_count >= count);
		ipfb->ipfb_count -= count;
		ASSERT(ipfb->ipfb_frag_pkts > 0);
		ipfb->ipfb_frag_pkts--;
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails);
		ip_drop_input("ipIfStatsReasmFails", mp, ill);
		freemsg(mp);
	}

	/*
	 * Relink the survivors: the pointer that used to reference the
	 * first freed packet now references the first surviving one.
	 */
	if (ipf)
		ipf->ipf_ptphn = ipfp;
	ipfp[0] = ipf;
}
2749 2750
2750 2751 /*
2751 2752 * Helper function for ill_forward_set().
2752 2753 */
2753 2754 static void
2754 2755 ill_forward_set_on_ill(ill_t *ill, boolean_t enable)
2755 2756 {
2756 2757 ip_stack_t *ipst = ill->ill_ipst;
2757 2758
2758 2759 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock));
2759 2760
2760 2761 ip1dbg(("ill_forward_set: %s %s forwarding on %s",
2761 2762 (enable ? "Enabling" : "Disabling"),
2762 2763 (ill->ill_isv6 ? "IPv6" : "IPv4"), ill->ill_name));
2763 2764 mutex_enter(&ill->ill_lock);
2764 2765 if (enable)
2765 2766 ill->ill_flags |= ILLF_ROUTER;
2766 2767 else
2767 2768 ill->ill_flags &= ~ILLF_ROUTER;
2768 2769 mutex_exit(&ill->ill_lock);
2769 2770 if (ill->ill_isv6)
2770 2771 ill_set_nce_router_flags(ill, enable);
2771 2772 /* Notify routing socket listeners of this change. */
2772 2773 if (ill->ill_ipif != NULL)
2773 2774 ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
2774 2775 }
2775 2776
2776 2777 /*
2777 2778 * Set an ill's ILLF_ROUTER flag appropriately. Send up RTS_IFINFO routing
2778 2779 * socket messages for each interface whose flags we change.
2779 2780 */
2780 2781 int
2781 2782 ill_forward_set(ill_t *ill, boolean_t enable)
2782 2783 {
2783 2784 ipmp_illgrp_t *illg;
2784 2785 ip_stack_t *ipst = ill->ill_ipst;
2785 2786
2786 2787 ASSERT(IAM_WRITER_ILL(ill) || RW_READ_HELD(&ipst->ips_ill_g_lock));
2787 2788
2788 2789 if ((enable && (ill->ill_flags & ILLF_ROUTER)) ||
2789 2790 (!enable && !(ill->ill_flags & ILLF_ROUTER)))
2790 2791 return (0);
2791 2792
2792 2793 if (IS_LOOPBACK(ill))
2793 2794 return (EINVAL);
2794 2795
2795 2796 if (enable && ill->ill_allowed_ips_cnt > 0)
2796 2797 return (EPERM);
2797 2798
2798 2799 if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) {
2799 2800 /*
2800 2801 * Update all of the interfaces in the group.
2801 2802 */
2802 2803 illg = ill->ill_grp;
2803 2804 ill = list_head(&illg->ig_if);
2804 2805 for (; ill != NULL; ill = list_next(&illg->ig_if, ill))
2805 2806 ill_forward_set_on_ill(ill, enable);
2806 2807
2807 2808 /*
2808 2809 * Update the IPMP meta-interface.
2809 2810 */
2810 2811 ill_forward_set_on_ill(ipmp_illgrp_ipmp_ill(illg), enable);
2811 2812 return (0);
2812 2813 }
2813 2814
2814 2815 ill_forward_set_on_ill(ill, enable);
2815 2816 return (0);
2816 2817 }
2817 2818
2818 2819 /*
2819 2820 * Based on the ILLF_ROUTER flag of an ill, make sure all local nce's for
2820 2821 * addresses assigned to the ill have the NCE_F_ISROUTER flag appropriately
2821 2822 * set or clear.
2822 2823 */
2823 2824 static void
2824 2825 ill_set_nce_router_flags(ill_t *ill, boolean_t enable)
2825 2826 {
2826 2827 ipif_t *ipif;
2827 2828 ncec_t *ncec;
2828 2829 nce_t *nce;
2829 2830
2830 2831 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
2831 2832 /*
2832 2833 * NOTE: we match across the illgrp because nce's for
2833 2834 * addresses on IPMP interfaces have an nce_ill that points to
2834 2835 * the bound underlying ill.
2835 2836 */
2836 2837 nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr);
2837 2838 if (nce != NULL) {
2838 2839 ncec = nce->nce_common;
2839 2840 mutex_enter(&ncec->ncec_lock);
2840 2841 if (enable)
2841 2842 ncec->ncec_flags |= NCE_F_ISROUTER;
2842 2843 else
2843 2844 ncec->ncec_flags &= ~NCE_F_ISROUTER;
2844 2845 mutex_exit(&ncec->ncec_lock);
2845 2846 nce_refrele(nce);
2846 2847 }
2847 2848 }
2848 2849 }
2849 2850
2850 2851 /*
2851 2852 * Intializes the context structure and returns the first ill in the list
2852 2853 * cuurently start_list and end_list can have values:
2853 2854 * MAX_G_HEADS Traverse both IPV4 and IPV6 lists.
2854 2855 * IP_V4_G_HEAD Traverse IPV4 list only.
2855 2856 * IP_V6_G_HEAD Traverse IPV6 list only.
2856 2857 */
2857 2858
2858 2859 /*
2859 2860 * We don't check for CONDEMNED ills here. Caller must do that if
2860 2861 * necessary under the ill lock.
2861 2862 */
2862 2863 ill_t *
2863 2864 ill_first(int start_list, int end_list, ill_walk_context_t *ctx,
2864 2865 ip_stack_t *ipst)
2865 2866 {
2866 2867 ill_if_t *ifp;
2867 2868 ill_t *ill;
2868 2869 avl_tree_t *avl_tree;
2869 2870
2870 2871 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
2871 2872 ASSERT(end_list <= MAX_G_HEADS && start_list >= 0);
2872 2873
2873 2874 /*
2874 2875 * setup the lists to search
2875 2876 */
2876 2877 if (end_list != MAX_G_HEADS) {
2877 2878 ctx->ctx_current_list = start_list;
2878 2879 ctx->ctx_last_list = end_list;
2879 2880 } else {
2880 2881 ctx->ctx_last_list = MAX_G_HEADS - 1;
2881 2882 ctx->ctx_current_list = 0;
2882 2883 }
2883 2884
2884 2885 while (ctx->ctx_current_list <= ctx->ctx_last_list) {
2885 2886 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst);
2886 2887 if (ifp != (ill_if_t *)
2887 2888 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) {
2888 2889 avl_tree = &ifp->illif_avl_by_ppa;
2889 2890 ill = avl_first(avl_tree);
2890 2891 /*
2891 2892 * ill is guaranteed to be non NULL or ifp should have
2892 2893 * not existed.
2893 2894 */
2894 2895 ASSERT(ill != NULL);
2895 2896 return (ill);
2896 2897 }
2897 2898 ctx->ctx_current_list++;
2898 2899 }
2899 2900
2900 2901 return (NULL);
2901 2902 }
2902 2903
2903 2904 /*
2904 2905 * returns the next ill in the list. ill_first() must have been called
2905 2906 * before calling ill_next() or bad things will happen.
2906 2907 */
2907 2908
2908 2909 /*
2909 2910 * We don't check for CONDEMNED ills here. Caller must do that if
2910 2911 * necessary under the ill lock.
2911 2912 */
2912 2913 ill_t *
2913 2914 ill_next(ill_walk_context_t *ctx, ill_t *lastill)
2914 2915 {
2915 2916 ill_if_t *ifp;
2916 2917 ill_t *ill;
2917 2918 ip_stack_t *ipst = lastill->ill_ipst;
2918 2919
2919 2920 ASSERT(lastill->ill_ifptr != (ill_if_t *)
2920 2921 &IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst));
2921 2922 if ((ill = avl_walk(&lastill->ill_ifptr->illif_avl_by_ppa, lastill,
2922 2923 AVL_AFTER)) != NULL) {
2923 2924 return (ill);
2924 2925 }
2925 2926
2926 2927 /* goto next ill_ifp in the list. */
2927 2928 ifp = lastill->ill_ifptr->illif_next;
2928 2929
2929 2930 /* make sure not at end of circular list */
2930 2931 while (ifp ==
2931 2932 (ill_if_t *)&IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst)) {
2932 2933 if (++ctx->ctx_current_list > ctx->ctx_last_list)
2933 2934 return (NULL);
2934 2935 ifp = IP_VX_ILL_G_LIST(ctx->ctx_current_list, ipst);
2935 2936 }
2936 2937
2937 2938 return (avl_first(&ifp->illif_avl_by_ppa));
2938 2939 }
2939 2940
/*
 * Check interface name for correct format: [a-zA-Z]+[a-zA-Z0-9._]*[0-9]+
 * The final number (PPA) must not have any leading zeros. Upon success, a
 * pointer to the start of the PPA is returned; otherwise NULL is returned.
 */
static char *
ill_get_ppa_ptr(char *name)
{
	int len = strlen(name);
	int last = len - 1;
	int start, i;

	/* Must begin with a letter and end with a digit. */
	if (len == 0 || !isalpha(name[0]) || !isdigit(name[last]))
		return (NULL);

	/* Scan backwards to find where the trailing digit run begins. */
	start = last;
	while (start > 0 && isdigit(name[start - 1]))
		start--;

	/* A multi-digit PPA must not have a leading zero. */
	if (name[start] == '0' && start < last)
		return (NULL);

	/*
	 * The characters between the leading letter and the PPA must all
	 * be drawn from [a-zA-Z0-9._].
	 */
	for (i = 1; i < start; i++) {
		if (!isalpha(name[i]) && !isdigit(name[i]) &&
		    name[i] != '.' && name[i] != '_') {
			return (NULL);
		}
	}

	return (name + start);
}
2981 2982
/*
 * use avl tree to locate the ill.
 *
 * `name' must be of the form <prefix><ppa> (validated by
 * ill_get_ppa_ptr()).  The per-type list for the prefix is searched,
 * then the ppa is looked up in that type's AVL tree.  On success the
 * ill is returned with a reference held (caller must ill_refrele());
 * otherwise NULL.
 */
static ill_t *
ill_find_by_name(char *name, boolean_t isv6, ip_stack_t *ipst)
{
	char *ppa_ptr = NULL;
	int len;
	uint_t ppa;
	ill_t *ill = NULL;
	ill_if_t *ifp;
	int list;

	/*
	 * get ppa ptr
	 */
	if (isv6)
		list = IP_V6_G_HEAD;
	else
		list = IP_V4_G_HEAD;

	if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) {
		return (NULL);
	}

	/* Length of the type prefix plus one (matches illif_name_len). */
	len = ppa_ptr - name + 1;

	ppa = stoi(&ppa_ptr);

	ifp = IP_VX_ILL_G_LIST(list, ipst);

	while (ifp != (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) {
		/*
		 * match is done on len - 1 as the name is not null
		 * terminated it contains ppa in addition to the interface
		 * name.
		 */
		if ((ifp->illif_name_len == len) &&
		    bcmp(ifp->illif_name, name, len - 1) == 0) {
			break;
		} else {
			ifp = ifp->illif_next;
		}
	}

	if (ifp == (ill_if_t *)&IP_VX_ILL_G_LIST(list, ipst)) {
		/*
		 * Even the interface type does not exist.
		 */
		return (NULL);
	}

	ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL);
	if (ill != NULL) {
		mutex_enter(&ill->ill_lock);
		/* Only hand back ills that are still eligible for lookup. */
		if (ILL_CAN_LOOKUP(ill)) {
			ill_refhold_locked(ill);
			mutex_exit(&ill->ill_lock);
			return (ill);
		}
		mutex_exit(&ill->ill_lock);
	}
	return (NULL);
}
3046 3047
3047 3048 /*
3048 3049 * comparison function for use with avl.
3049 3050 */
3050 3051 static int
3051 3052 ill_compare_ppa(const void *ppa_ptr, const void *ill_ptr)
3052 3053 {
3053 3054 uint_t ppa;
3054 3055 uint_t ill_ppa;
3055 3056
3056 3057 ASSERT(ppa_ptr != NULL && ill_ptr != NULL);
3057 3058
3058 3059 ppa = *((uint_t *)ppa_ptr);
3059 3060 ill_ppa = ((const ill_t *)ill_ptr)->ill_ppa;
3060 3061 /*
3061 3062 * We want the ill with the lowest ppa to be on the
3062 3063 * top.
3063 3064 */
3064 3065 if (ill_ppa < ppa)
3065 3066 return (1);
3066 3067 if (ill_ppa > ppa)
3067 3068 return (-1);
3068 3069 return (0);
3069 3070 }
3070 3071
3071 3072 /*
3072 3073 * remove an interface type from the global list.
3073 3074 */
3074 3075 static void
3075 3076 ill_delete_interface_type(ill_if_t *interface)
3076 3077 {
3077 3078 ASSERT(interface != NULL);
3078 3079 ASSERT(avl_numnodes(&interface->illif_avl_by_ppa) == 0);
3079 3080
3080 3081 avl_destroy(&interface->illif_avl_by_ppa);
3081 3082 if (interface->illif_ppa_arena != NULL)
3082 3083 vmem_destroy(interface->illif_ppa_arena);
3083 3084
3084 3085 remque(interface);
3085 3086
3086 3087 mi_free(interface);
3087 3088 }
3088 3089
/*
 * remove ill from the global list.
 *
 * Unlinks the ill from its interface type's AVL tree (freeing its ppa
 * and, if that was the last ill of the type, the ill_if_t itself),
 * dispatches a final NE_UNPLUMB event, and detaches the ill from its
 * phyint.  When no ill (v4 or v6) remains on the phyint, the phyint is
 * removed from the global index/name AVL trees and freed as well.
 */
static void
ill_glist_delete(ill_t *ill)
{
	ip_stack_t *ipst;
	phyint_t *phyi;

	if (ill == NULL)
		return;
	ipst = ill->ill_ipst;
	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);

	/*
	 * If the ill was never inserted into the AVL tree
	 * we skip the if branch.
	 */
	if (ill->ill_ifptr != NULL) {
		/*
		 * remove from AVL tree and free ppa number
		 */
		avl_remove(&ill->ill_ifptr->illif_avl_by_ppa, ill);

		if (ill->ill_ifptr->illif_ppa_arena != NULL) {
			/* ppa is stored biased by one; see ill_alloc_ppa() */
			vmem_free(ill->ill_ifptr->illif_ppa_arena,
			    (void *)(uintptr_t)(ill->ill_ppa+1), 1);
		}
		if (avl_numnodes(&ill->ill_ifptr->illif_avl_by_ppa) == 0) {
			/* Last ill of this interface type; drop the type. */
			ill_delete_interface_type(ill->ill_ifptr);
		}

		/*
		 * Indicate ill is no longer in the list.
		 */
		ill->ill_ifptr = NULL;
		ill->ill_name_length = 0;
		ill->ill_name[0] = '\0';
		ill->ill_ppa = UINT_MAX;
	}

	/* Generate one last event for this ill. */
	ill_nic_event_dispatch(ill, 0, NE_UNPLUMB, ill->ill_name,
	    ill->ill_name_length);

	ASSERT(ill->ill_phyint != NULL);
	phyi = ill->ill_phyint;
	ill->ill_phyint = NULL;

	/*
	 * ill_init allocates a phyint always to store the copy
	 * of flags relevant to phyint. At that point in time, we could
	 * not assign the name and hence phyint_illv4/v6 could not be
	 * initialized. Later in ipif_set_values, we assign the name to
	 * the ill, at which point in time we assign phyint_illv4/v6.
	 * Thus we don't rely on phyint_illv6 to be initialized always.
	 */
	if (ill->ill_flags & ILLF_IPV6)
		phyi->phyint_illv6 = NULL;
	else
		phyi->phyint_illv4 = NULL;

	/* The other protocol's ill may still reference this phyint. */
	if (phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL) {
		rw_exit(&ipst->ips_ill_g_lock);
		return;
	}

	/*
	 * There are no ills left on this phyint; pull it out of the phyint
	 * avl trees, and free it.
	 */
	if (phyi->phyint_ifindex > 0) {
		avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
		    phyi);
		avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
		    phyi);
	}
	rw_exit(&ipst->ips_ill_g_lock);

	phyint_free(phyi);
}
3170 3171
/*
 * allocate a ppa, if the number of plumbed interfaces of this type are
 * less than ill_no_arena do a linear search to find a unused ppa.
 * When the number goes beyond ill_no_arena switch to using an arena.
 * Note: ppa value of zero cannot be allocated from vmem_arena as it
 * is the return value for an error condition, so allocation starts at one
 * and is decremented by one.
 *
 * If ill->ill_ppa is UINT_MAX, any free ppa will do (EAGAIN if none is
 * available); otherwise the specific ppa in ill->ill_ppa is requested
 * (EEXIST if it is taken).  Returns 0 with ill->ill_ppa set on success.
 */
static int
ill_alloc_ppa(ill_if_t *ifp, ill_t *ill)
{
	ill_t *tmp_ill;
	uint_t start, end;
	int ppa;

	if (ifp->illif_ppa_arena == NULL &&
	    (avl_numnodes(&ifp->illif_avl_by_ppa) + 1 > ill_no_arena)) {
		/*
		 * Create an arena.
		 */
		ifp->illif_ppa_arena = vmem_create(ifp->illif_name,
		    (void *)1, UINT_MAX - 1, 1, NULL, NULL,
		    NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
		/* allocate what has already been assigned */
		for (tmp_ill = avl_first(&ifp->illif_avl_by_ppa);
		    tmp_ill != NULL; tmp_ill = avl_walk(&ifp->illif_avl_by_ppa,
		    tmp_ill, AVL_AFTER)) {
			/*
			 * Reserve exactly [ppa+1, ppa+2) so the arena
			 * records this ppa as in use (biased by one).
			 */
			ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena,
			    1,		/* size */
			    1,		/* align/quantum */
			    0,		/* phase */
			    0,		/* nocross */
			    /* minaddr */
			    (void *)((uintptr_t)tmp_ill->ill_ppa + 1),
			    /* maxaddr */
			    (void *)((uintptr_t)tmp_ill->ill_ppa + 2),
			    VM_NOSLEEP|VM_FIRSTFIT);
			if (ppa == 0) {
				ip1dbg(("ill_alloc_ppa: ppa allocation"
				    " failed while switching"));
				/* Give up on the arena; fall back below. */
				vmem_destroy(ifp->illif_ppa_arena);
				ifp->illif_ppa_arena = NULL;
				break;
			}
		}
	}

	if (ifp->illif_ppa_arena != NULL) {
		if (ill->ill_ppa == UINT_MAX) {
			/* Any free ppa will do. */
			ppa = (int)(uintptr_t)vmem_alloc(ifp->illif_ppa_arena,
			    1, VM_NOSLEEP|VM_FIRSTFIT);
			if (ppa == 0)
				return (EAGAIN);
			ill->ill_ppa = --ppa;	/* undo the +1 bias */
		} else {
			/* A specific ppa was requested; try just that one. */
			ppa = (int)(uintptr_t)vmem_xalloc(ifp->illif_ppa_arena,
			    1, 		/* size */
			    1, 		/* align/quantum */
			    0, 		/* phase */
			    0, 		/* nocross */
			    (void *)(uintptr_t)(ill->ill_ppa + 1), /* minaddr */
			    (void *)(uintptr_t)(ill->ill_ppa + 2), /* maxaddr */
			    VM_NOSLEEP|VM_FIRSTFIT);
			/*
			 * Most likely the allocation failed because
			 * the requested ppa was in use.
			 */
			if (ppa == 0)
				return (EEXIST);
		}
		return (0);
	}

	/*
	 * No arena is in use and not enough (>ill_no_arena) interfaces have
	 * been plumbed to create one. Do a linear search to get a unused ppa.
	 */
	if (ill->ill_ppa == UINT_MAX) {
		end = UINT_MAX - 1;
		start = 0;
	} else {
		end = start = ill->ill_ppa;
	}

	/*
	 * Walk the ills in ppa order from `start', looking for the first
	 * value that is not already taken (the tree is sorted by ppa).
	 */
	tmp_ill = avl_find(&ifp->illif_avl_by_ppa, (void *)&start, NULL);
	while (tmp_ill != NULL && tmp_ill->ill_ppa == start) {
		if (start++ >= end) {
			if (ill->ill_ppa == UINT_MAX)
				return (EAGAIN);
			else
				return (EEXIST);
		}
		tmp_ill = avl_walk(&ifp->illif_avl_by_ppa, tmp_ill, AVL_AFTER);
	}
	ill->ill_ppa = start;
	return (0);
}
3268 3269
3269 3270 /*
3270 3271 * Insert ill into the list of configured ill's. Once this function completes,
3271 3272 * the ill is globally visible and is available through lookups. More precisely
3272 3273 * this happens after the caller drops the ill_g_lock.
3273 3274 */
3274 3275 static int
3275 3276 ill_glist_insert(ill_t *ill, char *name, boolean_t isv6)
3276 3277 {
3277 3278 ill_if_t *ill_interface;
3278 3279 avl_index_t where = 0;
3279 3280 int error;
3280 3281 int name_length;
3281 3282 int index;
3282 3283 boolean_t check_length = B_FALSE;
3283 3284 ip_stack_t *ipst = ill->ill_ipst;
3284 3285
3285 3286 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
3286 3287
3287 3288 name_length = mi_strlen(name) + 1;
3288 3289
3289 3290 if (isv6)
3290 3291 index = IP_V6_G_HEAD;
3291 3292 else
3292 3293 index = IP_V4_G_HEAD;
3293 3294
3294 3295 ill_interface = IP_VX_ILL_G_LIST(index, ipst);
3295 3296 /*
3296 3297 * Search for interface type based on name
3297 3298 */
3298 3299 while (ill_interface != (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) {
3299 3300 if ((ill_interface->illif_name_len == name_length) &&
3300 3301 (strcmp(ill_interface->illif_name, name) == 0)) {
3301 3302 break;
3302 3303 }
3303 3304 ill_interface = ill_interface->illif_next;
3304 3305 }
3305 3306
3306 3307 /*
3307 3308 * Interface type not found, create one.
3308 3309 */
3309 3310 if (ill_interface == (ill_if_t *)&IP_VX_ILL_G_LIST(index, ipst)) {
3310 3311 ill_g_head_t ghead;
3311 3312
3312 3313 /*
3313 3314 * allocate ill_if_t structure
3314 3315 */
3315 3316 ill_interface = (ill_if_t *)mi_zalloc(sizeof (ill_if_t));
3316 3317 if (ill_interface == NULL) {
3317 3318 return (ENOMEM);
3318 3319 }
3319 3320
3320 3321 (void) strcpy(ill_interface->illif_name, name);
3321 3322 ill_interface->illif_name_len = name_length;
3322 3323
3323 3324 avl_create(&ill_interface->illif_avl_by_ppa,
3324 3325 ill_compare_ppa, sizeof (ill_t),
3325 3326 offsetof(struct ill_s, ill_avl_byppa));
3326 3327
3327 3328 /*
3328 3329 * link the structure in the back to maintain order
3329 3330 * of configuration for ifconfig output.
3330 3331 */
3331 3332 ghead = ipst->ips_ill_g_heads[index];
3332 3333 insque(ill_interface, ghead.ill_g_list_tail);
3333 3334 }
3334 3335
3335 3336 if (ill->ill_ppa == UINT_MAX)
3336 3337 check_length = B_TRUE;
3337 3338
3338 3339 error = ill_alloc_ppa(ill_interface, ill);
3339 3340 if (error != 0) {
3340 3341 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0)
3341 3342 ill_delete_interface_type(ill->ill_ifptr);
3342 3343 return (error);
3343 3344 }
3344 3345
3345 3346 /*
3346 3347 * When the ppa is choosen by the system, check that there is
3347 3348 * enough space to insert ppa. if a specific ppa was passed in this
3348 3349 * check is not required as the interface name passed in will have
3349 3350 * the right ppa in it.
3350 3351 */
3351 3352 if (check_length) {
3352 3353 /*
3353 3354 * UINT_MAX - 1 should fit in 10 chars, alloc 12 chars.
3354 3355 */
3355 3356 char buf[sizeof (uint_t) * 3];
3356 3357
3357 3358 /*
3358 3359 * convert ppa to string to calculate the amount of space
3359 3360 * required for it in the name.
3360 3361 */
3361 3362 numtos(ill->ill_ppa, buf);
3362 3363
3363 3364 /* Do we have enough space to insert ppa ? */
3364 3365
3365 3366 if ((mi_strlen(name) + mi_strlen(buf) + 1) > LIFNAMSIZ) {
3366 3367 /* Free ppa and interface type struct */
3367 3368 if (ill_interface->illif_ppa_arena != NULL) {
3368 3369 vmem_free(ill_interface->illif_ppa_arena,
3369 3370 (void *)(uintptr_t)(ill->ill_ppa+1), 1);
3370 3371 }
3371 3372 if (avl_numnodes(&ill_interface->illif_avl_by_ppa) == 0)
3372 3373 ill_delete_interface_type(ill->ill_ifptr);
3373 3374
3374 3375 return (EINVAL);
3375 3376 }
3376 3377 }
3377 3378
3378 3379 (void) sprintf(ill->ill_name, "%s%u", name, ill->ill_ppa);
3379 3380 ill->ill_name_length = mi_strlen(ill->ill_name) + 1;
3380 3381
3381 3382 (void) avl_find(&ill_interface->illif_avl_by_ppa, &ill->ill_ppa,
3382 3383 &where);
3383 3384 ill->ill_ifptr = ill_interface;
3384 3385 avl_insert(&ill_interface->illif_avl_by_ppa, ill, where);
3385 3386
3386 3387 ill_phyint_reinit(ill);
3387 3388 return (0);
3388 3389 }
3389 3390
/*
 * Initialize the per phyint ipsq used for serialization.  If `enter' is
 * set the calling thread becomes the ipsq writer immediately.  Returns
 * B_FALSE on allocation failure, B_TRUE otherwise.
 */
static boolean_t
ipsq_init(ill_t *ill, boolean_t enter)
{
	ipsq_t *ipsq;
	ipxop_t *ipx;

	if ((ipsq = kmem_zalloc(sizeof (ipsq_t), KM_NOSLEEP)) == NULL)
		return (B_FALSE);

	ill->ill_phyint->phyint_ipsq = ipsq;
	/* The ipsq starts out using its own embedded exclusive-op struct. */
	ipx = ipsq->ipsq_xop = &ipsq->ipsq_ownxop;
	ipx->ipx_ipsq = ipsq;
	ipsq->ipsq_next = ipsq;		/* singleton circular list */
	ipsq->ipsq_phyint = ill->ill_phyint;
	mutex_init(&ipsq->ipsq_lock, NULL, MUTEX_DEFAULT, 0);
	mutex_init(&ipx->ipx_lock, NULL, MUTEX_DEFAULT, 0);
	ipsq->ipsq_ipst = ill->ill_ipst;	/* No netstack_hold */
	if (enter) {
		/* Make the caller the current (sole) writer. */
		ipx->ipx_writer = curthread;
		ipx->ipx_forced = B_FALSE;
		ipx->ipx_reentry_cnt = 1;
#ifdef DEBUG
		ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
#endif
	}
	return (B_TRUE);
}
3418 3419
/*
 * Here we perform initialisation of the ill_t common to both regular
 * interface ILLs and the special loopback ILL created by ill_lookup_on_name.
 *
 * `q' is the device read queue (unused for loopback); `ipsq_enter' makes
 * the calling thread the ipsq writer on success.  Returns 0 or ENOMEM;
 * on failure everything allocated here is freed again.
 */
static int
ill_init_common(ill_t *ill, queue_t *q, boolean_t isv6, boolean_t is_loopback,
    boolean_t ipsq_enter)
{
	int count;
	uchar_t *frag_ptr;

	mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0);
	mutex_init(&ill->ill_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL);
	ill->ill_saved_ire_cnt = 0;

	if (is_loopback) {
		ill->ill_max_frag = isv6 ? ip_loopback_mtu_v6plus :
		    ip_loopback_mtuplus;
		/*
		 * No resolver here.
		 */
		ill->ill_net_type = IRE_LOOPBACK;
	} else {
		ill->ill_rq = q;
		ill->ill_wq = WR(q);
		/* UINT_MAX: no ppa assigned yet (see ill_alloc_ppa). */
		ill->ill_ppa = UINT_MAX;
	}

	ill->ill_isv6 = isv6;

	/*
	 * Allocate sufficient space to contain our fragment hash table and
	 * the device name.
	 */
	frag_ptr = (uchar_t *)mi_zalloc(ILL_FRAG_HASH_TBL_SIZE + 2 * LIFNAMSIZ);
	if (frag_ptr == NULL)
		return (ENOMEM);
	ill->ill_frag_ptr = frag_ptr;
	ill->ill_frag_free_num_pkts = 0;
	ill->ill_last_frag_clean_time = 0;
	/* The hash table sits first; the name area follows it. */
	ill->ill_frag_hash_tbl = (ipfb_t *)frag_ptr;
	ill->ill_name = (char *)(frag_ptr + ILL_FRAG_HASH_TBL_SIZE);
	for (count = 0; count < ILL_FRAG_HASH_TBL_COUNT; count++) {
		mutex_init(&ill->ill_frag_hash_tbl[count].ipfb_lock,
		    NULL, MUTEX_DEFAULT, NULL);
	}

	ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t));
	if (ill->ill_phyint == NULL) {
		mi_free(frag_ptr);
		return (ENOMEM);
	}

	mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0);
	if (isv6) {
		ill->ill_phyint->phyint_illv6 = ill;
	} else {
		ill->ill_phyint->phyint_illv4 = ill;
	}
	if (is_loopback) {
		phyint_flags_init(ill->ill_phyint, DL_LOOP);
	}

	list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node));

	ill_set_inputfn(ill);

	if (!ipsq_init(ill, ipsq_enter)) {
		/* Undo the two allocations above before failing. */
		mi_free(frag_ptr);
		mi_free(ill->ill_phyint);
		return (ENOMEM);
	}

	/* Frag queue limit stuff */
	ill->ill_frag_count = 0;
	ill->ill_ipf_gen = 0;

	rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL);
	mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL);
	ill->ill_global_timer = INFINITY;
	ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0;
	ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0;
	ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
	ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL;

	/*
	 * Initialize IPv6 configuration variables. The IP module is always
	 * opened as an IPv4 module. Instead tracking down the cases where
	 * it switches to do ipv6, we'll just initialize the IPv6 configuration
	 * here for convenience, this has no effect until the ill is set to do
	 * IPv6.
	 */
	ill->ill_reachable_time = ND_REACHABLE_TIME;
	ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT;
	ill->ill_max_buf = ND_MAX_Q;
	ill->ill_refcnt = 0;

	cv_init(&ill->ill_dlpi_capab_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&ill->ill_dlpi_capab_lock, NULL, MUTEX_DEFAULT, NULL);

	return (0);
}
3521 3519
3522 3520 /*
3523 3521 * ill_init is called by ip_open when a device control stream is opened.
3524 3522 * It does a few initializations, and shoots a DL_INFO_REQ message down
3525 3523 * to the driver. The response is later picked up in ip_rput_dlpi and
3526 3524 * used to set up default mechanisms for talking to the driver. (Always
3527 3525 * called as writer.)
3528 3526 *
3529 3527 * If this function returns error, ip_open will call ip_close which in
3530 3528 * turn will call ill_delete to clean up any memory allocated here that
3531 3529 * is not yet freed.
3532 3530 *
3533 3531 * Note: ill_ipst and ill_zoneid must be set before calling ill_init.
3534 3532 */
3535 3533 int
3536 3534 ill_init(queue_t *q, ill_t *ill)
3537 3535 {
3538 3536 int ret;
3539 3537 dl_info_req_t *dlir;
3540 3538 mblk_t *info_mp;
3541 3539
/*
 * Sized for the larger of the request and the ack; presumably the same
 * mblk can then carry the DL_INFO_ACK reply — TODO confirm against the
 * DLPI provider behavior.
 */
3542 3540 info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)),
3543 3541 BPRI_HI);
3544 3542 if (info_mp == NULL)
3545 3543 return (ENOMEM);
3546 3544
3547 3545 /*
3548 3546 * For now pretend this is a v4 ill. We need to set phyint_ill*
3549 3547 * at this point because of the following reason. If we can't
3550 3548 * enter the ipsq at some point and cv_wait, the writer that
3551 3549 * wakes us up tries to locate us using the list of all phyints
3552 3550 * in an ipsq and the ills from the phyint thru the phyint_ill*.
3553 3551 * If we don't set it now, we risk a missed wakeup.
3554 3552 */
3555 3553 if ((ret = ill_init_common(ill, q, B_FALSE, B_FALSE, B_TRUE)) != 0) {
3556 3554 freemsg(info_mp);
3557 3555 return (ret);
3558 3556 }
3559 3557
/* Link-layer subnet details are unknown until the DL_INFO_ACK arrives. */
3560 3558 ill->ill_state_flags |= ILL_LL_SUBNET_PENDING;
3561 3559
3562 3560 /* Send down the Info Request to the driver. */
3563 3561 info_mp->b_datap->db_type = M_PCPROTO;
3564 3562 dlir = (dl_info_req_t *)info_mp->b_rptr;
3565 3563 info_mp->b_wptr = (uchar_t *)&dlir[1];
3566 3564 dlir->dl_primitive = DL_INFO_REQ;
3567 3565
/* No DLPI exchange is outstanding yet. */
3568 3566 ill->ill_dlpi_pending = DL_PRIM_INVAL;
3569 3567
/* Turn on queue processing before pushing the first message down. */
3570 3568 qprocson(q);
3571 3569 ill_dlpi_send(ill, info_mp);
3572 3570
3573 3571 return (0);
3574 3572 }
3575 3573
3576 3574 /*
3577 3575 * ill_dls_info
3578 3576 * creates datalink socket info from the device.
3579 3577 */
3580 3578 int
3581 3579 ill_dls_info(struct sockaddr_dl *sdl, const ill_t *ill)
3582 3580 {
3583 3581 size_t len;
3584 3582
3585 3583 sdl->sdl_family = AF_LINK;
3586 3584 sdl->sdl_index = ill_get_upper_ifindex(ill);
3587 3585 sdl->sdl_type = ill->ill_type;
3588 3586 ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data));
3589 3587 len = strlen(sdl->sdl_data);
3590 3588 ASSERT(len < 256);
3591 3589 sdl->sdl_nlen = (uchar_t)len;
3592 3590 sdl->sdl_alen = ill->ill_phys_addr_length;
3593 3591 sdl->sdl_slen = 0;
3594 3592 if (ill->ill_phys_addr_length != 0 && ill->ill_phys_addr != NULL)
3595 3593 bcopy(ill->ill_phys_addr, &sdl->sdl_data[len], sdl->sdl_alen);
3596 3594
3597 3595 return (sizeof (struct sockaddr_dl));
3598 3596 }
3599 3597
3600 3598 /*
3601 3599 * ill_xarp_info
3602 3600 * creates xarp info from the device.
3603 3601 */
3604 3602 static int
3605 3603 ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill)
3606 3604 {
3607 3605 sdl->sdl_family = AF_LINK;
3608 3606 sdl->sdl_index = ill->ill_phyint->phyint_ifindex;
3609 3607 sdl->sdl_type = ill->ill_type;
3610 3608 ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data));
3611 3609 sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data);
3612 3610 sdl->sdl_alen = ill->ill_phys_addr_length;
3613 3611 sdl->sdl_slen = 0;
3614 3612 return (sdl->sdl_nlen);
3615 3613 }
3616 3614
3617 3615 static int
3618 3616 loopback_kstat_update(kstat_t *ksp, int rw)
3619 3617 {
3620 3618 kstat_named_t *kn;
3621 3619 netstackid_t stackid;
3622 3620 netstack_t *ns;
3623 3621 ip_stack_t *ipst;
3624 3622
3625 3623 if (ksp == NULL || ksp->ks_data == NULL)
3626 3624 return (EIO);
3627 3625
3628 3626 if (rw == KSTAT_WRITE)
3629 3627 return (EACCES);
3630 3628
3631 3629 kn = KSTAT_NAMED_PTR(ksp);
3632 3630 stackid = (zoneid_t)(uintptr_t)ksp->ks_private;
3633 3631
3634 3632 ns = netstack_find_by_stackid(stackid);
3635 3633 if (ns == NULL)
3636 3634 return (-1);
3637 3635
3638 3636 ipst = ns->netstack_ip;
3639 3637 if (ipst == NULL) {
3640 3638 netstack_rele(ns);
3641 3639 return (-1);
3642 3640 }
3643 3641 kn[0].value.ui32 = ipst->ips_loopback_packets;
3644 3642 kn[1].value.ui32 = ipst->ips_loopback_packets;
3645 3643 netstack_rele(ns);
3646 3644 return (0);
3647 3645 }
3648 3646
3649 3647 /*
3650 3648 * Has ifindex been plumbed already?
3651 3649 */
3652 3650 static boolean_t
3653 3651 phyint_exists(uint_t index, ip_stack_t *ipst)
3654 3652 {
3655 3653 ASSERT(index != 0);
3656 3654 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
3657 3655
3658 3656 return (avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
3659 3657 &index, NULL) != NULL);
3660 3658 }
3661 3659
3662 3660 /*
3663 3661 * Pick a unique ifindex.
3664 3662 * When the index counter passes IF_INDEX_MAX for the first time, the wrap
3665 3663 * flag is set so that next time ip_assign_ifindex() is called, it
3666 3664 * falls through and resets the index counter back to 1, the minimum value
3667 3665 * for the interface index. The logic below assumes that ips_ill_index
3668 3666 * can hold a value of IF_INDEX_MAX+1 without there being any loss
3669 3667 * (i.e. reset back to 0.)
3670 3668 */
3671 3669 boolean_t
3672 3670 ip_assign_ifindex(uint_t *indexp, ip_stack_t *ipst)
3673 3671 {
3674 3672 uint_t loops;
3675 3673
3676 3674 if (!ipst->ips_ill_index_wrap) {
3677 3675 *indexp = ipst->ips_ill_index++;
3678 3676 if (ipst->ips_ill_index > IF_INDEX_MAX) {
3679 3677 /*
3680 3678 * Reached the maximum ifindex value, set the wrap
3681 3679 * flag to indicate that it is no longer possible
3682 3680 * to assume that a given index is unallocated.
3683 3681 */
3684 3682 ipst->ips_ill_index_wrap = B_TRUE;
3685 3683 }
3686 3684 return (B_TRUE);
3687 3685 }
3688 3686
3689 3687 if (ipst->ips_ill_index > IF_INDEX_MAX)
3690 3688 ipst->ips_ill_index = 1;
3691 3689
3692 3690 /*
3693 3691 * Start reusing unused indexes. Note that we hold the ill_g_lock
3694 3692 * at this point and don't want to call any function that attempts
3695 3693 * to get the lock again.
3696 3694 */
3697 3695 for (loops = IF_INDEX_MAX; loops > 0; loops--) {
3698 3696 if (!phyint_exists(ipst->ips_ill_index, ipst)) {
3699 3697 /* found unused index - use it */
3700 3698 *indexp = ipst->ips_ill_index;
3701 3699 return (B_TRUE);
3702 3700 }
3703 3701
3704 3702 ipst->ips_ill_index++;
3705 3703 if (ipst->ips_ill_index > IF_INDEX_MAX)
3706 3704 ipst->ips_ill_index = 1;
3707 3705 }
3708 3706
3709 3707 /*
3710 3708 * all interface indicies are inuse.
3711 3709 */
3712 3710 return (B_FALSE);
3713 3711 }
3714 3712
3715 3713 /*
3716 3714 * Assign a unique interface index for the phyint.
3717 3715 */
3718 3716 static boolean_t
3719 3717 phyint_assign_ifindex(phyint_t *phyi, ip_stack_t *ipst)
3720 3718 {
/* Callers must pass a phyint that has not yet been assigned an index. */
3721 3719 ASSERT(phyi->phyint_ifindex == 0);
3722 3720 return (ip_assign_ifindex(&phyi->phyint_ifindex, ipst));
3723 3721 }
3724 3722
3725 3723 /*
3726 3724 * Initialize the flags on `phyi' as per the provided mactype.
3727 3725 */
3728 3726 static void
3729 3727 phyint_flags_init(phyint_t *phyi, t_uscalar_t mactype)
3730 3728 {
3731 3729 uint64_t flags = 0;
3732 3730
3733 3731 /*
3734 3732 * Initialize PHYI_RUNNING and PHYI_FAILED. For non-IPMP interfaces,
3735 3733 * we always presume the underlying hardware is working and set
3736 3734 * PHYI_RUNNING (if it's not, the driver will subsequently send a
3737 3735 * DL_NOTE_LINK_DOWN message). For IPMP interfaces, at initialization
3738 3736 * there are no active interfaces in the group so we set PHYI_FAILED.
3739 3737 */
3740 3738 if (mactype == SUNW_DL_IPMP)
3741 3739 flags |= PHYI_FAILED;
3742 3740 else
3743 3741 flags |= PHYI_RUNNING;
3744 3742
3745 3743 switch (mactype) {
3746 3744 case SUNW_DL_VNI:
3747 3745 flags |= PHYI_VIRTUAL;
3748 3746 break;
3749 3747 case SUNW_DL_IPMP:
3750 3748 flags |= PHYI_IPMP;
3751 3749 break;
3752 3750 case DL_LOOP:
3753 3751 flags |= (PHYI_LOOPBACK | PHYI_VIRTUAL);
3754 3752 break;
3755 3753 }
3756 3754
3757 3755 mutex_enter(&phyi->phyint_lock);
3758 3756 phyi->phyint_flags |= flags;
3759 3757 mutex_exit(&phyi->phyint_lock);
3760 3758 }
3761 3759
3762 3760 /*
3763 3761 * Return a pointer to the ill which matches the supplied name. Note that
3764 3762 * the ill name length includes the null termination character. (May be
3765 3763 * called as writer.)
3766 3764 * If do_alloc and the interface is "lo0" it will be automatically created.
3767 3765 * Cannot bump up reference on condemned ills. So dup detect can't be done
3768 3766 * using this func.
3769 3767 */
3770 3768 ill_t *
3771 3769 ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
3772 3770 boolean_t *did_alloc, ip_stack_t *ipst)
3773 3771 {
3774 3772 ill_t *ill;
3775 3773 ipif_t *ipif;
3776 3774 ipsq_t *ipsq;
3777 3775 kstat_named_t *kn;
3778 3776 boolean_t isloopback;
3779 3777 in6_addr_t ov6addr;
3780 3778
3781 3779 isloopback = mi_strcmp(name, ipif_loopback_name) == 0;
3782 3780
/* Common case: the interface already exists; look it up as a reader. */
3783 3781 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3784 3782 ill = ill_find_by_name(name, isv6, ipst);
3785 3783 rw_exit(&ipst->ips_ill_g_lock);
3786 3784 if (ill != NULL)
3787 3785 return (ill);
3788 3786
3789 3787 /*
3790 3788 * Couldn't find it. Does this happen to be a lookup for the
3791 3789 * loopback device and are we allowed to allocate it?
3792 3790 */
3793 3791 if (!isloopback || !do_alloc)
3794 3792 return (NULL);
3795 3793
/*
 * Re-check under the writer lock: another thread may have created the
 * loopback ill between our read-locked lookup and here.
 */
3796 3794 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
3797 3795 ill = ill_find_by_name(name, isv6, ipst);
3798 3796 if (ill != NULL) {
3799 3797 rw_exit(&ipst->ips_ill_g_lock);
3800 3798 return (ill);
3801 3799 }
3802 3800
3803 3801 /* Create the loopback device on demand */
3804 3802 ill = (ill_t *)(mi_alloc(sizeof (ill_t) +
3805 3803 sizeof (ipif_loopback_name), BPRI_MED));
3806 3804 if (ill == NULL)
3807 3805 goto done;
3808 3806
3809 3807 bzero(ill, sizeof (*ill));
3810 3808 ill->ill_ipst = ipst;
3811 3809 netstack_hold(ipst->ips_netstack);
3812 3810 /*
3813 3811 * For exclusive stacks we set the zoneid to zero
3814 3812 * to make IP operate as if in the global zone.
3815 3813 */
3816 3814 ill->ill_zoneid = GLOBAL_ZONEID;
3817 3815
3818 3816 if (ill_init_common(ill, NULL, isv6, B_TRUE, B_FALSE) != 0)
3819 3817 goto done;
3820 3818
3821 3819 if (!ill_allocate_mibs(ill))
3822 3820 goto done;
3823 3821
3824 3822 ill->ill_current_frag = ill->ill_max_frag;
3825 3823 ill->ill_mtu = ill->ill_max_frag; /* Initial value */
3826 3824 ill->ill_mc_mtu = ill->ill_mtu;
3827 3825 /*
3828 3826 * ipif_loopback_name can't be pointed at directly because its used
3829 3827 * by both the ipv4 and ipv6 interfaces. When the ill is removed
3830 3828 * from the glist, ill_glist_delete() sets the first character of
3831 3829 * ill_name to '\0'.
3832 3830 */
3833 3831 ill->ill_name = (char *)ill + sizeof (*ill);
3834 3832 (void) strcpy(ill->ill_name, ipif_loopback_name);
3835 3833 ill->ill_name_length = sizeof (ipif_loopback_name);
3836 3834 /* Set ill_dlpi_pending for ipsq_current_finish() to work properly */
3837 3835 ill->ill_dlpi_pending = DL_PRIM_INVAL;
3838 3836
3839 3837 ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE, B_TRUE, NULL);
3840 3838 if (ipif == NULL)
3841 3839 goto done;
3842 3840
3843 3841 ill->ill_flags = ILLF_MULTICAST;
3844 3842
3845 3843 ov6addr = ipif->ipif_v6lcl_addr;
3846 3844 /* Set up default loopback address and mask. */
3847 3845 if (!isv6) {
3848 3846 ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK);
3849 3847
3850 3848 IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr);
3851 3849 V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask);
3852 3850 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
3853 3851 ipif->ipif_v6subnet);
3854 3852 ill->ill_flags |= ILLF_IPV4;
3855 3853 } else {
3856 3854 ipif->ipif_v6lcl_addr = ipv6_loopback;
3857 3855 ipif->ipif_v6net_mask = ipv6_all_ones;
3858 3856 V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
3859 3857 ipif->ipif_v6subnet);
3860 3858 ill->ill_flags |= ILLF_IPV6;
3861 3859 }
3862 3860
3863 3861 /*
3864 3862 * Chain us in at the end of the ill list. hold the ill
3865 3863 * before we make it globally visible. 1 for the lookup.
3866 3864 */
3867 3865 ill_refhold(ill);
3868 3866
3869 3867 ipsq = ill->ill_phyint->phyint_ipsq;
3870 3868
3871 3869 if (ill_glist_insert(ill, "lo", isv6) != 0)
3872 3870 cmn_err(CE_PANIC, "cannot insert loopback interface");
3873 3871
3874 3872 /* Let SCTP know so that it can add this to its list */
3875 3873 sctp_update_ill(ill, SCTP_ILL_INSERT);
3876 3874
3877 3875 /*
3878 3876 * We have already assigned ipif_v6lcl_addr above, but we need to
3879 3877 * call sctp_update_ipif_addr() after SCTP_ILL_INSERT, which
3880 3878 * requires to be after ill_glist_insert() since we need the
3881 3879 * ill_index set. Pass on ipv6_loopback as the old address.
3882 3880 */
3883 3881 sctp_update_ipif_addr(ipif, ov6addr);
3884 3882
3885 3883 ip_rts_newaddrmsg(RTM_CHGADDR, 0, ipif, RTSQ_DEFAULT);
3886 3884
3887 3885 /*
3888 3886 * ill_glist_insert() -> ill_phyint_reinit() may have merged IPSQs.
3889 3887 * If so, free our original one.
3890 3888 */
3891 3889 if (ipsq != ill->ill_phyint->phyint_ipsq)
3892 3890 ipsq_delete(ipsq);
3893 3891
3894 3892 if (ipst->ips_loopback_ksp == NULL) {
3895 3893 /* Export loopback interface statistics */
3896 3894 ipst->ips_loopback_ksp = kstat_create_netstack("lo", 0,
3897 3895 ipif_loopback_name, "net",
3898 3896 KSTAT_TYPE_NAMED, 2, 0,
3899 3897 ipst->ips_netstack->netstack_stackid);
3900 3898 if (ipst->ips_loopback_ksp != NULL) {
3901 3899 ipst->ips_loopback_ksp->ks_update =
3902 3900 loopback_kstat_update;
3903 3901 kn = KSTAT_NAMED_PTR(ipst->ips_loopback_ksp);
3904 3902 kstat_named_init(&kn[0], "ipackets", KSTAT_DATA_UINT32);
3905 3903 kstat_named_init(&kn[1], "opackets", KSTAT_DATA_UINT32);
3906 3904 ipst->ips_loopback_ksp->ks_private =
3907 3905 (void *)(uintptr_t)ipst->ips_netstack->
3908 3906 netstack_stackid;
3909 3907 kstat_install(ipst->ips_loopback_ksp);
3910 3908 }
3911 3909 }
3912 3910
3913 3911 *did_alloc = B_TRUE;
3914 3912 rw_exit(&ipst->ips_ill_g_lock);
3915 3913 ill_nic_event_dispatch(ill, MAP_IPIF_ID(ill->ill_ipif->ipif_id),
3916 3914 NE_PLUMB, ill->ill_name, ill->ill_name_length);
3917 3915 return (ill);
/*
 * Allocation-failure cleanup: tear down whatever of the partially
 * constructed ill exists, in reverse order of construction.
 */
3918 3916 done:
3919 3917 if (ill != NULL) {
3920 3918 if (ill->ill_phyint != NULL) {
3921 3919 ipsq = ill->ill_phyint->phyint_ipsq;
3922 3920 if (ipsq != NULL) {
3923 3921 ipsq->ipsq_phyint = NULL;
3924 3922 ipsq_delete(ipsq);
3925 3923 }
3926 3924 mi_free(ill->ill_phyint);
3927 3925 }
3928 3926 ill_free_mib(ill);
3929 3927 if (ill->ill_ipst != NULL)
3930 3928 netstack_rele(ill->ill_ipst->ips_netstack);
3931 3929 mi_free(ill);
3932 3930 }
3933 3931 rw_exit(&ipst->ips_ill_g_lock);
3934 3932 return (NULL);
3935 3933 }
3936 3934
3937 3935 /*
3938 3936 * For IPP calls - use the ip_stack_t for global stack.
3939 3937 */
3940 3938 ill_t *
3941 3939 ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6)
3942 3940 {
3943 3941 ip_stack_t *ipst;
3944 3942 ill_t *ill;
3945 3943 netstack_t *ns;
3946 3944
3947 3945 ns = netstack_find_by_stackid(GLOBAL_NETSTACKID);
3948 3946
3949 3947 if ((ipst = ns->netstack_ip) == NULL) {
3950 3948 cmn_err(CE_WARN, "No ip_stack_t for zoneid zero!\n");
3951 3949 netstack_rele(ns);
3952 3950 return (NULL);
3953 3951 }
3954 3952
3955 3953 ill = ill_lookup_on_ifindex(index, isv6, ipst);
3956 3954 netstack_rele(ns);
3957 3955 return (ill);
3958 3956 }
3959 3957
3960 3958 /*
3961 3959 * Return a pointer to the ill which matches the index and IP version type.
3962 3960 */
3963 3961 ill_t *
3964 3962 ill_lookup_on_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst)
3965 3963 {
3966 3964 ill_t *ill;
3967 3965 phyint_t *phyi;
3968 3966
3969 3967 /*
3970 3968 * Indexes are stored in the phyint - a common structure
3971 3969 * to both IPv4 and IPv6.
3972 3970 */
3973 3971 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
3974 3972 phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
3975 3973 (void *) &index, NULL);
3976 3974 if (phyi != NULL) {
3977 3975 ill = isv6 ? phyi->phyint_illv6: phyi->phyint_illv4;
3978 3976 if (ill != NULL) {
3979 3977 mutex_enter(&ill->ill_lock);
3980 3978 if (!ILL_IS_CONDEMNED(ill)) {
3981 3979 ill_refhold_locked(ill);
3982 3980 mutex_exit(&ill->ill_lock);
3983 3981 rw_exit(&ipst->ips_ill_g_lock);
3984 3982 return (ill);
3985 3983 }
3986 3984 mutex_exit(&ill->ill_lock);
3987 3985 }
3988 3986 }
3989 3987 rw_exit(&ipst->ips_ill_g_lock);
3990 3988 return (NULL);
3991 3989 }
3992 3990
3993 3991 /*
3994 3992 * Verify whether or not an interface index is valid for the specified zoneid
3995 3993 * to transmit packets.
3996 3994 * It can be zero (meaning "reset") or an interface index assigned
3997 3995 * to a non-VNI interface. (We don't use VNI interface to send packets.)
3998 3996 */
3999 3997 boolean_t
4000 3998 ip_xmit_ifindex_valid(uint_t ifindex, zoneid_t zoneid, boolean_t isv6,
4001 3999 ip_stack_t *ipst)
4002 4000 {
4003 4001 ill_t *ill;
4004 4002
4005 4003 if (ifindex == 0)
4006 4004 return (B_TRUE);
4007 4005
4008 4006 ill = ill_lookup_on_ifindex_zoneid(ifindex, zoneid, isv6, ipst);
4009 4007 if (ill == NULL)
4010 4008 return (B_FALSE);
4011 4009 if (IS_VNI(ill)) {
4012 4010 ill_refrele(ill);
4013 4011 return (B_FALSE);
4014 4012 }
4015 4013 ill_refrele(ill);
4016 4014 return (B_TRUE);
4017 4015 }
4018 4016
4019 4017 /*
4020 4018 * Return the ifindex next in sequence after the passed in ifindex.
4021 4019 * If there is no next ifindex for the given protocol, return 0.
4022 4020 */
4023 4021 uint_t
4024 4022 ill_get_next_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst)
4025 4023 {
4026 4024 phyint_t *phyi;
4027 4025 phyint_t *phyi_initial;
4028 4026 uint_t ifindex;
4029 4027
4030 4028 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4031 4029
4032 4030 if (index == 0) {
4033 4031 phyi = avl_first(
4034 4032 &ipst->ips_phyint_g_list->phyint_list_avl_by_index);
4035 4033 } else {
4036 4034 phyi = phyi_initial = avl_find(
4037 4035 &ipst->ips_phyint_g_list->phyint_list_avl_by_index,
4038 4036 (void *) &index, NULL);
4039 4037 }
4040 4038
4041 4039 for (; phyi != NULL;
4042 4040 phyi = avl_walk(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
4043 4041 phyi, AVL_AFTER)) {
4044 4042 /*
4045 4043 * If we're not returning the first interface in the tree
4046 4044 * and we still haven't moved past the phyint_t that
4047 4045 * corresponds to index, avl_walk needs to be called again
4048 4046 */
4049 4047 if (!((index != 0) && (phyi == phyi_initial))) {
4050 4048 if (isv6) {
4051 4049 if ((phyi->phyint_illv6) &&
4052 4050 ILL_CAN_LOOKUP(phyi->phyint_illv6) &&
4053 4051 (phyi->phyint_illv6->ill_isv6 == 1))
4054 4052 break;
4055 4053 } else {
4056 4054 if ((phyi->phyint_illv4) &&
4057 4055 ILL_CAN_LOOKUP(phyi->phyint_illv4) &&
4058 4056 (phyi->phyint_illv4->ill_isv6 == 0))
4059 4057 break;
4060 4058 }
4061 4059 }
4062 4060 }
4063 4061
4064 4062 rw_exit(&ipst->ips_ill_g_lock);
4065 4063
4066 4064 if (phyi != NULL)
4067 4065 ifindex = phyi->phyint_ifindex;
4068 4066 else
4069 4067 ifindex = 0;
4070 4068
4071 4069 return (ifindex);
4072 4070 }
4073 4071
4074 4072 /*
4075 4073 * Return the ifindex for the named interface.
4076 4074 * If there is no next ifindex for the interface, return 0.
4077 4075 */
4078 4076 uint_t
4079 4077 ill_get_ifindex_by_name(char *name, ip_stack_t *ipst)
4080 4078 {
4081 4079 phyint_t *phyi;
4082 4080 avl_index_t where = 0;
4083 4081 uint_t ifindex;
4084 4082
4085 4083 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4086 4084
4087 4085 if ((phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
4088 4086 name, &where)) == NULL) {
4089 4087 rw_exit(&ipst->ips_ill_g_lock);
4090 4088 return (0);
4091 4089 }
4092 4090
4093 4091 ifindex = phyi->phyint_ifindex;
4094 4092
4095 4093 rw_exit(&ipst->ips_ill_g_lock);
4096 4094
4097 4095 return (ifindex);
4098 4096 }
4099 4097
4100 4098 /*
4101 4099 * Return the ifindex to be used by upper layer protocols for instance
4102 4100 * for IPV6_RECVPKTINFO. If IPMP this is the one for the upper ill.
4103 4101 */
4104 4102 uint_t
4105 4103 ill_get_upper_ifindex(const ill_t *ill)
4106 4104 {
4107 4105 if (IS_UNDER_IPMP(ill))
4108 4106 return (ipmp_ill_get_ipmp_ifindex(ill));
4109 4107 else
4110 4108 return (ill->ill_phyint->phyint_ifindex);
4111 4109 }
4112 4110
4113 4111
4114 4112 /*
4115 4113 * Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt
4116 4114 * that gives a running thread a reference to the ill. This reference must be
4117 4115 * released by the thread when it is done accessing the ill and related
4118 4116 * objects. ill_refcnt can not be used to account for static references
4119 4117 * such as other structures pointing to an ill. Callers must generally
4120 4118 * check whether an ill can be refheld by using ILL_CAN_LOOKUP macros
4121 4119 * or be sure that the ill is not being deleted or changing state before
4122 4120 * calling the refhold functions. A non-zero ill_refcnt ensures that the
4123 4121 * ill won't change any of its critical state such as address, netmask etc.
4124 4122 */
4125 4123 void
4126 4124 ill_refhold(ill_t *ill)
4127 4125 {
4128 4126 mutex_enter(&ill->ill_lock);
4129 4127 ill->ill_refcnt++;
4130 4128 ILL_TRACE_REF(ill);
4131 4129 mutex_exit(&ill->ill_lock);
4132 4130 }
4133 4131
/* As ill_refhold(), for callers that already hold ill_lock. */
4134 4132 void
4135 4133 ill_refhold_locked(ill_t *ill)
4136 4134 {
4137 4135 ASSERT(MUTEX_HELD(&ill->ill_lock));
4138 4136 ill->ill_refcnt++;
4139 4137 ILL_TRACE_REF(ill);
4140 4138 }
4141 4139
4142 4140 /* Returns true if we managed to get a refhold */
4143 4141 boolean_t
4144 4142 ill_check_and_refhold(ill_t *ill)
4145 4143 {
4146 4144 mutex_enter(&ill->ill_lock);
4147 4145 if (!ILL_IS_CONDEMNED(ill)) {
4148 4146 ill_refhold_locked(ill);
4149 4147 mutex_exit(&ill->ill_lock);
4150 4148 return (B_TRUE);
4151 4149 }
4152 4150 mutex_exit(&ill->ill_lock);
4153 4151 return (B_FALSE);
4154 4152 }
4155 4153
4156 4154 /*
4157 4155 * Must not be called while holding any locks. Otherwise if this is
4158 4156 * the last reference to be released, there is a chance of recursive mutex
4159 4157 * panic due to ill_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
4160 4158 * to restart an ioctl.
4161 4159 */
4162 4160 void
4163 4161 ill_refrele(ill_t *ill)
4164 4162 {
4165 4163 mutex_enter(&ill->ill_lock);
4166 4164 ASSERT(ill->ill_refcnt != 0);
4167 4165 ill->ill_refcnt--;
4168 4166 ILL_UNTRACE_REF(ill);
4169 4167 if (ill->ill_refcnt != 0) {
4170 4168 /* Every ire pointing to the ill adds 1 to ill_refcnt */
4171 4169 mutex_exit(&ill->ill_lock);
4172 4170 return;
4173 4171 }
4174 4172
/*
 * Last reference dropped: let the tail routine finish teardown work.
 * Note it is called with ill_lock held and releases it itself.
 */
4175 4173 /* Drops the ill_lock */
4176 4174 ipif_ill_refrele_tail(ill);
4177 4175 }
4178 4176
4179 4177 /*
4180 4178 * Obtain a weak reference count on the ill. This reference ensures the
4181 4179 * ill won't be freed, but the ill may change any of its critical state
4182 4180 * such as netmask, address etc. Returns an error if the ill has started
4183 4181 * closing.
4184 4182 */
4185 4183 boolean_t
4186 4184 ill_waiter_inc(ill_t *ill)
4187 4185 {
4188 4186 mutex_enter(&ill->ill_lock);
4189 4187 if (ill->ill_state_flags & ILL_CONDEMNED) {
4190 4188 mutex_exit(&ill->ill_lock);
4191 4189 return (B_FALSE);
4192 4190 }
4193 4191 ill->ill_waiters++;
4194 4192 mutex_exit(&ill->ill_lock);
4195 4193 return (B_TRUE);
4196 4194 }
4197 4195
4198 4196 void
4199 4197 ill_waiter_dcr(ill_t *ill)
4200 4198 {
4201 4199 mutex_enter(&ill->ill_lock);
4202 4200 ill->ill_waiters--;
4203 4201 if (ill->ill_waiters == 0)
4204 4202 cv_broadcast(&ill->ill_cv);
4205 4203 mutex_exit(&ill->ill_lock);
4206 4204 }
4207 4205
4208 4206 /*
4209 4207 * ip_ll_subnet_defaults is called when we get the DL_INFO_ACK back from the
4210 4208 * driver. We construct best guess defaults for lower level information that
4211 4209 * we need. If an interface is brought up without injection of any overriding
4212 4210 * information from outside, we have to be ready to go with these defaults.
4213 4211 * When we get the first DL_INFO_ACK (from ip_open() sending a DL_INFO_REQ)
4214 4212 * we primarily want the dl_provider_style.
4215 4213 * The subsequent DL_INFO_ACK is received after doing a DL_ATTACH and DL_BIND
4216 4214 * at which point we assume the other part of the information is valid.
4217 4215 */
4218 4216 void
4219 4217 ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp)
4220 4218 {
4221 4219 uchar_t *brdcst_addr;
4222 4220 uint_t brdcst_addr_length, phys_addr_length;
4223 4221 t_scalar_t sap_length;
4224 4222 dl_info_ack_t *dlia;
4225 4223 ip_m_t *ipm;
4226 4224 dl_qos_cl_sel1_t *sel1;
4227 4225 int min_mtu;
4228 4226
4229 4227 ASSERT(IAM_WRITER_ILL(ill));
4230 4228
4231 4229 /*
4232 4230 * Till the ill is fully up the ill is not globally visible.
4233 4231 * So no need for a lock.
4234 4232 */
4235 4233 dlia = (dl_info_ack_t *)mp->b_rptr;
4236 4234 ill->ill_mactype = dlia->dl_mac_type;
4237 4235
/* Unknown mactypes fall back to the DL_OTHER media entry. */
4238 4236 ipm = ip_m_lookup(dlia->dl_mac_type);
4239 4237 if (ipm == NULL) {
4240 4238 ipm = ip_m_lookup(DL_OTHER);
4241 4239 ASSERT(ipm != NULL);
4242 4240 }
4243 4241 ill->ill_media = ipm;
4244 4242
4245 4243 /*
4246 4244 * When the new DLPI stuff is ready we'll pull lengths
4247 4245 * from dlia.
4248 4246 */
4249 4247 if (dlia->dl_version == DL_VERSION_2) {
4250 4248 brdcst_addr_length = dlia->dl_brdcst_addr_length;
4251 4249 brdcst_addr = mi_offset_param(mp, dlia->dl_brdcst_addr_offset,
4252 4250 brdcst_addr_length);
4253 4251 if (brdcst_addr == NULL) {
4254 4252 brdcst_addr_length = 0;
4255 4253 }
4256 4254 sap_length = dlia->dl_sap_length;
4257 4255 phys_addr_length = dlia->dl_addr_length - ABS(sap_length);
4258 4256 ip1dbg(("ip: bcast_len %d, sap_len %d, phys_len %d\n",
4259 4257 brdcst_addr_length, sap_length, phys_addr_length));
4260 4258 } else {
/* Pre-DL_VERSION_2 provider: use 6-byte all-ones broadcast defaults. */
4261 4259 brdcst_addr_length = 6;
4262 4260 brdcst_addr = ip_six_byte_all_ones;
4263 4261 sap_length = -2;
4264 4262 phys_addr_length = brdcst_addr_length;
4265 4263 }
4266 4264
4267 4265 ill->ill_bcast_addr_length = brdcst_addr_length;
4268 4266 ill->ill_phys_addr_length = phys_addr_length;
4269 4267 ill->ill_sap_length = sap_length;
4270 4268
4271 4269 /*
4272 4270 * Synthetic DLPI types such as SUNW_DL_IPMP specify a zero SDU,
4273 4271 * but we must ensure a minimum IP MTU is used since other bits of
4274 4272 * IP will fly apart otherwise.
4275 4273 */
4276 4274 min_mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU;
4277 4275 ill->ill_max_frag = MAX(min_mtu, dlia->dl_max_sdu);
4278 4276 ill->ill_current_frag = ill->ill_max_frag;
4279 4277 ill->ill_mtu = ill->ill_max_frag;
4280 4278 ill->ill_mc_mtu = ill->ill_mtu; /* Overridden by DL_NOTE_SDU_SIZE2 */
4281 4279
4282 4280 ill->ill_type = ipm->ip_m_type;
4283 4281
/* First DL_INFO_ACK (from ip_open()): record the provider style. */
4284 4282 if (!ill->ill_dlpi_style_set) {
4285 4283 if (dlia->dl_provider_style == DL_STYLE2)
4286 4284 ill->ill_needs_attach = 1;
4287 4285
4288 4286 phyint_flags_init(ill->ill_phyint, ill->ill_mactype);
4289 4287
4290 4288 /*
4291 4289 * Allocate the first ipif on this ill. We don't delay it
4292 4290 * further as ioctl handling assumes at least one ipif exists.
4293 4291 *
4294 4292 * At this point we don't know whether the ill is v4 or v6.
4295 4293 * We will know this when the SIOCSLIFNAME happens and
4296 4294 * the correct value for ill_isv6 will be assigned in
4297 4295 * ipif_set_values(). We need to hold the ill lock and
4298 4296 * clear the ILL_LL_SUBNET_PENDING flag and atomically do
4299 4297 * the wakeup.
4300 4298 */
4301 4299 (void) ipif_allocate(ill, 0, IRE_LOCAL,
4302 4300 dlia->dl_provider_style != DL_STYLE2, B_TRUE, NULL);
4303 4301 mutex_enter(&ill->ill_lock);
4304 4302 ASSERT(ill->ill_dlpi_style_set == 0);
4305 4303 ill->ill_dlpi_style_set = 1;
4306 4304 ill->ill_state_flags &= ~ILL_LL_SUBNET_PENDING;
4307 4305 cv_broadcast(&ill->ill_cv);
4308 4306 mutex_exit(&ill->ill_lock);
4309 4307 freemsg(mp);
4310 4308 return;
4311 4309 }
4312 4310 ASSERT(ill->ill_ipif != NULL);
4313 4311 /*
4314 4312 * We know whether it is IPv4 or IPv6 now, as this is the
4315 4313 * second DL_INFO_ACK we are receiving in response to the
4316 4314 * DL_INFO_REQ sent in ipif_set_values.
4317 4315 */
4318 4316 ill->ill_sap = (ill->ill_isv6) ? ipm->ip_m_ipv6sap : ipm->ip_m_ipv4sap;
4319 4317 /*
4320 4318 * Clear all the flags that were set based on ill_bcast_addr_length
4321 4319 * and ill_phys_addr_length (in ipif_set_values) as these could have
4322 4320 * changed now and we need to re-evaluate.
4323 4321 */
4324 4322 ill->ill_flags &= ~(ILLF_MULTICAST | ILLF_NONUD | ILLF_NOARP);
4325 4323 ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT);
4326 4324
4327 4325 /*
4328 4326 * Free ill_bcast_mp as things could have changed now.
4329 4327 *
4330 4328 * NOTE: The IPMP meta-interface is special-cased because it starts
4331 4329 * with no underlying interfaces (and thus an unknown broadcast
4332 4330 * address length), but we enforce that an interface is broadcast-
4333 4331 * capable as part of allowing it to join a group.
4334 4332 */
4335 4333 if (ill->ill_bcast_addr_length == 0 && !IS_IPMP(ill)) {
4336 4334 if (ill->ill_bcast_mp != NULL)
4337 4335 freemsg(ill->ill_bcast_mp);
4338 4336 ill->ill_net_type = IRE_IF_NORESOLVER;
4339 4337
4340 4338 ill->ill_bcast_mp = ill_dlur_gen(NULL,
4341 4339 ill->ill_phys_addr_length,
4342 4340 ill->ill_sap,
4343 4341 ill->ill_sap_length);
4344 4342
4345 4343 if (ill->ill_isv6)
4346 4344 /*
4347 4345 * Note: xresolv interfaces will eventually need NOARP
4348 4346 * set here as well, but that will require those
4349 4347 * external resolvers to have some knowledge of
4350 4348 * that flag and act appropriately. Not to be changed
4351 4349 * at present.
4352 4350 */
4353 4351 ill->ill_flags |= ILLF_NONUD;
4354 4352 else
4355 4353 ill->ill_flags |= ILLF_NOARP;
4356 4354
4357 4355 if (ill->ill_mactype == SUNW_DL_VNI) {
4358 4356 ill->ill_ipif->ipif_flags |= IPIF_NOXMIT;
4359 4357 } else if (ill->ill_phys_addr_length == 0 ||
4360 4358 ill->ill_mactype == DL_IPV4 ||
4361 4359 ill->ill_mactype == DL_IPV6) {
4362 4360 /*
4363 4361 * The underlying link is point-to-point, so mark the
4364 4362 * interface as such. We can do IP multicast over
4365 4363 * such a link since it transmits all network-layer
4366 4364 * packets to the remote side the same way.
4367 4365 */
4368 4366 ill->ill_flags |= ILLF_MULTICAST;
4369 4367 ill->ill_ipif->ipif_flags |= IPIF_POINTOPOINT;
4370 4368 }
4371 4369 } else {
4372 4370 ill->ill_net_type = IRE_IF_RESOLVER;
4373 4371 if (ill->ill_bcast_mp != NULL)
4374 4372 freemsg(ill->ill_bcast_mp);
4375 4373 ill->ill_bcast_mp = ill_dlur_gen(brdcst_addr,
4376 4374 ill->ill_bcast_addr_length, ill->ill_sap,
4377 4375 ill->ill_sap_length);
4378 4376 /*
4379 4377 * Later detect lack of DLPI driver multicast
4380 4378 * capability by catching DL_ENABMULTI errors in
4381 4379 * ip_rput_dlpi.
4382 4380 */
4383 4381 ill->ill_flags |= ILLF_MULTICAST;
4384 4382 if (!ill->ill_isv6)
4385 4383 ill->ill_ipif->ipif_flags |= IPIF_BROADCAST;
4386 4384 }
4387 4385
4388 4386 /* For IPMP, PHYI_IPMP should already be set by phyint_flags_init() */
4389 4387 if (ill->ill_mactype == SUNW_DL_IPMP)
4390 4388 ASSERT(ill->ill_phyint->phyint_flags & PHYI_IPMP);
4391 4389
4392 4390 /* By default an interface does not support any CoS marking */
4393 4391 ill->ill_flags &= ~ILLF_COS_ENABLED;
4394 4392
4395 4393 /*
4396 4394 * If we get QoS information in DL_INFO_ACK, the device supports
4397 4395 * some form of CoS marking, set ILLF_COS_ENABLED.
4398 4396 */
4399 4397 sel1 = (dl_qos_cl_sel1_t *)mi_offset_param(mp, dlia->dl_qos_offset,
4400 4398 dlia->dl_qos_length);
4401 4399 if ((sel1 != NULL) && (sel1->dl_qos_type == DL_QOS_CL_SEL1)) {
4402 4400 ill->ill_flags |= ILLF_COS_ENABLED;
4403 4401 }
4404 4402
4405 4403 /* Clear any previous error indication. */
4406 4404 ill->ill_error = 0;
4407 4405 freemsg(mp);
4408 4406 }
4409 4407
4410 4408 /*
4411 4409 * Perform various checks to verify that an address would make sense as a
4412 4410 * local, remote, or subnet interface address.
4413 4411 */
4414 4412 static boolean_t
4415 4413 ip_addr_ok_v4(ipaddr_t addr, ipaddr_t subnet_mask)
4416 4414 {
4417 4415 ipaddr_t net_mask;
4418 4416
4419 4417 /*
4420 4418 * Don't allow all zeroes, or all ones, but allow
4421 4419 * all ones netmask.
4422 4420 */
4423 4421 if ((net_mask = ip_net_mask(addr)) == 0)
4424 4422 return (B_FALSE);
4425 4423 /* A given netmask overrides the "guess" netmask */
4426 4424 if (subnet_mask != 0)
4427 4425 net_mask = subnet_mask;
4428 4426 if ((net_mask != ~(ipaddr_t)0) && ((addr == (addr & net_mask)) ||
4429 4427 (addr == (addr | ~net_mask)))) {
4430 4428 return (B_FALSE);
4431 4429 }
4432 4430
4433 4431 /*
4434 4432 * Even if the netmask is all ones, we do not allow address to be
4435 4433 * 255.255.255.255
4436 4434 */
4437 4435 if (addr == INADDR_BROADCAST)
4438 4436 return (B_FALSE);
4439 4437
4440 4438 if (CLASSD(addr))
4441 4439 return (B_FALSE);
4442 4440
4443 4441 return (B_TRUE);
4444 4442 }
4445 4443
4446 4444 #define V6_IPIF_LINKLOCAL(p) \
4447 4445 IN6_IS_ADDR_LINKLOCAL(&(p)->ipif_v6lcl_addr)
4448 4446
4449 4447 /*
4450 4448 * Compare two given ipifs and check if the second one is better than
4451 4449 * the first one using the order of preference (not taking deprecated
 * into account) specified in ipif_lookup_multicast().
4453 4451 */
4454 4452 static boolean_t
4455 4453 ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6)
4456 4454 {
4457 4455 /* Check the least preferred first. */
4458 4456 if (IS_LOOPBACK(old_ipif->ipif_ill)) {
4459 4457 /* If both ipifs are the same, use the first one. */
4460 4458 if (IS_LOOPBACK(new_ipif->ipif_ill))
4461 4459 return (B_FALSE);
4462 4460 else
4463 4461 return (B_TRUE);
4464 4462 }
4465 4463
4466 4464 /* For IPv6, check for link local address. */
4467 4465 if (isv6 && V6_IPIF_LINKLOCAL(old_ipif)) {
4468 4466 if (IS_LOOPBACK(new_ipif->ipif_ill) ||
4469 4467 V6_IPIF_LINKLOCAL(new_ipif)) {
4470 4468 /* The second one is equal or less preferred. */
4471 4469 return (B_FALSE);
4472 4470 } else {
4473 4471 return (B_TRUE);
4474 4472 }
4475 4473 }
4476 4474
4477 4475 /* Then check for point to point interface. */
4478 4476 if (old_ipif->ipif_flags & IPIF_POINTOPOINT) {
4479 4477 if (IS_LOOPBACK(new_ipif->ipif_ill) ||
4480 4478 (isv6 && V6_IPIF_LINKLOCAL(new_ipif)) ||
4481 4479 (new_ipif->ipif_flags & IPIF_POINTOPOINT)) {
4482 4480 return (B_FALSE);
4483 4481 } else {
4484 4482 return (B_TRUE);
4485 4483 }
4486 4484 }
4487 4485
4488 4486 /* old_ipif is a normal interface, so no need to use the new one. */
4489 4487 return (B_FALSE);
4490 4488 }
4491 4489
4492 4490 /*
 * Find a multicast-capable ipif given an IP instance and zoneid.
 * The ipif must be up, and its ill must be multicast-capable, not
4495 4493 * condemned, not an underlying interface in an IPMP group, and
4496 4494 * not a VNI interface. Order of preference:
4497 4495 *
4498 4496 * 1a. normal
4499 4497 * 1b. normal, but deprecated
4500 4498 * 2a. point to point
4501 4499 * 2b. point to point, but deprecated
4502 4500 * 3a. link local
4503 4501 * 3b. link local, but deprecated
4504 4502 * 4. loopback.
4505 4503 */
static ipif_t *
ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6)
{
	ill_t	*ill;
	ill_walk_context_t ctx;
	ipif_t	*ipif;
	ipif_t	*saved_ipif = NULL;	/* best non-deprecated candidate */
	ipif_t	*dep_ipif = NULL;	/* best deprecated candidate */

	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	if (isv6)
		ill = ILL_START_WALK_V6(&ctx, ipst);
	else
		ill = ILL_START_WALK_V4(&ctx, ipst);

	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		mutex_enter(&ill->ill_lock);
		/* Skip ills that can never satisfy this lookup. */
		if (IS_VNI(ill) || IS_UNDER_IPMP(ill) ||
		    ILL_IS_CONDEMNED(ill) ||
		    !(ill->ill_flags & ILLF_MULTICAST)) {
			mutex_exit(&ill->ill_lock);
			continue;
		}
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			/* The ipif must be visible from `zoneid'. */
			if (zoneid != ipif->ipif_zoneid &&
			    zoneid != ALL_ZONES &&
			    ipif->ipif_zoneid != ALL_ZONES) {
				continue;
			}
			if (!(ipif->ipif_flags & IPIF_UP) ||
			    IPIF_IS_CONDEMNED(ipif)) {
				continue;
			}

			/*
			 * Found one candidate. If it is deprecated,
			 * remember it in dep_ipif. If it is not deprecated,
			 * remember it in saved_ipif.
			 */
			if (ipif->ipif_flags & IPIF_DEPRECATED) {
				if (dep_ipif == NULL) {
					dep_ipif = ipif;
				} else if (ipif_comp_multi(dep_ipif, ipif,
				    isv6)) {
					/*
					 * If the previous dep_ipif does not
					 * belong to the same ill, we've done
					 * a ipif_refhold() on it. So we need
					 * to release it.
					 */
					if (dep_ipif->ipif_ill != ill)
						ipif_refrele(dep_ipif);
					dep_ipif = ipif;
				}
				continue;
			}
			if (saved_ipif == NULL) {
				saved_ipif = ipif;
			} else {
				if (ipif_comp_multi(saved_ipif, ipif, isv6)) {
					if (saved_ipif->ipif_ill != ill)
						ipif_refrele(saved_ipif);
					saved_ipif = ipif;
				}
			}
		}
		/*
		 * Before going to the next ill, do a ipif_refhold() on the
		 * saved ones.  (While we stay on the same ill, ill_lock
		 * keeps the candidates alive without a hold.)
		 */
		if (saved_ipif != NULL && saved_ipif->ipif_ill == ill)
			ipif_refhold_locked(saved_ipif);
		if (dep_ipif != NULL && dep_ipif->ipif_ill == ill)
			ipif_refhold_locked(dep_ipif);
		mutex_exit(&ill->ill_lock);
	}
	rw_exit(&ipst->ips_ill_g_lock);

	/*
	 * If we have only the saved_ipif, return it. But if we have both
	 * saved_ipif and dep_ipif, check to see which one is better.
	 * Exactly one reference is returned held; the loser is released.
	 */
	if (saved_ipif != NULL) {
		if (dep_ipif != NULL) {
			if (ipif_comp_multi(saved_ipif, dep_ipif, isv6)) {
				ipif_refrele(saved_ipif);
				return (dep_ipif);
			} else {
				ipif_refrele(dep_ipif);
				return (saved_ipif);
			}
		}
		return (saved_ipif);
	} else {
		return (dep_ipif);
	}
}
4604 4602
4605 4603 ill_t *
4606 4604 ill_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6)
4607 4605 {
4608 4606 ipif_t *ipif;
4609 4607 ill_t *ill;
4610 4608
4611 4609 ipif = ipif_lookup_multicast(ipst, zoneid, isv6);
4612 4610 if (ipif == NULL)
4613 4611 return (NULL);
4614 4612
4615 4613 ill = ipif->ipif_ill;
4616 4614 ill_refhold(ill);
4617 4615 ipif_refrele(ipif);
4618 4616 return (ill);
4619 4617 }
4620 4618
4621 4619 /*
4622 4620 * This function is called when an application does not specify an interface
4623 4621 * to be used for multicast traffic (joining a group/sending data). It
4624 4622 * calls ire_lookup_multi() to look for an interface route for the
4625 4623 * specified multicast group. Doing this allows the administrator to add
4626 4624 * prefix routes for multicast to indicate which interface to be used for
4627 4625 * multicast traffic in the above scenario. The route could be for all
4628 4626 * multicast (224.0/4), for a single multicast group (a /32 route) or
4629 4627 * anything in between. If there is no such multicast route, we just find
4630 4628 * any multicast capable interface and return it. The returned ipif
4631 4629 * is refhold'ed.
4632 4630 *
4633 4631 * We support MULTIRT and RTF_SETSRC on the multicast routes added to the
4634 4632 * unicast table. This is used by CGTP.
4635 4633 */
4636 4634 ill_t *
4637 4635 ill_lookup_group_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst,
4638 4636 boolean_t *multirtp, ipaddr_t *setsrcp)
4639 4637 {
4640 4638 ill_t *ill;
4641 4639
4642 4640 ill = ire_lookup_multi_ill_v4(group, zoneid, ipst, multirtp, setsrcp);
4643 4641 if (ill != NULL)
4644 4642 return (ill);
4645 4643
4646 4644 return (ill_lookup_multicast(ipst, zoneid, B_FALSE));
4647 4645 }
4648 4646
4649 4647 /*
4650 4648 * Look for an ipif with the specified interface address and destination.
4651 4649 * The destination address is used only for matching point-to-point interfaces.
4652 4650 */
ipif_t *
ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, ip_stack_t *ipst)
{
	ipif_t	*ipif;
	ill_t	*ill;
	ill_walk_context_t ctx;

	/*
	 * First match all the point-to-point interfaces
	 * before looking at non-point-to-point interfaces.
	 * This is done to avoid returning non-point-to-point
	 * ipif instead of unnumbered point-to-point ipif.
	 */
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V4(&ctx, ipst);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		mutex_enter(&ill->ill_lock);
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			/* Allow the ipif to be down */
			if ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
			    (ipif->ipif_lcl_addr == if_addr) &&
			    (ipif->ipif_pp_dst_addr == dst)) {
				if (!IPIF_IS_CONDEMNED(ipif)) {
					/* Returned held; caller refreles. */
					ipif_refhold_locked(ipif);
					mutex_exit(&ill->ill_lock);
					rw_exit(&ipst->ips_ill_g_lock);
					return (ipif);
				}
			}
		}
		mutex_exit(&ill->ill_lock);
	}
	rw_exit(&ipst->ips_ill_g_lock);

	/* lookup the ipif based on interface address */
	ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, ipst);
	ASSERT(ipif == NULL || !ipif->ipif_isv6);
	return (ipif);
}
4693 4691
4694 4692 /*
4695 4693 * Common function for ipif_lookup_addr() and ipif_lookup_addr_exact().
4696 4694 */
static ipif_t *
ipif_lookup_addr_common(ipaddr_t addr, ill_t *match_ill, uint32_t match_flags,
    zoneid_t zoneid, ip_stack_t *ipst)
{
	ipif_t	*ipif;
	ill_t	*ill;
	boolean_t ptp = B_FALSE;
	ill_walk_context_t ctx;
	boolean_t match_illgrp = (match_flags & IPIF_MATCH_ILLGRP);
	boolean_t no_duplicate = (match_flags & IPIF_MATCH_NONDUP);

	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	/*
	 * Repeat twice, first based on local addresses and
	 * next time for pointopoint.
	 */
repeat:
	ill = ILL_START_WALK_V4(&ctx, ipst);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		/* Restrict to match_ill (or its illgrp) when requested. */
		if (match_ill != NULL && ill != match_ill &&
		    (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) {
			continue;
		}
		mutex_enter(&ill->ill_lock);
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			/* The ipif must be visible from `zoneid'. */
			if (zoneid != ALL_ZONES &&
			    zoneid != ipif->ipif_zoneid &&
			    ipif->ipif_zoneid != ALL_ZONES)
				continue;

			/* IPIF_MATCH_NONDUP callers only want usable ipifs */
			if (no_duplicate && !(ipif->ipif_flags & IPIF_UP))
				continue;

			/* Allow the ipif to be down */
			if ((!ptp && (ipif->ipif_lcl_addr == addr) &&
			    ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
			    (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) &&
			    (ipif->ipif_pp_dst_addr == addr))) {
				if (!IPIF_IS_CONDEMNED(ipif)) {
					/* Returned held; caller refreles. */
					ipif_refhold_locked(ipif);
					mutex_exit(&ill->ill_lock);
					rw_exit(&ipst->ips_ill_g_lock);
					return (ipif);
				}
			}
		}
		mutex_exit(&ill->ill_lock);
	}

	/* If we already did the ptp case, then we are done */
	if (ptp) {
		rw_exit(&ipst->ips_ill_g_lock);
		return (NULL);
	}
	ptp = B_TRUE;
	goto repeat;
}
4755 4753
4756 4754 /*
4757 4755 * Lookup an ipif with the specified address. For point-to-point links we
4758 4756 * look for matches on either the destination address or the local address,
4759 4757 * but we skip the local address check if IPIF_UNNUMBERED is set. If the
4760 4758 * `match_ill' argument is non-NULL, the lookup is restricted to that ill
4761 4759 * (or illgrp if `match_ill' is in an IPMP group).
4762 4760 */
4763 4761 ipif_t *
4764 4762 ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid,
4765 4763 ip_stack_t *ipst)
4766 4764 {
4767 4765 return (ipif_lookup_addr_common(addr, match_ill, IPIF_MATCH_ILLGRP,
4768 4766 zoneid, ipst));
4769 4767 }
4770 4768
4771 4769 /*
4772 4770 * Lookup an ipif with the specified address. Similar to ipif_lookup_addr,
4773 4771 * except that we will only return an address if it is not marked as
4774 4772 * IPIF_DUPLICATE
4775 4773 */
4776 4774 ipif_t *
4777 4775 ipif_lookup_addr_nondup(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid,
4778 4776 ip_stack_t *ipst)
4779 4777 {
4780 4778 return (ipif_lookup_addr_common(addr, match_ill,
4781 4779 (IPIF_MATCH_ILLGRP | IPIF_MATCH_NONDUP),
4782 4780 zoneid, ipst));
4783 4781 }
4784 4782
4785 4783 /*
4786 4784 * Special abbreviated version of ipif_lookup_addr() that doesn't match
4787 4785 * `match_ill' across the IPMP group. This function is only needed in some
4788 4786 * corner-cases; almost everything should use ipif_lookup_addr().
4789 4787 */
4790 4788 ipif_t *
4791 4789 ipif_lookup_addr_exact(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst)
4792 4790 {
4793 4791 ASSERT(match_ill != NULL);
4794 4792 return (ipif_lookup_addr_common(addr, match_ill, 0, ALL_ZONES,
4795 4793 ipst));
4796 4794 }
4797 4795
4798 4796 /*
 * Look for an ipif with the specified address. For point-to-point links
 * we look for matches on either the destination address or the local
4801 4799 * address, but we ignore the check on the local address if IPIF_UNNUMBERED
4802 4800 * is set.
4803 4801 * If the `match_ill' argument is non-NULL, the lookup is restricted to that
4804 4802 * ill (or illgrp if `match_ill' is in an IPMP group).
4805 4803 * Return the zoneid for the ipif which matches. ALL_ZONES if no match.
4806 4804 */
4807 4805 zoneid_t
4808 4806 ipif_lookup_addr_zoneid(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst)
4809 4807 {
4810 4808 zoneid_t zoneid;
4811 4809 ipif_t *ipif;
4812 4810 ill_t *ill;
4813 4811 boolean_t ptp = B_FALSE;
4814 4812 ill_walk_context_t ctx;
4815 4813
4816 4814 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
4817 4815 /*
4818 4816 * Repeat twice, first based on local addresses and
4819 4817 * next time for pointopoint.
4820 4818 */
4821 4819 repeat:
4822 4820 ill = ILL_START_WALK_V4(&ctx, ipst);
4823 4821 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
4824 4822 if (match_ill != NULL && ill != match_ill &&
4825 4823 !IS_IN_SAME_ILLGRP(ill, match_ill)) {
4826 4824 continue;
4827 4825 }
4828 4826 mutex_enter(&ill->ill_lock);
4829 4827 for (ipif = ill->ill_ipif; ipif != NULL;
4830 4828 ipif = ipif->ipif_next) {
4831 4829 /* Allow the ipif to be down */
4832 4830 if ((!ptp && (ipif->ipif_lcl_addr == addr) &&
4833 4831 ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) ||
4834 4832 (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) &&
4835 4833 (ipif->ipif_pp_dst_addr == addr)) &&
4836 4834 !(ipif->ipif_state_flags & IPIF_CONDEMNED)) {
4837 4835 zoneid = ipif->ipif_zoneid;
4838 4836 mutex_exit(&ill->ill_lock);
4839 4837 rw_exit(&ipst->ips_ill_g_lock);
4840 4838 /*
4841 4839 * If ipif_zoneid was ALL_ZONES then we have
4842 4840 * a trusted extensions shared IP address.
4843 4841 * In that case GLOBAL_ZONEID works to send.
4844 4842 */
4845 4843 if (zoneid == ALL_ZONES)
4846 4844 zoneid = GLOBAL_ZONEID;
4847 4845 return (zoneid);
4848 4846 }
4849 4847 }
4850 4848 mutex_exit(&ill->ill_lock);
4851 4849 }
4852 4850
4853 4851 /* If we already did the ptp case, then we are done */
4854 4852 if (ptp) {
4855 4853 rw_exit(&ipst->ips_ill_g_lock);
4856 4854 return (ALL_ZONES);
4857 4855 }
4858 4856 ptp = B_TRUE;
4859 4857 goto repeat;
4860 4858 }
4861 4859
4862 4860 /*
4863 4861 * Look for an ipif that matches the specified remote address i.e. the
4864 4862 * ipif that would receive the specified packet.
4865 4863 * First look for directly connected interfaces and then do a recursive
4866 4864 * IRE lookup and pick the first ipif corresponding to the source address in the
4867 4865 * ire.
4868 4866 * Returns: held ipif
4869 4867 *
4870 4868 * This is only used for ICMP_ADDRESS_MASK_REQUESTs
4871 4869 */
ipif_t *
ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid)
{
	ipif_t	*ipif;

	ASSERT(!ill->ill_isv6);

	/*
	 * Someone could be changing this ipif currently or change it
	 * after we return this. Thus a few packets could use the old
	 * values. However structure updates/creates (ire, ilg, ilm etc)
	 * will atomically be updated or cleaned up with the new value
	 * Thus we don't need a lock to check the flags or other attrs below.
	 */
	mutex_enter(&ill->ill_lock);
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if (IPIF_IS_CONDEMNED(ipif))
			continue;
		/* The ipif must be visible from `zoneid'. */
		if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid &&
		    ipif->ipif_zoneid != ALL_ZONES)
			continue;
		/* Allow the ipif to be down */
		if (ipif->ipif_flags & IPIF_POINTOPOINT) {
			/* Match the remote (or unnumbered local) address. */
			if ((ipif->ipif_pp_dst_addr == addr) ||
			    (!(ipif->ipif_flags & IPIF_UNNUMBERED) &&
			    ipif->ipif_lcl_addr == addr)) {
				ipif_refhold_locked(ipif);
				mutex_exit(&ill->ill_lock);
				return (ipif);
			}
		} else if (ipif->ipif_subnet == (addr & ipif->ipif_net_mask)) {
			/* `addr' is directly connected via this ipif. */
			ipif_refhold_locked(ipif);
			mutex_exit(&ill->ill_lock);
			return (ipif);
		}
	}
	mutex_exit(&ill->ill_lock);
	/*
	 * For a remote destination it isn't possible to nail down a particular
	 * ipif.
	 */

	/* Pick the first interface */
	ipif = ipif_get_next_ipif(NULL, ill);
	return (ipif);
}
4918 4916
4919 4917 /*
4920 4918 * This func does not prevent refcnt from increasing. But if
4921 4919 * the caller has taken steps to that effect, then this func
4922 4920 * can be used to determine whether the ill has become quiescent
4923 4921 */
4924 4922 static boolean_t
4925 4923 ill_is_quiescent(ill_t *ill)
4926 4924 {
4927 4925 ipif_t *ipif;
4928 4926
4929 4927 ASSERT(MUTEX_HELD(&ill->ill_lock));
4930 4928
4931 4929 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
4932 4930 if (ipif->ipif_refcnt != 0)
4933 4931 return (B_FALSE);
4934 4932 }
4935 4933 if (!ILL_DOWN_OK(ill) || ill->ill_refcnt != 0) {
4936 4934 return (B_FALSE);
4937 4935 }
4938 4936 return (B_TRUE);
4939 4937 }
4940 4938
4941 4939 boolean_t
4942 4940 ill_is_freeable(ill_t *ill)
4943 4941 {
4944 4942 ipif_t *ipif;
4945 4943
4946 4944 ASSERT(MUTEX_HELD(&ill->ill_lock));
4947 4945
4948 4946 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
4949 4947 if (ipif->ipif_refcnt != 0) {
4950 4948 return (B_FALSE);
4951 4949 }
4952 4950 }
4953 4951 if (!ILL_FREE_OK(ill) || ill->ill_refcnt != 0) {
4954 4952 return (B_FALSE);
4955 4953 }
4956 4954 return (B_TRUE);
4957 4955 }
4958 4956
4959 4957 /*
4960 4958 * This func does not prevent refcnt from increasing. But if
4961 4959 * the caller has taken steps to that effect, then this func
4962 4960 * can be used to determine whether the ipif has become quiescent
4963 4961 */
4964 4962 static boolean_t
4965 4963 ipif_is_quiescent(ipif_t *ipif)
4966 4964 {
4967 4965 ill_t *ill;
4968 4966
4969 4967 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
4970 4968
4971 4969 if (ipif->ipif_refcnt != 0)
4972 4970 return (B_FALSE);
4973 4971
4974 4972 ill = ipif->ipif_ill;
4975 4973 if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 ||
4976 4974 ill->ill_logical_down) {
4977 4975 return (B_TRUE);
4978 4976 }
4979 4977
4980 4978 /* This is the last ipif going down or being deleted on this ill */
4981 4979 if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0) {
4982 4980 return (B_FALSE);
4983 4981 }
4984 4982
4985 4983 return (B_TRUE);
4986 4984 }
4987 4985
4988 4986 /*
4989 4987 * return true if the ipif can be destroyed: the ipif has to be quiescent
4990 4988 * with zero references from ire/ilm to it.
4991 4989 */
static boolean_t
ipif_is_freeable(ipif_t *ipif)
{
	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
	/* ipif 0 is never freed; it goes away only with its ill. */
	ASSERT(ipif->ipif_id != 0);
	return (ipif->ipif_refcnt == 0);
}
4999 4997
5000 4998 /*
5001 4999 * The ipif/ill/ire has been refreled. Do the tail processing.
5002 5000 * Determine if the ipif or ill in question has become quiescent and if so
5003 5001 * wakeup close and/or restart any queued pending ioctl that is waiting
5004 5002 * for the ipif_down (or ill_down)
5005 5003 */
void
ipif_ill_refrele_tail(ill_t *ill)
{
	mblk_t	*mp;
	conn_t	*connp;
	ipsq_t	*ipsq;
	ipxop_t	*ipx;
	ipif_t	*ipif;
	dl_notify_ind_t *dlindp;

	ASSERT(MUTEX_HELD(&ill->ill_lock));

	if ((ill->ill_state_flags & ILL_CONDEMNED) && ill_is_freeable(ill)) {
		/* ip_modclose() may be waiting */
		cv_broadcast(&ill->ill_cv);
	}

	ipsq = ill->ill_phyint->phyint_ipsq;
	mutex_enter(&ipsq->ipsq_lock);
	ipx = ipsq->ipsq_xop;
	mutex_enter(&ipx->ipx_lock);
	if (ipx->ipx_waitfor == 0)	/* no one's waiting; bail */
		goto unlock;

	ASSERT(ipx->ipx_pending_mp != NULL && ipx->ipx_pending_ipif != NULL);

	ipif = ipx->ipx_pending_ipif;
	if (ipif->ipif_ill != ill)	/* wait is for another ill; bail */
		goto unlock;

	/* Has the condition being waited for been reached yet? */
	switch (ipx->ipx_waitfor) {
	case IPIF_DOWN:
		if (!ipif_is_quiescent(ipif))
			goto unlock;
		break;
	case IPIF_FREE:
		if (!ipif_is_freeable(ipif))
			goto unlock;
		break;
	case ILL_DOWN:
		if (!ill_is_quiescent(ill))
			goto unlock;
		break;
	case ILL_FREE:
		/*
		 * ILL_FREE is only for loopback; normal ill teardown waits
		 * synchronously in ip_modclose() without using ipx_waitfor,
		 * handled by the cv_broadcast() at the top of this function.
		 */
		if (!ill_is_freeable(ill))
			goto unlock;
		break;
	default:
		cmn_err(CE_PANIC, "ipsq: %p unknown ipx_waitfor %d\n",
		    (void *)ipsq, ipx->ipx_waitfor);
	}

	ill_refhold_locked(ill);	/* for qwriter_ip() call below */
	mutex_exit(&ipx->ipx_lock);
	mp = ipsq_pending_mp_get(ipsq, &connp);
	mutex_exit(&ipsq->ipsq_lock);
	mutex_exit(&ill->ill_lock);

	ASSERT(mp != NULL);
	/*
	 * NOTE: all of the qwriter_ip() calls below use CUR_OP since
	 * we can only get here when the current operation decides it
	 * needs to quiesce via ipsq_pending_mp_add().
	 */
	switch (mp->b_datap->db_type) {
	case M_PCPROTO:
	case M_PROTO:
		/*
		 * For now, only DL_NOTIFY_IND messages can use this facility.
		 */
		dlindp = (dl_notify_ind_t *)mp->b_rptr;
		ASSERT(dlindp->dl_primitive == DL_NOTIFY_IND);

		switch (dlindp->dl_notification) {
		case DL_NOTE_PHYS_ADDR:
			qwriter_ip(ill, ill->ill_rq, mp,
			    ill_set_phys_addr_tail, CUR_OP, B_TRUE);
			return;
		case DL_NOTE_REPLUMB:
			qwriter_ip(ill, ill->ill_rq, mp,
			    ill_replumb_tail, CUR_OP, B_TRUE);
			return;
		default:
			ASSERT(0);
			ill_refrele(ill);
		}
		break;

	case M_ERROR:
	case M_HANGUP:
		qwriter_ip(ill, ill->ill_rq, mp, ipif_all_down_tail, CUR_OP,
		    B_TRUE);
		return;

	case M_IOCTL:
	case M_IOCDATA:
		qwriter_ip(ill, (connp != NULL ? CONNP_TO_WQ(connp) :
		    ill->ill_wq), mp, ip_reprocess_ioctl, CUR_OP, B_TRUE);
		return;

	default:
		cmn_err(CE_PANIC, "ipif_ill_refrele_tail mp %p "
		    "db_type %d\n", (void *)mp, mp->b_datap->db_type);
	}
	return;
unlock:
	mutex_exit(&ipsq->ipsq_lock);
	mutex_exit(&ipx->ipx_lock);
	mutex_exit(&ill->ill_lock);
}
5121 5119
5122 5120 #ifdef DEBUG
5123 5121 /* Reuse trace buffer from beginning (if reached the end) and record trace */
5124 5122 static void
5125 5123 th_trace_rrecord(th_trace_t *th_trace)
5126 5124 {
5127 5125 tr_buf_t *tr_buf;
5128 5126 uint_t lastref;
5129 5127
5130 5128 lastref = th_trace->th_trace_lastref;
5131 5129 lastref++;
5132 5130 if (lastref == TR_BUF_MAX)
5133 5131 lastref = 0;
5134 5132 th_trace->th_trace_lastref = lastref;
5135 5133 tr_buf = &th_trace->th_trbuf[lastref];
5136 5134 tr_buf->tr_time = ddi_get_lbolt();
5137 5135 tr_buf->tr_depth = getpcstack(tr_buf->tr_stack, TR_STACK_DEPTH);
5138 5136 }
5139 5137
5140 5138 static void
5141 5139 th_trace_free(void *value)
5142 5140 {
5143 5141 th_trace_t *th_trace = value;
5144 5142
5145 5143 ASSERT(th_trace->th_refcnt == 0);
5146 5144 kmem_free(th_trace, sizeof (*th_trace));
5147 5145 }
5148 5146
5149 5147 /*
5150 5148 * Find or create the per-thread hash table used to track object references.
5151 5149 * The ipst argument is NULL if we shouldn't allocate.
5152 5150 *
5153 5151 * Accesses per-thread data, so there's no need to lock here.
5154 5152 */
static mod_hash_t *
th_trace_gethash(ip_stack_t *ipst)
{
	th_hash_t *thh;

	/* Allocate lazily on first use by this thread (if ipst != NULL). */
	if ((thh = tsd_get(ip_thread_data)) == NULL && ipst != NULL) {
		mod_hash_t *mh;
		char name[256];
		size_t objsize, rshift;
		int retv;

		if ((thh = kmem_alloc(sizeof (*thh), KM_NOSLEEP)) == NULL)
			return (NULL);
		(void) snprintf(name, sizeof (name), "th_trace_%p",
		    (void *)curthread);

		/*
		 * We use mod_hash_create_extended here rather than the more
		 * obvious mod_hash_create_ptrhash because the latter has a
		 * hard-coded KM_SLEEP, and we'd prefer to fail rather than
		 * block.
		 */
		objsize = MAX(MAX(sizeof (ill_t), sizeof (ipif_t)),
		    MAX(sizeof (ire_t), sizeof (ncec_t)));
		/* Shift off the low pointer bits common to all traced types */
		rshift = highbit(objsize);
		mh = mod_hash_create_extended(name, 64, mod_hash_null_keydtor,
		    th_trace_free, mod_hash_byptr, (void *)rshift,
		    mod_hash_ptrkey_cmp, KM_NOSLEEP);
		if (mh == NULL) {
			kmem_free(thh, sizeof (*thh));
			return (NULL);
		}
		thh->thh_hash = mh;
		thh->thh_ipst = ipst;
		/*
		 * We trace ills, ipifs, ires, and nces. All of these are
		 * per-IP-stack, so the lock on the thread list is as well.
		 */
		rw_enter(&ip_thread_rwlock, RW_WRITER);
		list_insert_tail(&ip_thread_list, thh);
		rw_exit(&ip_thread_rwlock);
		retv = tsd_set(ip_thread_data, thh);
		ASSERT(retv == 0);
	}
	return (thh != NULL ? thh->thh_hash : NULL);
}
5201 5199
5202 5200 boolean_t
5203 5201 th_trace_ref(const void *obj, ip_stack_t *ipst)
5204 5202 {
5205 5203 th_trace_t *th_trace;
5206 5204 mod_hash_t *mh;
5207 5205 mod_hash_val_t val;
5208 5206
5209 5207 if ((mh = th_trace_gethash(ipst)) == NULL)
5210 5208 return (B_FALSE);
5211 5209
5212 5210 /*
5213 5211 * Attempt to locate the trace buffer for this obj and thread.
5214 5212 * If it does not exist, then allocate a new trace buffer and
5215 5213 * insert into the hash.
5216 5214 */
5217 5215 if (mod_hash_find(mh, (mod_hash_key_t)obj, &val) == MH_ERR_NOTFOUND) {
5218 5216 th_trace = kmem_zalloc(sizeof (th_trace_t), KM_NOSLEEP);
5219 5217 if (th_trace == NULL)
5220 5218 return (B_FALSE);
5221 5219
5222 5220 th_trace->th_id = curthread;
5223 5221 if (mod_hash_insert(mh, (mod_hash_key_t)obj,
5224 5222 (mod_hash_val_t)th_trace) != 0) {
5225 5223 kmem_free(th_trace, sizeof (th_trace_t));
5226 5224 return (B_FALSE);
5227 5225 }
5228 5226 } else {
5229 5227 th_trace = (th_trace_t *)val;
5230 5228 }
5231 5229
5232 5230 ASSERT(th_trace->th_refcnt >= 0 &&
5233 5231 th_trace->th_refcnt < TR_BUF_MAX - 1);
5234 5232
5235 5233 th_trace->th_refcnt++;
5236 5234 th_trace_rrecord(th_trace);
5237 5235 return (B_TRUE);
5238 5236 }
5239 5237
5240 5238 /*
5241 5239 * For the purpose of tracing a reference release, we assume that global
5242 5240 * tracing is always on and that the same thread initiated the reference hold
5243 5241 * is releasing.
5244 5242 */
5245 5243 void
5246 5244 th_trace_unref(const void *obj)
5247 5245 {
5248 5246 int retv;
5249 5247 mod_hash_t *mh;
5250 5248 th_trace_t *th_trace;
5251 5249 mod_hash_val_t val;
5252 5250
5253 5251 mh = th_trace_gethash(NULL);
5254 5252 retv = mod_hash_find(mh, (mod_hash_key_t)obj, &val);
5255 5253 ASSERT(retv == 0);
5256 5254 th_trace = (th_trace_t *)val;
5257 5255
5258 5256 ASSERT(th_trace->th_refcnt > 0);
5259 5257 th_trace->th_refcnt--;
5260 5258 th_trace_rrecord(th_trace);
5261 5259 }
5262 5260
5263 5261 /*
5264 5262 * If tracing has been disabled, then we assume that the reference counts are
5265 5263 * now useless, and we clear them out before destroying the entries.
5266 5264 */
5267 5265 void
5268 5266 th_trace_cleanup(const void *obj, boolean_t trace_disable)
5269 5267 {
5270 5268 th_hash_t *thh;
5271 5269 mod_hash_t *mh;
5272 5270 mod_hash_val_t val;
5273 5271 th_trace_t *th_trace;
5274 5272 int retv;
5275 5273
5276 5274 rw_enter(&ip_thread_rwlock, RW_READER);
5277 5275 for (thh = list_head(&ip_thread_list); thh != NULL;
5278 5276 thh = list_next(&ip_thread_list, thh)) {
5279 5277 if (mod_hash_find(mh = thh->thh_hash, (mod_hash_key_t)obj,
5280 5278 &val) == 0) {
5281 5279 th_trace = (th_trace_t *)val;
5282 5280 if (trace_disable)
5283 5281 th_trace->th_refcnt = 0;
5284 5282 retv = mod_hash_destroy(mh, (mod_hash_key_t)obj);
5285 5283 ASSERT(retv == 0);
5286 5284 }
5287 5285 }
5288 5286 rw_exit(&ip_thread_rwlock);
5289 5287 }
5290 5288
5291 5289 void
5292 5290 ipif_trace_ref(ipif_t *ipif)
5293 5291 {
5294 5292 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
5295 5293
5296 5294 if (ipif->ipif_trace_disable)
5297 5295 return;
5298 5296
5299 5297 if (!th_trace_ref(ipif, ipif->ipif_ill->ill_ipst)) {
5300 5298 ipif->ipif_trace_disable = B_TRUE;
5301 5299 ipif_trace_cleanup(ipif);
5302 5300 }
5303 5301 }
5304 5302
5305 5303 void
5306 5304 ipif_untrace_ref(ipif_t *ipif)
5307 5305 {
5308 5306 ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
5309 5307
5310 5308 if (!ipif->ipif_trace_disable)
5311 5309 th_trace_unref(ipif);
5312 5310 }
5313 5311
5314 5312 void
5315 5313 ill_trace_ref(ill_t *ill)
5316 5314 {
5317 5315 ASSERT(MUTEX_HELD(&ill->ill_lock));
5318 5316
5319 5317 if (ill->ill_trace_disable)
5320 5318 return;
5321 5319
5322 5320 if (!th_trace_ref(ill, ill->ill_ipst)) {
5323 5321 ill->ill_trace_disable = B_TRUE;
5324 5322 ill_trace_cleanup(ill);
5325 5323 }
5326 5324 }
5327 5325
5328 5326 void
5329 5327 ill_untrace_ref(ill_t *ill)
5330 5328 {
5331 5329 ASSERT(MUTEX_HELD(&ill->ill_lock));
5332 5330
5333 5331 if (!ill->ill_trace_disable)
5334 5332 th_trace_unref(ill);
5335 5333 }
5336 5334
5337 5335 /*
5338 5336 * Called when ipif is unplumbed or when memory alloc fails. Note that on
5339 5337 * failure, ipif_trace_disable is set.
5340 5338 */
static void
ipif_trace_cleanup(const ipif_t *ipif)
{
	/* Drop all per-thread trace state recorded against this ipif. */
	th_trace_cleanup(ipif, ipif->ipif_trace_disable);
}
5346 5344
/*
 * Called when ill is unplumbed or when memory alloc fails. Note that on
 * failure, ill_trace_disable is set.  Discards all reference-trace
 * records held for this ill; when tracing was disabled due to an
 * allocation failure, the per-thread refcounts are zeroed as well.
 */
static void
ill_trace_cleanup(const ill_t *ill)
{
	th_trace_cleanup(ill, ill->ill_trace_disable);
}
5356 5354 #endif /* DEBUG */
5357 5355
/*
 * Place a reference on 'ipif'.  The caller must already hold the owning
 * ill's ill_lock, which serializes updates to ipif_refcnt.
 */
void
ipif_refhold_locked(ipif_t *ipif)
{
	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));
	ipif->ipif_refcnt++;
	IPIF_TRACE_REF(ipif);	/* record the hold when DEBUG tracing is on */
}
5365 5363
5366 5364 void
5367 5365 ipif_refhold(ipif_t *ipif)
5368 5366 {
5369 5367 ill_t *ill;
5370 5368
5371 5369 ill = ipif->ipif_ill;
5372 5370 mutex_enter(&ill->ill_lock);
5373 5371 ipif->ipif_refcnt++;
5374 5372 IPIF_TRACE_REF(ipif);
5375 5373 mutex_exit(&ill->ill_lock);
5376 5374 }
5377 5375
/*
 * Must not be called while holding any locks. Otherwise if this is
 * the last reference to be released there is a chance of recursive mutex
 * panic due to ipif_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying
 * to restart an ioctl.
 */
void
ipif_refrele(ipif_t *ipif)
{
	ill_t	*ill;

	ill = ipif->ipif_ill;

	mutex_enter(&ill->ill_lock);
	ASSERT(ipif->ipif_refcnt != 0);
	ipif->ipif_refcnt--;
	IPIF_UNTRACE_REF(ipif);
	/* Other references remain; nothing further to do. */
	if (ipif->ipif_refcnt != 0) {
		mutex_exit(&ill->ill_lock);
		return;
	}

	/*
	 * Last reference on this ipif: hand off to the common tail
	 * routine, which takes ownership of (and drops) the ill_lock.
	 */
	/* Drops the ill_lock */
	ipif_ill_refrele_tail(ill);
}
5403 5401
5404 5402 ipif_t *
5405 5403 ipif_get_next_ipif(ipif_t *curr, ill_t *ill)
5406 5404 {
5407 5405 ipif_t *ipif;
5408 5406
5409 5407 mutex_enter(&ill->ill_lock);
5410 5408 for (ipif = (curr == NULL ? ill->ill_ipif : curr->ipif_next);
5411 5409 ipif != NULL; ipif = ipif->ipif_next) {
5412 5410 if (IPIF_IS_CONDEMNED(ipif))
5413 5411 continue;
5414 5412 ipif_refhold_locked(ipif);
5415 5413 mutex_exit(&ill->ill_lock);
5416 5414 return (ipif);
5417 5415 }
5418 5416 mutex_exit(&ill->ill_lock);
5419 5417 return (NULL);
5420 5418 }
5421 5419
5422 5420 /*
5423 5421 * TODO: make this table extendible at run time
5424 5422 * Return a pointer to the mac type info for 'mac_type'
5425 5423 */
5426 5424 static ip_m_t *
5427 5425 ip_m_lookup(t_uscalar_t mac_type)
5428 5426 {
5429 5427 ip_m_t *ipm;
5430 5428
5431 5429 for (ipm = ip_m_tbl; ipm < A_END(ip_m_tbl); ipm++)
5432 5430 if (ipm->ip_m_mac_type == mac_type)
5433 5431 return (ipm);
5434 5432 return (NULL);
5435 5433 }
5436 5434
5437 5435 /*
5438 5436 * Make a link layer address from the multicast IP address *addr.
5439 5437 * To form the link layer address, invoke the ip_m_v*mapping function
5440 5438 * associated with the link-layer type.
5441 5439 */
5442 5440 void
5443 5441 ip_mcast_mapping(ill_t *ill, uchar_t *addr, uchar_t *hwaddr)
5444 5442 {
5445 5443 ip_m_t *ipm;
5446 5444
5447 5445 if (ill->ill_net_type == IRE_IF_NORESOLVER)
5448 5446 return;
5449 5447
5450 5448 ASSERT(addr != NULL);
5451 5449
5452 5450 ipm = ip_m_lookup(ill->ill_mactype);
5453 5451 if (ipm == NULL ||
5454 5452 (ill->ill_isv6 && ipm->ip_m_v6mapping == NULL) ||
5455 5453 (!ill->ill_isv6 && ipm->ip_m_v4mapping == NULL)) {
5456 5454 ip0dbg(("no mapping for ill %s mactype 0x%x\n",
5457 5455 ill->ill_name, ill->ill_mactype));
5458 5456 return;
5459 5457 }
5460 5458 if (ill->ill_isv6)
5461 5459 (*ipm->ip_m_v6mapping)(ill, addr, hwaddr);
5462 5460 else
5463 5461 (*ipm->ip_m_v4mapping)(ill, addr, hwaddr);
5464 5462 }
5465 5463
5466 5464 /*
5467 5465 * Returns B_FALSE if the IPv4 netmask pointed by `mask' is non-contiguous.
5468 5466 * Otherwise returns B_TRUE.
5469 5467 *
5470 5468 * The netmask can be verified to be contiguous with 32 shifts and or
5471 5469 * operations. Take the contiguous mask (in host byte order) and compute
5472 5470 * mask | mask << 1 | mask << 2 | ... | mask << 31
5473 5471 * the result will be the same as the 'mask' for contiguous mask.
5474 5472 */
5475 5473 static boolean_t
5476 5474 ip_contiguous_mask(uint32_t mask)
5477 5475 {
5478 5476 uint32_t m = mask;
5479 5477 int i;
5480 5478
5481 5479 for (i = 1; i < 32; i++)
5482 5480 m |= (mask << i);
5483 5481
5484 5482 return (m == mask);
5485 5483 }
5486 5484
/*
 * ip_rt_add is called to add an IPv4 route to the forwarding table.
 * ill is passed in to associate it with the correct interface; it may be
 * NULL when the caller supplied no RTA_IFP sockaddr.
 * If ire_arg is set, then we return the held IRE in that location
 * (the caller must ire_refrele() it when done).
 *
 * dst_addr/mask/gw_addr describe the route.  src_addr is consulted only
 * when RTF_SETSRC is present in flags.  sp carries optional gateway
 * security attributes (Trusted Extensions); it is only valid for
 * RTF_GATEWAY routes.  ioctl_msg is B_TRUE on the legacy SIOCADDRT path,
 * in which case an RTM_OLDADD routing-socket message is generated after
 * the route is installed.
 *
 * Returns 0 on success, otherwise an errno (e.g. ENOTSUP for a
 * non-contiguous netmask, ENETUNREACH for an unreachable or zero
 * gateway, EEXIST for a duplicate route, EINVAL, ENOMEM).
 */
int
ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
    ipaddr_t src_addr, int flags, ill_t *ill, ire_t **ire_arg,
    boolean_t ioctl_msg, struct rtsa_s *sp, ip_stack_t *ipst, zoneid_t zoneid)
{
	ire_t	*ire, *nire;
	ire_t	*gw_ire = NULL;
	ipif_t	*ipif = NULL;
	uint_t	type;
	int	match_flags = MATCH_IRE_TYPE;
	tsol_gc_t *gc = NULL;
	tsol_gcgrp_t *gcgrp = NULL;
	boolean_t gcgrp_xtraref = B_FALSE;
	boolean_t cgtp_broadcast;
	boolean_t unbound = B_FALSE;

	ip1dbg(("ip_rt_add:"));

	if (ire_arg != NULL)
		*ire_arg = NULL;

	/* disallow non-contiguous netmasks */
	if (!ip_contiguous_mask(ntohl(mask)))
		return (ENOTSUP);

	/*
	 * If this is the case of RTF_HOST being set, then we set the netmask
	 * to all ones (regardless if one was supplied).
	 */
	if (flags & RTF_HOST)
		mask = IP_HOST_MASK;

	/*
	 * Prevent routes with a zero gateway from being created (since
	 * interfaces can currently be plumbed and brought up with no
	 * assigned address).
	 */
	if (gw_addr == 0)
		return (ENETUNREACH);
	/*
	 * Get the ipif, if any, corresponding to the gw_addr.
	 * If -ifp was specified we restrict ourselves to the ill, otherwise
	 * we match on the gateway and destination to handle unnumbered pt-pt
	 * interfaces.
	 */
	if (ill != NULL)
		ipif = ipif_lookup_addr(gw_addr, ill, ALL_ZONES, ipst);
	else
		ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst);
	if (ipif != NULL) {
		/* Reject a gateway that resolves to a virtual (VNI) ill. */
		if (IS_VNI(ipif->ipif_ill)) {
			ipif_refrele(ipif);
			return (EINVAL);
		}
	}

	/*
	 * GateD will attempt to create routes with a loopback interface
	 * address as the gateway and with RTF_GATEWAY set.  We allow
	 * these routes to be added, but create them as interface routes
	 * since the gateway is an interface address.
	 */
	if ((ipif != NULL) && (ipif->ipif_ire_type == IRE_LOOPBACK)) {
		flags &= ~RTF_GATEWAY;
		if (gw_addr == INADDR_LOOPBACK && dst_addr == INADDR_LOOPBACK &&
		    mask == IP_HOST_MASK) {
			ire = ire_ftable_lookup_v4(dst_addr, 0, 0, IRE_LOOPBACK,
			    NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst,
			    NULL);
			if (ire != NULL) {
				ire_refrele(ire);
				ipif_refrele(ipif);
				return (EEXIST);
			}
			ip1dbg(("ip_rt_add: 0x%p creating IRE 0x%x"
			    "for 0x%x\n", (void *)ipif,
			    ipif->ipif_ire_type,
			    ntohl(ipif->ipif_lcl_addr)));
			ire = ire_create(
			    (uchar_t *)&dst_addr,	/* dest address */
			    (uchar_t *)&mask,		/* mask */
			    NULL,			/* no gateway */
			    ipif->ipif_ire_type,	/* LOOPBACK */
			    ipif->ipif_ill,
			    zoneid,
			    (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE : 0,
			    NULL,
			    ipst);

			if (ire == NULL) {
				ipif_refrele(ipif);
				return (ENOMEM);
			}
			/* src address assigned by the caller? */
			if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
				ire->ire_setsrc_addr = src_addr;

			nire = ire_add(ire);
			if (nire == NULL) {
				/*
				 * In the event of failure, ire_add() will have
				 * already deleted the ire in question, so there
				 * is no need to do that here.
				 */
				ipif_refrele(ipif);
				return (ENOMEM);
			}
			/*
			 * Check if it was a duplicate entry. This handles
			 * the case of two racing route adds for the same route
			 */
			if (nire != ire) {
				ASSERT(nire->ire_identical_ref > 1);
				ire_delete(nire);
				ire_refrele(nire);
				ipif_refrele(ipif);
				return (EEXIST);
			}
			ire = nire;
			goto save_ire;
		}
	}

	/*
	 * The routes for multicast with CGTP are quite special in that
	 * the gateway is the local interface address, yet RTF_GATEWAY
	 * is set. We turn off RTF_GATEWAY to provide compatibility with
	 * this undocumented and unusual use of multicast routes.
	 */
	if ((flags & RTF_MULTIRT) && ipif != NULL)
		flags &= ~RTF_GATEWAY;

	/*
	 * Traditionally, interface routes are ones where RTF_GATEWAY isn't set
	 * and the gateway address provided is one of the system's interface
	 * addresses.  By using the routing socket interface and supplying an
	 * RTA_IFP sockaddr with an interface index, an alternate method of
	 * specifying an interface route to be created is available which uses
	 * the interface index that specifies the outgoing interface rather than
	 * the address of an outgoing interface (which may not be able to
	 * uniquely identify an interface).  When coupled with the RTF_GATEWAY
	 * flag, routes can be specified which not only specify the next-hop to
	 * be used when routing to a certain prefix, but also which outgoing
	 * interface should be used.
	 *
	 * Previously, interfaces would have unique addresses assigned to them
	 * and so the address assigned to a particular interface could be used
	 * to identify a particular interface.  One exception to this was the
	 * case of an unnumbered interface (where IPIF_UNNUMBERED was set).
	 *
	 * With the advent of IPv6 and its link-local addresses, this
	 * restriction was relaxed and interfaces could share addresses between
	 * themselves.  In fact, typically all of the link-local interfaces on
	 * an IPv6 node or router will have the same link-local address.  In
	 * order to differentiate between these interfaces, the use of an
	 * interface index is necessary and this index can be carried inside a
	 * RTA_IFP sockaddr (which is actually a sockaddr_dl).  One restriction
	 * of using the interface index, however, is that all of the ipif's that
	 * are part of an ill have the same index and so the RTA_IFP sockaddr
	 * cannot be used to differentiate between ipif's (or logical
	 * interfaces) that belong to the same ill (physical interface).
	 *
	 * For example, in the following case involving IPv4 interfaces and
	 * logical interfaces
	 *
	 *	192.0.2.32	255.255.255.224	192.0.2.33	U	if0
	 *	192.0.2.32	255.255.255.224	192.0.2.34	U	if0
	 *	192.0.2.32	255.255.255.224	192.0.2.35	U	if0
	 *
	 * the ipif's corresponding to each of these interface routes can be
	 * uniquely identified by the "gateway" (actually interface address).
	 *
	 * In this case involving multiple IPv6 default routes to a particular
	 * link-local gateway, the use of RTA_IFP is necessary to specify which
	 * default route is of interest:
	 *
	 *	default		fe80::123:4567:89ab:cdef	U	if0
	 *	default		fe80::123:4567:89ab:cdef	U	if1
	 */

	/* RTF_GATEWAY not set */
	if (!(flags & RTF_GATEWAY)) {
		if (sp != NULL) {
			ip2dbg(("ip_rt_add: gateway security attributes "
			    "cannot be set with interface route\n"));
			if (ipif != NULL)
				ipif_refrele(ipif);
			return (EINVAL);
		}

		/*
		 * Whether or not ill (RTA_IFP) is set, we require that
		 * the gateway is one of our local addresses.
		 */
		if (ipif == NULL)
			return (ENETUNREACH);

		/*
		 * We use MATCH_IRE_ILL here. If the caller specified an
		 * interface (from the RTA_IFP sockaddr) we use it, otherwise
		 * we use the ill derived from the gateway address.
		 * We can always match the gateway address since we record it
		 * in ire_gateway_addr.
		 * We don't allow RTA_IFP to specify a different ill than the
		 * one matching the ipif to make sure we can delete the route.
		 */
		match_flags |= MATCH_IRE_GW | MATCH_IRE_ILL;
		if (ill == NULL) {
			ill = ipif->ipif_ill;
		} else if (ill != ipif->ipif_ill) {
			ipif_refrele(ipif);
			return (EINVAL);
		}

		/*
		 * We check for an existing entry at this point.
		 *
		 * Since a netmask isn't passed in via the ioctl interface
		 * (SIOCADDRT), we don't check for a matching netmask in that
		 * case.
		 */
		if (!ioctl_msg)
			match_flags |= MATCH_IRE_MASK;
		ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr,
		    IRE_INTERFACE, ill, ALL_ZONES, NULL, match_flags, 0, ipst,
		    NULL);
		if (ire != NULL) {
			ire_refrele(ire);
			ipif_refrele(ipif);
			return (EEXIST);
		}

		/*
		 * Some software (for example, GateD and Sun Cluster) attempts
		 * to create (what amount to) IRE_PREFIX routes with the
		 * loopback address as the gateway.  This is primarily done to
		 * set up prefixes with the RTF_REJECT flag set (for example,
		 * when generating aggregate routes.)
		 *
		 * If the IRE type (as defined by ill->ill_net_type) would be
		 * IRE_LOOPBACK, then we map the request into a
		 * IRE_IF_NORESOLVER. We also OR in the RTF_BLACKHOLE flag as
		 * these interface routes, by definition, can only be that.
		 *
		 * Needless to say, the real IRE_LOOPBACK is NOT created by this
		 * routine, but rather using ire_create() directly.
		 *
		 */
		type = ill->ill_net_type;
		if (type == IRE_LOOPBACK) {
			type = IRE_IF_NORESOLVER;
			flags |= RTF_BLACKHOLE;
		}

		/*
		 * Create a copy of the IRE_IF_NORESOLVER or
		 * IRE_IF_RESOLVER with the modified address, netmask, and
		 * gateway.
		 */
		ire = ire_create(
		    (uchar_t *)&dst_addr,
		    (uint8_t *)&mask,
		    (uint8_t *)&gw_addr,
		    type,
		    ill,
		    zoneid,
		    flags,
		    NULL,
		    ipst);
		if (ire == NULL) {
			ipif_refrele(ipif);
			return (ENOMEM);
		}

		/* src address assigned by the caller? */
		if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
			ire->ire_setsrc_addr = src_addr;

		nire = ire_add(ire);
		if (nire == NULL) {
			/*
			 * In the event of failure, ire_add() will have
			 * already deleted the ire in question, so there
			 * is no need to do that here.
			 */
			ipif_refrele(ipif);
			return (ENOMEM);
		}
		/*
		 * Check if it was a duplicate entry. This handles
		 * the case of two racing route adds for the same route
		 */
		if (nire != ire) {
			ire_delete(nire);
			ire_refrele(nire);
			ipif_refrele(ipif);
			return (EEXIST);
		}
		ire = nire;
		goto save_ire;
	}

	/*
	 * Get an interface IRE for the specified gateway.
	 * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the
	 * gateway, it is currently unreachable and we fail the request
	 * accordingly.  We reject any RTF_GATEWAY routes where the gateway
	 * is an IRE_LOCAL or IRE_LOOPBACK.
	 * If RTA_IFP was specified we look on that particular ill.
	 */
	if (ill != NULL)
		match_flags |= MATCH_IRE_ILL;

	/* Check whether the gateway is reachable. */
again:
	type = IRE_INTERFACE | IRE_LOCAL | IRE_LOOPBACK;
	if (flags & RTF_INDIRECT)
		type |= IRE_OFFLINK;

	gw_ire = ire_ftable_lookup_v4(gw_addr, 0, 0, type, ill,
	    ALL_ZONES, NULL, match_flags, 0, ipst, NULL);
	if (gw_ire == NULL) {
		/*
		 * With IPMP, we allow host routes to influence in.mpathd's
		 * target selection.  However, if the test addresses are on
		 * their own network, the above lookup will fail since the
		 * underlying IRE_INTERFACEs are marked hidden.  So allow
		 * hidden test IREs to be found and try again.
		 */
		if (!(match_flags & MATCH_IRE_TESTHIDDEN)) {
			match_flags |= MATCH_IRE_TESTHIDDEN;
			goto again;
		}
		if (ipif != NULL)
			ipif_refrele(ipif);
		return (ENETUNREACH);
	}
	if (gw_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) {
		ire_refrele(gw_ire);
		if (ipif != NULL)
			ipif_refrele(ipif);
		return (ENETUNREACH);
	}

	/*
	 * No RTA_IFP and no RTF_INDIRECT: the route is not bound to an
	 * interface.  With strict source multihoming, pin it to the ill
	 * of the gateway's interface IRE anyway.
	 */
	if (ill == NULL && !(flags & RTF_INDIRECT)) {
		unbound = B_TRUE;
		if (ipst->ips_ip_strict_src_multihoming > 0)
			ill = gw_ire->ire_ill;
	}

	/*
	 * We create one of three types of IREs as a result of this request
	 * based on the netmask.  A netmask of all ones (which is automatically
	 * assumed when RTF_HOST is set) results in an IRE_HOST being created.
	 * An all zeroes netmask implies a default route so an IRE_DEFAULT is
	 * created.  Otherwise, an IRE_PREFIX route is created for the
	 * destination prefix.
	 */
	if (mask == IP_HOST_MASK)
		type = IRE_HOST;
	else if (mask == 0)
		type = IRE_DEFAULT;
	else
		type = IRE_PREFIX;

	/* check for a duplicate entry */
	ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill,
	    ALL_ZONES, NULL, match_flags | MATCH_IRE_MASK | MATCH_IRE_GW,
	    0, ipst, NULL);
	if (ire != NULL) {
		if (ipif != NULL)
			ipif_refrele(ipif);
		ire_refrele(gw_ire);
		ire_refrele(ire);
		return (EEXIST);
	}

	/* Security attribute exists */
	if (sp != NULL) {
		tsol_gcgrp_addr_t ga;

		/* find or create the gateway credentials group */
		ga.ga_af = AF_INET;
		IN6_IPADDR_TO_V4MAPPED(gw_addr, &ga.ga_addr);

		/* we hold reference to it upon success */
		gcgrp = gcgrp_lookup(&ga, B_TRUE);
		if (gcgrp == NULL) {
			if (ipif != NULL)
				ipif_refrele(ipif);
			ire_refrele(gw_ire);
			return (ENOMEM);
		}

		/*
		 * Create and add the security attribute to the group; a
		 * reference to the group is made upon allocating a new
		 * entry successfully.  If it finds an already-existing
		 * entry for the security attribute in the group, it simply
		 * returns it and no new reference is made to the group.
		 */
		gc = gc_create(sp, gcgrp, &gcgrp_xtraref);
		if (gc == NULL) {
			if (ipif != NULL)
				ipif_refrele(ipif);
			/* release reference held by gcgrp_lookup */
			GCGRP_REFRELE(gcgrp);
			ire_refrele(gw_ire);
			return (ENOMEM);
		}
	}

	/* Create the IRE. */
	ire = ire_create(
	    (uchar_t *)&dst_addr,		/* dest address */
	    (uchar_t *)&mask,			/* mask */
	    (uchar_t *)&gw_addr,		/* gateway address */
	    (ushort_t)type,			/* IRE type */
	    ill,
	    zoneid,
	    flags,
	    gc,					/* security attribute */
	    ipst);

	/*
	 * The ire holds a reference to the 'gc' and the 'gc' holds a
	 * reference to the 'gcgrp'. We can now release the extra reference
	 * the 'gcgrp' acquired in the gcgrp_lookup, if it was not used.
	 */
	if (gcgrp_xtraref)
		GCGRP_REFRELE(gcgrp);
	if (ire == NULL) {
		if (gc != NULL)
			GC_REFRELE(gc);
		if (ipif != NULL)
			ipif_refrele(ipif);
		ire_refrele(gw_ire);
		return (ENOMEM);
	}

	/* Before we add, check if an extra CGTP broadcast is needed */
	cgtp_broadcast = ((flags & RTF_MULTIRT) &&
	    ip_type_v4(ire->ire_addr, ipst) == IRE_BROADCAST);

	/* src address assigned by the caller? */
	if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC))
		ire->ire_setsrc_addr = src_addr;

	ire->ire_unbound = unbound;

	/*
	 * POLICY: should we allow an RTF_HOST with address INADDR_ANY?
	 * SUN/OS socket stuff does but do we really want to allow 0.0.0.0?
	 */

	/* Add the new IRE. */
	nire = ire_add(ire);
	if (nire == NULL) {
		/*
		 * In the event of failure, ire_add() will have
		 * already deleted the ire in question, so there
		 * is no need to do that here.
		 */
		if (ipif != NULL)
			ipif_refrele(ipif);
		ire_refrele(gw_ire);
		return (ENOMEM);
	}
	/*
	 * Check if it was a duplicate entry. This handles
	 * the case of two racing route adds for the same route
	 */
	if (nire != ire) {
		ire_delete(nire);
		ire_refrele(nire);
		if (ipif != NULL)
			ipif_refrele(ipif);
		ire_refrele(gw_ire);
		return (EEXIST);
	}
	ire = nire;

	if (flags & RTF_MULTIRT) {
		/*
		 * Invoke the CGTP (multirouting) filtering module
		 * to add the dst address in the filtering database.
		 * Replicated inbound packets coming from that address
		 * will be filtered to discard the duplicates.
		 * It is not necessary to call the CGTP filter hook
		 * when the dst address is a broadcast or multicast,
		 * because an IP source address cannot be a broadcast
		 * or a multicast.
		 */
		if (cgtp_broadcast) {
			ip_cgtp_bcast_add(ire, ipst);
			goto save_ire;
		}
		if (ipst->ips_ip_cgtp_filter_ops != NULL &&
		    !CLASSD(ire->ire_addr)) {
			int res;
			ipif_t *src_ipif;

			/* Find the source address corresponding to gw_ire */
			src_ipif = ipif_lookup_addr(gw_ire->ire_gateway_addr,
			    NULL, zoneid, ipst);
			if (src_ipif != NULL) {
				res = ipst->ips_ip_cgtp_filter_ops->
				    cfo_add_dest_v4(
				    ipst->ips_netstack->netstack_stackid,
				    ire->ire_addr,
				    ire->ire_gateway_addr,
				    ire->ire_setsrc_addr,
				    src_ipif->ipif_lcl_addr);
				ipif_refrele(src_ipif);
			} else {
				res = EADDRNOTAVAIL;
			}
			if (res != 0) {
				if (ipif != NULL)
					ipif_refrele(ipif);
				ire_refrele(gw_ire);
				ire_delete(ire);
				ire_refrele(ire);	/* Held in ire_add */
				return (res);
			}
		}
	}

save_ire:
	if (gw_ire != NULL) {
		ire_refrele(gw_ire);
		gw_ire = NULL;
	}
	if (ill != NULL) {
		/*
		 * Save enough information so that we can recreate the IRE if
		 * the interface goes down and then up.  The metrics associated
		 * with the route will be saved as well when rts_setmetrics() is
		 * called after the IRE has been created.  In the case where
		 * memory cannot be allocated, none of this information will be
		 * saved.
		 */
		ill_save_ire(ill, ire);
	}
	if (ioctl_msg)
		ip_rts_rtmsg(RTM_OLDADD, ire, 0, ipst);
	if (ire_arg != NULL) {
		/*
		 * Store the ire that was successfully added into where ire_arg
		 * points to so that callers don't have to look it up
		 * themselves (but they are responsible for ire_refrele()ing
		 * the ire when they are finished with it).
		 */
		*ire_arg = ire;
	} else {
		ire_refrele(ire);	/* Held in ire_add */
	}
	if (ipif != NULL)
		ipif_refrele(ipif);
	return (0);
}
6053 6051
/*
 * ip_rt_delete is called to delete an IPv4 route.
 * ill is passed in to associate it with the correct interface; it may be
 * NULL when the caller supplied no RTA_IFP sockaddr.
 *
 * rtm_addrs tells which sockaddrs the caller supplied (RTA_NETMASK
 * determines whether the mask participates in matching).  ioctl_msg is
 * B_TRUE on the legacy SIOCDELRT path, in which case an RTM_OLDDEL
 * routing-socket message is generated after the route is removed.
 *
 * Returns 0 on success, ESRCH if no matching route exists, or an error
 * from the CGTP filter hook.
 */
/* ARGSUSED4 */
int
ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr,
    uint_t rtm_addrs, int flags, ill_t *ill, boolean_t ioctl_msg,
    ip_stack_t *ipst, zoneid_t zoneid)
{
	ire_t	*ire = NULL;
	ipif_t	*ipif;
	uint_t	type;
	uint_t	match_flags = MATCH_IRE_TYPE;
	int	err = 0;

	ip1dbg(("ip_rt_delete:"));
	/*
	 * If this is the case of RTF_HOST being set, then we set the netmask
	 * to all ones.  Otherwise, we use the netmask if one was supplied.
	 */
	if (flags & RTF_HOST) {
		mask = IP_HOST_MASK;
		match_flags |= MATCH_IRE_MASK;
	} else if (rtm_addrs & RTA_NETMASK) {
		match_flags |= MATCH_IRE_MASK;
	}

	/*
	 * Note that RTF_GATEWAY is never set on a delete, therefore
	 * we check if the gateway address is one of our interfaces first,
	 * and fall back on RTF_GATEWAY routes.
	 *
	 * This makes it possible to delete an original
	 * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1.
	 * However, we have RTF_KERNEL set on the ones created by ipif_up
	 * and those can not be deleted here.
	 *
	 * We use MATCH_IRE_ILL if we know the interface.  If the caller
	 * specified an interface (from the RTA_IFP sockaddr) we use it,
	 * otherwise we use the ill derived from the gateway address.
	 * We can always match the gateway address since we record it
	 * in ire_gateway_addr.
	 *
	 * For more detail on specifying routes by gateway address and by
	 * interface index, see the comments in ip_rt_add().
	 */
	ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst);
	if (ipif != NULL) {
		ill_t	*ill_match;

		if (ill != NULL)
			ill_match = ill;
		else
			ill_match = ipif->ipif_ill;

		match_flags |= MATCH_IRE_ILL;
		/* Loopback ipifs carry IRE_LOOPBACK routes, not interface
		 * routes, so try those first. */
		if (ipif->ipif_ire_type == IRE_LOOPBACK) {
			ire = ire_ftable_lookup_v4(dst_addr, mask, 0,
			    IRE_LOOPBACK, ill_match, ALL_ZONES, NULL,
			    match_flags, 0, ipst, NULL);
		}
		if (ire == NULL) {
			match_flags |= MATCH_IRE_GW;
			ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr,
			    IRE_INTERFACE, ill_match, ALL_ZONES, NULL,
			    match_flags, 0, ipst, NULL);
		}
		/* Avoid deleting routes created by kernel from an ipif */
		if (ire != NULL && (ire->ire_flags & RTF_KERNEL)) {
			ire_refrele(ire);
			ire = NULL;
		}

		/* Restore in case we didn't find a match */
		match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_ILL);
	}

	if (ire == NULL) {
		/*
		 * At this point, the gateway address is not one of our own
		 * addresses or a matching interface route was not found.  We
		 * set the IRE type to lookup based on whether
		 * this is a host route, a default route or just a prefix.
		 *
		 * If an ill was passed in, then the lookup is based on an
		 * interface index so MATCH_IRE_ILL is added to match_flags.
		 */
		match_flags |= MATCH_IRE_GW;
		if (ill != NULL)
			match_flags |= MATCH_IRE_ILL;
		if (mask == IP_HOST_MASK)
			type = IRE_HOST;
		else if (mask == 0)
			type = IRE_DEFAULT;
		else
			type = IRE_PREFIX;
		ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill,
		    ALL_ZONES, NULL, match_flags, 0, ipst, NULL);
	}

	if (ipif != NULL) {
		ipif_refrele(ipif);
		ipif = NULL;
	}

	if (ire == NULL)
		return (ESRCH);

	if (ire->ire_flags & RTF_MULTIRT) {
		/*
		 * Invoke the CGTP (multirouting) filtering module
		 * to remove the dst address from the filtering database.
		 * Packets coming from that address will no longer be
		 * filtered to remove duplicates.
		 */
		if (ipst->ips_ip_cgtp_filter_ops != NULL) {
			err = ipst->ips_ip_cgtp_filter_ops->cfo_del_dest_v4(
			    ipst->ips_netstack->netstack_stackid,
			    ire->ire_addr, ire->ire_gateway_addr);
		}
		ip_cgtp_bcast_delete(ire, ipst);
	}

	/* Drop the recreate-on-replumb state saved by ill_save_ire(). */
	ill = ire->ire_ill;
	if (ill != NULL)
		ill_remove_saved_ire(ill, ire);
	if (ioctl_msg)
		ip_rts_rtmsg(RTM_OLDDEL, ire, 0, ipst);
	ire_delete(ire);
	ire_refrele(ire);
	return (err);
}
6187 6185
6188 6186 /*
6189 6187 * ip_siocaddrt is called to complete processing of an SIOCADDRT IOCTL.
6190 6188 */
6191 6189 /* ARGSUSED */
6192 6190 int
6193 6191 ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
6194 6192 ip_ioctl_cmd_t *ipip, void *dummy_if_req)
6195 6193 {
6196 6194 ipaddr_t dst_addr;
6197 6195 ipaddr_t gw_addr;
6198 6196 ipaddr_t mask;
6199 6197 int error = 0;
6200 6198 mblk_t *mp1;
6201 6199 struct rtentry *rt;
6202 6200 ipif_t *ipif = NULL;
6203 6201 ip_stack_t *ipst;
6204 6202
6205 6203 ASSERT(q->q_next == NULL);
6206 6204 ipst = CONNQ_TO_IPST(q);
6207 6205
6208 6206 ip1dbg(("ip_siocaddrt:"));
6209 6207 /* Existence of mp1 verified in ip_wput_nondata */
6210 6208 mp1 = mp->b_cont->b_cont;
6211 6209 rt = (struct rtentry *)mp1->b_rptr;
6212 6210
6213 6211 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr;
6214 6212 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr;
6215 6213
6216 6214 /*
6217 6215 * If the RTF_HOST flag is on, this is a request to assign a gateway
6218 6216 * to a particular host address. In this case, we set the netmask to
6219 6217 * all ones for the particular destination address. Otherwise,
6220 6218 * determine the netmask to be used based on dst_addr and the interfaces
6221 6219 * in use.
6222 6220 */
6223 6221 if (rt->rt_flags & RTF_HOST) {
6224 6222 mask = IP_HOST_MASK;
6225 6223 } else {
6226 6224 /*
6227 6225 * Note that ip_subnet_mask returns a zero mask in the case of
6228 6226 * default (an all-zeroes address).
6229 6227 */
6230 6228 mask = ip_subnet_mask(dst_addr, &ipif, ipst);
6231 6229 }
6232 6230
6233 6231 error = ip_rt_add(dst_addr, mask, gw_addr, 0, rt->rt_flags, NULL, NULL,
6234 6232 B_TRUE, NULL, ipst, ALL_ZONES);
6235 6233 if (ipif != NULL)
6236 6234 ipif_refrele(ipif);
6237 6235 return (error);
6238 6236 }
6239 6237
6240 6238 /*
6241 6239 * ip_siocdelrt is called to complete processing of an SIOCDELRT IOCTL.
6242 6240 */
6243 6241 /* ARGSUSED */
6244 6242 int
6245 6243 ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
6246 6244 ip_ioctl_cmd_t *ipip, void *dummy_if_req)
6247 6245 {
6248 6246 ipaddr_t dst_addr;
6249 6247 ipaddr_t gw_addr;
6250 6248 ipaddr_t mask;
6251 6249 int error;
6252 6250 mblk_t *mp1;
6253 6251 struct rtentry *rt;
6254 6252 ipif_t *ipif = NULL;
6255 6253 ip_stack_t *ipst;
6256 6254
6257 6255 ASSERT(q->q_next == NULL);
6258 6256 ipst = CONNQ_TO_IPST(q);
6259 6257
6260 6258 ip1dbg(("ip_siocdelrt:"));
6261 6259 /* Existence of mp1 verified in ip_wput_nondata */
6262 6260 mp1 = mp->b_cont->b_cont;
6263 6261 rt = (struct rtentry *)mp1->b_rptr;
6264 6262
6265 6263 dst_addr = ((sin_t *)&rt->rt_dst)->sin_addr.s_addr;
6266 6264 gw_addr = ((sin_t *)&rt->rt_gateway)->sin_addr.s_addr;
6267 6265
6268 6266 /*
6269 6267 * If the RTF_HOST flag is on, this is a request to delete a gateway
6270 6268 * to a particular host address. In this case, we set the netmask to
6271 6269 * all ones for the particular destination address. Otherwise,
6272 6270 * determine the netmask to be used based on dst_addr and the interfaces
6273 6271 * in use.
6274 6272 */
6275 6273 if (rt->rt_flags & RTF_HOST) {
6276 6274 mask = IP_HOST_MASK;
6277 6275 } else {
6278 6276 /*
6279 6277 * Note that ip_subnet_mask returns a zero mask in the case of
6280 6278 * default (an all-zeroes address).
6281 6279 */
6282 6280 mask = ip_subnet_mask(dst_addr, &ipif, ipst);
6283 6281 }
6284 6282
6285 6283 error = ip_rt_delete(dst_addr, mask, gw_addr,
6286 6284 RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, B_TRUE,
6287 6285 ipst, ALL_ZONES);
6288 6286 if (ipif != NULL)
6289 6287 ipif_refrele(ipif);
6290 6288 return (error);
6291 6289 }
6292 6290
6293 6291 /*
6294 6292 * Enqueue the mp onto the ipsq, chained by b_next.
6295 6293 * b_prev stores the function to be executed later, and b_queue the queue
6296 6294 * where this mp originated.
6297 6295 */
6298 6296 void
6299 6297 ipsq_enq(ipsq_t *ipsq, queue_t *q, mblk_t *mp, ipsq_func_t func, int type,
6300 6298 ill_t *pending_ill)
6301 6299 {
6302 6300 conn_t *connp;
6303 6301 ipxop_t *ipx = ipsq->ipsq_xop;
6304 6302
6305 6303 ASSERT(MUTEX_HELD(&ipsq->ipsq_lock));
6306 6304 ASSERT(MUTEX_HELD(&ipx->ipx_lock));
6307 6305 ASSERT(func != NULL);
6308 6306
6309 6307 mp->b_queue = q;
6310 6308 mp->b_prev = (void *)func;
6311 6309 mp->b_next = NULL;
6312 6310
6313 6311 switch (type) {
6314 6312 case CUR_OP:
6315 6313 if (ipx->ipx_mptail != NULL) {
6316 6314 ASSERT(ipx->ipx_mphead != NULL);
6317 6315 ipx->ipx_mptail->b_next = mp;
6318 6316 } else {
6319 6317 ASSERT(ipx->ipx_mphead == NULL);
6320 6318 ipx->ipx_mphead = mp;
6321 6319 }
6322 6320 ipx->ipx_mptail = mp;
6323 6321 break;
6324 6322
6325 6323 case NEW_OP:
6326 6324 if (ipsq->ipsq_xopq_mptail != NULL) {
6327 6325 ASSERT(ipsq->ipsq_xopq_mphead != NULL);
6328 6326 ipsq->ipsq_xopq_mptail->b_next = mp;
6329 6327 } else {
6330 6328 ASSERT(ipsq->ipsq_xopq_mphead == NULL);
6331 6329 ipsq->ipsq_xopq_mphead = mp;
6332 6330 }
6333 6331 ipsq->ipsq_xopq_mptail = mp;
6334 6332 ipx->ipx_ipsq_queued = B_TRUE;
6335 6333 break;
6336 6334
6337 6335 case SWITCH_OP:
6338 6336 ASSERT(ipsq->ipsq_swxop != NULL);
6339 6337 /* only one switch operation is currently allowed */
6340 6338 ASSERT(ipsq->ipsq_switch_mp == NULL);
6341 6339 ipsq->ipsq_switch_mp = mp;
6342 6340 ipx->ipx_ipsq_queued = B_TRUE;
6343 6341 break;
6344 6342 default:
6345 6343 cmn_err(CE_PANIC, "ipsq_enq %d type \n", type);
6346 6344 }
6347 6345
6348 6346 if (CONN_Q(q) && pending_ill != NULL) {
6349 6347 connp = Q_TO_CONN(q);
6350 6348 ASSERT(MUTEX_HELD(&connp->conn_lock));
6351 6349 connp->conn_oper_pending_ill = pending_ill;
6352 6350 }
6353 6351 }
6354 6352
6355 6353 /*
6356 6354 * Dequeue the next message that requested exclusive access to this IPSQ's
6357 6355 * xop. Specifically:
6358 6356 *
6359 6357 * 1. If we're still processing the current operation on `ipsq', then
6360 6358 * dequeue the next message for the operation (from ipx_mphead), or
6361 6359 * return NULL if there are no queued messages for the operation.
6362 6360 * These messages are queued via CUR_OP to qwriter_ip() and friends.
6363 6361 *
6364 6362 * 2. If the current operation on `ipsq' has completed (ipx_current_ipif is
6365 6363 * not set) see if the ipsq has requested an xop switch. If so, switch
6366 6364 * `ipsq' to a different xop. Xop switches only happen when joining or
6367 6365 * leaving IPMP groups and require a careful dance -- see the comments
6368 6366 * in-line below for details. If we're leaving a group xop or if we're
6369 6367 * joining a group xop and become writer on it, then we proceed to (3).
6370 6368 * Otherwise, we return NULL and exit the xop.
6371 6369 *
6372 6370 * 3. For each IPSQ in the xop, return any switch operation stored on
6373 6371 * ipsq_switch_mp (set via SWITCH_OP); these must be processed before
6374 6372 * any other messages queued on the IPSQ. Otherwise, dequeue the next
6375 6373 * exclusive operation (queued via NEW_OP) stored on ipsq_xopq_mphead.
6376 6374 * Note that if the phyint tied to `ipsq' is not using IPMP there will
6377 6375 * only be one IPSQ in the xop. Otherwise, there will be one IPSQ for
6378 6376 * each phyint in the group, including the IPMP meta-interface phyint.
6379 6377 */
6380 6378 static mblk_t *
6381 6379 ipsq_dq(ipsq_t *ipsq)
6382 6380 {
6383 6381 ill_t *illv4, *illv6;
6384 6382 mblk_t *mp;
6385 6383 ipsq_t *xopipsq;
6386 6384 ipsq_t *leftipsq = NULL;
6387 6385 ipxop_t *ipx;
6388 6386 phyint_t *phyi = ipsq->ipsq_phyint;
6389 6387 ip_stack_t *ipst = ipsq->ipsq_ipst;
6390 6388 boolean_t emptied = B_FALSE;
6391 6389
6392 6390 /*
6393 6391 * Grab all the locks we need in the defined order (ill_g_lock ->
6394 6392 * ipsq_lock -> ipx_lock); ill_g_lock is needed to use ipsq_next.
6395 6393 */
6396 6394 rw_enter(&ipst->ips_ill_g_lock,
6397 6395 ipsq->ipsq_swxop != NULL ? RW_WRITER : RW_READER);
6398 6396 mutex_enter(&ipsq->ipsq_lock);
6399 6397 ipx = ipsq->ipsq_xop;
6400 6398 mutex_enter(&ipx->ipx_lock);
6401 6399
6402 6400 /*
6403 6401 * Dequeue the next message associated with the current exclusive
6404 6402 * operation, if any.
6405 6403 */
6406 6404 if ((mp = ipx->ipx_mphead) != NULL) {
6407 6405 ipx->ipx_mphead = mp->b_next;
6408 6406 if (ipx->ipx_mphead == NULL)
6409 6407 ipx->ipx_mptail = NULL;
6410 6408 mp->b_next = (void *)ipsq;
6411 6409 goto out;
6412 6410 }
6413 6411
6414 6412 if (ipx->ipx_current_ipif != NULL)
6415 6413 goto empty;
6416 6414
6417 6415 if (ipsq->ipsq_swxop != NULL) {
6418 6416 /*
6419 6417 * The exclusive operation that is now being completed has
6420 6418 * requested a switch to a different xop. This happens
6421 6419 * when an interface joins or leaves an IPMP group. Joins
6422 6420 * happen through SIOCSLIFGROUPNAME (ip_sioctl_groupname()).
6423 6421 * Leaves happen via SIOCSLIFGROUPNAME, interface unplumb
6424 6422 * (phyint_free()), or interface plumb for an ill type
6425 6423 * not in the IPMP group (ip_rput_dlpi_writer()).
6426 6424 *
6427 6425 * Xop switches are not allowed on the IPMP meta-interface.
6428 6426 */
6429 6427 ASSERT(phyi == NULL || !(phyi->phyint_flags & PHYI_IPMP));
6430 6428 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));
6431 6429 DTRACE_PROBE1(ipsq__switch, (ipsq_t *), ipsq);
6432 6430
6433 6431 if (ipsq->ipsq_swxop == &ipsq->ipsq_ownxop) {
6434 6432 /*
6435 6433 * We're switching back to our own xop, so we have two
6436 6434 * xop's to drain/exit: our own, and the group xop
6437 6435 * that we are leaving.
6438 6436 *
6439 6437 * First, pull ourselves out of the group ipsq list.
6440 6438 * This is safe since we're writer on ill_g_lock.
6441 6439 */
6442 6440 ASSERT(ipsq->ipsq_xop != &ipsq->ipsq_ownxop);
6443 6441
6444 6442 xopipsq = ipx->ipx_ipsq;
6445 6443 while (xopipsq->ipsq_next != ipsq)
6446 6444 xopipsq = xopipsq->ipsq_next;
6447 6445
6448 6446 xopipsq->ipsq_next = ipsq->ipsq_next;
6449 6447 ipsq->ipsq_next = ipsq;
6450 6448 ipsq->ipsq_xop = ipsq->ipsq_swxop;
6451 6449 ipsq->ipsq_swxop = NULL;
6452 6450
6453 6451 /*
6454 6452 * Second, prepare to exit the group xop. The actual
6455 6453 * ipsq_exit() is done at the end of this function
6456 6454 * since we cannot hold any locks across ipsq_exit().
6457 6455 * Note that although we drop the group's ipx_lock, no
6458 6456 * threads can proceed since we're still ipx_writer.
6459 6457 */
6460 6458 leftipsq = xopipsq;
6461 6459 mutex_exit(&ipx->ipx_lock);
6462 6460
6463 6461 /*
6464 6462 * Third, set ipx to point to our own xop (which was
6465 6463 * inactive and therefore can be entered).
6466 6464 */
6467 6465 ipx = ipsq->ipsq_xop;
6468 6466 mutex_enter(&ipx->ipx_lock);
6469 6467 ASSERT(ipx->ipx_writer == NULL);
6470 6468 ASSERT(ipx->ipx_current_ipif == NULL);
6471 6469 } else {
6472 6470 /*
6473 6471 * We're switching from our own xop to a group xop.
6474 6472 * The requestor of the switch must ensure that the
6475 6473 * group xop cannot go away (e.g. by ensuring the
6476 6474 * phyint associated with the xop cannot go away).
6477 6475 *
6478 6476 * If we can become writer on our new xop, then we'll
6479 6477 * do the drain. Otherwise, the current writer of our
6480 6478 * new xop will do the drain when it exits.
6481 6479 *
6482 6480 * First, splice ourselves into the group IPSQ list.
6483 6481 * This is safe since we're writer on ill_g_lock.
6484 6482 */
6485 6483 ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop);
6486 6484
6487 6485 xopipsq = ipsq->ipsq_swxop->ipx_ipsq;
6488 6486 while (xopipsq->ipsq_next != ipsq->ipsq_swxop->ipx_ipsq)
6489 6487 xopipsq = xopipsq->ipsq_next;
6490 6488
6491 6489 xopipsq->ipsq_next = ipsq;
6492 6490 ipsq->ipsq_next = ipsq->ipsq_swxop->ipx_ipsq;
6493 6491 ipsq->ipsq_xop = ipsq->ipsq_swxop;
6494 6492 ipsq->ipsq_swxop = NULL;
6495 6493
6496 6494 /*
6497 6495 * Second, exit our own xop, since it's now unused.
6498 6496 * This is safe since we've got the only reference.
6499 6497 */
6500 6498 ASSERT(ipx->ipx_writer == curthread);
6501 6499 ipx->ipx_writer = NULL;
6502 6500 VERIFY(--ipx->ipx_reentry_cnt == 0);
6503 6501 ipx->ipx_ipsq_queued = B_FALSE;
6504 6502 mutex_exit(&ipx->ipx_lock);
6505 6503
6506 6504 /*
6507 6505 * Third, set ipx to point to our new xop, and check
6508 6506 * if we can become writer on it. If we cannot, then
6509 6507 * the current writer will drain the IPSQ group when
6510 6508 * it exits. Our ipsq_xop is guaranteed to be stable
6511 6509 * because we're still holding ipsq_lock.
6512 6510 */
6513 6511 ipx = ipsq->ipsq_xop;
6514 6512 mutex_enter(&ipx->ipx_lock);
6515 6513 if (ipx->ipx_writer != NULL ||
6516 6514 ipx->ipx_current_ipif != NULL) {
6517 6515 goto out;
6518 6516 }
6519 6517 }
6520 6518
6521 6519 /*
6522 6520 * Fourth, become writer on our new ipx before we continue
6523 6521 * with the drain. Note that we never dropped ipsq_lock
6524 6522 * above, so no other thread could've raced with us to
6525 6523 * become writer first. Also, we're holding ipx_lock, so
6526 6524 * no other thread can examine the ipx right now.
6527 6525 */
6528 6526 ASSERT(ipx->ipx_current_ipif == NULL);
6529 6527 ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL);
6530 6528 VERIFY(ipx->ipx_reentry_cnt++ == 0);
6531 6529 ipx->ipx_writer = curthread;
6532 6530 ipx->ipx_forced = B_FALSE;
6533 6531 #ifdef DEBUG
6534 6532 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
6535 6533 #endif
6536 6534 }
6537 6535
6538 6536 xopipsq = ipsq;
6539 6537 do {
6540 6538 /*
6541 6539 * So that other operations operate on a consistent and
6542 6540 * complete phyint, a switch message on an IPSQ must be
6543 6541 * handled prior to any other operations on that IPSQ.
6544 6542 */
6545 6543 if ((mp = xopipsq->ipsq_switch_mp) != NULL) {
6546 6544 xopipsq->ipsq_switch_mp = NULL;
6547 6545 ASSERT(mp->b_next == NULL);
6548 6546 mp->b_next = (void *)xopipsq;
6549 6547 goto out;
6550 6548 }
6551 6549
6552 6550 if ((mp = xopipsq->ipsq_xopq_mphead) != NULL) {
6553 6551 xopipsq->ipsq_xopq_mphead = mp->b_next;
6554 6552 if (xopipsq->ipsq_xopq_mphead == NULL)
6555 6553 xopipsq->ipsq_xopq_mptail = NULL;
6556 6554 mp->b_next = (void *)xopipsq;
6557 6555 goto out;
6558 6556 }
6559 6557 } while ((xopipsq = xopipsq->ipsq_next) != ipsq);
6560 6558 empty:
6561 6559 /*
6562 6560 * There are no messages. Further, we are holding ipx_lock, hence no
6563 6561 * new messages can end up on any IPSQ in the xop.
6564 6562 */
6565 6563 ipx->ipx_writer = NULL;
6566 6564 ipx->ipx_forced = B_FALSE;
6567 6565 VERIFY(--ipx->ipx_reentry_cnt == 0);
6568 6566 ipx->ipx_ipsq_queued = B_FALSE;
6569 6567 emptied = B_TRUE;
6570 6568 #ifdef DEBUG
6571 6569 ipx->ipx_depth = 0;
6572 6570 #endif
6573 6571 out:
6574 6572 mutex_exit(&ipx->ipx_lock);
6575 6573 mutex_exit(&ipsq->ipsq_lock);
6576 6574
6577 6575 /*
6578 6576 * If we completely emptied the xop, then wake up any threads waiting
6579 6577 * to enter any of the IPSQ's associated with it.
6580 6578 */
6581 6579 if (emptied) {
6582 6580 xopipsq = ipsq;
6583 6581 do {
6584 6582 if ((phyi = xopipsq->ipsq_phyint) == NULL)
6585 6583 continue;
6586 6584
6587 6585 illv4 = phyi->phyint_illv4;
6588 6586 illv6 = phyi->phyint_illv6;
6589 6587
6590 6588 GRAB_ILL_LOCKS(illv4, illv6);
6591 6589 if (illv4 != NULL)
6592 6590 cv_broadcast(&illv4->ill_cv);
6593 6591 if (illv6 != NULL)
6594 6592 cv_broadcast(&illv6->ill_cv);
6595 6593 RELEASE_ILL_LOCKS(illv4, illv6);
6596 6594 } while ((xopipsq = xopipsq->ipsq_next) != ipsq);
6597 6595 }
6598 6596 rw_exit(&ipst->ips_ill_g_lock);
6599 6597
6600 6598 /*
6601 6599 * Now that all locks are dropped, exit the IPSQ we left.
6602 6600 */
6603 6601 if (leftipsq != NULL)
6604 6602 ipsq_exit(leftipsq);
6605 6603
6606 6604 return (mp);
6607 6605 }
6608 6606
6609 6607 /*
6610 6608 * Return completion status of previously initiated DLPI operations on
6611 6609 * ills in the purview of an ipsq.
6612 6610 */
6613 6611 static boolean_t
6614 6612 ipsq_dlpi_done(ipsq_t *ipsq)
6615 6613 {
6616 6614 ipsq_t *ipsq_start;
6617 6615 phyint_t *phyi;
6618 6616 ill_t *ill;
6619 6617
6620 6618 ASSERT(RW_LOCK_HELD(&ipsq->ipsq_ipst->ips_ill_g_lock));
6621 6619 ipsq_start = ipsq;
6622 6620
6623 6621 do {
6624 6622 /*
6625 6623 * The only current users of this function are ipsq_try_enter
6626 6624 * and ipsq_enter which have made sure that ipsq_writer is
6627 6625 * NULL before we reach here. ill_dlpi_pending is modified
6628 6626 * only by an ipsq writer
6629 6627 */
6630 6628 ASSERT(ipsq->ipsq_xop->ipx_writer == NULL);
6631 6629 phyi = ipsq->ipsq_phyint;
6632 6630 /*
6633 6631 * phyi could be NULL if a phyint that is part of an
6634 6632 * IPMP group is being unplumbed. A more detailed
6635 6633 * comment is in ipmp_grp_update_kstats()
6636 6634 */
6637 6635 if (phyi != NULL) {
6638 6636 ill = phyi->phyint_illv4;
6639 6637 if (ill != NULL &&
6640 6638 (ill->ill_dlpi_pending != DL_PRIM_INVAL ||
6641 6639 ill->ill_arl_dlpi_pending))
6642 6640 return (B_FALSE);
6643 6641
6644 6642 ill = phyi->phyint_illv6;
6645 6643 if (ill != NULL &&
6646 6644 ill->ill_dlpi_pending != DL_PRIM_INVAL)
6647 6645 return (B_FALSE);
6648 6646 }
6649 6647
6650 6648 } while ((ipsq = ipsq->ipsq_next) != ipsq_start);
6651 6649
6652 6650 return (B_TRUE);
6653 6651 }
6654 6652
6655 6653 /*
6656 6654 * Enter the ipsq corresponding to ill, by waiting synchronously till
6657 6655 * we can enter the ipsq exclusively. Unless 'force' is used, the ipsq
6658 6656 * will have to drain completely before ipsq_enter returns success.
6659 6657 * ipx_current_ipif will be set if some exclusive op is in progress,
6660 6658 * and the ipsq_exit logic will start the next enqueued op after
6661 6659 * completion of the current op. If 'force' is used, we don't wait
6662 6660 * for the enqueued ops. This is needed when a conn_close wants to
6663 6661 * enter the ipsq and abort an ioctl that is somehow stuck. Unplumb
6664 6662 * of an ill can also use this option. But we dont' use it currently.
6665 6663 */
6666 6664 #define ENTER_SQ_WAIT_TICKS 100
6667 6665 boolean_t
6668 6666 ipsq_enter(ill_t *ill, boolean_t force, int type)
6669 6667 {
6670 6668 ipsq_t *ipsq;
6671 6669 ipxop_t *ipx;
6672 6670 boolean_t waited_enough = B_FALSE;
6673 6671 ip_stack_t *ipst = ill->ill_ipst;
6674 6672
6675 6673 /*
6676 6674 * Note that the relationship between ill and ipsq is fixed as long as
6677 6675 * the ill is not ILL_CONDEMNED. Holding ipsq_lock ensures the
6678 6676 * relationship between the IPSQ and xop cannot change. However,
6679 6677 * since we cannot hold ipsq_lock across the cv_wait(), it may change
6680 6678 * while we're waiting. We wait on ill_cv and rely on ipsq_exit()
6681 6679 * waking up all ills in the xop when it becomes available.
6682 6680 */
6683 6681 for (;;) {
6684 6682 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
6685 6683 mutex_enter(&ill->ill_lock);
6686 6684 if (ill->ill_state_flags & ILL_CONDEMNED) {
6687 6685 mutex_exit(&ill->ill_lock);
6688 6686 rw_exit(&ipst->ips_ill_g_lock);
6689 6687 return (B_FALSE);
6690 6688 }
6691 6689
6692 6690 ipsq = ill->ill_phyint->phyint_ipsq;
6693 6691 mutex_enter(&ipsq->ipsq_lock);
6694 6692 ipx = ipsq->ipsq_xop;
6695 6693 mutex_enter(&ipx->ipx_lock);
6696 6694
6697 6695 if (ipx->ipx_writer == NULL && (type == CUR_OP ||
6698 6696 (ipx->ipx_current_ipif == NULL && ipsq_dlpi_done(ipsq)) ||
6699 6697 waited_enough))
6700 6698 break;
6701 6699
6702 6700 rw_exit(&ipst->ips_ill_g_lock);
6703 6701
6704 6702 if (!force || ipx->ipx_writer != NULL) {
6705 6703 mutex_exit(&ipx->ipx_lock);
6706 6704 mutex_exit(&ipsq->ipsq_lock);
6707 6705 cv_wait(&ill->ill_cv, &ill->ill_lock);
6708 6706 } else {
6709 6707 mutex_exit(&ipx->ipx_lock);
6710 6708 mutex_exit(&ipsq->ipsq_lock);
6711 6709 (void) cv_reltimedwait(&ill->ill_cv,
6712 6710 &ill->ill_lock, ENTER_SQ_WAIT_TICKS, TR_CLOCK_TICK);
6713 6711 waited_enough = B_TRUE;
6714 6712 }
6715 6713 mutex_exit(&ill->ill_lock);
6716 6714 }
6717 6715
6718 6716 ASSERT(ipx->ipx_mphead == NULL && ipx->ipx_mptail == NULL);
6719 6717 ASSERT(ipx->ipx_reentry_cnt == 0);
6720 6718 ipx->ipx_writer = curthread;
6721 6719 ipx->ipx_forced = (ipx->ipx_current_ipif != NULL);
6722 6720 ipx->ipx_reentry_cnt++;
6723 6721 #ifdef DEBUG
6724 6722 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
6725 6723 #endif
6726 6724 mutex_exit(&ipx->ipx_lock);
6727 6725 mutex_exit(&ipsq->ipsq_lock);
6728 6726 mutex_exit(&ill->ill_lock);
6729 6727 rw_exit(&ipst->ips_ill_g_lock);
6730 6728
6731 6729 return (B_TRUE);
6732 6730 }
6733 6731
6734 6732 /*
6735 6733 * ipif_set_values() has a constraint that it cannot drop the ips_ill_g_lock
6736 6734 * across the call to the core interface ipsq_try_enter() and hence calls this
6737 6735 * function directly. This is explained more fully in ipif_set_values().
6738 6736 * In order to support the above constraint, ipsq_try_enter is implemented as
6739 6737 * a wrapper that grabs the ips_ill_g_lock and calls this function subsequently
6740 6738 */
6741 6739 static ipsq_t *
6742 6740 ipsq_try_enter_internal(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func,
6743 6741 int type, boolean_t reentry_ok)
6744 6742 {
6745 6743 ipsq_t *ipsq;
6746 6744 ipxop_t *ipx;
6747 6745 ip_stack_t *ipst = ill->ill_ipst;
6748 6746
6749 6747 /*
6750 6748 * lock ordering:
6751 6749 * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock.
6752 6750 *
6753 6751 * ipx of an ipsq can't change when ipsq_lock is held.
6754 6752 */
6755 6753 ASSERT(RW_LOCK_HELD(&ipst->ips_ill_g_lock));
6756 6754 GRAB_CONN_LOCK(q);
6757 6755 mutex_enter(&ill->ill_lock);
6758 6756 ipsq = ill->ill_phyint->phyint_ipsq;
6759 6757 mutex_enter(&ipsq->ipsq_lock);
6760 6758 ipx = ipsq->ipsq_xop;
6761 6759 mutex_enter(&ipx->ipx_lock);
6762 6760
6763 6761 /*
6764 6762 * 1. Enter the ipsq if we are already writer and reentry is ok.
6765 6763 * (Note: If the caller does not specify reentry_ok then neither
6766 6764 * 'func' nor any of its callees must ever attempt to enter the ipsq
6767 6765 * again. Otherwise it can lead to an infinite loop
6768 6766 * 2. Enter the ipsq if there is no current writer and this attempted
6769 6767 * entry is part of the current operation
6770 6768 * 3. Enter the ipsq if there is no current writer and this is a new
6771 6769 * operation and the operation queue is empty and there is no
6772 6770 * operation currently in progress and if all previously initiated
6773 6771 * DLPI operations have completed.
6774 6772 */
6775 6773 if ((ipx->ipx_writer == curthread && reentry_ok) ||
6776 6774 (ipx->ipx_writer == NULL && (type == CUR_OP || (type == NEW_OP &&
6777 6775 !ipx->ipx_ipsq_queued && ipx->ipx_current_ipif == NULL &&
6778 6776 ipsq_dlpi_done(ipsq))))) {
6779 6777 /* Success. */
6780 6778 ipx->ipx_reentry_cnt++;
6781 6779 ipx->ipx_writer = curthread;
6782 6780 ipx->ipx_forced = B_FALSE;
6783 6781 mutex_exit(&ipx->ipx_lock);
6784 6782 mutex_exit(&ipsq->ipsq_lock);
6785 6783 mutex_exit(&ill->ill_lock);
6786 6784 RELEASE_CONN_LOCK(q);
6787 6785 #ifdef DEBUG
6788 6786 ipx->ipx_depth = getpcstack(ipx->ipx_stack, IPX_STACK_DEPTH);
6789 6787 #endif
6790 6788 return (ipsq);
6791 6789 }
6792 6790
6793 6791 if (func != NULL)
6794 6792 ipsq_enq(ipsq, q, mp, func, type, ill);
6795 6793
6796 6794 mutex_exit(&ipx->ipx_lock);
6797 6795 mutex_exit(&ipsq->ipsq_lock);
6798 6796 mutex_exit(&ill->ill_lock);
6799 6797 RELEASE_CONN_LOCK(q);
6800 6798 return (NULL);
6801 6799 }
6802 6800
6803 6801 /*
6804 6802 * The ipsq_t (ipsq) is the synchronization data structure used to serialize
6805 6803 * certain critical operations like plumbing (i.e. most set ioctls), etc.
6806 6804 * There is one ipsq per phyint. The ipsq
6807 6805 * serializes exclusive ioctls issued by applications on a per ipsq basis in
6808 6806 * ipsq_xopq_mphead. It also protects against multiple threads executing in
6809 6807 * the ipsq. Responses from the driver pertain to the current ioctl (say a
6810 6808 * DL_BIND_ACK in response to a DL_BIND_REQ initiated as part of bringing
6811 6809 * up the interface) and are enqueued in ipx_mphead.
6812 6810 *
 * If a thread does not want to reenter the ipsq when it is already writer,
 * it must ensure that neither the specified reentry point to be called
 * later when the ipsq is empty, nor any code path starting from that
 * reentry point, ever tries to enter the ipsq again. Otherwise it can
 * lead to an infinite loop. The reentry point ip_rput_dlpi_writer is an
 * example.
6818 6816 * When the thread that is currently exclusive finishes, it (ipsq_exit)
6819 6817 * dequeues the requests waiting to become exclusive in ipx_mphead and calls
6820 6818 * the reentry point. When the list at ipx_mphead becomes empty ipsq_exit
6821 6819 * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next
6822 6820 * ioctl if the current ioctl has completed. If the current ioctl is still
6823 6821 * in progress it simply returns. The current ioctl could be waiting for
 * a response from another module (the driver) or could be waiting for
 * the ipif/ill/ire refcnts to drop to zero. In such a case the ipx_pending_mp
6826 6824 * and ipx_pending_ipif are set. ipx_current_ipif is set throughout the
6827 6825 * execution of the ioctl and ipsq_exit does not start the next ioctl unless
6828 6826 * ipx_current_ipif is NULL which happens only once the ioctl is complete and
6829 6827 * all associated DLPI operations have completed.
6830 6828 */
6831 6829
6832 6830 /*
6833 6831 * Try to enter the IPSQ corresponding to `ipif' or `ill' exclusively (`ipif'
6834 6832 * and `ill' cannot both be specified). Returns a pointer to the entered IPSQ
6835 6833 * on success, or NULL on failure. The caller ensures ipif/ill is valid by
6836 6834 * refholding it as necessary. If the IPSQ cannot be entered and `func' is
6837 6835 * non-NULL, then `func' will be called back with `q' and `mp' once the IPSQ
6838 6836 * can be entered. If `func' is NULL, then `q' and `mp' are ignored.
6839 6837 */
6840 6838 ipsq_t *
6841 6839 ipsq_try_enter(ipif_t *ipif, ill_t *ill, queue_t *q, mblk_t *mp,
6842 6840 ipsq_func_t func, int type, boolean_t reentry_ok)
6843 6841 {
6844 6842 ip_stack_t *ipst;
6845 6843 ipsq_t *ipsq;
6846 6844
6847 6845 /* Only 1 of ipif or ill can be specified */
6848 6846 ASSERT((ipif != NULL) ^ (ill != NULL));
6849 6847
6850 6848 if (ipif != NULL)
6851 6849 ill = ipif->ipif_ill;
6852 6850 ipst = ill->ill_ipst;
6853 6851
6854 6852 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
6855 6853 ipsq = ipsq_try_enter_internal(ill, q, mp, func, type, reentry_ok);
6856 6854 rw_exit(&ipst->ips_ill_g_lock);
6857 6855
6858 6856 return (ipsq);
6859 6857 }
6860 6858
6861 6859 /*
6862 6860 * Try to enter the IPSQ corresponding to `ill' as writer. The caller ensures
6863 6861 * ill is valid by refholding it if necessary; we will refrele. If the IPSQ
6864 6862 * cannot be entered, the mp is queued for completion.
6865 6863 */
6866 6864 void
6867 6865 qwriter_ip(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, int type,
6868 6866 boolean_t reentry_ok)
6869 6867 {
6870 6868 ipsq_t *ipsq;
6871 6869
6872 6870 ipsq = ipsq_try_enter(NULL, ill, q, mp, func, type, reentry_ok);
6873 6871
6874 6872 /*
6875 6873 * Drop the caller's refhold on the ill. This is safe since we either
6876 6874 * entered the IPSQ (and thus are exclusive), or failed to enter the
6877 6875 * IPSQ, in which case we return without accessing ill anymore. This
6878 6876 * is needed because func needs to see the correct refcount.
6879 6877 * e.g. removeif can work only then.
6880 6878 */
6881 6879 ill_refrele(ill);
6882 6880 if (ipsq != NULL) {
6883 6881 (*func)(ipsq, q, mp, NULL);
6884 6882 ipsq_exit(ipsq);
6885 6883 }
6886 6884 }
6887 6885
6888 6886 /*
6889 6887 * Exit the specified IPSQ. If this is the final exit on it then drain it
6890 6888 * prior to exiting. Caller must be writer on the specified IPSQ.
6891 6889 */
6892 6890 void
6893 6891 ipsq_exit(ipsq_t *ipsq)
6894 6892 {
6895 6893 mblk_t *mp;
6896 6894 ipsq_t *mp_ipsq;
6897 6895 queue_t *q;
6898 6896 phyint_t *phyi;
6899 6897 ipsq_func_t func;
6900 6898
6901 6899 ASSERT(IAM_WRITER_IPSQ(ipsq));
6902 6900
6903 6901 ASSERT(ipsq->ipsq_xop->ipx_reentry_cnt >= 1);
6904 6902 if (ipsq->ipsq_xop->ipx_reentry_cnt != 1) {
6905 6903 ipsq->ipsq_xop->ipx_reentry_cnt--;
6906 6904 return;
6907 6905 }
6908 6906
6909 6907 for (;;) {
6910 6908 phyi = ipsq->ipsq_phyint;
6911 6909 mp = ipsq_dq(ipsq);
6912 6910 mp_ipsq = (mp == NULL) ? NULL : (ipsq_t *)mp->b_next;
6913 6911
6914 6912 /*
6915 6913 * If we've changed to a new IPSQ, and the phyint associated
6916 6914 * with the old one has gone away, free the old IPSQ. Note
6917 6915 * that this cannot happen while the IPSQ is in a group.
6918 6916 */
6919 6917 if (mp_ipsq != ipsq && phyi == NULL) {
6920 6918 ASSERT(ipsq->ipsq_next == ipsq);
6921 6919 ASSERT(ipsq->ipsq_xop == &ipsq->ipsq_ownxop);
6922 6920 ipsq_delete(ipsq);
6923 6921 }
6924 6922
6925 6923 if (mp == NULL)
6926 6924 break;
6927 6925
6928 6926 q = mp->b_queue;
6929 6927 func = (ipsq_func_t)mp->b_prev;
6930 6928 ipsq = mp_ipsq;
6931 6929 mp->b_next = mp->b_prev = NULL;
6932 6930 mp->b_queue = NULL;
6933 6931
6934 6932 /*
6935 6933 * If 'q' is an conn queue, it is valid, since we did a
6936 6934 * a refhold on the conn at the start of the ioctl.
6937 6935 * If 'q' is an ill queue, it is valid, since close of an
6938 6936 * ill will clean up its IPSQ.
6939 6937 */
6940 6938 (*func)(ipsq, q, mp, NULL);
6941 6939 }
6942 6940 }
6943 6941
6944 6942 /*
6945 6943 * Used to start any igmp or mld timers that could not be started
6946 6944 * while holding ill_mcast_lock. The timers can't be started while holding
6947 6945 * the lock, since mld/igmp_start_timers may need to call untimeout()
6948 6946 * which can't be done while holding the lock which the timeout handler
6949 6947 * acquires. Otherwise
6950 6948 * there could be a deadlock since the timeout handlers
6951 6949 * mld_timeout_handler_per_ill/igmp_timeout_handler_per_ill also acquire
6952 6950 * ill_mcast_lock.
6953 6951 */
6954 6952 void
6955 6953 ill_mcast_timer_start(ip_stack_t *ipst)
6956 6954 {
6957 6955 int next;
6958 6956
6959 6957 mutex_enter(&ipst->ips_igmp_timer_lock);
6960 6958 next = ipst->ips_igmp_deferred_next;
6961 6959 ipst->ips_igmp_deferred_next = INFINITY;
6962 6960 mutex_exit(&ipst->ips_igmp_timer_lock);
6963 6961
6964 6962 if (next != INFINITY)
6965 6963 igmp_start_timers(next, ipst);
6966 6964
6967 6965 mutex_enter(&ipst->ips_mld_timer_lock);
6968 6966 next = ipst->ips_mld_deferred_next;
6969 6967 ipst->ips_mld_deferred_next = INFINITY;
6970 6968 mutex_exit(&ipst->ips_mld_timer_lock);
6971 6969
6972 6970 if (next != INFINITY)
6973 6971 mld_start_timers(next, ipst);
6974 6972 }
6975 6973
6976 6974 /*
6977 6975 * Start the current exclusive operation on `ipsq'; associate it with `ipif'
6978 6976 * and `ioccmd'.
6979 6977 */
6980 6978 void
6981 6979 ipsq_current_start(ipsq_t *ipsq, ipif_t *ipif, int ioccmd)
6982 6980 {
6983 6981 ill_t *ill = ipif->ipif_ill;
6984 6982 ipxop_t *ipx = ipsq->ipsq_xop;
6985 6983
6986 6984 ASSERT(IAM_WRITER_IPSQ(ipsq));
6987 6985 ASSERT(ipx->ipx_current_ipif == NULL);
6988 6986 ASSERT(ipx->ipx_current_ioctl == 0);
6989 6987
6990 6988 ipx->ipx_current_done = B_FALSE;
6991 6989 ipx->ipx_current_ioctl = ioccmd;
6992 6990 mutex_enter(&ipx->ipx_lock);
6993 6991 ipx->ipx_current_ipif = ipif;
6994 6992 mutex_exit(&ipx->ipx_lock);
6995 6993
6996 6994 /*
6997 6995 * Set IPIF_CHANGING on one or more ipifs associated with the
6998 6996 * current exclusive operation. IPIF_CHANGING prevents any new
6999 6997 * references to the ipif (so that the references will eventually
7000 6998 * drop to zero) and also prevents any "get" operations (e.g.,
7001 6999 * SIOCGLIFFLAGS) from being able to access the ipif until the
7002 7000 * operation has completed and the ipif is again in a stable state.
7003 7001 *
7004 7002 * For ioctls, IPIF_CHANGING is set on the ipif associated with the
7005 7003 * ioctl. For internal operations (where ioccmd is zero), all ipifs
7006 7004 * on the ill are marked with IPIF_CHANGING since it's unclear which
7007 7005 * ipifs will be affected.
7008 7006 *
7009 7007 * Note that SIOCLIFREMOVEIF is a special case as it sets
7010 7008 * IPIF_CONDEMNED internally after identifying the right ipif to
7011 7009 * operate on.
7012 7010 */
7013 7011 switch (ioccmd) {
7014 7012 case SIOCLIFREMOVEIF:
7015 7013 break;
7016 7014 case 0:
7017 7015 mutex_enter(&ill->ill_lock);
7018 7016 ipif = ipif->ipif_ill->ill_ipif;
7019 7017 for (; ipif != NULL; ipif = ipif->ipif_next)
7020 7018 ipif->ipif_state_flags |= IPIF_CHANGING;
7021 7019 mutex_exit(&ill->ill_lock);
7022 7020 break;
7023 7021 default:
7024 7022 mutex_enter(&ill->ill_lock);
7025 7023 ipif->ipif_state_flags |= IPIF_CHANGING;
7026 7024 mutex_exit(&ill->ill_lock);
7027 7025 }
7028 7026 }
7029 7027
7030 7028 /*
7031 7029 * Finish the current exclusive operation on `ipsq'. Usually, this will allow
7032 7030 * the next exclusive operation to begin once we ipsq_exit(). However, if
7033 7031 * pending DLPI operations remain, then we will wait for the queue to drain
7034 7032 * before allowing the next exclusive operation to begin. This ensures that
7035 7033 * DLPI operations from one exclusive operation are never improperly processed
7036 7034 * as part of a subsequent exclusive operation.
7037 7035 */
7038 7036 void
7039 7037 ipsq_current_finish(ipsq_t *ipsq)
7040 7038 {
7041 7039 ipxop_t *ipx = ipsq->ipsq_xop;
7042 7040 t_uscalar_t dlpi_pending = DL_PRIM_INVAL;
7043 7041 ipif_t *ipif = ipx->ipx_current_ipif;
7044 7042
7045 7043 ASSERT(IAM_WRITER_IPSQ(ipsq));
7046 7044
7047 7045 /*
7048 7046 * For SIOCLIFREMOVEIF, the ipif has been already been blown away
7049 7047 * (but in that case, IPIF_CHANGING will already be clear and no
7050 7048 * pending DLPI messages can remain).
7051 7049 */
7052 7050 if (ipx->ipx_current_ioctl != SIOCLIFREMOVEIF) {
7053 7051 ill_t *ill = ipif->ipif_ill;
7054 7052
7055 7053 mutex_enter(&ill->ill_lock);
7056 7054 dlpi_pending = ill->ill_dlpi_pending;
7057 7055 if (ipx->ipx_current_ioctl == 0) {
7058 7056 ipif = ill->ill_ipif;
7059 7057 for (; ipif != NULL; ipif = ipif->ipif_next)
7060 7058 ipif->ipif_state_flags &= ~IPIF_CHANGING;
7061 7059 } else {
7062 7060 ipif->ipif_state_flags &= ~IPIF_CHANGING;
7063 7061 }
7064 7062 mutex_exit(&ill->ill_lock);
7065 7063 }
7066 7064
7067 7065 ASSERT(!ipx->ipx_current_done);
7068 7066 ipx->ipx_current_done = B_TRUE;
7069 7067 ipx->ipx_current_ioctl = 0;
7070 7068 if (dlpi_pending == DL_PRIM_INVAL) {
7071 7069 mutex_enter(&ipx->ipx_lock);
7072 7070 ipx->ipx_current_ipif = NULL;
7073 7071 mutex_exit(&ipx->ipx_lock);
7074 7072 }
7075 7073 }
7076 7074
/*
 * The ill is closing. Flush all messages on the ipsq that originated
 * from this ill. Usually there won't be any messages on the ipsq_xopq_mphead
 * for this ill since ipsq_enter could not have entered until then.
 * New messages can't be queued since the CONDEMNED flag is set.
 */
static void
ipsq_flush(ill_t *ill)
{
	queue_t *q;
	mblk_t *prev;
	mblk_t *mp;
	mblk_t *mp_next;
	ipxop_t *ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop;

	ASSERT(IAM_WRITER_ILL(ill));

	/*
	 * Flush any messages sent up by the driver.  Walk the singly-linked
	 * ipx_mphead list and free every mblk whose queue belongs to this
	 * ill, keeping `prev' pointing at the last retained element so we
	 * can unlink in place.
	 */
	mutex_enter(&ipx->ipx_lock);
	for (prev = NULL, mp = ipx->ipx_mphead; mp != NULL; mp = mp_next) {
		mp_next = mp->b_next;
		q = mp->b_queue;
		if (q == ill->ill_rq || q == ill->ill_wq) {
			/* dequeue mp */
			if (prev == NULL)
				ipx->ipx_mphead = mp->b_next;
			else
				prev->b_next = mp->b_next;
			/* fix up the tail pointer if we removed the tail */
			if (ipx->ipx_mptail == mp) {
				ASSERT(mp_next == NULL);
				ipx->ipx_mptail = prev;
			}
			inet_freemsg(mp);
		} else {
			prev = mp;
		}
	}
	mutex_exit(&ipx->ipx_lock);
	/* Also discard this ill's pending and queued exclusive-op messages */
	(void) ipsq_pending_mp_cleanup(ill, NULL);
	ipsq_xopq_mp_cleanup(ill, NULL);
}
7120 7118
/*
 * Parse an ifreq or lifreq struct coming down ioctls and refhold
 * and return the associated ipif.
 * Return value:
 * Non zero: An error has occurred. ci may not be filled out.
 * zero : ci is filled out with the ioctl cmd in ci.ci_name, and
 * a held ipif in ci.ci_ipif.
 */
int
ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
    cmd_info_t *ci)
{
	char *name;
	struct ifreq *ifr;
	struct lifreq *lifr;
	ipif_t *ipif = NULL;
	ill_t *ill;
	conn_t *connp;
	boolean_t isv6;
	int err;
	mblk_t *mp1;
	zoneid_t zoneid;
	ip_stack_t *ipst;

	if (q->q_next != NULL) {
		/* The ioctl came down an ill (module) stream. */
		ill = (ill_t *)q->q_ptr;
		isv6 = ill->ill_isv6;
		connp = NULL;
		zoneid = ALL_ZONES;
		ipst = ill->ill_ipst;
	} else {
		/* The ioctl came down a conn (driver) stream. */
		ill = NULL;
		connp = Q_TO_CONN(q);
		isv6 = (connp->conn_family == AF_INET6);
		zoneid = connp->conn_zoneid;
		if (zoneid == GLOBAL_ZONEID) {
			/* global zone can access ipifs in all zones */
			zoneid = ALL_ZONES;
		}
		ipst = connp->conn_netstack->netstack_ip;
	}

	/* Has been checked in ip_wput_nondata */
	mp1 = mp->b_cont->b_cont;

	if (ipip->ipi_cmd_type == IF_CMD) {
		/* This is an old style SIOC[GS]IF* command */
		ifr = (struct ifreq *)mp1->b_rptr;
		/*
		 * Null terminate the string to protect against buffer
		 * overrun. String was generated by user code and may not
		 * be trusted.
		 */
		ifr->ifr_name[IFNAMSIZ - 1] = '\0';
		name = ifr->ifr_name;
		ci->ci_sin = (sin_t *)&ifr->ifr_addr;
		ci->ci_sin6 = NULL;
		ci->ci_lifr = (struct lifreq *)ifr;
	} else {
		/* This is a new style SIOC[GS]LIF* command */
		ASSERT(ipip->ipi_cmd_type == LIF_CMD);
		lifr = (struct lifreq *)mp1->b_rptr;
		/*
		 * Null terminate the string to protect against buffer
		 * overrun. String was generated by user code and may not
		 * be trusted.
		 */
		lifr->lifr_name[LIFNAMSIZ - 1] = '\0';
		name = lifr->lifr_name;
		ci->ci_sin = (sin_t *)&lifr->lifr_addr;
		ci->ci_sin6 = (sin6_t *)&lifr->lifr_addr;
		ci->ci_lifr = lifr;
	}

	if (ipip->ipi_cmd == SIOCSLIFNAME) {
		/*
		 * SIOCSLIFNAME is only valid on an ill stream; fail the
		 * ioctl if it came down a conn stream.
		 */
		if (ill == NULL) {
			/* Not an ill queue; fail with ENXIO. */
			return (ENXIO);
		}
		ipif = ill->ill_ipif;
		ipif_refhold(ipif);
	} else {
		/*
		 * Ensure that ioctls don't see any internal state changes
		 * caused by set ioctls by deferring them if IPIF_CHANGING is
		 * set.
		 */
		ipif = ipif_lookup_on_name_async(name, mi_strlen(name),
		    isv6, zoneid, q, mp, ip_process_ioctl, &err, ipst);
		if (ipif == NULL) {
			/*
			 * EINPROGRESS means the lookup deferred the ioctl;
			 * it will be redriven later via ip_process_ioctl.
			 */
			if (err == EINPROGRESS)
				return (err);
			err = 0;	/* Ensure we don't use it below */
		}
	}

	/*
	 * Old style [GS]IFCMD does not admit IPv6 ipif
	 */
	if (ipif != NULL && ipif->ipif_isv6 && ipip->ipi_cmd_type == IF_CMD) {
		ipif_refrele(ipif);
		return (ENXIO);
	}

	if (ipif == NULL && ill != NULL && ill->ill_ipif != NULL &&
	    name[0] == '\0') {
		/*
		 * Handle a SIOC?IF* ioctl with a null name
		 * during plumb (on the ill queue before the I_PLINK).
		 */
		ipif = ill->ill_ipif;
		ipif_refhold(ipif);
	}

	if (ipif == NULL)
		return (ENXIO);

	DTRACE_PROBE4(ipif__ioctl, char *, "ip_extract_lifreq",
	    int, ipip->ipi_cmd, ill_t *, ipif->ipif_ill, ipif_t *, ipif);

	/* Success: hand the held ipif back to the caller. */
	ci->ci_ipif = ipif;
	return (0);
}
7251 7249
7252 7250 /*
7253 7251 * Return the total number of ipifs.
7254 7252 */
7255 7253 static uint_t
7256 7254 ip_get_numifs(zoneid_t zoneid, ip_stack_t *ipst)
7257 7255 {
7258 7256 uint_t numifs = 0;
7259 7257 ill_t *ill;
7260 7258 ill_walk_context_t ctx;
7261 7259 ipif_t *ipif;
7262 7260
7263 7261 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
7264 7262 ill = ILL_START_WALK_V4(&ctx, ipst);
7265 7263 for (; ill != NULL; ill = ill_next(&ctx, ill)) {
7266 7264 if (IS_UNDER_IPMP(ill))
7267 7265 continue;
7268 7266 for (ipif = ill->ill_ipif; ipif != NULL;
7269 7267 ipif = ipif->ipif_next) {
7270 7268 if (ipif->ipif_zoneid == zoneid ||
7271 7269 ipif->ipif_zoneid == ALL_ZONES)
7272 7270 numifs++;
7273 7271 }
7274 7272 }
7275 7273 rw_exit(&ipst->ips_ill_g_lock);
7276 7274 return (numifs);
7277 7275 }
7278 7276
/*
 * Return the number of ipifs matching `family' (AF_INET, AF_INET6, or any
 * other value for both) that pass the LIFC_* filters in `lifn_flags' and
 * are visible from `zoneid'.
 */
static uint_t
ip_get_numlifs(int family, int lifn_flags, zoneid_t zoneid, ip_stack_t *ipst)
{
	uint_t numifs = 0;
	ill_t *ill;
	ipif_t *ipif;
	ill_walk_context_t ctx;

	ip1dbg(("ip_get_numlifs(%d %u %d)\n", family, lifn_flags, (int)zoneid));

	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	if (family == AF_INET)
		ill = ILL_START_WALK_V4(&ctx, ipst);
	else if (family == AF_INET6)
		ill = ILL_START_WALK_V6(&ctx, ipst);
	else
		ill = ILL_START_WALK_ALL(&ctx, ipst);

	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		/* underlying IPMP ills are counted only on request */
		if (IS_UNDER_IPMP(ill) && !(lifn_flags & LIFC_UNDER_IPMP))
			continue;

		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			/* IPIF_NOXMIT ipifs are omitted unless requested */
			if ((ipif->ipif_flags & IPIF_NOXMIT) &&
			    !(lifn_flags & LIFC_NOXMIT))
				continue;
			/* likewise for IPIF_TEMPORARY ipifs */
			if ((ipif->ipif_flags & IPIF_TEMPORARY) &&
			    !(lifn_flags & LIFC_TEMPORARY))
				continue;
			/*
			 * LIFC_EXTERNAL_SOURCE restricts the count to ipifs
			 * usable as an external source address: up, not
			 * loopback, and none of NOXMIT/NOLOCAL/DEPRECATED.
			 */
			if (((ipif->ipif_flags &
			    (IPIF_NOXMIT|IPIF_NOLOCAL|
			    IPIF_DEPRECATED)) ||
			    IS_LOOPBACK(ill) ||
			    !(ipif->ipif_flags & IPIF_UP)) &&
			    (lifn_flags & LIFC_EXTERNAL_SOURCE))
				continue;

			/*
			 * Zone visibility: count the ipif if it is in our
			 * zone or ALL_ZONES, or if the global zone asked
			 * for all zones via LIFC_ALLZONES.
			 */
			if (zoneid != ipif->ipif_zoneid &&
			    ipif->ipif_zoneid != ALL_ZONES &&
			    (zoneid != GLOBAL_ZONEID ||
			    !(lifn_flags & LIFC_ALLZONES)))
				continue;

			numifs++;
		}
	}
	rw_exit(&ipst->ips_ill_g_lock);
	return (numifs);
}
7332 7330
/*
 * Count the other ills in `ill''s usesrc group, i.e. the interfaces using
 * an address hosted on `ill' as a source address.  Only a group head
 * (ill_usesrc_ifindex == 0) with a non-empty group yields a non-zero count.
 */
uint_t
ip_get_lifsrcofnum(ill_t *ill)
{
	uint_t numifs = 0;
	ill_t *ill_head = ill;
	ip_stack_t *ipst = ill->ill_ipst;

	/*
	 * ill_g_usesrc_lock protects ill_usesrc_grp_next, for example, some
	 * other thread may be trying to relink the ILLs in this usesrc group
	 * and adjusting the ill_usesrc_grp_next pointers
	 */
	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER);
	if ((ill->ill_usesrc_ifindex == 0) &&
	    (ill->ill_usesrc_grp_next != NULL)) {
		/* walk the group list until it wraps back to the head */
		for (; (ill != NULL) && (ill->ill_usesrc_grp_next != ill_head);
		    ill = ill->ill_usesrc_grp_next)
			numifs++;
	}
	rw_exit(&ipst->ips_ill_g_usesrc_lock);

	return (numifs);
}
7356 7354
7357 7355 /* Null values are passed in for ipif, sin, and ifreq */
7358 7356 /* ARGSUSED */
7359 7357 int
7360 7358 ip_sioctl_get_ifnum(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
7361 7359 mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
7362 7360 {
7363 7361 int *nump;
7364 7362 conn_t *connp = Q_TO_CONN(q);
7365 7363
7366 7364 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */
7367 7365
7368 7366 /* Existence of b_cont->b_cont checked in ip_wput_nondata */
7369 7367 nump = (int *)mp->b_cont->b_cont->b_rptr;
7370 7368
7371 7369 *nump = ip_get_numifs(connp->conn_zoneid,
7372 7370 connp->conn_netstack->netstack_ip);
7373 7371 ip1dbg(("ip_sioctl_get_ifnum numifs %d", *nump));
7374 7372 return (0);
7375 7373 }
7376 7374
7377 7375 /* Null values are passed in for ipif, sin, and ifreq */
7378 7376 /* ARGSUSED */
7379 7377 int
7380 7378 ip_sioctl_get_lifnum(ipif_t *dummy_ipif, sin_t *dummy_sin,
7381 7379 queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
7382 7380 {
7383 7381 struct lifnum *lifn;
7384 7382 mblk_t *mp1;
7385 7383 conn_t *connp = Q_TO_CONN(q);
7386 7384
7387 7385 ASSERT(q->q_next == NULL); /* not a valid ioctl for ip as a module */
7388 7386
7389 7387 /* Existence checked in ip_wput_nondata */
7390 7388 mp1 = mp->b_cont->b_cont;
7391 7389
7392 7390 lifn = (struct lifnum *)mp1->b_rptr;
7393 7391 switch (lifn->lifn_family) {
7394 7392 case AF_UNSPEC:
7395 7393 case AF_INET:
7396 7394 case AF_INET6:
7397 7395 break;
7398 7396 default:
7399 7397 return (EAFNOSUPPORT);
7400 7398 }
7401 7399
7402 7400 lifn->lifn_count = ip_get_numlifs(lifn->lifn_family, lifn->lifn_flags,
7403 7401 connp->conn_zoneid, connp->conn_netstack->netstack_ip);
7404 7402 ip1dbg(("ip_sioctl_get_lifnum numifs %d", lifn->lifn_count));
7405 7403 return (0);
7406 7404 }
7407 7405
/*
 * SIOCGIFCONF handler: copy out the name and IPv4 address of every visible
 * ipif into the caller-supplied buffer.  Handles both the I_STR and the
 * TRANSPARENT forms of the ioctl (see the block comment below).
 */
/* ARGSUSED */
int
ip_sioctl_get_ifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
    mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
{
	STRUCT_HANDLE(ifconf, ifc);
	mblk_t *mp1;
	struct iocblk *iocp;
	struct ifreq *ifr;
	ill_walk_context_t ctx;
	ill_t *ill;
	ipif_t *ipif;
	struct sockaddr_in *sin;
	int32_t ifclen;
	zoneid_t zoneid;
	ip_stack_t *ipst = CONNQ_TO_IPST(q);

	ASSERT(q->q_next == NULL); /* not valid ioctls for ip as a module */

	ip1dbg(("ip_sioctl_get_ifconf"));
	/* Existence verified in ip_wput_nondata */
	mp1 = mp->b_cont->b_cont;
	iocp = (struct iocblk *)mp->b_rptr;
	zoneid = Q_TO_CONN(q)->conn_zoneid;

	/*
	 * The original SIOCGIFCONF passed in a struct ifconf which specified
	 * the user buffer address and length into which the list of struct
	 * ifreqs was to be copied. Since AT&T Streams does not seem to
	 * allow M_COPYOUT to be used in conjunction with I_STR IOCTLS,
	 * the SIOCGIFCONF operation was redefined to simply provide
	 * a large output buffer into which we are supposed to jam the ifreq
	 * array. The same ioctl command code was used, despite the fact that
	 * both the applications and the kernel code had to change, thus making
	 * it impossible to support both interfaces.
	 *
	 * For reasons not good enough to try to explain, the following
	 * algorithm is used for deciding what to do with one of these:
	 * If the IOCTL comes in as an I_STR, it is assumed to be of the new
	 * form with the output buffer coming down as the continuation message.
	 * If it arrives as a TRANSPARENT IOCTL, it is assumed to be old style,
	 * and we have to copy in the ifconf structure to find out how big the
	 * output buffer is and where to copy out to. Sure no problem...
	 *
	 */
	STRUCT_SET_HANDLE(ifc, iocp->ioc_flag, NULL);
	if ((mp1->b_wptr - mp1->b_rptr) == STRUCT_SIZE(ifc)) {
		int numifs = 0;
		size_t ifc_bufsize;

		/*
		 * Must be (better be!) continuation of a TRANSPARENT
		 * IOCTL. We just copied in the ifconf structure.
		 */
		STRUCT_SET_HANDLE(ifc, iocp->ioc_flag,
		    (struct ifconf *)mp1->b_rptr);

		/*
		 * Allocate a buffer to hold requested information.
		 *
		 * If ifc_len is larger than what is needed, we only
		 * allocate what we will use.
		 *
		 * If ifc_len is smaller than what is needed, return
		 * EINVAL.
		 *
		 * XXX: the ill_t structure can have 2 counters, for
		 * v4 and v6 (not just ill_ipif_up_count) to store the
		 * number of interfaces for a device, so we don't need
		 * to count them here...
		 */
		numifs = ip_get_numifs(zoneid, ipst);

		ifclen = STRUCT_FGET(ifc, ifc_len);
		ifc_bufsize = numifs * sizeof (struct ifreq);
		if (ifc_bufsize > ifclen) {
			if (iocp->ioc_cmd == O_SIOCGIFCONF) {
				/* old behaviour */
				return (EINVAL);
			} else {
				ifc_bufsize = ifclen;
			}
		}

		mp1 = mi_copyout_alloc(q, mp,
		    STRUCT_FGETP(ifc, ifc_buf), ifc_bufsize, B_FALSE);
		if (mp1 == NULL)
			return (ENOMEM);

		mp1->b_wptr = mp1->b_rptr + ifc_bufsize;
	}
	bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr);
	/*
	 * the SIOCGIFCONF ioctl only knows about
	 * IPv4 addresses, so don't try to tell
	 * it about interfaces with IPv6-only
	 * addresses. (Last parm 'isv6' is B_FALSE)
	 */

	ifr = (struct ifreq *)mp1->b_rptr;

	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V4(&ctx, ipst);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		/* underlying IPMP interfaces are not reported */
		if (IS_UNDER_IPMP(ill))
			continue;
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if (zoneid != ipif->ipif_zoneid &&
			    ipif->ipif_zoneid != ALL_ZONES)
				continue;
			/* stop (or fail) once the output buffer is full */
			if ((uchar_t *)&ifr[1] > mp1->b_wptr) {
				if (iocp->ioc_cmd == O_SIOCGIFCONF) {
					/* old behaviour */
					rw_exit(&ipst->ips_ill_g_lock);
					return (EINVAL);
				} else {
					goto if_copydone;
				}
			}
			ipif_get_name(ipif, ifr->ifr_name,
			    sizeof (ifr->ifr_name));
			sin = (sin_t *)&ifr->ifr_addr;
			*sin = sin_null;
			sin->sin_family = AF_INET;
			sin->sin_addr.s_addr = ipif->ipif_lcl_addr;
			ifr++;
		}
	}
if_copydone:
	rw_exit(&ipst->ips_ill_g_lock);
	mp1->b_wptr = (uchar_t *)ifr;

	/* report the number of bytes actually filled in */
	if (STRUCT_BUF(ifc) != NULL) {
		STRUCT_FSET(ifc, ifc_len,
		    (int)((uchar_t *)ifr - mp1->b_rptr));
	}
	return (0);
}
7547 7545
/*
 * Get the interfaces using the address hosted on the interface passed in,
 * as a source address
 */
/* ARGSUSED */
int
ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
    mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
{
	mblk_t *mp1;
	ill_t *ill, *ill_head;
	ipif_t *ipif, *orig_ipif;
	int numlifs = 0;
	size_t lifs_bufsize, lifsmaxlen;
	struct lifreq *lifr;
	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
	uint_t ifindex;
	zoneid_t zoneid;
	boolean_t isv6 = B_FALSE;
	struct sockaddr_in *sin;
	struct sockaddr_in6 *sin6;
	STRUCT_HANDLE(lifsrcof, lifs);
	ip_stack_t *ipst;

	ipst = CONNQ_TO_IPST(q);

	ASSERT(q->q_next == NULL);

	zoneid = Q_TO_CONN(q)->conn_zoneid;

	/* Existence verified in ip_wput_nondata */
	mp1 = mp->b_cont->b_cont;

	/*
	 * Must be (better be!) continuation of a TRANSPARENT
	 * IOCTL. We just copied in the lifsrcof structure.
	 */
	STRUCT_SET_HANDLE(lifs, iocp->ioc_flag,
	    (struct lifsrcof *)mp1->b_rptr);

	if (MBLKL(mp1) != STRUCT_SIZE(lifs))
		return (EINVAL);

	/* Locate the ipif named by the caller-supplied ifindex. */
	ifindex = STRUCT_FGET(lifs, lifs_ifindex);
	isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6;
	ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, ipst);
	if (ipif == NULL) {
		ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n",
		    ifindex));
		return (ENXIO);
	}

	/* Allocate a buffer to hold requested information */
	numlifs = ip_get_lifsrcofnum(ipif->ipif_ill);
	lifs_bufsize = numlifs * sizeof (struct lifreq);
	lifsmaxlen = STRUCT_FGET(lifs, lifs_maxlen);
	/* The actual size needed is always returned in lifs_len */
	STRUCT_FSET(lifs, lifs_len, lifs_bufsize);

	/* If the amount we need is more than what is passed in, abort */
	if (lifs_bufsize > lifsmaxlen || lifs_bufsize == 0) {
		ipif_refrele(ipif);
		return (0);
	}

	mp1 = mi_copyout_alloc(q, mp,
	    STRUCT_FGETP(lifs, lifs_buf), lifs_bufsize, B_FALSE);
	if (mp1 == NULL) {
		ipif_refrele(ipif);
		return (ENOMEM);
	}

	mp1->b_wptr = mp1->b_rptr + lifs_bufsize;
	bzero(mp1->b_rptr, lifs_bufsize);

	lifr = (struct lifreq *)mp1->b_rptr;

	ill = ill_head = ipif->ipif_ill;
	orig_ipif = ipif;

	/* ill_g_usesrc_lock protects ill_usesrc_grp_next */
	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER);
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);

	/*
	 * Walk the usesrc group, filling in one lifreq (name, address,
	 * prefix length) per member ill, until the group wraps back to
	 * the head or the output buffer is exhausted.
	 */
	ill = ill->ill_usesrc_grp_next; /* start from next ill */
	for (; (ill != NULL) && (ill != ill_head);
	    ill = ill->ill_usesrc_grp_next) {

		if ((uchar_t *)&lifr[1] > mp1->b_wptr)
			break;

		ipif = ill->ill_ipif;
		ipif_get_name(ipif, lifr->lifr_name, sizeof (lifr->lifr_name));
		if (ipif->ipif_isv6) {
			sin6 = (sin6_t *)&lifr->lifr_addr;
			*sin6 = sin6_null;
			sin6->sin6_family = AF_INET6;
			sin6->sin6_addr = ipif->ipif_v6lcl_addr;
			lifr->lifr_addrlen = ip_mask_to_plen_v6(
			    &ipif->ipif_v6net_mask);
		} else {
			sin = (sin_t *)&lifr->lifr_addr;
			*sin = sin_null;
			sin->sin_family = AF_INET;
			sin->sin_addr.s_addr = ipif->ipif_lcl_addr;
			lifr->lifr_addrlen = ip_mask_to_plen(
			    ipif->ipif_net_mask);
		}
		lifr++;
	}
	rw_exit(&ipst->ips_ill_g_lock);
	rw_exit(&ipst->ips_ill_g_usesrc_lock);
	ipif_refrele(orig_ipif);
	mp1->b_wptr = (uchar_t *)lifr;
	/* report the number of bytes actually filled in */
	STRUCT_FSET(lifs, lifs_len, (int)((uchar_t *)lifr - mp1->b_rptr));

	return (0);
}
7666 7664
/*
 * SIOCGLIFCONF handler: the extended, address-family-aware version of
 * SIOCGIFCONF (see the block comment below for the LIFC_* flag semantics).
 */
/* ARGSUSED */
int
ip_sioctl_get_lifconf(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q,
    mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq)
{
	mblk_t *mp1;
	int list;
	ill_t *ill;
	ipif_t *ipif;
	int flags;
	int numlifs = 0;
	size_t lifc_bufsize;
	struct lifreq *lifr;
	sa_family_t family;
	struct sockaddr_in *sin;
	struct sockaddr_in6 *sin6;
	ill_walk_context_t ctx;
	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
	int32_t lifclen;
	zoneid_t zoneid;
	STRUCT_HANDLE(lifconf, lifc);
	ip_stack_t *ipst = CONNQ_TO_IPST(q);

	ip1dbg(("ip_sioctl_get_lifconf"));

	ASSERT(q->q_next == NULL);

	zoneid = Q_TO_CONN(q)->conn_zoneid;

	/* Existence verified in ip_wput_nondata */
	mp1 = mp->b_cont->b_cont;

	/*
	 * An extended version of SIOCGIFCONF that takes an
	 * additional address family and flags field.
	 * AF_UNSPEC retrieve both IPv4 and IPv6.
	 * Unless LIFC_NOXMIT is specified the IPIF_NOXMIT
	 * interfaces are omitted.
	 * Similarly, IPIF_TEMPORARY interfaces are omitted
	 * unless LIFC_TEMPORARY is specified.
	 * If LIFC_EXTERNAL_SOURCE is specified, IPIF_NOXMIT,
	 * IPIF_NOLOCAL, PHYI_LOOPBACK, IPIF_DEPRECATED and
	 * not IPIF_UP interfaces are omitted. LIFC_EXTERNAL_SOURCE
	 * has priority over LIFC_NOXMIT.
	 */
	STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, NULL);

	if ((mp1->b_wptr - mp1->b_rptr) != STRUCT_SIZE(lifc))
		return (EINVAL);

	/*
	 * Must be (better be!) continuation of a TRANSPARENT
	 * IOCTL. We just copied in the lifconf structure.
	 */
	STRUCT_SET_HANDLE(lifc, iocp->ioc_flag, (struct lifconf *)mp1->b_rptr);

	family = STRUCT_FGET(lifc, lifc_family);
	flags = STRUCT_FGET(lifc, lifc_flags);

	switch (family) {
	case AF_UNSPEC:
		/*
		 * walk all ILL's.
		 */
		list = MAX_G_HEADS;
		break;
	case AF_INET:
		/*
		 * walk only IPV4 ILL's.
		 */
		list = IP_V4_G_HEAD;
		break;
	case AF_INET6:
		/*
		 * walk only IPV6 ILL's.
		 */
		list = IP_V6_G_HEAD;
		break;
	default:
		return (EAFNOSUPPORT);
	}

	/*
	 * Allocate a buffer to hold requested information.
	 *
	 * If lifc_len is larger than what is needed, we only
	 * allocate what we will use.
	 *
	 * If lifc_len is smaller than what is needed, return
	 * EINVAL.
	 */
	numlifs = ip_get_numlifs(family, flags, zoneid, ipst);
	lifc_bufsize = numlifs * sizeof (struct lifreq);
	lifclen = STRUCT_FGET(lifc, lifc_len);
	if (lifc_bufsize > lifclen) {
		if (iocp->ioc_cmd == O_SIOCGLIFCONF)
			return (EINVAL);
		else
			lifc_bufsize = lifclen;
	}

	mp1 = mi_copyout_alloc(q, mp,
	    STRUCT_FGETP(lifc, lifc_buf), lifc_bufsize, B_FALSE);
	if (mp1 == NULL)
		return (ENOMEM);

	mp1->b_wptr = mp1->b_rptr + lifc_bufsize;
	bzero(mp1->b_rptr, mp1->b_wptr - mp1->b_rptr);

	lifr = (struct lifreq *)mp1->b_rptr;

	/*
	 * Walk the selected ills, applying the same LIFC_* filters that
	 * ip_get_numlifs() used to size the buffer, and fill in one
	 * lifreq (name, type, address, prefix length) per matching ipif.
	 */
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	ill = ill_first(list, list, &ctx, ipst);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		if (IS_UNDER_IPMP(ill) && !(flags & LIFC_UNDER_IPMP))
			continue;

		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if ((ipif->ipif_flags & IPIF_NOXMIT) &&
			    !(flags & LIFC_NOXMIT))
				continue;

			if ((ipif->ipif_flags & IPIF_TEMPORARY) &&
			    !(flags & LIFC_TEMPORARY))
				continue;

			if (((ipif->ipif_flags &
			    (IPIF_NOXMIT|IPIF_NOLOCAL|
			    IPIF_DEPRECATED)) ||
			    IS_LOOPBACK(ill) ||
			    !(ipif->ipif_flags & IPIF_UP)) &&
			    (flags & LIFC_EXTERNAL_SOURCE))
				continue;

			if (zoneid != ipif->ipif_zoneid &&
			    ipif->ipif_zoneid != ALL_ZONES &&
			    (zoneid != GLOBAL_ZONEID ||
			    !(flags & LIFC_ALLZONES)))
				continue;

			/* stop (or fail) once the output buffer is full */
			if ((uchar_t *)&lifr[1] > mp1->b_wptr) {
				if (iocp->ioc_cmd == O_SIOCGLIFCONF) {
					rw_exit(&ipst->ips_ill_g_lock);
					return (EINVAL);
				} else {
					goto lif_copydone;
				}
			}

			ipif_get_name(ipif, lifr->lifr_name,
			    sizeof (lifr->lifr_name));
			lifr->lifr_type = ill->ill_type;
			if (ipif->ipif_isv6) {
				sin6 = (sin6_t *)&lifr->lifr_addr;
				*sin6 = sin6_null;
				sin6->sin6_family = AF_INET6;
				sin6->sin6_addr =
				    ipif->ipif_v6lcl_addr;
				lifr->lifr_addrlen =
				    ip_mask_to_plen_v6(
				    &ipif->ipif_v6net_mask);
			} else {
				sin = (sin_t *)&lifr->lifr_addr;
				*sin = sin_null;
				sin->sin_family = AF_INET;
				sin->sin_addr.s_addr =
				    ipif->ipif_lcl_addr;
				lifr->lifr_addrlen =
				    ip_mask_to_plen(
				    ipif->ipif_net_mask);
			}
			lifr++;
		}
	}
lif_copydone:
	rw_exit(&ipst->ips_ill_g_lock);

	mp1->b_wptr = (uchar_t *)lifr;
	/* report the number of bytes actually filled in */
	if (STRUCT_BUF(lifc) != NULL) {
		STRUCT_FSET(lifc, lifc_len,
		    (int)((uchar_t *)lifr - mp1->b_rptr));
	}
	return (0);
}
7852 7850
/*
 * Handle the SIOC[GS]IP6ADDRPOLICY ioctls, which read or replace the IPv6
 * address-selection policy table.  Both are I_STR-only ioctls; TRANSPARENT
 * invocations are rejected with EINVAL.
 */
static void
ip_sioctl_ip6addrpolicy(queue_t *q, mblk_t *mp)
{
	ip6_asp_t *table;
	size_t table_size;
	mblk_t *data_mp;
	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
	ip_stack_t *ipst;

	/* resolve the IP stack from either a conn or an ill queue */
	if (q->q_next == NULL)
		ipst = CONNQ_TO_IPST(q);
	else
		ipst = ILLQ_TO_IPST(q);

	/* These two ioctls are I_STR only */
	if (iocp->ioc_count == TRANSPARENT) {
		miocnak(q, mp, 0, EINVAL);
		return;
	}

	data_mp = mp->b_cont;
	if (data_mp == NULL) {
		/* The user passed us a NULL argument */
		table = NULL;
		table_size = iocp->ioc_count;
	} else {
		/*
		 * The user provided a table. The stream head
		 * may have copied in the user data in chunks,
		 * so make sure everything is pulled up
		 * properly.
		 */
		if (MBLKL(data_mp) < iocp->ioc_count) {
			mblk_t *new_data_mp;
			if ((new_data_mp = msgpullup(data_mp, -1)) ==
			    NULL) {
				miocnak(q, mp, 0, ENOMEM);
				return;
			}
			freemsg(data_mp);
			data_mp = new_data_mp;
			mp->b_cont = data_mp;
		}
		table = (ip6_asp_t *)data_mp->b_rptr;
		table_size = iocp->ioc_count;
	}

	switch (iocp->ioc_cmd) {
	case SIOCGIP6ADDRPOLICY:
		iocp->ioc_rval = ip6_asp_get(table, table_size, ipst);
		if (iocp->ioc_rval == -1)
			iocp->ioc_error = EINVAL;
#if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4
		else if (table != NULL &&
		    (iocp->ioc_flag & IOC_MODELS) == IOC_ILP32) {
			ip6_asp_t *src = table;
			ip6_asp32_t *dst = (void *)table;
			int count = table_size / sizeof (ip6_asp_t);
			int i;

			/*
			 * We need to do an in-place shrink of the array
			 * to match the alignment attributes of the
			 * 32-bit ABI looking at it.
			 */
			/* LINTED: logical expression always true: op "||" */
			ASSERT(sizeof (*src) > sizeof (*dst));
			/* entry 0 is already in place; pack the rest down */
			for (i = 1; i < count; i++)
				bcopy(src + i, dst + i, sizeof (*dst));
		}
#endif
		break;

	case SIOCSIP6ADDRPOLICY:
		ASSERT(mp->b_prev == NULL);
		/* stash the reply queue for the asynchronous completion */
		mp->b_prev = (void *)q;
#if defined(_SYSCALL32_IMPL) && _LONG_LONG_ALIGNMENT_32 == 4
		/*
		 * We pass in the datamodel here so that the ip6_asp_replace()
		 * routine can handle converting from 32-bit to native formats
		 * where necessary.
		 *
		 * A better way to handle this might be to convert the inbound
		 * data structure here, and hang it off a new 'mp'; thus the
		 * ip6_asp_replace() logic would always be dealing with native
		 * format data structures..
		 *
		 * (An even simpler way to handle these ioctls is to just
		 * add a 32-bit trailing 'pad' field to the ip6_asp_t structure
		 * and just recompile everything that depends on it.)
		 */
#endif
		ip6_asp_replace(mp, table, table_size, B_FALSE, ipst,
		    iocp->ioc_flag & IOC_MODELS);
		/* ip6_asp_replace() takes care of the reply; don't ack here */
		return;
	}

	DB_TYPE(mp) = (iocp->ioc_error == 0) ? M_IOCACK : M_IOCNAK;
	qreply(q, mp);
}
7953 7951
/*
 * Handler for the SIOCGDSTINFO ioctl (I_STR only): for each struct
 * dstinforeq packed into the data mblk, report whether the destination
 * is reachable from this zone and, if so, the source address the stack
 * would select for it, plus the scope and policy label of both addresses
 * (RFC 3484-style information; presumably consumed by userland address
 * ordering such as getaddrinfo() -- confirm against callers).
 * Always acks/naks `mp' itself; results are written in place into the
 * dstinforeq array.
 */
static void
ip_sioctl_dstinfo(queue_t *q, mblk_t *mp)
{
	mblk_t		*data_mp;
	struct dstinforeq	*dir;
	uint8_t		*end, *cur;
	in6_addr_t	*daddr, *saddr;
	ipaddr_t	v4daddr;
	ire_t		*ire;
	ipaddr_t	v4setsrc;
	in6_addr_t	v6setsrc;
	char		*slabel, *dlabel;
	boolean_t	isipv4;
	int		match_ire;
	ill_t		*dst_ill;
	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
	conn_t		*connp = Q_TO_CONN(q);
	zoneid_t	zoneid = IPCL_ZONEID(connp);
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint64_t	ipif_flags;

	ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */

	/*
	 * This ioctl is I_STR only, and must have a
	 * data mblk following the M_IOCTL mblk.
	 */
	data_mp = mp->b_cont;
	if (iocp->ioc_count == TRANSPARENT || data_mp == NULL) {
		miocnak(q, mp, 0, EINVAL);
		return;
	}

	/*
	 * If the payload is split across mblks, coalesce it so the
	 * dstinforeq array below can be walked with simple pointers.
	 */
	if (MBLKL(data_mp) < iocp->ioc_count) {
		mblk_t *new_data_mp;

		if ((new_data_mp = msgpullup(data_mp, -1)) == NULL) {
			miocnak(q, mp, 0, ENOMEM);
			return;
		}
		freemsg(data_mp);
		data_mp = new_data_mp;
		mp->b_cont = data_mp;
	}
	match_ire = MATCH_IRE_DSTONLY;

	/* Walk every complete dstinforeq record in the payload. */
	for (cur = data_mp->b_rptr, end = data_mp->b_wptr;
	    end - cur >= sizeof (struct dstinforeq);
	    cur += sizeof (struct dstinforeq)) {
		dir = (struct dstinforeq *)cur;
		daddr = &dir->dir_daddr;
		saddr = &dir->dir_saddr;

		/*
		 * ip_addr_scope_v6() and ip6_asp_lookup() handle
		 * v4 mapped addresses; ire_ftable_lookup_v6()
		 * and ip_select_source_v6() do not.
		 */
		dir->dir_dscope = ip_addr_scope_v6(daddr);
		dlabel = ip6_asp_lookup(daddr, &dir->dir_precedence, ipst);

		isipv4 = IN6_IS_ADDR_V4MAPPED(daddr);
		if (isipv4) {
			IN6_V4MAPPED_TO_IPADDR(daddr, v4daddr);
			v4setsrc = INADDR_ANY;
			ire = ire_route_recursive_v4(v4daddr, 0, NULL, zoneid,
			    NULL, match_ire, IRR_ALLOCATE, 0, ipst, &v4setsrc,
			    NULL, NULL);
		} else {
			v6setsrc = ipv6_all_zeros;
			ire = ire_route_recursive_v6(daddr, 0, NULL, zoneid,
			    NULL, match_ire, IRR_ALLOCATE, 0, ipst, &v6setsrc,
			    NULL, NULL);
		}
		/* ire_route_recursive_v[46] never returns NULL here. */
		ASSERT(ire != NULL);
		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
			ire_refrele(ire);
			dir->dir_dreachable = 0;

			/* move on to next dst addr */
			continue;
		}
		dir->dir_dreachable = 1;

		dst_ill = ire_nexthop_ill(ire);
		if (dst_ill == NULL) {
			ire_refrele(ire);
			continue;
		}

		/* With ipmp we most likely look at the ipmp ill here */
		dir->dir_dmactype = dst_ill->ill_mactype;

		/*
		 * Pick the source address the stack would use toward this
		 * destination; on failure report the unspecified address.
		 */
		if (isipv4) {
			ipaddr_t v4saddr;

			if (ip_select_source_v4(dst_ill, v4setsrc, v4daddr,
			    connp->conn_ixa->ixa_multicast_ifaddr, zoneid, ipst,
			    &v4saddr, NULL, &ipif_flags) != 0) {
				v4saddr = INADDR_ANY;
				ipif_flags = 0;
			}
			IN6_IPADDR_TO_V4MAPPED(v4saddr, saddr);
		} else {
			if (ip_select_source_v6(dst_ill, &v6setsrc, daddr,
			    zoneid, ipst, B_FALSE, IPV6_PREFER_SRC_DEFAULT,
			    saddr, NULL, &ipif_flags) != 0) {
				*saddr = ipv6_all_zeros;
				ipif_flags = 0;
			}
		}

		dir->dir_sscope = ip_addr_scope_v6(saddr);
		slabel = ip6_asp_lookup(saddr, NULL, ipst);
		dir->dir_labelmatch = ip6_asp_labelcmp(dlabel, slabel);
		dir->dir_sdeprecated = (ipif_flags & IPIF_DEPRECATED) ? 1 : 0;
		/* Drop the references taken above for this record. */
		ire_refrele(ire);
		ill_refrele(dst_ill);
	}
	miocack(q, mp, iocp->ioc_count, 0);
}
8075 8073
8076 8074 /*
8077 8075 * Check if this is an address assigned to this machine.
8078 8076 * Skips interfaces that are down by using ire checks.
8079 8077 * Translates mapped addresses to v4 addresses and then
8080 8078 * treats them as such, returning true if the v4 address
8081 8079 * associated with this mapped address is configured.
8082 8080 * Note: Applications will have to be careful what they do
8083 8081 * with the response; use of mapped addresses limits
8084 8082 * what can be done with the socket, especially with
8085 8083 * respect to socket options and ioctls - neither IPv4
8086 8084 * options nor IPv6 sticky options/ancillary data options
8087 8085 * may be used.
8088 8086 */
8089 8087 /* ARGSUSED */
8090 8088 int
8091 8089 ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
8092 8090 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
8093 8091 {
8094 8092 struct sioc_addrreq *sia;
8095 8093 sin_t *sin;
8096 8094 ire_t *ire;
8097 8095 mblk_t *mp1;
8098 8096 zoneid_t zoneid;
8099 8097 ip_stack_t *ipst;
8100 8098
8101 8099 ip1dbg(("ip_sioctl_tmyaddr"));
8102 8100
8103 8101 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */
8104 8102 zoneid = Q_TO_CONN(q)->conn_zoneid;
8105 8103 ipst = CONNQ_TO_IPST(q);
8106 8104
8107 8105 /* Existence verified in ip_wput_nondata */
8108 8106 mp1 = mp->b_cont->b_cont;
8109 8107 sia = (struct sioc_addrreq *)mp1->b_rptr;
8110 8108 sin = (sin_t *)&sia->sa_addr;
8111 8109 switch (sin->sin_family) {
8112 8110 case AF_INET6: {
8113 8111 sin6_t *sin6 = (sin6_t *)sin;
8114 8112
8115 8113 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
8116 8114 ipaddr_t v4_addr;
8117 8115
8118 8116 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr,
8119 8117 v4_addr);
8120 8118 ire = ire_ftable_lookup_v4(v4_addr, 0, 0,
8121 8119 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, NULL,
8122 8120 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL);
8123 8121 } else {
8124 8122 in6_addr_t v6addr;
8125 8123
8126 8124 v6addr = sin6->sin6_addr;
8127 8125 ire = ire_ftable_lookup_v6(&v6addr, 0, 0,
8128 8126 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, NULL,
8129 8127 MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL);
8130 8128 }
8131 8129 break;
8132 8130 }
8133 8131 case AF_INET: {
8134 8132 ipaddr_t v4addr;
8135 8133
8136 8134 v4addr = sin->sin_addr.s_addr;
8137 8135 ire = ire_ftable_lookup_v4(v4addr, 0, 0,
8138 8136 IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid,
8139 8137 NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL);
8140 8138 break;
8141 8139 }
8142 8140 default:
8143 8141 return (EAFNOSUPPORT);
8144 8142 }
8145 8143 if (ire != NULL) {
8146 8144 sia->sa_res = 1;
8147 8145 ire_refrele(ire);
8148 8146 } else {
8149 8147 sia->sa_res = 0;
8150 8148 }
8151 8149 return (0);
8152 8150 }
8153 8151
8154 8152 /*
8155 8153 * Check if this is an address assigned on-link i.e. neighbor,
8156 8154 * and makes sure it's reachable from the current zone.
8157 8155 * Returns true for my addresses as well.
8158 8156 * Translates mapped addresses to v4 addresses and then
8159 8157 * treats them as such, returning true if the v4 address
8160 8158 * associated with this mapped address is configured.
8161 8159 * Note: Applications will have to be careful what they do
8162 8160 * with the response; use of mapped addresses limits
8163 8161 * what can be done with the socket, especially with
8164 8162 * respect to socket options and ioctls - neither IPv4
8165 8163 * options nor IPv6 sticky options/ancillary data options
8166 8164 * may be used.
8167 8165 */
8168 8166 /* ARGSUSED */
8169 8167 int
8170 8168 ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
8171 8169 ip_ioctl_cmd_t *ipip, void *duymmy_ifreq)
8172 8170 {
8173 8171 struct sioc_addrreq *sia;
8174 8172 sin_t *sin;
8175 8173 mblk_t *mp1;
8176 8174 ire_t *ire = NULL;
8177 8175 zoneid_t zoneid;
8178 8176 ip_stack_t *ipst;
8179 8177
8180 8178 ip1dbg(("ip_sioctl_tonlink"));
8181 8179
8182 8180 ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */
8183 8181 zoneid = Q_TO_CONN(q)->conn_zoneid;
8184 8182 ipst = CONNQ_TO_IPST(q);
8185 8183
8186 8184 /* Existence verified in ip_wput_nondata */
8187 8185 mp1 = mp->b_cont->b_cont;
8188 8186 sia = (struct sioc_addrreq *)mp1->b_rptr;
8189 8187 sin = (sin_t *)&sia->sa_addr;
8190 8188
8191 8189 /*
8192 8190 * We check for IRE_ONLINK and exclude IRE_BROADCAST|IRE_MULTICAST
8193 8191 * to make sure we only look at on-link unicast address.
8194 8192 */
8195 8193 switch (sin->sin_family) {
8196 8194 case AF_INET6: {
8197 8195 sin6_t *sin6 = (sin6_t *)sin;
8198 8196
8199 8197 if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
8200 8198 ipaddr_t v4_addr;
8201 8199
8202 8200 IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr,
8203 8201 v4_addr);
8204 8202 if (!CLASSD(v4_addr)) {
8205 8203 ire = ire_ftable_lookup_v4(v4_addr, 0, 0, 0,
8206 8204 NULL, zoneid, NULL, MATCH_IRE_DSTONLY,
8207 8205 0, ipst, NULL);
8208 8206 }
8209 8207 } else {
8210 8208 in6_addr_t v6addr;
8211 8209
8212 8210 v6addr = sin6->sin6_addr;
8213 8211 if (!IN6_IS_ADDR_MULTICAST(&v6addr)) {
8214 8212 ire = ire_ftable_lookup_v6(&v6addr, 0, 0, 0,
8215 8213 NULL, zoneid, NULL, MATCH_IRE_DSTONLY, 0,
8216 8214 ipst, NULL);
8217 8215 }
8218 8216 }
8219 8217 break;
8220 8218 }
8221 8219 case AF_INET: {
8222 8220 ipaddr_t v4addr;
8223 8221
8224 8222 v4addr = sin->sin_addr.s_addr;
8225 8223 if (!CLASSD(v4addr)) {
8226 8224 ire = ire_ftable_lookup_v4(v4addr, 0, 0, 0, NULL,
8227 8225 zoneid, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL);
8228 8226 }
8229 8227 break;
8230 8228 }
8231 8229 default:
8232 8230 return (EAFNOSUPPORT);
8233 8231 }
8234 8232 sia->sa_res = 0;
8235 8233 if (ire != NULL) {
8236 8234 ASSERT(!(ire->ire_type & IRE_MULTICAST));
8237 8235
8238 8236 if ((ire->ire_type & IRE_ONLINK) &&
8239 8237 !(ire->ire_type & IRE_BROADCAST))
8240 8238 sia->sa_res = 1;
8241 8239 ire_refrele(ire);
8242 8240 }
8243 8241 return (0);
8244 8242 }
8245 8243
8246 8244 /*
 * TBD: implement when the kernel maintains a list of site prefixes.
8248 8246 */
8249 8247 /* ARGSUSED */
8250 8248 int
8251 8249 ip_sioctl_tmysite(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
8252 8250 ip_ioctl_cmd_t *ipip, void *ifreq)
8253 8251 {
8254 8252 return (ENXIO);
8255 8253 }
8256 8254
8257 8255 /* ARP IOCTLs. */
/*
 * Common handler for the SIOC{S,G,D}ARP and SIOC{S,G,D}XARP ioctls.
 * Parses the (x)arpreq already copied in by the framework, translates the
 * ATF_* flags to NCE_* flags, applies the IPMP restrictions described
 * below, and then performs the requested operation on the neighbor cache
 * entry (NCE) for the address.  The result is recorded in
 * iocp->ioc_error and also returned.
 */
/* ARGSUSED */
int
ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
{
	int		err;
	ipaddr_t	ipaddr;
	struct iocblk	*iocp;
	conn_t		*connp;
	struct arpreq	*ar;
	struct xarpreq	*xar;
	int		arp_flags, flags, alength;
	uchar_t		*lladdr;
	ip_stack_t	*ipst;
	ill_t		*ill = ipif->ipif_ill;
	ill_t		*proxy_ill = NULL;
	ipmp_arpent_t	*entp = NULL;
	boolean_t	proxyarp = B_FALSE;
	boolean_t	if_arp_ioctl = B_FALSE;
	ncec_t		*ncec = NULL;
	nce_t		*nce;

	ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
	connp = Q_TO_CONN(q);
	ipst = connp->conn_netstack->netstack_ip;
	iocp = (struct iocblk *)mp->b_rptr;

	if (ipip->ipi_cmd_type == XARP_CMD) {
		/* We have a chain - M_IOCTL-->MI_COPY_MBLK-->XARPREQ_MBLK */
		xar = (struct xarpreq *)mp->b_cont->b_cont->b_rptr;
		ar = NULL;

		arp_flags = xar->xarp_flags;
		lladdr = (uchar_t *)LLADDR(&xar->xarp_ha);
		/* A non-empty sdl name means "operate on this interface". */
		if_arp_ioctl = (xar->xarp_ha.sdl_nlen != 0);
		/*
		 * Validate against user's link layer address length
		 * input and name and addr length limits.
		 */
		alength = ill->ill_phys_addr_length;
		if (ipip->ipi_cmd == SIOCSXARP) {
			if (alength != xar->xarp_ha.sdl_alen ||
			    (alength + xar->xarp_ha.sdl_nlen >
			    sizeof (xar->xarp_ha.sdl_data)))
				return (EINVAL);
		}
	} else {
		/* We have a chain - M_IOCTL-->MI_COPY_MBLK-->ARPREQ_MBLK */
		ar = (struct arpreq *)mp->b_cont->b_cont->b_rptr;
		xar = NULL;

		arp_flags = ar->arp_flags;
		lladdr = (uchar_t *)ar->arp_ha.sa_data;
		/*
		 * Theoretically, the sa_family could tell us what link
		 * layer type this operation is trying to deal with. By
		 * common usage AF_UNSPEC means ethernet. We'll assume
		 * any attempt to use the SIOC?ARP ioctls is for ethernet,
		 * for now. Our new SIOC*XARP ioctls can be used more
		 * generally.
		 *
		 * If the underlying media happens to have a non 6 byte
		 * address, arp module will fail set/get, but the del
		 * operation will succeed.
		 */
		alength = 6;
		if ((ipip->ipi_cmd != SIOCDARP) &&
		    (alength != ill->ill_phys_addr_length)) {
			return (EINVAL);
		}
	}

	/* Translate ATF* flags to NCE* flags */
	flags = 0;
	if (arp_flags & ATF_AUTHORITY)
		flags |= NCE_F_AUTHORITY;
	if (arp_flags & ATF_PERM)
		flags |= NCE_F_NONUD; /* not subject to aging */
	if (arp_flags & ATF_PUBL)
		flags |= NCE_F_PUBLISH;

	/*
	 * IPMP ARP special handling:
	 *
	 * 1. Since ARP mappings must appear consistent across the group,
	 *    prohibit changing ARP mappings on the underlying interfaces.
	 *
	 * 2. Since ARP mappings for IPMP data addresses are maintained by
	 *    IP itself, prohibit changing them.
	 *
	 * 3. For proxy ARP, use a functioning hardware address in the group,
	 *    provided one exists.  If one doesn't, just add the entry as-is;
	 *    ipmp_illgrp_refresh_arpent() will refresh it if things change.
	 */
	if (IS_UNDER_IPMP(ill)) {
		/* Only the "get" variants are permitted on under-ills. */
		if (ipip->ipi_cmd != SIOCGARP && ipip->ipi_cmd != SIOCGXARP)
			return (EPERM);
	}
	if (IS_IPMP(ill)) {
		ipmp_illgrp_t *illg = ill->ill_grp;

		switch (ipip->ipi_cmd) {
		case SIOCSARP:
		case SIOCSXARP:
			/* Case (3) above: substitute an active hw address. */
			proxy_ill = ipmp_illgrp_find_ill(illg, lladdr, alength);
			if (proxy_ill != NULL) {
				proxyarp = B_TRUE;
				if (!ipmp_ill_is_active(proxy_ill))
					proxy_ill = ipmp_illgrp_next_ill(illg);
				if (proxy_ill != NULL)
					lladdr = proxy_ill->ill_phys_addr;
			}
			/* FALLTHRU */
		}
	}

	ipaddr = sin->sin_addr.s_addr;
	/*
	 * don't match across illgrp per case (1) and (2).
	 * XXX use IS_IPMP(ill) like ndp_sioc_update?
	 */
	nce = nce_lookup_v4(ill, &ipaddr);
	if (nce != NULL)
		ncec = nce->nce_common;

	switch (iocp->ioc_cmd) {
	case SIOCDARP:
	case SIOCDXARP: {
		/*
		 * Delete the NCE if any.
		 */
		if (ncec == NULL) {
			iocp->ioc_error = ENXIO;
			break;
		}
		/* Don't allow changes to arp mappings of local addresses. */
		if (NCE_MYADDR(ncec)) {
			nce_refrele(nce);
			return (ENOTSUP);
		}
		iocp->ioc_error = 0;

		/*
		 * Delete the nce_common which has ncec_ill set to ipmp_ill.
		 * This will delete all the nce entries on the under_ills.
		 */
		ncec_delete(ncec);
		/*
		 * Once the NCE has been deleted, then the ire_dep* consistency
		 * mechanism will find any IRE which depended on the now
		 * condemned NCE (as part of sending packets).
		 * That mechanism handles redirects by deleting redirects
		 * that refer to UNREACHABLE nces.
		 */
		break;
	}
	case SIOCGARP:
	case SIOCGXARP:
		/* Report the existing mapping, if any, via the reply mblk. */
		if (ncec != NULL) {
			lladdr = ncec->ncec_lladdr;
			flags = ncec->ncec_flags;
			iocp->ioc_error = 0;
			ip_sioctl_garp_reply(mp, ncec->ncec_ill, lladdr, flags);
		} else {
			iocp->ioc_error = ENXIO;
		}
		break;
	case SIOCSARP:
	case SIOCSXARP:
		/* Don't allow changes to arp mappings of local addresses. */
		if (ncec != NULL && NCE_MYADDR(ncec)) {
			nce_refrele(nce);
			return (ENOTSUP);
		}

		/* static arp entries will undergo NUD if ATF_PERM is not set */
		flags |= NCE_F_STATIC;
		if (!if_arp_ioctl) {
			ip_nce_lookup_and_update(&ipaddr, NULL, ipst,
			    lladdr, alength, flags);
		} else {
			ipif_t *ipif = ipif_get_next_ipif(NULL, ill);
			if (ipif != NULL) {
				ip_nce_lookup_and_update(&ipaddr, ipif, ipst,
				    lladdr, alength, flags);
				ipif_refrele(ipif);
			}
		}
		/* Drop the lookup reference before (re)adding the entry. */
		if (nce != NULL) {
			nce_refrele(nce);
			nce = NULL;
		}
		/*
		 * NCE_F_STATIC entries will be added in state ND_REACHABLE
		 * by nce_add_common()
		 */
		err = nce_lookup_then_add_v4(ill, lladdr,
		    ill->ill_phys_addr_length, &ipaddr, flags, ND_UNCHANGED,
		    &nce);
		/* EEXIST means an entry is already there: update it instead. */
		if (err == EEXIST) {
			ncec = nce->nce_common;
			mutex_enter(&ncec->ncec_lock);
			ncec->ncec_state = ND_REACHABLE;
			ncec->ncec_flags = flags;
			nce_update(ncec, ND_UNCHANGED, lladdr);
			mutex_exit(&ncec->ncec_lock);
			err = 0;
		}
		if (nce != NULL) {
			nce_refrele(nce);
			nce = NULL;
		}
		/* On IPMP, also record the entry in the illgrp (case (3)). */
		if (IS_IPMP(ill) && err == 0) {
			entp = ipmp_illgrp_create_arpent(ill->ill_grp,
			    proxyarp, ipaddr, lladdr, ill->ill_phys_addr_length,
			    flags);
			if (entp == NULL || (proxyarp && proxy_ill == NULL)) {
				iocp->ioc_error = (entp == NULL ? ENOMEM : 0);
				break;
			}
		}
		iocp->ioc_error = err;
	}

	if (nce != NULL) {
		nce_refrele(nce);
	}

	/*
	 * If we created an IPMP ARP entry, mark that we've notified ARP.
	 */
	if (entp != NULL)
		ipmp_illgrp_mark_arpent(ill->ill_grp, entp);

	return (iocp->ioc_error);
}
8494 8492
8495 8493 /*
8496 8494 * Parse an [x]arpreq structure coming down SIOC[GSD][X]ARP ioctls, identify
8497 8495 * the associated sin and refhold and return the associated ipif via `ci'.
8498 8496 */
int
ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip,
    cmd_info_t *ci)
{
	mblk_t	*mp1;
	sin_t	*sin;
	conn_t	*connp;
	ipif_t	*ipif;
	ire_t	*ire = NULL;
	ill_t	*ill = NULL;
	boolean_t exists;
	ip_stack_t *ipst;
	struct arpreq *ar;
	struct xarpreq *xar;
	struct sockaddr_dl *sdl;

	/* ioctl comes down on a conn */
	ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL);
	connp = Q_TO_CONN(q);
	/* ARP ioctls are IPv4 only. */
	if (connp->conn_family == AF_INET6)
		return (ENXIO);

	ipst = connp->conn_netstack->netstack_ip;

	/* Verified in ip_wput_nondata */
	mp1 = mp->b_cont->b_cont;

	if (ipip->ipi_cmd_type == XARP_CMD) {
		ASSERT(MBLKL(mp1) >= sizeof (struct xarpreq));
		xar = (struct xarpreq *)mp1->b_rptr;
		sin = (sin_t *)&xar->xarp_pa;
		sdl = &xar->xarp_ha;

		if (sdl->sdl_family != AF_LINK || sin->sin_family != AF_INET)
			return (ENXIO);
		if (sdl->sdl_nlen >= LIFNAMSIZ)
			return (EINVAL);
	} else {
		ASSERT(ipip->ipi_cmd_type == ARP_CMD);
		ASSERT(MBLKL(mp1) >= sizeof (struct arpreq));
		ar = (struct arpreq *)mp1->b_rptr;
		sin = (sin_t *)&ar->arp_pa;
	}

	/*
	 * An SIOC[GSD]XARP with a non-empty interface name: look the
	 * interface up by name (sdl is only set in the XARP_CMD branch,
	 * so the short-circuit below never reads it uninitialized).
	 */
	if (ipip->ipi_cmd_type == XARP_CMD && sdl->sdl_nlen != 0) {
		ipif = ipif_lookup_on_name(sdl->sdl_data, sdl->sdl_nlen,
		    B_FALSE, &exists, B_FALSE, ALL_ZONES, ipst);
		if (ipif == NULL)
			return (ENXIO);
		/* Only the first (id 0) logical interface is acceptable. */
		if (ipif->ipif_id != 0) {
			ipif_refrele(ipif);
			return (ENXIO);
		}
	} else {
		/*
		 * Either an SIOC[DGS]ARP or an SIOC[DGS]XARP with an sdl_nlen
		 * of 0: use the IP address to find the ipif.  If the IP
		 * address is an IPMP test address, ire_ftable_lookup() will
		 * find the wrong ill, so we first do an ipif_lookup_addr().
		 */
		ipif = ipif_lookup_addr(sin->sin_addr.s_addr, NULL, ALL_ZONES,
		    ipst);
		if (ipif == NULL) {
			/* Fall back to a resolver-route lookup. */
			ire = ire_ftable_lookup_v4(sin->sin_addr.s_addr,
			    0, 0, IRE_IF_RESOLVER, NULL, ALL_ZONES,
			    NULL, MATCH_IRE_TYPE, 0, ipst, NULL);
			if (ire == NULL || ((ill = ire->ire_ill) == NULL)) {
				if (ire != NULL)
					ire_refrele(ire);
				return (ENXIO);
			}
			ASSERT(ire != NULL && ill != NULL);
			ipif = ill->ill_ipif;
			/* Hold the ipif before dropping the ire reference. */
			ipif_refhold(ipif);
			ire_refrele(ire);
		}
	}

	/* ARP only makes sense on resolver (e.g. ethernet) interfaces. */
	if (ipif->ipif_ill->ill_net_type != IRE_IF_RESOLVER) {
		ipif_refrele(ipif);
		return (ENXIO);
	}

	/* Return the refheld ipif and the sin to the caller via ci. */
	ci->ci_sin = sin;
	ci->ci_ipif = ipif;
	return (0);
}
8586 8584
8587 8585 /*
8588 8586 * Link or unlink the illgrp on IPMP meta-interface `ill' depending on the
8589 8587 * value of `ioccmd'. While an illgrp is linked to an ipmp_grp_t, it is
8590 8588 * accessible from that ipmp_grp_t, which means SIOCSLIFGROUPNAME can look it
8591 8589 * up and thus an ill can join that illgrp.
8592 8590 *
8593 8591 * We use I_PLINK/I_PUNLINK to do the link/unlink operations rather than
8594 8592 * open()/close() primarily because close() is not allowed to fail or block
8595 8593 * forever. On the other hand, I_PUNLINK *can* fail, and there's no reason
8596 8594 * why anyone should ever need to I_PUNLINK an in-use IPMP stream. To ensure
 * symmetric behavior (e.g., doing an I_PLINK after an I_PUNLINK undoes the
8598 8596 * I_PUNLINK) we defer linking to I_PLINK. Separately, we also fail attempts
8599 8597 * to I_LINK since I_UNLINK is optional and we'd end up in an inconsistent
8600 8598 * state if I_UNLINK didn't occur.
8601 8599 *
8602 8600 * Note that for each plumb/unplumb operation, we may end up here more than
8603 8601 * once because of the way ifconfig works. However, it's OK to link the same
8604 8602 * illgrp more than once, or unlink an illgrp that's already unlinked.
8605 8603 */
8606 8604 static int
8607 8605 ip_sioctl_plink_ipmp(ill_t *ill, int ioccmd)
8608 8606 {
8609 8607 int err;
8610 8608 ip_stack_t *ipst = ill->ill_ipst;
8611 8609
8612 8610 ASSERT(IS_IPMP(ill));
8613 8611 ASSERT(IAM_WRITER_ILL(ill));
8614 8612
8615 8613 switch (ioccmd) {
8616 8614 case I_LINK:
8617 8615 return (ENOTSUP);
8618 8616
8619 8617 case I_PLINK:
8620 8618 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
8621 8619 ipmp_illgrp_link_grp(ill->ill_grp, ill->ill_phyint->phyint_grp);
8622 8620 rw_exit(&ipst->ips_ipmp_lock);
8623 8621 break;
8624 8622
8625 8623 case I_PUNLINK:
8626 8624 /*
8627 8625 * Require all UP ipifs be brought down prior to unlinking the
8628 8626 * illgrp so any associated IREs (and other state) is torched.
8629 8627 */
8630 8628 if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0)
8631 8629 return (EBUSY);
8632 8630
8633 8631 /*
8634 8632 * NOTE: We hold ipmp_lock across the unlink to prevent a race
8635 8633 * with an SIOCSLIFGROUPNAME request from an ill trying to
8636 8634 * join this group. Specifically: ills trying to join grab
8637 8635 * ipmp_lock and bump a "pending join" counter checked by
8638 8636 * ipmp_illgrp_unlink_grp(). During the unlink no new pending
8639 8637 * joins can occur (since we have ipmp_lock). Once we drop
8640 8638 * ipmp_lock, subsequent SIOCSLIFGROUPNAME requests will not
8641 8639 * find the illgrp (since we unlinked it) and will return
8642 8640 * EAFNOSUPPORT. This will then take them back through the
8643 8641 * IPMP meta-interface plumbing logic in ifconfig, and thus
8644 8642 * back through I_PLINK above.
8645 8643 */
8646 8644 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
8647 8645 err = ipmp_illgrp_unlink_grp(ill->ill_grp);
8648 8646 rw_exit(&ipst->ips_ipmp_lock);
8649 8647 return (err);
8650 8648 default:
8651 8649 break;
8652 8650 }
8653 8651 return (0);
8654 8652 }
8655 8653
8656 8654 /*
8657 8655 * Do I_PLINK/I_LINK or I_PUNLINK/I_UNLINK with consistency checks and also
8658 8656 * atomically set/clear the muxids. Also complete the ioctl by acking or
8659 8657 * naking it. Note that the code is structured such that the link type,
8660 8658 * whether it's persistent or not, is treated equally. ifconfig(1M) and
8661 8659 * its clones use the persistent link, while pppd(1M) and perhaps many
8662 8660 * other daemons may use non-persistent link. When combined with some
8663 8661 * ill_t states, linking and unlinking lower streams may be used as
8664 8662 * indicators of dynamic re-plumbing events [see PSARC/1999/348].
8665 8663 */
8666 8664 /* ARGSUSED */
8667 8665 void
8668 8666 ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
8669 8667 {
8670 8668 mblk_t *mp1;
8671 8669 struct linkblk *li;
8672 8670 int ioccmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd;
8673 8671 int err = 0;
8674 8672
8675 8673 ASSERT(ioccmd == I_PLINK || ioccmd == I_PUNLINK ||
8676 8674 ioccmd == I_LINK || ioccmd == I_UNLINK);
8677 8675
8678 8676 mp1 = mp->b_cont; /* This is the linkblk info */
8679 8677 li = (struct linkblk *)mp1->b_rptr;
8680 8678
8681 8679 err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li);
8682 8680 if (err == EINPROGRESS)
8683 8681 return;
8684 8682 if (err == 0)
8685 8683 miocack(q, mp, 0, 0);
8686 8684 else
8687 8685 miocnak(q, mp, 0, err);
8688 8686
8689 8687 /* Conn was refheld in ip_sioctl_copyin_setup */
8690 8688 if (CONN_Q(q)) {
8691 8689 CONN_DEC_IOCTLREF(Q_TO_CONN(q));
8692 8690 CONN_OPER_PENDING_DONE(Q_TO_CONN(q));
8693 8691 }
8694 8692 }
8695 8693
8696 8694 /*
8697 8695 * Process I_{P}LINK and I_{P}UNLINK requests named by `ioccmd' and pointed to
8698 8696 * by `mp' and `li' for the IP module stream (if li->q_bot is in fact an IP
8699 8697 * module stream).
8700 8698 * Returns zero on success, EINPROGRESS if the operation is still pending, or
8701 8699 * an error code on failure.
8702 8700 */
static int
ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd,
    struct linkblk *li)
{
	int		err = 0;
	ill_t		*ill;
	queue_t		*ipwq, *dwq;
	const char	*name;
	struct qinit	*qinfo;
	boolean_t	islink = (ioccmd == I_PLINK || ioccmd == I_LINK);
	boolean_t	entered_ipsq = B_FALSE;
	boolean_t	is_ip = B_FALSE;
	arl_t		*arl;

	/*
	 * Walk the lower stream to verify it's the IP module stream.
	 * The IP module is identified by its name, wput function,
	 * and non-NULL q_next.  STREAMS ensures that the lower stream
	 * (li->l_qbot) will not vanish until this ioctl completes.
	 */
	for (ipwq = li->l_qbot; ipwq != NULL; ipwq = ipwq->q_next) {
		qinfo = ipwq->q_qinfo;
		name = qinfo->qi_minfo->mi_idname;
		if (name != NULL && strcmp(name, ip_mod_info.mi_idname) == 0 &&
		    qinfo->qi_putp != ip_lwput && ipwq->q_next != NULL) {
			is_ip = B_TRUE;
			break;
		}
		/* An ARP module stream is also accepted (is_ip stays false). */
		if (name != NULL && strcmp(name, arp_mod_info.mi_idname) == 0 &&
		    qinfo->qi_putp != ip_lwput && ipwq->q_next != NULL) {
			break;
		}
	}

	/*
	 * If this isn't an IP module stream, bail.
	 */
	if (ipwq == NULL)
		return (0);

	/*
	 * For the ARP case, arl_to_ill() returns a refheld ill that every
	 * exit path below must release; for the IP case the ill comes
	 * straight from q_ptr with no reference taken.
	 */
	if (!is_ip) {
		arl = (arl_t *)ipwq->q_ptr;
		ill = arl_to_ill(arl);
		if (ill == NULL)
			return (0);
	} else {
		ill = ipwq->q_ptr;
	}
	ASSERT(ill != NULL);

	/*
	 * Become exclusive on the ill's serialization queue if our caller
	 * hasn't already; EINPROGRESS means the operation was queued and
	 * will be re-dispatched via ip_sioctl_plink later.
	 */
	if (ipsq == NULL) {
		ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink,
		    NEW_OP, B_FALSE);
		if (ipsq == NULL) {
			if (!is_ip)
				ill_refrele(ill);
			return (EINPROGRESS);
		}
		entered_ipsq = B_TRUE;
	}
	ASSERT(IAM_WRITER_ILL(ill));
	mutex_enter(&ill->ill_lock);
	if (!is_ip) {
		if (islink && ill->ill_muxid == 0) {
			/*
			 * Plumbing has to be done with IP plumbed first, arp
			 * second, but here we have arp being plumbed first.
			 */
			mutex_exit(&ill->ill_lock);
			if (entered_ipsq)
				ipsq_exit(ipsq);
			ill_refrele(ill);
			return (EINVAL);
		}
	}
	mutex_exit(&ill->ill_lock);
	if (!is_ip) {
		/* Record (or clear) the ARP stream's mux id and finish. */
		arl->arl_muxid = islink ? li->l_index : 0;
		ill_refrele(ill);
		goto done;
	}

	if (IS_IPMP(ill) && (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0)
		goto done;

	/*
	 * As part of I_{P}LINKing, stash the number of downstream modules and
	 * the read queue of the module immediately below IP in the ill.
	 * These are used during the capability negotiation below.
	 */
	ill->ill_lmod_rq = NULL;
	ill->ill_lmod_cnt = 0;
	if (islink && ((dwq = ipwq->q_next) != NULL)) {
		ill->ill_lmod_rq = RD(dwq);
		for (; dwq != NULL; dwq = dwq->q_next)
			ill->ill_lmod_cnt++;
	}

	ill->ill_muxid = islink ? li->l_index : 0;

	/*
	 * Mark the ipsq busy until the capability operations initiated below
	 * complete. The PLINK/UNLINK ioctl itself completes when our caller
	 * returns, but the capability operation may complete asynchronously
	 * much later.
	 */
	ipsq_current_start(ipsq, ill->ill_ipif, ioccmd);
	/*
	 * If there's at least one up ipif on this ill, then we're bound to
	 * the underlying driver via DLPI. In that case, renegotiate
	 * capabilities to account for any possible change in modules
	 * interposed between IP and the driver.
	 */
	if (ill->ill_ipif_up_count > 0) {
		if (islink)
			ill_capability_probe(ill);
		else
			ill_capability_reset(ill, B_FALSE);
	}
	ipsq_current_finish(ipsq);
done:
	if (entered_ipsq)
		ipsq_exit(ipsq);

	return (err);
}
8829 8827
8830 8828 /*
8831 8829 * Search the ioctl command in the ioctl tables and return a pointer
8832 8830 * to the ioctl command information. The ioctl command tables are
8833 8831 * static and fully populated at compile time.
8834 8832 */
8835 8833 ip_ioctl_cmd_t *
8836 8834 ip_sioctl_lookup(int ioc_cmd)
8837 8835 {
8838 8836 int index;
8839 8837 ip_ioctl_cmd_t *ipip;
8840 8838 ip_ioctl_cmd_t *ipip_end;
8841 8839
8842 8840 if (ioc_cmd == IPI_DONTCARE)
8843 8841 return (NULL);
8844 8842
8845 8843 /*
8846 8844 * Do a 2 step search. First search the indexed table
8847 8845 * based on the least significant byte of the ioctl cmd.
8848 8846 * If we don't find a match, then search the misc table
8849 8847 * serially.
8850 8848 */
8851 8849 index = ioc_cmd & 0xFF;
8852 8850 if (index < ip_ndx_ioctl_count) {
8853 8851 ipip = &ip_ndx_ioctl_table[index];
8854 8852 if (ipip->ipi_cmd == ioc_cmd) {
8855 8853 /* Found a match in the ndx table */
8856 8854 return (ipip);
8857 8855 }
8858 8856 }
8859 8857
8860 8858 /* Search the misc table */
8861 8859 ipip_end = &ip_misc_ioctl_table[ip_misc_ioctl_count];
8862 8860 for (ipip = ip_misc_ioctl_table; ipip < ipip_end; ipip++) {
8863 8861 if (ipip->ipi_cmd == ioc_cmd)
8864 8862 /* Found a match in the misc table */
8865 8863 return (ipip);
8866 8864 }
8867 8865
8868 8866 return (NULL);
8869 8867 }
8870 8868
/*
 * Helper for ip_sioctl_getsetprop(): validate an incoming
 * SIOC{SET|GET}PROP M_IOCTL message — payload presence, size
 * consistency between ioc_count and the embedded mod_ioc_prop_t,
 * and the legality of the requested flag combination.
 * Returns B_TRUE iff the message is well-formed.
 */
static boolean_t
getset_ioctl_checks(mblk_t *mp)
{
	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
	mblk_t *mp1 = mp->b_cont;
	mod_ioc_prop_t *pioc;
	uint_t flags;
	uint_t pioc_size;

	/* do sanity checks on various arguments */
	if (mp1 == NULL || iocp->ioc_count == 0 ||
	    iocp->ioc_count == TRANSPARENT) {
		return (B_FALSE);
	}
	/*
	 * If the chain holds less data than the ioctl claims, pullupmsg()
	 * will fail and we reject the message.
	 * NOTE(review): this also assumes that on success the payload is
	 * contiguous in mp1 before the b_rptr cast below — confirm callers
	 * always deliver it that way.
	 */
	if (msgdsize(mp1) < iocp->ioc_count) {
		if (!pullupmsg(mp1, iocp->ioc_count))
			return (B_FALSE);
	}

	pioc = (mod_ioc_prop_t *)mp1->b_rptr;

	/*
	 * sanity checks on mpr_valsize: the structure embeds the first
	 * byte of the value buffer, hence the "- 1" when accounting for
	 * a non-empty value.
	 */
	pioc_size = sizeof (mod_ioc_prop_t);
	if (pioc->mpr_valsize != 0)
		pioc_size += pioc->mpr_valsize - 1;

	if (iocp->ioc_count != pioc_size)
		return (B_FALSE);

	flags = pioc->mpr_flags;
	if (iocp->ioc_cmd == SIOCSETPROP) {
		/*
		 * One can either reset the value to it's default value or
		 * change the current value or append/remove the value from
		 * a multi-valued properties.
		 */
		if ((flags & MOD_PROP_DEFAULT) != MOD_PROP_DEFAULT &&
		    flags != MOD_PROP_ACTIVE &&
		    flags != (MOD_PROP_ACTIVE|MOD_PROP_APPEND) &&
		    flags != (MOD_PROP_ACTIVE|MOD_PROP_REMOVE))
			return (B_FALSE);
	} else {
		ASSERT(iocp->ioc_cmd == SIOCGETPROP);

		/*
		 * One can retrieve only one kind of property information
		 * at a time.
		 */
		if ((flags & MOD_PROP_ACTIVE) != MOD_PROP_ACTIVE &&
		    (flags & MOD_PROP_DEFAULT) != MOD_PROP_DEFAULT &&
		    (flags & MOD_PROP_POSSIBLE) != MOD_PROP_POSSIBLE &&
		    (flags & MOD_PROP_PERM) != MOD_PROP_PERM)
			return (B_FALSE);
	}

	return (B_TRUE);
}
8931 8929
/*
 * Process the SIOC{SET|GET}PROP ioctls: validate the request, select the
 * property table for the protocol named in the request, look up the
 * property, and invoke its set or get handler.  The reply (ack or nak)
 * is always sent on queue q before returning.
 */
/* ARGSUSED */
static void
ip_sioctl_getsetprop(queue_t *q, mblk_t *mp)
{
	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
	mblk_t *mp1 = mp->b_cont;
	mod_ioc_prop_t *pioc;
	mod_prop_info_t *ptbl = NULL, *pinfo = NULL;
	ip_stack_t *ipst;
	netstack_t *stack;
	cred_t *cr;
	boolean_t set;
	int err;

	/* Must be the driver instance (not a module) on a conn queue. */
	ASSERT(q->q_next == NULL);
	ASSERT(CONN_Q(q));

	if (!getset_ioctl_checks(mp)) {
		miocnak(q, mp, 0, EINVAL);
		return;
	}
	ipst = CONNQ_TO_IPST(q);
	stack = ipst->ips_netstack;
	pioc = (mod_ioc_prop_t *)mp1->b_rptr;

	/* Select the property-info table for the requested protocol. */
	switch (pioc->mpr_proto) {
	case MOD_PROTO_IP:
	case MOD_PROTO_IPV4:
	case MOD_PROTO_IPV6:
		ptbl = ipst->ips_propinfo_tbl;
		break;
	case MOD_PROTO_RAWIP:
		ptbl = stack->netstack_icmp->is_propinfo_tbl;
		break;
	case MOD_PROTO_TCP:
		ptbl = stack->netstack_tcp->tcps_propinfo_tbl;
		break;
	case MOD_PROTO_UDP:
		ptbl = stack->netstack_udp->us_propinfo_tbl;
		break;
	case MOD_PROTO_SCTP:
		ptbl = stack->netstack_sctp->sctps_propinfo_tbl;
		break;
	default:
		miocnak(q, mp, 0, EINVAL);
		return;
	}

	pinfo = mod_prop_lookup(ptbl, pioc->mpr_name, pioc->mpr_proto);
	if (pinfo == NULL) {
		miocnak(q, mp, 0, ENOENT);
		return;
	}

	set = (iocp->ioc_cmd == SIOCSETPROP) ? B_TRUE : B_FALSE;
	if (set && pinfo->mpi_setf != NULL) {
		/* Prefer the dblk credential; fall back to ioc_cr. */
		cr = msg_getcred(mp, NULL);
		if (cr == NULL)
			cr = iocp->ioc_cr;
		err = pinfo->mpi_setf(stack, cr, pinfo, pioc->mpr_ifname,
		    pioc->mpr_val, pioc->mpr_flags);
	} else if (!set && pinfo->mpi_getf != NULL) {
		err = pinfo->mpi_getf(stack, pinfo, pioc->mpr_ifname,
		    pioc->mpr_val, pioc->mpr_valsize, pioc->mpr_flags);
	} else {
		/* Property exists but has no handler for this direction. */
		err = EPERM;
	}

	if (err != 0) {
		miocnak(q, mp, 0, err);
	} else {
		if (set)
			miocack(q, mp, 0, 0);
		else /* For get, we need to return back the data */
			miocack(q, mp, iocp->ioc_count, 0);
	}
}
9012 9010
/*
 * Process the legacy ND_GET, ND_SET ioctls just for {ip|ip6}_forwarding,
 * as several routing daemons have unfortunately used these 'unpublished'
 * but well-known ioctls.  The old ndd names are mapped onto the modern
 * "forwarding" property of the IPv4 or IPv6 protocol.
 */
/* ARGSUSED */
static void
ip_process_legacy_nddprop(queue_t *q, mblk_t *mp)
{
	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
	mblk_t *mp1 = mp->b_cont;
	char *pname, *pval, *buf;
	uint_t bufsize, proto;
	mod_prop_info_t *pinfo = NULL;
	ip_stack_t *ipst;
	int err = 0;

	ASSERT(CONN_Q(q));
	ipst = CONNQ_TO_IPST(q);

	if (iocp->ioc_count == 0 || mp1 == NULL) {
		miocnak(q, mp, 0, EINVAL);
		return;
	}

	mp1->b_datap->db_lim[-1] = '\0'; /* Force null termination */
	pval = buf = pname = (char *)mp1->b_rptr;
	bufsize = MBLKL(mp1);

	/* Only the two legacy forwarding names are accepted. */
	if (strcmp(pname, "ip_forwarding") == 0) {
		pname = "forwarding";
		proto = MOD_PROTO_IPV4;
	} else if (strcmp(pname, "ip6_forwarding") == 0) {
		pname = "forwarding";
		proto = MOD_PROTO_IPV6;
	} else {
		miocnak(q, mp, 0, EINVAL);
		return;
	}

	pinfo = mod_prop_lookup(ipst->ips_propinfo_tbl, pname, proto);

	switch (iocp->ioc_cmd) {
	case ND_GET:
		if ((err = pinfo->mpi_getf(ipst->ips_netstack, pinfo, NULL, buf,
		    bufsize, 0)) == 0) {
			miocack(q, mp, iocp->ioc_count, 0);
			return;
		}
		break;
	case ND_SET:
		/*
		 * buffer will have property name and value in the following
		 * format,
		 * <property name>'\0'<property value>'\0', extract them;
		 */
		while (*pval++)
			noop;	/* advance past the name and its NUL */

		/* Reject an empty value or one outside the message. */
		if (!*pval || pval >= (char *)mp1->b_wptr) {
			err = EINVAL;
		} else if ((err = pinfo->mpi_setf(ipst->ips_netstack, NULL,
		    pinfo, NULL, pval, 0)) == 0) {
			miocack(q, mp, 0, 0);
			return;
		}
		break;
	default:
		err = EINVAL;
		break;
	}
	miocnak(q, mp, 0, err);
}
9086 9084
/*
 * Wrapper function for resuming deferred ioctl processing: simply
 * restarts copyin handling from the top via ip_sioctl_copyin_setup().
 * Used for SIOCGDSTINFO, SIOCGIP6ADDRPOLICY, SIOCGMSFILTER,
 * SIOCSMSFILTER, SIOCGIPMSFILTER, and SIOCSIPMSFILTER currently.
 */
/* ARGSUSED */
void
ip_sioctl_copyin_resume(ipsq_t *dummy_ipsq, queue_t *q, mblk_t *mp,
    void *dummy_arg)
{
	ip_sioctl_copyin_setup(q, mp);
}
9099 9097
/*
 * ip_sioctl_copyin_setup is called by ip_wput_nondata with any M_IOCTL message
 * that arrives.  Most of the IOCTLs are "socket" IOCTLs which we handle
 * in either I_STR or TRANSPARENT form, using the mi_copy facility.
 * We establish here the size of the block to be copied in.  mi_copyin
 * arranges for this to happen, and processing continues in ip_wput_nondata
 * with an M_IOCDATA message.
 */
void
ip_sioctl_copyin_setup(queue_t *q, mblk_t *mp)
{
	int copyin_size;
	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
	ip_ioctl_cmd_t *ipip;
	cred_t *cr;
	ip_stack_t *ipst;

	/* q may belong to a conn (driver) or an ill (module) instance. */
	if (CONN_Q(q))
		ipst = CONNQ_TO_IPST(q);
	else
		ipst = ILLQ_TO_IPST(q);

	ipip = ip_sioctl_lookup(iocp->ioc_cmd);
	if (ipip == NULL) {
		/*
		 * The ioctl is not one we understand or own.
		 * Pass it along to be processed down stream,
		 * if this is a module instance of IP, else nak
		 * the ioctl.
		 */
		if (q->q_next == NULL) {
			goto nak;
		} else {
			putnext(q, mp);
			return;
		}
	}

	/*
	 * If this is deferred, then we will do all the checks when we
	 * come back.
	 */
	if ((iocp->ioc_cmd == SIOCGDSTINFO ||
	    iocp->ioc_cmd == SIOCGIP6ADDRPOLICY) && !ip6_asp_can_lookup(ipst)) {
		ip6_asp_pending_op(q, mp, ip_sioctl_copyin_resume);
		return;
	}

	/*
	 * Only allow a very small subset of IP ioctls on this stream if
	 * IP is a module and not a driver. Allowing ioctls to be processed
	 * in this case may cause assert failures or data corruption.
	 * Typically G[L]IFFLAGS, SLIFNAME/IF_UNITSEL are the only few
	 * ioctls allowed on an IP module stream, after which this stream
	 * normally becomes a multiplexor (at which time the stream head
	 * will fail all ioctls).
	 */
	if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) {
		goto nak;
	}

	/* Make sure we have ioctl data to process. */
	if (mp->b_cont == NULL && !(ipip->ipi_flags & IPI_NULL_BCONT))
		goto nak;

	/*
	 * Prefer dblk credential over ioctl credential; some synthesized
	 * ioctls have kcred set because there's no way to crhold()
	 * a credential in some contexts.  (ioc_cr is not crfree() by
	 * the framework; the caller of ioctl needs to hold the reference
	 * for the duration of the call).
	 */
	cr = msg_getcred(mp, NULL);
	if (cr == NULL)
		cr = iocp->ioc_cr;

	/* Make sure normal users don't send down privileged ioctls */
	if ((ipip->ipi_flags & IPI_PRIV) &&
	    (cr != NULL) && secpolicy_ip_config(cr, B_TRUE) != 0) {
		/* We checked the privilege earlier but log it here */
		miocnak(q, mp, 0, secpolicy_ip_config(cr, B_FALSE));
		return;
	}

	/*
	 * The ioctl command tables can only encode fixed length
	 * ioctl data.  If the length is variable, the table will
	 * encode the length as zero.  Such special cases are handled
	 * below in the switch.
	 */
	if (ipip->ipi_copyin_size != 0) {
		mi_copyin(q, mp, NULL, ipip->ipi_copyin_size);
		return;
	}

	switch (iocp->ioc_cmd) {
	case O_SIOCGIFCONF:
	case SIOCGIFCONF:
		/*
		 * This IOCTL is hilarious.  See comments in
		 * ip_sioctl_get_ifconf for the story.
		 */
		if (iocp->ioc_count == TRANSPARENT)
			copyin_size = SIZEOF_STRUCT(ifconf,
			    iocp->ioc_flag);
		else
			copyin_size = iocp->ioc_count;
		mi_copyin(q, mp, NULL, copyin_size);
		return;

	case O_SIOCGLIFCONF:
	case SIOCGLIFCONF:
		copyin_size = SIZEOF_STRUCT(lifconf, iocp->ioc_flag);
		mi_copyin(q, mp, NULL, copyin_size);
		return;

	case SIOCGLIFSRCOF:
		copyin_size = SIZEOF_STRUCT(lifsrcof, iocp->ioc_flag);
		mi_copyin(q, mp, NULL, copyin_size);
		return;

	case SIOCGIP6ADDRPOLICY:
		ip_sioctl_ip6addrpolicy(q, mp);
		/* Release the hold taken by ip6_asp_can_lookup() above. */
		ip6_asp_table_refrele(ipst);
		return;

	case SIOCSIP6ADDRPOLICY:
		ip_sioctl_ip6addrpolicy(q, mp);
		return;

	case SIOCGDSTINFO:
		ip_sioctl_dstinfo(q, mp);
		ip6_asp_table_refrele(ipst);
		return;

	case ND_SET:
	case ND_GET:
		ip_process_legacy_nddprop(q, mp);
		return;

	case SIOCSETPROP:
	case SIOCGETPROP:
		ip_sioctl_getsetprop(q, mp);
		return;

	case I_PLINK:
	case I_PUNLINK:
	case I_LINK:
	case I_UNLINK:
		/*
		 * We treat non-persistent link similarly as the persistent
		 * link case, in terms of plumbing/unplumbing, as well as
		 * dynamic re-plumbing events indicator.  See comments
		 * in ip_sioctl_plink() for more.
		 *
		 * Request can be enqueued in the 'ipsq' while waiting
		 * to become exclusive.  So bump up the conn ref.
		 */
		if (CONN_Q(q)) {
			CONN_INC_REF(Q_TO_CONN(q));
			CONN_INC_IOCTLREF(Q_TO_CONN(q))
		}
		ip_sioctl_plink(NULL, q, mp, NULL);
		return;

	case IP_IOCTL:
		ip_wput_ioctl(q, mp);
		return;

	case SIOCILB:
		/* The ioctl length varies depending on the ILB command. */
		copyin_size = iocp->ioc_count;
		if (copyin_size < sizeof (ilb_cmd_t))
			goto nak;
		mi_copyin(q, mp, NULL, copyin_size);
		return;

	default:
		cmn_err(CE_WARN, "Unknown ioctl %d/0x%x slipped through.",
		    iocp->ioc_cmd, iocp->ioc_cmd);
		/* FALLTHRU */
	}
nak:
	/* Common nak path: drop the data block and fail with EINVAL. */
	if (mp->b_cont != NULL) {
		freemsg(mp->b_cont);
		mp->b_cont = NULL;
	}
	iocp->ioc_error = EINVAL;
	mp->b_datap->db_type = M_IOCNAK;
	iocp->ioc_count = 0;
	qreply(q, mp);
}
9292 9290
/*
 * Fill in the reply for a SIOCG[X]ARP ioctl: translate the neighbor-cache
 * flags (NCE_F_*) into arp flags (ATF_*) and, when a hardware address is
 * known, copy it into the caller's arpreq/xarpreq.  Any error is reported
 * by setting iocp->ioc_error in place.
 */
static void
ip_sioctl_garp_reply(mblk_t *mp, ill_t *ill, void *hwaddr, int flags)
{
	struct arpreq *ar;
	struct xarpreq *xar;
	mblk_t *tmp;
	struct iocblk *iocp;
	int x_arp_ioctl = B_FALSE;
	int *flagsp;
	char *storage = NULL;

	ASSERT(ill != NULL);

	iocp = (struct iocblk *)mp->b_rptr;
	ASSERT(iocp->ioc_cmd == SIOCGXARP || iocp->ioc_cmd == SIOCGARP);

	tmp = (mp->b_cont)->b_cont; /* xarpreq/arpreq */
	if ((iocp->ioc_cmd == SIOCGXARP) ||
	    (iocp->ioc_cmd == SIOCSXARP)) {
		x_arp_ioctl = B_TRUE;
		xar = (struct xarpreq *)tmp->b_rptr;
		flagsp = &xar->xarp_flags;
		storage = xar->xarp_ha.sdl_data;
	} else {
		ar = (struct arpreq *)tmp->b_rptr;
		flagsp = &ar->arp_flags;
		storage = ar->arp_ha.sa_data;
	}

	/*
	 * For the extended form, fill in the sockaddr_dl header and make
	 * sure the name + hardware address fit in sdl_data.
	 */
	if (x_arp_ioctl) {
		storage += ill_xarp_info(&xar->xarp_ha, ill);
		if ((ill->ill_phys_addr_length + ill->ill_name_length) >
		    sizeof (xar->xarp_ha.sdl_data)) {
			iocp->ioc_error = EINVAL;
			return;
		}
	}
	*flagsp = ATF_INUSE;
	/*
	 * If /sbin/arp told us we are the authority using the "permanent"
	 * flag, or if this is one of my addresses print "permanent"
	 * in the /sbin/arp output.
	 */
	if ((flags & NCE_F_MYADDR) || (flags & NCE_F_AUTHORITY))
		*flagsp |= ATF_AUTHORITY;
	if (flags & NCE_F_NONUD)
		*flagsp |= ATF_PERM; /* not subject to aging */
	if (flags & NCE_F_PUBLISH)
		*flagsp |= ATF_PUBL;
	if (hwaddr != NULL) {
		/* Entry is complete: report the hardware address. */
		*flagsp |= ATF_COM;
		bcopy((char *)hwaddr, storage, ill->ill_phys_addr_length);
	}
}
9350 9348
/*
 * Create a new logical interface.  If ipif_id is zero (i.e. not a logical
 * interface) create the next available logical interface for this
 * physical interface.
 * If ipif is NULL (i.e. the lookup didn't find one) attempt to create an
 * ipif with the specified name.
 *
 * If the address family is not AF_UNSPEC then set the address as well.
 *
 * If ip_sioctl_addr returns EINPROGRESS then the ioctl (the copyout)
 * is completed when the DL_BIND_ACK arrives in ip_rput_dlpi_writer.
 *
 * Executed as a writer on the ill.
 * So no lock is needed to traverse the ipif chain, or examine the
 * phyint flags.
 */
/* ARGSUSED */
int
ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq)
{
	mblk_t *mp1;
	struct lifreq *lifr;
	boolean_t isv6;
	boolean_t exists;
	char *name;
	char *endp;
	char *cp;
	int namelen;
	ipif_t *ipif;
	long id;
	ipsq_t *ipsq;
	ill_t *ill;
	sin_t *sin;
	int err = 0;
	boolean_t found_sep = B_FALSE;
	conn_t *connp;
	zoneid_t zoneid;
	ip_stack_t *ipst = CONNQ_TO_IPST(q);

	ASSERT(q->q_next == NULL);
	ip1dbg(("ip_sioctl_addif\n"));
	/* Existence of mp1 has been checked in ip_wput_nondata */
	mp1 = mp->b_cont->b_cont;
	/*
	 * Null terminate the string to protect against buffer
	 * overrun.  String was generated by user code and may not
	 * be trusted.
	 */
	lifr = (struct lifreq *)mp1->b_rptr;
	lifr->lifr_name[LIFNAMSIZ - 1] = '\0';
	name = lifr->lifr_name;
	ASSERT(CONN_Q(q));
	connp = Q_TO_CONN(q);
	isv6 = (connp->conn_family == AF_INET6);
	zoneid = connp->conn_zoneid;
	namelen = mi_strlen(name);
	if (namelen == 0)
		return (EINVAL);

	exists = B_FALSE;
	if ((namelen + 1 == sizeof (ipif_loopback_name)) &&
	    (mi_strcmp(name, ipif_loopback_name) == 0)) {
		/*
		 * Allow creating lo0 using SIOCLIFADDIF.
		 * can't be any other writer thread.  So can pass null below
		 * for the last 4 args to ipif_lookup_name.
		 */
		ipif = ipif_lookup_on_name(lifr->lifr_name, namelen, B_TRUE,
		    &exists, isv6, zoneid, ipst);
		/* Prevent any further action */
		if (ipif == NULL) {
			return (ENOBUFS);
		} else if (!exists) {
			/* We created the ipif now and as writer */
			ipif_refrele(ipif);
			return (0);
		} else {
			/*
			 * lo0 already exists; hold the ill so we can
			 * serialize on it below, then drop the ipif ref.
			 */
			ill = ipif->ipif_ill;
			ill_refhold(ill);
			ipif_refrele(ipif);
		}
	} else {
		/* Look for a colon in the name. */
		endp = &name[namelen];
		for (cp = endp; --cp > name; ) {
			if (*cp == IPIF_SEPARATOR_CHAR) {
				found_sep = B_TRUE;
				/*
				 * Reject any non-decimal aliases for plumbing
				 * of logical interfaces.  Aliases with leading
				 * zeroes are also rejected as they introduce
				 * ambiguity in the naming of the interfaces.
				 * Comparing with "0" takes care of all such
				 * cases.
				 */
				if ((strncmp("0", cp+1, 1)) == 0)
					return (EINVAL);

				if (ddi_strtol(cp+1, &endp, 10, &id) != 0 ||
				    id <= 0 || *endp != '\0') {
					return (EINVAL);
				}
				/* Temporarily split name at the separator. */
				*cp = '\0';
				break;
			}
		}
		ill = ill_lookup_on_name(name, B_FALSE, isv6, NULL, ipst);
		if (found_sep)
			*cp = IPIF_SEPARATOR_CHAR;
		if (ill == NULL)
			return (ENXIO);
	}

	ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_process_ioctl, NEW_OP,
	    B_TRUE);

	/*
	 * Release the refhold due to the lookup, now that we are excl
	 * or we are just returning
	 */
	ill_refrele(ill);

	if (ipsq == NULL)
		return (EINPROGRESS);

	/* We are now exclusive on the IPSQ */
	ASSERT(IAM_WRITER_ILL(ill));

	if (found_sep) {
		/* Now see if there is an IPIF with this unit number. */
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if (ipif->ipif_id == id) {
				err = EEXIST;
				goto done;
			}
		}
	}

	/*
	 * We use IRE_LOCAL for lo0:1 etc. for "receive only" use
	 * of lo0.  Plumbing for lo0:0 happens in ipif_lookup_on_name()
	 * instead.
	 */
	if ((ipif = ipif_allocate(ill, found_sep ? id : -1, IRE_LOCAL,
	    B_TRUE, B_TRUE, &err)) == NULL) {
		goto done;
	}

	/* Return created name with ioctl */
	(void) sprintf(lifr->lifr_name, "%s%c%d", ill->ill_name,
	    IPIF_SEPARATOR_CHAR, ipif->ipif_id);
	ip1dbg(("created %s\n", lifr->lifr_name));

	/* Set address */
	sin = (sin_t *)&lifr->lifr_addr;
	if (sin->sin_family != AF_UNSPEC) {
		err = ip_sioctl_addr(ipif, sin, q, mp,
		    &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], lifr);
	}

done:
	ipsq_exit(ipsq);
	return (err);
}
9517 9515
/*
 * Remove an existing logical interface.  If ipif_id is zero (i.e. not a
 * logical interface) delete it based on the IP address (on this physical
 * interface).  Otherwise delete it based on the ipif_id.
 * Also, special handling to allow a removeif of lo0.
 */
/* ARGSUSED */
int
ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *dummy_if_req)
{
	conn_t *connp;
	ill_t *ill = ipif->ipif_ill;
	boolean_t success;
	ip_stack_t *ipst;

	ipst = CONNQ_TO_IPST(q);

	ASSERT(q->q_next == NULL);
	ip1dbg(("ip_sioctl_remove_if(%s:%u %p)\n",
	    ill->ill_name, ipif->ipif_id, (void *)ipif));
	ASSERT(IAM_WRITER_IPIF(ipif));

	connp = Q_TO_CONN(q);
	/*
	 * Special case for unplumbing lo0 (the loopback physical interface).
	 * If unplumbing lo0, the incoming address structure has been
	 * initialized to all zeros.  When unplumbing lo0, all its logical
	 * interfaces must be removed too.
	 *
	 * Note that this interface may be called to remove a specific
	 * loopback logical interface (eg, lo0:1).  But in that case
	 * ipif->ipif_id != 0 so that the code path for that case is the
	 * same as any other interface (meaning it skips the code directly
	 * below).
	 */
	if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) {
		if (sin->sin_family == AF_UNSPEC &&
		    (IN6_IS_ADDR_UNSPECIFIED(&((sin6_t *)sin)->sin6_addr))) {
			/*
			 * Mark it condemned.  No new ref. will be made to ill.
			 */
			mutex_enter(&ill->ill_lock);
			ill->ill_state_flags |= ILL_CONDEMNED;
			for (ipif = ill->ill_ipif; ipif != NULL;
			    ipif = ipif->ipif_next) {
				ipif->ipif_state_flags |= IPIF_CONDEMNED;
			}
			mutex_exit(&ill->ill_lock);

			ipif = ill->ill_ipif;
			/* unplumb the loopback interface */
			ill_delete(ill);
			/* Lock order: conn_lock before ill_lock. */
			mutex_enter(&connp->conn_lock);
			mutex_enter(&ill->ill_lock);

			/* Are any references to this ill active */
			if (ill_is_freeable(ill)) {
				mutex_exit(&ill->ill_lock);
				mutex_exit(&connp->conn_lock);
				ill_delete_tail(ill);
				mi_free(ill);
				return (0);
			}
			/*
			 * References remain; park the ioctl until the last
			 * ref goes away (restart happens via ILL_FREE).
			 */
			success = ipsq_pending_mp_add(connp, ipif,
			    CONNP_TO_WQ(connp), mp, ILL_FREE);
			mutex_exit(&connp->conn_lock);
			mutex_exit(&ill->ill_lock);
			if (success)
				return (EINPROGRESS);
			else
				return (EINTR);
		}
	}

	if (ipif->ipif_id == 0) {
		ipsq_t *ipsq;

		/* Find based on address */
		if (ipif->ipif_isv6) {
			sin6_t *sin6;

			if (sin->sin_family != AF_INET6)
				return (EAFNOSUPPORT);

			sin6 = (sin6_t *)sin;
			/* We are a writer, so we should be able to lookup */
			ipif = ipif_lookup_addr_exact_v6(&sin6->sin6_addr, ill,
			    ipst);
		} else {
			if (sin->sin_family != AF_INET)
				return (EAFNOSUPPORT);

			/* We are a writer, so we should be able to lookup */
			ipif = ipif_lookup_addr_exact(sin->sin_addr.s_addr, ill,
			    ipst);
		}
		if (ipif == NULL) {
			return (EADDRNOTAVAIL);
		}

		/*
		 * It is possible for a user to send an SIOCLIFREMOVEIF with
		 * lifr_name of the physical interface but with an ip address
		 * lifr_addr of a logical interface plumbed over it.
		 * So update ipx_current_ipif now that ipif points to the
		 * correct one.
		 */
		ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq;
		ipsq->ipsq_xop->ipx_current_ipif = ipif;

		/* This is a writer */
		ipif_refrele(ipif);
	}

	/*
	 * Can not delete instance zero since it is tied to the ill.
	 */
	if (ipif->ipif_id == 0)
		return (EBUSY);

	mutex_enter(&ill->ill_lock);
	ipif->ipif_state_flags |= IPIF_CONDEMNED;
	mutex_exit(&ill->ill_lock);

	ipif_free(ipif);

	mutex_enter(&connp->conn_lock);
	mutex_enter(&ill->ill_lock);

	/* Are any references to this ipif active */
	if (ipif_is_freeable(ipif)) {
		mutex_exit(&ill->ill_lock);
		mutex_exit(&connp->conn_lock);
		ipif_non_duplicate(ipif);
		(void) ipif_down_tail(ipif);
		ipif_free_tail(ipif); /* frees ipif */
		return (0);
	}
	/* References remain; defer the free until the refcnt drops to 0. */
	success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), mp,
	    IPIF_FREE);
	mutex_exit(&ill->ill_lock);
	mutex_exit(&connp->conn_lock);
	if (success)
		return (EINPROGRESS);
	else
		return (EINTR);
}
9666 9664
/*
 * Restart the removeif ioctl.  The refcnt has gone down to 0.
 * The ipif is already condemned.  So can't find it thru lookups.
 * Finishes the teardown that ip_sioctl_removeif() deferred.
 */
/* ARGSUSED */
int
ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q,
    mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_if_req)
{
	ill_t *ill = ipif->ipif_ill;

	ASSERT(IAM_WRITER_IPIF(ipif));
	ASSERT(ipif->ipif_state_flags & IPIF_CONDEMNED);

	ip1dbg(("ip_sioctl_removeif_restart(%s:%u %p)\n",
	    ill->ill_name, ipif->ipif_id, (void *)ipif));

	/* The whole-ill (loopback) case: free the entire ill. */
	if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) {
		ASSERT(ill->ill_state_flags & ILL_CONDEMNED);
		ill_delete_tail(ill);
		mi_free(ill);
		return (0);
	}

	ipif_non_duplicate(ipif);
	(void) ipif_down_tail(ipif);
	ipif_free_tail(ipif);

	return (0);
}
9697 9695
/*
 * Set the local interface address using the given prefix and ill_token.
 * The token bits are OR-ed into the caller-supplied prefix in place,
 * and the combined address is then set via ip_sioctl_addr().
 * IPv6 only.
 */
/* ARGSUSED */
int
ip_sioctl_prefix(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq)
{
	int err;
	in6_addr_t v6addr;
	sin6_t *sin6;
	ill_t *ill;
	int i;

	ip1dbg(("ip_sioctl_prefix(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

	ASSERT(IAM_WRITER_IPIF(ipif));

	if (!ipif->ipif_isv6)
		return (EINVAL);

	if (sin->sin_family != AF_INET6)
		return (EAFNOSUPPORT);

	sin6 = (sin6_t *)sin;
	v6addr = sin6->sin6_addr;
	ill = ipif->ipif_ill;

	/* Both a prefix and a token are required to form an address. */
	if (IN6_IS_ADDR_UNSPECIFIED(&v6addr) ||
	    IN6_IS_ADDR_UNSPECIFIED(&ill->ill_token))
		return (EADDRNOTAVAIL);

	/* Combine prefix and token; sin6 is then passed on as the address. */
	for (i = 0; i < 4; i++)
		sin6->sin6_addr.s6_addr32[i] |= ill->ill_token.s6_addr32[i];

	err = ip_sioctl_addr(ipif, sin, q, mp,
	    &ip_ndx_ioctl_table[SIOCLIFADDR_NDX], dummy_ifreq);
	return (err);
}
9738 9736
/*
 * Restart entry point to restart the address set operation after the
 * refcounts have dropped to zero.  Delegates to ip_sioctl_addr_restart().
 */
/* ARGSUSED */
int
ip_sioctl_prefix_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	ip1dbg(("ip_sioctl_prefix_restart(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	return (ip_sioctl_addr_restart(ipif, sin, q, mp, ipip, ifreq));
}
9752 9750
9753 9751 /*
9754 9752 * Set the local interface address.
9755 9753 * Allow an address of all zero when the interface is down.
9756 9754 */
9757 9755 /* ARGSUSED */
9758 9756 int
9759 9757 ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
9760 9758 ip_ioctl_cmd_t *dummy_ipip, void *dummy_ifreq)
9761 9759 {
9762 9760 int err = 0;
9763 9761 in6_addr_t v6addr;
9764 9762 boolean_t need_up = B_FALSE;
9765 9763 ill_t *ill;
9766 9764
9767 9765 ip1dbg(("ip_sioctl_addr(%s:%u %p)\n",
9768 9766 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
9769 9767
9770 9768 ASSERT(IAM_WRITER_IPIF(ipif));
9771 9769
9772 9770 ill = ipif->ipif_ill;
9773 9771 if (ipif->ipif_isv6) {
9774 9772 sin6_t *sin6;
9775 9773 phyint_t *phyi;
9776 9774
9777 9775 if (sin->sin_family != AF_INET6)
9778 9776 return (EAFNOSUPPORT);
9779 9777
9780 9778 sin6 = (sin6_t *)sin;
9781 9779 v6addr = sin6->sin6_addr;
9782 9780 phyi = ill->ill_phyint;
9783 9781
9784 9782 /*
9785 9783 * Enforce that true multicast interfaces have a link-local
9786 9784 * address for logical unit 0.
9787 9785 *
9788 9786 * However for those ipif's for which link-local address was
9789 9787 * not created by default, also allow setting :: as the address.
9790 9788 * This scenario would arise, when we delete an address on ipif
9791 9789 * with logical unit 0, we would want to set :: as the address.
9792 9790 */
9793 9791 if (ipif->ipif_id == 0 &&
9794 9792 (ill->ill_flags & ILLF_MULTICAST) &&
9795 9793 !(ipif->ipif_flags & (IPIF_POINTOPOINT)) &&
9796 9794 !(phyi->phyint_flags & (PHYI_LOOPBACK)) &&
9797 9795 !IN6_IS_ADDR_LINKLOCAL(&v6addr)) {
9798 9796
9799 9797 /*
9800 9798 * if default link-local was not created by kernel for
9801 9799 * this ill, allow setting :: as the address on ipif:0.
9802 9800 */
9803 9801 if (ill->ill_flags & ILLF_NOLINKLOCAL) {
9804 9802 if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr))
9805 9803 return (EADDRNOTAVAIL);
9806 9804 } else {
9807 9805 return (EADDRNOTAVAIL);
9808 9806 }
9809 9807 }
9810 9808
9811 9809 /*
9812 9810 * up interfaces shouldn't have the unspecified address
9813 9811 * unless they also have the IPIF_NOLOCAL flags set and
9814 9812 * have a subnet assigned.
9815 9813 */
9816 9814 if ((ipif->ipif_flags & IPIF_UP) &&
9817 9815 IN6_IS_ADDR_UNSPECIFIED(&v6addr) &&
9818 9816 (!(ipif->ipif_flags & IPIF_NOLOCAL) ||
9819 9817 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) {
9820 9818 return (EADDRNOTAVAIL);
9821 9819 }
9822 9820
9823 9821 if (!ip_local_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask))
9824 9822 return (EADDRNOTAVAIL);
9825 9823 } else {
9826 9824 ipaddr_t addr;
9827 9825
9828 9826 if (sin->sin_family != AF_INET)
9829 9827 return (EAFNOSUPPORT);
9830 9828
9831 9829 addr = sin->sin_addr.s_addr;
9832 9830
9833 9831 /* Allow INADDR_ANY as the local address. */
9834 9832 if (addr != INADDR_ANY &&
9835 9833 !ip_addr_ok_v4(addr, ipif->ipif_net_mask))
9836 9834 return (EADDRNOTAVAIL);
9837 9835
9838 9836 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
9839 9837 }
9840 9838 /* verify that the address being configured is permitted by mac */
9841 9839 if (!ill_ipcheck_addr(ill, &v6addr)) {
9842 9840 return (EPERM);
9843 9841 }
9844 9842 /*
9845 9843 * Even if there is no change we redo things just to rerun
9846 9844 * ipif_set_default.
9847 9845 */
9848 9846 if (ipif->ipif_flags & IPIF_UP) {
9849 9847 /*
9850 9848 * Setting a new local address, make sure
9851 9849 * we have net and subnet bcast ire's for
9852 9850 * the old address if we need them.
9853 9851 */
9854 9852 /*
9855 9853 * If the interface is already marked up,
9856 9854 * we call ipif_down which will take care
9857 9855 * of ditching any IREs that have been set
9858 9856 * up based on the old interface address.
9859 9857 */
9860 9858 err = ipif_logical_down(ipif, q, mp);
9861 9859 if (err == EINPROGRESS)
9862 9860 return (err);
9863 9861 (void) ipif_down_tail(ipif);
9864 9862 need_up = 1;
9865 9863 }
9866 9864
9867 9865 err = ip_sioctl_addr_tail(ipif, sin, q, mp, need_up);
9868 9866 return (err);
9869 9867 }
9870 9868
/*
 * Finish setting the local address of `ipif' to the sockaddr `sin' that was
 * already validated by ip_sioctl_addr().  Installs the new address under
 * ill_lock, notifies SCTP and routing-socket listeners, clears any
 * duplicate-address (DAD) state, and — when `need_up' is set — brings the
 * ipif back up.  May return EINPROGRESS, in which case the ioctl completes
 * later in ip_rput_dlpi when the DL_BIND_ACK arrives.
 * Runs as the exclusive writer on the ipif.
 */
int
ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    boolean_t need_up)
{
	in6_addr_t v6addr;
	in6_addr_t ov6addr;
	ipaddr_t addr;
	sin6_t	*sin6;
	int	sinlen;
	int	err = 0;
	ill_t	*ill = ipif->ipif_ill;
	boolean_t need_dl_down;
	boolean_t need_arp_down;
	struct iocblk *iocp;

	/* mp may be NULL when invoked internally with no ioctl in flight */
	iocp = (mp != NULL) ? (struct iocblk *)mp->b_rptr : NULL;

	ip1dbg(("ip_sioctl_addr_tail(%s:%u %p)\n",
	    ill->ill_name, ipif->ipif_id, (void *)ipif));
	ASSERT(IAM_WRITER_IPIF(ipif));

	/* Must cancel any pending timer before taking the ill_lock */
	if (ipif->ipif_recovery_id != 0)
		(void) untimeout(ipif->ipif_recovery_id);
	ipif->ipif_recovery_id = 0;

	/* Extract the new address and remember the sockaddr length. */
	if (ipif->ipif_isv6) {
		sin6 = (sin6_t *)sin;
		v6addr = sin6->sin6_addr;
		sinlen = sizeof (struct sockaddr_in6);
	} else {
		addr = sin->sin_addr.s_addr;
		/* v4 addresses are stored internally in v4-mapped v6 form */
		IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
		sinlen = sizeof (struct sockaddr_in);
	}
	mutex_enter(&ill->ill_lock);
	ov6addr = ipif->ipif_v6lcl_addr;
	ipif->ipif_v6lcl_addr = v6addr;
	/* Let SCTP rework its associations for the address change. */
	sctp_update_ipif_addr(ipif, ov6addr);
	/* New address has not yet passed duplicate-address detection. */
	ipif->ipif_addr_ready = 0;

	ip_rts_newaddrmsg(RTM_CHGADDR, 0, ipif, RTSQ_DEFAULT);

	/*
	 * If the interface was previously marked as a duplicate, then since
	 * we've now got a "new" address, it should no longer be considered a
	 * duplicate -- even if the "new" address is the same as the old one.
	 * Note that if all ipifs are down, we may have a pending ARP down
	 * event to handle. This is because we want to recover from duplicates
	 * and thus delay tearing down ARP until the duplicates have been
	 * removed or disabled.
	 */
	need_dl_down = need_arp_down = B_FALSE;
	if (ipif->ipif_flags & IPIF_DUPLICATE) {
		need_arp_down = !need_up;
		ipif->ipif_flags &= ~IPIF_DUPLICATE;
		if (--ill->ill_ipif_dup_count == 0 && !need_up &&
		    ill->ill_ipif_up_count == 0 && ill->ill_dl_up) {
			need_dl_down = B_TRUE;
		}
	}

	ipif_set_default(ipif);

	/*
	 * If we've just manually set the IPv6 link-local address (0th ipif),
	 * tag the ill so that future updates to the interface ID don't result
	 * in this address getting automatically reconfigured from under the
	 * administrator.
	 */
	if (ipif->ipif_isv6 && ipif->ipif_id == 0) {
		if (iocp == NULL || (iocp->ioc_cmd == SIOCSLIFADDR &&
		    !IN6_IS_ADDR_UNSPECIFIED(&v6addr)))
			ill->ill_manual_linklocal = 1;
	}

	/*
	 * When publishing an interface address change event, we only notify
	 * the event listeners of the new address.  It is assumed that if they
	 * actively care about the addresses assigned that they will have
	 * already discovered the previous address assigned (if there was one.)
	 *
	 * Don't attach nic event message for SIOCLIFADDIF ioctl.
	 */
	if (iocp != NULL && iocp->ioc_cmd != SIOCLIFADDIF) {
		ill_nic_event_dispatch(ill, MAP_IPIF_ID(ipif->ipif_id),
		    NE_ADDRESS_CHANGE, sin, sinlen);
	}

	mutex_exit(&ill->ill_lock);

	if (need_up) {
		/*
		 * Now bring the interface back up.  If this
		 * is the only IPIF for the ILL, ipif_up
		 * will have to re-bind to the device, so
		 * we may get back EINPROGRESS, in which
		 * case, this IOCTL will get completed in
		 * ip_rput_dlpi when we see the DL_BIND_ACK.
		 */
		err = ipif_up(ipif, q, mp);
	} else {
		/* Perhaps ilgs should use this ill */
		update_conn_ill(NULL, ill->ill_ipst);
	}

	if (need_dl_down)
		ill_dl_down(ill);

	if (need_arp_down && !ill->ill_isv6)
		(void) ipif_arp_down(ipif);

	/*
	 * The default multicast interface might have changed (for
	 * instance if the IPv6 scope of the address changed)
	 */
	ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6);

	return (err);
}
9991 9989
/*
 * Restart entry point to restart the address set operation after the
 * refcounts have dropped to zero.  Invoked from the ipsq machinery once
 * the ipif has quiesced; completes the deferred down and then redoes
 * the address assignment via ip_sioctl_addr_tail().
 */
/* ARGSUSED */
int
ip_sioctl_addr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	ip1dbg(("ip_sioctl_addr_restart(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	ASSERT(IAM_WRITER_IPIF(ipif));
	/* Finish the interrupted ipif_down before re-applying the address. */
	(void) ipif_down_tail(ipif);
	return (ip_sioctl_addr_tail(ipif, sin, q, mp, B_TRUE));
}
10007 10005
10008 10006 /* ARGSUSED */
10009 10007 int
10010 10008 ip_sioctl_get_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10011 10009 ip_ioctl_cmd_t *ipip, void *if_req)
10012 10010 {
10013 10011 sin6_t *sin6 = (struct sockaddr_in6 *)sin;
10014 10012 struct lifreq *lifr = (struct lifreq *)if_req;
10015 10013
10016 10014 ip1dbg(("ip_sioctl_get_addr(%s:%u %p)\n",
10017 10015 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10018 10016 /*
10019 10017 * The net mask and address can't change since we have a
10020 10018 * reference to the ipif. So no lock is necessary.
10021 10019 */
10022 10020 if (ipif->ipif_isv6) {
10023 10021 *sin6 = sin6_null;
10024 10022 sin6->sin6_family = AF_INET6;
10025 10023 sin6->sin6_addr = ipif->ipif_v6lcl_addr;
10026 10024 if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) {
10027 10025 sin6->sin6_scope_id =
10028 10026 ipif->ipif_ill->ill_phyint->phyint_ifindex;
10029 10027 }
10030 10028 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
10031 10029 lifr->lifr_addrlen =
10032 10030 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
10033 10031 } else {
10034 10032 *sin = sin_null;
10035 10033 sin->sin_family = AF_INET;
10036 10034 sin->sin_addr.s_addr = ipif->ipif_lcl_addr;
10037 10035 if (ipip->ipi_cmd_type == LIF_CMD) {
10038 10036 lifr->lifr_addrlen =
10039 10037 ip_mask_to_plen(ipif->ipif_net_mask);
10040 10038 }
10041 10039 }
10042 10040 return (0);
10043 10041 }
10044 10042
/*
 * Set the destination address for a pt-pt interface.
 *
 * Validates the sockaddr family and that the address is acceptable for the
 * ipif's netmask, then hands off to ip_sioctl_dstaddr_tail().  If the ipif
 * is currently up it is logically downed first so that IREs derived from
 * the old destination are torn down; EINPROGRESS may be returned, in which
 * case the ioctl completes later in ip_rput_dlpi_writer.
 */
/* ARGSUSED */
int
ip_sioctl_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	int err = 0;
	in6_addr_t v6addr;
	boolean_t need_up = B_FALSE;

	ip1dbg(("ip_sioctl_dstaddr(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	ASSERT(IAM_WRITER_IPIF(ipif));

	/* Validate family and address against the ipif's netmask. */
	if (ipif->ipif_isv6) {
		sin6_t *sin6;

		if (sin->sin_family != AF_INET6)
			return (EAFNOSUPPORT);

		sin6 = (sin6_t *)sin;
		v6addr = sin6->sin6_addr;

		if (!ip_remote_addr_ok_v6(&v6addr, &ipif->ipif_v6net_mask))
			return (EADDRNOTAVAIL);
	} else {
		ipaddr_t addr;

		if (sin->sin_family != AF_INET)
			return (EAFNOSUPPORT);

		addr = sin->sin_addr.s_addr;
		/* INADDR_ANY is allowed; anything else must pass mask check */
		if (addr != INADDR_ANY &&
		    !ip_addr_ok_v4(addr, ipif->ipif_net_mask)) {
			return (EADDRNOTAVAIL);
		}

		/* v4 destinations are stored in v4-mapped v6 form. */
		IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
	}

	if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, &v6addr))
		return (0);	/* No change */

	if (ipif->ipif_flags & IPIF_UP) {
		/*
		 * If the interface is already marked up,
		 * we call ipif_down which will take care
		 * of ditching any IREs that have been set
		 * up based on the old pp dst address.
		 */
		err = ipif_logical_down(ipif, q, mp);
		if (err == EINPROGRESS)
			return (err);
		(void) ipif_down_tail(ipif);
		need_up = B_TRUE;
	}
	/*
	 * could return EINPROGRESS. If so ioctl will complete in
	 * ip_rput_dlpi_writer
	 */
	err = ip_sioctl_dstaddr_tail(ipif, sin, q, mp, need_up);
	return (err);
}
10110 10108
/*
 * Finish setting the pt-pt destination address of `ipif' to the (already
 * validated) sockaddr `sin'.  Installs the new destination and mirrors it
 * into the subnet under ill_lock, clears duplicate-address state, and if
 * `need_up' is set brings the ipif back up (possibly returning EINPROGRESS,
 * completed later in ip_rput_dlpi).
 */
static int
ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    boolean_t need_up)
{
	in6_addr_t v6addr;
	ill_t	*ill = ipif->ipif_ill;
	int	err = 0;
	boolean_t need_dl_down;
	boolean_t need_arp_down;

	ip1dbg(("ip_sioctl_dstaddr_tail(%s:%u %p)\n", ill->ill_name,
	    ipif->ipif_id, (void *)ipif));

	/* Must cancel any pending timer before taking the ill_lock */
	if (ipif->ipif_recovery_id != 0)
		(void) untimeout(ipif->ipif_recovery_id);
	ipif->ipif_recovery_id = 0;

	/* Extract the destination; v4 is kept in v4-mapped v6 form. */
	if (ipif->ipif_isv6) {
		sin6_t *sin6;

		sin6 = (sin6_t *)sin;
		v6addr = sin6->sin6_addr;
	} else {
		ipaddr_t addr;

		addr = sin->sin_addr.s_addr;
		IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
	}
	mutex_enter(&ill->ill_lock);
	/* Set point to point destination address. */
	if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
		/*
		 * Allow this as a means of creating logical
		 * pt-pt interfaces on top of e.g. an Ethernet.
		 * XXX Undocumented HACK for testing.
		 * pt-pt interfaces are created with NUD disabled.
		 */
		ipif->ipif_flags |= IPIF_POINTOPOINT;
		ipif->ipif_flags &= ~IPIF_BROADCAST;
		if (ipif->ipif_isv6)
			ill->ill_flags |= ILLF_NONUD;
	}

	/*
	 * If the interface was previously marked as a duplicate, then since
	 * we've now got a "new" address, it should no longer be considered a
	 * duplicate -- even if the "new" address is the same as the old one.
	 * Note that if all ipifs are down, we may have a pending ARP down
	 * event to handle.
	 */
	need_dl_down = need_arp_down = B_FALSE;
	if (ipif->ipif_flags & IPIF_DUPLICATE) {
		need_arp_down = !need_up;
		ipif->ipif_flags &= ~IPIF_DUPLICATE;
		if (--ill->ill_ipif_dup_count == 0 && !need_up &&
		    ill->ill_ipif_up_count == 0 && ill->ill_dl_up) {
			need_dl_down = B_TRUE;
		}
	}

	/*
	 * If we've just manually set the IPv6 destination link-local address
	 * (0th ipif), tag the ill so that future updates to the destination
	 * interface ID (as can happen with interfaces over IP tunnels) don't
	 * result in this address getting automatically reconfigured from
	 * under the administrator.
	 */
	if (ipif->ipif_isv6 && ipif->ipif_id == 0)
		ill->ill_manual_dst_linklocal = 1;

	/* Set the new address. */
	ipif->ipif_v6pp_dst_addr = v6addr;
	/* Make sure subnet tracks pp_dst */
	ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
	mutex_exit(&ill->ill_lock);

	if (need_up) {
		/*
		 * Now bring the interface back up.  If this
		 * is the only IPIF for the ILL, ipif_up
		 * will have to re-bind to the device, so
		 * we may get back EINPROGRESS, in which
		 * case, this IOCTL will get completed in
		 * ip_rput_dlpi when we see the DL_BIND_ACK.
		 */
		err = ipif_up(ipif, q, mp);
	}

	if (need_dl_down)
		ill_dl_down(ill);
	if (need_arp_down && !ipif->ipif_isv6)
		(void) ipif_arp_down(ipif);

	return (err);
}
10207 10205
10208 10206 /*
10209 10207 * Restart entry point to restart the dstaddress set operation after the
10210 10208 * refcounts have dropped to zero.
10211 10209 */
10212 10210 /* ARGSUSED */
10213 10211 int
10214 10212 ip_sioctl_dstaddr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10215 10213 ip_ioctl_cmd_t *ipip, void *ifreq)
10216 10214 {
10217 10215 ip1dbg(("ip_sioctl_dstaddr_restart(%s:%u %p)\n",
10218 10216 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10219 10217 (void) ipif_down_tail(ipif);
10220 10218 return (ip_sioctl_dstaddr_tail(ipif, sin, q, mp, B_TRUE));
10221 10219 }
10222 10220
10223 10221 /* ARGSUSED */
10224 10222 int
10225 10223 ip_sioctl_get_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10226 10224 ip_ioctl_cmd_t *ipip, void *if_req)
10227 10225 {
10228 10226 sin6_t *sin6 = (struct sockaddr_in6 *)sin;
10229 10227
10230 10228 ip1dbg(("ip_sioctl_get_dstaddr(%s:%u %p)\n",
10231 10229 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10232 10230 /*
10233 10231 * Get point to point destination address. The addresses can't
10234 10232 * change since we hold a reference to the ipif.
10235 10233 */
10236 10234 if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0)
10237 10235 return (EADDRNOTAVAIL);
10238 10236
10239 10237 if (ipif->ipif_isv6) {
10240 10238 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
10241 10239 *sin6 = sin6_null;
10242 10240 sin6->sin6_family = AF_INET6;
10243 10241 sin6->sin6_addr = ipif->ipif_v6pp_dst_addr;
10244 10242 } else {
10245 10243 *sin = sin_null;
10246 10244 sin->sin_family = AF_INET;
10247 10245 sin->sin_addr.s_addr = ipif->ipif_pp_dst_addr;
10248 10246 }
10249 10247 return (0);
10250 10248 }
10251 10249
10252 10250 /*
10253 10251 * Check which flags will change by the given flags being set
10254 10252 * silently ignore flags which userland is not allowed to control.
10255 10253 * (Because these flags may change between SIOCGLIFFLAGS and
10256 10254 * SIOCSLIFFLAGS, and that's outside of userland's control,
10257 10255 * we need to silently ignore them rather than fail.)
10258 10256 */
10259 10257 static void
10260 10258 ip_sioctl_flags_onoff(ipif_t *ipif, uint64_t flags, uint64_t *onp,
10261 10259 uint64_t *offp)
10262 10260 {
10263 10261 ill_t *ill = ipif->ipif_ill;
10264 10262 phyint_t *phyi = ill->ill_phyint;
10265 10263 uint64_t cantchange_flags, intf_flags;
10266 10264 uint64_t turn_on, turn_off;
10267 10265
10268 10266 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags;
10269 10267 cantchange_flags = IFF_CANTCHANGE;
10270 10268 if (IS_IPMP(ill))
10271 10269 cantchange_flags |= IFF_IPMP_CANTCHANGE;
10272 10270 turn_on = (flags ^ intf_flags) & ~cantchange_flags;
10273 10271 turn_off = intf_flags & turn_on;
10274 10272 turn_on ^= turn_off;
10275 10273 *onp = turn_on;
10276 10274 *offp = turn_off;
10277 10275 }
10278 10276
10279 10277 /*
10280 10278 * Set interface flags. Many flags require special handling (e.g.,
10281 10279 * bringing the interface down); see below for details.
10282 10280 *
10283 10281 * NOTE : We really don't enforce that ipif_id zero should be used
10284 10282 * for setting any flags other than IFF_LOGINT_FLAGS. This
10285 10283 * is because applications generally does SICGLIFFLAGS and
10286 10284 * ORs in the new flags (that affects the logical) and does a
10287 10285 * SIOCSLIFFLAGS. Thus, "flags" below could contain bits other
10288 10286 * than IFF_LOGINT_FLAGS. One could check whether "turn_on" - the
10289 10287 * flags that will be turned on is correct with respect to
10290 10288 * ipif_id 0. For backward compatibility reasons, it is not done.
10291 10289 */
10292 10290 /* ARGSUSED */
10293 10291 int
10294 10292 ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10295 10293 ip_ioctl_cmd_t *ipip, void *if_req)
10296 10294 {
10297 10295 uint64_t turn_on;
10298 10296 uint64_t turn_off;
10299 10297 int err = 0;
10300 10298 phyint_t *phyi;
10301 10299 ill_t *ill;
10302 10300 conn_t *connp;
10303 10301 uint64_t intf_flags;
10304 10302 boolean_t phyint_flags_modified = B_FALSE;
10305 10303 uint64_t flags;
10306 10304 struct ifreq *ifr;
10307 10305 struct lifreq *lifr;
10308 10306 boolean_t set_linklocal = B_FALSE;
10309 10307
10310 10308 ip1dbg(("ip_sioctl_flags(%s:%u %p)\n",
10311 10309 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10312 10310
10313 10311 ASSERT(IAM_WRITER_IPIF(ipif));
10314 10312
10315 10313 ill = ipif->ipif_ill;
10316 10314 phyi = ill->ill_phyint;
10317 10315
10318 10316 if (ipip->ipi_cmd_type == IF_CMD) {
10319 10317 ifr = (struct ifreq *)if_req;
10320 10318 flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff);
10321 10319 } else {
10322 10320 lifr = (struct lifreq *)if_req;
10323 10321 flags = lifr->lifr_flags;
10324 10322 }
10325 10323
10326 10324 intf_flags = ipif->ipif_flags | ill->ill_flags | phyi->phyint_flags;
10327 10325
10328 10326 /*
10329 10327 * Have the flags been set correctly until now?
10330 10328 */
10331 10329 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0);
10332 10330 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0);
10333 10331 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0);
10334 10332 /*
10335 10333 * Compare the new flags to the old, and partition
10336 10334 * into those coming on and those going off.
10337 10335 * For the 16 bit command keep the bits above bit 16 unchanged.
10338 10336 */
10339 10337 if (ipip->ipi_cmd == SIOCSIFFLAGS)
10340 10338 flags |= intf_flags & ~0xFFFF;
10341 10339
10342 10340 /*
10343 10341 * Explicitly fail attempts to change flags that are always invalid on
10344 10342 * an IPMP meta-interface.
10345 10343 */
10346 10344 if (IS_IPMP(ill) && ((flags ^ intf_flags) & IFF_IPMP_INVALID))
10347 10345 return (EINVAL);
10348 10346
10349 10347 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off);
10350 10348 if ((turn_on|turn_off) == 0)
10351 10349 return (0); /* No change */
10352 10350
10353 10351 /*
10354 10352 * All test addresses must be IFF_DEPRECATED (to ensure source address
10355 10353 * selection avoids them) -- so force IFF_DEPRECATED on, and do not
10356 10354 * allow it to be turned off.
10357 10355 */
10358 10356 if ((turn_off & (IFF_DEPRECATED|IFF_NOFAILOVER)) == IFF_DEPRECATED &&
10359 10357 (turn_on|intf_flags) & IFF_NOFAILOVER)
10360 10358 return (EINVAL);
10361 10359
10362 10360 if ((connp = Q_TO_CONN(q)) == NULL)
10363 10361 return (EINVAL);
10364 10362
10365 10363 /*
10366 10364 * Only vrrp control socket is allowed to change IFF_UP and
10367 10365 * IFF_NOACCEPT flags when IFF_VRRP is set.
10368 10366 */
10369 10367 if ((intf_flags & IFF_VRRP) && ((turn_off | turn_on) & IFF_UP)) {
10370 10368 if (!connp->conn_isvrrp)
10371 10369 return (EINVAL);
10372 10370 }
10373 10371
10374 10372 /*
10375 10373 * The IFF_NOACCEPT flag can only be set on an IFF_VRRP IP address by
10376 10374 * VRRP control socket.
10377 10375 */
10378 10376 if ((turn_off | turn_on) & IFF_NOACCEPT) {
10379 10377 if (!connp->conn_isvrrp || !(intf_flags & IFF_VRRP))
10380 10378 return (EINVAL);
10381 10379 }
10382 10380
10383 10381 if (turn_on & IFF_NOFAILOVER) {
10384 10382 turn_on |= IFF_DEPRECATED;
10385 10383 flags |= IFF_DEPRECATED;
10386 10384 }
10387 10385
10388 10386 /*
10389 10387 * On underlying interfaces, only allow applications to manage test
10390 10388 * addresses -- otherwise, they may get confused when the address
10391 10389 * moves as part of being brought up. Likewise, prevent an
10392 10390 * application-managed test address from being converted to a data
10393 10391 * address. To prevent migration of administratively up addresses in
10394 10392 * the kernel, we don't allow them to be converted either.
10395 10393 */
10396 10394 if (IS_UNDER_IPMP(ill)) {
10397 10395 const uint64_t appflags = IFF_DHCPRUNNING | IFF_ADDRCONF;
10398 10396
10399 10397 if ((turn_on & appflags) && !(flags & IFF_NOFAILOVER))
10400 10398 return (EINVAL);
10401 10399
10402 10400 if ((turn_off & IFF_NOFAILOVER) &&
10403 10401 (flags & (appflags | IFF_UP | IFF_DUPLICATE)))
10404 10402 return (EINVAL);
10405 10403 }
10406 10404
10407 10405 /*
10408 10406 * Only allow IFF_TEMPORARY flag to be set on
10409 10407 * IPv6 interfaces.
10410 10408 */
10411 10409 if ((turn_on & IFF_TEMPORARY) && !(ipif->ipif_isv6))
10412 10410 return (EINVAL);
10413 10411
10414 10412 /*
10415 10413 * cannot turn off IFF_NOXMIT on VNI interfaces.
10416 10414 */
10417 10415 if ((turn_off & IFF_NOXMIT) && IS_VNI(ipif->ipif_ill))
10418 10416 return (EINVAL);
10419 10417
10420 10418 /*
10421 10419 * Don't allow the IFF_ROUTER flag to be turned on on loopback
10422 10420 * interfaces. It makes no sense in that context.
10423 10421 */
10424 10422 if ((turn_on & IFF_ROUTER) && (phyi->phyint_flags & PHYI_LOOPBACK))
10425 10423 return (EINVAL);
10426 10424
10427 10425 /*
10428 10426 * For IPv6 ipif_id 0, don't allow the interface to be up without
10429 10427 * a link local address if IFF_NOLOCAL or IFF_ANYCAST are not set.
10430 10428 * If the link local address isn't set, and can be set, it will get
10431 10429 * set later on in this function.
10432 10430 */
10433 10431 if (ipif->ipif_id == 0 && ipif->ipif_isv6 &&
10434 10432 (flags & IFF_UP) && !(flags & (IFF_NOLOCAL|IFF_ANYCAST)) &&
10435 10433 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) {
10436 10434 if (ipif_cant_setlinklocal(ipif))
10437 10435 return (EINVAL);
10438 10436 set_linklocal = B_TRUE;
10439 10437 }
10440 10438
10441 10439 /*
10442 10440 * If we modify physical interface flags, we'll potentially need to
10443 10441 * send up two routing socket messages for the changes (one for the
10444 10442 * IPv4 ill, and another for the IPv6 ill). Note that here.
10445 10443 */
10446 10444 if ((turn_on|turn_off) & IFF_PHYINT_FLAGS)
10447 10445 phyint_flags_modified = B_TRUE;
10448 10446
10449 10447 /*
10450 10448 * All functioning PHYI_STANDBY interfaces start life PHYI_INACTIVE
10451 10449 * (otherwise, we'd immediately use them, defeating standby). Also,
10452 10450 * since PHYI_INACTIVE has a separate meaning when PHYI_STANDBY is not
10453 10451 * set, don't allow PHYI_STANDBY to be set if PHYI_INACTIVE is already
10454 10452 * set, and clear PHYI_INACTIVE if PHYI_STANDBY is being cleared. We
10455 10453 * also don't allow PHYI_STANDBY if VNI is enabled since its semantics
10456 10454 * will not be honored.
10457 10455 */
10458 10456 if (turn_on & PHYI_STANDBY) {
10459 10457 /*
10460 10458 * No need to grab ill_g_usesrc_lock here; see the
10461 10459 * synchronization notes in ip.c.
10462 10460 */
10463 10461 if (ill->ill_usesrc_grp_next != NULL ||
10464 10462 intf_flags & PHYI_INACTIVE)
10465 10463 return (EINVAL);
10466 10464 if (!(flags & PHYI_FAILED)) {
10467 10465 flags |= PHYI_INACTIVE;
10468 10466 turn_on |= PHYI_INACTIVE;
10469 10467 }
10470 10468 }
10471 10469
10472 10470 if (turn_off & PHYI_STANDBY) {
10473 10471 flags &= ~PHYI_INACTIVE;
10474 10472 turn_off |= PHYI_INACTIVE;
10475 10473 }
10476 10474
10477 10475 /*
10478 10476 * PHYI_FAILED and PHYI_INACTIVE are mutually exclusive; fail if both
10479 10477 * would end up on.
10480 10478 */
10481 10479 if ((flags & (PHYI_FAILED | PHYI_INACTIVE)) ==
10482 10480 (PHYI_FAILED | PHYI_INACTIVE))
10483 10481 return (EINVAL);
10484 10482
10485 10483 /*
10486 10484 * If ILLF_ROUTER changes, we need to change the ip forwarding
10487 10485 * status of the interface.
10488 10486 */
10489 10487 if ((turn_on | turn_off) & ILLF_ROUTER) {
10490 10488 err = ill_forward_set(ill, ((turn_on & ILLF_ROUTER) != 0));
10491 10489 if (err != 0)
10492 10490 return (err);
10493 10491 }
10494 10492
10495 10493 /*
10496 10494 * If the interface is not UP and we are not going to
10497 10495 * bring it UP, record the flags and return. When the
10498 10496 * interface comes UP later, the right actions will be
10499 10497 * taken.
10500 10498 */
10501 10499 if (!(ipif->ipif_flags & IPIF_UP) &&
10502 10500 !(turn_on & IPIF_UP)) {
10503 10501 /* Record new flags in their respective places. */
10504 10502 mutex_enter(&ill->ill_lock);
10505 10503 mutex_enter(&ill->ill_phyint->phyint_lock);
10506 10504 ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS);
10507 10505 ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS);
10508 10506 ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS);
10509 10507 ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS);
10510 10508 phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS);
10511 10509 phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS);
10512 10510 mutex_exit(&ill->ill_lock);
10513 10511 mutex_exit(&ill->ill_phyint->phyint_lock);
10514 10512
10515 10513 /*
10516 10514 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the
10517 10515 * same to the kernel: if any of them has been set by
10518 10516 * userland, the interface cannot be used for data traffic.
10519 10517 */
10520 10518 if ((turn_on|turn_off) &
10521 10519 (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) {
10522 10520 ASSERT(!IS_IPMP(ill));
10523 10521 /*
10524 10522 * It's possible the ill is part of an "anonymous"
10525 10523 * IPMP group rather than a real group. In that case,
10526 10524 * there are no other interfaces in the group and thus
10527 10525 * no need to call ipmp_phyint_refresh_active().
10528 10526 */
10529 10527 if (IS_UNDER_IPMP(ill))
10530 10528 ipmp_phyint_refresh_active(phyi);
10531 10529 }
10532 10530
10533 10531 if (phyint_flags_modified) {
10534 10532 if (phyi->phyint_illv4 != NULL) {
10535 10533 ip_rts_ifmsg(phyi->phyint_illv4->
10536 10534 ill_ipif, RTSQ_DEFAULT);
10537 10535 }
10538 10536 if (phyi->phyint_illv6 != NULL) {
10539 10537 ip_rts_ifmsg(phyi->phyint_illv6->
10540 10538 ill_ipif, RTSQ_DEFAULT);
10541 10539 }
10542 10540 }
10543 10541 /* The default multicast interface might have changed */
10544 10542 ire_increment_multicast_generation(ill->ill_ipst,
10545 10543 ill->ill_isv6);
10546 10544
10547 10545 return (0);
10548 10546 } else if (set_linklocal) {
10549 10547 mutex_enter(&ill->ill_lock);
10550 10548 if (set_linklocal)
10551 10549 ipif->ipif_state_flags |= IPIF_SET_LINKLOCAL;
10552 10550 mutex_exit(&ill->ill_lock);
10553 10551 }
10554 10552
10555 10553 /*
10556 10554 * Disallow IPv6 interfaces coming up that have the unspecified address,
10557 10555 * or point-to-point interfaces with an unspecified destination. We do
10558 10556 * allow the address to be unspecified for IPIF_NOLOCAL interfaces that
10559 10557 * have a subnet assigned, which is how in.ndpd currently manages its
10560 10558 * onlink prefix list when no addresses are configured with those
10561 10559 * prefixes.
10562 10560 */
10563 10561 if (ipif->ipif_isv6 &&
10564 10562 ((IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
10565 10563 (!(ipif->ipif_flags & IPIF_NOLOCAL) && !(turn_on & IPIF_NOLOCAL) ||
10566 10564 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet))) ||
10567 10565 ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
10568 10566 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)))) {
10569 10567 return (EINVAL);
10570 10568 }
10571 10569
10572 10570 /*
10573 10571 * Prevent IPv4 point-to-point interfaces with a 0.0.0.0 destination
10574 10572 * from being brought up.
10575 10573 */
10576 10574 if (!ipif->ipif_isv6 &&
10577 10575 ((ipif->ipif_flags & IPIF_POINTOPOINT) &&
10578 10576 ipif->ipif_pp_dst_addr == INADDR_ANY)) {
10579 10577 return (EINVAL);
10580 10578 }
10581 10579
10582 10580 /*
10583 10581 * If we are going to change one or more of the flags that are
10584 10582 * IPIF_UP, IPIF_DEPRECATED, IPIF_NOXMIT, IPIF_NOLOCAL, ILLF_NOARP,
10585 10583 * ILLF_NONUD, IPIF_PRIVATE, IPIF_ANYCAST, IPIF_PREFERRED, and
10586 10584 * IPIF_NOFAILOVER, we will take special action. This is
10587 10585 * done by bring the ipif down, changing the flags and bringing
10588 10586 * it back up again. For IPIF_NOFAILOVER, the act of bringing it
10589 10587 * back up will trigger the address to be moved.
10590 10588 *
10591 10589 * If we are going to change IFF_NOACCEPT, we need to bring
10592 10590 * all the ipifs down then bring them up again. The act of
10593 10591 * bringing all the ipifs back up will trigger the local
10594 10592 * ires being recreated with "no_accept" set/cleared.
10595 10593 *
10596 10594 * Note that ILLF_NOACCEPT is always set separately from the
10597 10595 * other flags.
10598 10596 */
10599 10597 if ((turn_on|turn_off) &
10600 10598 (IPIF_UP|IPIF_DEPRECATED|IPIF_NOXMIT|IPIF_NOLOCAL|ILLF_NOARP|
10601 10599 ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED|
10602 10600 IPIF_NOFAILOVER)) {
10603 10601 /*
10604 10602 * ipif_down() will ire_delete bcast ire's for the subnet,
10605 10603 * while the ire_identical_ref tracks the case of IRE_BROADCAST
10606 10604 * entries shared between multiple ipifs on the same subnet.
10607 10605 */
10608 10606 if (((ipif->ipif_flags | turn_on) & IPIF_UP) &&
10609 10607 !(turn_off & IPIF_UP)) {
10610 10608 if (ipif->ipif_flags & IPIF_UP)
10611 10609 ill->ill_logical_down = 1;
10612 10610 turn_on &= ~IPIF_UP;
10613 10611 }
10614 10612 err = ipif_down(ipif, q, mp);
10615 10613 ip1dbg(("ipif_down returns %d err ", err));
10616 10614 if (err == EINPROGRESS)
10617 10615 return (err);
10618 10616 (void) ipif_down_tail(ipif);
10619 10617 } else if ((turn_on|turn_off) & ILLF_NOACCEPT) {
10620 10618 /*
10621 10619 * If we can quiesce the ill, then continue. If not, then
10622 10620 * ip_sioctl_flags_tail() will be called from
10623 10621 * ipif_ill_refrele_tail().
10624 10622 */
10625 10623 ill_down_ipifs(ill, B_TRUE);
10626 10624
10627 10625 mutex_enter(&connp->conn_lock);
10628 10626 mutex_enter(&ill->ill_lock);
10629 10627 if (!ill_is_quiescent(ill)) {
10630 10628 boolean_t success;
10631 10629
10632 10630 success = ipsq_pending_mp_add(connp, ill->ill_ipif,
10633 10631 q, mp, ILL_DOWN);
10634 10632 mutex_exit(&ill->ill_lock);
10635 10633 mutex_exit(&connp->conn_lock);
10636 10634 return (success ? EINPROGRESS : EINTR);
10637 10635 }
10638 10636 mutex_exit(&ill->ill_lock);
10639 10637 mutex_exit(&connp->conn_lock);
10640 10638 }
10641 10639 return (ip_sioctl_flags_tail(ipif, flags, q, mp));
10642 10640 }
10643 10641
/*
 * Second half of a SIOC[S]LIFFLAGS operation: apply the already-validated
 * flag changes to the ipif/ill/phyint under the appropriate locks, then
 * perform the follow-up work the change requires (link-local regeneration,
 * IPMP active-set refresh, bringing ipifs back up, routing socket and SCTP
 * notification).  Called from ip_sioctl_flags() and
 * ip_sioctl_flags_restart().  Returns 0, EINPROGRESS, or an errno from
 * ill_up_ipifs()/ipif_up().
 */
static int
ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp)
{
	ill_t		*ill;
	phyint_t	*phyi;
	uint64_t	turn_on, turn_off;
	boolean_t	phyint_flags_modified = B_FALSE;
	int		err = 0;
	boolean_t	set_linklocal = B_FALSE;

	ip1dbg(("ip_sioctl_flags_tail(%s:%u)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id));

	ASSERT(IAM_WRITER_IPIF(ipif));

	ill = ipif->ipif_ill;
	phyi = ill->ill_phyint;

	/* Split the request into the bits to set and the bits to clear. */
	ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off);

	/*
	 * IFF_UP is handled separately.
	 */
	turn_on &= ~IFF_UP;
	turn_off &= ~IFF_UP;

	if ((turn_on|turn_off) & IFF_PHYINT_FLAGS)
		phyint_flags_modified = B_TRUE;

	/*
	 * Now we change the flags. Track current value of
	 * other flags in their respective places.
	 */
	mutex_enter(&ill->ill_lock);
	mutex_enter(&phyi->phyint_lock);
	ipif->ipif_flags |= (turn_on & IFF_LOGINT_FLAGS);
	ipif->ipif_flags &= (~turn_off & IFF_LOGINT_FLAGS);
	ill->ill_flags |= (turn_on & IFF_PHYINTINST_FLAGS);
	ill->ill_flags &= (~turn_off & IFF_PHYINTINST_FLAGS);
	phyi->phyint_flags |= (turn_on & IFF_PHYINT_FLAGS);
	phyi->phyint_flags &= (~turn_off & IFF_PHYINT_FLAGS);
	/* Latch and clear any pending request to reset the link-local. */
	if (ipif->ipif_state_flags & IPIF_SET_LINKLOCAL) {
		set_linklocal = B_TRUE;
		ipif->ipif_state_flags &= ~IPIF_SET_LINKLOCAL;
	}

	mutex_exit(&ill->ill_lock);
	mutex_exit(&phyi->phyint_lock);

	if (set_linklocal)
		(void) ipif_setlinklocal(ipif);

	/*
	 * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the same to
	 * the kernel: if any of them has been set by userland, the interface
	 * cannot be used for data traffic.
	 */
	if ((turn_on|turn_off) & (PHYI_FAILED | PHYI_INACTIVE | PHYI_OFFLINE)) {
		ASSERT(!IS_IPMP(ill));
		/*
		 * It's possible the ill is part of an "anonymous" IPMP group
		 * rather than a real group. In that case, there are no other
		 * interfaces in the group and thus no need for us to call
		 * ipmp_phyint_refresh_active().
		 */
		if (IS_UNDER_IPMP(ill))
			ipmp_phyint_refresh_active(phyi);
	}

	if ((turn_on|turn_off) & ILLF_NOACCEPT) {
		/*
		 * If the ILLF_NOACCEPT flag is changed, bring up all the
		 * ipifs that were brought down.
		 *
		 * The routing sockets messages are sent as the result
		 * of ill_up_ipifs(), further, SCTP's IPIF list was updated
		 * as well.
		 */
		err = ill_up_ipifs(ill, q, mp);
	} else if ((flags & IFF_UP) && !(ipif->ipif_flags & IPIF_UP)) {
		/*
		 * XXX ipif_up really does not know whether a phyint flags
		 * was modified or not. So, it sends up information on
		 * only one routing sockets message. As we don't bring up
		 * the interface and also set PHYI_ flags simultaneously
		 * it should be okay.
		 */
		err = ipif_up(ipif, q, mp);
	} else {
		/*
		 * Make sure routing socket sees all changes to the flags.
		 * ipif_up_done* handles this when we use ipif_up.
		 */
		if (phyint_flags_modified) {
			/* Notify for both IP versions on this phyint. */
			if (phyi->phyint_illv4 != NULL) {
				ip_rts_ifmsg(phyi->phyint_illv4->
				    ill_ipif, RTSQ_DEFAULT);
			}
			if (phyi->phyint_illv6 != NULL) {
				ip_rts_ifmsg(phyi->phyint_illv6->
				    ill_ipif, RTSQ_DEFAULT);
			}
		} else {
			ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
		}
		/*
		 * Update the flags in SCTP's IPIF list, ipif_up() will do
		 * this in need_up case.
		 */
		sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);
	}

	/* The default multicast interface might have changed */
	ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6);
	return (err);
}
10760 10758
10761 10759 /*
10762 10760 * Restart the flags operation now that the refcounts have dropped to zero.
10763 10761 */
10764 10762 /* ARGSUSED */
10765 10763 int
10766 10764 ip_sioctl_flags_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10767 10765 ip_ioctl_cmd_t *ipip, void *if_req)
10768 10766 {
10769 10767 uint64_t flags;
10770 10768 struct ifreq *ifr = if_req;
10771 10769 struct lifreq *lifr = if_req;
10772 10770 uint64_t turn_on, turn_off;
10773 10771
10774 10772 ip1dbg(("ip_sioctl_flags_restart(%s:%u %p)\n",
10775 10773 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10776 10774
10777 10775 if (ipip->ipi_cmd_type == IF_CMD) {
10778 10776 /* cast to uint16_t prevents unwanted sign extension */
10779 10777 flags = (uint16_t)ifr->ifr_flags;
10780 10778 } else {
10781 10779 flags = lifr->lifr_flags;
10782 10780 }
10783 10781
10784 10782 /*
10785 10783 * If this function call is a result of the ILLF_NOACCEPT flag
10786 10784 * change, do not call ipif_down_tail(). See ip_sioctl_flags().
10787 10785 */
10788 10786 ip_sioctl_flags_onoff(ipif, flags, &turn_on, &turn_off);
10789 10787 if (!((turn_on|turn_off) & ILLF_NOACCEPT))
10790 10788 (void) ipif_down_tail(ipif);
10791 10789
10792 10790 return (ip_sioctl_flags_tail(ipif, flags, q, mp));
10793 10791 }
10794 10792
10795 10793 /*
10796 10794 * Can operate on either a module or a driver queue.
10797 10795 */
10798 10796 /* ARGSUSED */
10799 10797 int
10800 10798 ip_sioctl_get_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10801 10799 ip_ioctl_cmd_t *ipip, void *if_req)
10802 10800 {
10803 10801 /*
10804 10802 * Has the flags been set correctly till now ?
10805 10803 */
10806 10804 ill_t *ill = ipif->ipif_ill;
10807 10805 phyint_t *phyi = ill->ill_phyint;
10808 10806
10809 10807 ip1dbg(("ip_sioctl_get_flags(%s:%u %p)\n",
10810 10808 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10811 10809 ASSERT((phyi->phyint_flags & ~(IFF_PHYINT_FLAGS)) == 0);
10812 10810 ASSERT((ill->ill_flags & ~(IFF_PHYINTINST_FLAGS)) == 0);
10813 10811 ASSERT((ipif->ipif_flags & ~(IFF_LOGINT_FLAGS)) == 0);
10814 10812
10815 10813 /*
10816 10814 * Need a lock since some flags can be set even when there are
10817 10815 * references to the ipif.
10818 10816 */
10819 10817 mutex_enter(&ill->ill_lock);
10820 10818 if (ipip->ipi_cmd_type == IF_CMD) {
10821 10819 struct ifreq *ifr = (struct ifreq *)if_req;
10822 10820
10823 10821 /* Get interface flags (low 16 only). */
10824 10822 ifr->ifr_flags = ((ipif->ipif_flags |
10825 10823 ill->ill_flags | phyi->phyint_flags) & 0xffff);
10826 10824 } else {
10827 10825 struct lifreq *lifr = (struct lifreq *)if_req;
10828 10826
10829 10827 /* Get interface flags. */
10830 10828 lifr->lifr_flags = ipif->ipif_flags |
10831 10829 ill->ill_flags | phyi->phyint_flags;
10832 10830 }
10833 10831 mutex_exit(&ill->ill_lock);
10834 10832 return (0);
10835 10833 }
10836 10834
10837 10835 /*
10838 10836 * We allow the MTU to be set on an ILL, but not have it be different
10839 10837 * for different IPIFs since we don't actually send packets on IPIFs.
10840 10838 */
/* ARGSUSED */
int
ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	int mtu;
	int ip_min_mtu;
	struct ifreq *ifr;
	struct lifreq *lifr;
	ill_t *ill;

	ip1dbg(("ip_sioctl_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name,
	    ipif->ipif_id, (void *)ipif));
	/* Extract the requested MTU from the old- or new-style request. */
	if (ipip->ipi_cmd_type == IF_CMD) {
		ifr = (struct ifreq *)if_req;
		mtu = ifr->ifr_metric;
	} else {
		lifr = (struct lifreq *)if_req;
		mtu = lifr->lifr_mtu;
	}
	/* Only allow for logical unit zero i.e. not on "bge0:17" */
	if (ipif->ipif_id != 0)
		return (EINVAL);

	ill = ipif->ipif_ill;
	/* The floor depends on the address family of the interface. */
	if (ipif->ipif_isv6)
		ip_min_mtu = IPV6_MIN_MTU;
	else
		ip_min_mtu = IP_MIN_MTU;

	mutex_enter(&ill->ill_lock);
	/* Reject MTUs above what the link can carry or below the minimum. */
	if (mtu > ill->ill_max_frag || mtu < ip_min_mtu) {
		mutex_exit(&ill->ill_lock);
		return (EINVAL);
	}
	/* Avoid increasing ill_mc_mtu */
	if (ill->ill_mc_mtu > mtu)
		ill->ill_mc_mtu = mtu;

	/*
	 * The dce and fragmentation code can handle changes to ill_mtu
	 * concurrent with sending/fragmenting packets.
	 */
	ill->ill_mtu = mtu;

	/*
	 * ILLF_FIXEDMTU records that the MTU was administratively set,
	 * so later link updates do not override it.
	 */
	ill->ill_flags |= ILLF_FIXEDMTU;
	mutex_exit(&ill->ill_lock);

	/*
	 * Make sure all dce_generation checks find out
	 * that ill_mtu/ill_mc_mtu has changed.
	 */
	dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst);

	/*
	 * Refresh IPMP meta-interface MTU if necessary.
	 */
	if (IS_UNDER_IPMP(ill))
		ipmp_illgrp_refresh_mtu(ill->ill_grp);

	/* Update the MTU in SCTP's list */
	sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);
	return (0);
}
10904 10902
10905 10903 /* Get interface MTU. */
10906 10904 /* ARGSUSED */
10907 10905 int
10908 10906 ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10909 10907 ip_ioctl_cmd_t *ipip, void *if_req)
10910 10908 {
10911 10909 struct ifreq *ifr;
10912 10910 struct lifreq *lifr;
10913 10911
10914 10912 ip1dbg(("ip_sioctl_get_mtu(%s:%u %p)\n",
10915 10913 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10916 10914
10917 10915 /*
10918 10916 * We allow a get on any logical interface even though the set
10919 10917 * can only be done on logical unit 0.
10920 10918 */
10921 10919 if (ipip->ipi_cmd_type == IF_CMD) {
10922 10920 ifr = (struct ifreq *)if_req;
10923 10921 ifr->ifr_metric = ipif->ipif_ill->ill_mtu;
10924 10922 } else {
10925 10923 lifr = (struct lifreq *)if_req;
10926 10924 lifr->lifr_mtu = ipif->ipif_ill->ill_mtu;
10927 10925 }
10928 10926 return (0);
10929 10927 }
10930 10928
/* Set interface broadcast address. */
/* ARGSUSED2 */
int
ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	ipaddr_t addr;
	ire_t	*ire;
	ill_t	*ill = ipif->ipif_ill;
	ip_stack_t *ipst = ill->ill_ipst;

	ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ill->ill_name,
	    ipif->ipif_id));

	ASSERT(IAM_WRITER_IPIF(ipif));
	/* Only broadcast-capable interfaces carry a broadcast address. */
	if (!(ipif->ipif_flags & IPIF_BROADCAST))
		return (EADDRNOTAVAIL);

	ASSERT(!(ipif->ipif_isv6));	/* No IPv6 broadcast */

	if (sin->sin_family != AF_INET)
		return (EAFNOSUPPORT);

	addr = sin->sin_addr.s_addr;

	if (ipif->ipif_flags & IPIF_UP) {
		/*
		 * If we are already up, make sure the new
		 * broadcast address makes sense. If it does,
		 * there should be an IRE for it already.
		 */
		ire = ire_ftable_lookup_v4(addr, 0, 0, IRE_BROADCAST,
		    ill, ipif->ipif_zoneid, NULL,
		    (MATCH_IRE_ILL | MATCH_IRE_TYPE), 0, ipst, NULL);
		if (ire == NULL) {
			return (EINVAL);
		} else {
			/* Only needed the existence check; drop the hold. */
			ire_refrele(ire);
		}
	}
	/*
	 * Changing the broadcast addr for this ipif. Since the IRE_BROADCAST
	 * needs to already exist we never need to change the set of
	 * IRE_BROADCASTs when we are UP.
	 */
	if (addr != ipif->ipif_brd_addr)
		IN6_IPADDR_TO_V4MAPPED(addr, &ipif->ipif_v6brd_addr);

	return (0);
}
10981 10979
10982 10980 /* Get interface broadcast address. */
10983 10981 /* ARGSUSED */
10984 10982 int
10985 10983 ip_sioctl_get_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
10986 10984 ip_ioctl_cmd_t *ipip, void *if_req)
10987 10985 {
10988 10986 ip1dbg(("ip_sioctl_get_brdaddr(%s:%u %p)\n",
10989 10987 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
10990 10988 if (!(ipif->ipif_flags & IPIF_BROADCAST))
10991 10989 return (EADDRNOTAVAIL);
10992 10990
10993 10991 /* IPIF_BROADCAST not possible with IPv6 */
10994 10992 ASSERT(!ipif->ipif_isv6);
10995 10993 *sin = sin_null;
10996 10994 sin->sin_family = AF_INET;
10997 10995 sin->sin_addr.s_addr = ipif->ipif_brd_addr;
10998 10996 return (0);
10999 10997 }
11000 10998
/*
 * This routine is called to handle the SIOCS*IFNETMASK IOCTL.
 */
/* ARGSUSED */
int
ip_sioctl_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	int	err = 0;
	in6_addr_t v6mask;

	ip1dbg(("ip_sioctl_netmask(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

	ASSERT(IAM_WRITER_IPIF(ipif));

	/* Normalize the requested mask into V6 form (V4 gets mapped). */
	if (ipif->ipif_isv6) {
		sin6_t *sin6;

		if (sin->sin_family != AF_INET6)
			return (EAFNOSUPPORT);

		sin6 = (sin6_t *)sin;
		v6mask = sin6->sin6_addr;
	} else {
		ipaddr_t mask;

		if (sin->sin_family != AF_INET)
			return (EAFNOSUPPORT);

		mask = sin->sin_addr.s_addr;
		/* Only contiguous (prefix-style) V4 masks are supported. */
		if (!ip_contiguous_mask(ntohl(mask)))
			return (ENOTSUP);
		V4MASK_TO_V6(mask, v6mask);
	}

	/*
	 * No big deal if the interface isn't already up, or the mask
	 * isn't really changing, or this is pt-pt.
	 */
	if (!(ipif->ipif_flags & IPIF_UP) ||
	    IN6_ARE_ADDR_EQUAL(&v6mask, &ipif->ipif_v6net_mask) ||
	    (ipif->ipif_flags & IPIF_POINTOPOINT)) {
		ipif->ipif_v6net_mask = v6mask;
		if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
			/* Recompute the subnet prefix from addr & mask. */
			V6_MASK_COPY(ipif->ipif_v6lcl_addr,
			    ipif->ipif_v6net_mask,
			    ipif->ipif_v6subnet);
		}
		return (0);
	}
	/*
	 * Make sure we have valid net and subnet broadcast ire's
	 * for the old netmask, if needed by other logical interfaces.
	 */
	err = ipif_logical_down(ipif, q, mp);
	if (err == EINPROGRESS)
		return (err);
	/* Finish the down synchronously, then apply and bring back up. */
	(void) ipif_down_tail(ipif);
	err = ip_sioctl_netmask_tail(ipif, sin, q, mp);
	return (err);
}
11063 11061
/*
 * Apply the already-validated netmask to the ipif and bring it back up.
 * Called after the logical down performed by ip_sioctl_netmask() or on
 * restart from ip_sioctl_netmask_restart().
 */
static int
ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp)
{
	in6_addr_t v6mask;
	int err = 0;

	ip1dbg(("ip_sioctl_netmask_tail(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

	/* Normalize into V6 form; families were validated by the caller. */
	if (ipif->ipif_isv6) {
		sin6_t *sin6;

		sin6 = (sin6_t *)sin;
		v6mask = sin6->sin6_addr;
	} else {
		ipaddr_t mask;

		mask = sin->sin_addr.s_addr;
		V4MASK_TO_V6(mask, v6mask);
	}

	ipif->ipif_v6net_mask = v6mask;
	if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
		/* Recompute the subnet prefix from the new mask. */
		V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
		    ipif->ipif_v6subnet);
	}
	err = ipif_up(ipif, q, mp);

	if (err == 0 || err == EINPROGRESS) {
		/*
		 * The interface must be DL_BOUND if this packet has to
		 * go out on the wire. Since we only go through a logical
		 * down and are bound with the driver during an internal
		 * down/up that is satisfied.
		 */
		if (!ipif->ipif_isv6 && ipif->ipif_ill->ill_wq != NULL) {
			/* Potentially broadcast an address mask reply. */
			ipif_mask_reply(ipif);
		}
	}
	return (err);
}
11106 11104
/*
 * Restart a netmask change after the ipif has quiesced: finish the
 * deferred logical down, then apply the new mask and bring the ipif
 * back up via ip_sioctl_netmask_tail().
 */
/* ARGSUSED */
int
ip_sioctl_netmask_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	ip1dbg(("ip_sioctl_netmask_restart(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	(void) ipif_down_tail(ipif);
	return (ip_sioctl_netmask_tail(ipif, sin, q, mp));
}
11117 11115
11118 11116 /* Get interface net mask. */
11119 11117 /* ARGSUSED */
11120 11118 int
11121 11119 ip_sioctl_get_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11122 11120 ip_ioctl_cmd_t *ipip, void *if_req)
11123 11121 {
11124 11122 struct lifreq *lifr = (struct lifreq *)if_req;
11125 11123 struct sockaddr_in6 *sin6 = (sin6_t *)sin;
11126 11124
11127 11125 ip1dbg(("ip_sioctl_get_netmask(%s:%u %p)\n",
11128 11126 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11129 11127
11130 11128 /*
11131 11129 * net mask can't change since we have a reference to the ipif.
11132 11130 */
11133 11131 if (ipif->ipif_isv6) {
11134 11132 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
11135 11133 *sin6 = sin6_null;
11136 11134 sin6->sin6_family = AF_INET6;
11137 11135 sin6->sin6_addr = ipif->ipif_v6net_mask;
11138 11136 lifr->lifr_addrlen =
11139 11137 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
11140 11138 } else {
11141 11139 *sin = sin_null;
11142 11140 sin->sin_family = AF_INET;
11143 11141 sin->sin_addr.s_addr = ipif->ipif_net_mask;
11144 11142 if (ipip->ipi_cmd_type == LIF_CMD) {
11145 11143 lifr->lifr_addrlen =
11146 11144 ip_mask_to_plen(ipif->ipif_net_mask);
11147 11145 }
11148 11146 }
11149 11147 return (0);
11150 11148 }
11151 11149
11152 11150 /* ARGSUSED */
11153 11151 int
11154 11152 ip_sioctl_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11155 11153 ip_ioctl_cmd_t *ipip, void *if_req)
11156 11154 {
11157 11155 ip1dbg(("ip_sioctl_metric(%s:%u %p)\n",
11158 11156 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11159 11157
11160 11158 /*
11161 11159 * Since no applications should ever be setting metrics on underlying
11162 11160 * interfaces, we explicitly fail to smoke 'em out.
11163 11161 */
11164 11162 if (IS_UNDER_IPMP(ipif->ipif_ill))
11165 11163 return (EINVAL);
11166 11164
11167 11165 /*
11168 11166 * Set interface metric. We don't use this for
11169 11167 * anything but we keep track of it in case it is
11170 11168 * important to routing applications or such.
11171 11169 */
11172 11170 if (ipip->ipi_cmd_type == IF_CMD) {
11173 11171 struct ifreq *ifr;
11174 11172
11175 11173 ifr = (struct ifreq *)if_req;
11176 11174 ipif->ipif_ill->ill_metric = ifr->ifr_metric;
11177 11175 } else {
11178 11176 struct lifreq *lifr;
11179 11177
11180 11178 lifr = (struct lifreq *)if_req;
11181 11179 ipif->ipif_ill->ill_metric = lifr->lifr_metric;
11182 11180 }
11183 11181 return (0);
11184 11182 }
11185 11183
11186 11184 /* ARGSUSED */
11187 11185 int
11188 11186 ip_sioctl_get_metric(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11189 11187 ip_ioctl_cmd_t *ipip, void *if_req)
11190 11188 {
11191 11189 /* Get interface metric. */
11192 11190 ip1dbg(("ip_sioctl_get_metric(%s:%u %p)\n",
11193 11191 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11194 11192
11195 11193 if (ipip->ipi_cmd_type == IF_CMD) {
11196 11194 struct ifreq *ifr;
11197 11195
11198 11196 ifr = (struct ifreq *)if_req;
11199 11197 ifr->ifr_metric = ipif->ipif_ill->ill_metric;
11200 11198 } else {
11201 11199 struct lifreq *lifr;
11202 11200
11203 11201 lifr = (struct lifreq *)if_req;
11204 11202 lifr->lifr_metric = ipif->ipif_ill->ill_metric;
11205 11203 }
11206 11204
11207 11205 return (0);
11208 11206 }
11209 11207
11210 11208 /* ARGSUSED */
11211 11209 int
11212 11210 ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11213 11211 ip_ioctl_cmd_t *ipip, void *if_req)
11214 11212 {
11215 11213 int arp_muxid;
11216 11214
11217 11215 ip1dbg(("ip_sioctl_muxid(%s:%u %p)\n",
11218 11216 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11219 11217 /*
11220 11218 * Set the muxid returned from I_PLINK.
11221 11219 */
11222 11220 if (ipip->ipi_cmd_type == IF_CMD) {
11223 11221 struct ifreq *ifr = (struct ifreq *)if_req;
11224 11222
11225 11223 ipif->ipif_ill->ill_muxid = ifr->ifr_ip_muxid;
11226 11224 arp_muxid = ifr->ifr_arp_muxid;
11227 11225 } else {
11228 11226 struct lifreq *lifr = (struct lifreq *)if_req;
11229 11227
11230 11228 ipif->ipif_ill->ill_muxid = lifr->lifr_ip_muxid;
11231 11229 arp_muxid = lifr->lifr_arp_muxid;
11232 11230 }
11233 11231 arl_set_muxid(ipif->ipif_ill, arp_muxid);
11234 11232 return (0);
11235 11233 }
11236 11234
11237 11235 /* ARGSUSED */
11238 11236 int
11239 11237 ip_sioctl_get_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11240 11238 ip_ioctl_cmd_t *ipip, void *if_req)
11241 11239 {
11242 11240 int arp_muxid = 0;
11243 11241
11244 11242 ip1dbg(("ip_sioctl_get_muxid(%s:%u %p)\n",
11245 11243 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11246 11244 /*
11247 11245 * Get the muxid saved in ill for I_PUNLINK.
11248 11246 */
11249 11247 arp_muxid = arl_get_muxid(ipif->ipif_ill);
11250 11248 if (ipip->ipi_cmd_type == IF_CMD) {
11251 11249 struct ifreq *ifr = (struct ifreq *)if_req;
11252 11250
11253 11251 ifr->ifr_ip_muxid = ipif->ipif_ill->ill_muxid;
11254 11252 ifr->ifr_arp_muxid = arp_muxid;
11255 11253 } else {
11256 11254 struct lifreq *lifr = (struct lifreq *)if_req;
11257 11255
11258 11256 lifr->lifr_ip_muxid = ipif->ipif_ill->ill_muxid;
11259 11257 lifr->lifr_arp_muxid = arp_muxid;
11260 11258 }
11261 11259 return (0);
11262 11260 }
11263 11261
/*
 * Set the subnet prefix. Does not modify the broadcast address.
 */
/* ARGSUSED */
int
ip_sioctl_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	int err = 0;
	in6_addr_t v6addr;
	in6_addr_t v6mask;
	boolean_t need_up = B_FALSE;	/* must the ipif be brought back up? */
	int addrlen;

	ip1dbg(("ip_sioctl_subnet(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

	ASSERT(IAM_WRITER_IPIF(ipif));
	addrlen = ((struct lifreq *)if_req)->lifr_addrlen;

	/* Validate the prefix and normalize it to V6 form. */
	if (ipif->ipif_isv6) {
		sin6_t *sin6;

		if (sin->sin_family != AF_INET6)
			return (EAFNOSUPPORT);

		sin6 = (sin6_t *)sin;
		v6addr = sin6->sin6_addr;
		if (!ip_remote_addr_ok_v6(&v6addr, &ipv6_all_ones))
			return (EADDRNOTAVAIL);
	} else {
		ipaddr_t addr;

		if (sin->sin_family != AF_INET)
			return (EAFNOSUPPORT);

		addr = sin->sin_addr.s_addr;
		if (!ip_addr_ok_v4(addr, 0xFFFFFFFF))
			return (EADDRNOTAVAIL);
		IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
		/* Add 96 bits */
		addrlen += IPV6_ABITS - IP_ABITS;
	}

	if (ip_plen_to_mask_v6(addrlen, &v6mask) == NULL)
		return (EINVAL);

	/* Check if bits in the address is set past the mask */
	if (!V6_MASK_EQ(v6addr, v6mask, v6addr))
		return (EINVAL);

	if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6subnet, &v6addr) &&
	    IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6net_mask, &v6mask))
		return (0);	/* No change */

	if (ipif->ipif_flags & IPIF_UP) {
		/*
		 * If the interface is already marked up,
		 * we call ipif_down which will take care
		 * of ditching any IREs that have been set
		 * up based on the old interface address.
		 */
		err = ipif_logical_down(ipif, q, mp);
		if (err == EINPROGRESS)
			return (err);
		(void) ipif_down_tail(ipif);
		need_up = B_TRUE;
	}

	err = ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, need_up);
	return (err);
}
11336 11334
/*
 * Install the new subnet prefix and mask on the ipif, then bring the
 * ipif back up if the caller had taken it down first.
 */
static int
ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t v6addr, in6_addr_t v6mask,
    queue_t *q, mblk_t *mp, boolean_t need_up)
{
	ill_t	*ill = ipif->ipif_ill;
	int	err = 0;

	ip1dbg(("ip_sioctl_subnet_tail(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

	/* Set the new address. */
	mutex_enter(&ill->ill_lock);
	ipif->ipif_v6net_mask = v6mask;
	if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
		/* Derive the subnet from the supplied prefix and mask. */
		V6_MASK_COPY(v6addr, ipif->ipif_v6net_mask,
		    ipif->ipif_v6subnet);
	}
	mutex_exit(&ill->ill_lock);

	if (need_up) {
		/*
		 * Now bring the interface back up. If this
		 * is the only IPIF for the ILL, ipif_up
		 * will have to re-bind to the device, so
		 * we may get back EINPROGRESS, in which
		 * case, this IOCTL will get completed in
		 * ip_rput_dlpi when we see the DL_BIND_ACK.
		 */
		err = ipif_up(ipif, q, mp);
		if (err == EINPROGRESS)
			return (err);
	}
	return (err);
}
11371 11369
11372 11370 /* ARGSUSED */
11373 11371 int
11374 11372 ip_sioctl_subnet_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11375 11373 ip_ioctl_cmd_t *ipip, void *if_req)
11376 11374 {
11377 11375 int addrlen;
11378 11376 in6_addr_t v6addr;
11379 11377 in6_addr_t v6mask;
11380 11378 struct lifreq *lifr = (struct lifreq *)if_req;
11381 11379
11382 11380 ip1dbg(("ip_sioctl_subnet_restart(%s:%u %p)\n",
11383 11381 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11384 11382 (void) ipif_down_tail(ipif);
11385 11383
11386 11384 addrlen = lifr->lifr_addrlen;
11387 11385 if (ipif->ipif_isv6) {
11388 11386 sin6_t *sin6;
11389 11387
11390 11388 sin6 = (sin6_t *)sin;
11391 11389 v6addr = sin6->sin6_addr;
11392 11390 } else {
11393 11391 ipaddr_t addr;
11394 11392
11395 11393 addr = sin->sin_addr.s_addr;
11396 11394 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
11397 11395 addrlen += IPV6_ABITS - IP_ABITS;
11398 11396 }
11399 11397 (void) ip_plen_to_mask_v6(addrlen, &v6mask);
11400 11398
11401 11399 return (ip_sioctl_subnet_tail(ipif, v6addr, v6mask, q, mp, B_TRUE));
11402 11400 }
11403 11401
11404 11402 /* ARGSUSED */
11405 11403 int
11406 11404 ip_sioctl_get_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11407 11405 ip_ioctl_cmd_t *ipip, void *if_req)
11408 11406 {
11409 11407 struct lifreq *lifr = (struct lifreq *)if_req;
11410 11408 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sin;
11411 11409
11412 11410 ip1dbg(("ip_sioctl_get_subnet(%s:%u %p)\n",
11413 11411 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11414 11412 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
11415 11413
11416 11414 if (ipif->ipif_isv6) {
11417 11415 *sin6 = sin6_null;
11418 11416 sin6->sin6_family = AF_INET6;
11419 11417 sin6->sin6_addr = ipif->ipif_v6subnet;
11420 11418 lifr->lifr_addrlen =
11421 11419 ip_mask_to_plen_v6(&ipif->ipif_v6net_mask);
11422 11420 } else {
11423 11421 *sin = sin_null;
11424 11422 sin->sin_family = AF_INET;
11425 11423 sin->sin_addr.s_addr = ipif->ipif_subnet;
11426 11424 lifr->lifr_addrlen = ip_mask_to_plen(ipif->ipif_net_mask);
11427 11425 }
11428 11426 return (0);
11429 11427 }
11430 11428
/*
 * Set the IPv6 address token.
 */
/* ARGSUSED */
int
ip_sioctl_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipi, void *if_req)
{
	ill_t *ill = ipif->ipif_ill;
	int err;
	in6_addr_t v6addr;
	in6_addr_t v6mask;
	boolean_t need_up = B_FALSE;	/* must the ipif be brought back up? */
	int i;
	sin6_t *sin6 = (sin6_t *)sin;
	struct lifreq *lifr = (struct lifreq *)if_req;
	int addrlen;

	ip1dbg(("ip_sioctl_token(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	ASSERT(IAM_WRITER_IPIF(ipif));

	addrlen = lifr->lifr_addrlen;
	/* Only allow for logical unit zero i.e. not on "le0:17" */
	if (ipif->ipif_id != 0)
		return (EINVAL);

	/* Tokens only apply to IPv6 interfaces. */
	if (!ipif->ipif_isv6)
		return (EINVAL);

	if (addrlen > IPV6_ABITS)
		return (EINVAL);

	v6addr = sin6->sin6_addr;

	/*
	 * The length of the token is the length from the end. To get
	 * the proper mask for this, compute the mask of the bits not
	 * in the token; ie. the prefix, and then xor to get the mask.
	 */
	if (ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask) == NULL)
		return (EINVAL);
	for (i = 0; i < 4; i++) {
		v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff;
	}

	/* Nothing to do if neither the token bits nor its length change. */
	if (V6_MASK_EQ(v6addr, v6mask, ill->ill_token) &&
	    ill->ill_token_length == addrlen)
		return (0);	/* No change */

	if (ipif->ipif_flags & IPIF_UP) {
		/* Take the ipif down so dependent state can be rebuilt. */
		err = ipif_logical_down(ipif, q, mp);
		if (err == EINPROGRESS)
			return (err);
		(void) ipif_down_tail(ipif);
		need_up = B_TRUE;
	}
	err = ip_sioctl_token_tail(ipif, sin6, addrlen, q, mp, need_up);
	return (err);
}
11491 11489
/*
 * Install the new token on the ill, regenerate the link-local address
 * from it, and bring the ipif back up if the caller took it down.
 */
static int
ip_sioctl_token_tail(ipif_t *ipif, sin6_t *sin6, int addrlen, queue_t *q,
    mblk_t *mp, boolean_t need_up)
{
	in6_addr_t v6addr;
	in6_addr_t v6mask;
	ill_t	*ill = ipif->ipif_ill;
	int	i;
	int	err = 0;

	ip1dbg(("ip_sioctl_token_tail(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	v6addr = sin6->sin6_addr;
	/*
	 * The length of the token is the length from the end. To get
	 * the proper mask for this, compute the mask of the bits not
	 * in the token; ie. the prefix, and then xor to get the mask.
	 */
	(void) ip_plen_to_mask_v6(IPV6_ABITS - addrlen, &v6mask);
	for (i = 0; i < 4; i++)
		v6mask.s6_addr32[i] ^= (uint32_t)0xffffffff;

	mutex_enter(&ill->ill_lock);
	V6_MASK_COPY(v6addr, v6mask, ill->ill_token);
	ill->ill_token_length = addrlen;
	/* Remember the token was set manually rather than derived. */
	ill->ill_manual_token = 1;

	/* Reconfigure the link-local address based on this new token */
	ipif_setlinklocal(ill->ill_ipif);

	mutex_exit(&ill->ill_lock);

	if (need_up) {
		/*
		 * Now bring the interface back up. If this
		 * is the only IPIF for the ILL, ipif_up
		 * will have to re-bind to the device, so
		 * we may get back EINPROGRESS, in which
		 * case, this IOCTL will get completed in
		 * ip_rput_dlpi when we see the DL_BIND_ACK.
		 */
		err = ipif_up(ipif, q, mp);
		if (err == EINPROGRESS)
			return (err);
	}
	return (err);
}
11539 11537
11540 11538 /* ARGSUSED */
11541 11539 int
11542 11540 ip_sioctl_get_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11543 11541 ip_ioctl_cmd_t *ipi, void *if_req)
11544 11542 {
11545 11543 ill_t *ill;
11546 11544 sin6_t *sin6 = (sin6_t *)sin;
11547 11545 struct lifreq *lifr = (struct lifreq *)if_req;
11548 11546
11549 11547 ip1dbg(("ip_sioctl_get_token(%s:%u %p)\n",
11550 11548 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11551 11549 if (ipif->ipif_id != 0)
11552 11550 return (EINVAL);
11553 11551
11554 11552 ill = ipif->ipif_ill;
11555 11553 if (!ill->ill_isv6)
11556 11554 return (ENXIO);
11557 11555
11558 11556 *sin6 = sin6_null;
11559 11557 sin6->sin6_family = AF_INET6;
11560 11558 ASSERT(!IN6_IS_ADDR_V4MAPPED(&ill->ill_token));
11561 11559 sin6->sin6_addr = ill->ill_token;
11562 11560 lifr->lifr_addrlen = ill->ill_token_length;
11563 11561 return (0);
11564 11562 }
11565 11563
/*
 * Set (hardware) link specific information that might override
 * what was acquired through the DL_INFO_ACK.
 * Validates all supplied values first (zero means "leave unchanged"),
 * then applies them under ill_lock.  Returns 0 or EINVAL.
 */
/* ARGSUSED */
int
ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipi, void *if_req)
{
	ill_t *ill = ipif->ipif_ill;
	int ip_min_mtu;
	struct lifreq *lifr = (struct lifreq *)if_req;
	lif_ifinfo_req_t *lir;

	ip1dbg(("ip_sioctl_lnkinfo(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
	lir = &lifr->lifr_ifinfo;
	ASSERT(IAM_WRITER_IPIF(ipif));

	/* Only allow for logical unit zero i.e. not on "bge0:17" */
	if (ipif->ipif_id != 0)
		return (EINVAL);

	/* Set interface MTU. */
	if (ipif->ipif_isv6)
		ip_min_mtu = IPV6_MIN_MTU;
	else
		ip_min_mtu = IP_MIN_MTU;

	/*
	 * Verify values before we set anything. Allow zero to
	 * mean unspecified.
	 *
	 * XXX We should be able to set the user-defined lir_mtu to some value
	 * that is greater than ill_current_frag but less than ill_max_frag- the
	 * ill_max_frag value tells us the max MTU that can be handled by the
	 * datalink, whereas the ill_current_frag is dynamically computed for
	 * some link-types like tunnels, based on the tunnel PMTU. However,
	 * since there is currently no way of distinguishing between
	 * administratively fixed link mtu values (e.g., those set via
	 * /sbin/dladm) and dynamically discovered MTUs (e.g., those discovered
	 * for tunnels) we conservatively choose the ill_current_frag as the
	 * upper-bound.
	 */
	if (lir->lir_maxmtu != 0 &&
	    (lir->lir_maxmtu > ill->ill_current_frag ||
	    lir->lir_maxmtu < ip_min_mtu))
		return (EINVAL);
	if (lir->lir_reachtime != 0 &&
	    lir->lir_reachtime > ND_MAX_REACHTIME)
		return (EINVAL);
	if (lir->lir_reachretrans != 0 &&
	    lir->lir_reachretrans > ND_MAX_REACHRETRANSTIME)
		return (EINVAL);

	mutex_enter(&ill->ill_lock);
	/*
	 * The dce and fragmentation code can handle changes to ill_mtu
	 * concurrent with sending/fragmenting packets.
	 */
	if (lir->lir_maxmtu != 0)
		ill->ill_user_mtu = lir->lir_maxmtu;

	if (lir->lir_reachtime != 0)
		ill->ill_reachable_time = lir->lir_reachtime;

	if (lir->lir_reachretrans != 0)
		ill->ill_reachable_retrans_time = lir->lir_reachretrans;

	ill->ill_max_hops = lir->lir_maxhops;
	ill->ill_max_buf = ND_MAX_Q;
	if (!(ill->ill_flags & ILLF_FIXEDMTU) && ill->ill_user_mtu != 0) {
		/*
		 * ill_mtu is the actual interface MTU, obtained as the min
		 * of user-configured mtu and the value announced by the
		 * driver (via DL_NOTE_SDU_SIZE/DL_INFO_ACK). Note that since
		 * we have already made the choice of requiring
		 * ill_user_mtu < ill_current_frag by the time we get here,
		 * the ill_mtu effectively gets assigned to the ill_user_mtu
		 * here.
		 */
		ill->ill_mtu = MIN(ill->ill_current_frag, ill->ill_user_mtu);
		ill->ill_mc_mtu = MIN(ill->ill_mc_mtu, ill->ill_user_mtu);
	}
	mutex_exit(&ill->ill_lock);

	/*
	 * Make sure all dce_generation checks find out
	 * that ill_mtu/ill_mc_mtu has changed.
	 */
	if (!(ill->ill_flags & ILLF_FIXEDMTU) && (lir->lir_maxmtu != 0))
		dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst);

	/*
	 * Refresh IPMP meta-interface MTU if necessary.
	 */
	if (IS_UNDER_IPMP(ill))
		ipmp_illgrp_refresh_mtu(ill->ill_grp);

	return (0);
}
11667 11665
11668 11666 /* ARGSUSED */
11669 11667 int
11670 11668 ip_sioctl_get_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
11671 11669 ip_ioctl_cmd_t *ipi, void *if_req)
11672 11670 {
11673 11671 struct lif_ifinfo_req *lir;
11674 11672 ill_t *ill = ipif->ipif_ill;
11675 11673
11676 11674 ip1dbg(("ip_sioctl_get_lnkinfo(%s:%u %p)\n",
11677 11675 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
11678 11676 if (ipif->ipif_id != 0)
11679 11677 return (EINVAL);
11680 11678
11681 11679 lir = &((struct lifreq *)if_req)->lifr_ifinfo;
11682 11680 lir->lir_maxhops = ill->ill_max_hops;
11683 11681 lir->lir_reachtime = ill->ill_reachable_time;
11684 11682 lir->lir_reachretrans = ill->ill_reachable_retrans_time;
11685 11683 lir->lir_maxmtu = ill->ill_mtu;
11686 11684
11687 11685 return (0);
11688 11686 }
11689 11687
/*
 * Return best guess as to the subnet mask for the specified address.
 * Based on the subnet masks for all the configured interfaces.
 *
 * We end up returning a zero mask in the case of default, multicast or
 * experimental.
 *
 * When a matching ipif is found it is returned through *ipifp with a
 * reference held (via ipif_refhold_locked); the caller must ipif_refrele
 * it.  Otherwise *ipifp is set to NULL.
 */
static ipaddr_t
ip_subnet_mask(ipaddr_t addr, ipif_t **ipifp, ip_stack_t *ipst)
{
	ipaddr_t net_mask;
	ill_t *ill;
	ipif_t *ipif;
	ill_walk_context_t ctx;
	ipif_t *fallback_ipif = NULL;

	net_mask = ip_net_mask(addr);
	if (net_mask == 0) {
		*ipifp = NULL;
		return (0);
	}

	/* Let's check to see if this is maybe a local subnet route. */
	/* this function only applies to IPv4 interfaces */
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V4(&ctx, ipst);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		mutex_enter(&ill->ill_lock);
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if (IPIF_IS_CONDEMNED(ipif))
				continue;
			if (!(ipif->ipif_flags & IPIF_UP))
				continue;
			if ((ipif->ipif_subnet & net_mask) ==
			    (addr & net_mask)) {
				/*
				 * Don't trust pt-pt interfaces if there are
				 * other interfaces.
				 */
				if (ipif->ipif_flags & IPIF_POINTOPOINT) {
					if (fallback_ipif == NULL) {
						ipif_refhold_locked(ipif);
						fallback_ipif = ipif;
					}
					continue;
				}

				/*
				 * Fine. Just assume the same net mask as the
				 * directly attached subnet interface is using.
				 */
				ipif_refhold_locked(ipif);
				mutex_exit(&ill->ill_lock);
				rw_exit(&ipst->ips_ill_g_lock);
				/* Drop the hold on any earlier pt-pt match. */
				if (fallback_ipif != NULL)
					ipif_refrele(fallback_ipif);
				*ipifp = ipif;
				return (ipif->ipif_net_mask);
			}
		}
		mutex_exit(&ill->ill_lock);
	}
	rw_exit(&ipst->ips_ill_g_lock);

	/* No direct match; fall back to a pt-pt match or the class mask. */
	*ipifp = fallback_ipif;
	return ((fallback_ipif != NULL) ?
	    fallback_ipif->ipif_net_mask : net_mask);
}
11759 11757
/*
 * ip_sioctl_copyin_setup calls ip_wput_ioctl to process the IP_IOCTL ioctl.
 */
static void
ip_wput_ioctl(queue_t *q, mblk_t *mp)
{
	IOCP iocp;
	ipft_t *ipft;
	ipllc_t *ipllc;
	mblk_t *mp1;
	cred_t *cr;
	int error = 0;
	conn_t *connp;

	ip1dbg(("ip_wput_ioctl"));
	iocp = (IOCP)mp->b_rptr;
	mp1 = mp->b_cont;
	if (mp1 == NULL) {
		/* No payload: NAK the ioctl immediately. */
		iocp->ioc_error = EINVAL;
		mp->b_datap->db_type = M_IOCNAK;
		iocp->ioc_count = 0;
		qreply(q, mp);
		return;
	}

	/*
	 * These IOCTLs provide various control capabilities to
	 * upstream agents such as ULPs and processes. There
	 * are currently two such IOCTLs implemented. They
	 * are used by TCP to provide update information for
	 * existing IREs and to forcibly delete an IRE for a
	 * host that is not responding, thereby forcing an
	 * attempt at a new route.
	 */
	iocp->ioc_error = EINVAL;
	if (!pullupmsg(mp1, sizeof (ipllc->ipllc_cmd)))
		goto done;

	/*
	 * Look the command up in the dispatch table; if nothing matches,
	 * ipft is left pointing at the terminating entry whose ipft_pfi
	 * is NULL, which the checks below treat as "no handler".
	 */
	ipllc = (ipllc_t *)mp1->b_rptr;
	for (ipft = ip_ioctl_ftbl; ipft->ipft_pfi; ipft++) {
		if (ipllc->ipllc_cmd == ipft->ipft_cmd)
			break;
	}
	/*
	 * prefer credential from mblk over ioctl;
	 * see ip_sioctl_copyin_setup
	 */
	cr = msg_getcred(mp, NULL);
	if (cr == NULL)
		cr = iocp->ioc_cr;

	/*
	 * Refhold the conn in case the request gets queued up in some lookup
	 */
	ASSERT(CONN_Q(q));
	connp = Q_TO_CONN(q);
	CONN_INC_REF(connp);
	CONN_INC_IOCTLREF(connp);
	if (ipft->ipft_pfi &&
	    ((mp1->b_wptr - mp1->b_rptr) >= ipft->ipft_min_size ||
	    pullupmsg(mp1, ipft->ipft_min_size))) {
		error = (*ipft->ipft_pfi)(q,
		    (ipft->ipft_flags & IPFT_F_SELF_REPLY) ? mp : mp1, cr);
	}
	if (ipft->ipft_flags & IPFT_F_SELF_REPLY) {
		/*
		 * CONN_OPER_PENDING_DONE happens in the function called
		 * through ipft_pfi above.
		 */
		return;
	}

	CONN_DEC_IOCTLREF(connp);
	CONN_OPER_PENDING_DONE(connp);
	if (ipft->ipft_flags & IPFT_F_NO_REPLY) {
		freemsg(mp);
		return;
	}
	iocp->ioc_error = error;

done:
	/* ACK the ioctl, carrying any error in ioc_error. */
	mp->b_datap->db_type = M_IOCACK;
	if (iocp->ioc_error)
		iocp->ioc_count = 0;
	qreply(q, mp);
}
11846 11844
11847 11845 /*
11848 11846 * Assign a unique id for the ipif. This is used by sctp_addr.c
11849 11847 * Note: remove if sctp_addr.c is redone to not shadow ill/ipif data structures.
11850 11848 */
11851 11849 static void
11852 11850 ipif_assign_seqid(ipif_t *ipif)
11853 11851 {
11854 11852 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
11855 11853
11856 11854 ipif->ipif_seqid = atomic_inc_64_nv(&ipst->ips_ipif_g_seqid);
11857 11855 }
11858 11856
11859 11857 /*
11860 11858 * Clone the contents of `sipif' to `dipif'. Requires that both ipifs are
11861 11859 * administratively down (i.e., no DAD), of the same type, and locked. Note
11862 11860 * that the clone is complete -- including the seqid -- and the expectation is
11863 11861 * that the caller will either free or overwrite `sipif' before it's unlocked.
11864 11862 */
11865 11863 static void
11866 11864 ipif_clone(const ipif_t *sipif, ipif_t *dipif)
11867 11865 {
11868 11866 ASSERT(MUTEX_HELD(&sipif->ipif_ill->ill_lock));
11869 11867 ASSERT(MUTEX_HELD(&dipif->ipif_ill->ill_lock));
11870 11868 ASSERT(!(sipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE)));
11871 11869 ASSERT(!(dipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE)));
11872 11870 ASSERT(sipif->ipif_ire_type == dipif->ipif_ire_type);
11873 11871
11874 11872 dipif->ipif_flags = sipif->ipif_flags;
11875 11873 dipif->ipif_zoneid = sipif->ipif_zoneid;
11876 11874 dipif->ipif_v6subnet = sipif->ipif_v6subnet;
11877 11875 dipif->ipif_v6lcl_addr = sipif->ipif_v6lcl_addr;
11878 11876 dipif->ipif_v6net_mask = sipif->ipif_v6net_mask;
11879 11877 dipif->ipif_v6brd_addr = sipif->ipif_v6brd_addr;
11880 11878 dipif->ipif_v6pp_dst_addr = sipif->ipif_v6pp_dst_addr;
11881 11879
11882 11880 /*
11883 11881 * As per the comment atop the function, we assume that these sipif
11884 11882 * fields will be changed before sipif is unlocked.
11885 11883 */
11886 11884 dipif->ipif_seqid = sipif->ipif_seqid;
11887 11885 dipif->ipif_state_flags = sipif->ipif_state_flags;
11888 11886 }
11889 11887
/*
 * Transfer the contents of `sipif' to `dipif', and then free (if `virgipif'
 * is NULL) or overwrite `sipif' with `virgipif', which must be a virgin
 * (unreferenced) ipif. Also, if `sipif' is used by the current xop, then
 * transfer the xop to `dipif'. Requires that all ipifs are administratively
 * down (i.e., no DAD), of the same type, and unlocked.
 */
static void
ipif_transfer(ipif_t *sipif, ipif_t *dipif, ipif_t *virgipif)
{
	ipsq_t *ipsq = sipif->ipif_ill->ill_phyint->phyint_ipsq;
	ipxop_t *ipx = ipsq->ipsq_xop;

	ASSERT(sipif != dipif);
	ASSERT(sipif != virgipif);

	/*
	 * Grab all of the locks that protect the ipif in a defined order.
	 */
	GRAB_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill);

	/* Copy sipif into dipif; if supplied, reset sipif from virgipif. */
	ipif_clone(sipif, dipif);
	if (virgipif != NULL) {
		ipif_clone(virgipif, sipif);
		mi_free(virgipif);
	}

	RELEASE_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill);

	/*
	 * Transfer ownership of the current xop, if necessary.
	 */
	if (ipx->ipx_current_ipif == sipif) {
		ASSERT(ipx->ipx_pending_ipif == NULL);
		mutex_enter(&ipx->ipx_lock);
		ipx->ipx_current_ipif = dipif;
		mutex_exit(&ipx->ipx_lock);
	}

	/* Without a virgin replacement, sipif's contents now live in dipif. */
	if (virgipif == NULL)
		mi_free(sipif);
}
11932 11930
11933 11931 /*
11934 11932 * checks if:
11935 11933 * - <ill_name>:<ipif_id> is at most LIFNAMSIZ - 1 and
11936 11934 * - logical interface is within the allowed range
11937 11935 */
11938 11936 static int
11939 11937 is_lifname_valid(ill_t *ill, unsigned int ipif_id)
11940 11938 {
11941 11939 if (snprintf(NULL, 0, "%s:%d", ill->ill_name, ipif_id) >= LIFNAMSIZ)
11942 11940 return (ENAMETOOLONG);
11943 11941
11944 11942 if (ipif_id >= ill->ill_ipst->ips_ip_addrs_per_if)
11945 11943 return (ERANGE);
11946 11944 return (0);
11947 11945 }
11948 11946
/*
 * Insert the ipif, so that the list of ipifs on the ill will be sorted
 * with respect to ipif_id. Note that an ipif with an ipif_id of -1 will
 * be inserted into the first space available in the list. The value of
 * ipif_id will then be set to the appropriate value for its position.
 *
 * Returns 0 on success, or the ENAMETOOLONG/ERANGE error from
 * is_lifname_valid() when the chosen or requested id is unusable.
 */
static int
ipif_insert(ipif_t *ipif, boolean_t acquire_g_lock)
{
	ill_t *ill;
	ipif_t *tipif;
	ipif_t **tipifp;
	int id, err;
	ip_stack_t *ipst;

	ASSERT(ipif->ipif_ill->ill_net_type == IRE_LOOPBACK ||
	    IAM_WRITER_IPIF(ipif));

	ill = ipif->ipif_ill;
	ASSERT(ill != NULL);
	ipst = ill->ill_ipst;

	/*
	 * In the case of lo0:0 we already hold the ill_g_lock.
	 * ill_lookup_on_name (acquires ill_g_lock) -> ipif_allocate ->
	 * ipif_insert.
	 */
	if (acquire_g_lock)
		rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
	mutex_enter(&ill->ill_lock);
	id = ipif->ipif_id;
	tipifp = &(ill->ill_ipif);
	if (id == -1) {	/* need to find a real id */
		/* Scan the sorted list for the first unused id. */
		id = 0;
		while ((tipif = *tipifp) != NULL) {
			ASSERT(tipif->ipif_id >= id);
			if (tipif->ipif_id != id)
				break; /* non-consecutive id */
			id++;
			tipifp = &(tipif->ipif_next);
		}
		if ((err = is_lifname_valid(ill, id)) != 0) {
			mutex_exit(&ill->ill_lock);
			if (acquire_g_lock)
				rw_exit(&ipst->ips_ill_g_lock);
			return (err);
		}
		ipif->ipif_id = id; /* assign new id */
	} else if ((err = is_lifname_valid(ill, id)) == 0) {
		/* we have a real id; insert ipif in the right place */
		while ((tipif = *tipifp) != NULL) {
			ASSERT(tipif->ipif_id != id);
			if (tipif->ipif_id > id)
				break; /* found correct location */
			tipifp = &(tipif->ipif_next);
		}
	} else {
		mutex_exit(&ill->ill_lock);
		if (acquire_g_lock)
			rw_exit(&ipst->ips_ill_g_lock);
		return (err);
	}

	ASSERT(tipifp != &(ill->ill_ipif) || id == 0);

	/* Link the new ipif in ahead of tipif (tipif is NULL at the tail). */
	ipif->ipif_next = tipif;
	*tipifp = ipif;
	mutex_exit(&ill->ill_lock);
	if (acquire_g_lock)
		rw_exit(&ipst->ips_ill_g_lock);

	return (0);
}
12022 12020
12023 12021 static void
12024 12022 ipif_remove(ipif_t *ipif)
12025 12023 {
12026 12024 ipif_t **ipifp;
12027 12025 ill_t *ill = ipif->ipif_ill;
12028 12026
12029 12027 ASSERT(RW_WRITE_HELD(&ill->ill_ipst->ips_ill_g_lock));
12030 12028
12031 12029 mutex_enter(&ill->ill_lock);
12032 12030 ipifp = &ill->ill_ipif;
12033 12031 for (; *ipifp != NULL; ipifp = &ipifp[0]->ipif_next) {
12034 12032 if (*ipifp == ipif) {
12035 12033 *ipifp = ipif->ipif_next;
12036 12034 break;
12037 12035 }
12038 12036 }
12039 12037 mutex_exit(&ill->ill_lock);
12040 12038 }
12041 12039
/*
 * Allocate and initialize a new interface control structure. (Always
 * called as writer.)
 * When ipif_allocate() is called from ip_ll_subnet_defaults, the ill
 * is not part of the global linked list of ills. ipif_seqid is unique
 * in the system and to preserve the uniqueness, it is assigned only
 * when ill becomes part of the global list. At that point ill will
 * have a name. If it doesn't get assigned here, it will get assigned
 * in ipif_set_values() as part of SIOCSLIFNAME processing.
 * Additionally, if we come here from ip_ll_subnet_defaults, we don't set
 * the interface flags or any other information from the DL_INFO_ACK for
 * DL_STYLE2 drivers (initialize == B_FALSE), since we won't have them at
 * this point. The flags etc. will be set in ip_ll_subnet_defaults when the
 * second DL_INFO_ACK comes in from the driver.
 *
 * Returns the new ipif, or NULL on failure with the errno (ENOMEM or an
 * ipif_insert() error) stored through `errorp' when it is non-NULL.
 */
static ipif_t *
ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize,
    boolean_t insert, int *errorp)
{
	int err;
	ipif_t *ipif;
	ip_stack_t *ipst = ill->ill_ipst;

	ip1dbg(("ipif_allocate(%s:%d ill %p)\n",
	    ill->ill_name, id, (void *)ill));
	ASSERT(ire_type == IRE_LOOPBACK || IAM_WRITER_ILL(ill));

	if (errorp != NULL)
		*errorp = 0;

	if ((ipif = mi_alloc(sizeof (ipif_t), BPRI_MED)) == NULL) {
		if (errorp != NULL)
			*errorp = ENOMEM;
		return (NULL);
	}
	*ipif = ipif_zero;	/* start clean */

	ipif->ipif_ill = ill;
	ipif->ipif_id = id;	/* could be -1 */
	/*
	 * Inherit the zoneid from the ill; for the shared stack instance
	 * this is always the global zone
	 */
	ipif->ipif_zoneid = ill->ill_zoneid;

	ipif->ipif_refcnt = 0;

	if (insert) {
		if ((err = ipif_insert(ipif, ire_type != IRE_LOOPBACK)) != 0) {
			mi_free(ipif);
			if (errorp != NULL)
				*errorp = err;
			return (NULL);
		}
		/* -1 id should have been replaced by real id */
		id = ipif->ipif_id;
		ASSERT(id >= 0);
	}

	if (ill->ill_name[0] != '\0')
		ipif_assign_seqid(ipif);

	/*
	 * If this is the zeroth ipif on the IPMP ill, create the illgrp
	 * (which must not exist yet because the zeroth ipif is created once
	 * per ill). However, do not link it to the ipmp_grp_t until
	 * I_PLINK is called; see ip_sioctl_plink_ipmp() for details.
	 */
	if (id == 0 && IS_IPMP(ill)) {
		if (ipmp_illgrp_create(ill) == NULL) {
			/* Undo the insertion before bailing out. */
			if (insert) {
				rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
				ipif_remove(ipif);
				rw_exit(&ipst->ips_ill_g_lock);
			}
			mi_free(ipif);
			if (errorp != NULL)
				*errorp = ENOMEM;
			return (NULL);
		}
	}

	/*
	 * We grab ill_lock to protect the flag changes. The ipif is still
	 * not up and can't be looked up until the ioctl completes and the
	 * IPIF_CHANGING flag is cleared.
	 */
	mutex_enter(&ill->ill_lock);

	ipif->ipif_ire_type = ire_type;

	if (ipif->ipif_isv6) {
		ill->ill_flags |= ILLF_IPV6;
	} else {
		ipaddr_t inaddr_any = INADDR_ANY;

		ill->ill_flags |= ILLF_IPV4;

		/* Keep the IN6_IS_ADDR_V4MAPPED assertions happy */
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &ipif->ipif_v6lcl_addr);
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &ipif->ipif_v6subnet);
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &ipif->ipif_v6net_mask);
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &ipif->ipif_v6brd_addr);
		IN6_IPADDR_TO_V4MAPPED(inaddr_any,
		    &ipif->ipif_v6pp_dst_addr);
	}

	/*
	 * Don't set the interface flags etc. now, will do it in
	 * ip_ll_subnet_defaults.
	 */
	if (!initialize)
		goto out;

	/*
	 * NOTE: The IPMP meta-interface is special-cased because it starts
	 * with no underlying interfaces (and thus an unknown broadcast
	 * address length), but all interfaces that can be placed into an IPMP
	 * group are required to be broadcast-capable.
	 */
	if (ill->ill_bcast_addr_length != 0 || IS_IPMP(ill)) {
		/*
		 * Later detect lack of DLPI driver multicast capability by
		 * catching DL_ENABMULTI_REQ errors in ip_rput_dlpi().
		 */
		ill->ill_flags |= ILLF_MULTICAST;
		if (!ipif->ipif_isv6)
			ipif->ipif_flags |= IPIF_BROADCAST;
	} else {
		if (ill->ill_net_type != IRE_LOOPBACK) {
			if (ipif->ipif_isv6)
				/*
				 * Note: xresolv interfaces will eventually need
				 * NOARP set here as well, but that will require
				 * those external resolvers to have some
				 * knowledge of that flag and act appropriately.
				 * Not to be changed at present.
				 */
				ill->ill_flags |= ILLF_NONUD;
			else
				ill->ill_flags |= ILLF_NOARP;
		}
		if (ill->ill_phys_addr_length == 0) {
			if (IS_VNI(ill)) {
				ipif->ipif_flags |= IPIF_NOXMIT;
			} else {
				/* pt-pt supports multicast. */
				ill->ill_flags |= ILLF_MULTICAST;
				if (ill->ill_net_type != IRE_LOOPBACK)
					ipif->ipif_flags |= IPIF_POINTOPOINT;
			}
		}
	}
out:
	mutex_exit(&ill->ill_lock);
	return (ipif);
}
12203 12201
/*
 * Remove the neighbor cache entries associated with this logical
 * interface.
 * Returns 0, or the error from arp_ll_down() when a full ARP teardown
 * is required and fails.
 */
int
ipif_arp_down(ipif_t *ipif)
{
	ill_t *ill = ipif->ipif_ill;
	int err = 0;

	ip1dbg(("ipif_arp_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));
	ASSERT(IAM_WRITER_IPIF(ipif));

	DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_down",
	    ill_t *, ill, ipif_t *, ipif);
	ipif_nce_down(ipif);

	/*
	 * If this is the last ipif that is going down and there are no
	 * duplicate addresses we may yet attempt to re-probe, then we need to
	 * clean up ARP completely.
	 */
	if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 &&
	    !ill->ill_logical_down && ill->ill_net_type == IRE_IF_RESOLVER) {
		/*
		 * If this was the last ipif on an IPMP interface, purge any
		 * static ARP entries associated with it.
		 */
		if (IS_IPMP(ill))
			ipmp_illgrp_refresh_arpent(ill->ill_grp);

		/* UNBIND, DETACH */
		err = arp_ll_down(ill);
	}

	return (err);
}
12241 12239
/*
 * Get the resolver set up for a new IP address. (Always called as writer.)
 * Called both for IPv4 and IPv6 interfaces, though it only does some
 * basic DAD related initialization for IPv6. Honors ILLF_NOARP.
 *
 * The enumerated value res_act tunes the behavior:
 *	* Res_act_initial: set up all the resolver structures for a new
 *	  IP address.
 *	* Res_act_defend: tell ARP that it needs to send a single gratuitous
 *	  ARP message in defense of the address.
 *	* Res_act_rebind: tell ARP to change the hardware address for an IP
 *	  address (and issue gratuitous ARPs). Used by ipmp_ill_bind_ipif().
 *
 * Returns zero on success, or an errno upon failure.
 */
int
ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act)
{
	ill_t *ill = ipif->ipif_ill;
	int err;
	boolean_t was_dup;

	ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n",
	    ill->ill_name, ipif->ipif_id, (uint_t)ipif->ipif_flags));
	ASSERT(IAM_WRITER_IPIF(ipif));

	was_dup = B_FALSE;
	if (res_act == Res_act_initial) {
		ipif->ipif_addr_ready = 0;
		/*
		 * We're bringing an interface up here. There's no way that we
		 * should need to shut down ARP now.
		 */
		mutex_enter(&ill->ill_lock);
		if (ipif->ipif_flags & IPIF_DUPLICATE) {
			/* Clear the stale duplicate-address state. */
			ipif->ipif_flags &= ~IPIF_DUPLICATE;
			ill->ill_ipif_dup_count--;
			was_dup = B_TRUE;
		}
		mutex_exit(&ill->ill_lock);
	}
	/* Cancel any outstanding duplicate-address recovery timeout. */
	if (ipif->ipif_recovery_id != 0)
		(void) untimeout(ipif->ipif_recovery_id);
	ipif->ipif_recovery_id = 0;
	if (ill->ill_net_type != IRE_IF_RESOLVER) {
		/* Non-resolver links need no further resolver setup. */
		ipif->ipif_addr_ready = 1;
		return (0);
	}
	/* NDP will set the ipif_addr_ready flag when it's ready */
	if (ill->ill_isv6)
		return (0);

	err = ipif_arp_up(ipif, res_act, was_dup);
	return (err);
}
12297 12295
/*
 * This routine restarts IPv4/IPv6 duplicate address detection (DAD)
 * when a link has just gone back up.
 */
static void
ipif_nce_start_dad(ipif_t *ipif)
{
	ncec_t *ncec;
	ill_t *ill = ipif->ipif_ill;
	boolean_t isv6 = ill->ill_isv6;

	if (isv6) {
		ncec = ncec_lookup_illgrp_v6(ipif->ipif_ill,
		    &ipif->ipif_v6lcl_addr);
	} else {
		ipaddr_t v4addr;

		if (ill->ill_net_type != IRE_IF_RESOLVER ||
		    (ipif->ipif_flags & IPIF_UNNUMBERED) ||
		    ipif->ipif_lcl_addr == INADDR_ANY) {
			/*
			 * If we can't contact ARP for some reason,
			 * that's not really a problem. Just send
			 * out the routing socket notification that
			 * DAD completion would have done, and continue.
			 */
			ipif_mask_reply(ipif);
			ipif_up_notify(ipif);
			ipif->ipif_addr_ready = 1;
			return;
		}

		IN6_V4MAPPED_TO_IPADDR(&ipif->ipif_v6lcl_addr, v4addr);
		ncec = ncec_lookup_illgrp_v4(ipif->ipif_ill, &v4addr);
	}

	if (ncec == NULL) {
		ip1dbg(("couldn't find ncec for ipif %p leaving !ready\n",
		    (void *)ipif));
		return;
	}
	if (!nce_restart_dad(ncec)) {
		/*
		 * If we can't restart DAD for some reason, that's not really a
		 * problem. Just send out the routing socket notification that
		 * DAD completion would have done, and continue.
		 */
		ipif_up_notify(ipif);
		ipif->ipif_addr_ready = 1;
	}
	/* Drop the hold acquired by the ncec lookup above. */
	ncec_refrele(ncec);
}
12350 12348
/*
 * Restart duplicate address detection on all interfaces on the given ill.
 *
 * This is called when an interface transitions from down to up
 * (DL_NOTE_LINK_UP) or up to down (DL_NOTE_LINK_DOWN).
 *
 * Note that since the underlying physical link has transitioned, we must cause
 * at least one routing socket message to be sent here, either via DAD
 * completion or just by default on the first ipif. (If we don't do this, then
 * in.mpathd will see long delays when doing link-based failure recovery.)
 */
void
ill_restart_dad(ill_t *ill, boolean_t went_up)
{
	ipif_t *ipif;

	if (ill == NULL)
		return;

	/*
	 * If layer two doesn't support duplicate address detection, then just
	 * send the routing socket message now and be done with it.
	 */
	if (!ill->ill_isv6 && arp_no_defense) {
		ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
		return;
	}

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if (went_up) {

			if (ipif->ipif_flags & IPIF_UP) {
				/* Already up: just re-probe the address. */
				ipif_nce_start_dad(ipif);
			} else if (ipif->ipif_flags & IPIF_DUPLICATE) {
				/*
				 * kick off the bring-up process now.
				 */
				ipif_do_recovery(ipif);
			} else {
				/*
				 * Unfortunately, the first ipif is "special"
				 * and represents the underlying ill in the
				 * routing socket messages. Thus, when this
				 * one ipif is down, we must still notify so
				 * that the user knows the IFF_RUNNING status
				 * change. (If the first ipif is up, then
				 * we'll handle eventual routing socket
				 * notification via DAD completion.)
				 */
				if (ipif == ill->ill_ipif) {
					ip_rts_ifmsg(ill->ill_ipif,
					    RTSQ_DEFAULT);
				}
			}
		} else {
			/*
			 * After link down, we'll need to send a new routing
			 * message when the link comes back, so clear
			 * ipif_addr_ready.
			 */
			ipif->ipif_addr_ready = 0;
		}
	}

	/*
	 * If we've torn down links, then notify the user right away.
	 */
	if (!went_up)
		ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT);
}
12421 12419
12422 12420 static void
12423 12421 ipsq_delete(ipsq_t *ipsq)
12424 12422 {
12425 12423 ipxop_t *ipx = ipsq->ipsq_xop;
12426 12424
12427 12425 ipsq->ipsq_ipst = NULL;
12428 12426 ASSERT(ipsq->ipsq_phyint == NULL);
12429 12427 ASSERT(ipsq->ipsq_xop != NULL);
12430 12428 ASSERT(ipsq->ipsq_xopq_mphead == NULL && ipx->ipx_mphead == NULL);
12431 12429 ASSERT(ipx->ipx_pending_mp == NULL);
12432 12430 kmem_free(ipsq, sizeof (ipsq_t));
12433 12431 }
12434 12432
/*
 * Bring back up, on `ill', every ipif that was up before the ill was
 * brought down (recorded via ipif_was_up in ill_down_ipifs()).  Returns 0
 * on success, or EINPROGRESS if an ipif_up() must complete asynchronously.
 * Note that on the EINPROGRESS path ill_up_ipifs remains B_TRUE and
 * ipif_was_up has already been cleared for the ipif being brought up.
 */
static int
ill_up_ipifs_on_ill(ill_t *ill, queue_t *q, mblk_t *mp)
{
	int err = 0;
	ipif_t *ipif;

	if (ill == NULL)
		return (0);

	ASSERT(IAM_WRITER_ILL(ill));
	ill->ill_up_ipifs = B_TRUE;
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if (ipif->ipif_was_up) {
			if (!(ipif->ipif_flags & IPIF_UP))
				err = ipif_up(ipif, q, mp);
			ipif->ipif_was_up = B_FALSE;
			if (err != 0) {
				/*
				 * Only asynchronous completion is expected
				 * here; the operation will be restarted later.
				 */
				ASSERT(err == EINPROGRESS);
				return (err);
			}
		}
	}
	ill->ill_up_ipifs = B_FALSE;
	return (0);
}
12460 12458
12461 12459 /*
12462 12460 * This function is called to bring up all the ipifs that were up before
12463 12461 * bringing the ill down via ill_down_ipifs().
12464 12462 */
int
ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp)
{
	int err;

	ASSERT(IAM_WRITER_ILL(ill));

	if (ill->ill_replumbing) {
		ill->ill_replumbing = 0;
		/*
		 * Send down REPLUMB_DONE notification followed by the
		 * BIND_REQ on the arp stream.
		 */
		if (!ill->ill_isv6)
			arp_send_replumb_conf(ill);
	}
	/*
	 * Bring up the v4 ipifs first, then the v6 ones; either call may
	 * return EINPROGRESS, in which case the caller retries later.
	 */
	err = ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv4, q, mp);
	if (err != 0)
		return (err);

	return (ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv6, q, mp));
}
12487 12485
12488 12486 /*
12489 12487 * Bring down any IPIF_UP ipifs on ill. If "logical" is B_TRUE, we bring
12490 12488 * down the ipifs without sending DL_UNBIND_REQ to the driver.
12491 12489 */
static void
ill_down_ipifs(ill_t *ill, boolean_t logical)
{
	ipif_t *ipif;

	ASSERT(IAM_WRITER_ILL(ill));

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		/*
		 * We go through the ipif_down logic even if the ipif
		 * is already down, since routes can be added based
		 * on down ipifs. Going through ipif_down once again
		 * will delete any IREs created based on these routes.
		 */
		if (ipif->ipif_flags & IPIF_UP)
			ipif->ipif_was_up = B_TRUE;	/* for ill_up_ipifs() */

		if (logical) {
			/* Logical-only: skip the DL_UNBIND_REQ to the driver */
			(void) ipif_logical_down(ipif, NULL, NULL);
			ipif_non_duplicate(ipif);
			(void) ipif_down_tail(ipif);
		} else {
			(void) ipif_down(ipif, NULL, NULL);
		}
	}
}
12518 12516
12519 12517 /*
12520 12518 * Redo source address selection. This makes IXAF_VERIFY_SOURCE take
12521 12519 * a look again at valid source addresses.
12522 12520 * This should be called each time after the set of source addresses has been
12523 12521 * changed.
12524 12522 */
void
ip_update_source_selection(ip_stack_t *ipst)
{
	/*
	 * We skip past SRC_GENERATION_VERIFY: it is a reserved sentinel
	 * value, so if the increment lands on it, bump the generation once
	 * more so consumers never observe the sentinel as a real generation.
	 */
	if (atomic_inc_32_nv(&ipst->ips_src_generation) ==
	    SRC_GENERATION_VERIFY)
		atomic_inc_32(&ipst->ips_src_generation);
}
12533 12531
12534 12532 /*
12535 12533 * Finish the group join started in ip_sioctl_groupname().
12536 12534 */
12537 12535 /* ARGSUSED */
static void
ip_join_illgrps(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy)
{
	ill_t		*ill = q->q_ptr;
	phyint_t	*phyi = ill->ill_phyint;
	ipmp_grp_t	*grp = phyi->phyint_grp;
	ip_stack_t	*ipst = ill->ill_ipst;

	/* IS_UNDER_IPMP() won't work until ipmp_ill_join_illgrp() is called */
	ASSERT(!IS_IPMP(ill) && grp != NULL);
	ASSERT(IAM_WRITER_IPSQ(ipsq));

	/*
	 * Drop the gr_pend* counts that ip_sioctl_groupname() bumped to keep
	 * the IPMP meta-interface ills alive until we got here.  VERIFY
	 * (unlike ASSERT) is compiled into non-DEBUG kernels, so the
	 * embedded decrement always executes.
	 */
	if (phyi->phyint_illv4 != NULL) {
		rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
		VERIFY(grp->gr_pendv4-- > 0);
		rw_exit(&ipst->ips_ipmp_lock);
		ipmp_ill_join_illgrp(phyi->phyint_illv4, grp->gr_v4);
	}
	if (phyi->phyint_illv6 != NULL) {
		rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
		VERIFY(grp->gr_pendv6-- > 0);
		rw_exit(&ipst->ips_ipmp_lock);
		ipmp_ill_join_illgrp(phyi->phyint_illv6, grp->gr_v6);
	}
	freemsg(mp);
}
12564 12562
12565 12563 /*
12566 12564 * Process an SIOCSLIFGROUPNAME request.
12567 12565 */
12568 12566 /* ARGSUSED */
int
ip_sioctl_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	struct lifreq	*lifr = ifreq;
	ill_t		*ill = ipif->ipif_ill;
	ip_stack_t	*ipst = ill->ill_ipst;
	phyint_t	*phyi = ill->ill_phyint;
	ipmp_grp_t	*grp = phyi->phyint_grp;
	mblk_t		*ipsq_mp;
	int		err = 0;

	/*
	 * Note that phyint_grp can only change here, where we're exclusive.
	 */
	ASSERT(IAM_WRITER_ILL(ill));

	/* Only ipif 0 of a non-virtual, non-usesrc interface may be grouped */
	if (ipif->ipif_id != 0 || ill->ill_usesrc_grp_next != NULL ||
	    (phyi->phyint_flags & PHYI_VIRTUAL))
		return (EINVAL);

	/* Defensively NUL-terminate the caller-supplied group name */
	lifr->lifr_groupname[LIFGRNAMSIZ - 1] = '\0';

	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);

	/*
	 * If the name hasn't changed, there's nothing to do.
	 */
	if (grp != NULL && strcmp(grp->gr_name, lifr->lifr_groupname) == 0)
		goto unlock;

	/*
	 * Handle requests to rename an IPMP meta-interface.
	 *
	 * Note that creation of the IPMP meta-interface is handled in
	 * userland through the standard plumbing sequence.  As part of the
	 * plumbing the IPMP meta-interface, its initial groupname is set to
	 * the name of the interface (see ipif_set_values_tail()).
	 */
	if (IS_IPMP(ill)) {
		err = ipmp_grp_rename(grp, lifr->lifr_groupname);
		goto unlock;
	}

	/*
	 * Handle requests to add or remove an IP interface from a group.
	 */
	if (lifr->lifr_groupname[0] != '\0') {			/* add */
		/*
		 * Moves are handled by first removing the interface from
		 * its existing group, and then adding it to another group.
		 * So, fail if it's already in a group.
		 */
		if (IS_UNDER_IPMP(ill)) {
			err = EALREADY;
			goto unlock;
		}

		grp = ipmp_grp_lookup(lifr->lifr_groupname, ipst);
		if (grp == NULL) {
			err = ENOENT;
			goto unlock;
		}

		/*
		 * Check if the phyint and its ills are suitable for
		 * inclusion into the group.
		 */
		if ((err = ipmp_grp_vet_phyint(grp, phyi)) != 0)
			goto unlock;

		/*
		 * Checks pass; join the group, and enqueue the remaining
		 * illgrp joins for when we've become part of the group xop
		 * and are exclusive across its IPSQs.  Since qwriter_ip()
		 * requires an mblk_t to scribble on, and since `mp' will be
		 * freed as part of completing the ioctl, allocate another.
		 */
		if ((ipsq_mp = allocb(0, BPRI_MED)) == NULL) {
			err = ENOMEM;
			goto unlock;
		}

		/*
		 * Before we drop ipmp_lock, bump gr_pend* to ensure that the
		 * IPMP meta-interface ills needed by `phyi' cannot go away
		 * before ip_join_illgrps() is called back.  See the comments
		 * in ip_sioctl_plink_ipmp() for more.
		 */
		if (phyi->phyint_illv4 != NULL)
			grp->gr_pendv4++;
		if (phyi->phyint_illv6 != NULL)
			grp->gr_pendv6++;

		rw_exit(&ipst->ips_ipmp_lock);

		ipmp_phyint_join_grp(phyi, grp);
		ill_refhold(ill);
		/* qwriter_ip() does the ill_refrele() */
		qwriter_ip(ill, ill->ill_rq, ipsq_mp, ip_join_illgrps,
		    SWITCH_OP, B_FALSE);
		return (0);
	} else {
		/*
		 * Request to remove the interface from a group.  If the
		 * interface is not in a group, this trivially succeeds.
		 */
		rw_exit(&ipst->ips_ipmp_lock);
		if (IS_UNDER_IPMP(ill))
			ipmp_phyint_leave_grp(phyi);
		return (0);
	}
unlock:
	rw_exit(&ipst->ips_ipmp_lock);
	return (err);
}
12684 12682
12685 12683 /*
12686 12684 * Process an SIOCGLIFBINDING request.
12687 12685 */
12688 12686 /* ARGSUSED */
12689 12687 int
12690 12688 ip_sioctl_get_binding(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12691 12689 ip_ioctl_cmd_t *ipip, void *ifreq)
12692 12690 {
12693 12691 ill_t *ill;
12694 12692 struct lifreq *lifr = ifreq;
12695 12693 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
12696 12694
12697 12695 if (!IS_IPMP(ipif->ipif_ill))
12698 12696 return (EINVAL);
12699 12697
12700 12698 rw_enter(&ipst->ips_ipmp_lock, RW_READER);
12701 12699 if ((ill = ipif->ipif_bound_ill) == NULL)
12702 12700 lifr->lifr_binding[0] = '\0';
12703 12701 else
12704 12702 (void) strlcpy(lifr->lifr_binding, ill->ill_name, LIFNAMSIZ);
12705 12703 rw_exit(&ipst->ips_ipmp_lock);
12706 12704 return (0);
12707 12705 }
12708 12706
12709 12707 /*
12710 12708 * Process an SIOCGLIFGROUPNAME request.
12711 12709 */
12712 12710 /* ARGSUSED */
12713 12711 int
12714 12712 ip_sioctl_get_groupname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12715 12713 ip_ioctl_cmd_t *ipip, void *ifreq)
12716 12714 {
12717 12715 ipmp_grp_t *grp;
12718 12716 struct lifreq *lifr = ifreq;
12719 12717 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
12720 12718
12721 12719 rw_enter(&ipst->ips_ipmp_lock, RW_READER);
12722 12720 if ((grp = ipif->ipif_ill->ill_phyint->phyint_grp) == NULL)
12723 12721 lifr->lifr_groupname[0] = '\0';
12724 12722 else
12725 12723 (void) strlcpy(lifr->lifr_groupname, grp->gr_name, LIFGRNAMSIZ);
12726 12724 rw_exit(&ipst->ips_ipmp_lock);
12727 12725 return (0);
12728 12726 }
12729 12727
12730 12728 /*
12731 12729 * Process an SIOCGLIFGROUPINFO request.
12732 12730 */
12733 12731 /* ARGSUSED */
12734 12732 int
12735 12733 ip_sioctl_groupinfo(ipif_t *dummy_ipif, sin_t *sin, queue_t *q, mblk_t *mp,
12736 12734 ip_ioctl_cmd_t *ipip, void *dummy)
12737 12735 {
12738 12736 ipmp_grp_t *grp;
12739 12737 lifgroupinfo_t *lifgr;
12740 12738 ip_stack_t *ipst = CONNQ_TO_IPST(q);
12741 12739
12742 12740 /* ip_wput_nondata() verified mp->b_cont->b_cont */
12743 12741 lifgr = (lifgroupinfo_t *)mp->b_cont->b_cont->b_rptr;
12744 12742 lifgr->gi_grname[LIFGRNAMSIZ - 1] = '\0';
12745 12743
12746 12744 rw_enter(&ipst->ips_ipmp_lock, RW_READER);
12747 12745 if ((grp = ipmp_grp_lookup(lifgr->gi_grname, ipst)) == NULL) {
12748 12746 rw_exit(&ipst->ips_ipmp_lock);
12749 12747 return (ENOENT);
12750 12748 }
12751 12749 ipmp_grp_info(grp, lifgr);
12752 12750 rw_exit(&ipst->ips_ipmp_lock);
12753 12751 return (0);
12754 12752 }
12755 12753
/*
 * Take the data link down: disable DLPI capabilities, send the queued
 * DL_UNBIND_REQ, and wait for the capability teardown to complete.
 */
static void
ill_dl_down(ill_t *ill)
{
	DTRACE_PROBE2(ill__downup, char *, "ill_dl_down", ill_t *, ill);

	/*
	 * The ill is down; unbind but stay attached since we're still
	 * associated with a PPA. If we have negotiated DLPI capabilites
	 * with the data link service provider (IDS_OK) then reset them.
	 * The interval between unbinding and rebinding is potentially
	 * unbounded hence we cannot assume things will be the same.
	 * The DLPI capabilities will be probed again when the data link
	 * is brought up.
	 */
	mblk_t	*mp = ill->ill_unbind_mp;

	ip1dbg(("ill_dl_down(%s)\n", ill->ill_name));

	if (!ill->ill_replumbing) {
		/* Free all ilms for this ill */
		update_conn_ill(ill, ill->ill_ipst);
	} else {
		ill_leave_multicast(ill);
	}

	/* Consume ill_unbind_mp; it is sent down (once) below */
	ill->ill_unbind_mp = NULL;

	mutex_enter(&ill->ill_lock);
	ill->ill_dl_up = 0;
	ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0);
	mutex_exit(&ill->ill_lock);

	if (mp != NULL) {
		ip1dbg(("ill_dl_down: %s (%u) for %s\n",
		    dl_primstr(*(int *)mp->b_rptr), *(int *)mp->b_rptr,
		    ill->ill_name));
		mutex_enter(&ill->ill_lock);
		ill->ill_state_flags |= ILL_DL_UNBIND_IN_PROGRESS;
		mutex_exit(&ill->ill_lock);
		/*
		 * ip_rput does not pass up normal (M_PROTO) DLPI messages
		 * after ILL_CONDEMNED is set. So in the unplumb case, we call
		 * ill_capability_dld_disable disable rightaway. If this is not
		 * an unplumb operation then the disable happens on receipt of
		 * the capab ack via ip_rput_dlpi_writer ->
		 * ill_capability_ack_thr. In both cases the order of
		 * the operations seen by DLD is capability disable followed
		 * by DL_UNBIND. Also the DLD capability disable needs a
		 * cv_wait'able context.
		 */
		if (ill->ill_state_flags & ILL_CONDEMNED)
			ill_capability_dld_disable(ill);
		ill_capability_reset(ill, B_FALSE);
		ill_dlpi_send(ill, mp);

		/*
		 * Wait for the capability reset to finish.
		 * In this case, it doesn't matter WHY or HOW it finished.
		 */
		(void) ill_capability_wait(ill);
	}
}
12818 12816
/*
 * Actually send a DLPI control message down to the driver, recording it as
 * the pending primitive (ill_dlpi_pending) when an ACK/NAK is expected so
 * that ill_dlpi_done() can start the next queued message.
 */
void
ill_dlpi_dispatch(ill_t *ill, mblk_t *mp)
{
	union DL_primitives *dlp;
	t_uscalar_t prim;
	boolean_t waitack = B_FALSE;

	ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);

	dlp = (union DL_primitives *)mp->b_rptr;
	prim = dlp->dl_primitive;

	ip1dbg(("ill_dlpi_dispatch: sending %s (%u) to %s\n",
	    dl_primstr(prim), prim, ill->ill_name));

	switch (prim) {
	case DL_PHYS_ADDR_REQ:
	{
		/* Remember which address type was requested, for the ACK */
		dl_phys_addr_req_t *dlpap = (dl_phys_addr_req_t *)mp->b_rptr;
		ill->ill_phys_addr_pend = dlpap->dl_addr_type;
		break;
	}
	case DL_BIND_REQ:
		mutex_enter(&ill->ill_lock);
		ill->ill_state_flags &= ~ILL_DL_UNBIND_IN_PROGRESS;
		mutex_exit(&ill->ill_lock);
		break;
	}

	/*
	 * Except for the ACKs for the M_PCPROTO messages, all other ACKs
	 * are dropped by ip_rput() if ILL_CONDEMNED is set. Therefore
	 * we only wait for the ACK of the DL_UNBIND_REQ.
	 */
	mutex_enter(&ill->ill_lock);
	if (!(ill->ill_state_flags & ILL_CONDEMNED) ||
	    (prim == DL_UNBIND_REQ)) {
		ill->ill_dlpi_pending = prim;
		waitack = B_TRUE;
	}

	mutex_exit(&ill->ill_lock);
	DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_dispatch",
	    char *, dl_primstr(prim), ill_t *, ill);
	putnext(ill->ill_wq, mp);

	/*
	 * There is no ack for DL_NOTIFY_CONF messages
	 */
	if (waitack && prim == DL_NOTIFY_CONF)
		ill_dlpi_done(ill, prim);
}
12871 12869
12872 12870 /*
12873 12871 * Helper function for ill_dlpi_send().
12874 12872 */
12875 12873 /* ARGSUSED */
12876 12874 static void
12877 12875 ill_dlpi_send_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
12878 12876 {
12879 12877 ill_dlpi_send(q->q_ptr, mp);
12880 12878 }
12881 12879
12882 12880 /*
12883 12881 * Send a DLPI control message to the driver but make sure there
12884 12882 * is only one outstanding message. Uses ill_dlpi_pending to tell
12885 12883 * when it must queue. ip_rput_dlpi_writer calls ill_dlpi_done()
12886 12884 * when an ACK or a NAK is received to process the next queued message.
12887 12885 */
void
ill_dlpi_send(ill_t *ill, mblk_t *mp)
{
	mblk_t **mpp;

	ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);

	/*
	 * To ensure that any DLPI requests for current exclusive operation
	 * are always completely sent before any DLPI messages for other
	 * operations, require writer access before enqueuing.
	 */
	if (!IAM_WRITER_ILL(ill)) {
		ill_refhold(ill);
		/* qwriter_ip() does the ill_refrele() */
		qwriter_ip(ill, ill->ill_wq, mp, ill_dlpi_send_writer,
		    NEW_OP, B_TRUE);
		return;
	}

	mutex_enter(&ill->ill_lock);
	if (ill->ill_dlpi_pending != DL_PRIM_INVAL) {
		/* Must queue message. Tail insertion */
		mpp = &ill->ill_dlpi_deferred;
		while (*mpp != NULL)
			mpp = &((*mpp)->b_next);

		ip1dbg(("ill_dlpi_send: deferring request for %s "
		    "while %s pending\n", ill->ill_name,
		    dl_primstr(ill->ill_dlpi_pending)));

		*mpp = mp;
		mutex_exit(&ill->ill_lock);
		return;
	}
	mutex_exit(&ill->ill_lock);
	/* Nothing pending; send it down now */
	ill_dlpi_dispatch(ill, mp);
}
12926 12924
/*
 * Send a DLPI capability message to the driver, counting it in
 * ill_capab_pending_cnt so ill_capability_done() can tell when the last
 * outstanding capability exchange has completed.
 */
void
ill_capability_send(ill_t *ill, mblk_t *mp)
{
	ill->ill_capab_pending_cnt++;
	ill_dlpi_send(ill, mp);
}
12933 12931
/*
 * Complete one outstanding capability exchange (counted by
 * ill_capability_send()).  Finishes the DL_CAPABILITY_REQ DLPI operation
 * and, once the last exchange has completed with the capabilities
 * negotiated OK (IDCS_OK), calls ill_capability_reset_alloc() --
 * presumably to pre-allocate the reset message; confirm against its
 * definition.
 */
void
ill_capability_done(ill_t *ill)
{
	ASSERT(ill->ill_capab_pending_cnt != 0);
	ASSERT(IAM_WRITER_ILL(ill));

	ill_dlpi_done(ill, DL_CAPABILITY_REQ);

	ill->ill_capab_pending_cnt--;
	if (ill->ill_capab_pending_cnt == 0 &&
	    ill->ill_dlpi_capab_state == IDCS_OK)
		ill_capability_reset_alloc(ill);
}
12950 12945
12951 12946 /*
12952 12947 * Send all deferred DLPI messages without waiting for their ACKs.
12953 12948 */
void
ill_dlpi_send_deferred(ill_t *ill)
{
	mblk_t *mp, *nextmp;

	/*
	 * Clear ill_dlpi_pending so that the message is not queued in
	 * ill_dlpi_send().
	 */
	mutex_enter(&ill->ill_lock);
	ill->ill_dlpi_pending = DL_PRIM_INVAL;
	mp = ill->ill_dlpi_deferred;
	ill->ill_dlpi_deferred = NULL;
	mutex_exit(&ill->ill_lock);

	/*
	 * Re-send each message via ill_dlpi_send(); note that messages
	 * after the first may simply be re-queued there if the first one
	 * sets ill_dlpi_pending again.
	 */
	for (; mp != NULL; mp = nextmp) {
		nextmp = mp->b_next;
		mp->b_next = NULL;
		ill_dlpi_send(ill, mp);
	}
}
12975 12970
12976 12971 /*
12977 12972 * Clear all the deferred DLPI messages. Called on receiving an M_ERROR
12978 12973 * or M_HANGUP
12979 12974 */
12980 12975 static void
12981 12976 ill_dlpi_clear_deferred(ill_t *ill)
12982 12977 {
12983 12978 mblk_t *mp, *nextmp;
12984 12979
12985 12980 mutex_enter(&ill->ill_lock);
12986 12981 ill->ill_dlpi_pending = DL_PRIM_INVAL;
12987 12982 mp = ill->ill_dlpi_deferred;
12988 12983 ill->ill_dlpi_deferred = NULL;
12989 12984 mutex_exit(&ill->ill_lock);
12990 12985
12991 12986 for (; mp != NULL; mp = nextmp) {
12992 12987 nextmp = mp->b_next;
12993 12988 inet_freemsg(mp);
12994 12989 }
12995 12990 }
12996 12991
12997 12992 /*
12998 12993 * Check if the DLPI primitive `prim' is pending; print a warning if not.
12999 12994 */
boolean_t
ill_dlpi_pending(ill_t *ill, t_uscalar_t prim)
{
	t_uscalar_t pending;

	mutex_enter(&ill->ill_lock);
	if (ill->ill_dlpi_pending == prim) {
		mutex_exit(&ill->ill_lock);
		return (B_TRUE);
	}

	/*
	 * During teardown, ill_dlpi_dispatch() will send DLPI requests
	 * without waiting, so don't print any warnings in that case.
	 */
	if (ill->ill_state_flags & ILL_CONDEMNED) {
		mutex_exit(&ill->ill_lock);
		return (B_FALSE);
	}
	/* Snapshot under the lock; log after dropping it */
	pending = ill->ill_dlpi_pending;
	mutex_exit(&ill->ill_lock);

	if (pending == DL_PRIM_INVAL) {
		(void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE,
		    "received unsolicited ack for %s on %s\n",
		    dl_primstr(prim), ill->ill_name);
	} else {
		(void) mi_strlog(ill->ill_rq, 1, SL_CONSOLE|SL_ERROR|SL_TRACE,
		    "received unexpected ack for %s on %s (expecting %s)\n",
		    dl_primstr(prim), ill->ill_name, dl_primstr(pending));
	}
	return (B_FALSE);
}
13033 13028
13034 13029 /*
13035 13030 * Complete the current DLPI operation associated with `prim' on `ill' and
13036 13031 * start the next queued DLPI operation (if any). If there are no queued DLPI
13037 13032 * operations and the ill's current exclusive IPSQ operation has finished
13038 13033 * (i.e., ipsq_current_finish() was called), then clear ipsq_current_ipif to
13039 13034 * allow the next exclusive IPSQ operation to begin upon ipsq_exit(). See
13040 13035 * the comments above ipsq_current_finish() for details.
13041 13036 */
void
ill_dlpi_done(ill_t *ill, t_uscalar_t prim)
{
	mblk_t *mp;
	ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
	ipxop_t *ipx = ipsq->ipsq_xop;

	ASSERT(IAM_WRITER_IPSQ(ipsq));
	mutex_enter(&ill->ill_lock);

	ASSERT(prim != DL_PRIM_INVAL);
	ASSERT(ill->ill_dlpi_pending == prim);

	ip1dbg(("ill_dlpi_done: %s has completed %s (%u)\n", ill->ill_name,
	    dl_primstr(ill->ill_dlpi_pending), ill->ill_dlpi_pending));

	if ((mp = ill->ill_dlpi_deferred) == NULL) {
		/* No queued DLPI work; allow the next IPSQ op to proceed */
		ill->ill_dlpi_pending = DL_PRIM_INVAL;
		if (ipx->ipx_current_done) {
			mutex_enter(&ipx->ipx_lock);
			ipx->ipx_current_ipif = NULL;
			mutex_exit(&ipx->ipx_lock);
		}
		/* Wake any thread waiting for DLPI quiescence on this ill */
		cv_signal(&ill->ill_cv);
		mutex_exit(&ill->ill_lock);
		return;
	}

	/* Dequeue the next deferred message and send it down */
	ill->ill_dlpi_deferred = mp->b_next;
	mp->b_next = NULL;
	mutex_exit(&ill->ill_lock);

	ill_dlpi_dispatch(ill, mp);
}
13076 13071
13077 13072 /*
13078 13073 * Queue a (multicast) DLPI control message to be sent to the driver by
13079 13074 * later calling ill_dlpi_send_queued.
13080 13075 * We queue them while holding a lock (ill_mcast_lock) to ensure that they
13081 13076 * are sent in order i.e., prevent a DL_DISABMULTI_REQ and DL_ENABMULTI_REQ
13082 13077 * for the same group to race.
13083 13078 * We send DLPI control messages in order using ill_lock.
13084 13079 * For IPMP we should be called on the cast_ill.
13085 13080 */
13086 13081 void
13087 13082 ill_dlpi_queue(ill_t *ill, mblk_t *mp)
13088 13083 {
13089 13084 mblk_t **mpp;
13090 13085
13091 13086 ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
13092 13087
13093 13088 mutex_enter(&ill->ill_lock);
13094 13089 /* Must queue message. Tail insertion */
13095 13090 mpp = &ill->ill_dlpi_deferred;
13096 13091 while (*mpp != NULL)
13097 13092 mpp = &((*mpp)->b_next);
13098 13093
13099 13094 *mpp = mp;
13100 13095 mutex_exit(&ill->ill_lock);
13101 13096 }
13102 13097
13103 13098 /*
13104 13099 * Send the messages that were queued. Make sure there is only
13105 13100 * one outstanding message. ip_rput_dlpi_writer calls ill_dlpi_done()
13106 13101 * when an ACK or a NAK is received to process the next queued message.
13107 13102 * For IPMP we are called on the upper ill, but when send what is queued
13108 13103 * on the cast_ill.
13109 13104 */
void
ill_dlpi_send_queued(ill_t *ill)
{
	mblk_t *mp;
	union DL_primitives *dlp;
	t_uscalar_t prim;
	ill_t *release_ill = NULL;

	if (IS_IPMP(ill)) {
		/* On the upper IPMP ill. */
		release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
		if (release_ill == NULL) {
			/* Avoid ever sending anything down to the ipmpstub */
			return;
		}
		ill = release_ill;
	}
	mutex_enter(&ill->ill_lock);
	while ((mp = ill->ill_dlpi_deferred) != NULL) {
		if (ill->ill_dlpi_pending != DL_PRIM_INVAL) {
			/* Can't send. Somebody else will send it */
			mutex_exit(&ill->ill_lock);
			goto done;
		}
		ill->ill_dlpi_deferred = mp->b_next;
		mp->b_next = NULL;
		if (!ill->ill_dl_up) {
			/*
			 * Nobody there. All multicast addresses will be
			 * re-joined when we get the DL_BIND_ACK bringing the
			 * interface up.
			 */
			freemsg(mp);
			continue;
		}
		dlp = (union DL_primitives *)mp->b_rptr;
		prim = dlp->dl_primitive;

		/*
		 * As in ill_dlpi_dispatch(): once condemned, only the
		 * DL_UNBIND_REQ ACK is still delivered, so only then do we
		 * mark a primitive pending.
		 */
		if (!(ill->ill_state_flags & ILL_CONDEMNED) ||
		    (prim == DL_UNBIND_REQ)) {
			ill->ill_dlpi_pending = prim;
		}
		mutex_exit(&ill->ill_lock);

		DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_send_queued",
		    char *, dl_primstr(prim), ill_t *, ill);
		/* Drop the lock around putnext(), then rescan the queue */
		putnext(ill->ill_wq, mp);
		mutex_enter(&ill->ill_lock);
	}
	mutex_exit(&ill->ill_lock);
done:
	if (release_ill != NULL)
		ill_refrele(release_ill);
}
13164 13159
13165 13160 /*
13166 13161 * Queue an IP (IGMP/MLD) message to be sent by IP from
13167 13162 * ill_mcast_send_queued
13168 13163 * We queue them while holding a lock (ill_mcast_lock) to ensure that they
13169 13164 * are sent in order i.e., prevent a IGMP leave and IGMP join for the same
13170 13165 * group to race.
13171 13166 * We send them in order using ill_lock.
13172 13167 * For IPMP we are called on the upper ill, but we queue on the cast_ill.
13173 13168 */
void
ill_mcast_queue(ill_t *ill, mblk_t *mp)
{
	mblk_t **mpp;
	ill_t *release_ill = NULL;

	/* Caller serializes IGMP/MLD ordering via ill_mcast_lock */
	ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));

	if (IS_IPMP(ill)) {
		/* On the upper IPMP ill. */
		release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
		if (release_ill == NULL) {
			/* Discard instead of queuing for the ipmp interface */
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
			ip_drop_output("ipIfStatsOutDiscards - no cast_ill",
			    mp, ill);
			freemsg(mp);
			return;
		}
		ill = release_ill;
	}

	mutex_enter(&ill->ill_lock);
	/* Must queue message. Tail insertion */
	mpp = &ill->ill_mcast_deferred;
	while (*mpp != NULL)
		mpp = &((*mpp)->b_next);

	*mpp = mp;
	mutex_exit(&ill->ill_lock);
	if (release_ill != NULL)
		ill_refrele(release_ill);
}
13207 13202
13208 13203 /*
13209 13204 * Send the IP packets that were queued by ill_mcast_queue.
13210 13205 * These are IGMP/MLD packets.
13211 13206 *
13212 13207 * For IPMP we are called on the upper ill, but when send what is queued
13213 13208 * on the cast_ill.
13214 13209 *
13215 13210 * Request loopback of the report if we are acting as a multicast
13216 13211 * router, so that the process-level routing demon can hear it.
13217 13212 * This will run multiple times for the same group if there are members
13218 13213 * on the same group for multiple ipif's on the same ill. The
13219 13214 * igmp_input/mld_input code will suppress this due to the loopback thus we
13220 13215 * always loopback membership report.
13221 13216 *
13222 13217 * We also need to make sure that this does not get load balanced
13223 13218 * by IPMP. We do this by passing an ill to ip_output_simple.
13224 13219 */
13225 13220 void
13226 13221 ill_mcast_send_queued(ill_t *ill)
13227 13222 {
13228 13223 mblk_t *mp;
13229 13224 ip_xmit_attr_t ixas;
13230 13225 ill_t *release_ill = NULL;
13231 13226
13232 13227 if (IS_IPMP(ill)) {
13233 13228 /* On the upper IPMP ill. */
13234 13229 release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
13235 13230 if (release_ill == NULL) {
13236 13231 /*
13237 13232 * We should have no messages on the ipmp interface
13238 13233 * but no point in trying to send them.
13239 13234 */
13240 13235 return;
13241 13236 }
13242 13237 ill = release_ill;
13243 13238 }
13244 13239 bzero(&ixas, sizeof (ixas));
13245 13240 ixas.ixa_zoneid = ALL_ZONES;
13246 13241 ixas.ixa_cred = kcred;
13247 13242 ixas.ixa_cpid = NOPID;
13248 13243 ixas.ixa_tsl = NULL;
13249 13244 /*
13250 13245 * Here we set ixa_ifindex. If IPMP it will be the lower ill which
13251 13246 * makes ip_select_route pick the IRE_MULTICAST for the cast_ill.
13252 13247 * That is necessary to handle IGMP/MLD snooping switches.
13253 13248 */
13254 13249 ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex;
13255 13250 ixas.ixa_ipst = ill->ill_ipst;
13256 13251
13257 13252 mutex_enter(&ill->ill_lock);
13258 13253 while ((mp = ill->ill_mcast_deferred) != NULL) {
13259 13254 ill->ill_mcast_deferred = mp->b_next;
13260 13255 mp->b_next = NULL;
13261 13256 if (!ill->ill_dl_up) {
13262 13257 /*
13263 13258 * Nobody there. Just drop the ip packets.
13264 13259 * IGMP/MLD will resend later, if this is a replumb.
13265 13260 */
13266 13261 freemsg(mp);
13267 13262 continue;
13268 13263 }
13269 13264 mutex_enter(&ill->ill_phyint->phyint_lock);
13270 13265 if (IS_UNDER_IPMP(ill) && !ipmp_ill_is_active(ill)) {
13271 13266 /*
13272 13267 * When the ill is getting deactivated, we only want to
13273 13268 * send the DLPI messages, so drop IGMP/MLD packets.
13274 13269 * DLPI messages are handled by ill_dlpi_send_queued()
13275 13270 */
13276 13271 mutex_exit(&ill->ill_phyint->phyint_lock);
13277 13272 freemsg(mp);
13278 13273 continue;
13279 13274 }
13280 13275 mutex_exit(&ill->ill_phyint->phyint_lock);
13281 13276 mutex_exit(&ill->ill_lock);
13282 13277
13283 13278 /* Check whether we are sending IPv4 or IPv6. */
13284 13279 if (ill->ill_isv6) {
13285 13280 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
13286 13281
13287 13282 ixas.ixa_multicast_ttl = ip6h->ip6_hops;
13288 13283 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6;
13289 13284 } else {
13290 13285 ipha_t *ipha = (ipha_t *)mp->b_rptr;
13291 13286
13292 13287 ixas.ixa_multicast_ttl = ipha->ipha_ttl;
13293 13288 ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
13294 13289 ixas.ixa_flags &= ~IXAF_SET_ULP_CKSUM;
13295 13290 }
13296 13291 ixas.ixa_flags &= ~IXAF_VERIFY_SOURCE;
13297 13292 ixas.ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_SOURCE;
13298 13293 (void) ip_output_simple(mp, &ixas);
13299 13294 ixa_cleanup(&ixas);
13300 13295
13301 13296 mutex_enter(&ill->ill_lock);
13302 13297 }
13303 13298 mutex_exit(&ill->ill_lock);
13304 13299
13305 13300 done:
13306 13301 if (release_ill != NULL)
13307 13302 ill_refrele(release_ill);
13308 13303 }
13309 13304
13310 13305 /*
13311 13306 * Take down a specific interface, but don't lose any information about it.
13312 13307 * (Always called as writer.)
13313 13308 * This function goes through the down sequence even if the interface is
13314 13309 * already down. There are 2 reasons.
13315 13310 * a. Currently we permit interface routes that depend on down interfaces
13316 13311 * to be added. This behaviour itself is questionable. However it appears
13317 13312 * that both Solaris and 4.3 BSD have exhibited this behaviour for a long
13318 13313 * time. We go thru the cleanup in order to remove these routes.
13319 13314 * b. The bringup of the interface could fail in ill_dl_up i.e. we get
13320 13315 * DL_ERROR_ACK in response to the DL_BIND request. The interface is
13321 13316 * down, but we need to cleanup i.e. do ill_dl_down and
13322 13317 * ip_rput_dlpi_writer (DL_ERROR_ACK) -> ipif_down.
13323 13318 *
13324 13319 * IP-MT notes:
13325 13320 *
13326 13321 * Model of reference to interfaces.
13327 13322 *
13328 13323 * The following members in ipif_t track references to the ipif.
13329 13324 * int ipif_refcnt; Active reference count
13330 13325 *
13331 13326 * The following members in ill_t track references to the ill.
13332 13327 * int ill_refcnt; active refcnt
13333 13328 * uint_t ill_ire_cnt; Number of ires referencing ill
13334 13329 * uint_t ill_ncec_cnt; Number of ncecs referencing ill
13335 13330 * uint_t ill_nce_cnt; Number of nces referencing ill
13336 13331 * uint_t ill_ilm_cnt; Number of ilms referencing ill
13337 13332 *
13338 13333 * Reference to an ipif or ill can be obtained in any of the following ways.
13339 13334 *
13340 13335 * Through the lookup functions ipif_lookup_* / ill_lookup_* functions
13341 13336 * Pointers to ipif / ill from other data structures viz ire and conn.
13342 13337 * Implicit reference to the ipif / ill by holding a reference to the ire.
13343 13338 *
13344 13339 * The ipif/ill lookup functions return a reference held ipif / ill.
13345 13340 * ipif_refcnt and ill_refcnt track the reference counts respectively.
13346 13341 * This is a purely dynamic reference count associated with threads holding
13347 13342 * references to the ipif / ill. Pointers from other structures do not
13348 13343 * count towards this reference count.
13349 13344 *
13350 13345 * ill_ire_cnt is the number of ire's associated with the
13351 13346 * ill. This is incremented whenever a new ire is created referencing the
13352 13347 * ill. This is done atomically inside ire_add_v[46] where the ire is
13353 13348 * actually added to the ire hash table. The count is decremented in
13354 13349 * ire_inactive where the ire is destroyed.
13355 13350 *
13356 13351 * ill_ncec_cnt is the number of ncec's referencing the ill thru ncec_ill.
13357 13352 * This is incremented atomically in
13358 13353 * ndp_add_v4()/ndp_add_v6() where the nce is actually added to the
13359 13354 * table. Similarly it is decremented in ncec_inactive() where the ncec
13360 13355 * is destroyed.
13361 13356 *
13362 13357 * ill_nce_cnt is the number of nce's referencing the ill thru nce_ill. This is
13363 13358 * incremented atomically in nce_add() where the nce is actually added to the
13364 13359 * ill_nce. Similarly it is decremented in nce_inactive() where the nce
13365 13360 * is destroyed.
13366 13361 *
13367 13362 * ill_ilm_cnt is the ilm's reference to the ill. It is incremented in
13368 13363 * ilm_add() and decremented before the ilm is freed in ilm_delete().
13369 13364 *
13370 13365 * Flow of ioctls involving interface down/up
13371 13366 *
13372 13367 * The following is the sequence of an attempt to set some critical flags on an
13373 13368 * up interface.
13374 13369 * ip_sioctl_flags
13375 13370 * ipif_down
13376 13371 * wait for ipif to be quiescent
13377 13372 * ipif_down_tail
13378 13373 * ip_sioctl_flags_tail
13379 13374 *
13380 13375 * All set ioctls that involve down/up sequence would have a skeleton similar
13381 13376 * to the above. All the *tail functions are called after the refcounts have
13382 13377 * dropped to the appropriate values.
13383 13378 *
13384 13379 * SIOC ioctls during the IPIF_CHANGING interval.
13385 13380 *
13386 13381 * Threads handling SIOC set ioctls serialize on the squeue, but this
13387 13382 * is not done for SIOC get ioctls. Since a set ioctl can cause several
13388 13383 * steps of internal changes to the state, some of which are visible in
13389 13384 * ipif_flags (such as IFF_UP being cleared and later set), and we want
13390 13385 * the set ioctl to be atomic related to the get ioctls, the SIOC get code
13391 13386 * will wait and restart ioctls if IPIF_CHANGING is set. The mblk is then
13392 13387 * enqueued in the ipsq and the operation is restarted by ipsq_exit() when
13393 13388 * the current exclusive operation completes. The IPIF_CHANGING check
13394 13389 * and enqueue is atomic using the ill_lock and ipsq_lock. The
13395 13390 * lookup is done holding the ill_lock. Hence the ill/ipif state flags can't
13396 13391 * change while the ill_lock is held. Before dropping the ill_lock we acquire
13397 13392 * the ipsq_lock and call ipsq_enq. This ensures that ipsq_exit can't finish
13398 13393 * until we release the ipsq_lock, even though the ill/ipif state flags
13399 13394 * can change after we drop the ill_lock.
13400 13395 */
/*
 * Take the ipif down. Returns 0 when the down has fully completed,
 * EINPROGRESS when `mp' has been enqueued waiting for quiescence, or
 * EINTR when the conn is closing. See the block comment above for the
 * overall down/up model.
 */
int
ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
{
	ill_t		*ill = ipif->ipif_ill;
	conn_t		*connp;
	boolean_t	success;
	boolean_t	ipif_was_up = B_FALSE;
	ip_stack_t	*ipst = ill->ill_ipst;

	ASSERT(IAM_WRITER_IPIF(ipif));

	ip1dbg(("ipif_down(%s:%u)\n", ill->ill_name, ipif->ipif_id));

	DTRACE_PROBE3(ipif__downup, char *, "ipif_down",
	    ill_t *, ill, ipif_t *, ipif);

	if (ipif->ipif_flags & IPIF_UP) {
		/* Clear IPIF_UP and drop the ill's up count under ill_lock. */
		mutex_enter(&ill->ill_lock);
		ipif->ipif_flags &= ~IPIF_UP;
		ASSERT(ill->ill_ipif_up_count > 0);
		--ill->ill_ipif_up_count;
		mutex_exit(&ill->ill_lock);
		ipif_was_up = B_TRUE;
		/* Update status in SCTP's list */
		sctp_update_ipif(ipif, SCTP_IPIF_DOWN);
		ill_nic_event_dispatch(ipif->ipif_ill,
		    MAP_IPIF_ID(ipif->ipif_id), NE_LIF_DOWN, NULL, 0);
	}

	/*
	 * Removal of the last ipif from an ill may result in a DL_UNBIND
	 * being sent to the driver, and we must not send any data packets to
	 * the driver after the DL_UNBIND_REQ. To ensure this, all the
	 * ire and nce entries used in the data path will be cleaned
	 * up, and we also set the ILL_DOWN_IN_PROGRESS bit to make
	 * sure no new entries will be added until the ill is bound
	 * again. The ILL_DOWN_IN_PROGRESS bit is turned off upon
	 * receipt of a DL_BIND_ACK.
	 */
	if (ill->ill_wq != NULL && !ill->ill_logical_down &&
	    ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 &&
	    ill->ill_dl_up) {
		ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS;
	}

	/*
	 * Blow away memberships we established in ipif_multicast_up().
	 */
	ipif_multicast_down(ipif);

	/*
	 * Remove from the mapping for __sin6_src_id. We insert only
	 * when the address is not INADDR_ANY. As IPv4 addresses are
	 * stored as mapped addresses, we need to check for mapped
	 * INADDR_ANY also.
	 */
	if (ipif_was_up && !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) &&
	    !IN6_IS_ADDR_V4MAPPED_ANY(&ipif->ipif_v6lcl_addr) &&
	    !(ipif->ipif_flags & IPIF_NOLOCAL)) {
		int err;

		err = ip_srcid_remove(&ipif->ipif_v6lcl_addr,
		    ipif->ipif_zoneid, ipst);
		if (err != 0) {
			/* Best-effort: log and continue with the down. */
			ip0dbg(("ipif_down: srcid_remove %d\n", err));
		}
	}

	if (ipif_was_up) {
		/* only delete if we'd added ire's before */
		if (ipif->ipif_isv6)
			ipif_delete_ires_v6(ipif);
		else
			ipif_delete_ires_v4(ipif);
	}

	if (ipif_was_up && ill->ill_ipif_up_count == 0) {
		/*
		 * Since the interface is now down, it may have just become
		 * inactive. Note that this needs to be done even for a
		 * ill_logical_down(), or ARP entries will not get correctly
		 * restored when the interface comes back up.
		 */
		if (IS_UNDER_IPMP(ill))
			ipmp_ill_refresh_active(ill);
	}

	/*
	 * Clean up the neighbor-discovery or arp entries for this interface.
	 * The ipif has to be quiesced, so we walk all the nce's and delete
	 * those that point at the ipif->ipif_ill. At the same time, we also
	 * update IPMP so that ipifs for data addresses are unbound. We dont
	 * call ipif_arp_down to DL_UNBIND the arp stream itself here, but
	 * defer that for ipif_down_tail()
	 */
	ipif_nce_down(ipif);

	/*
	 * If this is the last ipif on the ill, we also need to remove
	 * any IREs with ire_ill set. Otherwise ipif_is_quiescent() will
	 * never succeed.
	 */
	if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0)
		ire_walk_ill(0, 0, ill_downi, ill, ill);

	/*
	 * Walk all CONNs that can have a reference on an ire for this
	 * ipif (we actually walk all that now have stale references).
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst);

	/*
	 * If mp is NULL the caller will wait for the appropriate refcnt.
	 * Eg. ip_sioctl_removeif -> ipif_free -> ipif_down
	 * and ill_delete -> ipif_free -> ipif_down
	 */
	if (mp == NULL) {
		ASSERT(q == NULL);
		return (0);
	}

	if (CONN_Q(q)) {
		connp = Q_TO_CONN(q);
		mutex_enter(&connp->conn_lock);
	} else {
		connp = NULL;
	}
	mutex_enter(&ill->ill_lock);
	/*
	 * Are there any ire's pointing to this ipif that are still active ?
	 * If this is the last ipif going down, are there any ire's pointing
	 * to this ill that are still active ?
	 */
	if (ipif_is_quiescent(ipif)) {
		mutex_exit(&ill->ill_lock);
		if (connp != NULL)
			mutex_exit(&connp->conn_lock);
		return (0);
	}

	ip1dbg(("ipif_down: need to wait, adding pending mp %s ill %p",
	    ill->ill_name, (void *)ill));
	/*
	 * Enqueue the mp atomically in ipsq_pending_mp. When the refcount
	 * drops down, the operation will be restarted by ipif_ill_refrele_tail
	 * which in turn is called by the last refrele on the ipif/ill/ire.
	 */
	success = ipsq_pending_mp_add(connp, ipif, q, mp, IPIF_DOWN);
	if (!success) {
		/* The conn is closing. So just return */
		ASSERT(connp != NULL);
		mutex_exit(&ill->ill_lock);
		mutex_exit(&connp->conn_lock);
		return (EINTR);
	}

	mutex_exit(&ill->ill_lock);
	if (connp != NULL)
		mutex_exit(&connp->conn_lock);
	return (EINPROGRESS);
}
13562 13557
13563 13558 int
13564 13559 ipif_down_tail(ipif_t *ipif)
13565 13560 {
13566 13561 ill_t *ill = ipif->ipif_ill;
13567 13562 int err = 0;
13568 13563
13569 13564 DTRACE_PROBE3(ipif__downup, char *, "ipif_down_tail",
13570 13565 ill_t *, ill, ipif_t *, ipif);
13571 13566
13572 13567 /*
13573 13568 * Skip any loopback interface (null wq).
13574 13569 * If this is the last logical interface on the ill
13575 13570 * have ill_dl_down tell the driver we are gone (unbind)
13576 13571 * Note that lun 0 can ipif_down even though
13577 13572 * there are other logical units that are up.
13578 13573 * This occurs e.g. when we change a "significant" IFF_ flag.
13579 13574 */
13580 13575 if (ill->ill_wq != NULL && !ill->ill_logical_down &&
13581 13576 ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 &&
13582 13577 ill->ill_dl_up) {
13583 13578 ill_dl_down(ill);
13584 13579 }
13585 13580 if (!ipif->ipif_isv6)
13586 13581 err = ipif_arp_down(ipif);
13587 13582
13588 13583 ill->ill_logical_down = 0;
13589 13584
13590 13585 ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
13591 13586 ip_rts_newaddrmsg(RTM_DELETE, 0, ipif, RTSQ_DEFAULT);
13592 13587 return (err);
13593 13588 }
13594 13589
13595 13590 /*
13596 13591 * Bring interface logically down without bringing the physical interface
13597 13592 * down e.g. when the netmask is changed. This avoids long lasting link
13598 13593 * negotiations between an ethernet interface and a certain switches.
13599 13594 */
13600 13595 static int
13601 13596 ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp)
13602 13597 {
13603 13598 DTRACE_PROBE3(ipif__downup, char *, "ipif_logical_down",
13604 13599 ill_t *, ipif->ipif_ill, ipif_t *, ipif);
13605 13600
13606 13601 /*
13607 13602 * The ill_logical_down flag is a transient flag. It is set here
13608 13603 * and is cleared once the down has completed in ipif_down_tail.
13609 13604 * This flag does not indicate whether the ill stream is in the
13610 13605 * DL_BOUND state with the driver. Instead this flag is used by
13611 13606 * ipif_down_tail to determine whether to DL_UNBIND the stream with
13612 13607 * the driver. The state of the ill stream i.e. whether it is
13613 13608 * DL_BOUND with the driver or not is indicated by the ill_dl_up flag.
13614 13609 */
13615 13610 ipif->ipif_ill->ill_logical_down = 1;
13616 13611 return (ipif_down(ipif, q, mp));
13617 13612 }
13618 13613
13619 13614 /*
13620 13615 * Initiate deallocate of an IPIF. Always called as writer. Called by
13621 13616 * ill_delete or ip_sioctl_removeif.
13622 13617 */
13623 13618 static void
13624 13619 ipif_free(ipif_t *ipif)
13625 13620 {
13626 13621 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
13627 13622
13628 13623 ASSERT(IAM_WRITER_IPIF(ipif));
13629 13624
13630 13625 if (ipif->ipif_recovery_id != 0)
13631 13626 (void) untimeout(ipif->ipif_recovery_id);
13632 13627 ipif->ipif_recovery_id = 0;
13633 13628
13634 13629 /*
13635 13630 * Take down the interface. We can be called either from ill_delete
13636 13631 * or from ip_sioctl_removeif.
13637 13632 */
13638 13633 (void) ipif_down(ipif, NULL, NULL);
13639 13634
13640 13635 /*
13641 13636 * Now that the interface is down, there's no chance it can still
13642 13637 * become a duplicate. Cancel any timer that may have been set while
13643 13638 * tearing down.
13644 13639 */
13645 13640 if (ipif->ipif_recovery_id != 0)
13646 13641 (void) untimeout(ipif->ipif_recovery_id);
13647 13642 ipif->ipif_recovery_id = 0;
13648 13643
13649 13644 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
13650 13645 /* Remove pointers to this ill in the multicast routing tables */
13651 13646 reset_mrt_vif_ipif(ipif);
13652 13647 /* If necessary, clear the cached source ipif rotor. */
13653 13648 if (ipif->ipif_ill->ill_src_ipif == ipif)
13654 13649 ipif->ipif_ill->ill_src_ipif = NULL;
13655 13650 rw_exit(&ipst->ips_ill_g_lock);
13656 13651 }
13657 13652
/*
 * Final stage of ipif deallocation: unlink the (now quiesced) ipif from
 * its ill and free the memory. The ipif must not be looked up again
 * after this returns.
 */
static void
ipif_free_tail(ipif_t *ipif)
{
	ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;

	/*
	 * Need to hold both ill_g_lock and ill_lock while
	 * inserting or removing an ipif from the linked list
	 * of ipifs hanging off the ill.
	 */
	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);

#ifdef DEBUG
	/* Release any outstanding reference-trace records. */
	ipif_trace_cleanup(ipif);
#endif

	/* Ask SCTP to take it out of its list */
	sctp_update_ipif(ipif, SCTP_IPIF_REMOVE);
	ip_rts_newaddrmsg(RTM_FREEADDR, 0, ipif, RTSQ_DEFAULT);

	/* Get it out of the ILL interface list. */
	ipif_remove(ipif);
	rw_exit(&ipst->ips_ill_g_lock);

	/* By now the ipif must be fully quiesced and torn down. */
	ASSERT(!(ipif->ipif_flags & (IPIF_UP | IPIF_DUPLICATE)));
	ASSERT(ipif->ipif_recovery_id == 0);
	ASSERT(ipif->ipif_ire_local == NULL);
	ASSERT(ipif->ipif_ire_if == NULL);

	/* Free the memory. */
	mi_free(ipif);
}
13690 13685
13691 13686 /*
13692 13687 * Sets `buf' to an ipif name of the form "ill_name:id", or "ill_name" if "id"
13693 13688 * is zero.
13694 13689 */
13695 13690 void
13696 13691 ipif_get_name(const ipif_t *ipif, char *buf, int len)
13697 13692 {
13698 13693 char lbuf[LIFNAMSIZ];
13699 13694 char *name;
13700 13695 size_t name_len;
13701 13696
13702 13697 buf[0] = '\0';
13703 13698 name = ipif->ipif_ill->ill_name;
13704 13699 name_len = ipif->ipif_ill->ill_name_length;
13705 13700 if (ipif->ipif_id != 0) {
13706 13701 (void) sprintf(lbuf, "%s%c%d", name, IPIF_SEPARATOR_CHAR,
13707 13702 ipif->ipif_id);
13708 13703 name = lbuf;
13709 13704 name_len = mi_strlen(name) + 1;
13710 13705 }
13711 13706 len -= 1;
13712 13707 buf[len] = '\0';
13713 13708 len = MIN(len, name_len);
13714 13709 bcopy(name, buf, len);
13715 13710 }
13716 13711
13717 13712 /*
13718 13713 * Sets `buf' to an ill name.
13719 13714 */
13720 13715 void
13721 13716 ill_get_name(const ill_t *ill, char *buf, int len)
13722 13717 {
13723 13718 char *name;
13724 13719 size_t name_len;
13725 13720
13726 13721 name = ill->ill_name;
13727 13722 name_len = ill->ill_name_length;
13728 13723 len -= 1;
13729 13724 buf[len] = '\0';
13730 13725 len = MIN(len, name_len);
13731 13726 bcopy(name, buf, len);
13732 13727 }
13733 13728
13734 13729 /*
13735 13730 * Find an IPIF based on the name passed in. Names can be of the form <phys>
13736 13731 * (e.g., le0) or <phys>:<#> (e.g., le0:1). When there is no colon, the
13737 13732 * implied unit id is zero. <phys> must correspond to the name of an ILL.
13738 13733 * (May be called as writer.)
13739 13734 */
static ipif_t *
ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc,
    boolean_t *exists, boolean_t isv6, zoneid_t zoneid, ip_stack_t *ipst)
{
	char	*cp;
	char	*endp;
	long	id;
	ill_t	*ill;
	ipif_t	*ipif;
	uint_t	ire_type;
	boolean_t did_alloc = B_FALSE;
	char	last;

	/*
	 * If the caller wants us to create the ipif, make sure we have a
	 * valid zoneid
	 */
	ASSERT(!do_alloc || zoneid != ALL_ZONES);

	if (namelen == 0) {
		return (NULL);
	}

	*exists = B_FALSE;
	/* Look for a colon in the name. */
	endp = &name[namelen];
	for (cp = endp; --cp > name; ) {
		if (*cp == IPIF_SEPARATOR_CHAR)
			break;
	}

	if (*cp == IPIF_SEPARATOR_CHAR) {
		/*
		 * Reject any non-decimal aliases for logical
		 * interfaces. Aliases with leading zeroes
		 * are also rejected as they introduce ambiguity
		 * in the naming of the interfaces.
		 * In order to conform to existing semantics,
		 * and to not break any programs/script relying
		 * on that behaviour, if<0>:0 is considered to be
		 * a valid interface.
		 *
		 * If alias has two or more digits and the first
		 * is zero, fail.
		 */
		if (&cp[2] < endp && cp[1] == '0') {
			return (NULL);
		}
	}

	/* No separator found: treat the whole string as the ill name. */
	if (cp <= name) {
		cp = endp;
	}
	/*
	 * Temporarily NUL-terminate the ill-name portion in place; the
	 * original byte is restored right after the lookup.
	 */
	last = *cp;
	*cp = '\0';

	/*
	 * Look up the ILL, based on the portion of the name
	 * before the slash. ill_lookup_on_name returns a held ill.
	 * Temporary to check whether ill exists already. If so
	 * ill_lookup_on_name will clear it.
	 */
	ill = ill_lookup_on_name(name, do_alloc, isv6,
	    &did_alloc, ipst);
	*cp = last;
	if (ill == NULL)
		return (NULL);

	/* Establish the unit number in the name. */
	id = 0;
	/*
	 * NOTE(review): this reads name[namelen]; assumes the caller's
	 * buffer is NUL-terminated at namelen -- confirm at call sites.
	 */
	if (cp < endp && *endp == '\0') {
		/* If there was a colon, the unit number follows. */
		cp++;
		if (ddi_strtol(cp, NULL, 0, &id) != 0) {
			ill_refrele(ill);
			return (NULL);
		}
	}

	mutex_enter(&ill->ill_lock);
	/* Now see if there is an IPIF with this unit number. */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if (ipif->ipif_id == id) {
			/* Enforce zone visibility before returning a match. */
			if (zoneid != ALL_ZONES &&
			    zoneid != ipif->ipif_zoneid &&
			    ipif->ipif_zoneid != ALL_ZONES) {
				mutex_exit(&ill->ill_lock);
				ill_refrele(ill);
				return (NULL);
			}
			if (IPIF_CAN_LOOKUP(ipif)) {
				/* Return the ipif held, ill released. */
				ipif_refhold_locked(ipif);
				mutex_exit(&ill->ill_lock);
				if (!did_alloc)
					*exists = B_TRUE;
				/*
				 * Drop locks before calling ill_refrele
				 * since it can potentially call into
				 * ipif_ill_refrele_tail which can end up
				 * in trying to acquire any lock.
				 */
				ill_refrele(ill);
				return (ipif);
			}
		}
	}

	if (!do_alloc) {
		mutex_exit(&ill->ill_lock);
		ill_refrele(ill);
		return (NULL);
	}

	/*
	 * If none found, atomically allocate and return a new one.
	 * Historically, we used IRE_LOOPBACK only for lun 0, and IRE_LOCAL
	 * to support "receive only" use of lo0:1 etc. as is still done
	 * below as an initial guess.
	 * However, this is now likely to be overriden later in ipif_up_done()
	 * when we know for sure what address has been configured on the
	 * interface, since we might have more than one loopback interface
	 * with a loopback address, e.g. in the case of zones, and all the
	 * interfaces with loopback addresses need to be marked IRE_LOOPBACK.
	 */
	if (ill->ill_net_type == IRE_LOOPBACK && id == 0)
		ire_type = IRE_LOOPBACK;
	else
		ire_type = IRE_LOCAL;
	ipif = ipif_allocate(ill, id, ire_type, B_TRUE, B_TRUE, NULL);
	if (ipif != NULL)
		ipif_refhold_locked(ipif);
	mutex_exit(&ill->ill_lock);
	ill_refrele(ill);
	return (ipif);
}
13875 13870
13876 13871 /*
13877 13872 * Variant of the above that queues the request on the ipsq when
13878 13873 * IPIF_CHANGING is set.
13879 13874 */
static ipif_t *
ipif_lookup_on_name_async(char *name, size_t namelen, boolean_t isv6,
    zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func, int *error,
    ip_stack_t *ipst)
{
	char	*cp;
	char	*endp;
	long	id;
	ill_t	*ill;
	ipif_t	*ipif;
	boolean_t did_alloc = B_FALSE;
	ipsq_t	*ipsq;

	/* `error' is optional; 0 means success when set. */
	if (error != NULL)
		*error = 0;

	if (namelen == 0) {
		if (error != NULL)
			*error = ENXIO;
		return (NULL);
	}

	/* Look for a colon in the name. */
	endp = &name[namelen];
	for (cp = endp; --cp > name; ) {
		if (*cp == IPIF_SEPARATOR_CHAR)
			break;
	}

	if (*cp == IPIF_SEPARATOR_CHAR) {
		/*
		 * Reject any non-decimal aliases for logical
		 * interfaces. Aliases with leading zeroes
		 * are also rejected as they introduce ambiguity
		 * in the naming of the interfaces.
		 * In order to conform to existing semantics,
		 * and to not break any programs/script relying
		 * on that behaviour, if<0>:0 is considered to be
		 * a valid interface.
		 *
		 * If alias has two or more digits and the first
		 * is zero, fail.
		 */
		if (&cp[2] < endp && cp[1] == '0') {
			if (error != NULL)
				*error = EINVAL;
			return (NULL);
		}
	}

	/*
	 * No separator: look up the whole string; otherwise temporarily
	 * NUL-terminate the ill-name portion (restored below).
	 */
	if (cp <= name) {
		cp = endp;
	} else {
		*cp = '\0';
	}

	/*
	 * Look up the ILL, based on the portion of the name
	 * before the slash. ill_lookup_on_name returns a held ill.
	 * Temporary to check whether ill exists already. If so
	 * ill_lookup_on_name will clear it.
	 */
	ill = ill_lookup_on_name(name, B_FALSE, isv6, &did_alloc, ipst);
	if (cp != endp)
		*cp = IPIF_SEPARATOR_CHAR;
	if (ill == NULL)
		return (NULL);

	/* Establish the unit number in the name. */
	id = 0;
	/*
	 * NOTE(review): this reads name[namelen]; assumes the caller's
	 * buffer is NUL-terminated at namelen -- confirm at call sites.
	 */
	if (cp < endp && *endp == '\0') {
		/* If there was a colon, the unit number follows. */
		cp++;
		if (ddi_strtol(cp, NULL, 0, &id) != 0) {
			ill_refrele(ill);
			if (error != NULL)
				*error = ENXIO;
			return (NULL);
		}
	}

	/* Hold the conn lock (if q is a conn queue) across the check/enq. */
	GRAB_CONN_LOCK(q);
	mutex_enter(&ill->ill_lock);
	/* Now see if there is an IPIF with this unit number. */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if (ipif->ipif_id == id) {
			/* Enforce zone visibility before returning a match. */
			if (zoneid != ALL_ZONES &&
			    zoneid != ipif->ipif_zoneid &&
			    ipif->ipif_zoneid != ALL_ZONES) {
				mutex_exit(&ill->ill_lock);
				RELEASE_CONN_LOCK(q);
				ill_refrele(ill);
				if (error != NULL)
					*error = ENXIO;
				return (NULL);
			}

			/* Usable now, or we are the exclusive writer. */
			if (!(IPIF_IS_CHANGING(ipif) ||
			    IPIF_IS_CONDEMNED(ipif)) ||
			    IAM_WRITER_IPIF(ipif)) {
				ipif_refhold_locked(ipif);
				mutex_exit(&ill->ill_lock);
				/*
				 * Drop locks before calling ill_refrele
				 * since it can potentially call into
				 * ipif_ill_refrele_tail which can end up
				 * in trying to acquire any lock.
				 */
				RELEASE_CONN_LOCK(q);
				ill_refrele(ill);
				return (ipif);
			} else if (q != NULL && !IPIF_IS_CONDEMNED(ipif)) {
				/*
				 * IPIF_CHANGING: park the request on the
				 * ipsq; ipsq_exit() will restart it when the
				 * current exclusive operation completes.
				 */
				ipsq = ill->ill_phyint->phyint_ipsq;
				mutex_enter(&ipsq->ipsq_lock);
				mutex_enter(&ipsq->ipsq_xop->ipx_lock);
				mutex_exit(&ill->ill_lock);
				ipsq_enq(ipsq, q, mp, func, NEW_OP, ill);
				mutex_exit(&ipsq->ipsq_xop->ipx_lock);
				mutex_exit(&ipsq->ipsq_lock);
				RELEASE_CONN_LOCK(q);
				ill_refrele(ill);
				if (error != NULL)
					*error = EINPROGRESS;
				return (NULL);
			}
		}
	}
	RELEASE_CONN_LOCK(q);
	mutex_exit(&ill->ill_lock);
	ill_refrele(ill);
	if (error != NULL)
		*error = ENXIO;
	return (NULL);
}
14014 14009
14015 14010 /*
14016 14011 * This routine is called whenever a new address comes up on an ipif. If
14017 14012 * we are configured to respond to address mask requests, then we are supposed
14018 14013 * to broadcast an address mask reply at this time. This routine is also
14019 14014 * called if we are already up, but a netmask change is made. This is legal
14020 14015 * but might not make the system manager very popular. (May be called
14021 14016 * as writer.)
14022 14017 */
void
ipif_mask_reply(ipif_t *ipif)
{
	icmph_t	*icmph;
	ipha_t	*ipha;
	mblk_t	*mp;
	ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
	ip_xmit_attr_t ixas;

/* Total on-wire size: IP header + ICMP header + the 4-byte mask payload. */
#define	REPLY_LEN	(sizeof (icmp_ipha) + sizeof (icmph_t) + IP_ADDR_LEN)

	/* Honor the stack-wide tunable before doing any work. */
	if (!ipst->ips_ip_respond_to_address_mask_broadcast)
		return;

	/* ICMP mask reply is IPv4 only */
	ASSERT(!ipif->ipif_isv6);
	/* ICMP mask reply is not for a loopback interface */
	ASSERT(ipif->ipif_ill->ill_wq != NULL);

	if (ipif->ipif_lcl_addr == INADDR_ANY)
		return;

	/* Allocation failure is silently tolerated; the reply is optional. */
	mp = allocb(REPLY_LEN, BPRI_HI);
	if (mp == NULL)
		return;
	mp->b_wptr = mp->b_rptr + REPLY_LEN;

	/* Build the IPv4 header from the icmp_ipha template. */
	ipha = (ipha_t *)mp->b_rptr;
	bzero(ipha, REPLY_LEN);
	*ipha = icmp_ipha;
	ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl;
	ipha->ipha_src = ipif->ipif_lcl_addr;
	ipha->ipha_dst = ipif->ipif_brd_addr;
	ipha->ipha_length = htons(REPLY_LEN);
	ipha->ipha_ident = 0;

	/* ICMP header follows the IP header; the mask follows the ICMP hdr. */
	icmph = (icmph_t *)&ipha[1];
	icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
	bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN);
	/* Checksum must be computed after all ICMP fields are filled in. */
	icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0);

	bzero(&ixas, sizeof (ixas));
	ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4;
	ixas.ixa_zoneid = ALL_ZONES;
	ixas.ixa_ifindex = 0;
	ixas.ixa_ipst = ipst;
	ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
	(void) ip_output_simple(mp, &ixas);
	ixa_cleanup(&ixas);
#undef	REPLY_LEN
}
14074 14069
/*
 * Join the ipif specific multicast groups.
 * Must be called after a mapping has been set up in the resolver. (Always
 * called as writer.)
 *
 * For IPv6 this joins all-hosts and the solicited-node group for the local
 * address; for IPv4 it joins the all-hosts (224.0.0.1) group. The resulting
 * ilms are stashed on the ipif so ipif_multicast_down() can undo them.
 */
void
ipif_multicast_up(ipif_t *ipif)
{
	int err;
	ill_t *ill;
	ilm_t *ilm;

	ASSERT(IAM_WRITER_IPIF(ipif));

	ill = ipif->ipif_ill;

	ip1dbg(("ipif_multicast_up\n"));
	/*
	 * Nothing to do if the link doesn't support multicast, or if we
	 * have already joined (ipif_allhosts_ilm is our join marker).
	 */
	if (!(ill->ill_flags & ILLF_MULTICAST) ||
	    ipif->ipif_allhosts_ilm != NULL)
		return;

	if (ipif->ipif_isv6) {
		in6_addr_t v6allmc = ipv6_all_hosts_mcast;
		in6_addr_t v6solmc = ipv6_solicited_node_mcast;

		/*
		 * Form the solicited-node group by folding the low 32 bits
		 * of the local address into the well-known prefix.
		 */
		v6solmc.s6_addr32[3] |= ipif->ipif_v6lcl_addr.s6_addr32[3];

		if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr))
			return;

		ip1dbg(("ipif_multicast_up - addmulti\n"));

		/*
		 * Join the all hosts multicast address. We skip this for
		 * underlying IPMP interfaces since they should be invisible.
		 */
		if (!IS_UNDER_IPMP(ill)) {
			ilm = ip_addmulti(&v6allmc, ill, ipif->ipif_zoneid,
			    &err);
			if (ilm == NULL) {
				ASSERT(err != 0);
				ip0dbg(("ipif_multicast_up: "
				    "all_hosts_mcast failed %d\n", err));
				return;
			}
			ipif->ipif_allhosts_ilm = ilm;
		}

		/*
		 * Enable multicast for the solicited node multicast address.
		 * If IPMP we need to put the membership on the upper ill.
		 */
		if (!(ipif->ipif_flags & IPIF_NOLOCAL)) {
			ill_t *mcast_ill = NULL;
			boolean_t need_refrele;

			if (IS_UNDER_IPMP(ill) &&
			    (mcast_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) {
				need_refrele = B_TRUE;
			} else {
				mcast_ill = ill;
				need_refrele = B_FALSE;
			}

			ilm = ip_addmulti(&v6solmc, mcast_ill,
			    ipif->ipif_zoneid, &err);
			if (need_refrele)
				ill_refrele(mcast_ill);

			if (ilm == NULL) {
				ASSERT(err != 0);
				ip0dbg(("ipif_multicast_up: solicited MC"
				    " failed %d\n", err));
				/*
				 * Unwind the earlier all-hosts join so a
				 * partial failure leaves no memberships.
				 */
				if ((ilm = ipif->ipif_allhosts_ilm) != NULL) {
					ipif->ipif_allhosts_ilm = NULL;
					(void) ip_delmulti(ilm);
				}
				return;
			}
			ipif->ipif_solmulti_ilm = ilm;
		}
	} else {
		in6_addr_t v6group;

		/*
		 * Skip if no local address yet, or for underlying IPMP
		 * interfaces (which should stay invisible).
		 */
		if (ipif->ipif_lcl_addr == INADDR_ANY || IS_UNDER_IPMP(ill))
			return;

		/* Join the all hosts multicast address */
		ip1dbg(("ipif_multicast_up - addmulti\n"));
		IN6_IPADDR_TO_V4MAPPED(htonl(INADDR_ALLHOSTS_GROUP), &v6group);

		ilm = ip_addmulti(&v6group, ill, ipif->ipif_zoneid, &err);
		if (ilm == NULL) {
			ASSERT(err != 0);
			ip0dbg(("ipif_multicast_up: failed %d\n", err));
			return;
		}
		ipif->ipif_allhosts_ilm = ilm;
	}
}
14175 14170
14176 14171 /*
14177 14172 * Blow away any multicast groups that we joined in ipif_multicast_up().
14178 14173 * (ilms from explicit memberships are handled in conn_update_ill.)
14179 14174 */
14180 14175 void
14181 14176 ipif_multicast_down(ipif_t *ipif)
14182 14177 {
14183 14178 ASSERT(IAM_WRITER_IPIF(ipif));
14184 14179
14185 14180 ip1dbg(("ipif_multicast_down\n"));
14186 14181
14187 14182 if (ipif->ipif_allhosts_ilm != NULL) {
14188 14183 (void) ip_delmulti(ipif->ipif_allhosts_ilm);
14189 14184 ipif->ipif_allhosts_ilm = NULL;
14190 14185 }
14191 14186 if (ipif->ipif_solmulti_ilm != NULL) {
14192 14187 (void) ip_delmulti(ipif->ipif_solmulti_ilm);
14193 14188 ipif->ipif_solmulti_ilm = NULL;
14194 14189 }
14195 14190 }
14196 14191
/*
 * Used when an interface comes up to recreate any extra routes on this
 * interface.
 *
 * Walks the ifrt_t entries saved on ill_saved_ire_mp (held under
 * ill_saved_ire_lock for the duration) and re-adds each as an IRE.
 * Returns 0, or ENOMEM if an IRE could not be created; in the ENOMEM
 * case earlier entries that were already re-added are left in place.
 */
int
ill_recover_saved_ire(ill_t *ill)
{
	mblk_t *mp;
	ip_stack_t *ipst = ill->ill_ipst;

	ip1dbg(("ill_recover_saved_ire(%s)", ill->ill_name));

	mutex_enter(&ill->ill_saved_ire_lock);
	for (mp = ill->ill_saved_ire_mp; mp != NULL; mp = mp->b_cont) {
		ire_t *ire, *nire;
		ifrt_t *ifrt;

		/* Each mblk in the chain holds one saved route record. */
		ifrt = (ifrt_t *)mp->b_rptr;
		/*
		 * Create a copy of the IRE with the saved address and netmask.
		 */
		if (ill->ill_isv6) {
			ire = ire_create_v6(
			    &ifrt->ifrt_v6addr,
			    &ifrt->ifrt_v6mask,
			    &ifrt->ifrt_v6gateway_addr,
			    ifrt->ifrt_type,
			    ill,
			    ifrt->ifrt_zoneid,
			    ifrt->ifrt_flags,
			    NULL,
			    ipst);
		} else {
			ire = ire_create(
			    (uint8_t *)&ifrt->ifrt_addr,
			    (uint8_t *)&ifrt->ifrt_mask,
			    (uint8_t *)&ifrt->ifrt_gateway_addr,
			    ifrt->ifrt_type,
			    ill,
			    ifrt->ifrt_zoneid,
			    ifrt->ifrt_flags,
			    NULL,
			    ipst);
		}
		if (ire == NULL) {
			mutex_exit(&ill->ill_saved_ire_lock);
			return (ENOMEM);
		}

		/* Restore any saved RTF_SETSRC source address override. */
		if (ifrt->ifrt_flags & RTF_SETSRC) {
			if (ill->ill_isv6) {
				ire->ire_setsrc_addr_v6 =
				    ifrt->ifrt_v6setsrc_addr;
			} else {
				ire->ire_setsrc_addr = ifrt->ifrt_setsrc_addr;
			}
		}

		/*
		 * Some software (for example, GateD and Sun Cluster) attempts
		 * to create (what amount to) IRE_PREFIX routes with the
		 * loopback address as the gateway. This is primarily done to
		 * set up prefixes with the RTF_REJECT flag set (for example,
		 * when generating aggregate routes.)
		 *
		 * If the IRE type (as defined by ill->ill_net_type) is
		 * IRE_LOOPBACK, then we map the request into a
		 * IRE_IF_NORESOLVER.
		 */
		if (ill->ill_net_type == IRE_LOOPBACK)
			ire->ire_type = IRE_IF_NORESOLVER;

		/*
		 * ire held by ire_add, will be refreled' towards the
		 * the end of ipif_up_done
		 */
		nire = ire_add(ire);
		/*
		 * Check if it was a duplicate entry. This handles
		 * the case of two racing route adds for the same route
		 */
		if (nire == NULL) {
			ip1dbg(("ill_recover_saved_ire: FAILED\n"));
		} else if (nire != ire) {
			ip1dbg(("ill_recover_saved_ire: duplicate ire %p\n",
			    (void *)nire));
			ire_delete(nire);
		} else {
			ip1dbg(("ill_recover_saved_ire: added ire %p\n",
			    (void *)nire));
		}
		if (nire != NULL)
			ire_refrele(nire);
	}
	mutex_exit(&ill->ill_saved_ire_lock);
	return (0);
}
14294 14289
/*
 * Used to set the netmask and broadcast address to default values when the
 * interface is brought up. (Always called as writer.)
 *
 * IPv4: default netmask is the classful natural mask of the local address;
 * IPv6: default netmask is all-ones. The subnet is then derived from the
 * local address and mask (or from the point-to-point destination), and for
 * broadcast-capable IPv4 interfaces the broadcast address is recomputed.
 */
static void
ipif_set_default(ipif_t *ipif)
{
	ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock));

	if (!ipif->ipif_isv6) {
		/*
		 * Interface holds an IPv4 address. Default
		 * mask is the natural netmask.
		 */
		if (!ipif->ipif_net_mask) {
			ipaddr_t v4mask;

			v4mask = ip_net_mask(ipif->ipif_lcl_addr);
			V4MASK_TO_V6(v4mask, ipif->ipif_v6net_mask);
		}
		if (ipif->ipif_flags & IPIF_POINTOPOINT) {
			/* ipif_subnet is ipif_pp_dst_addr for pt-pt */
			ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
		} else {
			/* Subnet = local address masked by the netmask. */
			V6_MASK_COPY(ipif->ipif_v6lcl_addr,
			    ipif->ipif_v6net_mask, ipif->ipif_v6subnet);
		}
		/*
		 * NOTE: SunOS 4.X does this even if the broadcast address
		 * has been already set thus we do the same here.
		 */
		if (ipif->ipif_flags & IPIF_BROADCAST) {
			ipaddr_t v4addr;

			/* Broadcast = subnet with all host bits set. */
			v4addr = ipif->ipif_subnet | ~ipif->ipif_net_mask;
			IN6_IPADDR_TO_V4MAPPED(v4addr, &ipif->ipif_v6brd_addr);
		}
	} else {
		/*
		 * Interface holds an IPv6-only address. Default
		 * mask is all-ones.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask))
			ipif->ipif_v6net_mask = ipv6_all_ones;
		if (ipif->ipif_flags & IPIF_POINTOPOINT) {
			/* ipif_subnet is ipif_pp_dst_addr for pt-pt */
			ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr;
		} else {
			/* Subnet = local address masked by the netmask. */
			V6_MASK_COPY(ipif->ipif_v6lcl_addr,
			    ipif->ipif_v6net_mask, ipif->ipif_v6subnet);
		}
	}
}
14348 14343
/*
 * Return 0 if this address can be used as local address without causing
 * duplicate address problems. Otherwise, return EADDRNOTAVAIL if the address
 * is already up on a different ill, and EADDRINUSE if it's up on the same ill.
 * Note that the same IPv6 link-local address is allowed as long as the ills
 * are not on the same link.
 *
 * As a side effect, point-to-point duplicates cause IPIF_UNNUMBERED to be
 * set on one of the two ipifs rather than being treated as a conflict.
 */
int
ip_addr_availability_check(ipif_t *new_ipif)
{
	in6_addr_t our_v6addr;
	ill_t *ill;
	ipif_t *ipif;
	ill_walk_context_t ctx;
	ip_stack_t *ipst = new_ipif->ipif_ill->ill_ipst;

	ASSERT(IAM_WRITER_IPIF(new_ipif));
	ASSERT(MUTEX_HELD(&ipst->ips_ip_addr_avail_lock));
	ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock));

	/* Start from a clean slate; we may re-mark UNNUMBERED below. */
	new_ipif->ipif_flags &= ~IPIF_UNNUMBERED;
	/* The unspecified address can never conflict. */
	if (IN6_IS_ADDR_UNSPECIFIED(&new_ipif->ipif_v6lcl_addr) ||
	    IN6_IS_ADDR_V4MAPPED_ANY(&new_ipif->ipif_v6lcl_addr))
		return (0);

	our_v6addr = new_ipif->ipif_v6lcl_addr;

	/* Walk every ill of the matching address family. */
	if (new_ipif->ipif_isv6)
		ill = ILL_START_WALK_V6(&ctx, ipst);
	else
		ill = ILL_START_WALK_V4(&ctx, ipst);

	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			/*
			 * Ignore ourselves, ipifs that are down or already
			 * unnumbered, and ipifs with a different address.
			 */
			if ((ipif == new_ipif) ||
			    !(ipif->ipif_flags & IPIF_UP) ||
			    (ipif->ipif_flags & IPIF_UNNUMBERED) ||
			    !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
			    &our_v6addr))
				continue;

			/*
			 * A duplicate involving a point-to-point ipif is
			 * resolved by marking one side unnumbered.  A v6
			 * link/site-local duplicate is fine on a different
			 * link, and loopback duplicates are fine across
			 * distinct zones.  Anything else is a real conflict.
			 */
			if (new_ipif->ipif_flags & IPIF_POINTOPOINT)
				new_ipif->ipif_flags |= IPIF_UNNUMBERED;
			else if (ipif->ipif_flags & IPIF_POINTOPOINT)
				ipif->ipif_flags |= IPIF_UNNUMBERED;
			else if ((IN6_IS_ADDR_LINKLOCAL(&our_v6addr) ||
			    IN6_IS_ADDR_SITELOCAL(&our_v6addr)) &&
			    !IS_ON_SAME_LAN(ill, new_ipif->ipif_ill))
				continue;
			else if (new_ipif->ipif_zoneid != ipif->ipif_zoneid &&
			    ipif->ipif_zoneid != ALL_ZONES && IS_LOOPBACK(ill))
				continue;
			else if (new_ipif->ipif_ill == ill)
				return (EADDRINUSE);
			else
				return (EADDRNOTAVAIL);
		}
	}

	return (0);
}
14411 14406
/*
 * Bring up an ipif: bring up arp/ndp, bring up the DLPI stream, and add
 * IREs for the ipif.
 * When the routine returns EINPROGRESS then mp has been consumed and
 * the ioctl will be acked from ip_rput_dlpi.
 */
int
ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp)
{
	ill_t *ill = ipif->ipif_ill;
	boolean_t isv6 = ipif->ipif_isv6;
	int err = 0;
	boolean_t success;
	uint_t ipif_orig_id;
	ip_stack_t *ipst = ill->ill_ipst;

	ASSERT(IAM_WRITER_IPIF(ipif));

	ip1dbg(("ipif_up(%s:%u)\n", ill->ill_name, ipif->ipif_id));
	DTRACE_PROBE3(ipif__downup, char *, "ipif_up",
	    ill_t *, ill, ipif_t *, ipif);

	/* Shouldn't get here if it is already up. */
	if (ipif->ipif_flags & IPIF_UP)
		return (EALREADY);

	/*
	 * If this is a request to bring up a data address on an interface
	 * under IPMP, then move the address to its IPMP meta-interface and
	 * try to bring it up. One complication is that the zeroth ipif for
	 * an ill is special, in that every ill always has one, and that code
	 * throughout IP dereferences ill->ill_ipif without holding any locks.
	 */
	if (IS_UNDER_IPMP(ill) && ipmp_ipif_is_dataaddr(ipif) &&
	    (!ipif->ipif_isv6 || !V6_IPIF_LINKLOCAL(ipif))) {
		ipif_t *stubipif = NULL, *moveipif = NULL;
		ill_t *ipmp_ill = ipmp_illgrp_ipmp_ill(ill->ill_grp);

		/*
		 * The ipif being brought up should be quiesced. If it's not,
		 * something has gone amiss and we need to bail out. (If it's
		 * quiesced, we know it will remain so via IPIF_CONDEMNED.)
		 */
		mutex_enter(&ill->ill_lock);
		if (!ipif_is_quiescent(ipif)) {
			mutex_exit(&ill->ill_lock);
			return (EINVAL);
		}
		mutex_exit(&ill->ill_lock);

		/*
		 * If we're going to need to allocate ipifs, do it prior
		 * to starting the move (and grabbing locks).
		 */
		if (ipif->ipif_id == 0) {
			if ((moveipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE,
			    B_FALSE, &err)) == NULL) {
				return (err);
			}
			if ((stubipif = ipif_allocate(ill, 0, IRE_LOCAL, B_TRUE,
			    B_FALSE, &err)) == NULL) {
				mi_free(moveipif);
				return (err);
			}
		}

		/*
		 * Grab or transfer the ipif to move. During the move, keep
		 * ill_g_lock held to prevent any ill walker threads from
		 * seeing things in an inconsistent state.
		 */
		rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
		if (ipif->ipif_id != 0) {
			ipif_remove(ipif);
		} else {
			/*
			 * Can't remove the zeroth ipif; swap its contents
			 * into moveipif and leave stubipif in its place.
			 */
			ipif_transfer(ipif, moveipif, stubipif);
			ipif = moveipif;
		}

		/*
		 * Place the ipif on the IPMP ill. If the zeroth ipif on
		 * the IPMP ill is a stub (0.0.0.0 down address) then we
		 * replace that one. Otherwise, pick the next available slot.
		 */
		ipif->ipif_ill = ipmp_ill;
		ipif_orig_id = ipif->ipif_id;

		if (ipmp_ipif_is_stubaddr(ipmp_ill->ill_ipif)) {
			ipif_transfer(ipif, ipmp_ill->ill_ipif, NULL);
			ipif = ipmp_ill->ill_ipif;
		} else {
			/* -1 asks ipif_insert() to pick the next free id. */
			ipif->ipif_id = -1;
			if ((err = ipif_insert(ipif, B_FALSE)) != 0) {
				/*
				 * No more available ipif_id's -- put it back
				 * on the original ill and fail the operation.
				 * Since we're writer on the ill, we can be
				 * sure our old slot is still available.
				 */
				ipif->ipif_id = ipif_orig_id;
				ipif->ipif_ill = ill;
				if (ipif_orig_id == 0) {
					ipif_transfer(ipif, ill->ill_ipif,
					    NULL);
				} else {
					VERIFY(ipif_insert(ipif, B_FALSE) == 0);
				}
				rw_exit(&ipst->ips_ill_g_lock);
				return (err);
			}
		}
		rw_exit(&ipst->ips_ill_g_lock);

		/*
		 * Tell SCTP that the ipif has moved. Note that even if we
		 * had to allocate a new ipif, the original sequence id was
		 * preserved and therefore SCTP won't know.
		 */
		sctp_move_ipif(ipif, ill, ipmp_ill);

		/*
		 * If the ipif being brought up was on slot zero, then we
		 * first need to bring up the placeholder we stuck there. In
		 * ip_rput_dlpi_writer(), arp_bringup_done(), or the recursive
		 * call to ipif_up() itself, if we successfully bring up the
		 * placeholder, we'll check ill_move_ipif and bring it up too.
		 */
		if (ipif_orig_id == 0) {
			ASSERT(ill->ill_move_ipif == NULL);
			ill->ill_move_ipif = ipif;
			if ((err = ipif_up(ill->ill_ipif, q, mp)) == 0)
				ASSERT(ill->ill_move_ipif == NULL);
			if (err != EINPROGRESS)
				ill->ill_move_ipif = NULL;
			return (err);
		}

		/*
		 * Bring it up on the IPMP ill.
		 */
		return (ipif_up(ipif, q, mp));
	}

	/* Skip arp/ndp for any loopback interface. */
	if (ill->ill_wq != NULL) {
		conn_t *connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL;
		ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;

		if (!ill->ill_dl_up) {
			/*
			 * ill_dl_up is not yet set. i.e. we are yet to
			 * DL_BIND with the driver and this is the first
			 * logical interface on the ill to become "up".
			 * Tell the driver to get going (via DL_BIND_REQ).
			 * Note that changing "significant" IFF_ flags
			 * address/netmask etc cause a down/up dance, but
			 * does not cause an unbind (DL_UNBIND) with the driver
			 */
			if ((err = ill_dl_up(ill, ipif)) != 0) {
				return (err);
			}
		}

		/* Reject bringing up interfaces with unusable IP addresses */
		if (!ill_ipcheck_addr(ill, &ipif->ipif_v6lcl_addr)) {
			return (EPERM);
		}

		/*
		 * ipif_resolver_up may end up needing to bind/attach
		 * the ARP stream, which in turn necessitates a
		 * DLPI message exchange with the driver. ioctls are
		 * serialized and so we cannot send more than one
		 * interface up message at a time. If ipif_resolver_up
		 * does need to wait for the DLPI handshake for the ARP stream,
		 * we get EINPROGRESS and we will complete in arp_bringup_done.
		 */

		ASSERT(connp != NULL || !CONN_Q(q));
		if (connp != NULL)
			mutex_enter(&connp->conn_lock);
		mutex_enter(&ill->ill_lock);
		/* Park mp on the ipsq so the async completion can ack it. */
		success = ipsq_pending_mp_add(connp, ipif, q, mp, 0);
		mutex_exit(&ill->ill_lock);
		if (connp != NULL)
			mutex_exit(&connp->conn_lock);
		if (!success)
			return (EINTR);

		/*
		 * Crank up IPv6 neighbor discovery. Unlike ARP, this should
		 * complete when ipif_ndp_up returns.
		 */
		err = ipif_resolver_up(ipif, Res_act_initial);
		if (err == EINPROGRESS) {
			/* We will complete it in arp_bringup_done() */
			return (err);
		}

		if (isv6 && err == 0)
			err = ipif_ndp_up(ipif, B_TRUE);

		/* Resolver finished synchronously; reclaim the pending mp. */
		ASSERT(err != EINPROGRESS);
		mp = ipsq_pending_mp_get(ipsq, &connp);
		ASSERT(mp != NULL);
		if (err != 0)
			return (err);
	} else {
		/*
		 * Interfaces without underlying hardware don't do duplicate
		 * address detection.
		 */
		ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
		ipif->ipif_addr_ready = 1;
		err = ill_add_ires(ill);
		/* allocation failure? */
		if (err != 0)
			return (err);
	}

	err = (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif));
	if (err == 0 && ill->ill_move_ipif != NULL) {
		/*
		 * The placeholder came up; now bring up the data address
		 * that was parked on ill_move_ipif (see IPMP logic above).
		 */
		ipif = ill->ill_move_ipif;
		ill->ill_move_ipif = NULL;
		return (ipif_up(ipif, q, mp));
	}
	return (err);
}
14640 14635
14641 14636 /*
14642 14637 * Add any IREs tied to the ill. For now this is just an IRE_MULTICAST.
14643 14638 * The identical set of IREs need to be removed in ill_delete_ires().
14644 14639 */
14645 14640 int
14646 14641 ill_add_ires(ill_t *ill)
14647 14642 {
14648 14643 ire_t *ire;
14649 14644 in6_addr_t dummy6 = {(uint32_t)V6_MCAST, 0, 0, 1};
14650 14645 in_addr_t dummy4 = htonl(INADDR_ALLHOSTS_GROUP);
14651 14646
14652 14647 if (ill->ill_ire_multicast != NULL)
14653 14648 return (0);
14654 14649
14655 14650 /*
14656 14651 * provide some dummy ire_addr for creating the ire.
14657 14652 */
14658 14653 if (ill->ill_isv6) {
14659 14654 ire = ire_create_v6(&dummy6, 0, 0, IRE_MULTICAST, ill,
14660 14655 ALL_ZONES, RTF_UP, NULL, ill->ill_ipst);
14661 14656 } else {
14662 14657 ire = ire_create((uchar_t *)&dummy4, 0, 0, IRE_MULTICAST, ill,
14663 14658 ALL_ZONES, RTF_UP, NULL, ill->ill_ipst);
14664 14659 }
14665 14660 if (ire == NULL)
14666 14661 return (ENOMEM);
14667 14662
14668 14663 ill->ill_ire_multicast = ire;
14669 14664 return (0);
14670 14665 }
14671 14666
14672 14667 void
14673 14668 ill_delete_ires(ill_t *ill)
14674 14669 {
14675 14670 if (ill->ill_ire_multicast != NULL) {
14676 14671 /*
14677 14672 * BIND/ATTACH completed; Release the ref for ill_ire_multicast
14678 14673 * which was taken without any th_tracing enabled.
14679 14674 * We also mark it as condemned (note that it was never added)
14680 14675 * so that caching conn's can move off of it.
14681 14676 */
14682 14677 ire_make_condemned(ill->ill_ire_multicast);
14683 14678 ire_refrele_notr(ill->ill_ire_multicast);
14684 14679 ill->ill_ire_multicast = NULL;
14685 14680 }
14686 14681 }
14687 14682
/*
 * Perform a bind for the physical device.
 *
 * When the routine returns successfully then dlpi has been bound and
 * capabilities negotiated. An unbind message will have been allocated
 * for later use in ipif_down.
 *
 * Returns 0 on success, ENOMEM on allocation failure, ENXIO if the ill
 * was condemned while waiting, or the saved DLPI bind error.
 */
static int
ill_dl_up(ill_t *ill, ipif_t *ipif)
{
	mblk_t *bind_mp = NULL;
	mblk_t *unbind_mp = NULL;
	int err;

	DTRACE_PROBE2(ill__downup, char *, "ill_dl_up", ill_t *, ill);

	ip1dbg(("ill_dl_up(%s)\n", ill->ill_name));
	ASSERT(IAM_WRITER_ILL(ill));

	/*
	 * Make sure we have an IRE_MULTICAST in case we immediately
	 * start receiving packets.
	 */
	err = ill_add_ires(ill);
	if (err != 0)
		goto bad;

	bind_mp = ip_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long),
	    DL_BIND_REQ);
	if (bind_mp == NULL)
		goto bad;
	((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ill->ill_sap;
	((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS;

	/*
	 * ill_unbind_mp would be non-null if the following sequence had
	 * happened:
	 * - send DL_BIND_REQ to driver, wait for response
	 * - multiple ioctls that need to bring the ipif up are encountered,
	 * but they cannot enter the ipsq due to the outstanding DL_BIND_REQ.
	 * These ioctls will then be enqueued on the ipsq
	 * - a DL_ERROR_ACK is returned for the DL_BIND_REQ
	 * At this point, the pending ioctls in the ipsq will be drained, and
	 * since ill->ill_dl_up was not set, ill_dl_up would be invoked with
	 * a non-null ill->ill_unbind_mp
	 */
	if (ill->ill_unbind_mp == NULL) {
		unbind_mp = ip_dlpi_alloc(sizeof (dl_unbind_req_t),
		    DL_UNBIND_REQ);
		if (unbind_mp == NULL)
			goto bad;
	}

	/*
	 * Save the unbind message for ill_dl_down(); it will be consumed when
	 * the interface goes down.
	 */
	if (ill->ill_unbind_mp == NULL)
		ill->ill_unbind_mp = unbind_mp;

	/* ill_dlpi_send() consumes bind_mp; don't touch it afterwards. */
	ill_dlpi_send(ill, bind_mp);
	/* Send down link-layer capabilities probe if not already done. */
	ill_capability_probe(ill);
	/*
	 * Wait for DLPI to be bound and the capability probe to finish.
	 * The call drops-and-reacquires the squeue. If it couldn't because
	 * ILL_CONDEMNED got set, bail.
	 * (unbind_mp is not leaked here: it was already stashed on
	 * ill->ill_unbind_mp above.)
	 */
	if (!ill_capability_wait(ill))
		return (ENXIO);

	/* DLPI failed to bind. Return the saved error */
	if (!ill->ill_dl_up) {
		return (ill->ill_dl_bind_err);
	}

	/*
	 * Sysid used to rely on the fact that netboots set domainname
	 * and the like. Now that miniroot boots aren't strictly netboots
	 * and miniroot network configuration is driven from userland
	 * these things still need to be set. This situation can be detected
	 * by comparing the interface being configured here to the one
	 * dhcifname was set to reference by the boot loader. Once sysid is
	 * converted to use dhcp_ipc_getinfo() this call can go away.
	 */
	if ((ipif->ipif_flags & IPIF_DHCPRUNNING) &&
	    (strcmp(ill->ill_name, dhcifname) == 0) &&
	    (strlen(srpc_domain) == 0)) {
		if (dhcpinit() != 0)
			cmn_err(CE_WARN, "no cached dhcp response");
	}

	return (0);
bad:
	ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name));

	/* freemsg() handles NULL, so partial allocations are safe here. */
	freemsg(bind_mp);
	freemsg(unbind_mp);
	return (ENOMEM);
}
14788 14783
/* Loopback MTU plus room for an IP header and a 20-byte TCP header */
uint_t ip_loopback_mtuplus = IP_LOOPBACK_MTU + IP_SIMPLE_HDR_LENGTH + 20;
14791 14786
14792 14787 /*
14793 14788 * DLPI and ARP is up.
14794 14789 * Create all the IREs associated with an interface. Bring up multicast.
14795 14790 * Set the interface flag and finish other initialization
14796 14791 * that potentially had to be deferred to after DL_BIND_ACK.
14797 14792 */
14798 14793 int
14799 14794 ipif_up_done(ipif_t *ipif)
14800 14795 {
14801 14796 ill_t *ill = ipif->ipif_ill;
14802 14797 int err = 0;
14803 14798 boolean_t loopback = B_FALSE;
14804 14799 boolean_t update_src_selection = B_TRUE;
14805 14800 ipif_t *tmp_ipif;
14806 14801
14807 14802 ip1dbg(("ipif_up_done(%s:%u)\n",
14808 14803 ipif->ipif_ill->ill_name, ipif->ipif_id));
14809 14804 DTRACE_PROBE3(ipif__downup, char *, "ipif_up_done",
14810 14805 ill_t *, ill, ipif_t *, ipif);
14811 14806
14812 14807 /* Check if this is a loopback interface */
14813 14808 if (ipif->ipif_ill->ill_wq == NULL)
14814 14809 loopback = B_TRUE;
14815 14810
14816 14811 ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
14817 14812
14818 14813 /*
14819 14814 * If all other interfaces for this ill are down or DEPRECATED,
14820 14815 * or otherwise unsuitable for source address selection,
14821 14816 * reset the src generation numbers to make sure source
14822 14817 * address selection gets to take this new ipif into account.
14823 14818 * No need to hold ill_lock while traversing the ipif list since
14824 14819 * we are writer
14825 14820 */
14826 14821 for (tmp_ipif = ill->ill_ipif; tmp_ipif;
14827 14822 tmp_ipif = tmp_ipif->ipif_next) {
14828 14823 if (((tmp_ipif->ipif_flags &
14829 14824 (IPIF_NOXMIT|IPIF_ANYCAST|IPIF_NOLOCAL|IPIF_DEPRECATED)) ||
14830 14825 !(tmp_ipif->ipif_flags & IPIF_UP)) ||
14831 14826 (tmp_ipif == ipif))
14832 14827 continue;
14833 14828 /* first useable pre-existing interface */
14834 14829 update_src_selection = B_FALSE;
14835 14830 break;
14836 14831 }
14837 14832 if (update_src_selection)
14838 14833 ip_update_source_selection(ill->ill_ipst);
14839 14834
14840 14835 if (IS_LOOPBACK(ill) || ill->ill_net_type == IRE_IF_NORESOLVER) {
14841 14836 nce_t *loop_nce = NULL;
14842 14837 uint16_t flags = (NCE_F_MYADDR | NCE_F_AUTHORITY | NCE_F_NONUD);
14843 14838
14844 14839 /*
14845 14840 * lo0:1 and subsequent ipifs were marked IRE_LOCAL in
14846 14841 * ipif_lookup_on_name(), but in the case of zones we can have
14847 14842 * several loopback addresses on lo0. So all the interfaces with
14848 14843 * loopback addresses need to be marked IRE_LOOPBACK.
14849 14844 */
14850 14845 if (V4_PART_OF_V6(ipif->ipif_v6lcl_addr) ==
14851 14846 htonl(INADDR_LOOPBACK))
14852 14847 ipif->ipif_ire_type = IRE_LOOPBACK;
14853 14848 else
14854 14849 ipif->ipif_ire_type = IRE_LOCAL;
14855 14850 if (ill->ill_net_type != IRE_LOOPBACK)
14856 14851 flags |= NCE_F_PUBLISH;
14857 14852
14858 14853 /* add unicast nce for the local addr */
14859 14854 err = nce_lookup_then_add_v4(ill, NULL,
14860 14855 ill->ill_phys_addr_length, &ipif->ipif_lcl_addr, flags,
14861 14856 ND_REACHABLE, &loop_nce);
14862 14857 /* A shared-IP zone sees EEXIST for lo0:N */
14863 14858 if (err == 0 || err == EEXIST) {
14864 14859 ipif->ipif_added_nce = 1;
14865 14860 loop_nce->nce_ipif_cnt++;
14866 14861 nce_refrele(loop_nce);
14867 14862 err = 0;
14868 14863 } else {
14869 14864 ASSERT(loop_nce == NULL);
14870 14865 return (err);
14871 14866 }
14872 14867 }
14873 14868
14874 14869 /* Create all the IREs associated with this interface */
14875 14870 err = ipif_add_ires_v4(ipif, loopback);
14876 14871 if (err != 0) {
14877 14872 /*
14878 14873 * see comments about return value from
14879 14874 * ip_addr_availability_check() in ipif_add_ires_v4().
14880 14875 */
14881 14876 if (err != EADDRINUSE) {
14882 14877 (void) ipif_arp_down(ipif);
14883 14878 } else {
14884 14879 /*
14885 14880 * Make IPMP aware of the deleted ipif so that
14886 14881 * the needed ipmp cleanup (e.g., of ipif_bound_ill)
14887 14882 * can be completed. Note that we do not want to
14888 14883 * destroy the nce that was created on the ipmp_ill
14889 14884 * for the active copy of the duplicate address in
14890 14885 * use.
14891 14886 */
14892 14887 if (IS_IPMP(ill))
14893 14888 ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
14894 14889 err = EADDRNOTAVAIL;
14895 14890 }
14896 14891 return (err);
14897 14892 }
14898 14893
14899 14894 if (ill->ill_ipif_up_count == 1 && !loopback) {
14900 14895 /* Recover any additional IREs entries for this ill */
14901 14896 (void) ill_recover_saved_ire(ill);
14902 14897 }
14903 14898
14904 14899 if (ill->ill_need_recover_multicast) {
14905 14900 /*
14906 14901 * Need to recover all multicast memberships in the driver.
14907 14902 * This had to be deferred until we had attached. The same
14908 14903 * code exists in ipif_up_done_v6() to recover IPv6
14909 14904 * memberships.
14910 14905 *
14911 14906 * Note that it would be preferable to unconditionally do the
14912 14907 * ill_recover_multicast() in ill_dl_up(), but we cannot do
14913 14908 * that since ill_join_allmulti() depends on ill_dl_up being
14914 14909 * set, and it is not set until we receive a DL_BIND_ACK after
14915 14910 * having called ill_dl_up().
14916 14911 */
14917 14912 ill_recover_multicast(ill);
14918 14913 }
14919 14914
14920 14915 if (ill->ill_ipif_up_count == 1) {
14921 14916 /*
14922 14917 * Since the interface is now up, it may now be active.
14923 14918 */
14924 14919 if (IS_UNDER_IPMP(ill))
14925 14920 ipmp_ill_refresh_active(ill);
14926 14921
14927 14922 /*
14928 14923 * If this is an IPMP interface, we may now be able to
14929 14924 * establish ARP entries.
14930 14925 */
14931 14926 if (IS_IPMP(ill))
14932 14927 ipmp_illgrp_refresh_arpent(ill->ill_grp);
14933 14928 }
14934 14929
14935 14930 /* Join the allhosts multicast address */
14936 14931 ipif_multicast_up(ipif);
14937 14932
14938 14933 if (!loopback && !update_src_selection &&
14939 14934 !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED)))
14940 14935 ip_update_source_selection(ill->ill_ipst);
14941 14936
14942 14937 if (!loopback && ipif->ipif_addr_ready) {
14943 14938 /* Broadcast an address mask reply. */
14944 14939 ipif_mask_reply(ipif);
14945 14940 }
14946 14941 /* Perhaps ilgs should use this ill */
14947 14942 update_conn_ill(NULL, ill->ill_ipst);
14948 14943
14949 14944 /*
14950 14945 * This had to be deferred until we had bound. Tell routing sockets and
14951 14946 * others that this interface is up if it looks like the address has
14952 14947 * been validated. Otherwise, if it isn't ready yet, wait for
14953 14948 * duplicate address detection to do its thing.
14954 14949 */
14955 14950 if (ipif->ipif_addr_ready)
14956 14951 ipif_up_notify(ipif);
14957 14952 return (0);
14958 14953 }
14959 14954
14960 14955 /*
14961 14956 * Add the IREs associated with the ipif.
14962 14957 * Those MUST be explicitly removed in ipif_delete_ires_v4.
14963 14958 */
static int
ipif_add_ires_v4(ipif_t *ipif, boolean_t loopback)
{
	ill_t		*ill = ipif->ipif_ill;
	ip_stack_t	*ipst = ill->ill_ipst;
	/* Scratch array for broadcast IREs; filled by ipif_create_bcast_ires */
	ire_t		*ire_array[20];
	ire_t		**irep = ire_array;
	ire_t		**irep1;
	ipaddr_t	net_mask = 0;
	ipaddr_t	subnet_mask, route_mask;
	int		err;
	ire_t		*ire_local = NULL;	/* LOCAL or LOOPBACK */
	ire_t		*ire_if = NULL;		/* IRE_IF_{NO,}RESOLVER */
	uchar_t		*gw;

	if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
	    !(ipif->ipif_flags & IPIF_NOLOCAL)) {
		/*
		 * If we're on a labeled system then make sure that zone-
		 * private addresses have proper remote host database entries.
		 */
		if (is_system_labeled() &&
		    ipif->ipif_ire_type != IRE_LOOPBACK &&
		    !tsol_check_interface_address(ipif))
			return (EINVAL);

		/* Register the source address for __sin6_src_id */
		err = ip_srcid_insert(&ipif->ipif_v6lcl_addr,
		    ipif->ipif_zoneid, ipst);
		if (err != 0) {
			ip0dbg(("ipif_add_ires: srcid_insert %d\n", err));
			return (err);
		}

		/* Loopback IREs carry the local address as the gateway. */
		if (loopback)
			gw = (uchar_t *)&ipif->ipif_lcl_addr;
		else
			gw = NULL;

		/* If the interface address is set, create the local IRE. */
		ire_local = ire_create(
		    (uchar_t *)&ipif->ipif_lcl_addr,	/* dest address */
		    (uchar_t *)&ip_g_all_ones,		/* mask */
		    gw,					/* gateway */
		    ipif->ipif_ire_type,		/* LOCAL or LOOPBACK */
		    ipif->ipif_ill,
		    ipif->ipif_zoneid,
		    ((ipif->ipif_flags & IPIF_PRIVATE) ?
		    RTF_PRIVATE : 0) | RTF_KERNEL,
		    NULL,
		    ipst);
		ip1dbg(("ipif_add_ires: 0x%p creating IRE %p type 0x%x"
		    " for 0x%x\n", (void *)ipif, (void *)ire_local,
		    ipif->ipif_ire_type,
		    ntohl(ipif->ipif_lcl_addr)));
		if (ire_local == NULL) {
			ip1dbg(("ipif_up_done: NULL ire_local\n"));
			err = ENOMEM;
			goto bad;
		}
	} else {
		ip1dbg((
		    "ipif_add_ires: not creating IRE %d for 0x%x: flags 0x%x\n",
		    ipif->ipif_ire_type,
		    ntohl(ipif->ipif_lcl_addr),
		    (uint_t)ipif->ipif_flags));
	}
	/*
	 * Derive the classful (natural) netmask from the local address when
	 * one is usable; otherwise fall back to a class A mask so we never
	 * hand ip_net_mask() an unusable 0.0.0.0/NOLOCAL address.
	 */
	if ((ipif->ipif_lcl_addr != INADDR_ANY) &&
	    !(ipif->ipif_flags & IPIF_NOLOCAL)) {
		net_mask = ip_net_mask(ipif->ipif_lcl_addr);
	} else {
		net_mask = htonl(IN_CLASSA_NET); /* fallback */
	}

	subnet_mask = ipif->ipif_net_mask;

	/*
	 * If mask was not specified, use natural netmask of
	 * interface address. Also, store this mask back into the
	 * ipif struct.
	 */
	if (subnet_mask == 0) {
		subnet_mask = net_mask;
		V4MASK_TO_V6(subnet_mask, ipif->ipif_v6net_mask);
		V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask,
		    ipif->ipif_v6subnet);
	}

	/* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. */
	if (!loopback && !(ipif->ipif_flags & IPIF_NOXMIT) &&
	    ipif->ipif_subnet != INADDR_ANY) {
		/* ipif_subnet is ipif_pp_dst_addr for pt-pt */

		if (ipif->ipif_flags & IPIF_POINTOPOINT) {
			route_mask = IP_HOST_MASK;
		} else {
			route_mask = subnet_mask;
		}

		ip1dbg(("ipif_add_ires: ipif 0x%p ill 0x%p "
		    "creating if IRE ill_net_type 0x%x for 0x%x\n",
		    (void *)ipif, (void *)ill, ill->ill_net_type,
		    ntohl(ipif->ipif_subnet)));
		ire_if = ire_create(
		    (uchar_t *)&ipif->ipif_subnet,
		    (uchar_t *)&route_mask,
		    (uchar_t *)&ipif->ipif_lcl_addr,
		    ill->ill_net_type,
		    ill,
		    ipif->ipif_zoneid,
		    ((ipif->ipif_flags & IPIF_PRIVATE) ?
		    RTF_PRIVATE: 0) | RTF_KERNEL,
		    NULL,
		    ipst);
		if (ire_if == NULL) {
			ip1dbg(("ipif_up_done: NULL ire_if\n"));
			err = ENOMEM;
			goto bad;
		}
	}

	/*
	 * Create any necessary broadcast IREs.
	 */
	if ((ipif->ipif_flags & IPIF_BROADCAST) &&
	    !(ipif->ipif_flags & IPIF_NOXMIT))
		irep = ipif_create_bcast_ires(ipif, irep);

	/* If an earlier ire_create failed, get out now */
	for (irep1 = irep; irep1 > ire_array; ) {
		irep1--;
		if (*irep1 == NULL) {
			ip1dbg(("ipif_up_done: NULL ire found in ire_array\n"));
			err = ENOMEM;
			goto bad;
		}
	}

	/*
	 * Need to atomically check for IP address availability under
	 * ip_addr_avail_lock. ill_g_lock is held as reader to ensure no new
	 * ills or new ipifs can be added while we are checking availability.
	 */
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	mutex_enter(&ipst->ips_ip_addr_avail_lock);
	/* Mark it up, and increment counters. */
	ipif->ipif_flags |= IPIF_UP;
	ill->ill_ipif_up_count++;
	err = ip_addr_availability_check(ipif);
	mutex_exit(&ipst->ips_ip_addr_avail_lock);
	rw_exit(&ipst->ips_ill_g_lock);

	if (err != 0) {
		/*
		 * Our address may already be up on the same ill. In this case,
		 * the ARP entry for our ipif replaced the one for the other
		 * ipif. So we don't want to delete it (otherwise the other ipif
		 * would be unable to send packets).
		 * ip_addr_availability_check() identifies this case for us and
		 * returns EADDRINUSE; Caller should turn it into EADDRNOTAVAIL
		 * which is the expected error code.
		 */
		ill->ill_ipif_up_count--;
		ipif->ipif_flags &= ~IPIF_UP;
		goto bad;
	}

	/*
	 * Add in all newly created IREs. ire_create_bcast() has
	 * already checked for duplicates of the IRE_BROADCAST type.
	 * We add the IRE_INTERFACE before the IRE_LOCAL to ensure
	 * that lookups find the IRE_LOCAL even if the IRE_INTERFACE is
	 * a /32 route.
	 */
	if (ire_if != NULL) {
		ire_if = ire_add(ire_if);
		if (ire_if == NULL) {
			err = ENOMEM;
			goto bad2;
		}
#ifdef DEBUG
		/*
		 * Swap the tracked hold from ire_add() for an untracked one;
		 * the cached pointer is released with ire_refrele_notr() in
		 * ipif_delete_ires_v4() and in the bad path below.
		 */
		ire_refhold_notr(ire_if);
		ire_refrele(ire_if);
#endif
	}
	if (ire_local != NULL) {
		ire_local = ire_add(ire_local);
		if (ire_local == NULL) {
			err = ENOMEM;
			goto bad2;
		}
#ifdef DEBUG
		/* Same tracked -> untracked hold conversion as for ire_if. */
		ire_refhold_notr(ire_local);
		ire_refrele(ire_local);
#endif
	}
	/* Publish the cached IREs on the ipif under ill_g_lock. */
	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
	if (ire_local != NULL)
		ipif->ipif_ire_local = ire_local;
	if (ire_if != NULL)
		ipif->ipif_ire_if = ire_if;
	rw_exit(&ipst->ips_ill_g_lock);
	ire_local = NULL;
	ire_if = NULL;

	/*
	 * We first add all of them, and if that succeeds we refrele the
	 * bunch. That enables us to delete all of them should any of the
	 * ire_adds fail.
	 */
	for (irep1 = irep; irep1 > ire_array; ) {
		irep1--;
		ASSERT(!MUTEX_HELD(&((*irep1)->ire_ill->ill_lock)));
		*irep1 = ire_add(*irep1);
		if (*irep1 == NULL) {
			err = ENOMEM;
			goto bad2;
		}
	}

	for (irep1 = irep; irep1 > ire_array; ) {
		irep1--;
		/* refheld by ire_add. */
		if (*irep1 != NULL) {
			ire_refrele(*irep1);
			*irep1 = NULL;
		}
	}

	if (!loopback) {
		/*
		 * If the broadcast address has been set, make sure it makes
		 * sense based on the interface address.
		 * Only match on ill since we are sharing broadcast addresses.
		 */
		if ((ipif->ipif_brd_addr != INADDR_ANY) &&
		    (ipif->ipif_flags & IPIF_BROADCAST)) {
			ire_t	*ire;

			ire = ire_ftable_lookup_v4(ipif->ipif_brd_addr, 0, 0,
			    IRE_BROADCAST, ipif->ipif_ill, ALL_ZONES, NULL,
			    (MATCH_IRE_TYPE | MATCH_IRE_ILL), 0, ipst, NULL);

			if (ire == NULL) {
				/*
				 * If there isn't a matching broadcast IRE,
				 * revert to the default for this netmask.
				 */
				ipif->ipif_v6brd_addr = ipv6_all_zeros;
				mutex_enter(&ipif->ipif_ill->ill_lock);
				ipif_set_default(ipif);
				mutex_exit(&ipif->ipif_ill->ill_lock);
			} else {
				ire_refrele(ire);
			}
		}

	}
	return (0);

bad2:
	/* Undo the IPIF_UP marking done before ire_add started failing. */
	ill->ill_ipif_up_count--;
	ipif->ipif_flags &= ~IPIF_UP;

bad:
	ip1dbg(("ipif_add_ires: FAILED \n"));
	if (ire_local != NULL)
		ire_delete(ire_local);
	if (ire_if != NULL)
		ire_delete(ire_if);

	/* Detach and destroy any IREs already published on the ipif. */
	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
	ire_local = ipif->ipif_ire_local;
	ipif->ipif_ire_local = NULL;
	ire_if = ipif->ipif_ire_if;
	ipif->ipif_ire_if = NULL;
	rw_exit(&ipst->ips_ill_g_lock);
	if (ire_local != NULL) {
		ire_delete(ire_local);
		ire_refrele_notr(ire_local);
	}
	if (ire_if != NULL) {
		ire_delete(ire_if);
		ire_refrele_notr(ire_if);
	}

	while (irep > ire_array) {
		irep--;
		if (*irep != NULL) {
			ire_delete(*irep);
		}
	}
	(void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst);

	return (err);
}
15260 15255
15261 15256 /* Remove all the IREs created by ipif_add_ires_v4 */
void
ipif_delete_ires_v4(ipif_t *ipif)
{
	ill_t		*ill = ipif->ipif_ill;
	ip_stack_t	*ipst = ill->ill_ipst;
	ire_t		*ire;

	/*
	 * Detach the cached IRE_LOCAL/IRE_LOOPBACK. The pointer is cleared
	 * under ill_g_lock (writer) to serialize with the store done in
	 * ipif_add_ires_v4().
	 */
	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
	ire = ipif->ipif_ire_local;
	ipif->ipif_ire_local = NULL;
	rw_exit(&ipst->ips_ill_g_lock);
	if (ire != NULL) {
		/*
		 * Move count to ipif so we don't lose the count due to
		 * a down/up dance.
		 */
		atomic_add_32(&ipif->ipif_ib_pkt_count, ire->ire_ib_pkt_count);

		ire_delete(ire);
		ire_refrele_notr(ire);	/* drop the untracked cached hold */
	}
	/* Likewise detach and destroy the cached interface (IRE_IF_*) IRE. */
	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
	ire = ipif->ipif_ire_if;
	ipif->ipif_ire_if = NULL;
	rw_exit(&ipst->ips_ill_g_lock);
	if (ire != NULL) {
		ire_delete(ire);
		ire_refrele_notr(ire);
	}

	/*
	 * Delete the broadcast IREs.
	 */
	if ((ipif->ipif_flags & IPIF_BROADCAST) &&
	    !(ipif->ipif_flags & IPIF_NOXMIT))
		ipif_delete_bcast_ires(ipif);
}
15299 15294
15300 15295 /*
 * Checks for availability of a usable source address (if there is one) when the
15302 15297 * destination ILL has the ill_usesrc_ifindex pointing to another ILL. Note
15303 15298 * this selection is done regardless of the destination.
15304 15299 */
15305 15300 boolean_t
15306 15301 ipif_zone_avail(uint_t ifindex, boolean_t isv6, zoneid_t zoneid,
15307 15302 ip_stack_t *ipst)
15308 15303 {
15309 15304 ipif_t *ipif = NULL;
15310 15305 ill_t *uill;
15311 15306
15312 15307 ASSERT(ifindex != 0);
15313 15308
15314 15309 uill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
15315 15310 if (uill == NULL)
15316 15311 return (B_FALSE);
15317 15312
15318 15313 mutex_enter(&uill->ill_lock);
15319 15314 for (ipif = uill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
15320 15315 if (IPIF_IS_CONDEMNED(ipif))
15321 15316 continue;
15322 15317 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
15323 15318 continue;
15324 15319 if (!(ipif->ipif_flags & IPIF_UP))
15325 15320 continue;
15326 15321 if (ipif->ipif_zoneid != zoneid)
15327 15322 continue;
15328 15323 if (isv6 ? IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) :
15329 15324 ipif->ipif_lcl_addr == INADDR_ANY)
15330 15325 continue;
15331 15326 mutex_exit(&uill->ill_lock);
15332 15327 ill_refrele(uill);
15333 15328 return (B_TRUE);
15334 15329 }
15335 15330 mutex_exit(&uill->ill_lock);
15336 15331 ill_refrele(uill);
15337 15332 return (B_FALSE);
15338 15333 }
15339 15334
15340 15335 /*
15341 15336 * Find an ipif with a good local address on the ill+zoneid.
15342 15337 */
15343 15338 ipif_t *
15344 15339 ipif_good_addr(ill_t *ill, zoneid_t zoneid)
15345 15340 {
15346 15341 ipif_t *ipif;
15347 15342
15348 15343 mutex_enter(&ill->ill_lock);
15349 15344 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
15350 15345 if (IPIF_IS_CONDEMNED(ipif))
15351 15346 continue;
15352 15347 if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
15353 15348 continue;
15354 15349 if (!(ipif->ipif_flags & IPIF_UP))
15355 15350 continue;
15356 15351 if (ipif->ipif_zoneid != zoneid &&
15357 15352 ipif->ipif_zoneid != ALL_ZONES && zoneid != ALL_ZONES)
15358 15353 continue;
15359 15354 if (ill->ill_isv6 ?
15360 15355 IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) :
15361 15356 ipif->ipif_lcl_addr == INADDR_ANY)
15362 15357 continue;
15363 15358 ipif_refhold_locked(ipif);
15364 15359 mutex_exit(&ill->ill_lock);
15365 15360 return (ipif);
15366 15361 }
15367 15362 mutex_exit(&ill->ill_lock);
15368 15363 return (NULL);
15369 15364 }
15370 15365
15371 15366 /*
15372 15367 * IP source address type, sorted from worst to best. For a given type,
15373 15368 * always prefer IP addresses on the same subnet. All-zones addresses are
15374 15369 * suboptimal because they pose problems with unlabeled destinations.
15375 15370 */
typedef enum {
	IPIF_NONE,			/* no candidate selected yet */
	IPIF_DIFFNET_DEPRECATED,	/* deprecated and different subnet */
	IPIF_SAMENET_DEPRECATED,	/* deprecated and same subnet */
	IPIF_DIFFNET_ALLZONES,		/* allzones and different subnet */
	IPIF_SAMENET_ALLZONES,		/* allzones and same subnet */
	IPIF_DIFFNET,			/* normal and different subnet */
	IPIF_SAMENET,			/* normal and same subnet */
	IPIF_LOCALADDR			/* local loopback */
} ipif_type_t;
15386 15381
15387 15382 /*
15388 15383 * Pick the optimal ipif on `ill' for sending to destination `dst' from zone
15389 15384 * `zoneid'. We rate usable ipifs from low -> high as per the ipif_type_t
15390 15385 * enumeration, and return the highest-rated ipif. If there's a tie, we pick
15391 15386 * the first one, unless IPMP is used in which case we round-robin among them;
15392 15387 * see below for more.
15393 15388 *
15394 15389 * Returns NULL if there is no suitable source address for the ill.
15395 15390 * This only occurs when there is no valid source address for the ill.
15396 15391 */
ipif_t *
ipif_select_source_v4(ill_t *ill, ipaddr_t dst, zoneid_t zoneid,
    boolean_t allow_usesrc, boolean_t *notreadyp)
{
	ill_t	*usill = NULL;
	ill_t	*ipmp_ill = NULL;
	ipif_t	*start_ipif, *next_ipif, *ipif, *best_ipif;
	ipif_type_t type, best_type;
	tsol_tpc_t *src_rhtp, *dst_rhtp;
	ip_stack_t *ipst = ill->ill_ipst;
	boolean_t samenet;

	/* If a usesrc ILL is configured (and allowed), select from it. */
	if (ill->ill_usesrc_ifindex != 0 && allow_usesrc) {
		usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex,
		    B_FALSE, ipst);
		if (usill != NULL)
			ill = usill;	/* Select source from usesrc ILL */
		else
			return (NULL);
	}

	/*
	 * Test addresses should never be used for source address selection,
	 * so if we were passed one, switch to the IPMP meta-interface.
	 */
	if (IS_UNDER_IPMP(ill)) {
		if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL)
			ill = ipmp_ill;	/* Select source from IPMP ill */
		else
			return (NULL);
	}

	/*
	 * If we're dealing with an unlabeled destination on a labeled system,
	 * make sure that we ignore source addresses that are incompatible with
	 * the destination's default label. That destination's default label
	 * must dominate the minimum label on the source address.
	 */
	dst_rhtp = NULL;
	if (is_system_labeled()) {
		dst_rhtp = find_tpc(&dst, IPV4_VERSION, B_FALSE);
		if (dst_rhtp == NULL)
			return (NULL);
		if (dst_rhtp->tpc_tp.host_type != UNLABELED) {
			TPC_RELE(dst_rhtp);
			dst_rhtp = NULL;
		}
	}

	/*
	 * Hold the ill_g_lock as reader. This makes sure that no ipif/ill
	 * can be deleted. But an ipif/ill can get CONDEMNED any time.
	 * After selecting the right ipif, under ill_lock make sure ipif is
	 * not condemned, and increment refcnt. If ipif is CONDEMNED,
	 * we retry. Inside the loop we still need to check for CONDEMNED,
	 * but not under a lock.
	 */
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
retry:
	/*
	 * For source address selection, we treat the ipif list as circular
	 * and continue until we get back to where we started. This allows
	 * IPMP to vary source address selection (which improves inbound load
	 * spreading) by caching its last ending point and starting from
	 * there. NOTE: we don't have to worry about ill_src_ipif changing
	 * ills since that can't happen on the IPMP ill.
	 */
	start_ipif = ill->ill_ipif;
	if (IS_IPMP(ill) && ill->ill_src_ipif != NULL)
		start_ipif = ill->ill_src_ipif;

	ipif = start_ipif;
	best_ipif = NULL;
	best_type = IPIF_NONE;
	do {
		if ((next_ipif = ipif->ipif_next) == NULL)
			next_ipif = ill->ill_ipif;

		if (IPIF_IS_CONDEMNED(ipif))
			continue;
		/* Always skip NOLOCAL and ANYCAST interfaces */
		if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))
			continue;
		/* Always skip NOACCEPT interfaces */
		if (ipif->ipif_ill->ill_flags & ILLF_NOACCEPT)
			continue;
		if (!(ipif->ipif_flags & IPIF_UP))
			continue;

		if (!ipif->ipif_addr_ready) {
			/* Tell the caller an unverified address was skipped */
			if (notreadyp != NULL)
				*notreadyp = B_TRUE;
			continue;
		}

		if (zoneid != ALL_ZONES &&
		    ipif->ipif_zoneid != zoneid &&
		    ipif->ipif_zoneid != ALL_ZONES)
			continue;

		/*
		 * Interfaces with 0.0.0.0 address are allowed to be UP, but
		 * are not valid as source addresses.
		 */
		if (ipif->ipif_lcl_addr == INADDR_ANY)
			continue;

		/*
		 * Check compatibility of local address for destination's
		 * default label if we're on a labeled system. Incompatible
		 * addresses can't be used at all.
		 */
		if (dst_rhtp != NULL) {
			boolean_t incompat;

			src_rhtp = find_tpc(&ipif->ipif_lcl_addr,
			    IPV4_VERSION, B_FALSE);
			if (src_rhtp == NULL)
				continue;
			incompat = src_rhtp->tpc_tp.host_type != SUN_CIPSO ||
			    src_rhtp->tpc_tp.tp_doi !=
			    dst_rhtp->tpc_tp.tp_doi ||
			    (!_blinrange(&dst_rhtp->tpc_tp.tp_def_label,
			    &src_rhtp->tpc_tp.tp_sl_range_cipso) &&
			    !blinlset(&dst_rhtp->tpc_tp.tp_def_label,
			    src_rhtp->tpc_tp.tp_sl_set_cipso));
			TPC_RELE(src_rhtp);
			if (incompat)
				continue;
		}

		/* Is the candidate on the same subnet as the destination? */
		samenet = ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet);

		/* Rate the candidate per the ipif_type_t ranking. */
		if (ipif->ipif_lcl_addr == dst) {
			type = IPIF_LOCALADDR;
		} else if (ipif->ipif_flags & IPIF_DEPRECATED) {
			type = samenet ? IPIF_SAMENET_DEPRECATED :
			    IPIF_DIFFNET_DEPRECATED;
		} else if (ipif->ipif_zoneid == ALL_ZONES) {
			type = samenet ? IPIF_SAMENET_ALLZONES :
			    IPIF_DIFFNET_ALLZONES;
		} else {
			type = samenet ? IPIF_SAMENET : IPIF_DIFFNET;
		}

		if (type > best_type) {
			best_type = type;
			best_ipif = ipif;
			if (best_type == IPIF_LOCALADDR)
				break; /* can't get better */
		}
	} while ((ipif = next_ipif) != start_ipif);

	if ((ipif = best_ipif) != NULL) {
		mutex_enter(&ipif->ipif_ill->ill_lock);
		if (IPIF_IS_CONDEMNED(ipif)) {
			mutex_exit(&ipif->ipif_ill->ill_lock);
			goto retry;
		}
		ipif_refhold_locked(ipif);

		/*
		 * For IPMP, update the source ipif rotor to the next ipif,
		 * provided we can look it up. (We must not use it if it's
		 * IPIF_CONDEMNED since we may have grabbed ill_g_lock after
		 * ipif_free() checked ill_src_ipif.)
		 */
		if (IS_IPMP(ill) && ipif != NULL) {
			next_ipif = ipif->ipif_next;
			if (next_ipif != NULL && !IPIF_IS_CONDEMNED(next_ipif))
				ill->ill_src_ipif = next_ipif;
			else
				ill->ill_src_ipif = NULL;
		}
		mutex_exit(&ipif->ipif_ill->ill_lock);
	}

	rw_exit(&ipst->ips_ill_g_lock);
	if (usill != NULL)
		ill_refrele(usill);
	if (ipmp_ill != NULL)
		ill_refrele(ipmp_ill);
	if (dst_rhtp != NULL)
		TPC_RELE(dst_rhtp);

#ifdef DEBUG
	if (ipif == NULL) {
		char buf1[INET6_ADDRSTRLEN];

		ip1dbg(("ipif_select_source_v4(%s, %s) -> NULL\n",
		    ill->ill_name,
		    inet_ntop(AF_INET, &dst, buf1, sizeof (buf1))));
	} else {
		char buf1[INET6_ADDRSTRLEN];
		char buf2[INET6_ADDRSTRLEN];

		ip1dbg(("ipif_select_source_v4(%s, %s) -> %s\n",
		    ipif->ipif_ill->ill_name,
		    inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)),
		    inet_ntop(AF_INET, &ipif->ipif_lcl_addr,
		    buf2, sizeof (buf2))));
	}
#endif /* DEBUG */
	return (ipif);
}
15602 15597
15603 15598 /*
15604 15599 * Pick a source address based on the destination ill and an optional setsrc
15605 15600 * address.
15606 15601 * The result is stored in srcp. If generation is set, then put the source
15607 15602 * generation number there before we look for the source address (to avoid
 * missing changes in the set of source addresses).
 * If flagsp is set, then use it to pass back ipif_flags.
15610 15605 *
15611 15606 * If the caller wants to cache the returned source address and detect when
15612 15607 * that might be stale, the caller should pass in a generation argument,
15613 15608 * which the caller can later compare against ips_src_generation
15614 15609 *
15615 15610 * The precedence order for selecting an IPv4 source address is:
15616 15611 * - RTF_SETSRC on the offlink ire always wins.
 * - If usesrc is set, swap the ill to be the usesrc one.
15618 15613 * - If IPMP is used on the ill, select a random address from the most
15619 15614 * preferred ones below:
15620 15615 * 1. If onlink destination, same subnet and not deprecated, not ALL_ZONES
15621 15616 * 2. Not deprecated, not ALL_ZONES
15622 15617 * 3. If onlink destination, same subnet and not deprecated, ALL_ZONES
15623 15618 * 4. Not deprecated, ALL_ZONES
15624 15619 * 5. If onlink destination, same subnet and deprecated
15625 15620 * 6. Deprecated.
15626 15621 *
15627 15622 * We have lower preference for ALL_ZONES IP addresses,
15628 15623 * as they pose problems with unlabeled destinations.
15629 15624 *
15630 15625 * Note that when multiple IP addresses match e.g., #1 we pick
15631 15626 * the first one if IPMP is not in use. With IPMP we randomize.
15632 15627 */
15633 15628 int
15634 15629 ip_select_source_v4(ill_t *ill, ipaddr_t setsrc, ipaddr_t dst,
15635 15630 ipaddr_t multicast_ifaddr,
15636 15631 zoneid_t zoneid, ip_stack_t *ipst, ipaddr_t *srcp,
15637 15632 uint32_t *generation, uint64_t *flagsp)
15638 15633 {
15639 15634 ipif_t *ipif;
15640 15635 boolean_t notready = B_FALSE; /* Set if !ipif_addr_ready found */
15641 15636
15642 15637 if (flagsp != NULL)
15643 15638 *flagsp = 0;
15644 15639
15645 15640 /*
15646 15641 * Need to grab the generation number before we check to
15647 15642 * avoid a race with a change to the set of local addresses.
15648 15643 * No lock needed since the thread which updates the set of local
15649 15644 * addresses use ipif/ill locks and exit those (hence a store memory
15650 15645 * barrier) before doing the atomic increase of ips_src_generation.
15651 15646 */
15652 15647 if (generation != NULL) {
15653 15648 *generation = ipst->ips_src_generation;
15654 15649 }
15655 15650
15656 15651 if (CLASSD(dst) && multicast_ifaddr != INADDR_ANY) {
15657 15652 *srcp = multicast_ifaddr;
15658 15653 return (0);
15659 15654 }
15660 15655
15661 15656 /* Was RTF_SETSRC set on the first IRE in the recursive lookup? */
15662 15657 if (setsrc != INADDR_ANY) {
15663 15658 *srcp = setsrc;
15664 15659 return (0);
15665 15660 }
15666 15661 ipif = ipif_select_source_v4(ill, dst, zoneid, B_TRUE, ¬ready);
15667 15662 if (ipif == NULL) {
15668 15663 if (notready)
15669 15664 return (ENETDOWN);
15670 15665 else
15671 15666 return (EADDRNOTAVAIL);
15672 15667 }
15673 15668 *srcp = ipif->ipif_lcl_addr;
15674 15669 if (flagsp != NULL)
15675 15670 *flagsp = ipif->ipif_flags;
15676 15671 ipif_refrele(ipif);
15677 15672 return (0);
15678 15673 }
15679 15674
15680 15675 /* ARGSUSED */
15681 15676 int
15682 15677 if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
15683 15678 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
15684 15679 {
15685 15680 /*
15686 15681 * ill_phyint_reinit merged the v4 and v6 into a single
15687 15682 * ipsq. We might not have been able to complete the
15688 15683 * operation in ipif_set_values, if we could not become
15689 15684 * exclusive. If so restart it here.
15690 15685 */
15691 15686 return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q));
15692 15687 }
15693 15688
15694 15689 /*
15695 15690 * Can operate on either a module or a driver queue.
15696 15691 * Returns an error if not a module queue.
15697 15692 */
15698 15693 /* ARGSUSED */
15699 15694 int
15700 15695 if_unitsel(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
15701 15696 ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
15702 15697 {
15703 15698 queue_t *q1 = q;
15704 15699 char *cp;
15705 15700 char interf_name[LIFNAMSIZ];
15706 15701 uint_t ppa = *(uint_t *)mp->b_cont->b_cont->b_rptr;
15707 15702
15708 15703 if (q->q_next == NULL) {
15709 15704 ip1dbg((
15710 15705 "if_unitsel: IF_UNITSEL: no q_next\n"));
15711 15706 return (EINVAL);
15712 15707 }
15713 15708
15714 15709 if (((ill_t *)(q->q_ptr))->ill_name[0] != '\0')
15715 15710 return (EALREADY);
15716 15711
15717 15712 do {
15718 15713 q1 = q1->q_next;
15719 15714 } while (q1->q_next);
15720 15715 cp = q1->q_qinfo->qi_minfo->mi_idname;
15721 15716 (void) sprintf(interf_name, "%s%d", cp, ppa);
15722 15717
15723 15718 /*
15724 15719 * Here we are not going to delay the ioack until after
15725 15720 * ACKs from DL_ATTACH_REQ/DL_BIND_REQ. So no need to save the
15726 15721 * original ioctl message before sending the requests.
15727 15722 */
15728 15723 return (ipif_set_values(q, mp, interf_name, &ppa));
15729 15724 }
15730 15725
15731 15726 /* ARGSUSED */
int
ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *dummy_ifreq)
{
	/* This ioctl is not supported here; always fail with ENXIO. */
	return (ENXIO);
}
15738 15733
15739 15734 /*
15740 15735 * Create any IRE_BROADCAST entries for `ipif', and store those entries in
15741 15736 * `irep'. Returns a pointer to the next free `irep' entry
15742 15737 * A mirror exists in ipif_delete_bcast_ires().
15743 15738 *
15744 15739 * The management of any "extra" or seemingly duplicate IRE_BROADCASTs is
15745 15740 * done in ire_add.
15746 15741 */
15747 15742 static ire_t **
15748 15743 ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep)
15749 15744 {
15750 15745 ipaddr_t addr;
15751 15746 ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr);
15752 15747 ipaddr_t subnetmask = ipif->ipif_net_mask;
15753 15748 ill_t *ill = ipif->ipif_ill;
15754 15749 zoneid_t zoneid = ipif->ipif_zoneid;
15755 15750
15756 15751 ip1dbg(("ipif_create_bcast_ires: creating broadcast IREs\n"));
15757 15752
15758 15753 ASSERT(ipif->ipif_flags & IPIF_BROADCAST);
15759 15754 ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT));
15760 15755
15761 15756 if (ipif->ipif_lcl_addr == INADDR_ANY ||
15762 15757 (ipif->ipif_flags & IPIF_NOLOCAL))
15763 15758 netmask = htonl(IN_CLASSA_NET); /* fallback */
15764 15759
15765 15760 irep = ire_create_bcast(ill, 0, zoneid, irep);
15766 15761 irep = ire_create_bcast(ill, INADDR_BROADCAST, zoneid, irep);
15767 15762
15768 15763 /*
15769 15764 * For backward compatibility, we create net broadcast IREs based on
15770 15765 * the old "IP address class system", since some old machines only
15771 15766 * respond to these class derived net broadcast. However, we must not
15772 15767 * create these net broadcast IREs if the subnetmask is shorter than
15773 15768 * the IP address class based derived netmask. Otherwise, we may
15774 15769 * create a net broadcast address which is the same as an IP address
15775 15770 * on the subnet -- and then TCP will refuse to talk to that address.
15776 15771 */
15777 15772 if (netmask < subnetmask) {
15778 15773 addr = netmask & ipif->ipif_subnet;
15779 15774 irep = ire_create_bcast(ill, addr, zoneid, irep);
15780 15775 irep = ire_create_bcast(ill, ~netmask | addr, zoneid, irep);
15781 15776 }
15782 15777
15783 15778 /*
15784 15779 * Don't create IRE_BROADCAST IREs for the interface if the subnetmask
15785 15780 * is 0xFFFFFFFF, as an IRE_LOCAL for that interface is already
15786 15781 * created. Creating these broadcast IREs will only create confusion
15787 15782 * as `addr' will be the same as the IP address.
15788 15783 */
15789 15784 if (subnetmask != 0xFFFFFFFF) {
15790 15785 addr = ipif->ipif_subnet;
15791 15786 irep = ire_create_bcast(ill, addr, zoneid, irep);
15792 15787 irep = ire_create_bcast(ill, ~subnetmask | addr, zoneid, irep);
15793 15788 }
15794 15789
15795 15790 return (irep);
15796 15791 }
15797 15792
/*
 * Mirror of ipif_create_bcast_ires(): look up and delete the set of
 * IRE_BROADCAST entries that function created for `ipif'.  The address
 * computations below must stay in lockstep with the create side.
 */
static void
ipif_delete_bcast_ires(ipif_t *ipif)
{
	ipaddr_t	addr;
	ipaddr_t	netmask = ip_net_mask(ipif->ipif_lcl_addr);
	ipaddr_t	subnetmask = ipif->ipif_net_mask;
	ill_t		*ill = ipif->ipif_ill;
	zoneid_t	zoneid = ipif->ipif_zoneid;
	ire_t		*ire;

	ASSERT(ipif->ipif_flags & IPIF_BROADCAST);
	ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT));

	/* Same fallback netmask selection as ipif_create_bcast_ires(). */
	if (ipif->ipif_lcl_addr == INADDR_ANY ||
	    (ipif->ipif_flags & IPIF_NOLOCAL))
		netmask = htonl(IN_CLASSA_NET); /* fallback */

	/* The all-zeroes and all-ones broadcast IREs always exist. */
	ire = ire_lookup_bcast(ill, 0, zoneid);
	ASSERT(ire != NULL);
	ire_delete(ire); ire_refrele(ire);
	ire = ire_lookup_bcast(ill, INADDR_BROADCAST, zoneid);
	ASSERT(ire != NULL);
	ire_delete(ire); ire_refrele(ire);

	/*
	 * Delete the backward-compatibility net broadcast IREs that
	 * ipif_create_bcast_ires() added for the old "IP address class
	 * system".  They were only created when netmask < subnetmask
	 * (see the create side), so only delete them under the same
	 * condition.
	 */
	if (netmask < subnetmask) {
		addr = netmask & ipif->ipif_subnet;
		ire = ire_lookup_bcast(ill, addr, zoneid);
		ASSERT(ire != NULL);
		ire_delete(ire); ire_refrele(ire);
		ire = ire_lookup_bcast(ill, ~netmask | addr, zoneid);
		ASSERT(ire != NULL);
		ire_delete(ire); ire_refrele(ire);
	}

	/*
	 * A 0xFFFFFFFF subnetmask means no subnet broadcast IREs were
	 * created (an IRE_LOCAL covers that address), so there is
	 * nothing to delete in that case.
	 */
	if (subnetmask != 0xFFFFFFFF) {
		addr = ipif->ipif_subnet;
		ire = ire_lookup_bcast(ill, addr, zoneid);
		ASSERT(ire != NULL);
		ire_delete(ire); ire_refrele(ire);
		ire = ire_lookup_bcast(ill, ~subnetmask | addr, zoneid);
		ASSERT(ire != NULL);
		ire_delete(ire); ire_refrele(ire);
	}
}
15860 15855
15861 15856 /*
15862 15857 * Extract both the flags (including IFF_CANTCHANGE) such as IFF_IPV*
15863 15858 * from lifr_flags and the name from lifr_name.
15864 15859 * Set IFF_IPV* and ill_isv6 prior to doing the lookup
15865 15860 * since ipif_lookup_on_name uses the _isv6 flags when matching.
15866 15861 * Returns EINPROGRESS when mp has been consumed by queueing it on
15867 15862 * ipx_pending_mp and the ioctl will complete in ip_rput.
15868 15863 *
15869 15864 * Can operate on either a module or a driver queue.
15870 15865 * Returns an error if not a module queue.
15871 15866 */
/* ARGSUSED */
int
ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	ill_t		*ill = q->q_ptr;
	phyint_t	*phyi;
	ip_stack_t	*ipst;
	struct lifreq	*lifr = if_req;
	uint64_t	new_flags;

	ASSERT(ipif != NULL);
	ip1dbg(("ip_sioctl_slifname %s\n", lifr->lifr_name));

	/* This ioctl is only valid when IP is pushed as a module. */
	if (q->q_next == NULL) {
		ip1dbg(("if_sioctl_slifname: SIOCSLIFNAME: no q_next\n"));
		return (EINVAL);
	}

	/*
	 * If we are not writer on 'q' then this interface exists already
	 * and previous lookups (ip_extract_lifreq()) found this ipif --
	 * so return EALREADY.
	 */
	if (ill != ipif->ipif_ill)
		return (EALREADY);

	/* The ill has already been named; a rename is not allowed. */
	if (ill->ill_name[0] != '\0')
		return (EALREADY);

	/*
	 * If there's another ill already with the requested name, ensure
	 * that it's of the same type.  Otherwise, ill_phyint_reinit() will
	 * fuse together two unrelated ills, which will cause chaos.
	 */
	ipst = ill->ill_ipst;
	phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
	    lifr->lifr_name, NULL);
	if (phyi != NULL) {
		ill_t *ill_mate = phyi->phyint_illv4;

		/* The phyint must have at least one of v4/v6 ills. */
		if (ill_mate == NULL)
			ill_mate = phyi->phyint_illv6;
		ASSERT(ill_mate != NULL);

		if (ill_mate->ill_media->ip_m_mac_type !=
		    ill->ill_media->ip_m_mac_type) {
			ip1dbg(("if_sioctl_slifname: SIOCSLIFNAME: attempt to "
			    "use the same ill name on differing media\n"));
			return (EINVAL);
		}
	}

	/*
	 * We start off as IFF_IPV4 in ipif_allocate and become
	 * IFF_IPV4 or IFF_IPV6 here depending on lifr_flags value.
	 * The only flags that we read from user space are IFF_IPV4,
	 * IFF_IPV6, and IFF_BROADCAST.
	 *
	 * This ill has not been inserted into the global list.
	 * So we are still single threaded and don't need any lock
	 *
	 * Sanity check the flags.
	 */

	/*
	 * IFF_BROADCAST is rejected for IPv6, and for links with no
	 * broadcast address (once attached, ill_bcast_addr_length is 0
	 * for non-broadcast media).
	 */
	if ((lifr->lifr_flags & IFF_BROADCAST) &&
	    ((lifr->lifr_flags & IFF_IPV6) ||
	    (!ill->ill_needs_attach && ill->ill_bcast_addr_length == 0))) {
		ip1dbg(("ip_sioctl_slifname: link not broadcast capable "
		    "or IPv6 i.e., no broadcast \n"));
		return (EINVAL);
	}

	new_flags =
	    lifr->lifr_flags & (IFF_IPV6|IFF_IPV4|IFF_BROADCAST);

	/* Both IFF_IPV4 and IFF_IPV6 at once is rejected. */
	if ((new_flags ^ (IFF_IPV6|IFF_IPV4)) == 0) {
		ip1dbg(("ip_sioctl_slifname: flags must be exactly one of "
		    "IFF_IPV4 or IFF_IPV6\n"));
		return (EINVAL);
	}

	/*
	 * We always start off as IPv4, so only need to check for IPv6.
	 */
	if ((new_flags & IFF_IPV6) != 0) {
		ill->ill_flags |= ILLF_IPV6;
		ill->ill_flags &= ~ILLF_IPV4;

		if (lifr->lifr_flags & IFF_NOLINKLOCAL)
			ill->ill_flags |= ILLF_NOLINKLOCAL;
	}

	if ((new_flags & IFF_BROADCAST) != 0)
		ipif->ipif_flags |= IPIF_BROADCAST;
	else
		ipif->ipif_flags &= ~IPIF_BROADCAST;

	/* We started off as V4. */
	if (ill->ill_flags & ILLF_IPV6) {
		ill->ill_phyint->phyint_illv6 = ill;
		ill->ill_phyint->phyint_illv4 = NULL;
	}

	return (ipif_set_values(q, mp, lifr->lifr_name, &lifr->lifr_ppa));
}
15978 15973
/* ARGSUSED */
int
ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	/*
	 * ill_phyint_reinit merged the v4 and v6 into a single
	 * ipsq.  We might not have been able to complete the
	 * slifname in ipif_set_values, if we could not become
	 * exclusive.  If so restart it here.
	 */
	return (ipif_set_values_tail(ipif->ipif_ill, ipif, mp, q));
}
15992 15987
15993 15988 /*
15994 15989 * Return a pointer to the ipif which matches the index, IP version type and
15995 15990 * zoneid.
15996 15991 */
15997 15992 ipif_t *
15998 15993 ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid,
15999 15994 ip_stack_t *ipst)
16000 15995 {
16001 15996 ill_t *ill;
16002 15997 ipif_t *ipif = NULL;
16003 15998
16004 15999 ill = ill_lookup_on_ifindex(index, isv6, ipst);
16005 16000 if (ill != NULL) {
16006 16001 mutex_enter(&ill->ill_lock);
16007 16002 for (ipif = ill->ill_ipif; ipif != NULL;
16008 16003 ipif = ipif->ipif_next) {
16009 16004 if (!IPIF_IS_CONDEMNED(ipif) && (zoneid == ALL_ZONES ||
16010 16005 zoneid == ipif->ipif_zoneid ||
16011 16006 ipif->ipif_zoneid == ALL_ZONES)) {
16012 16007 ipif_refhold_locked(ipif);
16013 16008 break;
16014 16009 }
16015 16010 }
16016 16011 mutex_exit(&ill->ill_lock);
16017 16012 ill_refrele(ill);
16018 16013 }
16019 16014 return (ipif);
16020 16015 }
16021 16016
16022 16017 /*
16023 16018 * Change an existing physical interface's index. If the new index
16024 16019 * is acceptable we update the index and the phyint_list_avl_by_index tree.
16025 16020 * Finally, we update other systems which may have a dependence on the
16026 16021 * index value.
16027 16022 */
/* ARGSUSED */
int
ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	ill_t		*ill;
	phyint_t	*phyi;
	struct ifreq	*ifr = (struct ifreq *)ifreq;
	struct lifreq	*lifr = (struct lifreq *)ifreq;
	uint_t	old_index, index;
	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
	avl_index_t	where;

	/* The same buffer holds either an ifreq or a lifreq. */
	if (ipip->ipi_cmd_type == IF_CMD)
		index = ifr->ifr_index;
	else
		index = lifr->lifr_index;

	/*
	 * Only allow on physical interface. Also, index zero is illegal.
	 */
	ill = ipif->ipif_ill;
	phyi = ill->ill_phyint;
	if (ipif->ipif_id != 0 || index == 0 || index > IF_INDEX_MAX) {
		return (EINVAL);
	}

	/* If the index is not changing, no work to do */
	if (phyi->phyint_ifindex == index)
		return (0);

	/*
	 * Use phyint_exists() to determine if the new interface index
	 * is already in use. If the index is unused then we need to
	 * change the phyint's position in the phyint_list_avl_by_index
	 * tree. If we do not do this, subsequent lookups (using the new
	 * index value) will not find the phyint.
	 */
	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
	if (phyint_exists(index, ipst)) {
		rw_exit(&ipst->ips_ill_g_lock);
		return (EEXIST);
	}

	/*
	 * The new index is unused. Set it in the phyint. However we must not
	 * forget to trigger NE_IFINDEX_CHANGE event before the ifindex
	 * changes. The event must be bound to old ifindex value.
	 */
	ill_nic_event_dispatch(ill, 0, NE_IFINDEX_CHANGE,
	    &index, sizeof (index));

	old_index = phyi->phyint_ifindex;
	phyi->phyint_ifindex = index;

	/* Re-insert the phyint at its new position in the by-index AVL. */
	avl_remove(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, phyi);
	(void) avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
	    &index, &where);
	avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
	    phyi, where);
	rw_exit(&ipst->ips_ill_g_lock);

	/* Update SCTP's ILL list */
	sctp_ill_reindex(ill, old_index);

	/* Send the routing sockets message */
	ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
	if (ILL_OTHER(ill))
		ip_rts_ifmsg(ILL_OTHER(ill)->ill_ipif, RTSQ_DEFAULT);

	/* Perhaps ilgs should use this ill */
	update_conn_ill(NULL, ill->ill_ipst);
	return (0);
}
16102 16097
16103 16098 /* ARGSUSED */
16104 16099 int
16105 16100 ip_sioctl_get_lifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16106 16101 ip_ioctl_cmd_t *ipip, void *ifreq)
16107 16102 {
16108 16103 struct ifreq *ifr = (struct ifreq *)ifreq;
16109 16104 struct lifreq *lifr = (struct lifreq *)ifreq;
16110 16105
16111 16106 ip1dbg(("ip_sioctl_get_lifindex(%s:%u %p)\n",
16112 16107 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
16113 16108 /* Get the interface index */
16114 16109 if (ipip->ipi_cmd_type == IF_CMD) {
16115 16110 ifr->ifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex;
16116 16111 } else {
16117 16112 lifr->lifr_index = ipif->ipif_ill->ill_phyint->phyint_ifindex;
16118 16113 }
16119 16114 return (0);
16120 16115 }
16121 16116
16122 16117 /* ARGSUSED */
16123 16118 int
16124 16119 ip_sioctl_get_lifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16125 16120 ip_ioctl_cmd_t *ipip, void *ifreq)
16126 16121 {
16127 16122 struct lifreq *lifr = (struct lifreq *)ifreq;
16128 16123
16129 16124 ip1dbg(("ip_sioctl_get_lifzone(%s:%u %p)\n",
16130 16125 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
16131 16126 /* Get the interface zone */
16132 16127 ASSERT(ipip->ipi_cmd_type == LIF_CMD);
16133 16128 lifr->lifr_zoneid = ipif->ipif_zoneid;
16134 16129 return (0);
16135 16130 }
16136 16131
16137 16132 /*
16138 16133 * Set the zoneid of an interface.
16139 16134 */
/* ARGSUSED */
int
ip_sioctl_slifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	struct lifreq *lifr = (struct lifreq *)ifreq;
	int err = 0;
	boolean_t need_up = B_FALSE;
	zone_t *zptr;
	zone_status_t status;
	zoneid_t zoneid;

	ASSERT(ipip->ipi_cmd_type == LIF_CMD);
	if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) {
		/* ALL_ZONES is only accepted on labeled systems. */
		if (!is_system_labeled())
			return (ENOTSUP);
		/* Validate ALL_ZONES as if it were the global zone. */
		zoneid = GLOBAL_ZONEID;
	}

	/* cannot assign instance zero to a non-global zone */
	if (ipif->ipif_id == 0 && zoneid != GLOBAL_ZONEID)
		return (ENOTSUP);

	/*
	 * Cannot assign to a zone that doesn't exist or is shutting down. In
	 * the event of a race with the zone shutdown processing, since IP
	 * serializes this ioctl and SIOCGLIFCONF/SIOCLIFREMOVEIF, we know the
	 * interface will be cleaned up even if the zone is shut down
	 * immediately after the status check. If the interface can't be brought
	 * down right away, and the zone is shut down before the restart
	 * function is called, we resolve the possible races by rechecking the
	 * zone status in the restart function.
	 */
	if ((zptr = zone_find_by_id(zoneid)) == NULL)
		return (EINVAL);
	status = zone_status_get(zptr);
	zone_rele(zptr);

	if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING)
		return (EINVAL);

	if (ipif->ipif_flags & IPIF_UP) {
		/*
		 * If the interface is already marked up,
		 * we call ipif_down which will take care
		 * of ditching any IREs that have been set
		 * up based on the old interface address.
		 */
		err = ipif_logical_down(ipif, q, mp);
		if (err == EINPROGRESS)
			return (err);
		(void) ipif_down_tail(ipif);
		need_up = B_TRUE;
	}

	/*
	 * Note: the original lifr_zoneid (possibly ALL_ZONES), not the
	 * rewritten local `zoneid', is what gets committed by the tail.
	 */
	err = ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, need_up);
	return (err);
}
16198 16193
/*
 * Finish a zoneid change: commit the new zoneid on `ipif', notify SCTP
 * and multicast state, and bring the interface back up if `need_up'.
 */
static int
ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid,
    queue_t *q, mblk_t *mp, boolean_t need_up)
{
	int err = 0;
	ip_stack_t *ipst;

	ip1dbg(("ip_sioctl_zoneid_tail(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

	/* Find the IP stack from whichever kind of queue we were given. */
	if (CONN_Q(q))
		ipst = CONNQ_TO_IPST(q);
	else
		ipst = ILLQ_TO_IPST(q);

	/*
	 * For exclusive stacks we don't allow a different zoneid than
	 * global.
	 */
	if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID &&
	    zoneid != GLOBAL_ZONEID)
		return (EINVAL);

	/* Set the new zone id. */
	ipif->ipif_zoneid = zoneid;

	/* Update sctp list */
	sctp_update_ipif(ipif, SCTP_IPIF_UPDATE);

	/* The default multicast interface might have changed */
	ire_increment_multicast_generation(ipst, ipif->ipif_ill->ill_isv6);

	if (need_up) {
		/*
		 * Now bring the interface back up. If this
		 * is the only IPIF for the ILL, ipif_up
		 * will have to re-bind to the device, so
		 * we may get back EINPROGRESS, in which
		 * case, this IOCTL will get completed in
		 * ip_rput_dlpi when we see the DL_BIND_ACK.
		 */
		err = ipif_up(ipif, q, mp);
	}
	return (err);
}
16244 16239
/* ARGSUSED */
int
ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	struct lifreq *lifr = (struct lifreq *)if_req;
	zoneid_t zoneid;
	zone_t *zptr;
	zone_status_t status;

	ASSERT(ipip->ipi_cmd_type == LIF_CMD);
	/* As in ip_sioctl_slifzone(), validate ALL_ZONES as the global zone */
	if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES)
		zoneid = GLOBAL_ZONEID;

	ip1dbg(("ip_sioctl_slifzone_restart(%s:%u %p)\n",
	    ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));

	/*
	 * We recheck the zone status to resolve the following race condition:
	 * 1) process sends SIOCSLIFZONE to put hme0:1 in zone "myzone";
	 * 2) hme0:1 is up and can't be brought down right away;
	 * ip_sioctl_slifzone() returns EINPROGRESS and the request is queued;
	 * 3) zone "myzone" is halted; the zone status switches to
	 * 'shutting_down' and the zones framework sends SIOCGLIFCONF to list
	 * the interfaces to remove - hme0:1 is not returned because it's not
	 * yet in "myzone", so it won't be removed;
	 * 4) the restart function for SIOCSLIFZONE is called; without the
	 * status check here, we would have hme0:1 in "myzone" after it's been
	 * destroyed.
	 * Note that if the status check fails, we need to bring the interface
	 * back to its state prior to ip_sioctl_slifzone(), hence the call to
	 * ipif_up_done[_v6]().
	 */
	status = ZONE_IS_UNINITIALIZED;
	if ((zptr = zone_find_by_id(zoneid)) != NULL) {
		status = zone_status_get(zptr);
		zone_rele(zptr);
	}
	if (status != ZONE_IS_READY && status != ZONE_IS_RUNNING) {
		/* Zone went away: undo by completing the interrupted up. */
		if (ipif->ipif_isv6) {
			(void) ipif_up_done_v6(ipif);
		} else {
			(void) ipif_up_done(ipif);
		}
		return (EINVAL);
	}

	(void) ipif_down_tail(ipif);

	return (ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp,
	    B_TRUE));
}
16297 16292
16298 16293 /*
16299 16294 * Return the number of addresses on `ill' with one or more of the values
16300 16295 * in `set' set and all of the values in `clear' clear.
16301 16296 */
16302 16297 static uint_t
16303 16298 ill_flagaddr_cnt(const ill_t *ill, uint64_t set, uint64_t clear)
16304 16299 {
16305 16300 ipif_t *ipif;
16306 16301 uint_t cnt = 0;
16307 16302
16308 16303 ASSERT(IAM_WRITER_ILL(ill));
16309 16304
16310 16305 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
16311 16306 if ((ipif->ipif_flags & set) && !(ipif->ipif_flags & clear))
16312 16307 cnt++;
16313 16308
16314 16309 return (cnt);
16315 16310 }
16316 16311
/*
 * Return the number of migratable addresses on `ill' that are under
 * application control: those with IPIF_DHCPRUNNING or IPIF_ADDRCONF
 * set and IPIF_NOFAILOVER clear.
 */
uint_t
ill_appaddr_cnt(const ill_t *ill)
{
	return (ill_flagaddr_cnt(ill, IPIF_DHCPRUNNING | IPIF_ADDRCONF,
	    IPIF_NOFAILOVER));
}
16327 16322
/*
 * Return the number of point-to-point addresses on `ill' (those with
 * IPIF_POINTOPOINT set; no flags are required to be clear).
 */
uint_t
ill_ptpaddr_cnt(const ill_t *ill)
{
	return (ill_flagaddr_cnt(ill, IPIF_POINTOPOINT, 0));
}
16336 16331
16337 16332 /* ARGSUSED */
16338 16333 int
16339 16334 ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16340 16335 ip_ioctl_cmd_t *ipip, void *ifreq)
16341 16336 {
16342 16337 struct lifreq *lifr = ifreq;
16343 16338
16344 16339 ASSERT(q->q_next == NULL);
16345 16340 ASSERT(CONN_Q(q));
16346 16341
16347 16342 ip1dbg(("ip_sioctl_get_lifusesrc(%s:%u %p)\n",
16348 16343 ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
16349 16344 lifr->lifr_index = ipif->ipif_ill->ill_usesrc_ifindex;
16350 16345 ip1dbg(("ip_sioctl_get_lifusesrc:lifr_index = %d\n", lifr->lifr_index));
16351 16346
16352 16347 return (0);
16353 16348 }
16354 16349
16355 16350 /* Find the previous ILL in this usesrc group */
16356 16351 static ill_t *
16357 16352 ill_prev_usesrc(ill_t *uill)
16358 16353 {
16359 16354 ill_t *ill;
16360 16355
16361 16356 for (ill = uill->ill_usesrc_grp_next;
16362 16357 ASSERT(ill), ill->ill_usesrc_grp_next != uill;
16363 16358 ill = ill->ill_usesrc_grp_next)
16364 16359 /* do nothing */;
16365 16360 return (ill);
16366 16361 }
16367 16362
16368 16363 /*
16369 16364 * Release all members of the usesrc group. This routine is called
16370 16365 * from ill_delete when the interface being unplumbed is the
16371 16366 * group head.
16372 16367 *
16373 16368 * This silently clears the usesrc that ifconfig setup.
16374 16369 * An alternative would be to keep that ifindex, and drop packets on the floor
16375 16370 * since no source address can be selected.
16376 16371 * Even if we keep the current semantics, don't need a lock and a linked list.
16377 16372 * Can walk all the ills checking if they have a ill_usesrc_ifindex matching
16378 16373 * the one that is being removed. Issue is how we return the usesrc users
16379 16374 * (SIOCGLIFSRCOF). We want to be able to find the ills which have an
16380 16375 * ill_usesrc_ifindex matching a target ill. We could also do that with an
16381 16376 * ill walk, but the walker would need to insert in the ioctl response.
16382 16377 */
16383 16378 static void
16384 16379 ill_disband_usesrc_group(ill_t *uill)
16385 16380 {
16386 16381 ill_t *next_ill, *tmp_ill;
16387 16382 ip_stack_t *ipst = uill->ill_ipst;
16388 16383
16389 16384 ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock));
16390 16385 next_ill = uill->ill_usesrc_grp_next;
16391 16386
16392 16387 do {
16393 16388 ASSERT(next_ill != NULL);
16394 16389 tmp_ill = next_ill->ill_usesrc_grp_next;
16395 16390 ASSERT(tmp_ill != NULL);
16396 16391 next_ill->ill_usesrc_grp_next = NULL;
16397 16392 next_ill->ill_usesrc_ifindex = 0;
16398 16393 next_ill = tmp_ill;
16399 16394 } while (next_ill->ill_usesrc_ifindex != 0);
16400 16395 uill->ill_usesrc_grp_next = NULL;
16401 16396 }
16402 16397
/*
 * Remove the client usesrc ILL `ucill' from its current group list and,
 * when `ifindex' is non-zero, relink it into the group headed by `uill'.
 * An `ifindex' of zero is a pure removal.  Returns 0 on success, -1 when
 * the ILLs are not in a relinkable state.
 */
int
ill_relink_usesrc_ills(ill_t *ucill, ill_t *uill, uint_t ifindex)
{
	ill_t *ill, *tmp_ill;
	ip_stack_t *ipst = ucill->ill_ipst;

	ASSERT((ucill != NULL) && (ucill->ill_usesrc_grp_next != NULL) &&
	    (uill != NULL) && RW_WRITE_HELD(&ipst->ips_ill_g_usesrc_lock));

	/*
	 * Check if the usesrc client ILL passed in is not already
	 * in use as a usesrc ILL i.e one whose source address is
	 * in use OR a usesrc ILL is not already in use as a usesrc
	 * client ILL
	 */
	if ((ucill->ill_usesrc_ifindex == 0) ||
	    (uill->ill_usesrc_ifindex != 0)) {
		return (-1);
	}

	ill = ill_prev_usesrc(ucill);
	ASSERT(ill->ill_usesrc_grp_next != NULL);

	/* Remove from the current list */
	if (ill->ill_usesrc_grp_next->ill_usesrc_grp_next == ill) {
		/* Only two elements in the list */
		ASSERT(ill->ill_usesrc_ifindex == 0);
		ill->ill_usesrc_grp_next = NULL;
	} else {
		/* Bypass ucill: link its predecessor to its successor. */
		ill->ill_usesrc_grp_next = ucill->ill_usesrc_grp_next;
	}

	/* ifindex zero means pure removal: leave ucill ungrouped. */
	if (ifindex == 0) {
		ucill->ill_usesrc_ifindex = 0;
		ucill->ill_usesrc_grp_next = NULL;
		return (0);
	}

	/* Insert ucill right after the head `uill' in the circular list. */
	ucill->ill_usesrc_ifindex = ifindex;
	tmp_ill = uill->ill_usesrc_grp_next;
	uill->ill_usesrc_grp_next = ucill;
	ucill->ill_usesrc_grp_next =
	    (tmp_ill != NULL) ? tmp_ill : uill;
	return (0);
}
16451 16446
16452 16447 /*
16453 16448 * Set the ill_usesrc and ill_usesrc_head fields. See synchronization notes in
16454 16449 * ip.c for locking details.
16455 16450 */
/* ARGSUSED */
int
ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *ifreq)
{
	struct lifreq *lifr = (struct lifreq *)ifreq;
	boolean_t isv6 = B_FALSE, reset_flg = B_FALSE;
	ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill;
	int err = 0, ret;
	uint_t ifindex;
	ipsq_t *ipsq = NULL;
	ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;

	ASSERT(IAM_WRITER_IPIF(ipif));
	ASSERT(q->q_next == NULL);
	ASSERT(CONN_Q(q));

	isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6;

	/* An ifindex of zero requests a reset of any existing grouping. */
	ifindex = lifr->lifr_index;
	if (ifindex == 0) {
		if (usesrc_cli_ill->ill_usesrc_grp_next == NULL) {
			/* non usesrc group interface, nothing to reset */
			return (0);
		}
		ifindex = usesrc_cli_ill->ill_usesrc_ifindex;
		/* valid reset request */
		reset_flg = B_TRUE;
	}

	usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
	if (usesrc_ill == NULL)
		return (ENXIO);
	/* An interface cannot use itself as its usesrc ill. */
	if (usesrc_ill == ipif->ipif_ill) {
		ill_refrele(usesrc_ill);
		return (EINVAL);
	}

	/* Serialize against other exclusive operations on the usesrc ill. */
	ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl,
	    NEW_OP, B_TRUE);
	if (ipsq == NULL) {
		err = EINPROGRESS;
		/* Operation enqueued on the ipsq of the usesrc ILL */
		goto done;
	}

	/* USESRC isn't currently supported with IPMP */
	if (IS_IPMP(usesrc_ill) || IS_UNDER_IPMP(usesrc_ill)) {
		err = ENOTSUP;
		goto done;
	}

	/*
	 * USESRC isn't compatible with the STANDBY flag.  (STANDBY is only
	 * used by IPMP underlying interfaces, but someone might think it's
	 * more general and try to use it independently with VNI.)
	 */
	if (usesrc_ill->ill_phyint->phyint_flags & PHYI_STANDBY) {
		err = ENOTSUP;
		goto done;
	}

	/*
	 * If the client is already in use as a usesrc_ill or a usesrc_ill is
	 * already a client then return EINVAL
	 */
	if (IS_USESRC_ILL(usesrc_cli_ill) || IS_USESRC_CLI_ILL(usesrc_ill)) {
		err = EINVAL;
		goto done;
	}

	/*
	 * If the ill_usesrc_ifindex field is already set to what it needs to
	 * be then this is a duplicate operation.
	 */
	if (!reset_flg && usesrc_cli_ill->ill_usesrc_ifindex == ifindex) {
		err = 0;
		goto done;
	}

	ip1dbg(("ip_sioctl_slifusesrc: usesrc_cli_ill %s, usesrc_ill %s,"
	    " v6 = %d", usesrc_cli_ill->ill_name, usesrc_ill->ill_name,
	    usesrc_ill->ill_isv6));

	/*
	 * ill_g_usesrc_lock global lock protects the ill_usesrc_grp_next
	 * and the ill_usesrc_ifindex fields
	 */
	rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);

	if (reset_flg) {
		ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill, 0);
		if (ret != 0) {
			err = EINVAL;
		}
		rw_exit(&ipst->ips_ill_g_usesrc_lock);
		goto done;
	}

	/*
	 * Four possibilities to consider:
	 * 1. Both usesrc_ill and usesrc_cli_ill are not part of any usesrc grp
	 * 2. usesrc_ill is part of a group but usesrc_cli_ill isn't
	 * 3. usesrc_cli_ill is part of a group but usesrc_ill isn't
	 * 4. Both are part of their respective usesrc groups
	 */
	if ((usesrc_ill->ill_usesrc_grp_next == NULL) &&
	    (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) {
		/* Case 1: form a new two-element circular list. */
		ASSERT(usesrc_ill->ill_usesrc_ifindex == 0);
		usesrc_cli_ill->ill_usesrc_ifindex = ifindex;
		usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill;
		usesrc_cli_ill->ill_usesrc_grp_next = usesrc_ill;
	} else if ((usesrc_ill->ill_usesrc_grp_next != NULL) &&
	    (usesrc_cli_ill->ill_usesrc_grp_next == NULL)) {
		/* Case 2: add the client to the existing group. */
		usesrc_cli_ill->ill_usesrc_ifindex = ifindex;
		/* Insert at head of list */
		usesrc_cli_ill->ill_usesrc_grp_next =
		    usesrc_ill->ill_usesrc_grp_next;
		usesrc_ill->ill_usesrc_grp_next = usesrc_cli_ill;
	} else {
		/* Cases 3 and 4: let ill_relink_usesrc_ills() do the work. */
		ret = ill_relink_usesrc_ills(usesrc_cli_ill, usesrc_ill,
		    ifindex);
		if (ret != 0)
			err = EINVAL;
	}
	rw_exit(&ipst->ips_ill_g_usesrc_lock);

done:
	if (ipsq != NULL)
		ipsq_exit(ipsq);
	/* The refrele on the lifr_name ipif is done by ip_process_ioctl */
	ill_refrele(usesrc_ill);

	/* Let conn_ixa caching know that source address selection changed */
	ip_update_source_selection(ipst);

	return (err);
}
16594 16589
16595 16590 /* ARGSUSED */
16596 16591 int
16597 16592 ip_sioctl_get_dadstate(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
16598 16593 ip_ioctl_cmd_t *ipip, void *if_req)
16599 16594 {
16600 16595 struct lifreq *lifr = (struct lifreq *)if_req;
16601 16596 ill_t *ill = ipif->ipif_ill;
16602 16597
16603 16598 /*
16604 16599 * Need a lock since IFF_UP can be set even when there are
16605 16600 * references to the ipif.
16606 16601 */
16607 16602 mutex_enter(&ill->ill_lock);
16608 16603 if ((ipif->ipif_flags & IPIF_UP) && ipif->ipif_addr_ready == 0)
16609 16604 lifr->lifr_dadstate = DAD_IN_PROGRESS;
16610 16605 else
16611 16606 lifr->lifr_dadstate = DAD_DONE;
16612 16607 mutex_exit(&ill->ill_lock);
16613 16608 return (0);
16614 16609 }
16615 16610
16616 16611 /*
16617 16612 * comparison function used by avl.
16618 16613 */
16619 16614 static int
16620 16615 ill_phyint_compare_index(const void *index_ptr, const void *phyip)
16621 16616 {
16622 16617
16623 16618 uint_t index;
16624 16619
16625 16620 ASSERT(phyip != NULL && index_ptr != NULL);
16626 16621
16627 16622 index = *((uint_t *)index_ptr);
16628 16623 /*
16629 16624 * let the phyint with the lowest index be on top.
16630 16625 */
16631 16626 if (((phyint_t *)phyip)->phyint_ifindex < index)
16632 16627 return (1);
16633 16628 if (((phyint_t *)phyip)->phyint_ifindex > index)
16634 16629 return (-1);
16635 16630 return (0);
16636 16631 }
16637 16632
16638 16633 /*
16639 16634 * comparison function used by avl.
16640 16635 */
16641 16636 static int
16642 16637 ill_phyint_compare_name(const void *name_ptr, const void *phyip)
16643 16638 {
16644 16639 ill_t *ill;
16645 16640 int res = 0;
16646 16641
16647 16642 ASSERT(phyip != NULL && name_ptr != NULL);
16648 16643
16649 16644 if (((phyint_t *)phyip)->phyint_illv4)
16650 16645 ill = ((phyint_t *)phyip)->phyint_illv4;
16651 16646 else
16652 16647 ill = ((phyint_t *)phyip)->phyint_illv6;
16653 16648 ASSERT(ill != NULL);
16654 16649
16655 16650 res = strcmp(ill->ill_name, (char *)name_ptr);
16656 16651 if (res > 0)
16657 16652 return (1);
16658 16653 else if (res < 0)
16659 16654 return (-1);
16660 16655 return (0);
16661 16656 }
16662 16657
/*
 * This function is called on the unplumb path via ill_glist_delete() when
 * there are no ills left on the phyint and thus the phyint can be freed.
 */
static void
phyint_free(phyint_t *phyi)
{
	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);

	/* Both ills must already have been detached from this phyint. */
	ASSERT(phyi->phyint_illv4 == NULL && phyi->phyint_illv6 == NULL);

	/*
	 * If this phyint was an IPMP meta-interface, blow away the group.
	 * This is safe to do because all of the illgrps have already been
	 * removed by I_PUNLINK, and thus SIOCSLIFGROUPNAME cannot find us.
	 * If we're cleaning up as a result of failed initialization,
	 * phyint_grp may be NULL.
	 */
	if ((phyi->phyint_flags & PHYI_IPMP) && (phyi->phyint_grp != NULL)) {
		rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
		ipmp_grp_destroy(phyi->phyint_grp);
		phyi->phyint_grp = NULL;
		rw_exit(&ipst->ips_ipmp_lock);
	}

	/*
	 * If this interface was under IPMP, take it out of the group.
	 */
	if (phyi->phyint_grp != NULL)
		ipmp_phyint_leave_grp(phyi);

	/*
	 * Delete the phyint and disassociate its ipsq. The ipsq itself
	 * will be freed in ipsq_exit().
	 */
	phyi->phyint_ipsq->ipsq_phyint = NULL;
	/* NOTE(review): name cleared before free, presumably defensive. */
	phyi->phyint_name[0] = '\0';

	mi_free(phyi);
}
16703 16698
/*
 * Attach the ill to the phyint structure which can be shared by both
 * IPv4 and IPv6 ill. ill_init allocates a phyint to just hold flags. This
 * function is called from ipif_set_values and ill_lookup_on_name (for
 * loopback) where we know the name of the ill. We lookup the ill and if
 * there is one present already with the name use that phyint. Otherwise
 * reuse the one allocated by ill_init.
 *
 * Caller must hold ips_ill_g_lock as writer (asserted below).
 */
static void
ill_phyint_reinit(ill_t *ill)
{
	boolean_t isv6 = ill->ill_isv6;
	phyint_t *phyi_old;
	phyint_t *phyi;
	avl_index_t where = 0;
	ill_t *ill_other = NULL;
	ip_stack_t *ipst = ill->ill_ipst;

	ASSERT(RW_WRITE_HELD(&ipst->ips_ill_g_lock));

	/* The interim phyint from ill_init holds only this one ill. */
	phyi_old = ill->ill_phyint;
	ASSERT(isv6 || (phyi_old->phyint_illv4 == ill &&
	    phyi_old->phyint_illv6 == NULL));
	ASSERT(!isv6 || (phyi_old->phyint_illv6 == ill &&
	    phyi_old->phyint_illv4 == NULL));
	ASSERT(phyi_old->phyint_ifindex == 0);

	/*
	 * Now that our ill has a name, set it in the phyint.
	 */
	(void) strlcpy(ill->ill_phyint->phyint_name, ill->ill_name, LIFNAMSIZ);

	phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
	    ill->ill_name, &where);

	/*
	 * 1. We grabbed the ill_g_lock before inserting this ill into
	 * the global list of ills. So no other thread could have located
	 * this ill and hence the ipsq of this ill is guaranteed to be empty.
	 * 2. Now locate the other protocol instance of this ill.
	 * 3. Now grab both ill locks in the right order, and the phyint lock of
	 * the new ipsq. Holding ill locks + ill_g_lock ensures that the ipsq
	 * of neither ill can change.
	 * 4. Merge the phyint and thus the ipsq as well of this ill onto the
	 * other ill.
	 * 5. Release all locks.
	 */

	/*
	 * Look for IPv4 if we are initializing IPv6 or look for IPv6 if
	 * we are initializing IPv4.
	 */
	if (phyi != NULL) {
		/* A phyint with this name exists: merge onto it. */
		ill_other = (isv6) ? phyi->phyint_illv4 : phyi->phyint_illv6;
		ASSERT(ill_other->ill_phyint != NULL);
		ASSERT((isv6 && !ill_other->ill_isv6) ||
		    (!isv6 && ill_other->ill_isv6));
		GRAB_ILL_LOCKS(ill, ill_other);
		/*
		 * We are potentially throwing away phyint_flags which
		 * could be different from the one that we obtain from
		 * ill_other->ill_phyint. But it is okay as we are assuming
		 * that the state maintained within IP is correct.
		 */
		mutex_enter(&phyi->phyint_lock);
		if (isv6) {
			ASSERT(phyi->phyint_illv6 == NULL);
			phyi->phyint_illv6 = ill;
		} else {
			ASSERT(phyi->phyint_illv4 == NULL);
			phyi->phyint_illv4 = ill;
		}

		/*
		 * Delete the old phyint and make its ipsq eligible
		 * to be freed in ipsq_exit().
		 */
		phyi_old->phyint_illv4 = NULL;
		phyi_old->phyint_illv6 = NULL;
		phyi_old->phyint_ipsq->ipsq_phyint = NULL;
		phyi_old->phyint_name[0] = '\0';
		mi_free(phyi_old);
	} else {
		/*
		 * No existing phyint with this name: keep the one from
		 * ill_init and insert it into the global AVL trees.
		 * ill_lock is taken here so that RELEASE_ILL_LOCKS at the
		 * end of the function can drop it (ill_other is NULL on
		 * this path).
		 */
		mutex_enter(&ill->ill_lock);
		/*
		 * We don't need to acquire any lock, since
		 * the ill is not yet visible globally and we
		 * have not yet released the ill_g_lock.
		 */
		phyi = phyi_old;
		mutex_enter(&phyi->phyint_lock);
		/* XXX We need a recovery strategy here. */
		if (!phyint_assign_ifindex(phyi, ipst))
			cmn_err(CE_PANIC, "phyint_assign_ifindex() failed");

		avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
		    (void *)phyi, where);

		(void) avl_find(&ipst->ips_phyint_g_list->
		    phyint_list_avl_by_index,
		    &phyi->phyint_ifindex, &where);
		avl_insert(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
		    (void *)phyi, where);
	}

	/*
	 * Reassigning ill_phyint automatically reassigns the ipsq also.
	 * pending mp is not affected because that is per ill basis.
	 */
	ill->ill_phyint = phyi;

	/*
	 * Now that the phyint's ifindex has been assigned, complete the
	 * remaining
	 */
	ill->ill_ip_mib->ipIfStatsIfIndex = ill->ill_phyint->phyint_ifindex;
	if (ill->ill_isv6) {
		ill->ill_icmp6_mib->ipv6IfIcmpIfIndex =
		    ill->ill_phyint->phyint_ifindex;
		ill->ill_mcast_type = ipst->ips_mld_max_version;
	} else {
		ill->ill_mcast_type = ipst->ips_igmp_max_version;
	}

	/*
	 * Generate an event within the hooks framework to indicate that
	 * a new interface has just been added to IP. For this event to
	 * be generated, the network interface must, at least, have an
	 * ifindex assigned to it. (We don't generate the event for
	 * loopback since ill_lookup_on_name() has its own NE_PLUMB event.)
	 *
	 * This needs to be run inside the ill_g_lock perimeter to ensure
	 * that the ordering of delivered events to listeners matches the
	 * order of them in the kernel.
	 */
	if (!IS_LOOPBACK(ill)) {
		ill_nic_event_dispatch(ill, 0, NE_PLUMB, ill->ill_name,
		    ill->ill_name_length);
	}
	RELEASE_ILL_LOCKS(ill, ill_other);
	mutex_exit(&phyi->phyint_lock);
}
16846 16841
/*
 * Notify any downstream modules of the name of this interface.
 * An M_IOCTL is used even though we don't expect a successful reply.
 * Any reply message from the driver (presumably an M_IOCNAK) will
 * eventually get discarded somewhere upstream. The message format is
 * simply an SIOCSLIFNAME ioctl just as might be sent from ifconfig
 * to IP.
 */
static void
ip_ifname_notify(ill_t *ill, queue_t *q)
{
	mblk_t *mp1, *mp2;
	struct iocblk *iocp;
	struct lifreq *lifr;

	/* M_IOCTL header block; on allocation failure give up silently. */
	mp1 = mkiocb(SIOCSLIFNAME);
	if (mp1 == NULL)
		return;
	/* Payload block carrying the lifreq. */
	mp2 = allocb(sizeof (struct lifreq), BPRI_HI);
	if (mp2 == NULL) {
		freeb(mp1);
		return;
	}

	mp1->b_cont = mp2;
	iocp = (struct iocblk *)mp1->b_rptr;
	iocp->ioc_count = sizeof (struct lifreq);

	/* bzero guarantees lifr_name is NUL-terminated after strncpy. */
	lifr = (struct lifreq *)mp2->b_rptr;
	mp2->b_wptr += sizeof (struct lifreq);
	bzero(lifr, sizeof (struct lifreq));

	(void) strncpy(lifr->lifr_name, ill->ill_name, LIFNAMSIZ);
	lifr->lifr_ppa = ill->ill_ppa;
	lifr->lifr_flags = (ill->ill_flags & (ILLF_IPV4|ILLF_IPV6));

	DTRACE_PROBE3(ill__dlpi, char *, "ip_ifname_notify",
	    char *, "SIOCSLIFNAME", ill_t *, ill);
	putnext(q, mp1);
}
16887 16882
/*
 * Finish SIOCSLIFNAME processing once the ill has its name: perform the
 * IPMP meta-interface group setup, notify downstream modules of the name,
 * start the DLPI bind via ill_dl_phys(), and arm the multicast slow
 * timeout for the ill's address family.
 *
 * Returns EINPROGRESS in the usual case (ill_dl_phys consumed mp), or an
 * error such as ENOMEM/EEXIST.
 */
static int
ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
{
	int err;
	ip_stack_t *ipst = ill->ill_ipst;
	phyint_t *phyi = ill->ill_phyint;

	/*
	 * Now that ill_name is set, the configuration for the IPMP
	 * meta-interface can be performed.
	 */
	if (IS_IPMP(ill)) {
		rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
		/*
		 * If phyi->phyint_grp is NULL, then this is the first IPMP
		 * meta-interface and we need to create the IPMP group.
		 */
		if (phyi->phyint_grp == NULL) {
			/*
			 * If someone has renamed another IPMP group to have
			 * the same name as our interface, bail.
			 */
			if (ipmp_grp_lookup(ill->ill_name, ipst) != NULL) {
				rw_exit(&ipst->ips_ipmp_lock);
				return (EEXIST);
			}
			phyi->phyint_grp = ipmp_grp_create(ill->ill_name, phyi);
			if (phyi->phyint_grp == NULL) {
				rw_exit(&ipst->ips_ipmp_lock);
				return (ENOMEM);
			}
		}
		rw_exit(&ipst->ips_ipmp_lock);
	}

	/* Tell downstream modules where they are. */
	ip_ifname_notify(ill, q);

	/*
	 * ill_dl_phys returns EINPROGRESS in the usual case.
	 * Error cases are ENOMEM ...
	 */
	err = ill_dl_phys(ill, ipif, mp, q);

	/*
	 * Arm the MLD (v6) or IGMP (v4) slow timeout for this stack if it
	 * is not already running; the id check is done under the
	 * corresponding slowtimeout lock.
	 */
	if (ill->ill_isv6) {
		mutex_enter(&ipst->ips_mld_slowtimeout_lock);
		if (ipst->ips_mld_slowtimeout_id == 0) {
			ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo,
			    (void *)ipst,
			    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
		}
		mutex_exit(&ipst->ips_mld_slowtimeout_lock);
	} else {
		mutex_enter(&ipst->ips_igmp_slowtimeout_lock);
		if (ipst->ips_igmp_slowtimeout_id == 0) {
			ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo,
			    (void *)ipst,
			    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
		}
		mutex_exit(&ipst->ips_igmp_slowtimeout_lock);
	}

	return (err);
}
16952 16947
/*
 * Common routine for ppa and ifname setting. Should be called exclusive.
 *
 * Returns EINPROGRESS when mp has been consumed by queueing it on
 * ipx_pending_mp and the ioctl will complete in ip_rput.
 *
 * NOTE : If ppa is UNIT_MAX, we assign the next valid ppa and return
 * the new name and new ppa in lifr_name and lifr_ppa respectively.
 * For SLIFNAME, we pass these values back to the userland.
 */
static int
ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr)
{
	ill_t *ill;
	ipif_t *ipif;
	ipsq_t *ipsq;
	char *ppa_ptr;
	char *old_ptr;
	char old_char;
	int error;
	ip_stack_t *ipst;

	ip1dbg(("ipif_set_values: interface %s\n", interf_name));
	ASSERT(q->q_next != NULL);
	ASSERT(interf_name != NULL);

	ill = (ill_t *)q->q_ptr;
	ipst = ill->ill_ipst;

	/* The ill must be freshly allocated: unnamed and with no ppa yet. */
	ASSERT(ill->ill_ipst != NULL);
	ASSERT(ill->ill_name[0] == '\0');
	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT((mi_strlen(interf_name) + 1) <= LIFNAMSIZ);
	ASSERT(ill->ill_ppa == UINT_MAX);

	ill->ill_defend_start = ill->ill_defend_count = 0;
	/* The ppa is sent down by ifconfig or is chosen */
	if ((ppa_ptr = ill_get_ppa_ptr(interf_name)) == NULL) {
		return (EINVAL);
	}

	/*
	 * make sure ppa passed in is same as ppa in the name.
	 * This check is not made when ppa == UINT_MAX in that case ppa
	 * in the name could be anything. System will choose a ppa and
	 * update new_ppa_ptr and inter_name to contain the choosen ppa.
	 */
	if (*new_ppa_ptr != UINT_MAX) {
		/* stoi changes the pointer */
		old_ptr = ppa_ptr;
		/*
		 * ifconfig passed in 0 for the ppa for DLPI 1 style devices
		 * (they don't have an externally visible ppa). We assign one
		 * here so that we can manage the interface. Note that in
		 * the past this value was always 0 for DLPI 1 drivers.
		 */
		if (*new_ppa_ptr == 0)
			*new_ppa_ptr = stoi(&old_ptr);
		else if (*new_ppa_ptr != (uint_t)stoi(&old_ptr))
			return (EINVAL);
	}
	/*
	 * terminate string before ppa
	 * save char at that location.
	 */
	old_char = ppa_ptr[0];
	ppa_ptr[0] = '\0';

	ill->ill_ppa = *new_ppa_ptr;
	/*
	 * Finish as much work now as possible before calling ill_glist_insert
	 * which makes the ill globally visible and also merges it with the
	 * other protocol instance of this phyint. The remaining work is
	 * done after entering the ipsq which may happen sometime later.
	 */
	ipif = ill->ill_ipif;

	/* We didn't do this when we allocated ipif in ip_ll_subnet_defaults */
	ipif_assign_seqid(ipif);

	/* Default to IPv4 if neither address family flag was supplied. */
	if (!(ill->ill_flags & (ILLF_IPV4|ILLF_IPV6)))
		ill->ill_flags |= ILLF_IPV4;

	ASSERT(ipif->ipif_next == NULL);	/* Only one ipif on ill */
	ASSERT((ipif->ipif_flags & IPIF_UP) == 0);

	if (ill->ill_flags & ILLF_IPV6) {

		ill->ill_isv6 = B_TRUE;
		ill_set_inputfn(ill);
		if (ill->ill_rq != NULL) {
			ill->ill_rq->q_qinfo = &iprinitv6;
		}

		/* Keep the !IN6_IS_ADDR_V4MAPPED assertions happy */
		ipif->ipif_v6lcl_addr = ipv6_all_zeros;
		ipif->ipif_v6subnet = ipv6_all_zeros;
		ipif->ipif_v6net_mask = ipv6_all_zeros;
		ipif->ipif_v6brd_addr = ipv6_all_zeros;
		ipif->ipif_v6pp_dst_addr = ipv6_all_zeros;
		ill->ill_reachable_retrans_time = ND_RETRANS_TIMER;
		/*
		 * point-to-point or Non-mulicast capable
		 * interfaces won't do NUD unless explicitly
		 * configured to do so.
		 */
		if (ipif->ipif_flags & IPIF_POINTOPOINT ||
		    !(ill->ill_flags & ILLF_MULTICAST)) {
			ill->ill_flags |= ILLF_NONUD;
		}
		/* Make sure IPv4 specific flag is not set on IPv6 if */
		if (ill->ill_flags & ILLF_NOARP) {
			/*
			 * Note: xresolv interfaces will eventually need
			 * NOARP set here as well, but that will require
			 * those external resolvers to have some
			 * knowledge of that flag and act appropriately.
			 * Not to be changed at present.
			 */
			ill->ill_flags &= ~ILLF_NOARP;
		}
		/*
		 * Set the ILLF_ROUTER flag according to the global
		 * IPv6 forwarding policy.
		 */
		if (ipst->ips_ipv6_forwarding != 0)
			ill->ill_flags |= ILLF_ROUTER;
	} else if (ill->ill_flags & ILLF_IPV4) {
		ill->ill_isv6 = B_FALSE;
		ill_set_inputfn(ill);
		ill->ill_reachable_retrans_time = ARP_RETRANS_TIMER;
		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6lcl_addr);
		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6subnet);
		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6net_mask);
		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6brd_addr);
		IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6pp_dst_addr);
		/*
		 * Set the ILLF_ROUTER flag according to the global
		 * IPv4 forwarding policy.
		 */
		if (ipst->ips_ip_forwarding != 0)
			ill->ill_flags |= ILLF_ROUTER;
	}

	ASSERT(ill->ill_phyint != NULL);

	/*
	 * The ipIfStatsIfindex and ipv6IfIcmpIfIndex assignments will
	 * be completed in ill_glist_insert -> ill_phyint_reinit
	 */
	if (!ill_allocate_mibs(ill))
		return (ENOMEM);

	/*
	 * Pick a default sap until we get the DL_INFO_ACK back from
	 * the driver.
	 */
	ill->ill_sap = (ill->ill_isv6) ? ill->ill_media->ip_m_ipv6sap :
	    ill->ill_media->ip_m_ipv4sap;

	ill->ill_ifname_pending = 1;
	ill->ill_ifname_pending_err = 0;

	/*
	 * When the first ipif comes up in ipif_up_done(), multicast groups
	 * that were joined while this ill was not bound to the DLPI link need
	 * to be recovered by ill_recover_multicast().
	 */
	ill->ill_need_recover_multicast = 1;

	ill_refhold(ill);
	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
	/*
	 * NOTE(review): only strictly positive returns are treated as
	 * failure here — presumably ill_glist_insert returns 0 or a
	 * positive errno; confirm.
	 */
	if ((error = ill_glist_insert(ill, interf_name,
	    (ill->ill_flags & ILLF_IPV6) == ILLF_IPV6)) > 0) {
		ill->ill_ppa = UINT_MAX;
		ill->ill_name[0] = '\0';
		/*
		 * undo null termination done above.
		 */
		ppa_ptr[0] = old_char;
		rw_exit(&ipst->ips_ill_g_lock);
		ill_refrele(ill);
		return (error);
	}

	ASSERT(ill->ill_name_length <= LIFNAMSIZ);

	/*
	 * When we return the buffer pointed to by interf_name should contain
	 * the same name as in ill_name.
	 * If a ppa was choosen by the system (ppa passed in was UINT_MAX)
	 * the buffer pointed to by new_ppa_ptr would not contain the right ppa
	 * so copy full name and update the ppa ptr.
	 * When ppa passed in != UINT_MAX all values are correct just undo
	 * null termination, this saves a bcopy.
	 */
	if (*new_ppa_ptr == UINT_MAX) {
		bcopy(ill->ill_name, interf_name, ill->ill_name_length);
		*new_ppa_ptr = ill->ill_ppa;
	} else {
		/*
		 * undo null termination done above.
		 */
		ppa_ptr[0] = old_char;
	}

	/* Let SCTP know about this ILL */
	sctp_update_ill(ill, SCTP_ILL_INSERT);

	/*
	 * ill_glist_insert has made the ill visible globally, and
	 * ill_phyint_reinit could have changed the ipsq. At this point,
	 * we need to hold the ips_ill_g_lock across the call to enter the
	 * ipsq to enforce atomicity and prevent reordering. In the event
	 * the ipsq has changed, and if the new ipsq is currently busy,
	 * we need to make sure that this half-completed ioctl is ahead of
	 * any subsequent ioctl. We achieve this by not dropping the
	 * ips_ill_g_lock which prevents any ill lookup itself thereby
	 * ensuring that new ioctls can't start.
	 */
	ipsq = ipsq_try_enter_internal(ill, q, mp, ip_reprocess_ioctl, NEW_OP,
	    B_TRUE);

	rw_exit(&ipst->ips_ill_g_lock);
	ill_refrele(ill);
	if (ipsq == NULL)
		return (EINPROGRESS);

	/*
	 * If ill_phyint_reinit() changed our ipsq, then start on the new ipsq.
	 */
	if (ipsq->ipsq_xop->ipx_current_ipif == NULL)
		ipsq_current_start(ipsq, ipif, SIOCSLIFNAME);
	else
		ASSERT(ipsq->ipsq_xop->ipx_current_ipif == ipif);

	error = ipif_set_values_tail(ill, ipif, mp, q);
	ipsq_exit(ipsq);
	if (error != 0 && error != EINPROGRESS) {
		/*
		 * restore previous values
		 * NOTE(review): this unconditionally resets to IPv4 input
		 * handling regardless of the original family; confirm that
		 * is the intended rollback.
		 */
		ill->ill_isv6 = B_FALSE;
		ill_set_inputfn(ill);
	}
	return (error);
}
17200 17195
17201 17196 void
17202 17197 ipif_init(ip_stack_t *ipst)
17203 17198 {
17204 17199 int i;
17205 17200
17206 17201 for (i = 0; i < MAX_G_HEADS; i++) {
17207 17202 ipst->ips_ill_g_heads[i].ill_g_list_head =
17208 17203 (ill_if_t *)&ipst->ips_ill_g_heads[i];
17209 17204 ipst->ips_ill_g_heads[i].ill_g_list_tail =
17210 17205 (ill_if_t *)&ipst->ips_ill_g_heads[i];
17211 17206 }
17212 17207
17213 17208 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_index,
17214 17209 ill_phyint_compare_index,
17215 17210 sizeof (phyint_t),
17216 17211 offsetof(struct phyint, phyint_avl_by_index));
17217 17212 avl_create(&ipst->ips_phyint_g_list->phyint_list_avl_by_name,
17218 17213 ill_phyint_compare_name,
17219 17214 sizeof (phyint_t),
17220 17215 offsetof(struct phyint, phyint_avl_by_name));
17221 17216 }
17222 17217
/*
 * Save enough information so that we can recreate the IRE if
 * the interface goes down and then up.
 *
 * This is best-effort: if the mblk allocation fails the IRE is simply
 * not saved.
 */
void
ill_save_ire(ill_t *ill, ire_t *ire)
{
	mblk_t *save_mp;

	save_mp = allocb(sizeof (ifrt_t), BPRI_MED);
	if (save_mp != NULL) {
		ifrt_t *ifrt;

		save_mp->b_wptr += sizeof (ifrt_t);
		ifrt = (ifrt_t *)save_mp->b_rptr;
		bzero(ifrt, sizeof (ifrt_t));
		ifrt->ifrt_type = ire->ire_type;
		if (ire->ire_ipversion == IPV4_VERSION) {
			ASSERT(!ill->ill_isv6);
			ifrt->ifrt_addr = ire->ire_addr;
			ifrt->ifrt_gateway_addr = ire->ire_gateway_addr;
			ifrt->ifrt_setsrc_addr = ire->ire_setsrc_addr;
			ifrt->ifrt_mask = ire->ire_mask;
		} else {
			ASSERT(ill->ill_isv6);
			ifrt->ifrt_v6addr = ire->ire_addr_v6;
			/* ire_gateway_addr_v6 can change due to RTM_CHANGE */
			mutex_enter(&ire->ire_lock);
			ifrt->ifrt_v6gateway_addr = ire->ire_gateway_addr_v6;
			mutex_exit(&ire->ire_lock);
			ifrt->ifrt_v6setsrc_addr = ire->ire_setsrc_addr_v6;
			ifrt->ifrt_v6mask = ire->ire_mask_v6;
		}
		ifrt->ifrt_flags = ire->ire_flags;
		ifrt->ifrt_zoneid = ire->ire_zoneid;
		/* Prepend to the ill's saved-IRE list under its lock. */
		mutex_enter(&ill->ill_saved_ire_lock);
		save_mp->b_cont = ill->ill_saved_ire_mp;
		ill->ill_saved_ire_mp = save_mp;
		ill->ill_saved_ire_cnt++;
		mutex_exit(&ill->ill_saved_ire_lock);
	}
}
17265 17260
/*
 * Remove one entry from ill_saved_ire_mp.
 *
 * Walks the saved-IRE mblk chain looking for the entry matching ire
 * and unlinks/frees it; a no-op if no entry matches.
 */
void
ill_remove_saved_ire(ill_t *ill, ire_t *ire)
{
	mblk_t **mpp;
	mblk_t *mp;
	ifrt_t *ifrt;

	/* Remove from ill_saved_ire_mp list if it is there */
	mutex_enter(&ill->ill_saved_ire_lock);
	for (mpp = &ill->ill_saved_ire_mp; *mpp != NULL;
	    mpp = &(*mpp)->b_cont) {
		in6_addr_t gw_addr_v6;

		/*
		 * On a given ill, the tuple of address, gateway, mask,
		 * ire_type, and zoneid is unique for each saved IRE.
		 */
		mp = *mpp;
		ifrt = (ifrt_t *)mp->b_rptr;
		/* ire_gateway_addr_v6 can change - need lock */
		mutex_enter(&ire->ire_lock);
		gw_addr_v6 = ire->ire_gateway_addr_v6;
		mutex_exit(&ire->ire_lock);

		if (ifrt->ifrt_zoneid != ire->ire_zoneid ||
		    ifrt->ifrt_type != ire->ire_type)
			continue;

		/* Address/gateway/mask comparison per address family. */
		if (ill->ill_isv6 ?
		    (IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6addr,
		    &ire->ire_addr_v6) &&
		    IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6gateway_addr,
		    &gw_addr_v6) &&
		    IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6mask,
		    &ire->ire_mask_v6)) :
		    (ifrt->ifrt_addr == ire->ire_addr &&
		    ifrt->ifrt_gateway_addr == ire->ire_gateway_addr &&
		    ifrt->ifrt_mask == ire->ire_mask)) {
			/* Unlink the matching mblk and free it. */
			*mpp = mp->b_cont;
			ill->ill_saved_ire_cnt--;
			freeb(mp);
			break;
		}
	}
	mutex_exit(&ill->ill_saved_ire_lock);
}
17315 17310
/*
 * IP multirouting broadcast routes handling
 * Append CGTP broadcast IREs to regular ones created
 * at ifconfig time.
 * The usage is a route add <cgtp_bc> <nic_bc> -multirt i.e., both
 * the destination and the gateway are broadcast addresses.
 * The caller has verified that the destination is an IRE_BROADCAST and that
 * RTF_MULTIRT was set. Here if the gateway is a broadcast address, then
 * we create a MULTIRT IRE_BROADCAST.
 * Note that the IRE_HOST created by ire_rt_add doesn't get found by anything
 * since the IRE_BROADCAST takes precedence; ire_add_v4 does head insertion.
 */
static void
ip_cgtp_bcast_add(ire_t *ire, ip_stack_t *ipst)
{
	ire_t *ire_prim;

	ASSERT(ire != NULL);

	/* Look up the broadcast IRE for the gateway address. */
	ire_prim = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0,
	    IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst,
	    NULL);
	if (ire_prim != NULL) {
		/*
		 * We are in the special case of broadcasts for
		 * CGTP. We add an IRE_BROADCAST that holds
		 * the RTF_MULTIRT flag, the destination
		 * address and the low level
		 * info of ire_prim. In other words, CGTP
		 * broadcast is added to the redundant ipif.
		 */
		ill_t *ill_prim;
		ire_t *bcast_ire;

		ill_prim = ire_prim->ire_ill;

		ip2dbg(("ip_cgtp_filter_bcast_add: ire_prim %p, ill_prim %p\n",
		    (void *)ire_prim, (void *)ill_prim));

		/* ire_create may return NULL; handled below. */
		bcast_ire = ire_create(
		    (uchar_t *)&ire->ire_addr,
		    (uchar_t *)&ip_g_all_ones,
		    (uchar_t *)&ire->ire_gateway_addr,
		    IRE_BROADCAST,
		    ill_prim,
		    GLOBAL_ZONEID,	/* CGTP is only for the global zone */
		    ire->ire_flags | RTF_KERNEL,
		    NULL,
		    ipst);

		/*
		 * Here we assume that ire_add does head insertion so that
		 * the added IRE_BROADCAST comes before the existing IRE_HOST.
		 */
		if (bcast_ire != NULL) {
			/* Propagate any RTF_SETSRC source address. */
			if (ire->ire_flags & RTF_SETSRC) {
				bcast_ire->ire_setsrc_addr =
				    ire->ire_setsrc_addr;
			}
			bcast_ire = ire_add(bcast_ire);
			if (bcast_ire != NULL) {
				ip2dbg(("ip_cgtp_filter_bcast_add: "
				    "added bcast_ire %p\n",
				    (void *)bcast_ire));

				/* Save so it can be recreated on if up/down */
				ill_save_ire(ill_prim, bcast_ire);
				ire_refrele(bcast_ire);
			}
		}
		ire_refrele(ire_prim);
	}
}
17388 17383
/*
 * IP multirouting broadcast routes handling
 * Remove the broadcast ire.
 * The usage is a route delete <cgtp_bc> <nic_bc> -multirt i.e., both
 * the destination and the gateway are broadcast addresses.
 * The caller has only verified that RTF_MULTIRT was set. We check
 * that the destination is broadcast and that the gateway is a broadcast
 * address, and if so delete the IRE added by ip_cgtp_bcast_add().
 */
static void
ip_cgtp_bcast_delete(ire_t *ire, ip_stack_t *ipst)
{
	ASSERT(ire != NULL);

	if (ip_type_v4(ire->ire_addr, ipst) == IRE_BROADCAST) {
		ire_t *ire_prim;

		/* The gateway must itself resolve to a broadcast IRE. */
		ire_prim = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0,
		    IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0,
		    ipst, NULL);
		if (ire_prim != NULL) {
			ill_t *ill_prim;
			ire_t *bcast_ire;

			ill_prim = ire_prim->ire_ill;

			ip2dbg(("ip_cgtp_filter_bcast_delete: "
			    "ire_prim %p, ill_prim %p\n",
			    (void *)ire_prim, (void *)ill_prim));

			/*
			 * Find the CGTP IRE_BROADCAST that
			 * ip_cgtp_bcast_add() created: same destination and
			 * gateway, bound to the primary ill.
			 */
			bcast_ire = ire_ftable_lookup_v4(ire->ire_addr, 0,
			    ire->ire_gateway_addr, IRE_BROADCAST,
			    ill_prim, ALL_ZONES, NULL,
			    MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_ILL |
			    MATCH_IRE_MASK, 0, ipst, NULL);

			if (bcast_ire != NULL) {
				ip2dbg(("ip_cgtp_filter_bcast_delete: "
				    "looked up bcast_ire %p\n",
				    (void *)bcast_ire));
				/* Drop the saved reference, then the IRE. */
				ill_remove_saved_ire(bcast_ire->ire_ill,
				    bcast_ire);
				ire_delete(bcast_ire);
				ire_refrele(bcast_ire);
			}
			ire_refrele(ire_prim);
		}
	}
}
17438 17433
17439 17434 /*
17440 17435 * Derive an interface id from the link layer address.
17441 17436 * Knows about IEEE 802 and IEEE EUI-64 mappings.
17442 17437 */
17443 17438 static void
17444 17439 ip_ether_v6intfid(ill_t *ill, in6_addr_t *v6addr)
17445 17440 {
17446 17441 char *addr;
17447 17442
17448 17443 /*
17449 17444 * Note that some IPv6 interfaces get plumbed over links that claim to
17450 17445 * be DL_ETHER, but don't actually have Ethernet MAC addresses (e.g.
17451 17446 * PPP links). The ETHERADDRL check here ensures that we only set the
17452 17447 * interface ID on IPv6 interfaces above links that actually have real
17453 17448 * Ethernet addresses.
17454 17449 */
17455 17450 if (ill->ill_phys_addr_length == ETHERADDRL) {
17456 17451 /* Form EUI-64 like address */
17457 17452 addr = (char *)&v6addr->s6_addr32[2];
17458 17453 bcopy(ill->ill_phys_addr, addr, 3);
17459 17454 addr[0] ^= 0x2; /* Toggle Universal/Local bit */
17460 17455 addr[3] = (char)0xff;
17461 17456 addr[4] = (char)0xfe;
17462 17457 bcopy(ill->ill_phys_addr + 3, addr + 5, 3);
17463 17458 }
17464 17459 }
17465 17460
/*
 * Placeholder interface-id routine for links with no default derivation;
 * intentionally leaves `v6addr' unchanged.
 */
/* ARGSUSED */
static void
ip_nodef_v6intfid(ill_t *ill, in6_addr_t *v6addr)
{
}
17471 17466
/*
 * Cookie hashed by ip_ipmp_v6intfid() to build the IPMP interface id;
 * combines values that are stable across boots yet distinguish interfaces.
 */
typedef struct ipmp_ifcookie {
	uint32_t	ic_hostid;	/* host id, in network byte order */
	char		ic_ifname[LIFNAMSIZ];	/* IPMP interface name */
	char		ic_zonename[ZONENAME_MAX];	/* owning zone name */
} ipmp_ifcookie_t;
17477 17472
/*
 * Construct a pseudo-random interface ID for the IPMP interface that's both
 * predictable and (almost) guaranteed to be unique.
 */
static void
ip_ipmp_v6intfid(ill_t *ill, in6_addr_t *v6addr)
{
	zone_t *zp;
	uint8_t *addr;
	uchar_t hash[16];
	ulong_t hostid;
	MD5_CTX ctx;
	ipmp_ifcookie_t ic = { 0 };

	ASSERT(IS_IPMP(ill));

	/* Seed the cookie with the host id ... */
	(void) ddi_strtoul(hw_serial, NULL, 10, &hostid);
	ic.ic_hostid = htonl((uint32_t)hostid);

	/* ... the IPMP interface name ... */
	(void) strlcpy(ic.ic_ifname, ill->ill_name, LIFNAMSIZ);

	/* ... and the owning zone's name, when it can be looked up. */
	if ((zp = zone_find_by_id(ill->ill_zoneid)) != NULL) {
		(void) strlcpy(ic.ic_zonename, zp->zone_name, ZONENAME_MAX);
		zone_rele(zp);
	}

	MD5Init(&ctx);
	MD5Update(&ctx, &ic, sizeof (ic));
	MD5Final(hash, &ctx);

	/*
	 * Map the hash to an interface ID per the basic approach in RFC3041.
	 */
	addr = &v6addr->s6_addr8[8];
	bcopy(hash + 8, addr, sizeof (uint64_t));
	addr[0] &= ~0x2;	/* clear u/l bit: locally administered id */
}
17515 17510
17516 17511 /*
17517 17512 * Map the multicast in6_addr_t in m_ip6addr to the physaddr for ethernet.
17518 17513 */
17519 17514 static void
17520 17515 ip_ether_v6_mapping(ill_t *ill, uchar_t *m_ip6addr, uchar_t *m_physaddr)
17521 17516 {
17522 17517 phyint_t *phyi = ill->ill_phyint;
17523 17518
17524 17519 /*
17525 17520 * Check PHYI_MULTI_BCAST and length of physical
17526 17521 * address to determine if we use the mapping or the
17527 17522 * broadcast address.
17528 17523 */
17529 17524 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 ||
17530 17525 ill->ill_phys_addr_length != ETHERADDRL) {
17531 17526 ip_mbcast_mapping(ill, m_ip6addr, m_physaddr);
17532 17527 return;
17533 17528 }
17534 17529 m_physaddr[0] = 0x33;
17535 17530 m_physaddr[1] = 0x33;
17536 17531 m_physaddr[2] = m_ip6addr[12];
17537 17532 m_physaddr[3] = m_ip6addr[13];
17538 17533 m_physaddr[4] = m_ip6addr[14];
17539 17534 m_physaddr[5] = m_ip6addr[15];
17540 17535 }
17541 17536
17542 17537 /*
17543 17538 * Map the multicast ipaddr_t in m_ipaddr to the physaddr for ethernet.
17544 17539 */
17545 17540 static void
17546 17541 ip_ether_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr)
17547 17542 {
17548 17543 phyint_t *phyi = ill->ill_phyint;
17549 17544
17550 17545 /*
17551 17546 * Check PHYI_MULTI_BCAST and length of physical
17552 17547 * address to determine if we use the mapping or the
17553 17548 * broadcast address.
17554 17549 */
17555 17550 if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 ||
17556 17551 ill->ill_phys_addr_length != ETHERADDRL) {
17557 17552 ip_mbcast_mapping(ill, m_ipaddr, m_physaddr);
17558 17553 return;
17559 17554 }
17560 17555 m_physaddr[0] = 0x01;
17561 17556 m_physaddr[1] = 0x00;
17562 17557 m_physaddr[2] = 0x5e;
17563 17558 m_physaddr[3] = m_ipaddr[1] & 0x7f;
17564 17559 m_physaddr[4] = m_ipaddr[2];
17565 17560 m_physaddr[5] = m_ipaddr[3];
17566 17561 }
17567 17562
17568 17563 /* ARGSUSED */
17569 17564 static void
17570 17565 ip_mbcast_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr)
17571 17566 {
17572 17567 /*
17573 17568 * for the MULTI_BCAST case and other cases when we want to
17574 17569 * use the link-layer broadcast address for multicast.
17575 17570 */
17576 17571 uint8_t *bphys_addr;
17577 17572 dl_unitdata_req_t *dlur;
17578 17573
17579 17574 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
17580 17575 if (ill->ill_sap_length < 0) {
17581 17576 bphys_addr = (uchar_t *)dlur +
17582 17577 dlur->dl_dest_addr_offset;
17583 17578 } else {
17584 17579 bphys_addr = (uchar_t *)dlur +
17585 17580 dlur->dl_dest_addr_offset + ill->ill_sap_length;
17586 17581 }
17587 17582
17588 17583 bcopy(bphys_addr, m_physaddr, ill->ill_phys_addr_length);
17589 17584 }
17590 17585
/*
 * Derive IPoIB interface id from the link layer address.
 */
static void
ip_ib_v6intfid(ill_t *ill, in6_addr_t *v6addr)
{
	char *addr;

	ASSERT(ill->ill_phys_addr_length == 20);
	/* The GUID occupies the final 8 bytes of the 20-byte address. */
	addr = (char *)&v6addr->s6_addr32[2];
	bcopy(ill->ill_phys_addr + 12, addr, 8);
	/*
	 * In IBA 1.1 timeframe, some vendors erroneously set the u/l bit
	 * in the globally assigned EUI-64 GUID to 1, in violation of IEEE
	 * rules. In these cases, the IBA considers these GUIDs to be in
	 * "Modified EUI-64" format, and thus toggling the u/l bit is not
	 * required; vendors are required not to assign global EUI-64's
	 * that differ only in u/l bit values, thus guaranteeing uniqueness
	 * of the interface identifier. Whether the GUID is in modified
	 * or proper EUI-64 format, the ipv6 identifier must have the u/l
	 * bit set to 1.
	 */
	addr[0] |= 2; /* Set Universal/Local bit to 1 */
}
17615 17610
/*
 * Map the multicast ipaddr_t in m_ipaddr to the physaddr for InfiniBand.
 * Note on mapping from multicast IP addresses to IPoIB multicast link
 * addresses. IPoIB multicast link addresses are based on IBA link addresses.
 * The format of an IPoIB multicast address is:
 *
 * 4 byte QPN Scope Sign. Pkey
 * +--------------------------------------------+
 * | 00FFFFFF | FF | 1X | X01B | Pkey | GroupID |
 * +--------------------------------------------+
 *
 * The Scope and Pkey components are properties of the IBA port and
 * network interface. They can be ascertained from the broadcast address.
 * The Sign. part is the signature, and is 401B for IPv4 and 601B for IPv6.
 */
static void
ip_ib_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr)
{
	/* Template carrying the IPv4 signature (0x401b); rest zeroed. */
	static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
	    0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
	uint8_t *bphys_addr;
	dl_unitdata_req_t *dlur;

	bcopy(ipv4_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length);

	/*
	 * RFC 4391: IPv4 MGID is 28-bit long.
	 */
	m_physaddr[16] = m_ipaddr[0] & 0x0f;	/* low nibble only */
	m_physaddr[17] = m_ipaddr[1];
	m_physaddr[18] = m_ipaddr[2];
	m_physaddr[19] = m_ipaddr[3];


	/* Locate the broadcast address saved in ill_bcast_mp. */
	dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
	if (ill->ill_sap_length < 0) {
		bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset;
	} else {
		bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset +
		    ill->ill_sap_length;
	}
	/*
	 * Now fill in the IBA scope/Pkey values from the broadcast address.
	 */
	m_physaddr[5] = bphys_addr[5];
	m_physaddr[8] = bphys_addr[8];
	m_physaddr[9] = bphys_addr[9];
}
17665 17660
17666 17661 static void
17667 17662 ip_ib_v6_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr)
17668 17663 {
17669 17664 static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff,
17670 17665 0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00,
17671 17666 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
17672 17667 uint8_t *bphys_addr;
17673 17668 dl_unitdata_req_t *dlur;
17674 17669
17675 17670 bcopy(ipv4_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length);
17676 17671
17677 17672 /*
17678 17673 * RFC 4391: IPv4 MGID is 80-bit long.
17679 17674 */
17680 17675 bcopy(&m_ipaddr[6], &m_physaddr[10], 10);
17681 17676
17682 17677 dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr;
17683 17678 if (ill->ill_sap_length < 0) {
17684 17679 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset;
17685 17680 } else {
17686 17681 bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset +
17687 17682 ill->ill_sap_length;
17688 17683 }
17689 17684 /*
17690 17685 * Now fill in the IBA scope/Pkey values from the broadcast address.
17691 17686 */
17692 17687 m_physaddr[5] = bphys_addr[5];
17693 17688 m_physaddr[8] = bphys_addr[8];
17694 17689 m_physaddr[9] = bphys_addr[9];
17695 17690 }
17696 17691
/*
 * Derive IPv6 interface id from an IPv4 link-layer address (e.g. from an IPv4
 * tunnel). The IPv4 address simply get placed in the lower 4 bytes of the
 * IPv6 interface id. This is a suggested mechanism described in section 3.7
 * of RFC4213.
 */
static void
ip_ipv4_genv6intfid(ill_t *ill, uint8_t *physaddr, in6_addr_t *v6addr)
{
	ASSERT(ill->ill_phys_addr_length == sizeof (ipaddr_t));
	/* Interface id is 0:0 followed by the IPv4 address. */
	v6addr->s6_addr32[2] = 0;
	bcopy(physaddr, &v6addr->s6_addr32[3], sizeof (ipaddr_t));
}
17710 17705
/*
 * Derive IPv6 interface id from an IPv6 link-layer address (e.g. from an IPv6
 * tunnel). The lower 8 bytes of the IPv6 address simply become the interface
 * id.
 */
static void
ip_ipv6_genv6intfid(ill_t *ill, uint8_t *physaddr, in6_addr_t *v6addr)
{
	in6_addr_t *v6lladdr = (in6_addr_t *)physaddr;

	ASSERT(ill->ill_phys_addr_length == sizeof (in6_addr_t));
	bcopy(&v6lladdr->s6_addr32[2], &v6addr->s6_addr32[2], 8);
}
17724 17719
/* Derive the interface id from the ill's own IPv6 link-layer address. */
static void
ip_ipv6_v6intfid(ill_t *ill, in6_addr_t *v6addr)
{
	ip_ipv6_genv6intfid(ill, ill->ill_phys_addr, v6addr);
}
17730 17725
/* Derive the interface id from the ill's IPv6 destination (tunnel) address. */
static void
ip_ipv6_v6destintfid(ill_t *ill, in6_addr_t *v6addr)
{
	ip_ipv6_genv6intfid(ill, ill->ill_dest_addr, v6addr);
}
17736 17731
/* Derive the interface id from the ill's own IPv4 link-layer address. */
static void
ip_ipv4_v6intfid(ill_t *ill, in6_addr_t *v6addr)
{
	ip_ipv4_genv6intfid(ill, ill->ill_phys_addr, v6addr);
}
17742 17737
/* Derive the interface id from the ill's IPv4 destination (tunnel) address. */
static void
ip_ipv4_v6destintfid(ill_t *ill, in6_addr_t *v6addr)
{
	ip_ipv4_genv6intfid(ill, ill->ill_dest_addr, v6addr);
}
17748 17743
/*
 * Lookup an ill and verify that the zoneid has an ipif on that ill.
 * Returns an held ill, or NULL.
 */
ill_t *
ill_lookup_on_ifindex_zoneid(uint_t index, zoneid_t zoneid, boolean_t isv6,
    ip_stack_t *ipst)
{
	ill_t *ill;
	ipif_t *ipif;

	ill = ill_lookup_on_ifindex(index, isv6, ipst);
	if (ill == NULL)
		return (NULL);

	/*
	 * Scan the ill's ipifs, under ill_lock, for one visible to `zoneid'
	 * (or to all zones); condemned ipifs are being torn down and are
	 * skipped.
	 */
	mutex_enter(&ill->ill_lock);
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if (IPIF_IS_CONDEMNED(ipif))
			continue;
		if (zoneid != ALL_ZONES && ipif->ipif_zoneid != zoneid &&
		    ipif->ipif_zoneid != ALL_ZONES)
			continue;

		/* Matched: return with the lookup's hold still in place. */
		mutex_exit(&ill->ill_lock);
		return (ill);
	}
	mutex_exit(&ill->ill_lock);
	ill_refrele(ill);
	return (NULL);
}
17779 17774
/*
 * Return a pointer to an ipif_t given a combination of (ill_idx,ipif_id)
 * If a pointer to an ipif_t is returned then the caller will need to do
 * an ill_refrele().
 */
ipif_t *
ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6,
    ip_stack_t *ipst)
{
	ipif_t *ipif;
	ill_t *ill;

	ill = ill_lookup_on_ifindex(ifindex, isv6, ipst);
	if (ill == NULL)
		return (NULL);

	/* An ill that is being torn down yields no ipifs. */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_state_flags & ILL_CONDEMNED) {
		mutex_exit(&ill->ill_lock);
		ill_refrele(ill);
		return (NULL);
	}

	/*
	 * Find the ipif with the requested logical-interface id and take
	 * a hold on it before dropping ill_lock.
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) {
		if (!IPIF_CAN_LOOKUP(ipif))
			continue;
		if (lifidx == ipif->ipif_id) {
			ipif_refhold_locked(ipif);
			break;
		}
	}

	mutex_exit(&ill->ill_lock);
	ill_refrele(ill);
	return (ipif);
}
17816 17811
/*
 * Set ill_inputfn based on the current know state.
 * This needs to be called when any of the factors taken into
 * account changes.
 */
void
ill_set_inputfn(ill_t *ill)
{
	ip_stack_t *ipst = ill->ill_ipst;

	if (ill->ill_isv6) {
		/* Labeled (Trusted Extensions) systems use the full path. */
		if (is_system_labeled())
			ill->ill_inputfn = ill_input_full_v6;
		else
			ill->ill_inputfn = ill_input_short_v6;
	} else {
		/*
		 * The "full" v4 input path is needed whenever a feature
		 * that must see every inbound packet is active: labels, a
		 * pending DHCP init, RSVP listeners, or CGTP filtering.
		 */
		if (is_system_labeled())
			ill->ill_inputfn = ill_input_full_v4;
		else if (ill->ill_dhcpinit != 0)
			ill->ill_inputfn = ill_input_full_v4;
		else if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head
		    != NULL)
			ill->ill_inputfn = ill_input_full_v4;
		else if (ipst->ips_ip_cgtp_filter &&
		    ipst->ips_ip_cgtp_filter_ops != NULL)
			ill->ill_inputfn = ill_input_full_v4;
		else
			ill->ill_inputfn = ill_input_short_v4;
	}
}
17847 17842
17848 17843 /*
17849 17844 * Re-evaluate ill_inputfn for all the IPv4 ills.
17850 17845 * Used when RSVP and CGTP comes and goes.
17851 17846 */
17852 17847 void
17853 17848 ill_set_inputfn_all(ip_stack_t *ipst)
17854 17849 {
17855 17850 ill_walk_context_t ctx;
17856 17851 ill_t *ill;
17857 17852
17858 17853 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
17859 17854 ill = ILL_START_WALK_V4(&ctx, ipst);
17860 17855 for (; ill != NULL; ill = ill_next(&ctx, ill))
17861 17856 ill_set_inputfn(ill);
17862 17857
17863 17858 rw_exit(&ipst->ips_ill_g_lock);
17864 17859 }
17865 17860
/*
 * Set the physical address information for `ill' to the contents of the
 * dl_notify_ind_t pointed to by `mp'. Must be called as writer, and will be
 * asynchronous if `ill' cannot immediately be quiesced -- in which case
 * EINPROGRESS will be returned.
 *
 * Returns 0 when the address was set (or the notification is unsupported
 * and ignored), ENOMEM if the message copies cannot be allocated, and
 * EINPROGRESS as described above.
 */
int
ill_set_phys_addr(ill_t *ill, mblk_t *mp)
{
	ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
	dl_notify_ind_t *dlindp = (dl_notify_ind_t *)mp->b_rptr;

	ASSERT(IAM_WRITER_IPSQ(ipsq));

	if (dlindp->dl_data != DL_IPV6_LINK_LAYER_ADDR &&
	    dlindp->dl_data != DL_CURR_DEST_ADDR &&
	    dlindp->dl_data != DL_CURR_PHYS_ADDR) {
		/* Changing DL_IPV6_TOKEN is not yet supported */
		return (0);
	}

	/*
	 * We need to store up to two copies of `mp' in `ill'. Due to the
	 * design of ipsq_pending_mp_add(), we can't pass them as separate
	 * arguments to ill_set_phys_addr_tail(). Instead, chain them
	 * together here, then pull 'em apart in ill_set_phys_addr_tail().
	 */
	if ((mp = copyb(mp)) == NULL || (mp->b_cont = copyb(mp)) == NULL) {
		freemsg(mp);	/* freemsg(NULL) is a no-op */
		return (ENOMEM);
	}

	ipsq_current_start(ipsq, ill->ill_ipif, 0);

	/*
	 * Since we'll only do a logical down, we can't rely on ipif_down
	 * to turn on ILL_DOWN_IN_PROGRESS, or for the DL_BIND_ACK to reset
	 * ILL_DOWN_IN_PROGRESS. We instead manage this separately for this
	 * case, to quiesce ire's and nce's for ill_is_quiescent.
	 */
	mutex_enter(&ill->ill_lock);
	ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS;
	/* no more ire/nce addition allowed */
	mutex_exit(&ill->ill_lock);

	/*
	 * If we can quiesce the ill, then set the address. If not, then
	 * ill_set_phys_addr_tail() will be called from ipif_ill_refrele_tail().
	 */
	ill_down_ipifs(ill, B_TRUE);
	mutex_enter(&ill->ill_lock);
	if (!ill_is_quiescent(ill)) {
		/* call cannot fail since `conn_t *' argument is NULL */
		(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
		    mp, ILL_DOWN);
		mutex_exit(&ill->ill_lock);
		return (EINPROGRESS);
	}
	mutex_exit(&ill->ill_lock);

	ill_set_phys_addr_tail(ipsq, ill->ill_rq, mp, NULL);
	return (0);
}
17929 17924
/*
 * When the allowed-ips link property is set on the datalink, IP receives a
 * DL_NOTE_ALLOWED_IPS notification that is processed in ill_set_allowed_ips()
 * to initialize the ill_allowed_ips[] array in the ill_t. This array is then
 * used to vet addresses passed to ip_sioctl_addr() and to ensure that the
 * only IP addresses configured on the ill_t are those in the ill_allowed_ips[]
 * array.
 */
void
ill_set_allowed_ips(ill_t *ill, mblk_t *mp)
{
	ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;
	dl_notify_ind_t *dlip = (dl_notify_ind_t *)mp->b_rptr;
	mac_protect_t *mrp;
	int i;

	ASSERT(IAM_WRITER_IPSQ(ipsq));
	/* The mac_protect_t payload immediately follows the notification. */
	mrp = (mac_protect_t *)&dlip[1];

	if (mrp->mp_ipaddrcnt == 0) { /* reset allowed-ips */
		/*
		 * NOTE(review): if no list was previously configured this
		 * is kmem_free(NULL, 0) -- assumed legal here; confirm.
		 */
		kmem_free(ill->ill_allowed_ips,
		    ill->ill_allowed_ips_cnt * sizeof (in6_addr_t));
		ill->ill_allowed_ips_cnt = 0;
		ill->ill_allowed_ips = NULL;
		mutex_enter(&ill->ill_phyint->phyint_lock);
		ill->ill_phyint->phyint_flags &= ~PHYI_L3PROTECT;
		mutex_exit(&ill->ill_phyint->phyint_lock);
		return;
	}

	/* Replace any existing list with the newly supplied addresses. */
	if (ill->ill_allowed_ips != NULL) {
		kmem_free(ill->ill_allowed_ips,
		    ill->ill_allowed_ips_cnt * sizeof (in6_addr_t));
	}
	ill->ill_allowed_ips_cnt = mrp->mp_ipaddrcnt;
	ill->ill_allowed_ips = kmem_alloc(
	    ill->ill_allowed_ips_cnt * sizeof (in6_addr_t), KM_SLEEP);
	for (i = 0; i < mrp->mp_ipaddrcnt; i++)
		ill->ill_allowed_ips[i] = mrp->mp_ipaddrs[i].ip_addr;

	mutex_enter(&ill->ill_phyint->phyint_lock);
	ill->ill_phyint->phyint_flags |= PHYI_L3PROTECT;
	mutex_exit(&ill->ill_phyint->phyint_lock);
}
17974 17969
/*
 * Once the ill associated with `q' has quiesced, set its physical address
 * information to the values in `addrmp'. Note that two copies of `addrmp'
 * are passed (linked by b_cont), since we sometimes need to save two distinct
 * copies in the ill_t, and our context doesn't permit sleeping or allocation
 * failure (we'll free the other copy if it's not needed). Since the ill_t
 * is quiesced, we know any stale nce's with the old address information have
 * already been removed, so we don't need to call nce_flush().
 */
/* ARGSUSED */
static void
ill_set_phys_addr_tail(ipsq_t *ipsq, queue_t *q, mblk_t *addrmp, void *dummy)
{
	ill_t *ill = q->q_ptr;
	mblk_t *addrmp2 = unlinkb(addrmp);	/* split the chained copies */
	dl_notify_ind_t *dlindp = (dl_notify_ind_t *)addrmp->b_rptr;
	uint_t addrlen, addroff;
	int status;

	ASSERT(IAM_WRITER_IPSQ(ipsq));

	/* The address excludes the SAP, which may lead or trail it. */
	addroff = dlindp->dl_addr_offset;
	addrlen = dlindp->dl_addr_length - ABS(ill->ill_sap_length);

	switch (dlindp->dl_data) {
	case DL_IPV6_LINK_LAYER_ADDR:
		/* Only the ND link-layer address changes. */
		ill_set_ndmp(ill, addrmp, addroff, addrlen);
		freemsg(addrmp2);
		break;

	case DL_CURR_DEST_ADDR:
		/* Point-to-point destination address changed. */
		freemsg(ill->ill_dest_addr_mp);
		ill->ill_dest_addr = addrmp->b_rptr + addroff;
		ill->ill_dest_addr_mp = addrmp;
		if (ill->ill_isv6) {
			ill_setdesttoken(ill);
			ipif_setdestlinklocal(ill->ill_ipif);
		}
		freemsg(addrmp2);
		break;

	case DL_CURR_PHYS_ADDR:
		/*
		 * Physical address changed; for v6 the second copy becomes
		 * the new ND mblk, otherwise it is unneeded.
		 */
		freemsg(ill->ill_phys_addr_mp);
		ill->ill_phys_addr = addrmp->b_rptr + addroff;
		ill->ill_phys_addr_mp = addrmp;
		ill->ill_phys_addr_length = addrlen;
		if (ill->ill_isv6)
			ill_set_ndmp(ill, addrmp2, addroff, addrlen);
		else
			freemsg(addrmp2);
		if (ill->ill_isv6) {
			ill_setdefaulttoken(ill);
			ipif_setlinklocal(ill->ill_ipif);
		}
		break;
	default:
		/* ill_set_phys_addr() only queues the three cases above. */
		ASSERT(0);
	}

	/*
	 * reset ILL_DOWN_IN_PROGRESS so that we can successfully add ires
	 * as we bring the ipifs up again.
	 */
	mutex_enter(&ill->ill_lock);
	ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS;
	mutex_exit(&ill->ill_lock);
	/*
	 * If there are ipifs to bring up, ill_up_ipifs() will return
	 * EINPROGRESS, and ipsq_current_finish() will be called by
	 * ip_rput_dlpi_writer() or arp_bringup_done() when the last ipif is
	 * brought up.
	 */
	status = ill_up_ipifs(ill, q, addrmp);
	if (status != EINPROGRESS)
		ipsq_current_finish(ipsq);
}
18051 18046
/*
 * Helper routine for setting the ill_nd_lla fields.
 * Frees any previous ND mblk and points ill_nd_lla at the link-layer
 * address stored `addroff' bytes into `ndmp' (which is `addrlen' long);
 * ownership of `ndmp' passes to the ill.
 */
void
ill_set_ndmp(ill_t *ill, mblk_t *ndmp, uint_t addroff, uint_t addrlen)
{
	freemsg(ill->ill_nd_lla_mp);
	ill->ill_nd_lla = ndmp->b_rptr + addroff;
	ill->ill_nd_lla_mp = ndmp;
	ill->ill_nd_lla_len = addrlen;
}
18063 18058
/*
 * Replumb the ill.
 * Must be called as writer; returns EINPROGRESS if the ill cannot be
 * quiesced immediately (ill_replumb_tail() then runs later from
 * ipif_ill_refrele_tail()), or 0 once the replumb has been started.
 */
int
ill_replumb(ill_t *ill, mblk_t *mp)
{
	ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq;

	ASSERT(IAM_WRITER_IPSQ(ipsq));

	ipsq_current_start(ipsq, ill->ill_ipif, 0);

	/*
	 * If we can quiesce the ill, then continue. If not, then
	 * ill_replumb_tail() will be called from ipif_ill_refrele_tail().
	 */
	ill_down_ipifs(ill, B_FALSE);

	mutex_enter(&ill->ill_lock);
	if (!ill_is_quiescent(ill)) {
		/* call cannot fail since `conn_t *' argument is NULL */
		(void) ipsq_pending_mp_add(NULL, ill->ill_ipif, ill->ill_rq,
		    mp, ILL_DOWN);
		mutex_exit(&ill->ill_lock);
		return (EINPROGRESS);
	}
	mutex_exit(&ill->ill_lock);

	ill_replumb_tail(ipsq, ill->ill_rq, mp, NULL);
	return (0);
}
18095 18090
/*
 * Finish the replumb once the ill has quiesced: bring the ipifs fully
 * down, notify the driver with DL_NOTE_REPLUMB_DONE, and (possibly
 * asynchronously) bring the ipifs back up.
 */
/* ARGSUSED */
static void
ill_replumb_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy)
{
	ill_t *ill = q->q_ptr;
	int err;
	conn_t *connp = NULL;

	ASSERT(IAM_WRITER_IPSQ(ipsq));
	/* Keep a private copy of `mp' for the later bringup. */
	freemsg(ill->ill_replumb_mp);
	ill->ill_replumb_mp = copyb(mp);

	if (ill->ill_replumb_mp == NULL) {
		/* out of memory */
		ipsq_current_finish(ipsq);
		return;
	}

	/* Park the copy as the ipsq's pending mp for the bringup phase. */
	mutex_enter(&ill->ill_lock);
	ill->ill_up_ipifs = ipsq_pending_mp_add(NULL, ill->ill_ipif,
	    ill->ill_rq, ill->ill_replumb_mp, 0);
	mutex_exit(&ill->ill_lock);

	if (!ill->ill_up_ipifs) {
		/* already closing */
		ipsq_current_finish(ipsq);
		return;
	}
	ill->ill_replumbing = 1;
	err = ill_down_ipifs_tail(ill);

	/*
	 * Successfully quiesced and brought down the interface, now we send
	 * the DL_NOTE_REPLUMB_DONE message down to the driver. Reuse the
	 * DL_NOTE_REPLUMB message.
	 */
	mp = mexchange(NULL, mp, sizeof (dl_notify_conf_t), M_PROTO,
	    DL_NOTIFY_CONF);
	ASSERT(mp != NULL);
	((dl_notify_conf_t *)mp->b_rptr)->dl_notification =
	    DL_NOTE_REPLUMB_DONE;
	ill_dlpi_send(ill, mp);

	/*
	 * For IPv4, we would usually get EINPROGRESS because the ETHERTYPE_ARP
	 * streams have to be unbound. When all the DLPI exchanges are done,
	 * ipsq_current_finish() will be called by arp_bringup_done(). The
	 * remainder of ipif bringup via ill_up_ipifs() will also be done in
	 * arp_bringup_done().
	 */
	ASSERT(ill->ill_replumb_mp != NULL);
	if (err == EINPROGRESS)
		return;
	else
		ill->ill_replumb_mp = ipsq_pending_mp_get(ipsq, &connp);
	ASSERT(connp == NULL);
	if (err == 0 && ill->ill_replumb_mp != NULL &&
	    ill_up_ipifs(ill, q, ill->ill_replumb_mp) == EINPROGRESS) {
		return;
	}
	ipsq_current_finish(ipsq);
}
18158 18153
18159 18154 /*
18160 18155 * Issue ioctl `cmd' on `lh'; caller provides the initial payload in `buf'
18161 18156 * which is `bufsize' bytes. On success, zero is returned and `buf' updated
18162 18157 * as per the ioctl. On failure, an errno is returned.
18163 18158 */
18164 18159 static int
18165 18160 ip_ioctl(ldi_handle_t lh, int cmd, void *buf, uint_t bufsize, cred_t *cr)
18166 18161 {
18167 18162 int rval;
18168 18163 struct strioctl iocb;
18169 18164
18170 18165 iocb.ic_cmd = cmd;
18171 18166 iocb.ic_timout = 15;
18172 18167 iocb.ic_len = bufsize;
18173 18168 iocb.ic_dp = buf;
18174 18169
18175 18170 return (ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, cr, &rval));
18176 18171 }
18177 18172
/*
 * Issue an SIOCGLIFCONF for address family `af' and store the result into a
 * dynamically-allocated `lifcp' that will be `bufsizep' bytes on success.
 * On success the caller owns lifcp->lifc_buf (of *bufsizep bytes) and must
 * kmem_free() it; on failure an errno is returned and nothing is held.
 */
static int
ip_lifconf_ioctl(ldi_handle_t lh, int af, struct lifconf *lifcp,
    uint_t *bufsizep, cred_t *cr)
{
	int err;
	struct lifnum lifn;

	/* First ask how many interfaces exist for this family. */
	bzero(&lifn, sizeof (lifn));
	lifn.lifn_family = af;
	lifn.lifn_flags = LIFC_UNDER_IPMP;

	if ((err = ip_ioctl(lh, SIOCGLIFNUM, &lifn, sizeof (lifn), cr)) != 0)
		return (err);

	/*
	 * Pad the interface count to account for additional interfaces that
	 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
	 */
	lifn.lifn_count += 4;
	bzero(lifcp, sizeof (*lifcp));
	lifcp->lifc_flags = LIFC_UNDER_IPMP;
	lifcp->lifc_family = af;
	lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
	lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);

	err = ip_ioctl(lh, SIOCGLIFCONF, lifcp, sizeof (*lifcp), cr);
	if (err != 0) {
		kmem_free(lifcp->lifc_buf, *bufsizep);
		return (err);
	}

	return (0);
}
18215 18210
18216 18211 /*
18217 18212 * Helper for ip_interface_cleanup() that removes the loopback interface.
18218 18213 */
18219 18214 static void
18220 18215 ip_loopback_removeif(ldi_handle_t lh, boolean_t isv6, cred_t *cr)
18221 18216 {
18222 18217 int err;
18223 18218 struct lifreq lifr;
18224 18219
18225 18220 bzero(&lifr, sizeof (lifr));
18226 18221 (void) strcpy(lifr.lifr_name, ipif_loopback_name);
18227 18222
18228 18223 /*
18229 18224 * Attempt to remove the interface. It may legitimately not exist
18230 18225 * (e.g. the zone administrator unplumbed it), so ignore ENXIO.
18231 18226 */
18232 18227 err = ip_ioctl(lh, SIOCLIFREMOVEIF, &lifr, sizeof (lifr), cr);
18233 18228 if (err != 0 && err != ENXIO) {
18234 18229 ip0dbg(("ip_loopback_removeif: IP%s SIOCLIFREMOVEIF failed: "
18235 18230 "error %d\n", isv6 ? "v6" : "v4", err));
18236 18231 }
18237 18232 }
18238 18233
18239 18234 /*
18240 18235 * Helper for ip_interface_cleanup() that ensures no IP interfaces are in IPMP
18241 18236 * groups and that IPMP data addresses are down. These conditions must be met
18242 18237 * so that IPMP interfaces can be I_PUNLINK'd, as per ip_sioctl_plink_ipmp().
18243 18238 */
18244 18239 static void
18245 18240 ip_ipmp_cleanup(ldi_handle_t lh, boolean_t isv6, cred_t *cr)
18246 18241 {
18247 18242 int af = isv6 ? AF_INET6 : AF_INET;
18248 18243 int i, nifs;
18249 18244 int err;
18250 18245 uint_t bufsize;
18251 18246 uint_t lifrsize = sizeof (struct lifreq);
18252 18247 struct lifconf lifc;
18253 18248 struct lifreq *lifrp;
18254 18249
18255 18250 if ((err = ip_lifconf_ioctl(lh, af, &lifc, &bufsize, cr)) != 0) {
18256 18251 cmn_err(CE_WARN, "ip_ipmp_cleanup: cannot get interface list "
18257 18252 "(error %d); any IPMP interfaces cannot be shutdown", err);
18258 18253 return;
18259 18254 }
18260 18255
18261 18256 nifs = lifc.lifc_len / lifrsize;
18262 18257 for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
18263 18258 err = ip_ioctl(lh, SIOCGLIFFLAGS, lifrp, lifrsize, cr);
18264 18259 if (err != 0) {
18265 18260 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot get "
18266 18261 "flags: error %d", lifrp->lifr_name, err);
18267 18262 continue;
18268 18263 }
18269 18264
18270 18265 if (lifrp->lifr_flags & IFF_IPMP) {
18271 18266 if ((lifrp->lifr_flags & (IFF_UP|IFF_DUPLICATE)) == 0)
18272 18267 continue;
18273 18268
18274 18269 lifrp->lifr_flags &= ~IFF_UP;
18275 18270 err = ip_ioctl(lh, SIOCSLIFFLAGS, lifrp, lifrsize, cr);
18276 18271 if (err != 0) {
18277 18272 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot "
18278 18273 "bring down (error %d); IPMP interface may "
18279 18274 "not be shutdown", lifrp->lifr_name, err);
18280 18275 }
18281 18276
18282 18277 /*
18283 18278 * Check if IFF_DUPLICATE is still set -- and if so,
18284 18279 * reset the address to clear it.
18285 18280 */
18286 18281 err = ip_ioctl(lh, SIOCGLIFFLAGS, lifrp, lifrsize, cr);
18287 18282 if (err != 0 || !(lifrp->lifr_flags & IFF_DUPLICATE))
18288 18283 continue;
18289 18284
18290 18285 err = ip_ioctl(lh, SIOCGLIFADDR, lifrp, lifrsize, cr);
18291 18286 if (err != 0 || (err = ip_ioctl(lh, SIOCGLIFADDR,
18292 18287 lifrp, lifrsize, cr)) != 0) {
18293 18288 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot "
18294 18289 "reset DAD (error %d); IPMP interface may "
18295 18290 "not be shutdown", lifrp->lifr_name, err);
18296 18291 }
18297 18292 continue;
18298 18293 }
18299 18294
18300 18295 if (strchr(lifrp->lifr_name, IPIF_SEPARATOR_CHAR) == 0) {
18301 18296 lifrp->lifr_groupname[0] = '\0';
18302 18297 if ((err = ip_ioctl(lh, SIOCSLIFGROUPNAME, lifrp,
18303 18298 lifrsize, cr)) != 0) {
18304 18299 cmn_err(CE_WARN, "ip_ipmp_cleanup: %s: cannot "
18305 18300 "leave IPMP group (error %d); associated "
18306 18301 "IPMP interface may not be shutdown",
18307 18302 lifrp->lifr_name, err);
18308 18303 continue;
18309 18304 }
18310 18305 }
18311 18306 }
18312 18307
18313 18308 kmem_free(lifc.lifc_buf, bufsize);
18314 18309 }
18315 18310
18316 18311 #define UDPDEV "/devices/pseudo/udp@0:udp"
18317 18312 #define UDP6DEV "/devices/pseudo/udp6@0:udp6"
18318 18313
18319 18314 /*
18320 18315 * Remove the loopback interfaces and prep the IPMP interfaces to be torn down.
18321 18316 * Non-loopback interfaces are either I_LINK'd or I_PLINK'd; the former go away
18322 18317 * when the user-level processes in the zone are killed and the latter are
18323 18318 * cleaned up by str_stack_shutdown().
18324 18319 */
18325 18320 void
18326 18321 ip_interface_cleanup(ip_stack_t *ipst)
18327 18322 {
18328 18323 ldi_handle_t lh;
18329 18324 ldi_ident_t li;
18330 18325 cred_t *cr;
18331 18326 int err;
18332 18327 int i;
18333 18328 char *devs[] = { UDP6DEV, UDPDEV };
18334 18329 netstackid_t stackid = ipst->ips_netstack->netstack_stackid;
18335 18330
18336 18331 if ((err = ldi_ident_from_major(ddi_name_to_major("ip"), &li)) != 0) {
18337 18332 cmn_err(CE_WARN, "ip_interface_cleanup: cannot get ldi ident:"
18338 18333 " error %d", err);
18339 18334 return;
18340 18335 }
18341 18336
18342 18337 cr = zone_get_kcred(netstackid_to_zoneid(stackid));
18343 18338 ASSERT(cr != NULL);
18344 18339
18345 18340 /*
18346 18341 * NOTE: loop executes exactly twice and is hardcoded to know that the
18347 18342 * first iteration is IPv6. (Unrolling yields repetitious code, hence
18348 18343 * the loop.)
18349 18344 */
18350 18345 for (i = 0; i < 2; i++) {
18351 18346 err = ldi_open_by_name(devs[i], FREAD|FWRITE, cr, &lh, li);
18352 18347 if (err != 0) {
18353 18348 cmn_err(CE_WARN, "ip_interface_cleanup: cannot open %s:"
18354 18349 " error %d", devs[i], err);
18355 18350 continue;
18356 18351 }
18357 18352
18358 18353 ip_loopback_removeif(lh, i == 0, cr);
18359 18354 ip_ipmp_cleanup(lh, i == 0, cr);
18360 18355
18361 18356 (void) ldi_close(lh, FREAD|FWRITE, cr);
18362 18357 }
18363 18358
18364 18359 ldi_ident_release(li);
18365 18360 crfree(cr);
18366 18361 }
18367 18362
18368 18363 /*
18369 18364 * This needs to be in-sync with nic_event_t definition
18370 18365 */
18371 18366 static const char *
18372 18367 ill_hook_event2str(nic_event_t event)
18373 18368 {
18374 18369 switch (event) {
18375 18370 case NE_PLUMB:
18376 18371 return ("PLUMB");
18377 18372 case NE_UNPLUMB:
18378 18373 return ("UNPLUMB");
18379 18374 case NE_UP:
18380 18375 return ("UP");
18381 18376 case NE_DOWN:
18382 18377 return ("DOWN");
18383 18378 case NE_ADDRESS_CHANGE:
18384 18379 return ("ADDRESS_CHANGE");
18385 18380 case NE_LIF_UP:
18386 18381 return ("LIF_UP");
18387 18382 case NE_LIF_DOWN:
18388 18383 return ("LIF_DOWN");
18389 18384 case NE_IFINDEX_CHANGE:
18390 18385 return ("IFINDEX_CHANGE");
18391 18386 default:
18392 18387 return ("UNKNOWN");
18393 18388 }
18394 18389 }
18395 18390
18396 18391 void
18397 18392 ill_nic_event_dispatch(ill_t *ill, lif_if_t lif, nic_event_t event,
18398 18393 nic_event_data_t data, size_t datalen)
18399 18394 {
18400 18395 ip_stack_t *ipst = ill->ill_ipst;
18401 18396 hook_nic_event_int_t *info;
18402 18397 const char *str = NULL;
18403 18398
18404 18399 /* create a new nic event info */
18405 18400 if ((info = kmem_alloc(sizeof (*info), KM_NOSLEEP)) == NULL)
18406 18401 goto fail;
18407 18402
18408 18403 info->hnei_event.hne_nic = ill->ill_phyint->phyint_ifindex;
18409 18404 info->hnei_event.hne_lif = lif;
18410 18405 info->hnei_event.hne_event = event;
18411 18406 info->hnei_event.hne_protocol = ill->ill_isv6 ?
18412 18407 ipst->ips_ipv6_net_data : ipst->ips_ipv4_net_data;
18413 18408 info->hnei_event.hne_data = NULL;
18414 18409 info->hnei_event.hne_datalen = 0;
18415 18410 info->hnei_stackid = ipst->ips_netstack->netstack_stackid;
18416 18411
18417 18412 if (data != NULL && datalen != 0) {
18418 18413 info->hnei_event.hne_data = kmem_alloc(datalen, KM_NOSLEEP);
18419 18414 if (info->hnei_event.hne_data == NULL)
18420 18415 goto fail;
18421 18416 bcopy(data, info->hnei_event.hne_data, datalen);
18422 18417 info->hnei_event.hne_datalen = datalen;
18423 18418 }
18424 18419
18425 18420 if (ddi_taskq_dispatch(eventq_queue_nic, ip_ne_queue_func, info,
18426 18421 DDI_NOSLEEP) == DDI_SUCCESS)
18427 18422 return;
18428 18423
18429 18424 fail:
18430 18425 if (info != NULL) {
18431 18426 if (info->hnei_event.hne_data != NULL) {
18432 18427 kmem_free(info->hnei_event.hne_data,
18433 18428 info->hnei_event.hne_datalen);
18434 18429 }
18435 18430 kmem_free(info, sizeof (hook_nic_event_t));
18436 18431 }
18437 18432 str = ill_hook_event2str(event);
18438 18433 ip2dbg(("ill_nic_event_dispatch: could not dispatch %s nic event "
18439 18434 "information for %s (ENOMEM)\n", str, ill->ill_name));
18440 18435 }
18441 18436
/*
 * Tail of the IPv4 ARP bringup for `ipif': create the publishing NCE for the
 * local address and arrange for duplicate address detection as dictated by
 * `res_act'.  For an ipif on an IPMP meta-interface this may first bind the
 * ipif to a usable underlying ill.  Returns zero or an errno.
 */
static int
ipif_arp_up_done_tail(ipif_t *ipif, enum ip_resolver_action res_act)
{
	int err = 0;
	const in_addr_t *addr = NULL;
	nce_t *nce = NULL;
	ill_t *ill = ipif->ipif_ill;
	ill_t *bound_ill;
	boolean_t added_ipif = B_FALSE;
	uint16_t state;
	uint16_t flags;

	DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up_done_tail",
	    ill_t *, ill, ipif_t *, ipif);
	if (ipif->ipif_lcl_addr != INADDR_ANY) {
		addr = &ipif->ipif_lcl_addr;
	}

	/* Unnumbered or zero address: only valid for an initial bringup. */
	if ((ipif->ipif_flags & IPIF_UNNUMBERED) || addr == NULL) {
		if (res_act != Res_act_initial)
			return (EINVAL);
	}

	if (addr != NULL) {
		ipmp_illgrp_t *illg = ill->ill_grp;

		/* add unicast nce for the local addr */

		if (IS_IPMP(ill)) {
			/*
			 * If we're here via ipif_up(), then the ipif
			 * won't be bound yet -- add it to the group,
			 * which will bind it if possible. (We would
			 * add it in ipif_up(), but deleting on failure
			 * there is gruesome.) If we're here via
			 * ipmp_ill_bind_ipif(), then the ipif has
			 * already been added to the group and we
			 * just need to use the binding.
			 */
			if ((bound_ill = ipmp_ipif_bound_ill(ipif)) == NULL) {
				bound_ill = ipmp_illgrp_add_ipif(illg, ipif);
				if (bound_ill == NULL) {
					/*
					 * We couldn't bind the ipif to an ill
					 * yet, so we have nothing to publish.
					 * Mark the address as ready and return.
					 */
					ipif->ipif_addr_ready = 1;
					return (0);
				}
				added_ipif = B_TRUE;
			}
		} else {
			bound_ill = ill;
		}

		flags = (NCE_F_MYADDR | NCE_F_PUBLISH | NCE_F_AUTHORITY |
		    NCE_F_NONUD);
		/*
		 * If this is an initial bring-up (or the ipif was never
		 * completely brought up), do DAD. Otherwise, we're here
		 * because IPMP has rebound an address to this ill: send
		 * unsolicited advertisements (ARP announcements) to
		 * inform others.
		 */
		if (res_act == Res_act_initial || !ipif->ipif_addr_ready) {
			state = ND_UNCHANGED; /* compute in nce_add_common() */
		} else {
			state = ND_REACHABLE;
			flags |= NCE_F_UNSOL_ADV;
		}

	retry:
		/* Publish using the bound ill's hardware address. */
		err = nce_lookup_then_add_v4(ill,
		    bound_ill->ill_phys_addr, bound_ill->ill_phys_addr_length,
		    addr, flags, state, &nce);

		/*
		 * note that we may encounter EEXIST if we are moving
		 * the nce as a result of a rebind operation.
		 */
		switch (err) {
		case 0:
			ipif->ipif_added_nce = 1;
			nce->nce_ipif_cnt++;
			break;
		case EEXIST:
			ip1dbg(("ipif_arp_up: NCE already exists for %s\n",
			    ill->ill_name));
			if (!NCE_MYADDR(nce->nce_common)) {
				/*
				 * A leftover nce from before this address
				 * existed
				 */
				ncec_delete(nce->nce_common);
				nce_refrele(nce);
				nce = NULL;
				goto retry;
			}
			if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) {
				nce_refrele(nce);
				nce = NULL;
				ip1dbg(("ipif_arp_up: NCE already exists "
				    "for %s:%u\n", ill->ill_name,
				    ipif->ipif_id));
				goto arp_up_done;
			}
			/*
			 * Duplicate local addresses are permissible for
			 * IPIF_POINTOPOINT interfaces which will get marked
			 * IPIF_UNNUMBERED later in
			 * ip_addr_availability_check().
			 *
			 * The nce_ipif_cnt field tracks the number of
			 * ipifs that have nce_addr as their local address.
			 */
			ipif->ipif_addr_ready = 1;
			ipif->ipif_added_nce = 1;
			nce->nce_ipif_cnt++;
			err = 0;
			break;
		default:
			ASSERT(nce == NULL);
			goto arp_up_done;
		}
		if (arp_no_defense) {
			/* DAD disabled: the address is usable immediately. */
			if ((ipif->ipif_flags & IPIF_UP) &&
			    !ipif->ipif_addr_ready)
				ipif_up_notify(ipif);
			ipif->ipif_addr_ready = 1;
		}
	} else {
		/* zero address. nothing to publish */
		ipif->ipif_addr_ready = 1;
	}
	if (nce != NULL)
		nce_refrele(nce);
arp_up_done:
	/* Undo the group addition from above if the bringup failed. */
	if (added_ipif && err != 0)
		ipmp_illgrp_del_ipif(ill->ill_grp, ipif);
	return (err);
}
18584 18579
18585 18580 int
18586 18581 ipif_arp_up(ipif_t *ipif, enum ip_resolver_action res_act, boolean_t was_dup)
18587 18582 {
18588 18583 int err = 0;
18589 18584 ill_t *ill = ipif->ipif_ill;
18590 18585 boolean_t first_interface, wait_for_dlpi = B_FALSE;
18591 18586
18592 18587 DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up",
18593 18588 ill_t *, ill, ipif_t *, ipif);
18594 18589
18595 18590 /*
18596 18591 * need to bring up ARP or setup mcast mapping only
18597 18592 * when the first interface is coming UP.
18598 18593 */
18599 18594 first_interface = (ill->ill_ipif_up_count == 0 &&
18600 18595 ill->ill_ipif_dup_count == 0 && !was_dup);
18601 18596
18602 18597 if (res_act == Res_act_initial && first_interface) {
18603 18598 /*
18604 18599 * Send ATTACH + BIND
18605 18600 */
18606 18601 err = arp_ll_up(ill);
18607 18602 if (err != EINPROGRESS && err != 0)
18608 18603 return (err);
18609 18604
18610 18605 /*
18611 18606 * Add NCE for local address. Start DAD.
18612 18607 * we'll wait to hear that DAD has finished
18613 18608 * before using the interface.
18614 18609 */
18615 18610 if (err == EINPROGRESS)
18616 18611 wait_for_dlpi = B_TRUE;
18617 18612 }
18618 18613
18619 18614 if (!wait_for_dlpi)
18620 18615 (void) ipif_arp_up_done_tail(ipif, res_act);
18621 18616
18622 18617 return (!wait_for_dlpi ? 0 : EINPROGRESS);
18623 18618 }
18624 18619
18625 18620 /*
18626 18621 * Finish processing of "arp_up" after all the DLPI message
18627 18622 * exchanges have completed between arp and the driver.
18628 18623 */
18629 18624 void
18630 18625 arp_bringup_done(ill_t *ill, int err)
18631 18626 {
18632 18627 mblk_t *mp1;
18633 18628 ipif_t *ipif;
18634 18629 conn_t *connp = NULL;
18635 18630 ipsq_t *ipsq;
18636 18631 queue_t *q;
18637 18632
18638 18633 ip1dbg(("arp_bringup_done(%s)\n", ill->ill_name));
18639 18634
18640 18635 ASSERT(IAM_WRITER_ILL(ill));
18641 18636
18642 18637 ipsq = ill->ill_phyint->phyint_ipsq;
18643 18638 ipif = ipsq->ipsq_xop->ipx_pending_ipif;
18644 18639 mp1 = ipsq_pending_mp_get(ipsq, &connp);
18645 18640 ASSERT(!((mp1 != NULL) ^ (ipif != NULL)));
18646 18641 if (mp1 == NULL) /* bringup was aborted by the user */
18647 18642 return;
18648 18643
18649 18644 /*
18650 18645 * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we
18651 18646 * must have an associated conn_t. Otherwise, we're bringing this
18652 18647 * interface back up as part of handling an asynchronous event (e.g.,
18653 18648 * physical address change).
18654 18649 */
18655 18650 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18656 18651 ASSERT(connp != NULL);
18657 18652 q = CONNP_TO_WQ(connp);
18658 18653 } else {
18659 18654 ASSERT(connp == NULL);
18660 18655 q = ill->ill_rq;
18661 18656 }
18662 18657 if (err == 0) {
18663 18658 if (ipif->ipif_isv6) {
18664 18659 if ((err = ipif_up_done_v6(ipif)) != 0)
18665 18660 ip0dbg(("arp_bringup_done: init failed\n"));
18666 18661 } else {
18667 18662 err = ipif_arp_up_done_tail(ipif, Res_act_initial);
18668 18663 if (err != 0 ||
18669 18664 (err = ipif_up_done(ipif)) != 0) {
18670 18665 ip0dbg(("arp_bringup_done: "
18671 18666 "init failed err %x\n", err));
18672 18667 (void) ipif_arp_down(ipif);
18673 18668 }
18674 18669
18675 18670 }
18676 18671 } else {
18677 18672 ip0dbg(("arp_bringup_done: DL_BIND_REQ failed\n"));
18678 18673 }
18679 18674
18680 18675 if ((err == 0) && (ill->ill_up_ipifs)) {
18681 18676 err = ill_up_ipifs(ill, q, mp1);
18682 18677 if (err == EINPROGRESS)
18683 18678 return;
18684 18679 }
18685 18680
18686 18681 /*
18687 18682 * If we have a moved ipif to bring up, and everything has succeeded
18688 18683 * to this point, bring it up on the IPMP ill. Otherwise, leave it
18689 18684 * down -- the admin can try to bring it up by hand if need be.
18690 18685 */
18691 18686 if (ill->ill_move_ipif != NULL) {
18692 18687 ipif = ill->ill_move_ipif;
18693 18688 ip1dbg(("bringing up ipif %p on ill %s\n", (void *)ipif,
18694 18689 ipif->ipif_ill->ill_name));
18695 18690 ill->ill_move_ipif = NULL;
18696 18691 if (err == 0) {
18697 18692 err = ipif_up(ipif, q, mp1);
18698 18693 if (err == EINPROGRESS)
18699 18694 return;
18700 18695 }
18701 18696 }
18702 18697
18703 18698 /*
18704 18699 * The operation must complete without EINPROGRESS since
18705 18700 * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp.
18706 18701 * Otherwise, the operation will be stuck forever in the ipsq.
18707 18702 */
18708 18703 ASSERT(err != EINPROGRESS);
18709 18704 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18710 18705 DTRACE_PROBE4(ipif__ioctl, char *, "arp_bringup_done finish",
18711 18706 int, ipsq->ipsq_xop->ipx_current_ioctl,
18712 18707 ill_t *, ill, ipif_t *, ipif);
18713 18708 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
18714 18709 } else {
18715 18710 ipsq_current_finish(ipsq);
18716 18711 }
18717 18712 }
18718 18713
18719 18714 /*
18720 18715 * Finish processing of arp replumb after all the DLPI message
18721 18716 * exchanges have completed between arp and the driver.
18722 18717 */
18723 18718 void
18724 18719 arp_replumb_done(ill_t *ill, int err)
18725 18720 {
18726 18721 mblk_t *mp1;
18727 18722 ipif_t *ipif;
18728 18723 conn_t *connp = NULL;
18729 18724 ipsq_t *ipsq;
18730 18725 queue_t *q;
18731 18726
18732 18727 ASSERT(IAM_WRITER_ILL(ill));
18733 18728
18734 18729 ipsq = ill->ill_phyint->phyint_ipsq;
18735 18730 ipif = ipsq->ipsq_xop->ipx_pending_ipif;
18736 18731 mp1 = ipsq_pending_mp_get(ipsq, &connp);
18737 18732 ASSERT(!((mp1 != NULL) ^ (ipif != NULL)));
18738 18733 if (mp1 == NULL) {
18739 18734 ip0dbg(("arp_replumb_done: bringup aborted ioctl %x\n",
18740 18735 ipsq->ipsq_xop->ipx_current_ioctl));
18741 18736 /* bringup was aborted by the user */
18742 18737 return;
18743 18738 }
18744 18739 /*
18745 18740 * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we
18746 18741 * must have an associated conn_t. Otherwise, we're bringing this
18747 18742 * interface back up as part of handling an asynchronous event (e.g.,
18748 18743 * physical address change).
18749 18744 */
18750 18745 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18751 18746 ASSERT(connp != NULL);
18752 18747 q = CONNP_TO_WQ(connp);
18753 18748 } else {
18754 18749 ASSERT(connp == NULL);
18755 18750 q = ill->ill_rq;
18756 18751 }
18757 18752 if ((err == 0) && (ill->ill_up_ipifs)) {
18758 18753 err = ill_up_ipifs(ill, q, mp1);
18759 18754 if (err == EINPROGRESS)
18760 18755 return;
18761 18756 }
18762 18757 /*
18763 18758 * The operation must complete without EINPROGRESS since
18764 18759 * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp.
18765 18760 * Otherwise, the operation will be stuck forever in the ipsq.
18766 18761 */
18767 18762 ASSERT(err != EINPROGRESS);
18768 18763 if (ipsq->ipsq_xop->ipx_current_ioctl != 0) {
18769 18764 DTRACE_PROBE4(ipif__ioctl, char *,
18770 18765 "arp_replumb_done finish",
18771 18766 int, ipsq->ipsq_xop->ipx_current_ioctl,
18772 18767 ill_t *, ill, ipif_t *, ipif);
18773 18768 ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq);
18774 18769 } else {
18775 18770 ipsq_current_finish(ipsq);
18776 18771 }
18777 18772 }
18778 18773
/*
 * Notify interested parties that `ipif' has come up: the routing sockets
 * (interface and new-address messages), SCTP, and the nic event hooks.
 */
void
ipif_up_notify(ipif_t *ipif)
{
	ip_rts_ifmsg(ipif, RTSQ_DEFAULT);
	ip_rts_newaddrmsg(RTM_ADD, 0, ipif, RTSQ_DEFAULT);
	sctp_update_ipif(ipif, SCTP_IPIF_UP);
	ill_nic_event_dispatch(ipif->ipif_ill, MAP_IPIF_ID(ipif->ipif_id),
	    NE_LIF_UP, NULL, 0);
}
18788 18783
18789 18784 /*
18790 18785 * ILB ioctl uses cv_wait (such as deleting a rule or adding a server) and
18791 18786 * this assumes the context is cv_wait'able. Hence it shouldnt' be used on
18792 18787 * TPI end points with STREAMS modules pushed above. This is assured by not
18793 18788 * having the IPI_MODOK flag for the ioctl. And IP ensures the ILB ioctl
18794 18789 * never ends up on an ipsq, otherwise we may end up processing the ioctl
18795 18790 * while unwinding from the ispq and that could be a thread from the bottom.
18796 18791 */
18797 18792 /* ARGSUSED */
18798 18793 int
18799 18794 ip_sioctl_ilb_cmd(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
18800 18795 ip_ioctl_cmd_t *ipip, void *arg)
18801 18796 {
18802 18797 mblk_t *cmd_mp = mp->b_cont->b_cont;
18803 18798 ilb_cmd_t command = *((ilb_cmd_t *)cmd_mp->b_rptr);
18804 18799 int ret = 0;
18805 18800 int i;
18806 18801 size_t size;
18807 18802 ip_stack_t *ipst;
18808 18803 zoneid_t zoneid;
18809 18804 ilb_stack_t *ilbs;
18810 18805
18811 18806 ipst = CONNQ_TO_IPST(q);
18812 18807 ilbs = ipst->ips_netstack->netstack_ilb;
18813 18808 zoneid = Q_TO_CONN(q)->conn_zoneid;
18814 18809
18815 18810 switch (command) {
18816 18811 case ILB_CREATE_RULE: {
18817 18812 ilb_rule_cmd_t *cmd = (ilb_rule_cmd_t *)cmd_mp->b_rptr;
18818 18813
18819 18814 if (MBLKL(cmd_mp) != sizeof (ilb_rule_cmd_t)) {
18820 18815 ret = EINVAL;
18821 18816 break;
18822 18817 }
18823 18818
18824 18819 ret = ilb_rule_add(ilbs, zoneid, cmd);
18825 18820 break;
18826 18821 }
18827 18822 case ILB_DESTROY_RULE:
18828 18823 case ILB_ENABLE_RULE:
18829 18824 case ILB_DISABLE_RULE: {
18830 18825 ilb_name_cmd_t *cmd = (ilb_name_cmd_t *)cmd_mp->b_rptr;
18831 18826
18832 18827 if (MBLKL(cmd_mp) != sizeof (ilb_name_cmd_t)) {
18833 18828 ret = EINVAL;
18834 18829 break;
18835 18830 }
18836 18831
18837 18832 if (cmd->flags & ILB_RULE_ALLRULES) {
18838 18833 if (command == ILB_DESTROY_RULE) {
18839 18834 ilb_rule_del_all(ilbs, zoneid);
18840 18835 break;
18841 18836 } else if (command == ILB_ENABLE_RULE) {
18842 18837 ilb_rule_enable_all(ilbs, zoneid);
18843 18838 break;
18844 18839 } else if (command == ILB_DISABLE_RULE) {
18845 18840 ilb_rule_disable_all(ilbs, zoneid);
18846 18841 break;
18847 18842 }
18848 18843 } else {
18849 18844 if (command == ILB_DESTROY_RULE) {
18850 18845 ret = ilb_rule_del(ilbs, zoneid, cmd->name);
18851 18846 } else if (command == ILB_ENABLE_RULE) {
18852 18847 ret = ilb_rule_enable(ilbs, zoneid, cmd->name,
18853 18848 NULL);
18854 18849 } else if (command == ILB_DISABLE_RULE) {
18855 18850 ret = ilb_rule_disable(ilbs, zoneid, cmd->name,
18856 18851 NULL);
18857 18852 }
18858 18853 }
18859 18854 break;
18860 18855 }
18861 18856 case ILB_NUM_RULES: {
18862 18857 ilb_num_rules_cmd_t *cmd;
18863 18858
18864 18859 if (MBLKL(cmd_mp) != sizeof (ilb_num_rules_cmd_t)) {
18865 18860 ret = EINVAL;
18866 18861 break;
18867 18862 }
18868 18863 cmd = (ilb_num_rules_cmd_t *)cmd_mp->b_rptr;
18869 18864 ilb_get_num_rules(ilbs, zoneid, &(cmd->num));
18870 18865 break;
18871 18866 }
18872 18867 case ILB_RULE_NAMES: {
18873 18868 ilb_rule_names_cmd_t *cmd;
18874 18869
18875 18870 cmd = (ilb_rule_names_cmd_t *)cmd_mp->b_rptr;
18876 18871 if (MBLKL(cmd_mp) < sizeof (ilb_rule_names_cmd_t) ||
18877 18872 cmd->num_names == 0) {
18878 18873 ret = EINVAL;
18879 18874 break;
18880 18875 }
18881 18876 size = cmd->num_names * ILB_RULE_NAMESZ;
18882 18877 if (cmd_mp->b_rptr + offsetof(ilb_rule_names_cmd_t, buf) +
18883 18878 size != cmd_mp->b_wptr) {
18884 18879 ret = EINVAL;
18885 18880 break;
18886 18881 }
18887 18882 ilb_get_rulenames(ilbs, zoneid, &cmd->num_names, cmd->buf);
18888 18883 break;
18889 18884 }
18890 18885 case ILB_NUM_SERVERS: {
18891 18886 ilb_num_servers_cmd_t *cmd;
18892 18887
18893 18888 if (MBLKL(cmd_mp) != sizeof (ilb_num_servers_cmd_t)) {
18894 18889 ret = EINVAL;
18895 18890 break;
18896 18891 }
18897 18892 cmd = (ilb_num_servers_cmd_t *)cmd_mp->b_rptr;
18898 18893 ret = ilb_get_num_servers(ilbs, zoneid, cmd->name,
18899 18894 &(cmd->num));
18900 18895 break;
18901 18896 }
18902 18897 case ILB_LIST_RULE: {
18903 18898 ilb_rule_cmd_t *cmd = (ilb_rule_cmd_t *)cmd_mp->b_rptr;
18904 18899
18905 18900 if (MBLKL(cmd_mp) != sizeof (ilb_rule_cmd_t)) {
18906 18901 ret = EINVAL;
18907 18902 break;
18908 18903 }
18909 18904 ret = ilb_rule_list(ilbs, zoneid, cmd);
18910 18905 break;
18911 18906 }
18912 18907 case ILB_LIST_SERVERS: {
18913 18908 ilb_servers_info_cmd_t *cmd;
18914 18909
18915 18910 cmd = (ilb_servers_info_cmd_t *)cmd_mp->b_rptr;
18916 18911 if (MBLKL(cmd_mp) < sizeof (ilb_servers_info_cmd_t) ||
18917 18912 cmd->num_servers == 0) {
18918 18913 ret = EINVAL;
18919 18914 break;
18920 18915 }
18921 18916 size = cmd->num_servers * sizeof (ilb_server_info_t);
18922 18917 if (cmd_mp->b_rptr + offsetof(ilb_servers_info_cmd_t, servers) +
18923 18918 size != cmd_mp->b_wptr) {
18924 18919 ret = EINVAL;
18925 18920 break;
18926 18921 }
18927 18922
18928 18923 ret = ilb_get_servers(ilbs, zoneid, cmd->name, cmd->servers,
18929 18924 &cmd->num_servers);
18930 18925 break;
18931 18926 }
18932 18927 case ILB_ADD_SERVERS: {
18933 18928 ilb_servers_info_cmd_t *cmd;
18934 18929 ilb_rule_t *rule;
18935 18930
18936 18931 cmd = (ilb_servers_info_cmd_t *)cmd_mp->b_rptr;
18937 18932 if (MBLKL(cmd_mp) < sizeof (ilb_servers_info_cmd_t)) {
18938 18933 ret = EINVAL;
18939 18934 break;
18940 18935 }
18941 18936 size = cmd->num_servers * sizeof (ilb_server_info_t);
18942 18937 if (cmd_mp->b_rptr + offsetof(ilb_servers_info_cmd_t, servers) +
18943 18938 size != cmd_mp->b_wptr) {
18944 18939 ret = EINVAL;
18945 18940 break;
18946 18941 }
18947 18942 rule = ilb_find_rule(ilbs, zoneid, cmd->name, &ret);
18948 18943 if (rule == NULL) {
18949 18944 ASSERT(ret != 0);
18950 18945 break;
18951 18946 }
18952 18947 for (i = 0; i < cmd->num_servers; i++) {
18953 18948 ilb_server_info_t *s;
18954 18949
18955 18950 s = &cmd->servers[i];
18956 18951 s->err = ilb_server_add(ilbs, rule, s);
18957 18952 }
18958 18953 ILB_RULE_REFRELE(rule);
18959 18954 break;
18960 18955 }
18961 18956 case ILB_DEL_SERVERS:
18962 18957 case ILB_ENABLE_SERVERS:
18963 18958 case ILB_DISABLE_SERVERS: {
18964 18959 ilb_servers_cmd_t *cmd;
18965 18960 ilb_rule_t *rule;
18966 18961 int (*f)();
18967 18962
18968 18963 cmd = (ilb_servers_cmd_t *)cmd_mp->b_rptr;
18969 18964 if (MBLKL(cmd_mp) < sizeof (ilb_servers_cmd_t)) {
18970 18965 ret = EINVAL;
18971 18966 break;
18972 18967 }
18973 18968 size = cmd->num_servers * sizeof (ilb_server_arg_t);
18974 18969 if (cmd_mp->b_rptr + offsetof(ilb_servers_cmd_t, servers) +
18975 18970 size != cmd_mp->b_wptr) {
18976 18971 ret = EINVAL;
18977 18972 break;
18978 18973 }
18979 18974
18980 18975 if (command == ILB_DEL_SERVERS)
18981 18976 f = ilb_server_del;
18982 18977 else if (command == ILB_ENABLE_SERVERS)
18983 18978 f = ilb_server_enable;
18984 18979 else if (command == ILB_DISABLE_SERVERS)
18985 18980 f = ilb_server_disable;
18986 18981
18987 18982 rule = ilb_find_rule(ilbs, zoneid, cmd->name, &ret);
18988 18983 if (rule == NULL) {
18989 18984 ASSERT(ret != 0);
18990 18985 break;
18991 18986 }
18992 18987
18993 18988 for (i = 0; i < cmd->num_servers; i++) {
18994 18989 ilb_server_arg_t *s;
18995 18990
18996 18991 s = &cmd->servers[i];
18997 18992 s->err = f(ilbs, zoneid, NULL, rule, &s->addr);
18998 18993 }
18999 18994 ILB_RULE_REFRELE(rule);
19000 18995 break;
19001 18996 }
19002 18997 case ILB_LIST_NAT_TABLE: {
19003 18998 ilb_list_nat_cmd_t *cmd;
19004 18999
19005 19000 cmd = (ilb_list_nat_cmd_t *)cmd_mp->b_rptr;
19006 19001 if (MBLKL(cmd_mp) < sizeof (ilb_list_nat_cmd_t)) {
19007 19002 ret = EINVAL;
19008 19003 break;
19009 19004 }
19010 19005 size = cmd->num_nat * sizeof (ilb_nat_entry_t);
19011 19006 if (cmd_mp->b_rptr + offsetof(ilb_list_nat_cmd_t, entries) +
19012 19007 size != cmd_mp->b_wptr) {
19013 19008 ret = EINVAL;
19014 19009 break;
19015 19010 }
19016 19011
19017 19012 ret = ilb_list_nat(ilbs, zoneid, cmd->entries, &cmd->num_nat,
19018 19013 &cmd->flags);
19019 19014 break;
19020 19015 }
19021 19016 case ILB_LIST_STICKY_TABLE: {
19022 19017 ilb_list_sticky_cmd_t *cmd;
19023 19018
19024 19019 cmd = (ilb_list_sticky_cmd_t *)cmd_mp->b_rptr;
19025 19020 if (MBLKL(cmd_mp) < sizeof (ilb_list_sticky_cmd_t)) {
19026 19021 ret = EINVAL;
19027 19022 break;
19028 19023 }
19029 19024 size = cmd->num_sticky * sizeof (ilb_sticky_entry_t);
19030 19025 if (cmd_mp->b_rptr + offsetof(ilb_list_sticky_cmd_t, entries) +
19031 19026 size != cmd_mp->b_wptr) {
19032 19027 ret = EINVAL;
19033 19028 break;
19034 19029 }
19035 19030
19036 19031 ret = ilb_list_sticky(ilbs, zoneid, cmd->entries,
19037 19032 &cmd->num_sticky, &cmd->flags);
19038 19033 break;
19039 19034 }
19040 19035 default:
19041 19036 ret = EINVAL;
19042 19037 break;
19043 19038 }
19044 19039 done:
19045 19040 return (ret);
19046 19041 }
19047 19042
/*
 * Remove all cache entries for this logical interface: drop the reference on
 * (and possibly delete) the local-address nce, inform IPMP, and, when this is
 * the ill's last up ipif, flush the remaining nces that depend on the ill.
 */
void
ipif_nce_down(ipif_t *ipif)
{
	ill_t *ill = ipif->ipif_ill;
	nce_t *nce;

	DTRACE_PROBE3(ipif__downup, char *, "ipif_nce_down",
	    ill_t *, ill, ipif_t *, ipif);
	if (ipif->ipif_added_nce) {
		/* Find the nce we published for the local address. */
		if (ipif->ipif_isv6)
			nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr);
		else
			nce = nce_lookup_v4(ill, &ipif->ipif_lcl_addr);
		if (nce != NULL) {
			/* Delete the ncec once no ipif uses this address. */
			if (--nce->nce_ipif_cnt == 0)
				ncec_delete(nce->nce_common);
			ipif->ipif_added_nce = 0;
			nce_refrele(nce);
		} else {
			/*
			 * nce may already be NULL because it was already
			 * flushed, e.g., due to a call to nce_flush
			 */
			ipif->ipif_added_nce = 0;
		}
	}
	/*
	 * Make IPMP aware of the deleted data address.
	 */
	if (IS_IPMP(ill))
		ipmp_illgrp_del_ipif(ill->ill_grp, ipif);

	/*
	 * Remove all other nces dependent on this ill when the last ipif
	 * is going away.
	 */
	if (ill->ill_ipif_up_count == 0) {
		ncec_walk(ill, ncec_delete_per_ill, ill, ill->ill_ipst);
		if (IS_UNDER_IPMP(ill))
			nce_flush(ill, B_TRUE);
	}
}
19091 19086
19092 19087 /*
19093 19088 * find the first interface that uses usill for its source address.
19094 19089 */
19095 19090 ill_t *
19096 19091 ill_lookup_usesrc(ill_t *usill)
19097 19092 {
19098 19093 ip_stack_t *ipst = usill->ill_ipst;
19099 19094 ill_t *ill;
19100 19095
19101 19096 ASSERT(usill != NULL);
19102 19097
19103 19098 /* ill_g_usesrc_lock protects ill_usesrc_grp_next */
19104 19099 rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_WRITER);
19105 19100 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
19106 19101 for (ill = usill->ill_usesrc_grp_next; ill != NULL && ill != usill;
19107 19102 ill = ill->ill_usesrc_grp_next) {
19108 19103 if (!IS_UNDER_IPMP(ill) && (ill->ill_flags & ILLF_MULTICAST) &&
19109 19104 !ILL_IS_CONDEMNED(ill)) {
19110 19105 ill_refhold(ill);
19111 19106 break;
19112 19107 }
19113 19108 }
19114 19109 rw_exit(&ipst->ips_ill_g_lock);
19115 19110 rw_exit(&ipst->ips_ill_g_usesrc_lock);
19116 19111 return (ill);
19117 19112 }
19118 19113
19119 19114 /*
19120 19115 * This comment applies to both ip_sioctl_get_ifhwaddr and
19121 19116 * ip_sioctl_get_lifhwaddr as the basic function of these two functions
19122 19117 * is the same.
19123 19118 *
19124 19119 * The goal here is to find an IP interface that corresponds to the name
19125 19120 * provided by the caller in the ifreq/lifreq structure held in the mblk_t
19126 19121 * chain and to fill out a sockaddr/sockaddr_storage structure with the
19127 19122 * mac address.
19128 19123 *
19129 19124 * The SIOCGIFHWADDR/SIOCGLIFHWADDR ioctl may return an error for a number
19130 19125 * of different reasons:
19131 19126 * ENXIO - the device name is not known to IP.
19132 19127 * EADDRNOTAVAIL - the device has no hardware address. This is indicated
19133 19128 * by ill_phys_addr not pointing to an actual address.
 * EPFNOSUPPORT - this will indicate that a request is being made for a
 * mac address that will not fit in the data structure supplied (struct
 * sockaddr).
19137 19132 *
19138 19133 */
19139 19134 /* ARGSUSED */
19140 19135 int
19141 19136 ip_sioctl_get_ifhwaddr(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
19142 19137 ip_ioctl_cmd_t *ipip, void *if_req)
19143 19138 {
19144 19139 struct sockaddr *sock;
19145 19140 struct ifreq *ifr;
19146 19141 mblk_t *mp1;
19147 19142 ill_t *ill;
19148 19143
19149 19144 ASSERT(ipif != NULL);
19150 19145 ill = ipif->ipif_ill;
19151 19146
19152 19147 if (ill->ill_phys_addr == NULL) {
19153 19148 return (EADDRNOTAVAIL);
19154 19149 }
19155 19150 if (ill->ill_phys_addr_length > sizeof (sock->sa_data)) {
19156 19151 return (EPFNOSUPPORT);
19157 19152 }
19158 19153
19159 19154 ip1dbg(("ip_sioctl_get_hwaddr(%s)\n", ill->ill_name));
19160 19155
19161 19156 /* Existence of mp1 has been checked in ip_wput_nondata */
19162 19157 mp1 = mp->b_cont->b_cont;
19163 19158 ifr = (struct ifreq *)mp1->b_rptr;
19164 19159
19165 19160 sock = &ifr->ifr_addr;
19166 19161 /*
19167 19162 * The "family" field in the returned structure is set to a value
19168 19163 * that represents the type of device to which the address belongs.
19169 19164 * The value returned may differ to that on Linux but it will still
19170 19165 * represent the correct symbol on Solaris.
19171 19166 */
19172 19167 sock->sa_family = arp_hw_type(ill->ill_mactype);
19173 19168 bcopy(ill->ill_phys_addr, &sock->sa_data, ill->ill_phys_addr_length);
19174 19169
19175 19170 return (0);
19176 19171 }
19177 19172
19178 19173 /*
 * The expectation of applications using SIOCGIFHWADDR is that data will
19180 19175 * be returned in the sa_data field of the sockaddr structure. With
19181 19176 * SIOCGLIFHWADDR, we're breaking new ground as there is no Linux
19182 19177 * equivalent. In light of this, struct sockaddr_dl is used as it
19183 19178 * offers more space for address storage in sll_data.
19184 19179 */
/* ARGSUSED */
int
ip_sioctl_get_lifhwaddr(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp,
    ip_ioctl_cmd_t *ipip, void *if_req)
{
	struct sockaddr_dl *sock;
	struct lifreq *lifr;
	mblk_t *mp1;
	ill_t *ill;

	ASSERT(ipif != NULL);
	ill = ipif->ipif_ill;

	/* The ill has no hardware address to report. */
	if (ill->ill_phys_addr == NULL) {
		return (EADDRNOTAVAIL);
	}
	/* Address exists but cannot fit in sockaddr_dl.sdl_data. */
	if (ill->ill_phys_addr_length > sizeof (sock->sdl_data)) {
		return (EPFNOSUPPORT);
	}

	ip1dbg(("ip_sioctl_get_lifhwaddr(%s)\n", ill->ill_name));

	/* Existence of mp1 has been checked in ip_wput_nondata */
	mp1 = mp->b_cont->b_cont;
	lifr = (struct lifreq *)mp1->b_rptr;

	/*
	 * struct sockaddr_dl is used here (overlaid on lifr_addr) because
	 * its sdl_data field offers more room for the hardware address than
	 * sockaddr.sa_data does; only the fields filled in below are
	 * meaningful to callers of this ioctl.
	 * NOTE(review): an earlier version of this comment referred to
	 * sockaddr_ll (as used by sockpfp); the code has used sockaddr_dl
	 * all along.
	 */
	lifr->lifr_type = ill->ill_type;
	sock = (struct sockaddr_dl *)&lifr->lifr_addr;
	sock->sdl_family = AF_LINK;
	sock->sdl_index = ill->ill_phyint->phyint_ifindex;
	sock->sdl_type = ill->ill_mactype;
	/* No link name or selector bytes are returned, only the address. */
	sock->sdl_nlen = 0;
	sock->sdl_slen = 0;
	sock->sdl_alen = ill->ill_phys_addr_length;
	bcopy(ill->ill_phys_addr, sock->sdl_data, ill->ill_phys_addr_length);

	return (0);
}
|
↓ open down ↓ |
6270 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX