--- old/usr/src/uts/common/inet/tcp/tcp_bind.c
+++ new/usr/src/uts/common/inet/tcp/tcp_bind.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25 25 * Copyright 2016 Joyent, Inc.
26 26 */
27 27
28 28 #include <sys/types.h>
29 29 #include <sys/stream.h>
30 30 #include <sys/strsun.h>
31 31 #include <sys/strsubr.h>
32 32 #include <sys/stropts.h>
33 33 #include <sys/strlog.h>
34 34 #define _SUN_TPI_VERSION 2
35 35 #include <sys/tihdr.h>
36 36 #include <sys/suntpi.h>
37 37 #include <sys/xti_inet.h>
38 38 #include <sys/policy.h>
39 39 #include <sys/squeue_impl.h>
40 40 #include <sys/squeue.h>
41 41 #include <sys/tsol/tnet.h>
42 42
43 43 #include <rpc/pmap_prot.h>
44 44
45 45 #include <inet/common.h>
46 46 #include <inet/ip.h>
47 47 #include <inet/tcp.h>
48 48 #include <inet/tcp_impl.h>
49 49 #include <inet/proto_set.h>
50 50 #include <inet/ipsec_impl.h>
51 51
52 52 /* Settable in /etc/system */
53 53 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
54 54 static uint32_t tcp_random_anon_port = 1;
55 55
56 56 static int tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t,
57 57 cred_t *cr);
58 58 static in_port_t tcp_get_next_priv_port(const tcp_t *);
59 59 static int tcp_rg_insert(tcp_rg_t *, struct tcp_s *);
60 60
61 61 /*
62 62 * Hash list insertion routine for tcp_t structures. Each hash bucket
63 63 * contains a list of tcp_t entries, and each entry is bound to a unique
64 64 * port. If there are multiple tcp_t's that are bound to the same port, then
65 65 * one of them will be linked into the hash bucket list, and the rest will
66 66 * hang off of that one entry. For each port, entries bound to a specific IP
67 67 * address will be inserted before those bound to INADDR_ANY.
68 68 */
69 69 void
70 70 tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock)
71 71 {
72 72 tcp_t **tcpp;
73 73 tcp_t *tcpnext;
74 74 tcp_t *tcphash;
75 75 conn_t *connp = tcp->tcp_connp;
76 76 conn_t *connext;
77 77
78 78 if (tcp->tcp_ptpbhn != NULL) {
79 79 ASSERT(!caller_holds_lock);
80 80 tcp_bind_hash_remove(tcp);
81 81 }
82 82 tcpp = &tbf->tf_tcp;
83 83 if (!caller_holds_lock) {
84 84 mutex_enter(&tbf->tf_lock);
85 85 } else {
86 86 ASSERT(MUTEX_HELD(&tbf->tf_lock));
87 87 }
88 88 tcphash = tcpp[0];
89 89 tcpnext = NULL;
90 90 if (tcphash != NULL) {
91 91 /* Look for an entry using the same port */
92 92 while ((tcphash = tcpp[0]) != NULL &&
93 93 connp->conn_lport != tcphash->tcp_connp->conn_lport)
94 94 tcpp = &(tcphash->tcp_bind_hash);
95 95
96 96 /* The port was not found, just add to the end */
97 97 if (tcphash == NULL)
98 98 goto insert;
99 99
100 100 /*
101 101 * OK, there already exists an entry bound to the
102 102 * same port.
103 103 *
104 104 * If the new tcp is bound to the INADDR_ANY address
105 105 * and the first one in the list is not bound to
106 106 * INADDR_ANY, we skip all entries until we find the
107 107 * first one bound to INADDR_ANY.
108 108 * This makes sure that applications binding to a
109 109 * specific address get preference over those binding to
110 110 * INADDR_ANY.
111 111 */
112 112 tcpnext = tcphash;
113 113 connext = tcpnext->tcp_connp;
114 114 tcphash = NULL;
115 115 if (V6_OR_V4_INADDR_ANY(connp->conn_bound_addr_v6) &&
116 116 !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) {
117 117 while ((tcpnext = tcpp[0]) != NULL) {
118 118 connext = tcpnext->tcp_connp;
119 119 if (!V6_OR_V4_INADDR_ANY(
120 120 connext->conn_bound_addr_v6))
121 121 tcpp = &(tcpnext->tcp_bind_hash_port);
122 122 else
123 123 break;
124 124 }
125 125 if (tcpnext != NULL) {
126 126 tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
127 127 tcphash = tcpnext->tcp_bind_hash;
128 128 if (tcphash != NULL) {
129 129 tcphash->tcp_ptpbhn =
130 130 &(tcp->tcp_bind_hash);
131 131 tcpnext->tcp_bind_hash = NULL;
132 132 }
133 133 }
134 134 } else {
135 135 tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
136 136 tcphash = tcpnext->tcp_bind_hash;
137 137 if (tcphash != NULL) {
138 138 tcphash->tcp_ptpbhn =
139 139 &(tcp->tcp_bind_hash);
140 140 tcpnext->tcp_bind_hash = NULL;
141 141 }
142 142 }
143 143 }
144 144 insert:
145 145 tcp->tcp_bind_hash_port = tcpnext;
146 146 tcp->tcp_bind_hash = tcphash;
147 147 tcp->tcp_ptpbhn = tcpp;
148 148 tcpp[0] = tcp;
149 149 if (!caller_holds_lock)
150 150 mutex_exit(&tbf->tf_lock);
151 151 }
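The specific-before-ANY ordering above can be illustrated with a standalone sketch; the node_t type and is_any flag below are illustrative stand-ins for the kernel's tcp_t hash chains, not the real structures:

#include <stdio.h>

typedef struct node {
        const char *name;
        int is_any;                     /* 1 if bound to INADDR_ANY */
        struct node *next;
} node_t;

/* Keep specific-address entries ahead of INADDR_ANY entries. */
static void
insert_ordered(node_t **head, node_t *n)
{
        if (n->is_any) {
                /* Skip past every entry bound to a specific address. */
                while (*head != NULL && !(*head)->is_any)
                        head = &(*head)->next;
        }
        n->next = *head;
        *head = n;
}

int
main(void)
{
        node_t any = { "0.0.0.0:80", 1, NULL };
        node_t a = { "10.0.0.1:80", 0, NULL };
        node_t b = { "10.0.0.2:80", 0, NULL };
        node_t *head = NULL, *p;

        insert_ordered(&head, &any);
        insert_ordered(&head, &a);
        insert_ordered(&head, &b);
        for (p = head; p != NULL; p = p->next)
                printf("%s\n", p->name);        /* specific entries first */
        return (0);
}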
152 152
153 153 /*
154 154 * Hash list removal routine for tcp_t structures.
155 155 */
156 156 void
157 157 tcp_bind_hash_remove(tcp_t *tcp)
158 158 {
159 159 tcp_t *tcpnext;
160 160 kmutex_t *lockp;
161 161 tcp_stack_t *tcps = tcp->tcp_tcps;
162 162 conn_t *connp = tcp->tcp_connp;
163 163
164 164 if (tcp->tcp_ptpbhn == NULL)
165 165 return;
166 166
167 167 /*
168 168 * Extract the lock pointer in case there are concurrent
169 169 * hash_remove's for this instance.
170 170 */
171 171 ASSERT(connp->conn_lport != 0);
172 172 lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH(
173 173 connp->conn_lport)].tf_lock;
174 174
175 175 ASSERT(lockp != NULL);
176 176 mutex_enter(lockp);
177 177
178 178 /* destroy any association with SO_REUSEPORT group */
179 179 if (tcp->tcp_rg_bind != NULL) {
180 180 if (tcp_rg_remove(tcp->tcp_rg_bind, tcp)) {
181 181 /* Last one out turns off the lights */
182 182 tcp_rg_destroy(tcp->tcp_rg_bind);
183 183 }
184 184 tcp->tcp_rg_bind = NULL;
185 185 }
186 186
187 187 if (tcp->tcp_ptpbhn) {
188 188 tcpnext = tcp->tcp_bind_hash_port;
189 189 if (tcpnext != NULL) {
190 190 tcp->tcp_bind_hash_port = NULL;
191 191 tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
192 192 tcpnext->tcp_bind_hash = tcp->tcp_bind_hash;
193 193 if (tcpnext->tcp_bind_hash != NULL) {
194 194 tcpnext->tcp_bind_hash->tcp_ptpbhn =
195 195 &(tcpnext->tcp_bind_hash);
196 196 tcp->tcp_bind_hash = NULL;
197 197 }
198 198 } else if ((tcpnext = tcp->tcp_bind_hash) != NULL) {
199 199 tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
200 200 tcp->tcp_bind_hash = NULL;
201 201 }
202 202 *tcp->tcp_ptpbhn = tcpnext;
203 203 tcp->tcp_ptpbhn = NULL;
204 204 }
205 205 mutex_exit(lockp);
206 206 }
207 207
208 208 /*
209 209 * Don't let port fall into the privileged range.
210 210 * Since the extra privileged ports can be arbitrary, we also
211 211 * ensure that we exclude those from consideration.
212 212 * tcp_g_epriv_ports is not sorted, thus we loop over it until
213 213 * there are no changes.
214 214 *
215 215 * Note: No locks are held when inspecting tcp_g_*epriv_ports
216 216 * but instead the code relies on:
217 217 * - the fact that the address of the array and its size never changes
218 218 * - the atomic assignment of the elements of the array
219 219 *
220 220 * Returns 0 if there are no more ports available.
221 221 *
222 222 * TS note: skip multilevel ports.
223 223 */
224 224 in_port_t
225 225 tcp_update_next_port(in_port_t port, const tcp_t *tcp, boolean_t random)
226 226 {
227 227 int i, bump;
228 228 boolean_t restart = B_FALSE;
229 229 tcp_stack_t *tcps = tcp->tcp_tcps;
230 230
231 231 if (random && tcp_random_anon_port != 0) {
232 232 (void) random_get_pseudo_bytes((uint8_t *)&port,
233 233 sizeof (in_port_t));
234 234 /*
235 235 * Unless changed by a sys admin, the smallest anon port
236 236 * is 32768 and the largest anon port is 65535. It is
237 237 * very likely (50%) for the random port to be smaller
238 238 * than the smallest anon port. When that happens,
239 239 * add port % (anon port range) to the smallest anon
240 240 * port to get the random port. It should fall into the
241 241 * valid anon port range.
242 242 */
243 243 if ((port < tcps->tcps_smallest_anon_port) ||
244 244 (port > tcps->tcps_largest_anon_port)) {
245 245 if (tcps->tcps_smallest_anon_port ==
246 246 tcps->tcps_largest_anon_port) {
247 247 bump = 0;
248 248 } else {
249 249 bump = port % (tcps->tcps_largest_anon_port -
250 250 tcps->tcps_smallest_anon_port);
251 251 }
252 252 port = tcps->tcps_smallest_anon_port + bump;
253 253 }
254 254 }
255 255
256 256 retry:
257 257 if (port < tcps->tcps_smallest_anon_port)
258 258 port = (in_port_t)tcps->tcps_smallest_anon_port;
259 259
260 260 if (port > tcps->tcps_largest_anon_port) {
261 261 if (restart)
262 262 return (0);
263 263 restart = B_TRUE;
264 264 port = (in_port_t)tcps->tcps_smallest_anon_port;
265 265 }
266 266
267 267 if (port < tcps->tcps_smallest_nonpriv_port)
268 268 port = (in_port_t)tcps->tcps_smallest_nonpriv_port;
269 269
270 270 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
271 271 if (port == tcps->tcps_g_epriv_ports[i]) {
272 272 port++;
273 273 /*
274 274 * Make sure the port is in the
275 275 * valid range.
276 276 */
277 277 goto retry;
278 278 }
279 279 }
280 280 if (is_system_labeled() &&
281 281 (i = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), port,
282 282 IPPROTO_TCP, B_TRUE)) != 0) {
283 283 port = i;
284 284 goto retry;
285 285 }
286 286 return (port);
287 287 }
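A hedged userland sketch of the random-port mapping above: a random 16-bit value is folded into the anonymous range with the same modulo-and-offset step, including the guard for an empty range. The bounds passed in main() mirror the defaults named in the comment, and rand() stands in for random_get_pseudo_bytes():

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <time.h>

static uint16_t
pick_anon_port(uint16_t smallest, uint16_t largest)
{
        uint16_t port = (uint16_t)(rand() & 0xffff);

        if (port < smallest || port > largest) {
                /* Fold out-of-range values into the anonymous range. */
                uint16_t bump = (smallest == largest) ? 0 :
                    port % (largest - smallest);
                port = smallest + bump;
        }
        return (port);
}

int
main(void)
{
        srand((unsigned)time(NULL));
        for (int i = 0; i < 5; i++)
                printf("%u\n", pick_anon_port(32768, 65535));
        return (0);
}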
288 288
289 289 /*
290 290 * Return the next anonymous port in the privileged port range for
291 291 * bind checking. It starts at IPPORT_RESERVED - 1 and goes
292 292 * downwards. This is the same behavior as documented in the userland
293 293 * library call rresvport(3N).
294 294 *
295 295 * TS note: skip multilevel ports.
296 296 */
297 297 static in_port_t
298 298 tcp_get_next_priv_port(const tcp_t *tcp)
299 299 {
300 300 static in_port_t next_priv_port = IPPORT_RESERVED - 1;
301 301 in_port_t nextport;
302 302 boolean_t restart = B_FALSE;
303 303 tcp_stack_t *tcps = tcp->tcp_tcps;
304 304 retry:
305 305 if (next_priv_port < tcps->tcps_min_anonpriv_port ||
306 306 next_priv_port >= IPPORT_RESERVED) {
307 307 next_priv_port = IPPORT_RESERVED - 1;
308 308 if (restart)
309 309 return (0);
310 310 restart = B_TRUE;
311 311 }
312 312 if (is_system_labeled() &&
313 313 (nextport = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred),
314 314 next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) {
315 315 next_priv_port = nextport;
316 316 goto retry;
317 317 }
318 318 return (next_priv_port--);
319 319 }
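For comparison, a minimal userland analogue of this descending search, in the spirit of the rresvport(3N) behavior cited above. The bind_resv_port() helper is illustrative (not a real libc routine) and requires privilege to bind ports below IPPORT_RESERVED:

#include <errno.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>

/* Try IPPORT_RESERVED - 1 downward until bind() succeeds. */
static int
bind_resv_port(int fd, in_port_t *portp)
{
        struct sockaddr_in sin;
        in_port_t port;

        memset(&sin, 0, sizeof (sin));
        sin.sin_family = AF_INET;
        sin.sin_addr.s_addr = htonl(INADDR_ANY);

        for (port = IPPORT_RESERVED - 1; port >= IPPORT_RESERVED / 2;
            port--) {
                sin.sin_port = htons(port);
                if (bind(fd, (struct sockaddr *)&sin, sizeof (sin)) == 0) {
                        *portp = port;
                        return (0);
                }
                if (errno != EADDRINUSE)
                        return (-1);    /* e.g. EACCES without privilege */
        }
        errno = EAGAIN;
        return (-1);
}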
320 320
321 321 static int
322 322 tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr,
323 323 boolean_t bind_to_req_port_only, cred_t *cr)
324 324 {
325 325 in_port_t mlp_port;
326 326 mlp_type_t addrtype, mlptype;
327 327 boolean_t user_specified;
328 328 in_port_t allocated_port;
329 329 in_port_t requested_port = *requested_port_ptr;
330 330 conn_t *connp = tcp->tcp_connp;
331 331 zone_t *zone;
332 332 tcp_stack_t *tcps = tcp->tcp_tcps;
333 333 in6_addr_t v6addr = connp->conn_laddr_v6;
334 334
335 335 /*
336 336 * XXX It's up to the caller to specify bind_to_req_port_only or not.
337 337 */
338 338 ASSERT(cr != NULL);
339 339
340 340 /*
341 341 * Get a valid port (within the anonymous range and not
342 342 * a privileged one) to use if the user has not given a port.
343 343 * If multiple threads are here, they may all start
344 344 * with the same initial port. But it should be fine as long as
345 345 * tcp_bindi will ensure that no two threads will be assigned
346 346 * the same port.
347 347 *
348 348 * NOTE: XXX If a privileged process asks for an anonymous port, we
349 349 * still check for ports only in the range > tcp_smallest_non_priv_port,
350 350 * unless TCP_ANONPRIVBIND option is set.
351 351 */
352 352 mlptype = mlptSingle;
353 353 mlp_port = requested_port;
354 354 if (requested_port == 0) {
355 355 requested_port = connp->conn_anon_priv_bind ?
356 356 tcp_get_next_priv_port(tcp) :
357 357 tcp_update_next_port(tcps->tcps_next_port_to_try,
358 358 tcp, B_TRUE);
359 359 if (requested_port == 0) {
360 360 return (-TNOADDR);
361 361 }
362 362 user_specified = B_FALSE;
363 363
364 364 /*
365 365 * If the user went through one of the RPC interfaces to create
366 366 * this socket and RPC is MLP in this zone, then give him an
367 367 * anonymous MLP.
368 368 */
369 369 if (connp->conn_anon_mlp && is_system_labeled()) {
370 370 zone = crgetzone(cr);
371 371 addrtype = tsol_mlp_addr_type(
372 372 connp->conn_allzones ? ALL_ZONES : zone->zone_id,
373 373 IPV6_VERSION, &v6addr,
374 374 tcps->tcps_netstack->netstack_ip);
375 375 if (addrtype == mlptSingle) {
376 376 return (-TNOADDR);
377 377 }
378 378 mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
379 379 PMAPPORT, addrtype);
380 380 mlp_port = PMAPPORT;
381 381 }
382 382 } else {
383 383 int i;
384 384 boolean_t priv = B_FALSE;
385 385
386 386 /*
387 387 * If the requested_port is in the well-known privileged range,
388 388 * verify that the stream was opened by a privileged user.
389 389 * Note: No locks are held when inspecting tcp_g_*epriv_ports
390 390 * but instead the code relies on:
391 391 * - the fact that the address of the array and its size never
392 392 * changes
393 393 * - the atomic assignment of the elements of the array
394 394 */
395 395 if (requested_port < tcps->tcps_smallest_nonpriv_port) {
396 396 priv = B_TRUE;
397 397 } else {
398 398 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
399 399 if (requested_port ==
400 400 tcps->tcps_g_epriv_ports[i]) {
401 401 priv = B_TRUE;
402 402 break;
403 403 }
404 404 }
405 405 }
406 406 if (priv) {
407 407 if (secpolicy_net_privaddr(cr, requested_port,
408 408 IPPROTO_TCP) != 0) {
409 409 if (connp->conn_debug) {
410 410 (void) strlog(TCP_MOD_ID, 0, 1,
411 411 SL_ERROR|SL_TRACE,
412 412 "tcp_bind: no priv for port %d",
413 413 requested_port);
414 414 }
415 415 return (-TACCES);
416 416 }
417 417 }
418 418 user_specified = B_TRUE;
419 419
420 420 connp = tcp->tcp_connp;
421 421 if (is_system_labeled()) {
422 422 zone = crgetzone(cr);
423 423 addrtype = tsol_mlp_addr_type(
424 424 connp->conn_allzones ? ALL_ZONES : zone->zone_id,
425 425 IPV6_VERSION, &v6addr,
426 426 tcps->tcps_netstack->netstack_ip);
427 427 if (addrtype == mlptSingle) {
428 428 return (-TNOADDR);
429 429 }
430 430 mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
431 431 requested_port, addrtype);
432 432 }
433 433 }
434 434
435 435 if (mlptype != mlptSingle) {
436 436 if (secpolicy_net_bindmlp(cr) != 0) {
437 437 if (connp->conn_debug) {
438 438 (void) strlog(TCP_MOD_ID, 0, 1,
439 439 SL_ERROR|SL_TRACE,
440 440 "tcp_bind: no priv for multilevel port %d",
441 441 requested_port);
442 442 }
443 443 return (-TACCES);
444 444 }
445 445
446 446 /*
447 447 * If we're specifically binding a shared IP address and the
448 448 * port is MLP on shared addresses, then check to see if this
449 449 * zone actually owns the MLP. Reject if not.
450 450 */
451 451 if (mlptype == mlptShared && addrtype == mlptShared) {
452 452 /*
453 453 * No need to handle exclusive-stack zones since
454 454 * ALL_ZONES only applies to the shared stack.
455 455 */
456 456 zoneid_t mlpzone;
457 457
458 458 mlpzone = tsol_mlp_findzone(IPPROTO_TCP,
459 459 htons(mlp_port));
460 460 if (connp->conn_zoneid != mlpzone) {
461 461 if (connp->conn_debug) {
462 462 (void) strlog(TCP_MOD_ID, 0, 1,
463 463 SL_ERROR|SL_TRACE,
464 464 "tcp_bind: attempt to bind port "
465 465 "%d on shared addr in zone %d "
466 466 "(should be %d)",
467 467 mlp_port, connp->conn_zoneid,
468 468 mlpzone);
469 469 }
470 470 return (-TACCES);
471 471 }
472 472 }
473 473
474 474 if (!user_specified) {
475 475 int err;
476 476 err = tsol_mlp_anon(zone, mlptype, connp->conn_proto,
477 477 requested_port, B_TRUE);
478 478 if (err != 0) {
479 479 if (connp->conn_debug) {
480 480 (void) strlog(TCP_MOD_ID, 0, 1,
481 481 SL_ERROR|SL_TRACE,
482 482 "tcp_bind: cannot establish anon "
483 483 "MLP for port %d",
484 484 requested_port);
485 485 }
486 486 return (err);
487 487 }
488 488 connp->conn_anon_port = B_TRUE;
489 489 }
490 490 connp->conn_mlp_type = mlptype;
491 491 }
492 492
493 493 allocated_port = tcp_bindi(tcp, requested_port, &v6addr,
494 494 connp->conn_reuseaddr, B_FALSE, bind_to_req_port_only,
495 495 user_specified);
496 496
497 497 if (allocated_port == 0) {
498 498 connp->conn_mlp_type = mlptSingle;
499 499 if (connp->conn_anon_port) {
500 500 connp->conn_anon_port = B_FALSE;
501 501 (void) tsol_mlp_anon(zone, mlptype, connp->conn_proto,
502 502 requested_port, B_FALSE);
503 503 }
504 504 if (bind_to_req_port_only) {
505 505 if (connp->conn_debug) {
506 506 (void) strlog(TCP_MOD_ID, 0, 1,
507 507 SL_ERROR|SL_TRACE,
508 508 "tcp_bind: requested addr busy");
509 509 }
510 510 return (-TADDRBUSY);
511 511 } else {
512 512 /* If we are out of ports, fail the bind. */
513 513 if (connp->conn_debug) {
514 514 (void) strlog(TCP_MOD_ID, 0, 1,
515 515 SL_ERROR|SL_TRACE,
516 516 "tcp_bind: out of ports?");
517 517 }
518 518 return (-TNOADDR);
519 519 }
520 520 }
521 521
522 522 /* Pass the allocated port back */
523 523 *requested_port_ptr = allocated_port;
524 524 return (0);
525 525 }
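From userland, the effect of this selection is visible by binding port 0 and reading the allocated port back with getsockname(); a minimal sketch, with error handling trimmed:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>

int
main(void)
{
        struct sockaddr_in sin;
        socklen_t slen = sizeof (sin);
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        memset(&sin, 0, sizeof (sin));
        sin.sin_family = AF_INET;
        sin.sin_port = htons(0);        /* let the stack pick a port */
        sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

        if (bind(fd, (struct sockaddr *)&sin, sizeof (sin)) != 0 ||
            getsockname(fd, (struct sockaddr *)&sin, &slen) != 0) {
                perror("bind/getsockname");
                return (1);
        }
        printf("allocated port: %u\n", ntohs(sin.sin_port));
        (void) close(fd);
        return (0);
}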
526 526
527 527 /*
528 528 * Check the address and check/pick a local port number.
529 529 */
530 530 int
531 531 tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
532 532 boolean_t bind_to_req_port_only)
533 533 {
534 534 tcp_t *tcp = connp->conn_tcp;
535 535 sin_t *sin;
536 536 sin6_t *sin6;
537 537 in_port_t requested_port;
538 538 ipaddr_t v4addr;
539 539 in6_addr_t v6addr;
540 540 ip_laddr_t laddr_type = IPVL_UNICAST_UP; /* INADDR_ANY */
541 541 zoneid_t zoneid = IPCL_ZONEID(connp);
542 542 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
543 543 uint_t scopeid = 0;
544 544 int error = 0;
545 545 ip_xmit_attr_t *ixa = connp->conn_ixa;
546 546
547 547 ASSERT((uintptr_t)len <= (uintptr_t)INT_MAX);
548 548
549 549 if (tcp->tcp_state == TCPS_BOUND) {
550 550 return (0);
551 551 } else if (tcp->tcp_state > TCPS_BOUND) {
552 552 if (connp->conn_debug) {
553 553 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
554 554 "tcp_bind: bad state, %d", tcp->tcp_state);
555 555 }
556 556 return (-TOUTSTATE);
557 557 }
558 558
559 559 ASSERT(sa != NULL && len != 0);
560 560
561 561 if (!OK_32PTR((char *)sa)) {
562 562 if (connp->conn_debug) {
563 563 (void) strlog(TCP_MOD_ID, 0, 1,
564 564 SL_ERROR|SL_TRACE,
565 565 "tcp_bind: bad address parameter, "
566 566 "address %p, len %d",
567 567 (void *)sa, len);
568 568 }
569 569 return (-TPROTO);
570 570 }
571 571
572 572 error = proto_verify_ip_addr(connp->conn_family, sa, len);
573 573 if (error != 0) {
574 574 return (error);
575 575 }
576 576
577 577 switch (len) {
578 578 case sizeof (sin_t): /* Complete IPv4 address */
579 579 sin = (sin_t *)sa;
580 580 requested_port = ntohs(sin->sin_port);
581 581 v4addr = sin->sin_addr.s_addr;
582 582 IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr);
583 583 if (v4addr != INADDR_ANY) {
584 584 laddr_type = ip_laddr_verify_v4(v4addr, zoneid, ipst,
585 585 B_FALSE);
586 586 }
587 587 break;
588 588
589 589 case sizeof (sin6_t): /* Complete IPv6 address */
590 590 sin6 = (sin6_t *)sa;
591 591 v6addr = sin6->sin6_addr;
592 592 requested_port = ntohs(sin6->sin6_port);
593 593 if (IN6_IS_ADDR_V4MAPPED(&v6addr)) {
594 594 if (connp->conn_ipv6_v6only)
595 595 return (EADDRNOTAVAIL);
596 596
597 597 IN6_V4MAPPED_TO_IPADDR(&v6addr, v4addr);
598 598 if (v4addr != INADDR_ANY) {
599 599 laddr_type = ip_laddr_verify_v4(v4addr,
600 600 zoneid, ipst, B_FALSE);
601 601 }
602 602 } else {
603 603 if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr)) {
604 604 if (IN6_IS_ADDR_LINKSCOPE(&v6addr))
605 605 scopeid = sin6->sin6_scope_id;
606 606 laddr_type = ip_laddr_verify_v6(&v6addr,
607 607 zoneid, ipst, B_FALSE, scopeid);
608 608 }
609 609 }
610 610 break;
611 611
612 612 default:
613 613 if (connp->conn_debug) {
614 614 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
615 615 "tcp_bind: bad address length, %d", len);
616 616 }
617 617 return (EAFNOSUPPORT);
618 618 /* return (-TBADADDR); */
619 619 }
620 620
621 621 /* Is the local address a valid unicast address? */
622 622 if (laddr_type == IPVL_BAD)
623 623 return (EADDRNOTAVAIL);
624 624
625 625 connp->conn_bound_addr_v6 = v6addr;
626 626 if (scopeid != 0) {
627 627 ixa->ixa_flags |= IXAF_SCOPEID_SET;
628 628 ixa->ixa_scopeid = scopeid;
629 629 connp->conn_incoming_ifindex = scopeid;
630 630 } else {
631 631 ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
632 632 connp->conn_incoming_ifindex = connp->conn_bound_if;
633 633 }
634 634
635 635 connp->conn_laddr_v6 = v6addr;
636 636 connp->conn_saddr_v6 = v6addr;
637 637
638 638 bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only;
639 639
640 640 error = tcp_bind_select_lport(tcp, &requested_port,
641 641 bind_to_req_port_only, cr);
642 642 if (error != 0) {
643 643 connp->conn_laddr_v6 = ipv6_all_zeros;
644 644 connp->conn_saddr_v6 = ipv6_all_zeros;
645 645 connp->conn_bound_addr_v6 = ipv6_all_zeros;
646 646 }
647 647 return (error);
648 648 }
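The conn_ipv6_v6only rejection of v4-mapped addresses above can be observed from userland: with IPV6_V6ONLY set, binding ::ffff:127.0.0.1 should fail with EADDRNOTAVAIL. A minimal sketch; the port number is arbitrary:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int
main(void)
{
        struct sockaddr_in6 sin6;
        int fd, on = 1;

        fd = socket(AF_INET6, SOCK_STREAM, 0);
        if (fd < 0 || setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY,
            &on, sizeof (on)) != 0) {
                perror("setup");
                return (1);
        }

        memset(&sin6, 0, sizeof (sin6));
        sin6.sin6_family = AF_INET6;
        sin6.sin6_port = htons(8080);
        (void) inet_pton(AF_INET6, "::ffff:127.0.0.1", &sin6.sin6_addr);

        if (bind(fd, (struct sockaddr *)&sin6, sizeof (sin6)) != 0)
                perror("bind");         /* expect EADDRNOTAVAIL */
        (void) close(fd);
        return (0);
}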
649 649
650 650 /*
651 651 * If the "bind_to_req_port_only" parameter is set and the requested port
652 652 * number is available, return it (else return 0).
653 653 *
654 654 * If "bind_to_req_port_only" parameter is not set and the requested port
655 655 * number is available, return it. If not, return the first anonymous port we
656 656 * happen across. If no anonymous ports are available, return 0.
657 657 *
658 658 * In either case, when succeeding update the tcp_t to record the port number
659 659 * and insert it in the bind hash table.
660 660 *
661 661 * Note that TCP over IPv4 and IPv6 sockets can use the same port number
662 662 * without setting SO_REUSEADDR. This is needed so that they
663 663 * can be viewed as two independent transport protocols.
664 664 */
665 665 in_port_t
666 666 tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
667 667 int reuseaddr, boolean_t quick_connect,
668 668 boolean_t bind_to_req_port_only, boolean_t user_specified)
669 669 {
670 670 /* number of times we have run around the loop */
671 671 int count = 0;
672 672 /* maximum number of times to run around the loop */
673 673 int loopmax;
674 674 conn_t *connp = tcp->tcp_connp;
675 675 tcp_stack_t *tcps = tcp->tcp_tcps;
676 676 boolean_t reuseport = connp->conn_reuseport;
677 677
678 678 /*
679 679 * The lookup for a free port is done in a loop, and "loopmax"
680 680 * bounds how many times we spin in the loop.
681 681 */
682 682 if (bind_to_req_port_only) {
683 683 /*
684 684 * If the requested port is busy, don't bother to look
685 685 * for a new one. Setting loop maximum count to 1 has
686 686 * that effect.
687 687 */
688 688 loopmax = 1;
689 689 } else {
690 690 /*
691 691 * If the requested port is busy, look for a free one
692 692 * in the anonymous port range.
693 693 * Set loopmax appropriately so that one does not look
694 694 * forever in the case all of the anonymous ports are in use.
695 695 */
696 696 if (connp->conn_anon_priv_bind) {
697 697 /*
698 698 * loopmax =
699 699 * (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1
700 700 */
701 701 loopmax = IPPORT_RESERVED -
702 702 tcps->tcps_min_anonpriv_port;
703 703 } else {
704 704 loopmax = (tcps->tcps_largest_anon_port -
705 705 tcps->tcps_smallest_anon_port + 1);
706 706 }
707 707 }
708 708 do {
709 709 uint16_t lport;
710 710 tf_t *tbf;
711 711 tcp_t *ltcp;
712 712 conn_t *lconnp;
713 713 boolean_t attempt_reuse = B_FALSE;
714 714
715 715 lport = htons(port);
716 716
717 717 /*
718 718 * Ensure that the tcp_t is not currently in the bind hash.
719 719 * Hold the lock on the hash bucket to ensure that
720 720 * the duplicate check plus the insertion is an atomic
721 721 * operation.
722 722 *
723 723 * This function does an inline lookup on the bind hash list.
724 724 * Make sure that we access only members of tcp_t
725 725 * and that we don't look at tcp_tcp, since we are not
726 726 * doing a CONN_INC_REF.
727 727 */
728 728 tcp_bind_hash_remove(tcp);
729 729 tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)];
730 730 mutex_enter(&tbf->tf_lock);
731 731 for (ltcp = tbf->tf_tcp; ltcp != NULL;
732 732 ltcp = ltcp->tcp_bind_hash) {
733 733 if (lport == ltcp->tcp_connp->conn_lport)
734 734 break;
735 735 }
736 736
737 737 for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
738 738 boolean_t not_socket;
739 739 boolean_t exclbind;
740 740 boolean_t addrmatch;
741 741
742 742 lconnp = ltcp->tcp_connp;
743 743
744 744 /*
745 745 * On a labeled system, we must treat bindings to ports
746 746 * on shared IP addresses by sockets with MAC exemption
747 747 * privilege as being in all zones, as there's
748 748 * otherwise no way to identify the right receiver.
749 749 */
750 750 if (!IPCL_BIND_ZONE_MATCH(lconnp, connp))
751 751 continue;
752 752
753 753 /*
754 754 * If TCP_EXCLBIND is set for either the bound or
755 755 * binding endpoint, the semantics of bind
756 756 * is changed according to the following.
757 757 *
758 758 * spec = specified address (v4 or v6)
759 759 * unspec = unspecified address (v4 or v6)
760 760 * A = specified addresses are different for endpoints
761 761 *
762 762 * bound bind to allowed
763 763 * -------------------------------------
764 764 * unspec unspec no
765 765 * unspec spec no
766 766 * spec unspec no
767 767 * spec spec yes if A
768 768 *
769 769 * For labeled systems, SO_MAC_EXEMPT behaves the same
770 770 * as TCP_EXCLBIND, except that zoneid is ignored.
771 771 *
772 772 * Note:
773 773 *
774 774 * 1. Because of TLI semantics, an endpoint can go
775 775 * back from, say TCP_ESTABLISHED to TCPS_LISTEN or
776 776 * TCPS_BOUND, depending on whether it is originally
777 777 * a listener or not. That is why we need to check
778 778 * for states greater than or equal to TCPS_BOUND
779 779 * here.
780 780 *
781 781 * 2. Ideally, we should only check for state equals
782 782 * to TCPS_LISTEN. And the following check should be
783 783 * added.
784 784 *
785 785 * if (ltcp->tcp_state == TCPS_LISTEN ||
786 786 * !reuseaddr || !lconnp->conn_reuseaddr) {
787 787 * ...
788 788 * }
789 789 *
790 790 * The semantics will be changed to this. If the
791 791 * endpoint on the list is in state not equal to
792 792 * TCPS_LISTEN and both endpoints have SO_REUSEADDR
793 793 * set, let the bind succeed.
794 794 *
795 795 * Because of (1), we cannot do that for TLI
796 796 * endpoints. But we can do that for socket endpoints.
797 797 * If in future, we can change this going back
798 798 * semantics, we can use the above check for TLI also.
799 799 */
800 800 not_socket = !(TCP_IS_SOCKET(ltcp) &&
801 801 TCP_IS_SOCKET(tcp));
802 802 exclbind = lconnp->conn_exclbind ||
803 803 connp->conn_exclbind;
804 804
805 805 if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) ||
806 806 (connp->conn_mac_mode != CONN_MAC_DEFAULT) ||
807 807 (exclbind && (not_socket ||
808 808 ltcp->tcp_state <= TCPS_ESTABLISHED))) {
809 809 if (V6_OR_V4_INADDR_ANY(
810 810 lconnp->conn_bound_addr_v6) ||
811 811 V6_OR_V4_INADDR_ANY(*laddr) ||
812 812 IN6_ARE_ADDR_EQUAL(laddr,
813 813 &lconnp->conn_bound_addr_v6)) {
814 814 break;
815 815 }
816 816 continue;
817 817 }
818 818
819 819 /*
820 820 * Check ipversion to allow IPv4 and IPv6 sockets to
821 821 * have disjoint port number spaces, if *_EXCLBIND
822 822 * is not set and only if the application binds to a
823 823 * specific port. We use the same autoassigned port
824 824 * number space for IPv4 and IPv6 sockets.
825 825 */
826 826 if (connp->conn_ipversion != lconnp->conn_ipversion &&
827 827 bind_to_req_port_only)
828 828 continue;
829 829
830 830 /*
831 831 * Ideally, we should make sure that the source
832 832 * address, remote address, and remote port in the
833 833 * four-tuple for this TCP connection are unique.
834 834 * However, trying to find out the local source
835 835 * address would require too much code duplication
836 836 * with IP, since IP needs to have that code
837 837 * to support userland TCP implementations.
838 838 */
839 839 if (quick_connect &&
840 840 (ltcp->tcp_state > TCPS_LISTEN) &&
841 841 ((connp->conn_fport != lconnp->conn_fport) ||
842 842 !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
843 843 &lconnp->conn_faddr_v6)))
844 844 continue;
845 845
846 846 addrmatch = IN6_ARE_ADDR_EQUAL(laddr,
847 847 &lconnp->conn_bound_addr_v6);
848 848
849 849 if (addrmatch && reuseport && bind_to_req_port_only &&
850 850 (ltcp->tcp_state == TCPS_BOUND ||
851 851 ltcp->tcp_state == TCPS_LISTEN)) {
852 852 /*
853 853 * This entry is bound to the exact same
854 854 * address and port. If SO_REUSEPORT is set on
855 855 * the calling socket, attempt to reuse this
856 856 * binding if it too had SO_REUSEPORT enabled
857 857 * when it was bound.
858 858 */
859 859 attempt_reuse = (ltcp->tcp_rg_bind != NULL);
860 860 break;
861 861 }
862 862
863 863 if (!reuseaddr) {
864 864 /*
865 865 * No socket option SO_REUSEADDR. If an
866 866 * existing port is bound to a non-wildcard IP
867 867 * address and the requesting stream is bound
868 868 * to a distinct different IP address
869 869 * (non-wildcard, also), keep going.
870 870 */
871 871 if (!V6_OR_V4_INADDR_ANY(*laddr) &&
872 872 !V6_OR_V4_INADDR_ANY(
873 873 lconnp->conn_bound_addr_v6) &&
874 874 !addrmatch)
875 875 continue;
876 876 if (ltcp->tcp_state >= TCPS_BOUND) {
877 877 /*
878 878 * This port is being used and
879 879 * its state is >= TCPS_BOUND,
880 880 * so we can't bind to it.
881 881 */
882 882 break;
883 883 }
884 884 } else {
885 885 /*
886 886 * socket option SO_REUSEADDR is set on the
887 887 * binding tcp_t.
888 888 *
889 889 * If two streams are bound to the same IP
890 890 * address or both addr and bound source are
891 891 * wildcards (INADDR_ANY), we want to stop
892 892 * searching. We have found a match of IP
893 893 * source address and source port, which is
894 894 * refused regardless of the SO_REUSEADDR
895 895 * setting, so we break.
896 896 */
897 897 if (addrmatch &&
898 898 (ltcp->tcp_state == TCPS_LISTEN ||
899 899 ltcp->tcp_state == TCPS_BOUND))
900 900 break;
901 901 }
902 902 }
903 903 if (ltcp != NULL && !attempt_reuse) {
904 904 /* The port number is busy */
905 905 mutex_exit(&tbf->tf_lock);
906 906 } else {
907 907 if (attempt_reuse) {
908 908 int err;
909 909 struct tcp_rg_s *rg;
910 910
911 911 ASSERT(ltcp != NULL);
912 912 ASSERT(ltcp->tcp_rg_bind != NULL);
913 913 ASSERT(tcp->tcp_rg_bind != NULL);
914 914 ASSERT(ltcp->tcp_rg_bind != tcp->tcp_rg_bind);
915 915
916 916 err = tcp_rg_insert(ltcp->tcp_rg_bind, tcp);
917 917 if (err != 0) {
918 918 mutex_exit(&tbf->tf_lock);
919 919 return (0);
920 920 }
921 921 /*
922 922 * Now that the newly-binding socket has joined
923 923 * the existing reuseport group on ltcp, it
924 924 * should clean up its own (empty) group.
925 925 */
926 926 rg = tcp->tcp_rg_bind;
927 927 tcp->tcp_rg_bind = ltcp->tcp_rg_bind;
928 928 VERIFY(tcp_rg_remove(rg, tcp));
929 929 tcp_rg_destroy(rg);
930 930 }
931 931
932 932 /*
933 933 * This port is ours. Insert in fanout and mark as
934 934 * bound to prevent others from getting the port
935 935 * number.
936 936 */
937 937 tcp->tcp_state = TCPS_BOUND;
938 938 DTRACE_TCP6(state__change, void, NULL,
939 939 ip_xmit_attr_t *, connp->conn_ixa,
940 940 void, NULL, tcp_t *, tcp, void, NULL,
941 941 int32_t, TCPS_IDLE);
942 942
943 943 connp->conn_lport = htons(port);
944 944
945 945 ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
946 946 connp->conn_lport)] == tbf);
947 947 tcp_bind_hash_insert(tbf, tcp, 1);
948 948
949 949 mutex_exit(&tbf->tf_lock);
950 950
951 951 /*
952 952 * We don't want tcp_next_port_to_try to "inherit"
953 953 * a port number supplied by the user in a bind.
954 954 */
955 955 if (user_specified)
956 956 return (port);
957 957
958 958 /*
959 959 * This is the only place where tcp_next_port_to_try
960 960 * is updated. After the update, it may or may not
961 961 * be in the valid range.
962 962 */
963 963 if (!connp->conn_anon_priv_bind)
964 964 tcps->tcps_next_port_to_try = port + 1;
965 965 return (port);
966 966 }
967 967
968 968 if (connp->conn_anon_priv_bind) {
969 969 port = tcp_get_next_priv_port(tcp);
970 970 } else {
971 971 if (count == 0 && user_specified) {
972 972 /*
973 973 * We may have to return an anonymous port. So
974 974 * get one to start with.
975 975 */
976 976 port =
977 977 tcp_update_next_port(
978 978 tcps->tcps_next_port_to_try,
979 979 tcp, B_TRUE);
980 980 user_specified = B_FALSE;
981 981 } else {
982 982 port = tcp_update_next_port(port + 1, tcp,
983 983 B_FALSE);
984 984 }
985 985 }
986 986 if (port == 0)
987 987 break;
988 988
989 989 /*
990 990 * Don't let this loop run forever in the case where
991 991 * all of the anonymous ports are in use.
992 992 */
993 993 } while (++count < loopmax);
994 994 return (0);
995 995 }
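Seen from userland, the attempt_reuse path above is what lets two sockets share one address and port when both set SO_REUSEPORT before binding (the option these tcp_rg_* routines support for TCP). A minimal sketch, with error handling trimmed:

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>

static int
reuseport_listener(in_port_t port)
{
        struct sockaddr_in sin;
        int fd, on = 1;

        fd = socket(AF_INET, SOCK_STREAM, 0);
        (void) setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &on, sizeof (on));

        memset(&sin, 0, sizeof (sin));
        sin.sin_family = AF_INET;
        sin.sin_port = htons(port);
        sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

        if (bind(fd, (struct sockaddr *)&sin, sizeof (sin)) != 0 ||
            listen(fd, 128) != 0) {
                perror("reuseport_listener");
                (void) close(fd);
                return (-1);
        }
        return (fd);
}

int
main(void)
{
        int a = reuseport_listener(8080);
        int b = reuseport_listener(8080);       /* joins a's group */

        printf("fds: %d %d\n", a, b);
        return (0);
}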
996 996
997 997 /* Max number of members in TCP SO_REUSEPORT group */
998 998 #define TCP_RG_SIZE_MAX 64
999 999 /* Step size when expanding members array */
1000 1000 #define TCP_RG_SIZE_STEP 2
1001 1001
1002 1002
1003 1003 tcp_rg_t *
1004 1004 tcp_rg_init(tcp_t *tcp)
1005 1005 {
1006 1006 tcp_rg_t *rg;
1007 1007 rg = kmem_alloc(sizeof (tcp_rg_t), KM_NOSLEEP|KM_NORMALPRI);
1008 1008 if (rg == NULL)
1009 1009 return (NULL);
1010 1010 rg->tcprg_members = kmem_zalloc(2 * sizeof (tcp_t *),
1011 1011 KM_NOSLEEP|KM_NORMALPRI);
1012 1012 if (rg->tcprg_members == NULL) {
1013 1013 kmem_free(rg, sizeof (tcp_rg_t));
1014 1014 return (NULL);
1015 1015 }
1016 1016
1017 1017 mutex_init(&rg->tcprg_lock, NULL, MUTEX_DEFAULT, NULL);
1018 1018 rg->tcprg_size = 2;
1019 1019 rg->tcprg_count = 1;
1020 1020 rg->tcprg_active = 1;
1021 1021 rg->tcprg_members[0] = tcp;
1022 1022 return (rg);
1023 1023 }
1024 1024
1025 1025 void
1026 1026 tcp_rg_destroy(tcp_rg_t *rg)
1027 1027 {
1028 1028 mutex_enter(&rg->tcprg_lock);
1029 1029 ASSERT(rg->tcprg_count == 0);
1030 1030 ASSERT(rg->tcprg_active == 0);
1031 1031 kmem_free(rg->tcprg_members, rg->tcprg_size * sizeof (tcp_t *));
1032 1032 mutex_destroy(&rg->tcprg_lock);
1033 1033 kmem_free(rg, sizeof (struct tcp_rg_s));
1034 1034 }
1035 1035
1036 1036 static int
1037 1037 tcp_rg_insert(tcp_rg_t *rg, tcp_t *tcp)
1038 1038 {
1039 1039 mutex_enter(&rg->tcprg_lock);
1040 1040
1041 1041 VERIFY(rg->tcprg_size > 0);
1042 1042 VERIFY(rg->tcprg_count <= rg->tcprg_size);
1043 1043 if (rg->tcprg_count != 0) {
1044 1044 cred_t *oldcred = rg->tcprg_members[0]->tcp_connp->conn_cred;
1045 1045 cred_t *newcred = tcp->tcp_connp->conn_cred;
1046 1046
1047 1047 if (crgetuid(oldcred) != crgetuid(newcred) ||
1048 1048 crgetzoneid(oldcred) != crgetzoneid(newcred)) {
1049 1049 mutex_exit(&rg->tcprg_lock);
1050 1050 return (EPERM);
1051 1051 }
1052 1052 }
1053 1053
1054 1054 if (rg->tcprg_count == rg->tcprg_size) {
1055 1055 unsigned int oldalloc = rg->tcprg_size * sizeof (tcp_t *);
1056 1056 unsigned int newsize = rg->tcprg_size + TCP_RG_SIZE_STEP;
1057 1057 tcp_t **newmembers;
1058 1058
1059 1059 if (newsize > TCP_RG_SIZE_MAX) {
1060 1060 mutex_exit(&rg->tcprg_lock);
1061 1061 return (EINVAL);
1062 1062 }
1063 1063 newmembers = kmem_zalloc(newsize * sizeof (tcp_t *),
1064 1064 KM_NOSLEEP|KM_NORMALPRI);
1065 1065 if (newmembers == NULL) {
1066 1066 mutex_exit(&rg->tcprg_lock);
1067 1067 return (ENOMEM);
1068 1068 }
1069 1069 bcopy(rg->tcprg_members, newmembers, oldalloc);
1070 1070 kmem_free(rg->tcprg_members, oldalloc);
1071 1071 rg->tcprg_members = newmembers;
1072 1072 rg->tcprg_size = newsize;
1073 1073 }
1074 1074
1075 1075 rg->tcprg_members[rg->tcprg_count] = tcp;
1076 1076 rg->tcprg_count++;
1077 1077 rg->tcprg_active++;
1078 1078
1079 1079 mutex_exit(&rg->tcprg_lock);
1080 1080 return (0);
1081 1081 }
1082 1082
1083 1083 boolean_t
1084 1084 tcp_rg_remove(tcp_rg_t *rg, tcp_t *tcp)
1085 1085 {
1086 1086 int i;
1087 1087 boolean_t is_empty;
1088 1088
1089 1089 mutex_enter(&rg->tcprg_lock);
1090 1090 for (i = 0; i < rg->tcprg_count; i++) {
1091 1091 if (rg->tcprg_members[i] == tcp)
1092 1092 break;
1093 1093 }
1094 1094 /* The item should be present */
1095 1095 ASSERT(i < rg->tcprg_count);
1096 1096 /* Move the last member into this position */
1097 1097 rg->tcprg_count--;
1098 1098 rg->tcprg_members[i] = rg->tcprg_members[rg->tcprg_count];
1099 1099 rg->tcprg_members[rg->tcprg_count] = NULL;
1100 1100 if (tcp->tcp_connp->conn_reuseport != 0)
1101 1101 rg->tcprg_active--;
1102 1102 is_empty = (rg->tcprg_count == 0);
1103 1103 mutex_exit(&rg->tcprg_lock);
1104 1104 return (is_empty);
1105 1105 }
1106 1106
1107 1107 void
1108 1108 tcp_rg_setactive(tcp_rg_t *rg, boolean_t is_active)
1109 1109 {
1110 1110 mutex_enter(&rg->tcprg_lock);
1111 1111 if (is_active) {
1112 1112 rg->tcprg_active++;
1113 1113 } else {
1114 1114 rg->tcprg_active--;
1115 1115 }
1116 1116 mutex_exit(&rg->tcprg_lock);
1117 1117 }