OS-5613 SO_REUSEPORT needs better state-change coverage
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Jerry Jelinek <jerry.jelinek@joyent.com>

OS-4018 lxbrand support TCP SO_REUSEPORT
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Cody Mello <cody.mello@joyent.com>
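
Context for the diff below: with SO_REUSEPORT support, multiple TCP sockets
may bind the exact same local address and port, provided every socket enables
the option before bind(2), all members share one user and zone (the EPERM
check in tcp_rg_insert), and the group stays within TCP_RG_SIZE_MAX members.
A minimal userland sketch, illustrative only and assuming SO_REUSEPORT is
defined in <sys/socket.h> on the target build:

/*
 * Two listeners bound to the same loopback address and port via
 * SO_REUSEPORT. Error handling is abbreviated for brevity.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int
reuseport_listener(in_port_t port)
{
	int fd, on = 1;
	struct sockaddr_in sin;

	if ((fd = socket(AF_INET, SOCK_STREAM, 0)) < 0)
		return (-1);
	/* Must be set before bind(2) so the bind creates/joins a group. */
	if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &on, sizeof (on)) < 0) {
		(void) close(fd);
		return (-1);
	}
	(void) memset(&sin, 0, sizeof (sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(port);
	sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	if (bind(fd, (struct sockaddr *)&sin, sizeof (sin)) < 0 ||
	    listen(fd, 128) < 0) {
		(void) close(fd);
		return (-1);
	}
	return (fd);
}

int
main(void)
{
	int a = reuseport_listener(8080);
	int b = reuseport_listener(8080);	/* same port also succeeds */

	if (a < 0 || b < 0) {
		perror("reuseport_listener");
		return (1);
	}
	(void) printf("two listeners bound to the same port\n");
	(void) close(a);
	(void) close(b);
	return (0);
}

(Distribution of incoming connections across group members happens in the
inbound-lookup path, outside this file.)
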
--- old/usr/src/uts/common/inet/tcp/tcp_bind.c
+++ new/usr/src/uts/common/inet/tcp/tcp_bind.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25 + * Copyright 2016 Joyent, Inc.
25 26 */
26 27
27 28 #include <sys/types.h>
28 29 #include <sys/stream.h>
29 30 #include <sys/strsun.h>
30 31 #include <sys/strsubr.h>
31 32 #include <sys/stropts.h>
32 33 #include <sys/strlog.h>
33 34 #define _SUN_TPI_VERSION 2
34 35 #include <sys/tihdr.h>
35 36 #include <sys/suntpi.h>
36 37 #include <sys/xti_inet.h>
37 38 #include <sys/policy.h>
38 39 #include <sys/squeue_impl.h>
39 40 #include <sys/squeue.h>
40 41 #include <sys/tsol/tnet.h>
41 42
42 43 #include <rpc/pmap_prot.h>
43 44
44 45 #include <inet/common.h>
45 46 #include <inet/ip.h>
46 47 #include <inet/tcp.h>
47 48 #include <inet/tcp_impl.h>
48 49 #include <inet/proto_set.h>
49 50 #include <inet/ipsec_impl.h>
50 51
51 52 /* Settable in /etc/system */
52 53 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
53 54 static uint32_t tcp_random_anon_port = 1;
54 55
55 56 static int tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t,
56 57 cred_t *cr);
57 58 static in_port_t tcp_get_next_priv_port(const tcp_t *);
59 +static int tcp_rg_insert(tcp_rg_t *, struct tcp_s *);
58 60
59 61 /*
60 62 * Hash list insertion routine for tcp_t structures. Each hash bucket
61 63 * contains a list of tcp_t entries, and each entry is bound to a unique
62 64 * port. If there are multiple tcp_t's that are bound to the same port, then
63 65 * one of them will be linked into the hash bucket list, and the rest will
64 66 * hang off of that one entry. For each port, entries bound to a specific IP
65 67 * address will be inserted before those bound to INADDR_ANY.
66 68 */
67 69 void
68 70 tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock)
69 71 {
70 72 tcp_t **tcpp;
71 73 tcp_t *tcpnext;
72 74 tcp_t *tcphash;
73 75 conn_t *connp = tcp->tcp_connp;
74 76 conn_t *connext;
75 77
76 78 if (tcp->tcp_ptpbhn != NULL) {
77 79 ASSERT(!caller_holds_lock);
78 80 tcp_bind_hash_remove(tcp);
79 81 }
80 82 tcpp = &tbf->tf_tcp;
81 83 if (!caller_holds_lock) {
82 84 mutex_enter(&tbf->tf_lock);
83 85 } else {
84 86 ASSERT(MUTEX_HELD(&tbf->tf_lock));
85 87 }
86 88 tcphash = tcpp[0];
87 89 tcpnext = NULL;
88 90 if (tcphash != NULL) {
89 91 /* Look for an entry using the same port */
90 92 while ((tcphash = tcpp[0]) != NULL &&
91 93 connp->conn_lport != tcphash->tcp_connp->conn_lport)
92 94 tcpp = &(tcphash->tcp_bind_hash);
93 95
94 96 /* The port was not found, just add to the end */
95 97 if (tcphash == NULL)
96 98 goto insert;
97 99
98 100 /*
99 101 * OK, there already exists an entry bound to the
100 102 * same port.
101 103 *
102 104 * If the new tcp bound to the INADDR_ANY address
103 105 * and the first one in the list is not bound to
104 106 * INADDR_ANY we skip all entries until we find the
105 107 * first one bound to INADDR_ANY.
106 108 * This makes sure that applications binding to a
107 109 * specific address get preference over those binding to
108 110 * INADDR_ANY.
109 111 */
110 112 tcpnext = tcphash;
111 113 connext = tcpnext->tcp_connp;
112 114 tcphash = NULL;
113 115 if (V6_OR_V4_INADDR_ANY(connp->conn_bound_addr_v6) &&
114 116 !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) {
115 117 while ((tcpnext = tcpp[0]) != NULL) {
116 118 connext = tcpnext->tcp_connp;
117 119 if (!V6_OR_V4_INADDR_ANY(
118 120 connext->conn_bound_addr_v6))
119 121 tcpp = &(tcpnext->tcp_bind_hash_port);
120 122 else
121 123 break;
122 124 }
123 125 if (tcpnext != NULL) {
124 126 tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
125 127 tcphash = tcpnext->tcp_bind_hash;
126 128 if (tcphash != NULL) {
127 129 tcphash->tcp_ptpbhn =
128 130 &(tcp->tcp_bind_hash);
129 131 tcpnext->tcp_bind_hash = NULL;
130 132 }
131 133 }
132 134 } else {
133 135 tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
134 136 tcphash = tcpnext->tcp_bind_hash;
135 137 if (tcphash != NULL) {
136 138 tcphash->tcp_ptpbhn =
137 139 &(tcp->tcp_bind_hash);
138 140 tcpnext->tcp_bind_hash = NULL;
139 141 }
140 142 }
141 143 }
142 144 insert:
143 145 tcp->tcp_bind_hash_port = tcpnext;
144 146 tcp->tcp_bind_hash = tcphash;
145 147 tcp->tcp_ptpbhn = tcpp;
146 148 tcpp[0] = tcp;
147 149 if (!caller_holds_lock)
148 150 mutex_exit(&tbf->tf_lock);
149 151 }
150 152
151 153 /*
152 154 * Hash list removal routine for tcp_t structures.
153 155 */
154 156 void
155 157 tcp_bind_hash_remove(tcp_t *tcp)
156 158 {
157 159 tcp_t *tcpnext;
158 160 kmutex_t *lockp;
159 161 tcp_stack_t *tcps = tcp->tcp_tcps;
160 162 conn_t *connp = tcp->tcp_connp;
161 163
162 164 if (tcp->tcp_ptpbhn == NULL)
163 165 return;
164 166
165 167 /*
166 168 * Extract the lock pointer in case there are concurrent
167 169 * hash_remove's for this instance.
168 170 */
169 171 ASSERT(connp->conn_lport != 0);
170 172 lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH(
171 173 connp->conn_lport)].tf_lock;
172 174
173 175 ASSERT(lockp != NULL);
174 176 mutex_enter(lockp);
177 +
178 + /* destroy any association with SO_REUSEPORT group */
179 + if (tcp->tcp_rg_bind != NULL) {
180 + if (tcp_rg_remove(tcp->tcp_rg_bind, tcp)) {
181 + /* Last one out turns off the lights */
182 + tcp_rg_destroy(tcp->tcp_rg_bind);
183 + }
184 + tcp->tcp_rg_bind = NULL;
185 + }
186 +
175 187 if (tcp->tcp_ptpbhn) {
176 188 tcpnext = tcp->tcp_bind_hash_port;
177 189 if (tcpnext != NULL) {
178 190 tcp->tcp_bind_hash_port = NULL;
179 191 tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
180 192 tcpnext->tcp_bind_hash = tcp->tcp_bind_hash;
181 193 if (tcpnext->tcp_bind_hash != NULL) {
182 194 tcpnext->tcp_bind_hash->tcp_ptpbhn =
183 195 &(tcpnext->tcp_bind_hash);
184 196 tcp->tcp_bind_hash = NULL;
185 197 }
186 198 } else if ((tcpnext = tcp->tcp_bind_hash) != NULL) {
187 199 tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
188 200 tcp->tcp_bind_hash = NULL;
189 201 }
190 202 *tcp->tcp_ptpbhn = tcpnext;
191 203 tcp->tcp_ptpbhn = NULL;
192 204 }
193 205 mutex_exit(lockp);
194 206 }
195 207
196 208 /*
197 209 * Don't let port fall into the privileged range.
198 210 * Since the extra privileged ports can be arbitrary we also
199 211 * ensure that we exclude those from consideration.
200 212 * tcp_g_epriv_ports is not sorted thus we loop over it until
201 213 * there are no changes.
202 214 *
203 215 * Note: No locks are held when inspecting tcp_g_*epriv_ports
204 216 * but instead the code relies on:
205 217 * - the fact that the address of the array and its size never changes
206 218 * - the atomic assignment of the elements of the array
207 219 *
208 220 * Returns 0 if there are no more ports available.
209 221 *
210 222 * TS note: skip multilevel ports.
211 223 */
212 224 in_port_t
213 225 tcp_update_next_port(in_port_t port, const tcp_t *tcp, boolean_t random)
214 226 {
215 227 int i, bump;
216 228 boolean_t restart = B_FALSE;
217 229 tcp_stack_t *tcps = tcp->tcp_tcps;
218 230
219 231 if (random && tcp_random_anon_port != 0) {
220 232 (void) random_get_pseudo_bytes((uint8_t *)&port,
221 233 sizeof (in_port_t));
222 234 /*
223 235 * Unless changed by a sys admin, the smallest anon port
224 236 * is 32768 and the largest anon port is 65535. It is
225 237 * very likely (50%) for the random port to be smaller
226 238 * than the smallest anon port. When that happens,
227 239 * add port % (anon port range) to the smallest anon
228 240 * port to get the random port. It should fall into the
229 241 * valid anon port range.
230 242 */
231 243 if ((port < tcps->tcps_smallest_anon_port) ||
232 244 (port > tcps->tcps_largest_anon_port)) {
233 245 if (tcps->tcps_smallest_anon_port ==
234 246 tcps->tcps_largest_anon_port) {
235 247 bump = 0;
236 248 } else {
237 249 bump = port % (tcps->tcps_largest_anon_port -
238 250 tcps->tcps_smallest_anon_port);
239 251 }
240 252 port = tcps->tcps_smallest_anon_port + bump;
241 253 }
242 254 }
243 255
244 256 retry:
245 257 if (port < tcps->tcps_smallest_anon_port)
246 258 port = (in_port_t)tcps->tcps_smallest_anon_port;
247 259
248 260 if (port > tcps->tcps_largest_anon_port) {
249 261 if (restart)
250 262 return (0);
251 263 restart = B_TRUE;
252 264 port = (in_port_t)tcps->tcps_smallest_anon_port;
253 265 }
254 266
255 267 if (port < tcps->tcps_smallest_nonpriv_port)
256 268 port = (in_port_t)tcps->tcps_smallest_nonpriv_port;
257 269
258 270 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
259 271 if (port == tcps->tcps_g_epriv_ports[i]) {
260 272 port++;
261 273 /*
262 274 * Check whether the port is in the
263 275 * valid range.
264 276 */
265 277 goto retry;
266 278 }
267 279 }
268 280 if (is_system_labeled() &&
269 281 (i = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), port,
270 282 IPPROTO_TCP, B_TRUE)) != 0) {
271 283 port = i;
272 284 goto retry;
273 285 }
274 286 return (port);
275 287 }
276 288
277 289 /*
278 290 * Return the next anonymous port in the privileged port range for
279 291 * bind checking. It starts at IPPORT_RESERVED - 1 and goes
280 292 * downwards. This is the same behavior as documented in the userland
281 293 * library call rresvport(3N).
282 294 *
283 295 * TS note: skip multilevel ports.
284 296 */
285 297 static in_port_t
286 298 tcp_get_next_priv_port(const tcp_t *tcp)
287 299 {
288 300 static in_port_t next_priv_port = IPPORT_RESERVED - 1;
289 301 in_port_t nextport;
290 302 boolean_t restart = B_FALSE;
291 303 tcp_stack_t *tcps = tcp->tcp_tcps;
292 304 retry:
293 305 if (next_priv_port < tcps->tcps_min_anonpriv_port ||
294 306 next_priv_port >= IPPORT_RESERVED) {
295 307 next_priv_port = IPPORT_RESERVED - 1;
296 308 if (restart)
297 309 return (0);
298 310 restart = B_TRUE;
299 311 }
300 312 if (is_system_labeled() &&
301 313 (nextport = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred),
302 314 next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) {
303 315 next_priv_port = nextport;
304 316 goto retry;
305 317 }
306 318 return (next_priv_port--);
307 319 }
308 320
309 321 static int
310 322 tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr,
311 323 boolean_t bind_to_req_port_only, cred_t *cr)
312 324 {
313 325 in_port_t mlp_port;
314 326 mlp_type_t addrtype, mlptype;
315 327 boolean_t user_specified;
316 328 in_port_t allocated_port;
317 329 in_port_t requested_port = *requested_port_ptr;
318 330 conn_t *connp = tcp->tcp_connp;
319 331 zone_t *zone;
320 332 tcp_stack_t *tcps = tcp->tcp_tcps;
321 333 in6_addr_t v6addr = connp->conn_laddr_v6;
322 334
323 335 /*
324 336 * XXX It's up to the caller to specify bind_to_req_port_only or not.
325 337 */
326 338 ASSERT(cr != NULL);
327 339
328 340 /*
329 341 * Get a valid port (within the anonymous range and should not
330 342 * be a privileged one) to use if the user has not given a port.
331 343 * If multiple threads are here, they may all start
332 344 * with the same initial port. But it should be fine as long as
333 345 * tcp_bindi will ensure that no two threads will be assigned
334 346 * the same port.
335 347 *
336 348 * NOTE: XXX If a privileged process asks for an anonymous port, we
337 349 * still check for ports only in the range > tcp_smallest_non_priv_port,
338 350 * unless TCP_ANONPRIVBIND option is set.
339 351 */
340 352 mlptype = mlptSingle;
341 353 mlp_port = requested_port;
342 354 if (requested_port == 0) {
343 355 requested_port = connp->conn_anon_priv_bind ?
344 356 tcp_get_next_priv_port(tcp) :
345 357 tcp_update_next_port(tcps->tcps_next_port_to_try,
346 358 tcp, B_TRUE);
347 359 if (requested_port == 0) {
348 360 return (-TNOADDR);
349 361 }
350 362 user_specified = B_FALSE;
351 363
352 364 /*
353 365 * If the user went through one of the RPC interfaces to create
354 366 * this socket and RPC is MLP in this zone, then give him an
355 367 * anonymous MLP.
356 368 */
357 369 if (connp->conn_anon_mlp && is_system_labeled()) {
358 370 zone = crgetzone(cr);
359 371 addrtype = tsol_mlp_addr_type(
360 372 connp->conn_allzones ? ALL_ZONES : zone->zone_id,
361 373 IPV6_VERSION, &v6addr,
362 374 tcps->tcps_netstack->netstack_ip);
363 375 if (addrtype == mlptSingle) {
364 376 return (-TNOADDR);
365 377 }
366 378 mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
367 379 PMAPPORT, addrtype);
368 380 mlp_port = PMAPPORT;
369 381 }
370 382 } else {
371 383 int i;
372 384 boolean_t priv = B_FALSE;
373 385
374 386 /*
375 387 * If the requested_port is in the well-known privileged range,
376 388 * verify that the stream was opened by a privileged user.
377 389 * Note: No locks are held when inspecting tcp_g_*epriv_ports
378 390 * but instead the code relies on:
379 391 * - the fact that the address of the array and its size never
380 392 * changes
381 393 * - the atomic assignment of the elements of the array
382 394 */
383 395 if (requested_port < tcps->tcps_smallest_nonpriv_port) {
384 396 priv = B_TRUE;
385 397 } else {
386 398 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
387 399 if (requested_port ==
388 400 tcps->tcps_g_epriv_ports[i]) {
389 401 priv = B_TRUE;
390 402 break;
391 403 }
392 404 }
393 405 }
394 406 if (priv) {
395 407 if (secpolicy_net_privaddr(cr, requested_port,
396 408 IPPROTO_TCP) != 0) {
397 409 if (connp->conn_debug) {
398 410 (void) strlog(TCP_MOD_ID, 0, 1,
399 411 SL_ERROR|SL_TRACE,
400 412 "tcp_bind: no priv for port %d",
401 413 requested_port);
402 414 }
403 415 return (-TACCES);
404 416 }
405 417 }
406 418 user_specified = B_TRUE;
407 419
408 420 connp = tcp->tcp_connp;
409 421 if (is_system_labeled()) {
410 422 zone = crgetzone(cr);
411 423 addrtype = tsol_mlp_addr_type(
412 424 connp->conn_allzones ? ALL_ZONES : zone->zone_id,
413 425 IPV6_VERSION, &v6addr,
414 426 tcps->tcps_netstack->netstack_ip);
415 427 if (addrtype == mlptSingle) {
416 428 return (-TNOADDR);
417 429 }
418 430 mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
419 431 requested_port, addrtype);
420 432 }
421 433 }
422 434
423 435 if (mlptype != mlptSingle) {
424 436 if (secpolicy_net_bindmlp(cr) != 0) {
425 437 if (connp->conn_debug) {
426 438 (void) strlog(TCP_MOD_ID, 0, 1,
427 439 SL_ERROR|SL_TRACE,
428 440 "tcp_bind: no priv for multilevel port %d",
429 441 requested_port);
430 442 }
431 443 return (-TACCES);
432 444 }
433 445
434 446 /*
435 447 * If we're specifically binding a shared IP address and the
436 448 * port is MLP on shared addresses, then check to see if this
437 449 * zone actually owns the MLP. Reject if not.
438 450 */
439 451 if (mlptype == mlptShared && addrtype == mlptShared) {
440 452 /*
441 453 * No need to handle exclusive-stack zones since
442 454 * ALL_ZONES only applies to the shared stack.
443 455 */
444 456 zoneid_t mlpzone;
445 457
446 458 mlpzone = tsol_mlp_findzone(IPPROTO_TCP,
447 459 htons(mlp_port));
448 460 if (connp->conn_zoneid != mlpzone) {
449 461 if (connp->conn_debug) {
450 462 (void) strlog(TCP_MOD_ID, 0, 1,
451 463 SL_ERROR|SL_TRACE,
452 464 "tcp_bind: attempt to bind port "
453 465 "%d on shared addr in zone %d "
454 466 "(should be %d)",
455 467 mlp_port, connp->conn_zoneid,
456 468 mlpzone);
457 469 }
458 470 return (-TACCES);
459 471 }
460 472 }
461 473
462 474 if (!user_specified) {
463 475 int err;
464 476 err = tsol_mlp_anon(zone, mlptype, connp->conn_proto,
465 477 requested_port, B_TRUE);
466 478 if (err != 0) {
467 479 if (connp->conn_debug) {
468 480 (void) strlog(TCP_MOD_ID, 0, 1,
469 481 SL_ERROR|SL_TRACE,
470 482 "tcp_bind: cannot establish anon "
471 483 "MLP for port %d",
472 484 requested_port);
473 485 }
474 486 return (err);
475 487 }
476 488 connp->conn_anon_port = B_TRUE;
477 489 }
478 490 connp->conn_mlp_type = mlptype;
479 491 }
480 492
481 493 allocated_port = tcp_bindi(tcp, requested_port, &v6addr,
482 494 connp->conn_reuseaddr, B_FALSE, bind_to_req_port_only,
483 495 user_specified);
484 496
485 497 if (allocated_port == 0) {
486 498 connp->conn_mlp_type = mlptSingle;
487 499 if (connp->conn_anon_port) {
488 500 connp->conn_anon_port = B_FALSE;
489 501 (void) tsol_mlp_anon(zone, mlptype, connp->conn_proto,
490 502 requested_port, B_FALSE);
491 503 }
492 504 if (bind_to_req_port_only) {
493 505 if (connp->conn_debug) {
494 506 (void) strlog(TCP_MOD_ID, 0, 1,
495 507 SL_ERROR|SL_TRACE,
496 508 "tcp_bind: requested addr busy");
497 509 }
498 510 return (-TADDRBUSY);
499 511 } else {
500 512 /* If we are out of ports, fail the bind. */
501 513 if (connp->conn_debug) {
502 514 (void) strlog(TCP_MOD_ID, 0, 1,
503 515 SL_ERROR|SL_TRACE,
504 516 "tcp_bind: out of ports?");
505 517 }
506 518 return (-TNOADDR);
507 519 }
508 520 }
509 521
510 522 /* Pass the allocated port back */
511 523 *requested_port_ptr = allocated_port;
512 524 return (0);
513 525 }
514 526
515 527 /*
516 528 * Check the address and check/pick a local port number.
517 529 */
518 530 int
519 531 tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
520 532 boolean_t bind_to_req_port_only)
521 533 {
522 534 tcp_t *tcp = connp->conn_tcp;
523 535 sin_t *sin;
524 536 sin6_t *sin6;
525 537 in_port_t requested_port;
526 538 ipaddr_t v4addr;
527 539 in6_addr_t v6addr;
528 540 ip_laddr_t laddr_type = IPVL_UNICAST_UP; /* INADDR_ANY */
529 541 zoneid_t zoneid = IPCL_ZONEID(connp);
530 542 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
531 543 uint_t scopeid = 0;
532 544 int error = 0;
533 545 ip_xmit_attr_t *ixa = connp->conn_ixa;
534 546
535 547 ASSERT((uintptr_t)len <= (uintptr_t)INT_MAX);
536 548
537 549 if (tcp->tcp_state == TCPS_BOUND) {
538 550 return (0);
539 551 } else if (tcp->tcp_state > TCPS_BOUND) {
540 552 if (connp->conn_debug) {
541 553 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
542 554 "tcp_bind: bad state, %d", tcp->tcp_state);
543 555 }
544 556 return (-TOUTSTATE);
545 557 }
546 558
547 559 ASSERT(sa != NULL && len != 0);
548 560
549 561 if (!OK_32PTR((char *)sa)) {
550 562 if (connp->conn_debug) {
551 563 (void) strlog(TCP_MOD_ID, 0, 1,
552 564 SL_ERROR|SL_TRACE,
553 565 "tcp_bind: bad address parameter, "
554 566 "address %p, len %d",
555 567 (void *)sa, len);
556 568 }
557 569 return (-TPROTO);
558 570 }
559 571
560 572 error = proto_verify_ip_addr(connp->conn_family, sa, len);
561 573 if (error != 0) {
562 574 return (error);
563 575 }
564 576
565 577 switch (len) {
566 578 case sizeof (sin_t): /* Complete IPv4 address */
567 579 sin = (sin_t *)sa;
568 580 requested_port = ntohs(sin->sin_port);
569 581 v4addr = sin->sin_addr.s_addr;
570 582 IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr);
571 583 if (v4addr != INADDR_ANY) {
572 584 laddr_type = ip_laddr_verify_v4(v4addr, zoneid, ipst,
573 585 B_FALSE);
574 586 }
575 587 break;
576 588
577 589 case sizeof (sin6_t): /* Complete IPv6 address */
578 590 sin6 = (sin6_t *)sa;
579 591 v6addr = sin6->sin6_addr;
580 592 requested_port = ntohs(sin6->sin6_port);
581 593 if (IN6_IS_ADDR_V4MAPPED(&v6addr)) {
582 594 if (connp->conn_ipv6_v6only)
583 595 return (EADDRNOTAVAIL);
584 596
585 597 IN6_V4MAPPED_TO_IPADDR(&v6addr, v4addr);
586 598 if (v4addr != INADDR_ANY) {
587 599 laddr_type = ip_laddr_verify_v4(v4addr,
588 600 zoneid, ipst, B_FALSE);
589 601 }
590 602 } else {
591 603 if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr)) {
592 604 if (IN6_IS_ADDR_LINKSCOPE(&v6addr))
593 605 scopeid = sin6->sin6_scope_id;
594 606 laddr_type = ip_laddr_verify_v6(&v6addr,
595 607 zoneid, ipst, B_FALSE, scopeid);
596 608 }
597 609 }
598 610 break;
599 611
600 612 default:
601 613 if (connp->conn_debug) {
602 614 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
603 615 "tcp_bind: bad address length, %d", len);
604 616 }
605 617 return (EAFNOSUPPORT);
606 618 /* return (-TBADADDR); */
607 619 }
608 620
609 621 /* Is the local address a valid unicast address? */
610 622 if (laddr_type == IPVL_BAD)
611 623 return (EADDRNOTAVAIL);
612 624
613 625 connp->conn_bound_addr_v6 = v6addr;
614 626 if (scopeid != 0) {
615 627 ixa->ixa_flags |= IXAF_SCOPEID_SET;
616 628 ixa->ixa_scopeid = scopeid;
617 629 connp->conn_incoming_ifindex = scopeid;
618 630 } else {
619 631 ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
620 632 connp->conn_incoming_ifindex = connp->conn_bound_if;
621 633 }
622 634
623 635 connp->conn_laddr_v6 = v6addr;
624 636 connp->conn_saddr_v6 = v6addr;
625 637
626 638 bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only;
627 639
628 640 error = tcp_bind_select_lport(tcp, &requested_port,
629 641 bind_to_req_port_only, cr);
630 642 if (error != 0) {
631 643 connp->conn_laddr_v6 = ipv6_all_zeros;
632 644 connp->conn_saddr_v6 = ipv6_all_zeros;
633 645 connp->conn_bound_addr_v6 = ipv6_all_zeros;
634 646 }
635 647 return (error);
636 648 }
637 649
638 650 /*
639 - * If the "bind_to_req_port_only" parameter is set, if the requested port
640 - * number is available, return it, If not return 0
651 + * If the "bind_to_req_port_only" parameter is set and the requested port
652 + * number is available, return it (else return 0).
641 653 *
642 - * If "bind_to_req_port_only" parameter is not set and
643 - * If the requested port number is available, return it. If not, return
644 - * the first anonymous port we happen across. If no anonymous ports are
645 - * available, return 0. addr is the requested local address, if any.
654 + * If "bind_to_req_port_only" parameter is not set and the requested port
655 + * number is available, return it. If not, return the first anonymous port we
656 + * happen across. If no anonymous ports are available, return 0.
646 657 *
647 658 * In either case, when succeeding update the tcp_t to record the port number
648 659 * and insert it in the bind hash table.
649 660 *
650 661 * Note that TCP over IPv4 and IPv6 sockets can use the same port number
651 662 * without setting SO_REUSEADDR. This is needed so that they
652 663 * can be viewed as two independent transport protocols.
653 664 */
654 665 in_port_t
655 666 tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
656 667 int reuseaddr, boolean_t quick_connect,
657 668 boolean_t bind_to_req_port_only, boolean_t user_specified)
658 669 {
659 670 /* number of times we have run around the loop */
660 671 int count = 0;
661 672 /* maximum number of times to run around the loop */
662 673 int loopmax;
663 674 conn_t *connp = tcp->tcp_connp;
664 675 tcp_stack_t *tcps = tcp->tcp_tcps;
676 + boolean_t reuseport = connp->conn_reuseport;
665 677
666 678 /*
667 679 * Lookup for free addresses is done in a loop and "loopmax"
668 680 * influences how long we spin in the loop
669 681 */
670 682 if (bind_to_req_port_only) {
671 683 /*
672 684 * If the requested port is busy, don't bother to look
673 685 * for a new one. Setting loop maximum count to 1 has
674 686 * that effect.
675 687 */
676 688 loopmax = 1;
677 689 } else {
678 690 /*
679 691 * If the requested port is busy, look for a free one
680 692 * in the anonymous port range.
681 693 * Set loopmax appropriately so that one does not look
682 694 * forever in the case all of the anonymous ports are in use.
683 695 */
684 696 if (connp->conn_anon_priv_bind) {
685 697 /*
686 698 * loopmax =
687 699 * (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1
688 700 */
689 701 loopmax = IPPORT_RESERVED -
690 702 tcps->tcps_min_anonpriv_port;
691 703 } else {
692 704 loopmax = (tcps->tcps_largest_anon_port -
693 705 tcps->tcps_smallest_anon_port + 1);
694 706 }
695 707 }
696 708 do {
697 709 uint16_t lport;
698 710 tf_t *tbf;
699 711 tcp_t *ltcp;
700 712 conn_t *lconnp;
713 + boolean_t attempt_reuse = B_FALSE;
701 714
702 715 lport = htons(port);
703 716
704 717 /*
705 718 * Ensure that the tcp_t is not currently in the bind hash.
706 719 * Hold the lock on the hash bucket to ensure that
707 720 * the duplicate check plus the insertion is an atomic
708 721 * operation.
709 722 *
710 723 * This function does an inline lookup on the bind hash list
711 724 * Make sure that we access only members of tcp_t
712 725 * and that we don't look at tcp_tcp, since we are not
713 726 * doing a CONN_INC_REF.
714 727 */
715 728 tcp_bind_hash_remove(tcp);
716 729 tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)];
717 730 mutex_enter(&tbf->tf_lock);
718 731 for (ltcp = tbf->tf_tcp; ltcp != NULL;
719 732 ltcp = ltcp->tcp_bind_hash) {
720 733 if (lport == ltcp->tcp_connp->conn_lport)
721 734 break;
722 735 }
723 736
724 737 for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
725 738 boolean_t not_socket;
726 739 boolean_t exclbind;
740 + boolean_t addrmatch;
727 741
728 742 lconnp = ltcp->tcp_connp;
729 743
730 744 /*
731 745 * On a labeled system, we must treat bindings to ports
732 746 * on shared IP addresses by sockets with MAC exemption
733 747 * privilege as being in all zones, as there's
734 748 * otherwise no way to identify the right receiver.
735 749 */
736 750 if (!IPCL_BIND_ZONE_MATCH(lconnp, connp))
737 751 continue;
738 752
739 753 /*
740 754 * If TCP_EXCLBIND is set for either the bound or
741 755 * binding endpoint, the semantics of bind
742 756 * is changed according to the following.
743 757 *
744 758 * spec = specified address (v4 or v6)
745 759 * unspec = unspecified address (v4 or v6)
746 760 * A = specified addresses are different for endpoints
747 761 *
748 762 * bound bind to allowed
749 763 * -------------------------------------
750 764 * unspec unspec no
751 765 * unspec spec no
752 766 * spec unspec no
753 767 * spec spec yes if A
754 768 *
755 769 * For labeled systems, SO_MAC_EXEMPT behaves the same
756 770 * as TCP_EXCLBIND, except that zoneid is ignored.
757 771 *
758 772 * Note:
759 773 *
760 774 * 1. Because of TLI semantics, an endpoint can go
761 775 * back from, say TCP_ESTABLISHED to TCPS_LISTEN or
762 776 * TCPS_BOUND, depending on whether it is originally
763 777 * a listener or not. That is why we need to check
764 778 * for states greater than or equal to TCPS_BOUND
765 779 * here.
766 780 *
767 781 * 2. Ideally, we should only check for state equals
768 782 * to TCPS_LISTEN. And the following check should be
769 783 * added.
770 784 *
771 785 * if (ltcp->tcp_state == TCPS_LISTEN ||
772 786 * !reuseaddr || !lconnp->conn_reuseaddr) {
773 787 * ...
774 788 * }
775 789 *
776 790 * The semantics will be changed to this. If the
777 791 * endpoint on the list is in state not equal to
778 792 * TCPS_LISTEN and both endpoints have SO_REUSEADDR
779 793 * set, let the bind succeed.
780 794 *
781 795 * Because of (1), we cannot do that for TLI
782 796 * endpoints. But we can do that for socket endpoints.
783 797 * If in future, we can change this going back
784 798 * semantics, we can use the above check for TLI also.
785 799 */
786 800 not_socket = !(TCP_IS_SOCKET(ltcp) &&
787 801 TCP_IS_SOCKET(tcp));
788 802 exclbind = lconnp->conn_exclbind ||
789 803 connp->conn_exclbind;
790 804
791 805 if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) ||
792 806 (connp->conn_mac_mode != CONN_MAC_DEFAULT) ||
793 807 (exclbind && (not_socket ||
794 808 ltcp->tcp_state <= TCPS_ESTABLISHED))) {
795 809 if (V6_OR_V4_INADDR_ANY(
796 810 lconnp->conn_bound_addr_v6) ||
797 811 V6_OR_V4_INADDR_ANY(*laddr) ||
798 812 IN6_ARE_ADDR_EQUAL(laddr,
799 813 &lconnp->conn_bound_addr_v6)) {
800 814 break;
801 815 }
802 816 continue;
803 817 }
804 818
805 819 /*
806 820 * Check ipversion to allow IPv4 and IPv6 sockets to
807 821 * have disjoint port number spaces, if *_EXCLBIND
808 822 * is not set and only if the application binds to a
809 823 * specific port. We use the same autoassigned port
810 824 * number space for IPv4 and IPv6 sockets.
811 825 */
812 826 if (connp->conn_ipversion != lconnp->conn_ipversion &&
813 827 bind_to_req_port_only)
814 828 continue;
815 829
816 830 /*
817 831 * Ideally, we should make sure that the source
818 832 * address, remote address, and remote port in the
819 833 * four tuple for this tcp-connection is unique.
820 834 * However, trying to find out the local source
821 835 * address would require too much code duplication
822 836 * with IP, since IP needs to have that code
823 837 * to support userland TCP implementations.
824 838 */
825 839 if (quick_connect &&
826 840 (ltcp->tcp_state > TCPS_LISTEN) &&
827 841 ((connp->conn_fport != lconnp->conn_fport) ||
828 842 !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
829 843 &lconnp->conn_faddr_v6)))
830 844 continue;
831 845
846 + addrmatch = IN6_ARE_ADDR_EQUAL(laddr,
847 + &lconnp->conn_bound_addr_v6);
848 +
849 + if (addrmatch && reuseport && bind_to_req_port_only &&
850 + (ltcp->tcp_state == TCPS_BOUND ||
851 + ltcp->tcp_state == TCPS_LISTEN)) {
852 + /*
853 + * This entry is bound to the exact same
854 + * address and port. If SO_REUSEPORT is set on
855 + * the calling socket, attempt to reuse this
856 + * binding if it too had SO_REUSEPORT enabled
857 + * when it was bound.
858 + */
859 + attempt_reuse = (ltcp->tcp_rg_bind != NULL);
860 + break;
861 + }
862 +
832 863 if (!reuseaddr) {
833 864 /*
834 - * No socket option SO_REUSEADDR.
835 - * If existing port is bound to
836 - * a non-wildcard IP address
837 - * and the requesting stream is
838 - * bound to a distinct
839 - * different IP addresses
840 - * (non-wildcard, also), keep
841 - * going.
865 + * No socket option SO_REUSEADDR. If an
866 + * existing port is bound to a non-wildcard IP
867 + * address and the requesting stream is bound
868 + * to a distinct different IP address
869 + * (non-wildcard, also), keep going.
842 870 */
843 871 if (!V6_OR_V4_INADDR_ANY(*laddr) &&
844 872 !V6_OR_V4_INADDR_ANY(
845 873 lconnp->conn_bound_addr_v6) &&
846 - !IN6_ARE_ADDR_EQUAL(laddr,
847 - &lconnp->conn_bound_addr_v6))
874 + !addrmatch)
848 875 continue;
849 876 if (ltcp->tcp_state >= TCPS_BOUND) {
850 877 /*
851 878 * This port is being used and
852 879 * its state is >= TCPS_BOUND,
853 880 * so we can't bind to it.
854 881 */
855 882 break;
856 883 }
857 884 } else {
858 885 /*
859 886 * socket option SO_REUSEADDR is set on the
860 887 * binding tcp_t.
861 888 *
862 - * If two streams are bound to
863 - * same IP address or both addr
864 - * and bound source are wildcards
865 - * (INADDR_ANY), we want to stop
866 - * searching.
867 - * We have found a match of IP source
868 - * address and source port, which is
869 - * refused regardless of the
870 - * SO_REUSEADDR setting, so we break.
889 + * If two streams are bound to the same IP
890 + * address or both addr and bound source are
891 + * wildcards (INADDR_ANY), we want to stop
892 + * searching. We have found a match of IP
893 + * source address and source port, which is
894 + * refused regardless of the SO_REUSEADDR
895 + * setting, so we break.
871 896 */
872 - if (IN6_ARE_ADDR_EQUAL(laddr,
873 - &lconnp->conn_bound_addr_v6) &&
897 + if (addrmatch &&
874 898 (ltcp->tcp_state == TCPS_LISTEN ||
875 899 ltcp->tcp_state == TCPS_BOUND))
876 900 break;
877 901 }
878 902 }
879 - if (ltcp != NULL) {
903 + if (ltcp != NULL && !attempt_reuse) {
880 904 /* The port number is busy */
881 905 mutex_exit(&tbf->tf_lock);
882 906 } else {
907 + if (attempt_reuse) {
908 + int err;
909 + struct tcp_rg_s *rg;
910 +
911 + ASSERT(ltcp != NULL);
912 + ASSERT(ltcp->tcp_rg_bind != NULL);
913 + ASSERT(tcp->tcp_rg_bind != NULL);
914 + ASSERT(ltcp->tcp_rg_bind != tcp->tcp_rg_bind);
915 +
916 + err = tcp_rg_insert(ltcp->tcp_rg_bind, tcp);
917 + if (err != 0) {
918 + mutex_exit(&tbf->tf_lock);
919 + return (0);
920 + }
921 + /*
922 + * Now that the newly-binding socket has joined
923 + * the existing reuseport group on ltcp, it
924 + * should clean up its own (empty) group.
925 + */
926 + rg = tcp->tcp_rg_bind;
927 + tcp->tcp_rg_bind = ltcp->tcp_rg_bind;
928 + VERIFY(tcp_rg_remove(rg, tcp));
929 + tcp_rg_destroy(rg);
930 + }
931 +
883 932 /*
884 933 * This port is ours. Insert in fanout and mark as
885 934 * bound to prevent others from getting the port
886 935 * number.
887 936 */
888 937 tcp->tcp_state = TCPS_BOUND;
889 938 DTRACE_TCP6(state__change, void, NULL,
890 939 ip_xmit_attr_t *, connp->conn_ixa,
891 940 void, NULL, tcp_t *, tcp, void, NULL,
892 941 int32_t, TCPS_IDLE);
893 942
894 943 connp->conn_lport = htons(port);
895 944
896 945 ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
897 946 connp->conn_lport)] == tbf);
898 947 tcp_bind_hash_insert(tbf, tcp, 1);
899 948
900 949 mutex_exit(&tbf->tf_lock);
901 950
902 951 /*
903 952 * We don't want tcp_next_port_to_try to "inherit"
904 953 * a port number supplied by the user in a bind.
905 954 */
906 955 if (user_specified)
907 956 return (port);
908 957
909 958 /*
910 959 * This is the only place where tcp_next_port_to_try
911 960 * is updated. After the update, it may or may not
912 961 * be in the valid range.
913 962 */
914 963 if (!connp->conn_anon_priv_bind)
915 964 tcps->tcps_next_port_to_try = port + 1;
916 965 return (port);
917 966 }
918 967
919 968 if (connp->conn_anon_priv_bind) {
920 969 port = tcp_get_next_priv_port(tcp);
921 970 } else {
922 971 if (count == 0 && user_specified) {
923 972 /*
924 973 * We may have to return an anonymous port. So
925 974 * get one to start with.
926 975 */
927 976 port =
928 977 tcp_update_next_port(
929 978 tcps->tcps_next_port_to_try,
930 979 tcp, B_TRUE);
931 980 user_specified = B_FALSE;
932 981 } else {
933 982 port = tcp_update_next_port(port + 1, tcp,
934 983 B_FALSE);
935 984 }
936 985 }
937 986 if (port == 0)
938 987 break;
939 988
940 989 /*
941 990 * Don't let this loop run forever in the case where
942 991 * all of the anonymous ports are in use.
943 992 */
944 993 } while (++count < loopmax);
945 994 return (0);
995 +}
996 +
997 +/* Max number of members in TCP SO_REUSEPORT group */
998 +#define TCP_RG_SIZE_MAX 64
999 +/* Step size when expanding members array */
1000 +#define TCP_RG_SIZE_STEP 2
1001 +
1002 +
1003 +tcp_rg_t *
1004 +tcp_rg_init(tcp_t *tcp)
1005 +{
1006 + tcp_rg_t *rg;
1007 + rg = kmem_alloc(sizeof (tcp_rg_t), KM_NOSLEEP|KM_NORMALPRI);
1008 + if (rg == NULL)
1009 + return (NULL);
1010 + rg->tcprg_members = kmem_zalloc(2 * sizeof (tcp_t *),
1011 + KM_NOSLEEP|KM_NORMALPRI);
1012 + if (rg->tcprg_members == NULL) {
1013 + kmem_free(rg, sizeof (tcp_rg_t));
1014 + return (NULL);
1015 + }
1016 +
1017 + mutex_init(&rg->tcprg_lock, NULL, MUTEX_DEFAULT, NULL);
1018 + rg->tcprg_size = 2;
1019 + rg->tcprg_count = 1;
1020 + rg->tcprg_active = 1;
1021 + rg->tcprg_members[0] = tcp;
1022 + return (rg);
1023 +}
1024 +
1025 +void
1026 +tcp_rg_destroy(tcp_rg_t *rg)
1027 +{
1028 + mutex_enter(&rg->tcprg_lock);
1029 + ASSERT(rg->tcprg_count == 0);
1030 + ASSERT(rg->tcprg_active == 0);
1031 + kmem_free(rg->tcprg_members, rg->tcprg_size * sizeof (tcp_t *));
1032 + mutex_destroy(&rg->tcprg_lock);
1033 + kmem_free(rg, sizeof (struct tcp_rg_s));
1034 +}
1035 +
1036 +static int
1037 +tcp_rg_insert(tcp_rg_t *rg, tcp_t *tcp)
1038 +{
1039 + mutex_enter(&rg->tcprg_lock);
1040 +
1041 + VERIFY(rg->tcprg_size > 0);
1042 + VERIFY(rg->tcprg_count <= rg->tcprg_size);
1043 + if (rg->tcprg_count != 0) {
1044 + cred_t *oldcred = rg->tcprg_members[0]->tcp_connp->conn_cred;
1045 + cred_t *newcred = tcp->tcp_connp->conn_cred;
1046 +
1047 + if (crgetuid(oldcred) != crgetuid(newcred) ||
1048 + crgetzoneid(oldcred) != crgetzoneid(newcred)) {
1049 + mutex_exit(&rg->tcprg_lock);
1050 + return (EPERM);
1051 + }
1052 + }
1053 +
1054 + if (rg->tcprg_count == rg->tcprg_size) {
1055 + unsigned int oldalloc = rg->tcprg_size * sizeof (tcp_t *);
1056 + unsigned int newsize = rg->tcprg_size + TCP_RG_SIZE_STEP;
1057 + tcp_t **newmembers;
1058 +
1059 + if (newsize > TCP_RG_SIZE_MAX) {
1060 + mutex_exit(&rg->tcprg_lock);
1061 + return (EINVAL);
1062 + }
1063 + newmembers = kmem_zalloc(newsize * sizeof (tcp_t *),
1064 + KM_NOSLEEP|KM_NORMALPRI);
1065 + if (newmembers == NULL) {
1066 + mutex_exit(&rg->tcprg_lock);
1067 + return (ENOMEM);
1068 + }
1069 + bcopy(rg->tcprg_members, newmembers, oldalloc);
1070 + kmem_free(rg->tcprg_members, oldalloc);
1071 + rg->tcprg_members = newmembers;
1072 + rg->tcprg_size = newsize;
1073 + }
1074 +
1075 + rg->tcprg_members[rg->tcprg_count] = tcp;
1076 + rg->tcprg_count++;
1077 + rg->tcprg_active++;
1078 +
1079 + mutex_exit(&rg->tcprg_lock);
1080 + return (0);
1081 +}
1082 +
1083 +boolean_t
1084 +tcp_rg_remove(tcp_rg_t *rg, tcp_t *tcp)
1085 +{
1086 + int i;
1087 + boolean_t is_empty;
1088 +
1089 + mutex_enter(&rg->tcprg_lock);
1090 + for (i = 0; i < rg->tcprg_count; i++) {
1091 + if (rg->tcprg_members[i] == tcp)
1092 + break;
1093 + }
1094 + /* The item should be present */
1095 + ASSERT(i < rg->tcprg_count);
1096 + /* Move the last member into this position */
1097 + rg->tcprg_count--;
1098 + rg->tcprg_members[i] = rg->tcprg_members[rg->tcprg_count];
1099 + rg->tcprg_members[rg->tcprg_count] = NULL;
1100 + if (tcp->tcp_connp->conn_reuseport != 0)
1101 + rg->tcprg_active--;
1102 + is_empty = (rg->tcprg_count == 0);
1103 + mutex_exit(&rg->tcprg_lock);
1104 + return (is_empty);
1105 +}
1106 +
1107 +void
1108 +tcp_rg_setactive(tcp_rg_t *rg, boolean_t is_active)
1109 +{
1110 + mutex_enter(&rg->tcprg_lock);
1111 + if (is_active) {
1112 + rg->tcprg_active++;
1113 + } else {
1114 + rg->tcprg_active--;
1115 + }
1116 + mutex_exit(&rg->tcprg_lock);
946 1117 }
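
A note on the tcp_rg_t bookkeeping above: tcprg_count tracks how many sockets
are members of the group, while tcprg_active tracks how many of them still
have SO_REUSEPORT enabled (tcp_rg_remove decrements it only when
conn_reuseport is set, and tcp_rg_setactive, presumably called from the
option-set path outside this file, adjusts it when the option is toggled on a
bound socket). tcp_rg_remove returns B_TRUE once the group is empty so the
last socket out can call tcp_rg_destroy. A simplified userland model of that
lifecycle, with hypothetical names and none of the kernel locking:

/*
 * Simplified model of the reuseport-group counters: count tracks
 * membership, active tracks members with SO_REUSEPORT still enabled,
 * and whichever caller removes the last member destroys the group.
 */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct rg {
	unsigned int count;	/* members in the group */
	unsigned int active;	/* members with SO_REUSEPORT enabled */
} rg_t;

static rg_t *
rg_init(void)
{
	rg_t *rg = calloc(1, sizeof (*rg));

	if (rg != NULL)
		rg->count = rg->active = 1;	/* creator joins as active */
	return (rg);
}

static int
rg_remove(rg_t *rg, int was_active)
{
	assert(rg->count > 0);
	rg->count--;
	if (was_active)
		rg->active--;
	return (rg->count == 0);	/* nonzero: caller must destroy */
}

int
main(void)
{
	rg_t *rg = rg_init();

	if (rg == NULL)
		return (1);

	/* a second socket binds the same address/port (tcp_rg_insert) */
	rg->count++;
	rg->active++;

	/* it later disables SO_REUSEPORT (tcp_rg_setactive(rg, B_FALSE)) */
	rg->active--;

	/* both sockets unbind in turn (tcp_bind_hash_remove) */
	(void) rg_remove(rg, 0);
	if (rg_remove(rg, 1)) {
		(void) printf("last one out turns off the lights\n");
		free(rg);	/* tcp_rg_destroy */
	}
	return (0);
}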