1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  25  * Copyright 2016 Joyent, Inc.
  26  */
  27 
  28 #include <sys/types.h>
  29 #include <sys/stream.h>
  30 #include <sys/strsun.h>
  31 #include <sys/strsubr.h>
  32 #include <sys/stropts.h>
  33 #include <sys/strlog.h>
  34 #define _SUN_TPI_VERSION 2
  35 #include <sys/tihdr.h>
  36 #include <sys/suntpi.h>
  37 #include <sys/xti_inet.h>
  38 #include <sys/policy.h>
  39 #include <sys/squeue_impl.h>
  40 #include <sys/squeue.h>
  41 #include <sys/tsol/tnet.h>
  42 
  43 #include <rpc/pmap_prot.h>
  44 
  45 #include <inet/common.h>
  46 #include <inet/ip.h>
  47 #include <inet/tcp.h>
  48 #include <inet/tcp_impl.h>
  49 #include <inet/proto_set.h>
  50 #include <inet/ipsec_impl.h>
  51 
  52 /* Setable in /etc/system */
  53 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
  54 static uint32_t tcp_random_anon_port = 1;
  55 
  56 static int      tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t,
  57                     cred_t *cr);
  58 static in_port_t        tcp_get_next_priv_port(const tcp_t *);
  59 static int tcp_rg_insert(tcp_rg_t *, struct tcp_s *);
  60 
  61 /*
  62  * Hash list insertion routine for tcp_t structures. Each hash bucket
  63  * contains a list of tcp_t entries, and each entry is bound to a unique
  64  * port. If there are multiple tcp_t's that are bound to the same port, then
  65  * one of them will be linked into the hash bucket list, and the rest will
  66  * hang off of that one entry. For each port, entries bound to a specific IP
 * address will be inserted before those bound to INADDR_ANY.
  68  */
void
tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock)
{
	tcp_t	**tcpp;
	tcp_t	*tcpnext;
	tcp_t	*tcphash;
	conn_t	*connp = tcp->tcp_connp;
	conn_t	*connext;

	/*
	 * If this tcp_t is already linked into some bind hash bucket,
	 * unlink it first.  tcp_bind_hash_remove() takes the bucket lock
	 * itself, so the caller must not be holding it in this case.
	 */
	if (tcp->tcp_ptpbhn != NULL) {
		ASSERT(!caller_holds_lock);
		tcp_bind_hash_remove(tcp);
	}
	tcpp = &tbf->tf_tcp;
	if (!caller_holds_lock) {
		mutex_enter(&tbf->tf_lock);
	} else {
		ASSERT(MUTEX_HELD(&tbf->tf_lock));
	}
	tcphash = tcpp[0];
	tcpnext = NULL;
	if (tcphash != NULL) {
		/* Look for an entry using the same port */
		while ((tcphash = tcpp[0]) != NULL &&
		    connp->conn_lport != tcphash->tcp_connp->conn_lport)
			tcpp = &(tcphash->tcp_bind_hash);

		/* The port was not found, just add to the end */
		if (tcphash == NULL)
			goto insert;

		/*
		 * OK, there already exists an entry bound to the
		 * same port.
		 *
		 * If the new tcp bound to the INADDR_ANY address
		 * and the first one in the list is not bound to
		 * INADDR_ANY we skip all entries until we find the
		 * first one bound to INADDR_ANY.
		 * This makes sure that applications binding to a
		 * specific address get preference over those binding to
		 * INADDR_ANY.
		 */
		tcpnext = tcphash;
		connext = tcpnext->tcp_connp;
		tcphash = NULL;
		if (V6_OR_V4_INADDR_ANY(connp->conn_bound_addr_v6) &&
		    !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) {
			/*
			 * Walk the per-port list (linked through
			 * tcp_bind_hash_port) past all the entries bound
			 * to a specific address; stop at the first
			 * INADDR_ANY entry, if any.
			 */
			while ((tcpnext = tcpp[0]) != NULL) {
				connext = tcpnext->tcp_connp;
				if (!V6_OR_V4_INADDR_ANY(
				    connext->conn_bound_addr_v6))
					tcpp = &(tcpnext->tcp_bind_hash_port);
				else
					break;
			}
			if (tcpnext != NULL) {
				/*
				 * Insert the new tcp in front of the first
				 * INADDR_ANY entry: the new tcp takes over
				 * that entry's position in the bucket chain
				 * (tcp_bind_hash), and the displaced entry
				 * becomes the head of the new tcp's per-port
				 * list.
				 */
				tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
				tcphash = tcpnext->tcp_bind_hash;
				if (tcphash != NULL) {
					tcphash->tcp_ptpbhn =
					    &(tcp->tcp_bind_hash);
					tcpnext->tcp_bind_hash = NULL;
				}
			}
		} else {
			/*
			 * Insert at the head of this port's list: the new
			 * tcp replaces tcpnext as this port's entry in the
			 * bucket chain, and tcpnext hangs off the new tcp's
			 * tcp_bind_hash_port list.
			 */
			tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
			tcphash = tcpnext->tcp_bind_hash;
			if (tcphash != NULL) {
				tcphash->tcp_ptpbhn =
				    &(tcp->tcp_bind_hash);
				tcpnext->tcp_bind_hash = NULL;
			}
		}
	}
insert:
	/*
	 * Link the new tcp into the bucket at the position computed above
	 * and record its back pointer for O(1) removal.
	 */
	tcp->tcp_bind_hash_port = tcpnext;
	tcp->tcp_bind_hash = tcphash;
	tcp->tcp_ptpbhn = tcpp;
	tcpp[0] = tcp;
	if (!caller_holds_lock)
		mutex_exit(&tbf->tf_lock);
}
 152 
 153 /*
 154  * Hash list removal routine for tcp_t structures.
 155  */
void
tcp_bind_hash_remove(tcp_t *tcp)
{
	tcp_t	*tcpnext;
	kmutex_t *lockp;
	tcp_stack_t	*tcps = tcp->tcp_tcps;
	conn_t		*connp = tcp->tcp_connp;

	/* Not on any bind hash list; nothing to do. */
	if (tcp->tcp_ptpbhn == NULL)
		return;

	/*
	 * Extract the lock pointer in case there are concurrent
	 * hash_remove's for this instance.
	 */
	ASSERT(connp->conn_lport != 0);
	lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH(
	    connp->conn_lport)].tf_lock;

	ASSERT(lockp != NULL);
	mutex_enter(lockp);

	/* destroy any association with SO_REUSEPORT group */
	if (tcp->tcp_rg_bind != NULL) {
		if (tcp_rg_remove(tcp->tcp_rg_bind, tcp)) {
			/* Last one out turns off the lights */
			tcp_rg_destroy(tcp->tcp_rg_bind);
		}
		tcp->tcp_rg_bind = NULL;
	}

	/* Re-check under the lock: a concurrent remove may have unlinked us. */
	if (tcp->tcp_ptpbhn) {
		tcpnext = tcp->tcp_bind_hash_port;
		if (tcpnext != NULL) {
			/*
			 * Another tcp_t is bound to the same port: promote
			 * it into our slot in the bucket chain and hand it
			 * our link to the next port's list.
			 */
			tcp->tcp_bind_hash_port = NULL;
			tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
			tcpnext->tcp_bind_hash = tcp->tcp_bind_hash;
			if (tcpnext->tcp_bind_hash != NULL) {
				tcpnext->tcp_bind_hash->tcp_ptpbhn =
				    &(tcpnext->tcp_bind_hash);
				tcp->tcp_bind_hash = NULL;
			}
		} else if ((tcpnext = tcp->tcp_bind_hash) != NULL) {
			/* Last entry for this port: splice to the next port. */
			tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
			tcp->tcp_bind_hash = NULL;
		}
		*tcp->tcp_ptpbhn = tcpnext;
		tcp->tcp_ptpbhn = NULL;
	}
	mutex_exit(lockp);
}
 207 
 208 /*
 209  * Don't let port fall into the privileged range.
 210  * Since the extra privileged ports can be arbitrary we also
 211  * ensure that we exclude those from consideration.
 212  * tcp_g_epriv_ports is not sorted thus we loop over it until
 213  * there are no changes.
 214  *
 215  * Note: No locks are held when inspecting tcp_g_*epriv_ports
 216  * but instead the code relies on:
 217  * - the fact that the address of the array and its size never changes
 218  * - the atomic assignment of the elements of the array
 219  *
 220  * Returns 0 if there are no more ports available.
 221  *
 222  * TS note: skip multilevel ports.
 223  */
in_port_t
tcp_update_next_port(in_port_t port, const tcp_t *tcp, boolean_t random)
{
	int i, bump;
	boolean_t restart = B_FALSE;
	tcp_stack_t *tcps = tcp->tcp_tcps;

	/* Optionally pick a random starting point instead of "port". */
	if (random && tcp_random_anon_port != 0) {
		(void) random_get_pseudo_bytes((uint8_t *)&port,
		    sizeof (in_port_t));
		/*
		 * Unless changed by a sys admin, the smallest anon port
		 * is 32768 and the largest anon port is 65535.  It is
		 * very likely (50%) for the random port to be smaller
		 * than the smallest anon port.  When that happens,
		 * add port % (anon port range) to the smallest anon
		 * port to get the random port.  It should fall into the
		 * valid anon port range.
		 */
		if ((port < tcps->tcps_smallest_anon_port) ||
		    (port > tcps->tcps_largest_anon_port)) {
			if (tcps->tcps_smallest_anon_port ==
			    tcps->tcps_largest_anon_port) {
				bump = 0;
			} else {
				bump = port % (tcps->tcps_largest_anon_port -
				    tcps->tcps_smallest_anon_port);
			}
			port = tcps->tcps_smallest_anon_port + bump;
		}
	}

retry:
	/* Clamp into the anon range; wrap around once, then give up. */
	if (port < tcps->tcps_smallest_anon_port)
		port = (in_port_t)tcps->tcps_smallest_anon_port;

	if (port > tcps->tcps_largest_anon_port) {
		if (restart)
			return (0);
		restart = B_TRUE;
		port = (in_port_t)tcps->tcps_smallest_anon_port;
	}

	if (port < tcps->tcps_smallest_nonpriv_port)
		port = (in_port_t)tcps->tcps_smallest_nonpriv_port;

	/*
	 * Skip any extra-privileged port.  The array is unsorted, so after
	 * bumping the port we must restart the whole scan (and re-validate
	 * the range) until a full pass makes no change.
	 */
	for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
		if (port == tcps->tcps_g_epriv_ports[i]) {
			port++;
			/*
			 * Make sure whether the port is in the
			 * valid range.
			 */
			goto retry;
		}
	}
	/* On labeled systems, also skip multilevel ports. */
	if (is_system_labeled() &&
	    (i = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), port,
	    IPPROTO_TCP, B_TRUE)) != 0) {
		port = i;
		goto retry;
	}
	return (port);
}
 288 
 289 /*
 290  * Return the next anonymous port in the privileged port range for
 291  * bind checking.  It starts at IPPORT_RESERVED - 1 and goes
 292  * downwards.  This is the same behavior as documented in the userland
 293  * library call rresvport(3N).
 294  *
 295  * TS note: skip multilevel ports.
 296  */
 297 static in_port_t
 298 tcp_get_next_priv_port(const tcp_t *tcp)
 299 {
 300         static in_port_t next_priv_port = IPPORT_RESERVED - 1;
 301         in_port_t nextport;
 302         boolean_t restart = B_FALSE;
 303         tcp_stack_t *tcps = tcp->tcp_tcps;
 304 retry:
 305         if (next_priv_port < tcps->tcps_min_anonpriv_port ||
 306             next_priv_port >= IPPORT_RESERVED) {
 307                 next_priv_port = IPPORT_RESERVED - 1;
 308                 if (restart)
 309                         return (0);
 310                 restart = B_TRUE;
 311         }
 312         if (is_system_labeled() &&
 313             (nextport = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred),
 314             next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) {
 315                 next_priv_port = nextport;
 316                 goto retry;
 317         }
 318         return (next_priv_port--);
 319 }
 320 
static int
tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr,
    boolean_t bind_to_req_port_only, cred_t *cr)
{
	in_port_t	mlp_port;
	mlp_type_t	addrtype, mlptype;
	boolean_t	user_specified;
	in_port_t	allocated_port;
	in_port_t	requested_port = *requested_port_ptr;
	conn_t		*connp = tcp->tcp_connp;
	zone_t		*zone;
	tcp_stack_t	*tcps = tcp->tcp_tcps;
	in6_addr_t	v6addr = connp->conn_laddr_v6;

	/*
	 * XXX It's up to the caller to specify bind_to_req_port_only or not.
	 */
	ASSERT(cr != NULL);

	/*
	 * Get a valid port (within the anonymous range and should not
	 * be a privileged one) to use if the user has not given a port.
	 * If multiple threads are here, they may all start
	 * with the same initial port. But, it should be fine as long as
	 * tcp_bindi will ensure that no two threads will be assigned
	 * the same port.
	 *
	 * NOTE: XXX If a privileged process asks for an anonymous port, we
	 * still check for ports only in the range > tcp_smallest_non_priv_port,
	 * unless TCP_ANONPRIVBIND option is set.
	 */
	mlptype = mlptSingle;
	mlp_port = requested_port;
	if (requested_port == 0) {
		/* Caller wants an ephemeral port; pick one. */
		requested_port = connp->conn_anon_priv_bind ?
		    tcp_get_next_priv_port(tcp) :
		    tcp_update_next_port(tcps->tcps_next_port_to_try,
		    tcp, B_TRUE);
		if (requested_port == 0) {
			return (-TNOADDR);
		}
		user_specified = B_FALSE;

		/*
		 * If the user went through one of the RPC interfaces to create
		 * this socket and RPC is MLP in this zone, then give him an
		 * anonymous MLP.
		 */
		if (connp->conn_anon_mlp && is_system_labeled()) {
			zone = crgetzone(cr);
			addrtype = tsol_mlp_addr_type(
			    connp->conn_allzones ? ALL_ZONES : zone->zone_id,
			    IPV6_VERSION, &v6addr,
			    tcps->tcps_netstack->netstack_ip);
			if (addrtype == mlptSingle) {
				return (-TNOADDR);
			}
			mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
			    PMAPPORT, addrtype);
			mlp_port = PMAPPORT;
		}
	} else {
		int i;
		boolean_t priv = B_FALSE;

		/*
		 * If the requested_port is in the well-known privileged range,
		 * verify that the stream was opened by a privileged user.
		 * Note: No locks are held when inspecting tcp_g_*epriv_ports
		 * but instead the code relies on:
		 * - the fact that the address of the array and its size never
		 *   changes
		 * - the atomic assignment of the elements of the array
		 */
		if (requested_port < tcps->tcps_smallest_nonpriv_port) {
			priv = B_TRUE;
		} else {
			for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
				if (requested_port ==
				    tcps->tcps_g_epriv_ports[i]) {
					priv = B_TRUE;
					break;
				}
			}
		}
		if (priv) {
			if (secpolicy_net_privaddr(cr, requested_port,
			    IPPROTO_TCP) != 0) {
				if (connp->conn_debug) {
					(void) strlog(TCP_MOD_ID, 0, 1,
					    SL_ERROR|SL_TRACE,
					    "tcp_bind: no priv for port %d",
					    requested_port);
				}
				return (-TACCES);
			}
		}
		user_specified = B_TRUE;

		/*
		 * NOTE(review): this reassignment is redundant; connp was
		 * already initialized from tcp->tcp_connp above.
		 */
		connp = tcp->tcp_connp;
		if (is_system_labeled()) {
			zone = crgetzone(cr);
			addrtype = tsol_mlp_addr_type(
			    connp->conn_allzones ? ALL_ZONES : zone->zone_id,
			    IPV6_VERSION, &v6addr,
			    tcps->tcps_netstack->netstack_ip);
			if (addrtype == mlptSingle) {
				return (-TNOADDR);
			}
			mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
			    requested_port, addrtype);
		}
	}

	/* Multilevel port: enforce MLP binding policy before allocating. */
	if (mlptype != mlptSingle) {
		if (secpolicy_net_bindmlp(cr) != 0) {
			if (connp->conn_debug) {
				(void) strlog(TCP_MOD_ID, 0, 1,
				    SL_ERROR|SL_TRACE,
				    "tcp_bind: no priv for multilevel port %d",
				    requested_port);
			}
			return (-TACCES);
		}

		/*
		 * If we're specifically binding a shared IP address and the
		 * port is MLP on shared addresses, then check to see if this
		 * zone actually owns the MLP.  Reject if not.
		 */
		if (mlptype == mlptShared && addrtype == mlptShared) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid_t mlpzone;

			mlpzone = tsol_mlp_findzone(IPPROTO_TCP,
			    htons(mlp_port));
			if (connp->conn_zoneid != mlpzone) {
				if (connp->conn_debug) {
					(void) strlog(TCP_MOD_ID, 0, 1,
					    SL_ERROR|SL_TRACE,
					    "tcp_bind: attempt to bind port "
					    "%d on shared addr in zone %d "
					    "(should be %d)",
					    mlp_port, connp->conn_zoneid,
					    mlpzone);
				}
				return (-TACCES);
			}
		}

		if (!user_specified) {
			/* Register the anonymous MLP; undone below on fail. */
			int err;
			err = tsol_mlp_anon(zone, mlptype, connp->conn_proto,
			    requested_port, B_TRUE);
			if (err != 0) {
				if (connp->conn_debug) {
					(void) strlog(TCP_MOD_ID, 0, 1,
					    SL_ERROR|SL_TRACE,
					    "tcp_bind: cannot establish anon "
					    "MLP for port %d",
					    requested_port);
				}
				return (err);
			}
			connp->conn_anon_port = B_TRUE;
		}
		connp->conn_mlp_type = mlptype;
	}

	/* Try to claim the port (and insert into the bind hash). */
	allocated_port = tcp_bindi(tcp, requested_port, &v6addr,
	    connp->conn_reuseaddr, B_FALSE, bind_to_req_port_only,
	    user_specified);

	if (allocated_port == 0) {
		/* Allocation failed: roll back any MLP state set above. */
		connp->conn_mlp_type = mlptSingle;
		if (connp->conn_anon_port) {
			connp->conn_anon_port = B_FALSE;
			(void) tsol_mlp_anon(zone, mlptype, connp->conn_proto,
			    requested_port, B_FALSE);
		}
		if (bind_to_req_port_only) {
			if (connp->conn_debug) {
				(void) strlog(TCP_MOD_ID, 0, 1,
				    SL_ERROR|SL_TRACE,
				    "tcp_bind: requested addr busy");
			}
			return (-TADDRBUSY);
		} else {
			/* If we are out of ports, fail the bind. */
			if (connp->conn_debug) {
				(void) strlog(TCP_MOD_ID, 0, 1,
				    SL_ERROR|SL_TRACE,
				    "tcp_bind: out of ports?");
			}
			return (-TNOADDR);
		}
	}

	/* Pass the allocated port back */
	*requested_port_ptr = allocated_port;
	return (0);
}
 526 
 527 /*
 528  * Check the address and check/pick a local port number.
 529  */
int
tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
    boolean_t bind_to_req_port_only)
{
	tcp_t	*tcp = connp->conn_tcp;
	sin_t	*sin;
	sin6_t	*sin6;
	in_port_t	requested_port;
	ipaddr_t	v4addr;
	in6_addr_t	v6addr;
	ip_laddr_t	laddr_type = IPVL_UNICAST_UP;	/* INADDR_ANY */
	zoneid_t	zoneid = IPCL_ZONEID(connp);
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint_t		scopeid = 0;
	int		error = 0;
	ip_xmit_attr_t	*ixa = connp->conn_ixa;

	ASSERT((uintptr_t)len <= (uintptr_t)INT_MAX);

	/* Rebinding a bound endpoint is a no-op; later states are errors. */
	if (tcp->tcp_state == TCPS_BOUND) {
		return (0);
	} else if (tcp->tcp_state > TCPS_BOUND) {
		if (connp->conn_debug) {
			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
			    "tcp_bind: bad state, %d", tcp->tcp_state);
		}
		return (-TOUTSTATE);
	}

	ASSERT(sa != NULL && len != 0);

	/* The sockaddr must be naturally aligned before we cast it. */
	if (!OK_32PTR((char *)sa)) {
		if (connp->conn_debug) {
			(void) strlog(TCP_MOD_ID, 0, 1,
			    SL_ERROR|SL_TRACE,
			    "tcp_bind: bad address parameter, "
			    "address %p, len %d",
			    (void *)sa, len);
		}
		return (-TPROTO);
	}

	error = proto_verify_ip_addr(connp->conn_family, sa, len);
	if (error != 0) {
		return (error);
	}

	/* Dispatch on the sockaddr size to extract address and port. */
	switch (len) {
	case sizeof (sin_t):	/* Complete IPv4 address */
		sin = (sin_t *)sa;
		requested_port = ntohs(sin->sin_port);
		v4addr = sin->sin_addr.s_addr;
		IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr);
		if (v4addr != INADDR_ANY) {
			laddr_type = ip_laddr_verify_v4(v4addr, zoneid, ipst,
			    B_FALSE);
		}
		break;

	case sizeof (sin6_t): /* Complete IPv6 address */
		sin6 = (sin6_t *)sa;
		v6addr = sin6->sin6_addr;
		requested_port = ntohs(sin6->sin6_port);
		if (IN6_IS_ADDR_V4MAPPED(&v6addr)) {
			/* V4-mapped binds are rejected on v6-only sockets. */
			if (connp->conn_ipv6_v6only)
				return (EADDRNOTAVAIL);

			IN6_V4MAPPED_TO_IPADDR(&v6addr, v4addr);
			if (v4addr != INADDR_ANY) {
				laddr_type = ip_laddr_verify_v4(v4addr,
				    zoneid, ipst, B_FALSE);
			}
		} else {
			if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr)) {
				/* Link-local binds carry a scope id. */
				if (IN6_IS_ADDR_LINKSCOPE(&v6addr))
					scopeid = sin6->sin6_scope_id;
				laddr_type = ip_laddr_verify_v6(&v6addr,
				    zoneid, ipst, B_FALSE, scopeid);
			}
		}
		break;

	default:
		if (connp->conn_debug) {
			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
			    "tcp_bind: bad address length, %d", len);
		}
		return (EAFNOSUPPORT);
		/* return (-TBADADDR); */
	}

	/* Is the local address a valid unicast address? */
	if (laddr_type == IPVL_BAD)
		return (EADDRNOTAVAIL);

	connp->conn_bound_addr_v6 = v6addr;
	if (scopeid != 0) {
		/* Pin transmits and inbound matching to the scoped ifindex. */
		ixa->ixa_flags |= IXAF_SCOPEID_SET;
		ixa->ixa_scopeid = scopeid;
		connp->conn_incoming_ifindex = scopeid;
	} else {
		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
		connp->conn_incoming_ifindex = connp->conn_bound_if;
	}

	connp->conn_laddr_v6 = v6addr;
	connp->conn_saddr_v6 = v6addr;

	/* A wildcard port can never be "required". */
	bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only;

	error = tcp_bind_select_lport(tcp, &requested_port,
	    bind_to_req_port_only, cr);
	if (error != 0) {
		/* Port selection failed: undo the address bindings above. */
		connp->conn_laddr_v6 = ipv6_all_zeros;
		connp->conn_saddr_v6 = ipv6_all_zeros;
		connp->conn_bound_addr_v6 = ipv6_all_zeros;
	}
	return (error);
}
 649 
 650 /*
 651  * If the "bind_to_req_port_only" parameter is set and the requested port
 652  * number is available, return it (else return 0).
 653  *
 654  * If "bind_to_req_port_only" parameter is not set and the requested port
 655  * number is available, return it.  If not, return the first anonymous port we
 656  * happen across.  If no anonymous ports are available, return 0.
 657  *
 658  * In either case, when succeeding update the tcp_t to record the port number
 659  * and insert it in the bind hash table.
 660  *
 661  * Note that TCP over IPv4 and IPv6 sockets can use the same port number
 662  * without setting SO_REUSEADDR. This is needed so that they
 663  * can be viewed as two independent transport protocols.
 664  */
 665 in_port_t
 666 tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
 667     int reuseaddr, boolean_t quick_connect,
 668     boolean_t bind_to_req_port_only, boolean_t user_specified)
 669 {
 670         /* number of times we have run around the loop */
 671         int count = 0;
 672         /* maximum number of times to run around the loop */
 673         int loopmax;
 674         conn_t *connp = tcp->tcp_connp;
 675         tcp_stack_t     *tcps = tcp->tcp_tcps;
 676         boolean_t reuseport = connp->conn_reuseport;
 677 
 678         /*
 679          * Lookup for free addresses is done in a loop and "loopmax"
 680          * influences how long we spin in the loop
 681          */
 682         if (bind_to_req_port_only) {
 683                 /*
 684                  * If the requested port is busy, don't bother to look
 685                  * for a new one. Setting loop maximum count to 1 has
 686                  * that effect.
 687                  */
 688                 loopmax = 1;
 689         } else {
 690                 /*
 691                  * If the requested port is busy, look for a free one
 692                  * in the anonymous port range.
 693                  * Set loopmax appropriately so that one does not look
 694                  * forever in the case all of the anonymous ports are in use.
 695                  */
 696                 if (connp->conn_anon_priv_bind) {
 697                         /*
 698                          * loopmax =
 699                          *      (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1
 700                          */
 701                         loopmax = IPPORT_RESERVED -
 702                             tcps->tcps_min_anonpriv_port;
 703                 } else {
 704                         loopmax = (tcps->tcps_largest_anon_port -
 705                             tcps->tcps_smallest_anon_port + 1);
 706                 }
 707         }
 708         do {
 709                 uint16_t        lport;
 710                 tf_t            *tbf;
 711                 tcp_t           *ltcp;
 712                 conn_t          *lconnp;
 713                 boolean_t       attempt_reuse = B_FALSE;
 714 
 715                 lport = htons(port);
 716 
 717                 /*
 718                  * Ensure that the tcp_t is not currently in the bind hash.
 719                  * Hold the lock on the hash bucket to ensure that
 720                  * the duplicate check plus the insertion is an atomic
 721                  * operation.
 722                  *
 723                  * This function does an inline lookup on the bind hash list
 724                  * Make sure that we access only members of tcp_t
 725                  * and that we don't look at tcp_tcp, since we are not
 726                  * doing a CONN_INC_REF.
 727                  */
 728                 tcp_bind_hash_remove(tcp);
 729                 tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)];
 730                 mutex_enter(&tbf->tf_lock);
 731                 for (ltcp = tbf->tf_tcp; ltcp != NULL;
 732                     ltcp = ltcp->tcp_bind_hash) {
 733                         if (lport == ltcp->tcp_connp->conn_lport)
 734                                 break;
 735                 }
 736 
 737                 for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
 738                         boolean_t not_socket;
 739                         boolean_t exclbind;
 740                         boolean_t addrmatch;
 741 
 742                         lconnp = ltcp->tcp_connp;
 743 
 744                         /*
 745                          * On a labeled system, we must treat bindings to ports
 746                          * on shared IP addresses by sockets with MAC exemption
 747                          * privilege as being in all zones, as there's
 748                          * otherwise no way to identify the right receiver.
 749                          */
 750                         if (!IPCL_BIND_ZONE_MATCH(lconnp, connp))
 751                                 continue;
 752 
 753                         /*
 754                          * If TCP_EXCLBIND is set for either the bound or
 755                          * binding endpoint, the semantics of bind
 756                          * is changed according to the following.
 757                          *
 758                          * spec = specified address (v4 or v6)
 759                          * unspec = unspecified address (v4 or v6)
 760                          * A = specified addresses are different for endpoints
 761                          *
 762                          * bound        bind to         allowed
 763                          * -------------------------------------
 764                          * unspec       unspec          no
 765                          * unspec       spec            no
 766                          * spec         unspec          no
 767                          * spec         spec            yes if A
 768                          *
 769                          * For labeled systems, SO_MAC_EXEMPT behaves the same
 770                          * as TCP_EXCLBIND, except that zoneid is ignored.
 771                          *
 772                          * Note:
 773                          *
 774                          * 1. Because of TLI semantics, an endpoint can go
 775                          * back from, say TCP_ESTABLISHED to TCPS_LISTEN or
 776                          * TCPS_BOUND, depending on whether it is originally
 777                          * a listener or not.  That is why we need to check
 778                          * for states greater than or equal to TCPS_BOUND
 779                          * here.
 780                          *
 781                          * 2. Ideally, we should only check for state equals
 782                          * to TCPS_LISTEN. And the following check should be
 783                          * added.
 784                          *
 785                          * if (ltcp->tcp_state == TCPS_LISTEN ||
 786                          *      !reuseaddr || !lconnp->conn_reuseaddr) {
 787                          *              ...
 788                          * }
 789                          *
 790                          * The semantics will be changed to this.  If the
 791                          * endpoint on the list is in state not equal to
 792                          * TCPS_LISTEN and both endpoints have SO_REUSEADDR
 793                          * set, let the bind succeed.
 794                          *
 795                          * Because of (1), we cannot do that for TLI
 796                          * endpoints.  But we can do that for socket endpoints.
 797                          * If in future, we can change this going back
 798                          * semantics, we can use the above check for TLI also.
 799                          */
 800                         not_socket = !(TCP_IS_SOCKET(ltcp) &&
 801                             TCP_IS_SOCKET(tcp));
 802                         exclbind = lconnp->conn_exclbind ||
 803                             connp->conn_exclbind;
 804 
 805                         if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) ||
 806                             (connp->conn_mac_mode != CONN_MAC_DEFAULT) ||
 807                             (exclbind && (not_socket ||
 808                             ltcp->tcp_state <= TCPS_ESTABLISHED))) {
 809                                 if (V6_OR_V4_INADDR_ANY(
 810                                     lconnp->conn_bound_addr_v6) ||
 811                                     V6_OR_V4_INADDR_ANY(*laddr) ||
 812                                     IN6_ARE_ADDR_EQUAL(laddr,
 813                                     &lconnp->conn_bound_addr_v6)) {
 814                                         break;
 815                                 }
 816                                 continue;
 817                         }
 818 
 819                         /*
 820                          * Check ipversion to allow IPv4 and IPv6 sockets to
 821                          * have disjoint port number spaces, if *_EXCLBIND
 822                          * is not set and only if the application binds to a
 823                          * specific port. We use the same autoassigned port
 824                          * number space for IPv4 and IPv6 sockets.
 825                          */
 826                         if (connp->conn_ipversion != lconnp->conn_ipversion &&
 827                             bind_to_req_port_only)
 828                                 continue;
 829 
 830                         /*
 831                          * Ideally, we should make sure that the source
 832                          * address, remote address, and remote port in the
 833                          * four tuple for this tcp-connection is unique.
 834                          * However, trying to find out the local source
 835                          * address would require too much code duplication
 836                          * with IP, since IP needs needs to have that code
 837                          * to support userland TCP implementations.
 838                          */
 839                         if (quick_connect &&
 840                             (ltcp->tcp_state > TCPS_LISTEN) &&
 841                             ((connp->conn_fport != lconnp->conn_fport) ||
 842                             !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
 843                             &lconnp->conn_faddr_v6)))
 844                                 continue;
 845 
 846                         addrmatch = IN6_ARE_ADDR_EQUAL(laddr,
 847                             &lconnp->conn_bound_addr_v6);
 848 
 849                         if (addrmatch && reuseport && bind_to_req_port_only &&
 850                             (ltcp->tcp_state == TCPS_BOUND ||
 851                             ltcp->tcp_state == TCPS_LISTEN)) {
 852                                 /*
 853                                  * This entry is bound to the exact same
 854                                  * address and port.  If SO_REUSEPORT is set on
 855                                  * the calling socket, attempt to reuse this
 856                                  * binding if it too had SO_REUSEPORT enabled
 857                                  * when it was bound.
 858                                  */
 859                                 attempt_reuse = (ltcp->tcp_rg_bind != NULL);
 860                                 break;
 861                         }
 862 
 863                         if (!reuseaddr) {
 864                                 /*
 865                                  * No socket option SO_REUSEADDR.  If an
 866                                  * existing port is bound to a non-wildcard IP
 867                                  * address and the requesting stream is bound
 868                                  * to a distinct different IP address
 869                                  * (non-wildcard, also), keep going.
 870                                  */
 871                                 if (!V6_OR_V4_INADDR_ANY(*laddr) &&
 872                                     !V6_OR_V4_INADDR_ANY(
 873                                     lconnp->conn_bound_addr_v6) &&
 874                                     !addrmatch)
 875                                         continue;
 876                                 if (ltcp->tcp_state >= TCPS_BOUND) {
 877                                         /*
 878                                          * This port is being used and
 879                                          * its state is >= TCPS_BOUND,
 880                                          * so we can't bind to it.
 881                                          */
 882                                         break;
 883                                 }
 884                         } else {
 885                                 /*
 886                                  * socket option SO_REUSEADDR is set on the
 887                                  * binding tcp_t.
 888                                  *
 889                                  * If two streams are bound to the same IP
 890                                  * address or both addr and bound source are
 891                                  * wildcards (INADDR_ANY), we want to stop
 892                                  * searching.  We have found a match of IP
 893                                  * source address and source port, which is
 894                                  * refused regardless of the SO_REUSEADDR
 895                                  * setting, so we break.
 896                                  */
 897                                 if (addrmatch &&
 898                                     (ltcp->tcp_state == TCPS_LISTEN ||
 899                                     ltcp->tcp_state == TCPS_BOUND))
 900                                         break;
 901                         }
 902                 }
 903                 if (ltcp != NULL && !attempt_reuse) {
 904                         /* The port number is busy */
 905                         mutex_exit(&tbf->tf_lock);
 906                 } else {
 907                         if (attempt_reuse) {
 908                                 int err;
 909                                 struct tcp_rg_s *rg;
 910 
 911                                 ASSERT(ltcp != NULL);
 912                                 ASSERT(ltcp->tcp_rg_bind != NULL);
 913                                 ASSERT(tcp->tcp_rg_bind != NULL);
 914                                 ASSERT(ltcp->tcp_rg_bind != tcp->tcp_rg_bind);
 915 
 916                                 err = tcp_rg_insert(ltcp->tcp_rg_bind, tcp);
 917                                 if (err != 0) {
 918                                         mutex_exit(&tbf->tf_lock);
 919                                         return (0);
 920                                 }
 921                                 /*
 922                                  * Now that the newly-binding socket has joined
 923                                  * the existing reuseport group on ltcp, it
 924                                  * should clean up its own (empty) group.
 925                                  */
 926                                 rg = tcp->tcp_rg_bind;
 927                                 tcp->tcp_rg_bind = ltcp->tcp_rg_bind;
 928                                 VERIFY(tcp_rg_remove(rg, tcp));
 929                                 tcp_rg_destroy(rg);
 930                         }
 931 
 932                         /*
 933                          * This port is ours. Insert in fanout and mark as
 934                          * bound to prevent others from getting the port
 935                          * number.
 936                          */
 937                         tcp->tcp_state = TCPS_BOUND;
 938                         DTRACE_TCP6(state__change, void, NULL,
 939                             ip_xmit_attr_t *, connp->conn_ixa,
 940                             void, NULL, tcp_t *, tcp, void, NULL,
 941                             int32_t, TCPS_IDLE);
 942 
 943                         connp->conn_lport = htons(port);
 944 
 945                         ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
 946                             connp->conn_lport)] == tbf);
 947                         tcp_bind_hash_insert(tbf, tcp, 1);
 948 
 949                         mutex_exit(&tbf->tf_lock);
 950 
 951                         /*
 952                          * We don't want tcp_next_port_to_try to "inherit"
 953                          * a port number supplied by the user in a bind.
 954                          */
 955                         if (user_specified)
 956                                 return (port);
 957 
 958                         /*
 959                          * This is the only place where tcp_next_port_to_try
 960                          * is updated. After the update, it may or may not
 961                          * be in the valid range.
 962                          */
 963                         if (!connp->conn_anon_priv_bind)
 964                                 tcps->tcps_next_port_to_try = port + 1;
 965                         return (port);
 966                 }
 967 
 968                 if (connp->conn_anon_priv_bind) {
 969                         port = tcp_get_next_priv_port(tcp);
 970                 } else {
 971                         if (count == 0 && user_specified) {
 972                                 /*
 973                                  * We may have to return an anonymous port. So
 974                                  * get one to start with.
 975                                  */
 976                                 port =
 977                                     tcp_update_next_port(
 978                                     tcps->tcps_next_port_to_try,
 979                                     tcp, B_TRUE);
 980                                 user_specified = B_FALSE;
 981                         } else {
 982                                 port = tcp_update_next_port(port + 1, tcp,
 983                                     B_FALSE);
 984                         }
 985                 }
 986                 if (port == 0)
 987                         break;
 988 
 989                 /*
 990                  * Don't let this loop run forever in the case where
 991                  * all of the anonymous ports are in use.
 992                  */
 993         } while (++count < loopmax);
 994         return (0);
 995 }
 996 
 997 /* Max number of members in TCP SO_REUSEPORT group */
 998 #define TCP_RG_SIZE_MAX         64
 999 /* Step size when expanding members array */
1000 #define TCP_RG_SIZE_STEP        2
1001 
1002 
1003 tcp_rg_t *
1004 tcp_rg_init(tcp_t *tcp)
1005 {
1006         tcp_rg_t *rg;
1007         rg = kmem_alloc(sizeof (tcp_rg_t), KM_NOSLEEP|KM_NORMALPRI);
1008         if (rg == NULL)
1009                 return (NULL);
1010         rg->tcprg_members = kmem_zalloc(2 * sizeof (tcp_t *),
1011             KM_NOSLEEP|KM_NORMALPRI);
1012         if (rg->tcprg_members == NULL) {
1013                 kmem_free(rg, sizeof (tcp_rg_t));
1014                 return (NULL);
1015         }
1016 
1017         mutex_init(&rg->tcprg_lock, NULL, MUTEX_DEFAULT, NULL);
1018         rg->tcprg_size = 2;
1019         rg->tcprg_count = 1;
1020         rg->tcprg_active = 1;
1021         rg->tcprg_members[0] = tcp;
1022         return (rg);
1023 }
1024 
1025 void
1026 tcp_rg_destroy(tcp_rg_t *rg)
1027 {
1028         mutex_enter(&rg->tcprg_lock);
1029         ASSERT(rg->tcprg_count == 0);
1030         ASSERT(rg->tcprg_active == 0);
1031         kmem_free(rg->tcprg_members, rg->tcprg_size * sizeof (tcp_t *));
1032         mutex_destroy(&rg->tcprg_lock);
1033         kmem_free(rg, sizeof (struct tcp_rg_s));
1034 }
1035 
1036 static int
1037 tcp_rg_insert(tcp_rg_t *rg, tcp_t *tcp)
1038 {
1039         mutex_enter(&rg->tcprg_lock);
1040 
1041         VERIFY(rg->tcprg_size > 0);
1042         VERIFY(rg->tcprg_count <= rg->tcprg_size);
1043         if (rg->tcprg_count != 0) {
1044                 cred_t *oldcred = rg->tcprg_members[0]->tcp_connp->conn_cred;
1045                 cred_t *newcred = tcp->tcp_connp->conn_cred;
1046 
1047                 if (crgetuid(oldcred) != crgetuid(newcred) ||
1048                     crgetzoneid(oldcred) != crgetzoneid(newcred)) {
1049                         mutex_exit(&rg->tcprg_lock);
1050                         return (EPERM);
1051                 }
1052         }
1053 
1054         if (rg->tcprg_count == rg->tcprg_size) {
1055                 unsigned int oldalloc = rg->tcprg_size * sizeof (tcp_t *);
1056                 unsigned int newsize = rg->tcprg_size + TCP_RG_SIZE_STEP;
1057                 tcp_t **newmembers;
1058 
1059                 if (newsize > TCP_RG_SIZE_MAX) {
1060                         mutex_exit(&rg->tcprg_lock);
1061                         return (EINVAL);
1062                 }
1063                 newmembers = kmem_zalloc(newsize * sizeof (tcp_t *),
1064                     KM_NOSLEEP|KM_NORMALPRI);
1065                 if (newmembers == NULL) {
1066                         mutex_exit(&rg->tcprg_lock);
1067                         return (ENOMEM);
1068                 }
1069                 bcopy(rg->tcprg_members, newmembers, oldalloc);
1070                 kmem_free(rg->tcprg_members, oldalloc);
1071                 rg->tcprg_members = newmembers;
1072                 rg->tcprg_size = newsize;
1073         }
1074 
1075         rg->tcprg_members[rg->tcprg_count] = tcp;
1076         rg->tcprg_count++;
1077         rg->tcprg_active++;
1078 
1079         mutex_exit(&rg->tcprg_lock);
1080         return (0);
1081 }
1082 
1083 boolean_t
1084 tcp_rg_remove(tcp_rg_t *rg, tcp_t *tcp)
1085 {
1086         int i;
1087         boolean_t is_empty;
1088 
1089         mutex_enter(&rg->tcprg_lock);
1090         for (i = 0; i < rg->tcprg_count; i++) {
1091                 if (rg->tcprg_members[i] == tcp)
1092                         break;
1093         }
1094         /* The item should be present */
1095         ASSERT(i < rg->tcprg_count);
1096         /* Move the last member into this position */
1097         rg->tcprg_count--;
1098         rg->tcprg_members[i] = rg->tcprg_members[rg->tcprg_count];
1099         rg->tcprg_members[rg->tcprg_count] = NULL;
1100         if (tcp->tcp_connp->conn_reuseport != 0)
1101                 rg->tcprg_active--;
1102         is_empty = (rg->tcprg_count == 0);
1103         mutex_exit(&rg->tcprg_lock);
1104         return (is_empty);
1105 }
1106 
1107 void
1108 tcp_rg_setactive(tcp_rg_t *rg, boolean_t is_active)
1109 {
1110         mutex_enter(&rg->tcprg_lock);
1111         if (is_active) {
1112                 rg->tcprg_active++;
1113         } else {
1114                 rg->tcprg_active--;
1115         }
1116         mutex_exit(&rg->tcprg_lock);
1117 }