1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  25  */
  26 
  27 #include <sys/types.h>
  28 #include <sys/stream.h>
  29 #include <sys/strsun.h>
  30 #include <sys/strsubr.h>
  31 #include <sys/stropts.h>
  32 #include <sys/strlog.h>
  33 #define _SUN_TPI_VERSION 2
  34 #include <sys/tihdr.h>
  35 #include <sys/suntpi.h>
  36 #include <sys/xti_inet.h>
  37 #include <sys/policy.h>
  38 #include <sys/squeue_impl.h>
  39 #include <sys/squeue.h>
  40 #include <sys/tsol/tnet.h>
  41 
  42 #include <rpc/pmap_prot.h>
  43 
  44 #include <inet/common.h>
  45 #include <inet/ip.h>
  46 #include <inet/tcp.h>
  47 #include <inet/tcp_impl.h>
  48 #include <inet/proto_set.h>
  49 #include <inet/ipsec_impl.h>
  50 
  51 /* Setable in /etc/system */
  52 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
  53 static uint32_t tcp_random_anon_port = 1;
  54 
  55 static int      tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t,
  56                     cred_t *cr);
  57 static in_port_t        tcp_get_next_priv_port(const tcp_t *);
  58 
  59 /*
  60  * Hash list insertion routine for tcp_t structures. Each hash bucket
  61  * contains a list of tcp_t entries, and each entry is bound to a unique
  62  * port. If there are multiple tcp_t's that are bound to the same port, then
  63  * one of them will be linked into the hash bucket list, and the rest will
  64  * hang off of that one entry. For each port, entries bound to a specific IP
  65  * address will be inserted before those those bound to INADDR_ANY.
  66  */
  67 void
  68 tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock)
  69 {
  70         tcp_t   **tcpp;
  71         tcp_t   *tcpnext;
  72         tcp_t   *tcphash;
  73         conn_t  *connp = tcp->tcp_connp;
  74         conn_t  *connext;
  75 
  76         if (tcp->tcp_ptpbhn != NULL) {
  77                 ASSERT(!caller_holds_lock);
  78                 tcp_bind_hash_remove(tcp);
  79         }
  80         tcpp = &tbf->tf_tcp;
  81         if (!caller_holds_lock) {
  82                 mutex_enter(&tbf->tf_lock);
  83         } else {
  84                 ASSERT(MUTEX_HELD(&tbf->tf_lock));
  85         }
  86         tcphash = tcpp[0];
  87         tcpnext = NULL;
  88         if (tcphash != NULL) {
  89                 /* Look for an entry using the same port */
  90                 while ((tcphash = tcpp[0]) != NULL &&
  91                     connp->conn_lport != tcphash->tcp_connp->conn_lport)
  92                         tcpp = &(tcphash->tcp_bind_hash);
  93 
  94                 /* The port was not found, just add to the end */
  95                 if (tcphash == NULL)
  96                         goto insert;
  97 
  98                 /*
  99                  * OK, there already exists an entry bound to the
 100                  * same port.
 101                  *
 102                  * If the new tcp bound to the INADDR_ANY address
 103                  * and the first one in the list is not bound to
 104                  * INADDR_ANY we skip all entries until we find the
 105                  * first one bound to INADDR_ANY.
 106                  * This makes sure that applications binding to a
 107                  * specific address get preference over those binding to
 108                  * INADDR_ANY.
 109                  */
 110                 tcpnext = tcphash;
 111                 connext = tcpnext->tcp_connp;
 112                 tcphash = NULL;
 113                 if (V6_OR_V4_INADDR_ANY(connp->conn_bound_addr_v6) &&
 114                     !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) {
 115                         while ((tcpnext = tcpp[0]) != NULL) {
 116                                 connext = tcpnext->tcp_connp;
 117                                 if (!V6_OR_V4_INADDR_ANY(
 118                                     connext->conn_bound_addr_v6))
 119                                         tcpp = &(tcpnext->tcp_bind_hash_port);
 120                                 else
 121                                         break;
 122                         }
 123                         if (tcpnext != NULL) {
 124                                 tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
 125                                 tcphash = tcpnext->tcp_bind_hash;
 126                                 if (tcphash != NULL) {
 127                                         tcphash->tcp_ptpbhn =
 128                                             &(tcp->tcp_bind_hash);
 129                                         tcpnext->tcp_bind_hash = NULL;
 130                                 }
 131                         }
 132                 } else {
 133                         tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
 134                         tcphash = tcpnext->tcp_bind_hash;
 135                         if (tcphash != NULL) {
 136                                 tcphash->tcp_ptpbhn =
 137                                     &(tcp->tcp_bind_hash);
 138                                 tcpnext->tcp_bind_hash = NULL;
 139                         }
 140                 }
 141         }
 142 insert:
 143         tcp->tcp_bind_hash_port = tcpnext;
 144         tcp->tcp_bind_hash = tcphash;
 145         tcp->tcp_ptpbhn = tcpp;
 146         tcpp[0] = tcp;
 147         if (!caller_holds_lock)
 148                 mutex_exit(&tbf->tf_lock);
 149 }
 150 
 151 /*
 152  * Hash list removal routine for tcp_t structures.
 153  */
 154 void
 155 tcp_bind_hash_remove(tcp_t *tcp)
 156 {
 157         tcp_t   *tcpnext;
 158         kmutex_t *lockp;
 159         tcp_stack_t     *tcps = tcp->tcp_tcps;
 160         conn_t          *connp = tcp->tcp_connp;
 161 
 162         if (tcp->tcp_ptpbhn == NULL)
 163                 return;
 164 
 165         /*
 166          * Extract the lock pointer in case there are concurrent
 167          * hash_remove's for this instance.
 168          */
 169         ASSERT(connp->conn_lport != 0);
 170         lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH(
 171             connp->conn_lport)].tf_lock;
 172 
 173         ASSERT(lockp != NULL);
 174         mutex_enter(lockp);
 175         if (tcp->tcp_ptpbhn) {
 176                 tcpnext = tcp->tcp_bind_hash_port;
 177                 if (tcpnext != NULL) {
 178                         tcp->tcp_bind_hash_port = NULL;
 179                         tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
 180                         tcpnext->tcp_bind_hash = tcp->tcp_bind_hash;
 181                         if (tcpnext->tcp_bind_hash != NULL) {
 182                                 tcpnext->tcp_bind_hash->tcp_ptpbhn =
 183                                     &(tcpnext->tcp_bind_hash);
 184                                 tcp->tcp_bind_hash = NULL;
 185                         }
 186                 } else if ((tcpnext = tcp->tcp_bind_hash) != NULL) {
 187                         tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
 188                         tcp->tcp_bind_hash = NULL;
 189                 }
 190                 *tcp->tcp_ptpbhn = tcpnext;
 191                 tcp->tcp_ptpbhn = NULL;
 192         }
 193         mutex_exit(lockp);
 194 }
 195 
 196 /*
 197  * Don't let port fall into the privileged range.
 198  * Since the extra privileged ports can be arbitrary we also
 199  * ensure that we exclude those from consideration.
 200  * tcp_g_epriv_ports is not sorted thus we loop over it until
 201  * there are no changes.
 202  *
 203  * Note: No locks are held when inspecting tcp_g_*epriv_ports
 204  * but instead the code relies on:
 205  * - the fact that the address of the array and its size never changes
 206  * - the atomic assignment of the elements of the array
 207  *
 208  * Returns 0 if there are no more ports available.
 209  *
 210  * TS note: skip multilevel ports.
 211  */
 212 in_port_t
 213 tcp_update_next_port(in_port_t port, const tcp_t *tcp, boolean_t random)
 214 {
 215         int i, bump;
 216         boolean_t restart = B_FALSE;
 217         tcp_stack_t *tcps = tcp->tcp_tcps;
 218 
 219         if (random && tcp_random_anon_port != 0) {
 220                 (void) random_get_pseudo_bytes((uint8_t *)&port,
 221                     sizeof (in_port_t));
 222                 /*
 223                  * Unless changed by a sys admin, the smallest anon port
 224                  * is 32768 and the largest anon port is 65535.  It is
 225                  * very likely (50%) for the random port to be smaller
 226                  * than the smallest anon port.  When that happens,
 227                  * add port % (anon port range) to the smallest anon
 228                  * port to get the random port.  It should fall into the
 229                  * valid anon port range.
 230                  */
 231                 if ((port < tcps->tcps_smallest_anon_port) ||
 232                     (port > tcps->tcps_largest_anon_port)) {
 233                         if (tcps->tcps_smallest_anon_port ==
 234                             tcps->tcps_largest_anon_port) {
 235                                 bump = 0;
 236                         } else {
 237                                 bump = port % (tcps->tcps_largest_anon_port -
 238                                     tcps->tcps_smallest_anon_port);
 239                         }
 240                         port = tcps->tcps_smallest_anon_port + bump;
 241                 }
 242         }
 243 
 244 retry:
 245         if (port < tcps->tcps_smallest_anon_port)
 246                 port = (in_port_t)tcps->tcps_smallest_anon_port;
 247 
 248         if (port > tcps->tcps_largest_anon_port) {
 249                 if (restart)
 250                         return (0);
 251                 restart = B_TRUE;
 252                 port = (in_port_t)tcps->tcps_smallest_anon_port;
 253         }
 254 
 255         if (port < tcps->tcps_smallest_nonpriv_port)
 256                 port = (in_port_t)tcps->tcps_smallest_nonpriv_port;
 257 
 258         for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
 259                 if (port == tcps->tcps_g_epriv_ports[i]) {
 260                         port++;
 261                         /*
 262                          * Make sure whether the port is in the
 263                          * valid range.
 264                          */
 265                         goto retry;
 266                 }
 267         }
 268         if (is_system_labeled() &&
 269             (i = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), port,
 270             IPPROTO_TCP, B_TRUE)) != 0) {
 271                 port = i;
 272                 goto retry;
 273         }
 274         return (port);
 275 }
 276 
 277 /*
 278  * Return the next anonymous port in the privileged port range for
 279  * bind checking.  It starts at IPPORT_RESERVED - 1 and goes
 280  * downwards.  This is the same behavior as documented in the userland
 281  * library call rresvport(3N).
 282  *
 283  * TS note: skip multilevel ports.
 284  */
 285 static in_port_t
 286 tcp_get_next_priv_port(const tcp_t *tcp)
 287 {
 288         static in_port_t next_priv_port = IPPORT_RESERVED - 1;
 289         in_port_t nextport;
 290         boolean_t restart = B_FALSE;
 291         tcp_stack_t *tcps = tcp->tcp_tcps;
 292 retry:
 293         if (next_priv_port < tcps->tcps_min_anonpriv_port ||
 294             next_priv_port >= IPPORT_RESERVED) {
 295                 next_priv_port = IPPORT_RESERVED - 1;
 296                 if (restart)
 297                         return (0);
 298                 restart = B_TRUE;
 299         }
 300         if (is_system_labeled() &&
 301             (nextport = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred),
 302             next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) {
 303                 next_priv_port = nextport;
 304                 goto retry;
 305         }
 306         return (next_priv_port--);
 307 }
 308 
 309 static int
 310 tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr,
 311     boolean_t bind_to_req_port_only, cred_t *cr)
 312 {
 313         in_port_t       mlp_port;
 314         mlp_type_t      addrtype, mlptype;
 315         boolean_t       user_specified;
 316         in_port_t       allocated_port;
 317         in_port_t       requested_port = *requested_port_ptr;
 318         conn_t          *connp = tcp->tcp_connp;
 319         zone_t          *zone;
 320         tcp_stack_t     *tcps = tcp->tcp_tcps;
 321         in6_addr_t      v6addr = connp->conn_laddr_v6;
 322 
 323         /*
 324          * XXX It's up to the caller to specify bind_to_req_port_only or not.
 325          */
 326         ASSERT(cr != NULL);
 327 
 328         /*
 329          * Get a valid port (within the anonymous range and should not
 330          * be a privileged one) to use if the user has not given a port.
 331          * If multiple threads are here, they may all start with
 332          * with the same initial port. But, it should be fine as long as
 333          * tcp_bindi will ensure that no two threads will be assigned
 334          * the same port.
 335          *
 336          * NOTE: XXX If a privileged process asks for an anonymous port, we
 337          * still check for ports only in the range > tcp_smallest_non_priv_port,
 338          * unless TCP_ANONPRIVBIND option is set.
 339          */
 340         mlptype = mlptSingle;
 341         mlp_port = requested_port;
 342         if (requested_port == 0) {
 343                 requested_port = connp->conn_anon_priv_bind ?
 344                     tcp_get_next_priv_port(tcp) :
 345                     tcp_update_next_port(tcps->tcps_next_port_to_try,
 346                     tcp, B_TRUE);
 347                 if (requested_port == 0) {
 348                         return (-TNOADDR);
 349                 }
 350                 user_specified = B_FALSE;
 351 
 352                 /*
 353                  * If the user went through one of the RPC interfaces to create
 354                  * this socket and RPC is MLP in this zone, then give him an
 355                  * anonymous MLP.
 356                  */
 357                 if (connp->conn_anon_mlp && is_system_labeled()) {
 358                         zone = crgetzone(cr);
 359                         addrtype = tsol_mlp_addr_type(
 360                             connp->conn_allzones ? ALL_ZONES : zone->zone_id,
 361                             IPV6_VERSION, &v6addr,
 362                             tcps->tcps_netstack->netstack_ip);
 363                         if (addrtype == mlptSingle) {
 364                                 return (-TNOADDR);
 365                         }
 366                         mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
 367                             PMAPPORT, addrtype);
 368                         mlp_port = PMAPPORT;
 369                 }
 370         } else {
 371                 int i;
 372                 boolean_t priv = B_FALSE;
 373 
 374                 /*
 375                  * If the requested_port is in the well-known privileged range,
 376                  * verify that the stream was opened by a privileged user.
 377                  * Note: No locks are held when inspecting tcp_g_*epriv_ports
 378                  * but instead the code relies on:
 379                  * - the fact that the address of the array and its size never
 380                  *   changes
 381                  * - the atomic assignment of the elements of the array
 382                  */
 383                 if (requested_port < tcps->tcps_smallest_nonpriv_port) {
 384                         priv = B_TRUE;
 385                 } else {
 386                         for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
 387                                 if (requested_port ==
 388                                     tcps->tcps_g_epriv_ports[i]) {
 389                                         priv = B_TRUE;
 390                                         break;
 391                                 }
 392                         }
 393                 }
 394                 if (priv) {
 395                         if (secpolicy_net_privaddr(cr, requested_port,
 396                             IPPROTO_TCP) != 0) {
 397                                 if (connp->conn_debug) {
 398                                         (void) strlog(TCP_MOD_ID, 0, 1,
 399                                             SL_ERROR|SL_TRACE,
 400                                             "tcp_bind: no priv for port %d",
 401                                             requested_port);
 402                                 }
 403                                 return (-TACCES);
 404                         }
 405                 }
 406                 user_specified = B_TRUE;
 407 
 408                 connp = tcp->tcp_connp;
 409                 if (is_system_labeled()) {
 410                         zone = crgetzone(cr);
 411                         addrtype = tsol_mlp_addr_type(
 412                             connp->conn_allzones ? ALL_ZONES : zone->zone_id,
 413                             IPV6_VERSION, &v6addr,
 414                             tcps->tcps_netstack->netstack_ip);
 415                         if (addrtype == mlptSingle) {
 416                                 return (-TNOADDR);
 417                         }
 418                         mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
 419                             requested_port, addrtype);
 420                 }
 421         }
 422 
 423         if (mlptype != mlptSingle) {
 424                 if (secpolicy_net_bindmlp(cr) != 0) {
 425                         if (connp->conn_debug) {
 426                                 (void) strlog(TCP_MOD_ID, 0, 1,
 427                                     SL_ERROR|SL_TRACE,
 428                                     "tcp_bind: no priv for multilevel port %d",
 429                                     requested_port);
 430                         }
 431                         return (-TACCES);
 432                 }
 433 
 434                 /*
 435                  * If we're specifically binding a shared IP address and the
 436                  * port is MLP on shared addresses, then check to see if this
 437                  * zone actually owns the MLP.  Reject if not.
 438                  */
 439                 if (mlptype == mlptShared && addrtype == mlptShared) {
 440                         /*
 441                          * No need to handle exclusive-stack zones since
 442                          * ALL_ZONES only applies to the shared stack.
 443                          */
 444                         zoneid_t mlpzone;
 445 
 446                         mlpzone = tsol_mlp_findzone(IPPROTO_TCP,
 447                             htons(mlp_port));
 448                         if (connp->conn_zoneid != mlpzone) {
 449                                 if (connp->conn_debug) {
 450                                         (void) strlog(TCP_MOD_ID, 0, 1,
 451                                             SL_ERROR|SL_TRACE,
 452                                             "tcp_bind: attempt to bind port "
 453                                             "%d on shared addr in zone %d "
 454                                             "(should be %d)",
 455                                             mlp_port, connp->conn_zoneid,
 456                                             mlpzone);
 457                                 }
 458                                 return (-TACCES);
 459                         }
 460                 }
 461 
 462                 if (!user_specified) {
 463                         int err;
 464                         err = tsol_mlp_anon(zone, mlptype, connp->conn_proto,
 465                             requested_port, B_TRUE);
 466                         if (err != 0) {
 467                                 if (connp->conn_debug) {
 468                                         (void) strlog(TCP_MOD_ID, 0, 1,
 469                                             SL_ERROR|SL_TRACE,
 470                                             "tcp_bind: cannot establish anon "
 471                                             "MLP for port %d",
 472                                             requested_port);
 473                                 }
 474                                 return (err);
 475                         }
 476                         connp->conn_anon_port = B_TRUE;
 477                 }
 478                 connp->conn_mlp_type = mlptype;
 479         }
 480 
 481         allocated_port = tcp_bindi(tcp, requested_port, &v6addr,
 482             connp->conn_reuseaddr, B_FALSE, bind_to_req_port_only,
 483             user_specified);
 484 
 485         if (allocated_port == 0) {
 486                 connp->conn_mlp_type = mlptSingle;
 487                 if (connp->conn_anon_port) {
 488                         connp->conn_anon_port = B_FALSE;
 489                         (void) tsol_mlp_anon(zone, mlptype, connp->conn_proto,
 490                             requested_port, B_FALSE);
 491                 }
 492                 if (bind_to_req_port_only) {
 493                         if (connp->conn_debug) {
 494                                 (void) strlog(TCP_MOD_ID, 0, 1,
 495                                     SL_ERROR|SL_TRACE,
 496                                     "tcp_bind: requested addr busy");
 497                         }
 498                         return (-TADDRBUSY);
 499                 } else {
 500                         /* If we are out of ports, fail the bind. */
 501                         if (connp->conn_debug) {
 502                                 (void) strlog(TCP_MOD_ID, 0, 1,
 503                                     SL_ERROR|SL_TRACE,
 504                                     "tcp_bind: out of ports?");
 505                         }
 506                         return (-TNOADDR);
 507                 }
 508         }
 509 
 510         /* Pass the allocated port back */
 511         *requested_port_ptr = allocated_port;
 512         return (0);
 513 }
 514 
 515 /*
 516  * Check the address and check/pick a local port number.
 517  */
 518 int
 519 tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
 520     boolean_t bind_to_req_port_only)
 521 {
 522         tcp_t   *tcp = connp->conn_tcp;
 523         sin_t   *sin;
 524         sin6_t  *sin6;
 525         in_port_t       requested_port;
 526         ipaddr_t        v4addr;
 527         in6_addr_t      v6addr;
 528         ip_laddr_t      laddr_type = IPVL_UNICAST_UP;   /* INADDR_ANY */
 529         zoneid_t        zoneid = IPCL_ZONEID(connp);
 530         ip_stack_t      *ipst = connp->conn_netstack->netstack_ip;
 531         uint_t          scopeid = 0;
 532         int             error = 0;
 533         ip_xmit_attr_t  *ixa = connp->conn_ixa;
 534 
 535         ASSERT((uintptr_t)len <= (uintptr_t)INT_MAX);
 536 
 537         if (tcp->tcp_state == TCPS_BOUND) {
 538                 return (0);
 539         } else if (tcp->tcp_state > TCPS_BOUND) {
 540                 if (connp->conn_debug) {
 541                         (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
 542                             "tcp_bind: bad state, %d", tcp->tcp_state);
 543                 }
 544                 return (-TOUTSTATE);
 545         }
 546 
 547         ASSERT(sa != NULL && len != 0);
 548 
 549         if (!OK_32PTR((char *)sa)) {
 550                 if (connp->conn_debug) {
 551                         (void) strlog(TCP_MOD_ID, 0, 1,
 552                             SL_ERROR|SL_TRACE,
 553                             "tcp_bind: bad address parameter, "
 554                             "address %p, len %d",
 555                             (void *)sa, len);
 556                 }
 557                 return (-TPROTO);
 558         }
 559 
 560         error = proto_verify_ip_addr(connp->conn_family, sa, len);
 561         if (error != 0) {
 562                 return (error);
 563         }
 564 
 565         switch (len) {
 566         case sizeof (sin_t):    /* Complete IPv4 address */
 567                 sin = (sin_t *)sa;
 568                 requested_port = ntohs(sin->sin_port);
 569                 v4addr = sin->sin_addr.s_addr;
 570                 IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr);
 571                 if (v4addr != INADDR_ANY) {
 572                         laddr_type = ip_laddr_verify_v4(v4addr, zoneid, ipst,
 573                             B_FALSE);
 574                 }
 575                 break;
 576 
 577         case sizeof (sin6_t): /* Complete IPv6 address */
 578                 sin6 = (sin6_t *)sa;
 579                 v6addr = sin6->sin6_addr;
 580                 requested_port = ntohs(sin6->sin6_port);
 581                 if (IN6_IS_ADDR_V4MAPPED(&v6addr)) {
 582                         if (connp->conn_ipv6_v6only)
 583                                 return (EADDRNOTAVAIL);
 584 
 585                         IN6_V4MAPPED_TO_IPADDR(&v6addr, v4addr);
 586                         if (v4addr != INADDR_ANY) {
 587                                 laddr_type = ip_laddr_verify_v4(v4addr,
 588                                     zoneid, ipst, B_FALSE);
 589                         }
 590                 } else {
 591                         if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr)) {
 592                                 if (IN6_IS_ADDR_LINKSCOPE(&v6addr))
 593                                         scopeid = sin6->sin6_scope_id;
 594                                 laddr_type = ip_laddr_verify_v6(&v6addr,
 595                                     zoneid, ipst, B_FALSE, scopeid);
 596                         }
 597                 }
 598                 break;
 599 
 600         default:
 601                 if (connp->conn_debug) {
 602                         (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
 603                             "tcp_bind: bad address length, %d", len);
 604                 }
 605                 return (EAFNOSUPPORT);
 606                 /* return (-TBADADDR); */
 607         }
 608 
 609         /* Is the local address a valid unicast address? */
 610         if (laddr_type == IPVL_BAD)
 611                 return (EADDRNOTAVAIL);
 612 
 613         connp->conn_bound_addr_v6 = v6addr;
 614         if (scopeid != 0) {
 615                 ixa->ixa_flags |= IXAF_SCOPEID_SET;
 616                 ixa->ixa_scopeid = scopeid;
 617                 connp->conn_incoming_ifindex = scopeid;
 618         } else {
 619                 ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
 620                 connp->conn_incoming_ifindex = connp->conn_bound_if;
 621         }
 622 
 623         connp->conn_laddr_v6 = v6addr;
 624         connp->conn_saddr_v6 = v6addr;
 625 
 626         bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only;
 627 
 628         error = tcp_bind_select_lport(tcp, &requested_port,
 629             bind_to_req_port_only, cr);
 630         if (error != 0) {
 631                 connp->conn_laddr_v6 = ipv6_all_zeros;
 632                 connp->conn_saddr_v6 = ipv6_all_zeros;
 633                 connp->conn_bound_addr_v6 = ipv6_all_zeros;
 634         }
 635         return (error);
 636 }
 637 
 638 /*
 * If the "bind_to_req_port_only" parameter is set and the requested port
 * number is available, return it.  If it is not available, return 0.
 *
 * If the "bind_to_req_port_only" parameter is not set and the requested
 * port number is available, return it.  If not, return the first anonymous
 * port we happen across.  If no anonymous ports are available, return 0.
 * laddr is the requested local address, if any.
 646  *
 647  * In either case, when succeeding update the tcp_t to record the port number
 648  * and insert it in the bind hash table.
 649  *
 650  * Note that TCP over IPv4 and IPv6 sockets can use the same port number
 651  * without setting SO_REUSEADDR. This is needed so that they
 652  * can be viewed as two independent transport protocols.
 653  */
 654 in_port_t
 655 tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
 656     int reuseaddr, boolean_t quick_connect,
 657     boolean_t bind_to_req_port_only, boolean_t user_specified)
 658 {
 659         /* number of times we have run around the loop */
 660         int count = 0;
 661         /* maximum number of times to run around the loop */
 662         int loopmax;
 663         conn_t *connp = tcp->tcp_connp;
 664         tcp_stack_t     *tcps = tcp->tcp_tcps;
 665 
 666         /*
 667          * Lookup for free addresses is done in a loop and "loopmax"
 668          * influences how long we spin in the loop
 669          */
 670         if (bind_to_req_port_only) {
 671                 /*
 672                  * If the requested port is busy, don't bother to look
 673                  * for a new one. Setting loop maximum count to 1 has
 674                  * that effect.
 675                  */
 676                 loopmax = 1;
 677         } else {
 678                 /*
 679                  * If the requested port is busy, look for a free one
 680                  * in the anonymous port range.
 681                  * Set loopmax appropriately so that one does not look
 682                  * forever in the case all of the anonymous ports are in use.
 683                  */
 684                 if (connp->conn_anon_priv_bind) {
 685                         /*
 686                          * loopmax =
 687                          *      (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1
 688                          */
 689                         loopmax = IPPORT_RESERVED -
 690                             tcps->tcps_min_anonpriv_port;
 691                 } else {
 692                         loopmax = (tcps->tcps_largest_anon_port -
 693                             tcps->tcps_smallest_anon_port + 1);
 694                 }
 695         }
 696         do {
 697                 uint16_t        lport;
 698                 tf_t            *tbf;
 699                 tcp_t           *ltcp;
 700                 conn_t          *lconnp;
 701 
 702                 lport = htons(port);
 703 
 704                 /*
 705                  * Ensure that the tcp_t is not currently in the bind hash.
 706                  * Hold the lock on the hash bucket to ensure that
 707                  * the duplicate check plus the insertion is an atomic
 708                  * operation.
 709                  *
 710                  * This function does an inline lookup on the bind hash list
 711                  * Make sure that we access only members of tcp_t
 712                  * and that we don't look at tcp_tcp, since we are not
 713                  * doing a CONN_INC_REF.
 714                  */
 715                 tcp_bind_hash_remove(tcp);
 716                 tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)];
 717                 mutex_enter(&tbf->tf_lock);
 718                 for (ltcp = tbf->tf_tcp; ltcp != NULL;
 719                     ltcp = ltcp->tcp_bind_hash) {
 720                         if (lport == ltcp->tcp_connp->conn_lport)
 721                                 break;
 722                 }
 723 
 724                 for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
 725                         boolean_t not_socket;
 726                         boolean_t exclbind;
 727 
 728                         lconnp = ltcp->tcp_connp;
 729 
 730                         /*
 731                          * On a labeled system, we must treat bindings to ports
 732                          * on shared IP addresses by sockets with MAC exemption
 733                          * privilege as being in all zones, as there's
 734                          * otherwise no way to identify the right receiver.
 735                          */
 736                         if (!IPCL_BIND_ZONE_MATCH(lconnp, connp))
 737                                 continue;
 738 
 739                         /*
 740                          * If TCP_EXCLBIND is set for either the bound or
 741                          * binding endpoint, the semantics of bind
 742                          * is changed according to the following.
 743                          *
 744                          * spec = specified address (v4 or v6)
 745                          * unspec = unspecified address (v4 or v6)
 746                          * A = specified addresses are different for endpoints
 747                          *
 748                          * bound        bind to         allowed
 749                          * -------------------------------------
 750                          * unspec       unspec          no
 751                          * unspec       spec            no
 752                          * spec         unspec          no
 753                          * spec         spec            yes if A
 754                          *
 755                          * For labeled systems, SO_MAC_EXEMPT behaves the same
 756                          * as TCP_EXCLBIND, except that zoneid is ignored.
 757                          *
 758                          * Note:
 759                          *
 760                          * 1. Because of TLI semantics, an endpoint can go
  761                          * back from, say TCPS_ESTABLISHED to TCPS_LISTEN or
 762                          * TCPS_BOUND, depending on whether it is originally
 763                          * a listener or not.  That is why we need to check
 764                          * for states greater than or equal to TCPS_BOUND
 765                          * here.
 766                          *
 767                          * 2. Ideally, we should only check for state equals
 768                          * to TCPS_LISTEN. And the following check should be
 769                          * added.
 770                          *
 771                          * if (ltcp->tcp_state == TCPS_LISTEN ||
 772                          *      !reuseaddr || !lconnp->conn_reuseaddr) {
 773                          *              ...
 774                          * }
 775                          *
 776                          * The semantics will be changed to this.  If the
 777                          * endpoint on the list is in state not equal to
 778                          * TCPS_LISTEN and both endpoints have SO_REUSEADDR
 779                          * set, let the bind succeed.
 780                          *
 781                          * Because of (1), we cannot do that for TLI
 782                          * endpoints.  But we can do that for socket endpoints.
 783                          * If in future, we can change this going back
 784                          * semantics, we can use the above check for TLI also.
 785                          */
 786                         not_socket = !(TCP_IS_SOCKET(ltcp) &&
 787                             TCP_IS_SOCKET(tcp));
 788                         exclbind = lconnp->conn_exclbind ||
 789                             connp->conn_exclbind;
 790 
 791                         if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) ||
 792                             (connp->conn_mac_mode != CONN_MAC_DEFAULT) ||
 793                             (exclbind && (not_socket ||
 794                             ltcp->tcp_state <= TCPS_ESTABLISHED))) {
 795                                 if (V6_OR_V4_INADDR_ANY(
 796                                     lconnp->conn_bound_addr_v6) ||
 797                                     V6_OR_V4_INADDR_ANY(*laddr) ||
 798                                     IN6_ARE_ADDR_EQUAL(laddr,
 799                                     &lconnp->conn_bound_addr_v6)) {
 800                                         break;
 801                                 }
 802                                 continue;
 803                         }
 804 
 805                         /*
 806                          * Check ipversion to allow IPv4 and IPv6 sockets to
 807                          * have disjoint port number spaces, if *_EXCLBIND
 808                          * is not set and only if the application binds to a
 809                          * specific port. We use the same autoassigned port
 810                          * number space for IPv4 and IPv6 sockets.
 811                          */
 812                         if (connp->conn_ipversion != lconnp->conn_ipversion &&
 813                             bind_to_req_port_only)
 814                                 continue;
 815 
 816                         /*
 817                          * Ideally, we should make sure that the source
 818                          * address, remote address, and remote port in the
 819                          * four tuple for this tcp-connection is unique.
 820                          * However, trying to find out the local source
 821                          * address would require too much code duplication
  822                          * with IP, since IP needs to have that code
 823                          * to support userland TCP implementations.
 824                          */
 825                         if (quick_connect &&
 826                             (ltcp->tcp_state > TCPS_LISTEN) &&
 827                             ((connp->conn_fport != lconnp->conn_fport) ||
 828                             !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
 829                             &lconnp->conn_faddr_v6)))
 830                                 continue;
 831 
 832                         if (!reuseaddr) {
 833                                 /*
 834                                  * No socket option SO_REUSEADDR.
 835                                  * If existing port is bound to
 836                                  * a non-wildcard IP address
 837                                  * and the requesting stream is
 838                                  * bound to a distinct
 839                                  * different IP addresses
 840                                  * (non-wildcard, also), keep
 841                                  * going.
 842                                  */
 843                                 if (!V6_OR_V4_INADDR_ANY(*laddr) &&
 844                                     !V6_OR_V4_INADDR_ANY(
 845                                     lconnp->conn_bound_addr_v6) &&
 846                                     !IN6_ARE_ADDR_EQUAL(laddr,
 847                                     &lconnp->conn_bound_addr_v6))
 848                                         continue;
 849                                 if (ltcp->tcp_state >= TCPS_BOUND) {
 850                                         /*
 851                                          * This port is being used and
 852                                          * its state is >= TCPS_BOUND,
 853                                          * so we can't bind to it.
 854                                          */
 855                                         break;
 856                                 }
 857                         } else {
 858                                 /*
 859                                  * socket option SO_REUSEADDR is set on the
 860                                  * binding tcp_t.
 861                                  *
 862                                  * If two streams are bound to
 863                                  * same IP address or both addr
 864                                  * and bound source are wildcards
 865                                  * (INADDR_ANY), we want to stop
 866                                  * searching.
 867                                  * We have found a match of IP source
 868                                  * address and source port, which is
 869                                  * refused regardless of the
 870                                  * SO_REUSEADDR setting, so we break.
 871                                  */
 872                                 if (IN6_ARE_ADDR_EQUAL(laddr,
 873                                     &lconnp->conn_bound_addr_v6) &&
 874                                     (ltcp->tcp_state == TCPS_LISTEN ||
 875                                     ltcp->tcp_state == TCPS_BOUND))
 876                                         break;
 877                         }
 878                 }
 879                 if (ltcp != NULL) {
 880                         /* The port number is busy */
 881                         mutex_exit(&tbf->tf_lock);
 882                 } else {
 883                         /*
 884                          * This port is ours. Insert in fanout and mark as
 885                          * bound to prevent others from getting the port
 886                          * number.
 887                          */
 888                         tcp->tcp_state = TCPS_BOUND;
 889                         DTRACE_TCP6(state__change, void, NULL,
 890                             ip_xmit_attr_t *, connp->conn_ixa,
 891                             void, NULL, tcp_t *, tcp, void, NULL,
 892                             int32_t, TCPS_IDLE);
 893 
 894                         connp->conn_lport = htons(port);
 895 
 896                         ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
 897                             connp->conn_lport)] == tbf);
 898                         tcp_bind_hash_insert(tbf, tcp, 1);
 899 
 900                         mutex_exit(&tbf->tf_lock);
 901 
 902                         /*
 903                          * We don't want tcp_next_port_to_try to "inherit"
 904                          * a port number supplied by the user in a bind.
 905                          */
 906                         if (user_specified)
 907                                 return (port);
 908 
 909                         /*
 910                          * This is the only place where tcp_next_port_to_try
 911                          * is updated. After the update, it may or may not
 912                          * be in the valid range.
 913                          */
 914                         if (!connp->conn_anon_priv_bind)
 915                                 tcps->tcps_next_port_to_try = port + 1;
 916                         return (port);
 917                 }
 918 
 919                 if (connp->conn_anon_priv_bind) {
 920                         port = tcp_get_next_priv_port(tcp);
 921                 } else {
 922                         if (count == 0 && user_specified) {
 923                                 /*
 924                                  * We may have to return an anonymous port. So
 925                                  * get one to start with.
 926                                  */
 927                                 port =
 928                                     tcp_update_next_port(
 929                                     tcps->tcps_next_port_to_try,
 930                                     tcp, B_TRUE);
 931                                 user_specified = B_FALSE;
 932                         } else {
 933                                 port = tcp_update_next_port(port + 1, tcp,
 934                                     B_FALSE);
 935                         }
 936                 }
 937                 if (port == 0)
 938                         break;
 939 
 940                 /*
 941                  * Don't let this loop run forever in the case where
 942                  * all of the anonymous ports are in use.
 943                  */
 944         } while (++count < loopmax);
 945         return (0);
 946 }