1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 /*
  26  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  27  */
  28 
  29 /*
  30  * IP interface to squeues.
  31  *
  32  * IP uses squeues to force serialization of packets, both incoming and
  33  * outgoing. Each squeue is associated with a connection instance (conn_t)
  34  * above, and a soft ring (if enabled) below. Each CPU will have a default
  35  * squeue for outbound connections, and each soft ring of an interface will
  36  * have an squeue to which it sends incoming packets. squeues are never
  37  * destroyed, and if they become unused they are kept around against future
  38  * needs.
  39  *
  40  * IP organizes its squeues using squeue sets (squeue_set_t). For each CPU
  41  * in the system there will be one squeue set, all of whose squeues will be
  42  * bound to that CPU, plus one additional set known as the unbound set. Sets
  43  * associated with CPUs will have one default squeue, for outbound
  44  * connections, and a linked list of squeues used by various NICs for inbound
  45  * packets. The unbound set also has a linked list of squeues, but no default
  46  * squeue.
  47  *
  48  * When a CPU goes offline, its squeue set is destroyed, and all its squeues
  49  * are moved to the unbound set. When a CPU comes online, a new squeue set is
  50  * created and the unbound set is searched for a default squeue formerly bound
  51  * to this CPU. If no default squeue is found, a new one is created.
  52  *
  53  * Two fields of the squeue_t, namely sq_next and sq_set, are owned by IP
  54  * and not by the squeue code. squeue.c will not touch them, and we can modify
  55  * them without holding the squeue lock because of the guarantee that squeues
  56  * are never destroyed. The sqset_lock must be held when doing so, however.
  57  *
  58  * All the squeue sets are protected by a single lock, the sqset_lock. This
  59  * is also used to protect the sq_next and sq_set fields of an squeue_t.
  60  *
  61  * The lock order is: cpu_lock --> ill_lock --> sqset_lock --> sq_lock
  62  *
  63  * There are two modes of associating connections with squeues. The first mode
  64  * associates each connection with the CPU that creates the connection (either
  65  * during open time or during accept time). The second mode associates each
  66  * connection with a random CPU, effectively distributing load over all CPUs
  67  * and all squeues in the system. The mode is controlled by the
  68  * ip_squeue_fanout variable.
  69  *
  70  * NOTE: The fact that each connection is associated with an squeue, and each
  71  * squeue with a CPU, does not mean that a connection is always processed on
  72  * that CPU and on that CPU only. Any thread calling squeue_enter() may process
  73  * the connection on whatever CPU it happens to be scheduled on. The squeue to
  74  * CPU binding is only relevant for the worker thread.
  75  *
  76  * INTERFACE:
  77  *
  78  * squeue_t *ip_squeue_get(ill_rx_ring_t)
  79  *
  80  * Returns the squeue associated with an ill receive ring. If the ring is
  81  * not bound to a CPU, and we're currently servicing the interrupt which
  82  * generated the packet, then the squeue is bound to that CPU.
  83  *
  84  *
  85  * DR Notes
  86  * ========
  87  *
  88  * ip_squeue_init() registers a callback function with the CPU DR
  89  * subsystem using register_cpu_setup_func(). The callback function does two
  90  * things:
  91  *
  92  * o When the CPU is going off-line or unconfigured, the worker thread is
  93  *      unbound from the CPU. This allows the CPU unconfig code to move it to
  94  *      another CPU.
  95  *
  96  * o When the CPU is going online, it creates a new squeue for this CPU if
  97  *      necessary and binds the squeue worker thread to this CPU.
  98  *
  99  * TUNABLES:
 100  *
 101  * ip_squeue_fanout: used when TCP calls IP_SQUEUE_GET(). If 1, then
 102  * pick the default squeue from a random CPU, otherwise use our CPU's default
 103  * squeue.
 104  *
 105  * ip_squeue_fanout can be accessed and changed using ndd on /dev/tcp or
 106  * /dev/ip.
 107  *
 108  * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 109  * created. This is the time the squeue code waits before waking up the worker
 110  * thread after queuing a request. See the illustrative sketch below.
 111  */
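
/*
 * Illustrative sketch (a hypothetical caller, not code from this file):
 * a consumer such as TCP obtains a default squeue for a new connection
 * via IP_SQUEUE_GET(), passing a hint that is only used as a fanout
 * index when ip_squeue_fanout is set. Assuming conn_sqp is the conn_t
 * field that records the association described above:
 *
 *	conn_t	*connp;
 *	...
 *	connp->conn_sqp = IP_SQUEUE_GET(CPU_PSEUDO_RANDOM());
 *
 * With ip_squeue_fanout == 0 this returns the current CPU's default
 * squeue; with ip_squeue_fanout == 1 the hint picks one of the CPU-bound
 * squeue sets pseudo-randomly (see ip_squeue_random() below). Either way,
 * how long the squeue code waits before waking the chosen squeue's worker
 * thread is governed by ip_squeue_worker_wait, described above.
 */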
 112 
 113 #include <sys/types.h>
 114 #include <sys/debug.h>
 115 #include <sys/kmem.h>
 116 #include <sys/cpuvar.h>
 117 #include <sys/cmn_err.h>
 118 
 119 #include <inet/common.h>
 120 #include <inet/ip.h>
 121 #include <netinet/ip6.h>
 122 #include <inet/ip_if.h>
 123 #include <inet/ip_ire.h>
 124 #include <inet/nd.h>
 125 #include <inet/ipclassifier.h>
 127 #include <sys/conf.h>
 128 #include <sys/sunddi.h>
 129 #include <sys/dlpi.h>
 130 #include <sys/squeue_impl.h>
 131 #include <sys/tihdr.h>
 132 #include <inet/udp_impl.h>
 133 #include <sys/strsubr.h>
 134 #include <sys/zone.h>
 135 #include <sys/dld.h>
 136 #include <sys/atomic.h>
 137 
 138 /*
 139  * List of all created squeue sets. The list and its size are protected by
 140  * sqset_lock.
 141  */
 142 static squeue_set_t     **sqset_global_list; /* list 0 is the unbound list */
 143 static uint_t           sqset_global_size;
 144 kmutex_t                sqset_lock;
 145 
 146 static void (*ip_squeue_create_callback)(squeue_t *) = NULL;
 147 
 148 /*
 149  * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 150  *      created. This is the time squeue code waits before waking up the worker
 151  *      thread after queuing a request.
 152  */
 153 volatile uint_t ip_squeue_worker_wait = 10;
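
/*
 * For example (a hypothetical /etc/system entry, assuming the standard
 * "set module:variable" syntax and that this file is built into the ip
 * module), the wait above could be lowered from its default of 10:
 *
 *	set ip:ip_squeue_worker_wait = 5
 */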
 154 
 155 static squeue_t *ip_squeue_create(pri_t);
 156 static squeue_set_t *ip_squeue_set_create(processorid_t);
 157 static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);
 158 static void ip_squeue_set_move(squeue_t *, squeue_set_t *);
 159 static void ip_squeue_set_destroy(cpu_t *);
 160 static void ip_squeue_clean(void *, mblk_t *, void *);
 161 
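/*
 * A CPU is usable for squeue binding only if it exists and is active,
 * i.e. online and able to run the squeue worker thread.
 */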
 162 #define CPU_ISON(c) (c != NULL && CPU_ACTIVE(c) && (c->cpu_flags & CPU_EXISTS))
 163 
 164 static squeue_t *
 165 ip_squeue_create(pri_t pri)
 166 {
 167         squeue_t *sqp;
 168 
 169         sqp = squeue_create(ip_squeue_worker_wait, pri);
 170         ASSERT(sqp != NULL);
 171         if (ip_squeue_create_callback != NULL)
 172                 ip_squeue_create_callback(sqp);
 173         return (sqp);
 174 }
 175 
 176 /*
 177  * Create a new squeue_set. If id == -1, then we're creating the unbound set,
 178  * which should only happen once when we are first initialized. Otherwise id
 179  * is the id of the CPU that needs a set, either because we are initializing
 180  * or because the CPU has come online.
 181  *
 182  * If id != -1, then we need at a minimum to provide a default squeue for the
 183  * new set. We search the unbound set for candidates, and if none are found we
 184  * create a new one.
 185  */
 186 static squeue_set_t *
 187 ip_squeue_set_create(processorid_t id)
 188 {
 189         squeue_set_t    *sqs;
 190         squeue_set_t    *src = sqset_global_list[0];
 191         squeue_t        **lastsqp, *sq;
 192         squeue_t        **defaultq_lastp = NULL;
 193 
 194         sqs = kmem_zalloc(sizeof (squeue_set_t), KM_SLEEP);
 195         sqs->sqs_cpuid = id;
 196 
 197         if (id == -1) {
 198                 ASSERT(sqset_global_size == 0);
 199                 sqset_global_list[0] = sqs;
 200                 sqset_global_size = 1;
 201                 return (sqs);
 202         }
 203 
 204         /*
 205          * When we create an squeue set with id != -1, we need to give it a
 206          * default squeue, in order to support fanout of conns across
 207          * CPUs. Try to find a former default squeue that matches this
 208          * cpu id on the unbound squeue set. If no such squeue is found,
 209          * find some non-default TCP squeue that is free. If still no such
 210          * candidate is found, create a new squeue.
 211          */
 212 
 213         ASSERT(MUTEX_HELD(&cpu_lock));
 214         mutex_enter(&sqset_lock);
 215         lastsqp = &src->sqs_head;
 216 
 217         while (*lastsqp) {
 218                 if ((*lastsqp)->sq_bind == id &&
 219                     (*lastsqp)->sq_state & SQS_DEFAULT) {
 220                         /*
 221                          * Exact match. Former default squeue of cpu 'id'
 222                          */
 223                         ASSERT(!((*lastsqp)->sq_state & SQS_ILL_BOUND));
 224                         defaultq_lastp = lastsqp;
 225                         break;
 226                 }
 227                 if (defaultq_lastp == NULL &&
 228                     !((*lastsqp)->sq_state & (SQS_ILL_BOUND | SQS_DEFAULT))) {
 229                         /*
 230                          * A free non-default TCP squeue
 231                          */
 232                         defaultq_lastp = lastsqp;
 233                 }
 234                 lastsqp = &(*lastsqp)->sq_next;
 235         }
 236 
 237         if (defaultq_lastp != NULL) {
 238                 /* Remove from src set and set SQS_DEFAULT */
 239                 sq = *defaultq_lastp;
 240                 *defaultq_lastp = sq->sq_next;
 241                 sq->sq_next = NULL;
 242                 if (!(sq->sq_state & SQS_DEFAULT)) {
 243                         mutex_enter(&sq->sq_lock);
 244                         sq->sq_state |= SQS_DEFAULT;
 245                         mutex_exit(&sq->sq_lock);
 246                 }
 247         } else {
 248                 sq = ip_squeue_create(SQUEUE_DEFAULT_PRIORITY);
 249                 sq->sq_state |= SQS_DEFAULT;
 250         }
 251 
 252         sq->sq_set = sqs;
 253         sqs->sqs_default = sq;
 254         squeue_bind(sq, id); /* this locks squeue mutex */
 255 
 256         ASSERT(sqset_global_size <= NCPU);
 257         sqset_global_list[sqset_global_size++] = sqs;
 258         mutex_exit(&sqset_lock);
 259         return (sqs);
 260 }
 261 
 262 /*
 263  * Called by ip_squeue_add_ring() to find an squeue for a new rx ring.
 264  */
 265 
 266 squeue_t *
 267 ip_squeue_getfree(pri_t pri)
 268 {
 269         squeue_set_t    *sqs = sqset_global_list[0];
 270         squeue_t        *sq;
 271 
 272         mutex_enter(&sqset_lock);
 273         for (sq = sqs->sqs_head; sq != NULL; sq = sq->sq_next) {
 274                 /*
 275                  * Select a non-default TCP squeue that is free i.e. not
 276                  * bound to any ill.
 277                  */
 278                 if (!(sq->sq_state & (SQS_DEFAULT | SQS_ILL_BOUND)))
 279                         break;
 280         }
 281 
 282         if (sq == NULL) {
 283                 sq = ip_squeue_create(pri);
 284                 sq->sq_set = sqs;
 285                 sq->sq_next = sqs->sqs_head;
 286                 sqs->sqs_head = sq;
 287         }
 288 
 289         ASSERT(!(sq->sq_state & (SQS_POLL_THR_CONTROL | SQS_WORKER_THR_CONTROL |
 290             SQS_POLL_CLEANUP_DONE | SQS_POLL_QUIESCE_DONE |
 291             SQS_POLL_THR_QUIESCED)));
 292 
 293         mutex_enter(&sq->sq_lock);
 294         sq->sq_state |= SQS_ILL_BOUND;
 295         mutex_exit(&sq->sq_lock);
 296         mutex_exit(&sqset_lock);
 297 
 298         if (sq->sq_priority != pri) {
 299                 thread_lock(sq->sq_worker);
 300                 (void) thread_change_pri(sq->sq_worker, pri, 0);
 301                 thread_unlock(sq->sq_worker);
 302 
 303                 thread_lock(sq->sq_poll_thr);
 304                 (void) thread_change_pri(sq->sq_poll_thr, pri, 0);
 305                 thread_unlock(sq->sq_poll_thr);
 306 
 307                 sq->sq_priority = pri;
 308         }
 309         return (sq);
 310 }
 311 
 312 /*
 313  * Initialize IP squeues.
 314  */
 315 void
 316 ip_squeue_init(void (*callback)(squeue_t *))
 317 {
 318         int i;
 319         squeue_set_t    *sqs;
 320 
 321         ASSERT(sqset_global_list == NULL);
 322 
 323         ip_squeue_create_callback = callback;
 324         squeue_init();
 325         mutex_init(&sqset_lock, NULL, MUTEX_DEFAULT, NULL);
 326         sqset_global_list =
 327             kmem_zalloc(sizeof (squeue_set_t *) * (NCPU+1), KM_SLEEP);
 328         sqset_global_size = 0;
 329         /*
 330          * We are called at system boot time and we don't
 331          * expect memory allocation failure.
 332          */
 333         sqs = ip_squeue_set_create(-1);
 334         ASSERT(sqs != NULL);
 335 
 336         mutex_enter(&cpu_lock);
 337         /* Create squeue for each active CPU available */
 338         for (i = 0; i < NCPU; i++) {
 339                 cpu_t *cp = cpu_get(i);
 340                 if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) {
 341                         /*
 342                          * We are called at system boot time and we don't
 343                          * expect memory allocation failure then
 344                          */
 345                         cp->cpu_squeue_set = ip_squeue_set_create(cp->cpu_id);
 346                         ASSERT(cp->cpu_squeue_set != NULL);
 347                 }
 348         }
 349 
 350         register_cpu_setup_func(ip_squeue_cpu_setup, NULL);
 351         mutex_exit(&cpu_lock);
 352 }
 353 
 354 /*
 355  * Get a default squeue, either from the current CPU or a CPU derived by hash
 356  * from the index argument, depending upon the setting of ip_squeue_fanout.
 357  */
 358 squeue_t *
 359 ip_squeue_random(uint_t index)
 360 {
 361         squeue_set_t *sqs = NULL;
 362         squeue_t *sq;
 363 
 364         /*
 365          * The minimum value of sqset_global_size is 2, one for the unbound
 366          * squeue set and another for the squeue set of the zeroth CPU.
 367          * Even though the value could be changing, it can never go below 2,
 368          * so the assert does not need the lock protection.
 369          */
 370         ASSERT(sqset_global_size > 1);
 371 
 372         /* Protect against changes to sqset_global_list */
 373         mutex_enter(&sqset_lock);
 374 
 375         if (!ip_squeue_fanout)
 376                 sqs = CPU->cpu_squeue_set;
 377 
 378         /*
 379          * sqset_global_list[0] corresponds to the unbound squeue set.
 380          * The computation below picks a set other than the unbound set.
 381          */
 382         if (sqs == NULL)
 383                 sqs = sqset_global_list[(index % (sqset_global_size - 1)) + 1];
 384         sq = sqs->sqs_default;
 385 
 386         mutex_exit(&sqset_lock);
 387         ASSERT(sq);
 388         return (sq);
 389 }
 390 
 391 /*
 392  * Move squeue from its current set to newset. Not used for default squeues.
 393  * Bind or unbind the worker thread as appropriate.
 394  */
 395 
 396 static void
 397 ip_squeue_set_move(squeue_t *sq, squeue_set_t *newset)
 398 {
 399         squeue_set_t    *set;
 400         squeue_t        **lastsqp;
 401         processorid_t   cpuid = newset->sqs_cpuid;
 402 
 403         ASSERT(!(sq->sq_state & SQS_DEFAULT));
 404         ASSERT(!MUTEX_HELD(&sq->sq_lock));
 405         ASSERT(MUTEX_HELD(&sqset_lock));
 406 
 407         set = sq->sq_set;
 408         if (set == newset)
 409                 return;
 410 
 411         lastsqp = &set->sqs_head;
 412         while (*lastsqp != sq)
 413                 lastsqp = &(*lastsqp)->sq_next;
 414 
 415         *lastsqp = sq->sq_next;
 416         sq->sq_next = newset->sqs_head;
 417         newset->sqs_head = sq;
 418         sq->sq_set = newset;
 419         if (cpuid == -1)
 420                 squeue_unbind(sq);
 421         else
 422                 squeue_bind(sq, cpuid);
 423 }
 424 
 425 /*
 426  * Move squeue from its current set to cpuid's set and bind to cpuid.
 427  */
 428 
 429 int
 430 ip_squeue_cpu_move(squeue_t *sq, processorid_t cpuid)
 431 {
 432         cpu_t *cpu;
 433         squeue_set_t *set;
 434 
 435         if (sq->sq_state & SQS_DEFAULT)
 436                 return (-1);
 437 
 438         ASSERT(MUTEX_HELD(&cpu_lock));
 439 
 440         cpu = cpu_get(cpuid);
 441         if (!CPU_ISON(cpu))
 442                 return (-1);
 443 
 444         mutex_enter(&sqset_lock);
 445         set = cpu->cpu_squeue_set;
 446         if (set != NULL)
 447                 ip_squeue_set_move(sq, set);
 448         mutex_exit(&sqset_lock);
 449         return ((set == NULL) ? -1 : 0);
 450 }
 451 
 452 /*
 453  * The mac layer is calling, asking us to move an squeue to a
 454  * new CPU. This routine is called with cpu_lock held.
 455  */
 456 void
 457 ip_squeue_bind_ring(ill_t *ill, ill_rx_ring_t *rx_ring, processorid_t cpuid)
 458 {
 459         ASSERT(ILL_MAC_PERIM_HELD(ill));
 460         ASSERT(rx_ring->rr_ill == ill);
 461 
 462         mutex_enter(&ill->ill_lock);
 463         if (rx_ring->rr_ring_state == RR_FREE ||
 464             rx_ring->rr_ring_state == RR_FREE_INPROG) {
 465                 mutex_exit(&ill->ill_lock);
 466                 return;
 467         }
 468 
 469         if (ip_squeue_cpu_move(rx_ring->rr_sqp, cpuid) != -1)
 470                 rx_ring->rr_ring_state = RR_SQUEUE_BOUND;
 471 
 472         mutex_exit(&ill->ill_lock);
 473 }
 474 
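/*
 * Called when the mac layer adds a receive ring; mrp describes the ring
 * as a mac_rx_fifo_t. Allocate a free ill_rx_ring_t slot from the ill's
 * dld capability polling table, record the ring's receive and interrupt
 * handles, attach a free squeue to the ring, and bind that squeue to the
 * CPU suggested by the mac layer.
 */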
 475 void *
 476 ip_squeue_add_ring(ill_t *ill, void *mrp)
 477 {
 478         mac_rx_fifo_t           *mrfp = (mac_rx_fifo_t *)mrp;
 479         ill_rx_ring_t           *rx_ring, *ring_tbl;
 480         int                     ip_rx_index;
 481         squeue_t                *sq = NULL;
 482         pri_t                   pri;
 483 
 484         ASSERT(ILL_MAC_PERIM_HELD(ill));
 485         ASSERT(mrfp->mrf_type == MAC_RX_FIFO);
 486         ASSERT(ill->ill_dld_capab != NULL);
 487 
 488         ring_tbl = ill->ill_dld_capab->idc_poll.idp_ring_tbl;
 489 
 490         mutex_enter(&ill->ill_lock);
 491         for (ip_rx_index = 0; ip_rx_index < ILL_MAX_RINGS; ip_rx_index++) {
 492                 rx_ring = &ring_tbl[ip_rx_index];
 493                 if (rx_ring->rr_ring_state == RR_FREE)
 494                         break;
 495         }
 496 
 497         if (ip_rx_index == ILL_MAX_RINGS) {
 498                 /*
 499                  * We ran out of ILL_MAX_RINGS worth of rx_ring structures. If
 500                  * we have devices which can overwhelm this limit,
 501                  * ILL_MAX_RINGS should be made configurable. Meanwhile this
 502                  * causes no panic, because the driver will pass ip_input a
 503                  * NULL handle, which makes IP fall back to the default squeue;
 504                  * polling mode will simply not be used for this ring.
 505                  */
 506                 cmn_err(CE_NOTE,
 507                     "Reached maximum number of receiving rings (%d) for %s\n",
 508                     ILL_MAX_RINGS, ill->ill_name);
 509                 mutex_exit(&ill->ill_lock);
 510                 return (NULL);
 511         }
 512 
 513         bzero(rx_ring, sizeof (ill_rx_ring_t));
 514         rx_ring->rr_rx = (ip_mac_rx_t)mrfp->mrf_receive;
 515         /* XXX: Hard code it to tcp accept for now */
 516         rx_ring->rr_ip_accept = (ip_accept_t)ip_accept_tcp;
 517 
 518         rx_ring->rr_intr_handle = mrfp->mrf_intr_handle;
 519         rx_ring->rr_intr_enable = (ip_mac_intr_enable_t)mrfp->mrf_intr_enable;
 520         rx_ring->rr_intr_disable =
 521             (ip_mac_intr_disable_t)mrfp->mrf_intr_disable;
 522         rx_ring->rr_rx_handle = mrfp->mrf_rx_arg;
 523         rx_ring->rr_ill = ill;
 524 
 525         pri = mrfp->mrf_flow_priority;
 526 
 527         sq = ip_squeue_getfree(pri);
 528 
 529         mutex_enter(&sq->sq_lock);
 530         sq->sq_rx_ring = rx_ring;
 531         rx_ring->rr_sqp = sq;
 532 
 533         sq->sq_state |= SQS_POLL_CAPAB;
 534 
 535         rx_ring->rr_ring_state = RR_SQUEUE_UNBOUND;
 536         sq->sq_ill = ill;
 537         mutex_exit(&sq->sq_lock);
 538         mutex_exit(&ill->ill_lock);
 539 
 540         DTRACE_PROBE4(ill__ring__add, char *, ill->ill_name, ill_t *, ill, int,
 541             ip_rx_index, void *, mrfp->mrf_rx_arg);
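
        /*
         * The probe above can be observed from userland, e.g. (assuming the
         * usual SDT mangling of "__" to "-" in probe names):
         *
         *      dtrace -n 'sdt:ip::ill-ring-add { trace(stringof(arg0)); }'
         */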
 542 
 543         /* Assign the squeue to the specified CPU as well */
 544         mutex_enter(&cpu_lock);
 545         (void) ip_squeue_bind_ring(ill, rx_ring, mrfp->mrf_cpu_id);
 546         mutex_exit(&cpu_lock);
 547 
 548         return (rx_ring);
 549 }
 550 
 551 /*
 552  * Clean up the squeue attached to a ring and mark the ring free.
 553  * Some of the processing needs to be done from inside the perimeter.
 554  */
 555 void
 556 ip_squeue_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
 557 {
 558         squeue_t *sqp;
 559 
 560         ASSERT(ILL_MAC_PERIM_HELD(ill));
 561         ASSERT(rx_ring != NULL);
 562 
 563         /* Just clean one squeue */
 564         mutex_enter(&ill->ill_lock);
 565         if (rx_ring->rr_ring_state == RR_FREE) {
 566                 mutex_exit(&ill->ill_lock);
 567                 return;
 568         }
 569         rx_ring->rr_ring_state = RR_FREE_INPROG;
 570         sqp = rx_ring->rr_sqp;
 571 
 572         mutex_enter(&sqp->sq_lock);
 573         sqp->sq_state |= SQS_POLL_CLEANUP;
 574         cv_signal(&sqp->sq_worker_cv);
 575         mutex_exit(&ill->ill_lock);
 576         while (!(sqp->sq_state & SQS_POLL_CLEANUP_DONE))
 577                 cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock);
 578         sqp->sq_state &= ~SQS_POLL_CLEANUP_DONE;
 579 
 580         ASSERT(!(sqp->sq_state & (SQS_POLL_THR_CONTROL |
 581             SQS_WORKER_THR_CONTROL | SQS_POLL_QUIESCE_DONE |
 582             SQS_POLL_THR_QUIESCED)));
 583 
 584         cv_signal(&sqp->sq_worker_cv);
 585         mutex_exit(&sqp->sq_lock);
 586 
 587         /*
 588          * Move the squeue to sqset_global_list[0] which holds the set of
 589          * squeues not bound to any cpu. Note that the squeue is still
 590          * considered bound to an ill as long as SQS_ILL_BOUND is set.
 591          */
 592         mutex_enter(&sqset_lock);
 593         ip_squeue_set_move(sqp, sqset_global_list[0]);
 594         mutex_exit(&sqset_lock);
 595 
 596         /*
 597          * CPU going offline can also trigger a move of the squeue to the
 598          * unbound set sqset_global_list[0]. However the squeue won't be
 599          * recycled for the next use as long as the SQS_ILL_BOUND flag
 600          * is set. Hence we clear the SQS_ILL_BOUND flag only towards the
 601          * end after the move.
 602          */
 603         mutex_enter(&sqp->sq_lock);
 604         sqp->sq_state &= ~SQS_ILL_BOUND;
 605         mutex_exit(&sqp->sq_lock);
 606 
 607         mutex_enter(&ill->ill_lock);
 608         rx_ring->rr_ring_state = RR_FREE;
 609         mutex_exit(&ill->ill_lock);
 610 }
 611 
 612 /*
 613  * Stop the squeue from polling. This needs to be done
 614  * from inside the perimeter.
 615  */
 616 void
 617 ip_squeue_quiesce_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
 618 {
 619         squeue_t *sqp;
 620 
 621         ASSERT(ILL_MAC_PERIM_HELD(ill));
 622         ASSERT(rx_ring != NULL);
 623 
 624         sqp = rx_ring->rr_sqp;
 625         mutex_enter(&sqp->sq_lock);
 626         sqp->sq_state |= SQS_POLL_QUIESCE;
 627         cv_signal(&sqp->sq_worker_cv);
 628         while (!(sqp->sq_state & SQS_POLL_QUIESCE_DONE))
 629                 cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock);
 630 
 631         mutex_exit(&sqp->sq_lock);
 632 }
 633 
 634 /*
 635  * Restart polling etc. Needs to be inside the perimeter to
 636  * prevent races.
 637  */
 638 void
 639 ip_squeue_restart_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
 640 {
 641         squeue_t *sqp;
 642 
 643         ASSERT(ILL_MAC_PERIM_HELD(ill));
 644         ASSERT(rx_ring != NULL);
 645 
 646         sqp = rx_ring->rr_sqp;
 647         mutex_enter(&sqp->sq_lock);
 648         /*
 649          * Handle change in number of rings between the quiesce and
 650          * restart operations by checking for a previous quiesce before
 651          * attempting a restart.
 652          */
 653         if (!(sqp->sq_state & SQS_POLL_QUIESCE_DONE)) {
 654                 mutex_exit(&sqp->sq_lock);
 655                 return;
 656         }
 657         sqp->sq_state |= SQS_POLL_RESTART;
 658         cv_signal(&sqp->sq_worker_cv);
 659         while (!(sqp->sq_state & SQS_POLL_RESTART_DONE))
 660                 cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock);
 661         sqp->sq_state &= ~SQS_POLL_RESTART_DONE;
 662         mutex_exit(&sqp->sq_lock);
 663 }
 664 
 665 /*
 666  * Clean up all squeues associated with the ill.
 667  */
 668 void
 669 ip_squeue_clean_all(ill_t *ill)
 670 {
 671         int idx;
 672         ill_rx_ring_t   *rx_ring;
 673 
 674         for (idx = 0; idx < ILL_MAX_RINGS; idx++) {
 675                 rx_ring = &ill->ill_dld_capab->idc_poll.idp_ring_tbl[idx];
 676                 ip_squeue_clean_ring(ill, rx_ring);
 677         }
 678 }
 679 
 680 /*
 681  * Used by IP to get the squeue associated with a ring. If the ring is
 682  * NULL, or no squeue has been assigned to it yet, fall back to a default
 683  * squeue chosen via IP_SQUEUE_GET() with a pseudo-random hint (subject to
 684  * ip_squeue_fanout, see ip_squeue_random() above).
 685  */
 686 squeue_t *
 687 ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
 688 {
 689         squeue_t        *sqp;
 690 
 691         if ((ill_rx_ring == NULL) || ((sqp = ill_rx_ring->rr_sqp) == NULL))
 692                 return (IP_SQUEUE_GET(CPU_PSEUDO_RANDOM()));
 693 
 694         return (sqp);
 695 }
 696 
 697 /*
 698  * Called when a CPU goes offline. Its squeue_set_t is destroyed, and all
 699  * of its squeues are unbound and moved to the unbound set.
 700  */
 701 static void
 702 ip_squeue_set_destroy(cpu_t *cpu)
 703 {
 704         int i;
 705         squeue_t *sqp, *lastsqp = NULL;
 706         squeue_set_t *sqs, *unbound = sqset_global_list[0];
 707 
 708         mutex_enter(&sqset_lock);
 709         if ((sqs = cpu->cpu_squeue_set) == NULL) {
 710                 mutex_exit(&sqset_lock);
 711                 return;
 712         }
 713 
 714         /* Move all squeues to unbound set */
 715 
 716         for (sqp = sqs->sqs_head; sqp; lastsqp = sqp, sqp = sqp->sq_next) {
 717                 squeue_unbind(sqp);
 718                 sqp->sq_set = unbound;
 719         }
 720         if (sqs->sqs_head) {
 721                 lastsqp->sq_next = unbound->sqs_head;
 722                 unbound->sqs_head = sqs->sqs_head;
 723         }
 724 
 725         /* Also move default squeue to unbound set */
 726 
 727         sqp = sqs->sqs_default;
 728         ASSERT(sqp != NULL);
 729         ASSERT((sqp->sq_state & (SQS_DEFAULT|SQS_ILL_BOUND)) == SQS_DEFAULT);
 730 
 731         sqp->sq_next = unbound->sqs_head;
 732         unbound->sqs_head = sqp;
 733         squeue_unbind(sqp);
 734         sqp->sq_set = unbound;
 735 
 736         for (i = 1; i < sqset_global_size; i++)
 737                 if (sqset_global_list[i] == sqs)
 738                         break;
 739 
 740         ASSERT(i < sqset_global_size);
 741         sqset_global_list[i] = sqset_global_list[sqset_global_size - 1];
 742         sqset_global_list[sqset_global_size - 1] = NULL;
 743         sqset_global_size--;
 744 
 745         mutex_exit(&sqset_lock);
 746         kmem_free(sqs, sizeof (*sqs));
 747 }
 748 
 749 /*
 750  * Reconfiguration callback
 751  */
 752 /* ARGSUSED */
 753 static int
 754 ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
 755 {
 756         cpu_t *cp = cpu_get(id);
 757 
 758         ASSERT(MUTEX_HELD(&cpu_lock));
 759         switch (what) {
 760         case CPU_CONFIG:
 761         case CPU_ON:
 762         case CPU_INIT:
 763         case CPU_CPUPART_IN:
 764                 if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL)
 765                         cp->cpu_squeue_set = ip_squeue_set_create(cp->cpu_id);
 766                 break;
 767         case CPU_UNCONFIG:
 768         case CPU_OFF:
 769         case CPU_CPUPART_OUT:
 770                 if (cp->cpu_squeue_set != NULL) {
 771                         ip_squeue_set_destroy(cp);
 772                         cp->cpu_squeue_set = NULL;
 773                 }
 774                 break;
 775         default:
 776                 break;
 777         }
 778         return (0);
 779 }