1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 /*
  26  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  27  */
  28 
  29 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T     */
  30 /*        All Rights Reserved   */
  31 
  32 
  33 #include <sys/types.h>
  34 #include <sys/param.h>
  35 #include <sys/sysmacros.h>
  36 #include <sys/signal.h>
  37 #include <sys/user.h>
  38 #include <sys/systm.h>
  39 #include <sys/sysinfo.h>
  40 #include <sys/var.h>
  41 #include <sys/errno.h>
  42 #include <sys/cmn_err.h>
  43 #include <sys/debug.h>
  44 #include <sys/inline.h>
  45 #include <sys/disp.h>
  46 #include <sys/class.h>
  47 #include <sys/bitmap.h>
  48 #include <sys/kmem.h>
  49 #include <sys/cpuvar.h>
  50 #include <sys/vtrace.h>
  51 #include <sys/tnf.h>
  52 #include <sys/cpupart.h>
  53 #include <sys/lgrp.h>
  54 #include <sys/pg.h>
  55 #include <sys/cmt.h>
  56 #include <sys/bitset.h>
  57 #include <sys/schedctl.h>
  58 #include <sys/atomic.h>
  59 #include <sys/dtrace.h>
  60 #include <sys/sdt.h>
  61 #include <sys/archsystm.h>
  62 
  63 #include <vm/as.h>
  64 
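      /*
       * Flag bits describing the kind of thread binding to check for:
       * binding to a specific CPU, binding to a CPU partition, and the
       * implicit binding of an interrupt thread to its CPU.
       */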
  65 #define BOUND_CPU       0x1
  66 #define BOUND_PARTITION 0x2
  67 #define BOUND_INTR      0x4
  68 
  69 /* Dispatch queue allocation structure and functions */
  70 struct disp_queue_info {
  71         disp_t  *dp;
  72         dispq_t *olddispq;
  73         dispq_t *newdispq;
  74         ulong_t *olddqactmap;
  75         ulong_t *newdqactmap;
  76         int     oldnglobpris;
  77 };
  78 static void     disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
  79     disp_t *dp);
  80 static void     disp_dq_assign(struct disp_queue_info *dptr, int numpris);
  81 static void     disp_dq_free(struct disp_queue_info *dptr);
  82 
  83 /* platform-specific routine to call when processor is idle */
  84 static void     generic_idle_cpu();
  85 void            (*idle_cpu)() = generic_idle_cpu;
  86 
  87 /* routines invoked when a CPU enters/exits the idle loop */
  88 static void     idle_enter();
  89 static void     idle_exit();
  90 
  91 /* platform-specific routine to call when thread is enqueued */
  92 static void     generic_enq_thread(cpu_t *, int);
  93 void            (*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
  94 
  95 pri_t   kpreemptpri;            /* priority where kernel preemption applies */
  96 pri_t   upreemptpri = 0;        /* priority where normal preemption applies */
  97 pri_t   intr_pri;               /* interrupt thread priority base level */
  98 
  99 #define KPQPRI  -1              /* pri where cpu affinity is dropped for kpq */
 100 pri_t   kpqpri = KPQPRI;        /* can be set in /etc/system */
 101 disp_t  cpu0_disp;              /* boot CPU's dispatch queue */
 102 disp_lock_t     swapped_lock;   /* lock swapped threads and swap queue */
 103 int     nswapped;               /* total number of swapped threads */
 104 void    disp_swapped_enq(kthread_t *tp);
 105 static void     disp_swapped_setrun(kthread_t *tp);
 106 static void     cpu_resched(cpu_t *cp, pri_t tpri);
 107 
 108 /*
 109  * If this is set, only interrupt threads will cause kernel preemptions.
 110  * This is done by changing the value of kpreemptpri.  kpreemptpri
 111  * will either be the max sysclass pri + 1 or the min interrupt pri.
 112  */
 113 int     only_intr_kpreempt;
 114 
 115 extern void set_idle_cpu(int cpun);
 116 extern void unset_idle_cpu(int cpun);
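      /*
       * setkpdq() places a thread on its partition's kernel preemption (kp)
       * queue; "borf" selects the back or front of that queue.
       */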
 117 static void setkpdq(kthread_t *tp, int borf);
 118 #define SETKP_BACK      0
 119 #define SETKP_FRONT     1
 120 /*
 121  * Parameter that determines how recently a thread must have run
 122  * on the CPU to be considered loosely-bound to that CPU to reduce
  123  * cold cache effects.  The interval is measured in clock ticks.
 124  */
 125 #define RECHOOSE_INTERVAL 3
 126 volatile int    rechoose_interval = RECHOOSE_INTERVAL;
 127 
 128 /*
  129  * Parameter that determines how long (in nanoseconds) a thread must
  130  * be sitting on a run queue before it can be stolen by another CPU,
  131  * to reduce migrations.
  132  *
  133  * nosteal_nsec should be set by the platform routine
  134  * cmp_set_nosteal_interval() to an appropriate value.  nosteal_nsec is
  135  * set to NOSTEAL_UNINITIALIZED here, indicating that it is uninitialized.
  136  * Setting nosteal_nsec to 0 effectively disables the nosteal
  137  * 'protection'.
  138  */
 139 #define NOSTEAL_UNINITIALIZED   (-1)
 140 hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
 141 extern void cmp_set_nosteal_interval(void);
 142 
 143 id_t    defaultcid;     /* system "default" class; see dispadmin(1M) */
 144 
 145 disp_lock_t     transition_lock;        /* lock on transitioning threads */
 146 disp_lock_t     stop_lock;              /* lock on stopped threads */
 147 
 148 static void     cpu_dispqalloc(int numpris);
 149 
 150 /*
 151  * This gets returned by disp_getwork/disp_getbest if we couldn't steal
 152  * a thread because it was sitting on its run queue for a very short
 153  * period of time.
 154  */
 155 #define T_DONTSTEAL     (kthread_t *)(-1) /* returned by disp_getwork/getbest */
 156 
 157 static kthread_t        *disp_getwork(cpu_t *to);
 158 static kthread_t        *disp_getbest(disp_t *from);
 159 static kthread_t        *disp_ratify(kthread_t *tp, disp_t *kpq);
 160 
 161 void    swtch_to(kthread_t *);
 162 
 163 /*
 164  * dispatcher and scheduler initialization
 165  */
 166 
 167 /*
 168  * disp_setup - Common code to calculate and allocate dispatcher
 169  *              variables and structures based on the maximum priority.
 170  */
 171 static void
 172 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
 173 {
 174         pri_t   newnglobpris;
 175 
 176         ASSERT(MUTEX_HELD(&cpu_lock));
 177 
 178         newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
 179 
 180         if (newnglobpris > oldnglobpris) {
 181                 /*
 182                  * Allocate new kp queues for each CPU partition.
 183                  */
 184                 cpupart_kpqalloc(newnglobpris);
 185 
 186                 /*
 187                  * Allocate new dispatch queues for each CPU.
 188                  */
 189                 cpu_dispqalloc(newnglobpris);
 190 
 191                 /*
 192                  * compute new interrupt thread base priority
 193                  */
 194                 intr_pri = maxglobpri;
 195                 if (only_intr_kpreempt) {
 196                         kpreemptpri = intr_pri + 1;
 197                         if (kpqpri == KPQPRI)
 198                                 kpqpri = kpreemptpri;
 199                 }
 200                 v.v_nglobpris = newnglobpris;
 201         }
 202 }
 203 
 204 /*
 205  * dispinit - Called to initialize all loaded classes and the
 206  *            dispatcher framework.
 207  */
 208 void
 209 dispinit(void)
 210 {
 211         id_t    cid;
 212         pri_t   maxglobpri;
 213         pri_t   cl_maxglobpri;
 214 
 215         maxglobpri = -1;
 216 
 217         /*
 218          * Initialize transition lock, which will always be set.
 219          */
 220         DISP_LOCK_INIT(&transition_lock);
 221         disp_lock_enter_high(&transition_lock);
 222         DISP_LOCK_INIT(&stop_lock);
 223 
 224         mutex_enter(&cpu_lock);
 225         CPU->cpu_disp->disp_maxrunpri = -1;
 226         CPU->cpu_disp->disp_max_unbound_pri = -1;
 227 
 228         /*
 229          * Initialize the default CPU partition.
 230          */
 231         cpupart_initialize_default();
 232         /*
 233          * Call the class specific initialization functions for
 234          * all pre-installed schedulers.
 235          *
 236          * We pass the size of a class specific parameter
 237          * buffer to each of the initialization functions
 238          * to try to catch problems with backward compatibility
 239          * of class modules.
 240          *
 241          * For example a new class module running on an old system
 242          * which didn't provide sufficiently large parameter buffers
 243          * would be bad news. Class initialization modules can check for
 244          * this and take action if they detect a problem.
 245          */
 246 
 247         for (cid = 0; cid < nclass; cid++) {
 248                 sclass_t        *sc;
 249 
 250                 sc = &sclass[cid];
 251                 if (SCHED_INSTALLED(sc)) {
 252                         cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
 253                             &sc->cl_funcs);
 254                         if (cl_maxglobpri > maxglobpri)
 255                                 maxglobpri = cl_maxglobpri;
 256                 }
 257         }
 258         kpreemptpri = (pri_t)v.v_maxsyspri + 1;
 259         if (kpqpri == KPQPRI)
 260                 kpqpri = kpreemptpri;
 261 
 262         ASSERT(maxglobpri >= 0);
 263         disp_setup(maxglobpri, 0);
 264 
 265         mutex_exit(&cpu_lock);
 266 
 267         /*
 268          * Platform specific sticky scheduler setup.
 269          */
 270         if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
 271                 cmp_set_nosteal_interval();
 272 
 273         /*
 274          * Get the default class ID; this may be later modified via
 275          * dispadmin(1M).  This will load the class (normally TS) and that will
 276          * call disp_add(), which is why we had to drop cpu_lock first.
 277          */
 278         if (getcid(defaultclass, &defaultcid) != 0) {
 279                 cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
 280                     defaultclass);
 281         }
 282 }
 283 
 284 /*
 285  * disp_add - Called with class pointer to initialize the dispatcher
 286  *            for a newly loaded class.
 287  */
 288 void
 289 disp_add(sclass_t *clp)
 290 {
 291         pri_t   maxglobpri;
 292         pri_t   cl_maxglobpri;
 293 
 294         mutex_enter(&cpu_lock);
 295         /*
 296          * Initialize the scheduler class.
 297          */
 298         maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
 299         cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
 300         if (cl_maxglobpri > maxglobpri)
 301                 maxglobpri = cl_maxglobpri;
 302 
 303         /*
  304          * Save the old queue information.  Since we're initializing a
  305          * new scheduling class which has just been loaded, the size
  306          * of the dispq may have changed.  We need to handle that
  307          * here.
 308          */
 309         disp_setup(maxglobpri, v.v_nglobpris);
 310 
 311         mutex_exit(&cpu_lock);
 312 }
 313 
 314 
 315 /*
 316  * For each CPU, allocate new dispatch queues
 317  * with the stated number of priorities.
 318  */
 319 static void
 320 cpu_dispqalloc(int numpris)
 321 {
 322         cpu_t   *cpup;
 323         struct disp_queue_info  *disp_mem;
 324         int i, num;
 325 
 326         ASSERT(MUTEX_HELD(&cpu_lock));
 327 
 328         disp_mem = kmem_zalloc(NCPU *
 329             sizeof (struct disp_queue_info), KM_SLEEP);
 330 
 331         /*
  332          * This routine must allocate all of the memory before stopping
  333          * the CPUs because it must not sleep in kmem_alloc while the
  334          * CPUs are stopped.  Locks held by threads on those CPUs will
  335          * not be released until the CPUs are restarted.
 336          */
 337         i = 0;
 338         cpup = cpu_list;
 339         do {
 340                 disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
 341                 i++;
 342                 cpup = cpup->cpu_next;
 343         } while (cpup != cpu_list);
 344         num = i;
 345 
 346         pause_cpus(NULL, NULL);
 347         for (i = 0; i < num; i++)
 348                 disp_dq_assign(&disp_mem[i], numpris);
 349         start_cpus();
 350 
 351         /*
  352          * We must free all of the memory after starting the CPUs because
  353          * we cannot risk sleeping in kmem_free while the CPUs are stopped.
 354          */
 355         for (i = 0; i < num; i++)
 356                 disp_dq_free(&disp_mem[i]);
 357 
 358         kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
 359 }
 360 
 361 static void
 362 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp)
 363 {
 364         dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
 365         dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
 366             sizeof (long), KM_SLEEP);
 367         dptr->dp = dp;
 368 }
 369 
 370 static void
 371 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
 372 {
 373         disp_t  *dp;
 374 
 375         dp = dptr->dp;
 376         dptr->olddispq = dp->disp_q;
 377         dptr->olddqactmap = dp->disp_qactmap;
 378         dptr->oldnglobpris = dp->disp_npri;
 379 
 380         ASSERT(dptr->oldnglobpris < numpris);
 381 
 382         if (dptr->olddispq != NULL) {
 383                 /*
 384                  * Use kcopy because bcopy is platform-specific
 385                  * and could block while we might have paused the cpus.
 386                  */
 387                 (void) kcopy(dptr->olddispq, dptr->newdispq,
 388                     dptr->oldnglobpris * sizeof (dispq_t));
 389                 (void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
 390                     ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
 391                     sizeof (long));
 392         }
 393         dp->disp_q = dptr->newdispq;
 394         dp->disp_qactmap = dptr->newdqactmap;
 395         dp->disp_q_limit = &dptr->newdispq[numpris];
 396         dp->disp_npri = numpris;
 397 }
 398 
 399 static void
 400 disp_dq_free(struct disp_queue_info *dptr)
 401 {
 402         if (dptr->olddispq != NULL)
 403                 kmem_free(dptr->olddispq,
 404                     dptr->oldnglobpris * sizeof (dispq_t));
 405         if (dptr->olddqactmap != NULL)
 406                 kmem_free(dptr->olddqactmap,
 407                     ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
 408 }
 409 
 410 /*
 411  * For a newly created CPU, initialize the dispatch queue.
 412  * This is called before the CPU is known through cpu[] or on any lists.
 413  */
 414 void
 415 disp_cpu_init(cpu_t *cp)
 416 {
 417         disp_t  *dp;
 418         dispq_t *newdispq;
 419         ulong_t *newdqactmap;
 420 
 421         ASSERT(MUTEX_HELD(&cpu_lock));      /* protect dispatcher queue sizes */
 422 
 423         if (cp == cpu0_disp.disp_cpu)
 424                 dp = &cpu0_disp;
 425         else
 426                 dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
 427         bzero(dp, sizeof (disp_t));
 428         cp->cpu_disp = dp;
 429         dp->disp_cpu = cp;
 430         dp->disp_maxrunpri = -1;
 431         dp->disp_max_unbound_pri = -1;
 432         DISP_LOCK_INIT(&cp->cpu_thread_lock);
 433         /*
 434          * Allocate memory for the dispatcher queue headers
 435          * and the active queue bitmap.
 436          */
 437         newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
 438         newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
 439             sizeof (long), KM_SLEEP);
 440         dp->disp_q = newdispq;
 441         dp->disp_qactmap = newdqactmap;
 442         dp->disp_q_limit = &newdispq[v.v_nglobpris];
 443         dp->disp_npri = v.v_nglobpris;
 444 }
 445 
 446 void
 447 disp_cpu_fini(cpu_t *cp)
 448 {
 449         ASSERT(MUTEX_HELD(&cpu_lock));
 450 
 451         disp_kp_free(cp->cpu_disp);
 452         if (cp->cpu_disp != &cpu0_disp)
 453                 kmem_free(cp->cpu_disp, sizeof (disp_t));
 454 }
 455 
 456 /*
 457  * Allocate new, larger kpreempt dispatch queue to replace the old one.
 458  */
 459 void
 460 disp_kp_alloc(disp_t *dq, pri_t npri)
 461 {
 462         struct disp_queue_info  mem_info;
 463 
 464         if (npri > dq->disp_npri) {
 465                 /*
 466                  * Allocate memory for the new array.
 467                  */
 468                 disp_dq_alloc(&mem_info, npri, dq);
 469 
 470                 /*
 471                  * We need to copy the old structures to the new
 472                  * and free the old.
 473                  */
 474                 disp_dq_assign(&mem_info, npri);
 475                 disp_dq_free(&mem_info);
 476         }
 477 }
 478 
 479 /*
 480  * Free dispatch queue.
 481  * Used for the kpreempt queues for a removed CPU partition and
 482  * for the per-CPU queues of deleted CPUs.
 483  */
 484 void
 485 disp_kp_free(disp_t *dq)
 486 {
 487         struct disp_queue_info  mem_info;
 488 
 489         mem_info.olddispq = dq->disp_q;
 490         mem_info.olddqactmap = dq->disp_qactmap;
 491         mem_info.oldnglobpris = dq->disp_npri;
 492         disp_dq_free(&mem_info);
 493 }
 494 
 495 /*
 496  * End dispatcher and scheduler initialization.
 497  */
 498 
 499 /*
 500  * See if there's anything to do other than remain idle.
 501  * Return non-zero if there is.
 502  *
 503  * This function must be called with high spl, or with
 504  * kernel preemption disabled to prevent the partition's
 505  * active cpu list from changing while being traversed.
 506  *
 507  * This is essentially a simpler version of disp_getwork()
 508  * to be called by CPUs preparing to "halt".
 509  */
 510 int
 511 disp_anywork(void)
 512 {
 513         cpu_t           *cp = CPU;
 514         cpu_t           *ocp;
 515         volatile int    *local_nrunnable = &cp->cpu_disp->disp_nrunnable;
 516 
 517         if (!(cp->cpu_flags & CPU_OFFLINE)) {
 518                 if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
 519                         return (1);
 520 
 521                 for (ocp = cp->cpu_next_part; ocp != cp;
 522                     ocp = ocp->cpu_next_part) {
 523                         ASSERT(CPU_ACTIVE(ocp));
 524 
 525                         /*
 526                          * Something has appeared on the local run queue.
 527                          */
 528                         if (*local_nrunnable > 0)
 529                                 return (1);
 530                         /*
 531                          * If we encounter another idle CPU that will
  532                          * soon be trolling around through disp_anywork(),
 533                          * terminate our walk here and let this other CPU
 534                          * patrol the next part of the list.
 535                          */
 536                         if (ocp->cpu_dispatch_pri == -1 &&
 537                             (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0)
 538                                 return (0);
 539                         /*
 540                          * Work can be taken from another CPU if:
 541                          *      - There is unbound work on the run queue
  542                          *      - That work isn't a thread undergoing a
  543                          *        context switch on an otherwise empty queue.
 544                          *      - The CPU isn't running the idle loop.
 545                          */
 546                         if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
 547                             !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
 548                             ocp->cpu_disp->disp_nrunnable == 1) &&
 549                             ocp->cpu_dispatch_pri != -1)
 550                                 return (1);
 551                 }
 552         }
 553         return (0);
 554 }
 555 
 556 /*
 557  * Called when CPU enters the idle loop
 558  */
 559 static void
 560 idle_enter()
 561 {
 562         cpu_t           *cp = CPU;
 563 
 564         new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
 565         CPU_STATS_ADDQ(cp, sys, idlethread, 1);
 566         set_idle_cpu(cp->cpu_id);    /* arch-dependent hook */
 567 }
 568 
 569 /*
 570  * Called when CPU exits the idle loop
 571  */
 572 static void
 573 idle_exit()
 574 {
 575         cpu_t           *cp = CPU;
 576 
 577         new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
 578         unset_idle_cpu(cp->cpu_id);  /* arch-dependent hook */
 579 }
 580 
 581 /*
 582  * Idle loop.
 583  */
 584 void
 585 idle()
 586 {
 587         struct cpu      *cp = CPU;              /* pointer to this CPU */
 588         kthread_t       *t;                     /* taken thread */
 589 
 590         idle_enter();
 591 
 592         /*
 593          * Uniprocessor version of idle loop.
 594          * Do this until notified that we're on an actual multiprocessor.
 595          */
 596         while (ncpus == 1) {
 597                 if (cp->cpu_disp->disp_nrunnable == 0) {
 598                         (*idle_cpu)();
 599                         continue;
 600                 }
 601                 idle_exit();
 602                 swtch();
 603 
 604                 idle_enter(); /* returned from swtch */
 605         }
 606 
 607         /*
 608          * Multiprocessor idle loop.
 609          */
 610         for (;;) {
 611                 /*
 612                  * If CPU is completely quiesced by p_online(2), just wait
 613                  * here with minimal bus traffic until put online.
 614                  */
 615                 while (cp->cpu_flags & CPU_QUIESCED)
 616                         (*idle_cpu)();
 617 
 618                 if (cp->cpu_disp->disp_nrunnable != 0) {
 619                         idle_exit();
 620                         swtch();
 621                 } else {
 622                         if (cp->cpu_flags & CPU_OFFLINE)
 623                                 continue;
 624                         if ((t = disp_getwork(cp)) == NULL) {
 625                                 if (cp->cpu_chosen_level != -1) {
 626                                         disp_t *dp = cp->cpu_disp;
 627                                         disp_t *kpq;
 628 
 629                                         disp_lock_enter(&dp->disp_lock);
 630                                         /*
 631                                          * Set kpq under lock to prevent
 632                                          * migration between partitions.
 633                                          */
 634                                         kpq = &cp->cpu_part->cp_kp_queue;
 635                                         if (kpq->disp_maxrunpri == -1)
 636                                                 cp->cpu_chosen_level = -1;
 637                                         disp_lock_exit(&dp->disp_lock);
 638                                 }
 639                                 (*idle_cpu)();
 640                                 continue;
 641                         }
 642                         /*
 643                          * If there was a thread but we couldn't steal
 644                          * it, then keep trying.
 645                          */
 646                         if (t == T_DONTSTEAL)
 647                                 continue;
 648                         idle_exit();
 649                         swtch_to(t);
 650                 }
 651                 idle_enter(); /* returned from swtch/swtch_to */
 652         }
 653 }
 654 
 655 
 656 /*
 657  * Preempt the currently running thread in favor of the highest
 658  * priority thread.  The class of the current thread controls
 659  * where it goes on the dispatcher queues. If panicking, turn
 660  * preemption off.
 661  */
 662 void
 663 preempt()
 664 {
 665         kthread_t       *t = curthread;
 666         klwp_t          *lwp = ttolwp(curthread);
 667 
 668         if (panicstr)
 669                 return;
 670 
 671         TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
 672 
 673         thread_lock(t);
 674 
 675         if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
 676                 /*
 677                  * this thread has already been chosen to be run on
 678                  * another CPU. Clear kprunrun on this CPU since we're
 679                  * already headed for swtch().
 680                  */
 681                 CPU->cpu_kprunrun = 0;
 682                 thread_unlock_nopreempt(t);
 683                 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
 684         } else {
 685                 if (lwp != NULL)
 686                         lwp->lwp_ru.nivcsw++;
 687                 CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
 688                 THREAD_TRANSITION(t);
 689                 CL_PREEMPT(t);
 690                 DTRACE_SCHED(preempt);
 691                 thread_unlock_nopreempt(t);
 692 
 693                 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
 694 
 695                 swtch();                /* clears CPU->cpu_runrun via disp() */
 696         }
 697 }
 698 
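      /*
       * thread_unpin() releases the thread pinned by the running interrupt
       * thread and returns a pointer to it so that it can be resumed.
       */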
 699 extern kthread_t *thread_unpin();
 700 
 701 /*
 702  * disp() - find the highest priority thread for this processor to run, and
 703  * set it in TS_ONPROC state so that resume() can be called to run it.
 704  */
 705 static kthread_t *
 706 disp()
 707 {
 708         cpu_t           *cpup;
 709         disp_t          *dp;
 710         kthread_t       *tp;
 711         dispq_t         *dq;
 712         int             maxrunword;
 713         pri_t           pri;
 714         disp_t          *kpq;
 715 
 716         TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
 717 
 718         cpup = CPU;
 719         /*
 720          * Find the highest priority loaded, runnable thread.
 721          */
 722         dp = cpup->cpu_disp;
 723 
 724 reschedule:
 725         /*
 726          * If there is more important work on the global queue with a better
 727          * priority than the maximum on this CPU, take it now.
 728          */
 729         kpq = &cpup->cpu_part->cp_kp_queue;
 730         while ((pri = kpq->disp_maxrunpri) >= 0 &&
 731             pri >= dp->disp_maxrunpri &&
 732             (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
 733             (tp = disp_getbest(kpq)) != NULL) {
 734                 if (disp_ratify(tp, kpq) != NULL) {
 735                         TRACE_1(TR_FAC_DISP, TR_DISP_END,
 736                             "disp_end:tid %p", tp);
 737                         return (tp);
 738                 }
 739         }
 740 
 741         disp_lock_enter(&dp->disp_lock);
 742         pri = dp->disp_maxrunpri;
 743 
 744         /*
 745          * If there is nothing to run, look at what's runnable on other queues.
 746          * Choose the idle thread if the CPU is quiesced.
 747          * Note that CPUs that have the CPU_OFFLINE flag set can still run
 748          * interrupt threads, which will be the only threads on the CPU's own
 749          * queue, but cannot run threads from other queues.
 750          */
 751         if (pri == -1) {
 752                 if (!(cpup->cpu_flags & CPU_OFFLINE)) {
 753                         disp_lock_exit(&dp->disp_lock);
 754                         if ((tp = disp_getwork(cpup)) == NULL ||
 755                             tp == T_DONTSTEAL) {
 756                                 tp = cpup->cpu_idle_thread;
 757                                 (void) splhigh();
 758                                 THREAD_ONPROC(tp, cpup);
 759                                 cpup->cpu_dispthread = tp;
 760                                 cpup->cpu_dispatch_pri = -1;
 761                                 cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
 762                                 cpup->cpu_chosen_level = -1;
 763                         }
 764                 } else {
 765                         disp_lock_exit_high(&dp->disp_lock);
 766                         tp = cpup->cpu_idle_thread;
 767                         THREAD_ONPROC(tp, cpup);
 768                         cpup->cpu_dispthread = tp;
 769                         cpup->cpu_dispatch_pri = -1;
 770                         cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
 771                         cpup->cpu_chosen_level = -1;
 772                 }
 773                 TRACE_1(TR_FAC_DISP, TR_DISP_END,
 774                     "disp_end:tid %p", tp);
 775                 return (tp);
 776         }
 777 
 778         dq = &dp->disp_q[pri];
 779         tp = dq->dq_first;
 780 
 781         ASSERT(tp != NULL);
 782         ASSERT(tp->t_schedflag & TS_LOAD);       /* thread must be swapped in */
 783 
 784         DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
 785 
 786         /*
 787          * Found it so remove it from queue.
 788          */
 789         dp->disp_nrunnable--;
 790         dq->dq_sruncnt--;
 791         if ((dq->dq_first = tp->t_link) == NULL) {
 792                 ulong_t *dqactmap = dp->disp_qactmap;
 793 
 794                 ASSERT(dq->dq_sruncnt == 0);
 795                 dq->dq_last = NULL;
 796 
 797                 /*
 798                  * The queue is empty, so the corresponding bit needs to be
  799                  * turned off in dqactmap.  If nrunnable != 0, we just took
  800                  * the last runnable thread off the highest queue, so
  801                  * recompute disp_maxrunpri.
 802                  */
 803                 maxrunword = pri >> BT_ULSHIFT;
 804                 dqactmap[maxrunword] &= ~BT_BIW(pri);
 805 
 806                 if (dp->disp_nrunnable == 0) {
 807                         dp->disp_max_unbound_pri = -1;
 808                         dp->disp_maxrunpri = -1;
 809                 } else {
 810                         int ipri;
 811 
 812                         ipri = bt_gethighbit(dqactmap, maxrunword);
 813                         dp->disp_maxrunpri = ipri;
 814                         if (ipri < dp->disp_max_unbound_pri)
 815                                 dp->disp_max_unbound_pri = ipri;
 816                 }
 817         } else {
 818                 tp->t_link = NULL;
 819         }
 820 
 821         /*
 822          * Set TS_DONT_SWAP flag to prevent another processor from swapping
 823          * out this thread before we have a chance to run it.
 824          * While running, it is protected against swapping by t_lock.
 825          */
 826         tp->t_schedflag |= TS_DONT_SWAP;
 827         cpup->cpu_dispthread = tp;           /* protected by spl only */
 828         cpup->cpu_dispatch_pri = pri;
 829         ASSERT(pri == DISP_PRIO(tp));
 830         thread_onproc(tp, cpup);                /* set t_state to TS_ONPROC */
 831         disp_lock_exit_high(&dp->disp_lock);     /* drop run queue lock */
 832 
 833         ASSERT(tp != NULL);
 834         TRACE_1(TR_FAC_DISP, TR_DISP_END,
 835             "disp_end:tid %p", tp);
 836 
 837         if (disp_ratify(tp, kpq) == NULL)
 838                 goto reschedule;
 839 
 840         return (tp);
 841 }
 842 
 843 /*
 844  * swtch()
 845  *      Find best runnable thread and run it.
  846  *      Called with the current thread already switched to a new state
  847  *      (on a sleep queue, on a run queue, or stopped), and not zombied.
 848  *      May be called at any spl level less than or equal to LOCK_LEVEL.
 849  *      Always drops spl to the base level (spl0()).
 850  */
 851 void
 852 swtch()
 853 {
 854         kthread_t       *t = curthread;
 855         kthread_t       *next;
 856         cpu_t           *cp;
 857 
 858         TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
 859 
 860         if (t->t_flag & T_INTR_THREAD)
 861                 cpu_intr_swtch_enter(t);
 862 
 863         if (t->t_intr != NULL) {
 864                 /*
  865                  * We are an interrupt thread.  Set up and return
  866                  * the interrupted thread to be resumed.
 867                  */
 868                 (void) splhigh();       /* block other scheduler action */
 869                 cp = CPU;               /* now protected against migration */
 870                 ASSERT(CPU_ON_INTR(cp) == 0);   /* not called with PIL > 10 */
 871                 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
 872                 CPU_STATS_ADDQ(cp, sys, intrblk, 1);
 873                 next = thread_unpin();
 874                 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
 875                 resume_from_intr(next);
 876         } else {
 877 #ifdef  DEBUG
 878                 if (t->t_state == TS_ONPROC &&
 879                     t->t_disp_queue->disp_cpu == CPU &&
 880                     t->t_preempt == 0) {
 881                         thread_lock(t);
 882                         ASSERT(t->t_state != TS_ONPROC ||
 883                             t->t_disp_queue->disp_cpu != CPU ||
 884                             t->t_preempt != 0);      /* cannot migrate */
 885                         thread_unlock_nopreempt(t);
 886                 }
 887 #endif  /* DEBUG */
 888                 cp = CPU;
 889                 next = disp();          /* returns with spl high */
 890                 ASSERT(CPU_ON_INTR(cp) == 0);   /* not called with PIL > 10 */
 891 
 892                 /* OK to steal anything left on run queue */
 893                 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
 894 
 895                 if (next != t) {
 896                         hrtime_t now;
 897 
 898                         now = gethrtime_unscaled();
 899                         pg_ev_thread_swtch(cp, now, t, next);
 900 
 901                         /*
 902                          * If t was previously in the TS_ONPROC state,
 903                          * setfrontdq and setbackdq won't have set its t_waitrq.
 904                          * Since we now finally know that we're switching away
 905                          * from this thread, set its t_waitrq if it is on a run
 906                          * queue.
 907                          */
 908                         if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
 909                                 t->t_waitrq = now;
 910                         }
 911 
 912                         /*
 913                          * restore mstate of thread that we are switching to
 914                          */
 915                         restore_mstate(next);
 916 
 917                         CPU_STATS_ADDQ(cp, sys, pswitch, 1);
 918                         cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt();
 919                         TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
 920 
 921                         if (dtrace_vtime_active)
 922                                 dtrace_vtime_switch(next);
 923 
 924                         resume(next);
 925                         /*
 926                          * The TR_RESUME_END and TR_SWTCH_END trace points
 927                          * appear at the end of resume(), because we may not
 928                          * return here
 929                          */
 930                 } else {
 931                         if (t->t_flag & T_INTR_THREAD)
 932                                 cpu_intr_swtch_exit(t);
 933                         /*
 934                          * Threads that enqueue themselves on a run queue defer
 935                          * setting t_waitrq. It is then either set in swtch()
 936                          * when the CPU is actually yielded, or not at all if it
 937                          * is remaining on the CPU.
 938                          * There is however a window between where the thread
 939                          * placed itself on a run queue, and where it selects
  940                          * itself in disp(), where a third party (e.g. clock()
 941                          * doing tick processing) may have re-enqueued this
 942                          * thread, setting t_waitrq in the process. We detect
 943                          * this race by noticing that despite switching to
 944                          * ourself, our t_waitrq has been set, and should be
 945                          * cleared.
 946                          */
 947                         if (t->t_waitrq != 0)
 948                                 t->t_waitrq = 0;
 949 
 950                         pg_ev_thread_remain(cp, t);
 951 
 952                         DTRACE_SCHED(remain__cpu);
 953                         TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
 954                         (void) spl0();
 955                 }
 956         }
 957 }
 958 
 959 /*
 960  * swtch_from_zombie()
 961  *      Special case of swtch(), which allows checks for TS_ZOMB to be
 962  *      eliminated from normal resume.
 963  *      Find best runnable thread and run it.
 964  *      Called with the current thread zombied.
 965  *      Zombies cannot migrate, so CPU references are safe.
 966  */
 967 void
 968 swtch_from_zombie()
 969 {
 970         kthread_t       *next;
 971         cpu_t           *cpu = CPU;
 972 
 973         TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
 974 
 975         ASSERT(curthread->t_state == TS_ZOMB);
 976 
 977         next = disp();                  /* returns with spl high */
 978         ASSERT(CPU_ON_INTR(CPU) == 0);  /* not called with PIL > 10 */
 979         CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
 980         ASSERT(next != curthread);
 981         TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
 982 
 983         pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);
 984 
 985         restore_mstate(next);
 986 
 987         if (dtrace_vtime_active)
 988                 dtrace_vtime_switch(next);
 989 
 990         resume_from_zombie(next);
 991         /*
 992          * The TR_RESUME_END and TR_SWTCH_END trace points
 993          * appear at the end of resume(), because we certainly will not
 994          * return here
 995          */
 996 }
 997 
 998 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
 999 
1000 /*
1001  * search_disp_queues()
1002  *      Search the given dispatch queues for thread tp.
1003  *      Return 1 if tp is found, otherwise return 0.
1004  */
1005 static int
1006 search_disp_queues(disp_t *dp, kthread_t *tp)
1007 {
1008         dispq_t         *dq;
1009         dispq_t         *eq;
1010 
1011         disp_lock_enter_high(&dp->disp_lock);
1012 
1013         for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
1014                 kthread_t       *rp;
1015 
1016                 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1017 
1018                 for (rp = dq->dq_first; rp; rp = rp->t_link)
1019                         if (tp == rp) {
1020                                 disp_lock_exit_high(&dp->disp_lock);
1021                                 return (1);
1022                         }
1023         }
1024         disp_lock_exit_high(&dp->disp_lock);
1025 
1026         return (0);
1027 }
1028 
1029 /*
1030  * thread_on_queue()
1031  *      Search all per-CPU dispatch queues and all partition-wide kpreempt
1032  *      queues for thread tp. Return 1 if tp is found, otherwise return 0.
1033  */
1034 static int
1035 thread_on_queue(kthread_t *tp)
1036 {
1037         cpu_t           *cp;
1038         struct cpupart  *part;
1039 
1040         ASSERT(getpil() >= DISP_LEVEL);
1041 
1042         /*
1043          * Search the per-CPU dispatch queues for tp.
1044          */
1045         cp = CPU;
1046         do {
1047                 if (search_disp_queues(cp->cpu_disp, tp))
1048                         return (1);
1049         } while ((cp = cp->cpu_next_onln) != CPU);
1050 
1051         /*
1052          * Search the partition-wide kpreempt queues for tp.
1053          */
1054         part = CPU->cpu_part;
1055         do {
1056                 if (search_disp_queues(&part->cp_kp_queue, tp))
1057                         return (1);
1058         } while ((part = part->cp_next) != CPU->cpu_part);
1059 
1060         return (0);
1061 }
1062 
1063 #else
1064 
1065 #define thread_on_queue(tp)     0       /* ASSERT must be !thread_on_queue */
1066 
1067 #endif  /* DEBUG */
1068 
1069 /*
 1070  * Like swtch(), but switch to a specified thread taken from another CPU.
 1071  *      Called with spl high.
1072  */
1073 void
1074 swtch_to(kthread_t *next)
1075 {
1076         cpu_t                   *cp = CPU;
1077         hrtime_t                now;
1078 
1079         TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
1080 
1081         /*
1082          * Update context switch statistics.
1083          */
1084         CPU_STATS_ADDQ(cp, sys, pswitch, 1);
1085 
1086         TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
1087 
1088         now = gethrtime_unscaled();
1089         pg_ev_thread_swtch(cp, now, curthread, next);
1090 
1091         /* OK to steal anything left on run queue */
1092         cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
1093 
1094         /* record last execution time */
1095         cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt();
1096 
1097         /*
1098          * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
1099          * won't have set its t_waitrq.  Since we now finally know that we're
1100          * switching away from this thread, set its t_waitrq if it is on a run
1101          * queue.
1102          */
1103         if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
1104                 curthread->t_waitrq = now;
1105         }
1106 
1107         /* restore next thread to previously running microstate */
1108         restore_mstate(next);
1109 
1110         if (dtrace_vtime_active)
1111                 dtrace_vtime_switch(next);
1112 
1113         resume(next);
1114         /*
1115          * The TR_RESUME_END and TR_SWTCH_END trace points
1116          * appear at the end of resume(), because we may not
1117          * return here
1118          */
1119 }
1120 
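      /*
       * disp() sets cpu_dispatch_pri to -1 when a CPU selects its idle
       * thread, so a dispatch priority of -1 means the CPU is idling.
       */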
1121 #define CPU_IDLING(pri) ((pri) == -1)
1122 
1123 static void
1124 cpu_resched(cpu_t *cp, pri_t tpri)
1125 {
1126         int     call_poke_cpu = 0;
1127         pri_t   cpupri = cp->cpu_dispatch_pri;
1128 
1129         if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
1130                 TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1131                     "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1132                 if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1133                         cp->cpu_runrun = 1;
1134                         aston(cp->cpu_dispthread);
1135                         if (tpri < kpreemptpri && cp != CPU)
1136                                 call_poke_cpu = 1;
1137                 }
1138                 if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1139                         cp->cpu_kprunrun = 1;
1140                         if (cp != CPU)
1141                                 call_poke_cpu = 1;
1142                 }
1143         }
1144 
1145         /*
 1146          * Propagate cpu_runrun and cpu_kprunrun to global visibility.
1147          */
1148         membar_enter();
1149 
1150         if (call_poke_cpu)
1151                 poke_cpu(cp->cpu_id);
1152 }
1153 
1154 /*
1155  * setbackdq() keeps runqs balanced such that the difference in length
1156  * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
 1157  * For threads with priorities below RUNQ_MATCH_PRI levels, the runq lengths
1158  * must match.  When per-thread TS_RUNQMATCH flag is set, setbackdq() will
1159  * try to keep runqs perfectly balanced regardless of the thread priority.
1160  */
1161 #define RUNQ_MATCH_PRI  16      /* pri below which queue lengths must match */
1162 #define RUNQ_MAX_DIFF   2       /* maximum runq length difference */
1163 #define RUNQ_LEN(cp, pri)       ((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
1164 
1165 /*
1166  * Macro that evaluates to true if it is likely that the thread has cache
1167  * warmth. This is based on the amount of time that has elapsed since the
1168  * thread last ran. If that amount of time is less than "rechoose_interval"
1169  * ticks, then we decide that the thread has enough cache warmth to warrant
1170  * some affinity for t->t_cpu.
1171  */
1172 #define THREAD_HAS_CACHE_WARMTH(thread) \
1173         ((thread == curthread) ||       \
1174         ((ddi_get_lbolt() - thread->t_disp_time) <= rechoose_interval))
1175 /*
1176  * Put the specified thread on the back of the dispatcher
1177  * queue corresponding to its current priority.
1178  *
1179  * Called with the thread in transition, onproc or stopped state
1180  * and locked (transition implies locked) and at high spl.
1181  * Returns with the thread in TS_RUN state and still locked.
1182  */
1183 void
1184 setbackdq(kthread_t *tp)
1185 {
1186         dispq_t *dq;
1187         disp_t          *dp;
1188         cpu_t           *cp;
1189         pri_t           tpri;
1190         int             bound;
1191         boolean_t       self;
1192 
1193         ASSERT(THREAD_LOCK_HELD(tp));
1194         ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1195         ASSERT(!thread_on_queue(tp));   /* make sure tp isn't on a runq */
1196 
1197         /*
1198          * If thread is "swapped" or on the swap queue don't
1199          * queue it, but wake sched.
1200          */
1201         if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1202                 disp_swapped_setrun(tp);
1203                 return;
1204         }
1205 
1206         self = (tp == curthread);
1207 
1208         if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1209                 bound = 1;
1210         else
1211                 bound = 0;
1212 
1213         tpri = DISP_PRIO(tp);
1214         if (ncpus == 1)
1215                 cp = tp->t_cpu;
1216         else if (!bound) {
1217                 if (tpri >= kpqpri) {
1218                         setkpdq(tp, SETKP_BACK);
1219                         return;
1220                 }
1221 
1222                 /*
1223                  * We'll generally let this thread continue to run where
1224                  * it last ran...but will consider migration if:
 1225                  * - The thread probably doesn't have much cache warmth.
1226                  * - The CPU where it last ran is the target of an offline
1227                  *   request.
 1228                  * - The thread last ran outside its home lgroup.
1229                  */
1230                 if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
1231                     (tp->t_cpu == cpu_inmotion)) {
1232                         cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, NULL);
1233                 } else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
1234                         cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1235                             self ? tp->t_cpu : NULL);
1236                 } else {
1237                         cp = tp->t_cpu;
1238                 }
1239 
1240                 if (tp->t_cpupart == cp->cpu_part) {
1241                         int     qlen;
1242 
1243                         /*
1244                          * Perform any CMT load balancing
1245                          */
1246                         cp = cmt_balance(tp, cp);
1247 
1248                         /*
1249                          * Balance across the run queues
1250                          */
1251                         qlen = RUNQ_LEN(cp, tpri);
1252                         if (tpri >= RUNQ_MATCH_PRI &&
1253                             !(tp->t_schedflag & TS_RUNQMATCH))
1254                                 qlen -= RUNQ_MAX_DIFF;
1255                         if (qlen > 0) {
1256                                 cpu_t *newcp;
1257 
1258                                 if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1259                                         newcp = cp->cpu_next_part;
1260                                 } else if ((newcp = cp->cpu_next_lpl) == cp) {
1261                                         newcp = cp->cpu_next_part;
1262                                 }
1263 
1264                                 if (RUNQ_LEN(newcp, tpri) < qlen) {
1265                                         DTRACE_PROBE3(runq__balance,
1266                                             kthread_t *, tp,
1267                                             cpu_t *, cp, cpu_t *, newcp);
1268                                         cp = newcp;
1269                                 }
1270                         }
1271                 } else {
1272                         /*
1273                          * Migrate to a cpu in the new partition.
1274                          */
1275                         cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1276                             tp->t_lpl, tp->t_pri, NULL);
1277                 }
1278                 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1279         } else {
1280                 /*
1281                  * It is possible that t_weakbound_cpu != t_bound_cpu (for
1282                  * a short time until weak binding that existed when the
1283                  * strong binding was established has dropped) so we must
1284                  * favour weak binding over strong.
1285                  */
1286                 cp = tp->t_weakbound_cpu ?
1287                     tp->t_weakbound_cpu : tp->t_bound_cpu;
1288         }
1289         /*
1290          * A thread that is ONPROC may be temporarily placed on the run queue
1291          * but then chosen to run again by disp.  If the thread we're placing on
1292          * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1293          * replacement process is actually scheduled in swtch().  In this
1294          * situation, curthread is the only thread that could be in the ONPROC
1295          * state.
1296          */
1297         if ((!self) && (tp->t_waitrq == 0)) {
1298                 hrtime_t curtime;
1299 
1300                 curtime = gethrtime_unscaled();
1301                 (void) cpu_update_pct(tp, curtime);
1302                 tp->t_waitrq = curtime;
1303         } else {
1304                 (void) cpu_update_pct(tp, gethrtime_unscaled());
1305         }
1306 
1307         dp = cp->cpu_disp;
1308         disp_lock_enter_high(&dp->disp_lock);
1309 
1310         DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
1311         TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
1312             tpri, cp, tp);
1313 
1314 #ifndef NPROBE
1315         /* Kernel probe */
1316         if (tnf_tracing_active)
1317                 tnf_thread_queue(tp, cp, tpri);
1318 #endif /* NPROBE */
1319 
1320         ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1321 
1322         THREAD_RUN(tp, &dp->disp_lock);          /* set t_state to TS_RUN */
1323         tp->t_disp_queue = dp;
1324         tp->t_link = NULL;
1325 
1326         dq = &dp->disp_q[tpri];
1327         dp->disp_nrunnable++;
1328         if (!bound)
1329                 dp->disp_steal = 0;
1330         membar_enter();
1331 
1332         if (dq->dq_sruncnt++ != 0) {
1333                 ASSERT(dq->dq_first != NULL);
1334                 dq->dq_last->t_link = tp;
1335                 dq->dq_last = tp;
1336         } else {
1337                 ASSERT(dq->dq_first == NULL);
1338                 ASSERT(dq->dq_last == NULL);
1339                 dq->dq_first = dq->dq_last = tp;
1340                 BT_SET(dp->disp_qactmap, tpri);
1341                 if (tpri > dp->disp_maxrunpri) {
1342                         dp->disp_maxrunpri = tpri;
1343                         membar_enter();
1344                         cpu_resched(cp, tpri);
1345                 }
1346         }
1347 
1348         if (!bound && tpri > dp->disp_max_unbound_pri) {
1349                 if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) {
1350                         /*
1351                          * If there are no other unbound threads on the
1352                          * run queue, don't allow other CPUs to steal
1353                          * this thread while we are in the middle of a
1354                          * context switch. We may just switch to it
1355                          * again right away. CPU_DISP_DONTSTEAL is cleared
1356                          * in swtch and swtch_to.
1357                          */
1358                         cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1359                 }
1360                 dp->disp_max_unbound_pri = tpri;
1361         }
1362         (*disp_enq_thread)(cp, bound);
1363 }
1364 
1365 /*
1366  * Put the specified thread on the front of the dispatcher
1367  * queue corresponding to its current priority.
1368  *
1369  * Called with the thread in transition, onproc or stopped state
1370  * and locked (transition implies locked) and at high spl.
1371  * Returns with the thread in TS_RUN state and still locked.
1372  */
1373 void
1374 setfrontdq(kthread_t *tp)
1375 {
1376         disp_t          *dp;
1377         dispq_t         *dq;
1378         cpu_t           *cp;
1379         pri_t           tpri;
1380         int             bound;
1381 
1382         ASSERT(THREAD_LOCK_HELD(tp));
1383         ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1384         ASSERT(!thread_on_queue(tp));   /* make sure tp isn't on a runq */
1385 
1386         /*
1387          * If thread is "swapped" or on the swap queue don't
1388          * queue it, but wake sched.
1389          */
1390         if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1391                 disp_swapped_setrun(tp);
1392                 return;
1393         }
1394 
1395         if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1396                 bound = 1;
1397         else
1398                 bound = 0;
1399 
1400         tpri = DISP_PRIO(tp);
1401         if (ncpus == 1)
1402                 cp = tp->t_cpu;
1403         else if (!bound) {
1404                 if (tpri >= kpqpri) {
1405                         setkpdq(tp, SETKP_FRONT);
1406                         return;
1407                 }
1408                 cp = tp->t_cpu;
1409                 if (tp->t_cpupart == cp->cpu_part) {
1410                         /*
1411                          * We'll generally let this thread continue to run
1412                          * where it last ran, but will consider migration if:
 1413                          * - The thread last ran outside its home lgroup.
1414                          * - The CPU where it last ran is the target of an
1415                          *   offline request (a thread_nomigrate() on the in
1416                          *   motion CPU relies on this when forcing a preempt).
1417                          * - The thread isn't the highest priority thread where
1418                          *   it last ran, and it is considered not likely to
1419                          *   have significant cache warmth.
1420                          */
1421                         if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) ||
1422                             (cp == cpu_inmotion)) {
1423                                 cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1424                                     (tp == curthread) ? cp : NULL);
1425                         } else if ((tpri < cp->cpu_disp->disp_maxrunpri) &&
1426                             (!THREAD_HAS_CACHE_WARMTH(tp))) {
1427                                 cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1428                                     NULL);
1429                         }
1430                 } else {
1431                         /*
1432                          * Migrate to a cpu in the new partition.
1433                          */
1434                         cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1435                             tp->t_lpl, tp->t_pri, NULL);
1436                 }
1437                 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1438         } else {
1439                 /*
1440                  * It is possible that t_weakbound_cpu != t_bound_cpu (for
1441                  * a short time until weak binding that existed when the
1442                  * strong binding was established has dropped) so we must
1443                  * favour weak binding over strong.
1444                  */
1445                 cp = tp->t_weakbound_cpu ?
1446                     tp->t_weakbound_cpu : tp->t_bound_cpu;
1447         }
1448 
1449         /*
1450          * A thread that is ONPROC may be temporarily placed on the run queue
1451          * but then chosen to run again by disp.  If the thread we're placing on
1452          * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1453          * replacement thread is actually scheduled in swtch().  In this
1454          * situation, curthread is the only thread that could be in the ONPROC
1455          * state.
1456          */
1457         if ((tp != curthread) && (tp->t_waitrq == 0)) {
1458                 hrtime_t curtime;
1459 
1460                 curtime = gethrtime_unscaled();
1461                 (void) cpu_update_pct(tp, curtime);
1462                 tp->t_waitrq = curtime;
1463         } else {
1464                 (void) cpu_update_pct(tp, gethrtime_unscaled());
1465         }
1466 
1467         dp = cp->cpu_disp;
1468         disp_lock_enter_high(&dp->disp_lock);
1469 
1470         TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1471         DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);
1472 
1473 #ifndef NPROBE
1474         /* Kernel probe */
1475         if (tnf_tracing_active)
1476                 tnf_thread_queue(tp, cp, tpri);
1477 #endif /* NPROBE */
1478 
1479         ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1480 
1481         THREAD_RUN(tp, &dp->disp_lock);          /* set TS_RUN state and lock */
1482         tp->t_disp_queue = dp;
1483 
1484         dq = &dp->disp_q[tpri];
1485         dp->disp_nrunnable++;
1486         if (!bound)
1487                 dp->disp_steal = 0;
1488         membar_enter();
1489 
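             /*
              * dq_sruncnt counts the loaded, runnable threads at this
              * priority.  A non-zero pre-increment value below means the
              * queue is already non-empty, so we simply link in at the
              * front; otherwise we also update the active-queue bitmap and,
              * if needed, disp_maxrunpri.
              */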
1490         if (dq->dq_sruncnt++ != 0) {
1491                 ASSERT(dq->dq_last != NULL);
1492                 tp->t_link = dq->dq_first;
1493                 dq->dq_first = tp;
1494         } else {
1495                 ASSERT(dq->dq_last == NULL);
1496                 ASSERT(dq->dq_first == NULL);
1497                 tp->t_link = NULL;
1498                 dq->dq_first = dq->dq_last = tp;
1499                 BT_SET(dp->disp_qactmap, tpri);
1500                 if (tpri > dp->disp_maxrunpri) {
1501                         dp->disp_maxrunpri = tpri;
1502                         membar_enter();
1503                         cpu_resched(cp, tpri);
1504                 }
1505         }
1506 
1507         if (!bound && tpri > dp->disp_max_unbound_pri) {
1508                 if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
1509                     cp == CPU) {
1510                         /*
1511                          * If there are no other unbound threads on the
1512                          * run queue, don't allow other CPUs to steal
1513                          * this thread while we are in the middle of a
1514                          * context switch. We may just switch to it
1515                          * again right away. CPU_DISP_DONTSTEAL is cleared
1516                          * in swtch and swtch_to.
1517                          */
1518                         cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1519                 }
1520                 dp->disp_max_unbound_pri = tpri;
1521         }
1522         (*disp_enq_thread)(cp, bound);
1523 }
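
     /*
      * Illustrative sketch of the calling protocol for setfrontdq() (a
      * hypothetical caller; 'newpri' is made up for the example).  Real
      * callers, such as disp_ratify() and dq_sruninc() below, follow the
      * same pattern:
      *
      *      thread_lock(tp);
      *      THREAD_TRANSITION(tp);          place tp in transition state
      *      tp->t_pri = newpri;             e.g. a recomputed class priority
      *      setfrontdq(tp);                 returns with tp in TS_RUN, locked
      *      thread_unlock(tp);
      */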
1524 
1525 /*
1526  * Put a high-priority unbound thread on the kp queue
1527  */
1528 static void
1529 setkpdq(kthread_t *tp, int borf)
1530 {
1531         dispq_t *dq;
1532         disp_t  *dp;
1533         cpu_t   *cp;
1534         pri_t   tpri;
1535 
1536         tpri = DISP_PRIO(tp);
1537 
1538         dp = &tp->t_cpupart->cp_kp_queue;
1539         disp_lock_enter_high(&dp->disp_lock);
1540 
1541         TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1542 
1543         ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1544         DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
1545         THREAD_RUN(tp, &dp->disp_lock);          /* set t_state to TS_RUN */
1546         tp->t_disp_queue = dp;
1547         dp->disp_nrunnable++;
1548         dq = &dp->disp_q[tpri];
1549 
1550         if (dq->dq_sruncnt++ != 0) {
1551                 if (borf == SETKP_BACK) {
1552                         ASSERT(dq->dq_first != NULL);
1553                         tp->t_link = NULL;
1554                         dq->dq_last->t_link = tp;
1555                         dq->dq_last = tp;
1556                 } else {
1557                         ASSERT(dq->dq_last != NULL);
1558                         tp->t_link = dq->dq_first;
1559                         dq->dq_first = tp;
1560                 }
1561         } else {
1562                 if (borf == SETKP_BACK) {
1563                         ASSERT(dq->dq_first == NULL);
1564                         ASSERT(dq->dq_last == NULL);
1565                         dq->dq_first = dq->dq_last = tp;
1566                 } else {
1567                         ASSERT(dq->dq_last == NULL);
1568                         ASSERT(dq->dq_first == NULL);
1569                         tp->t_link = NULL;
1570                         dq->dq_first = dq->dq_last = tp;
1571                 }
1572                 BT_SET(dp->disp_qactmap, tpri);
1573                 if (tpri > dp->disp_max_unbound_pri)
1574                         dp->disp_max_unbound_pri = tpri;
1575                 if (tpri > dp->disp_maxrunpri) {
1576                         dp->disp_maxrunpri = tpri;
1577                         membar_enter();
1578                 }
1579         }
1580 
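             /*
              * The thread is now on the partition-wide kp queue; next,
              * nominate a CPU to go run it.  Prefer the CPU it last ran on
              * if that CPU is still in the thread's partition, otherwise
              * start from any CPU in the new partition, then refine the
              * choice to the lowest-priority CPU and ask it to reschedule.
              */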
1581         cp = tp->t_cpu;
1582         if (tp->t_cpupart != cp->cpu_part) {
1583                 /* migrate to a cpu in the new partition */
1584                 cp = tp->t_cpupart->cp_cpulist;
1585         }
1586         cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
1587         disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1588         ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1589 
1590 #ifndef NPROBE
1591         /* Kernel probe */
1592         if (tnf_tracing_active)
1593                 tnf_thread_queue(tp, cp, tpri);
1594 #endif /* NPROBE */
1595 
1596         if (cp->cpu_chosen_level < tpri)
1597                 cp->cpu_chosen_level = tpri;
1598         cpu_resched(cp, tpri);
1599         disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1600         (*disp_enq_thread)(cp, 0);
1601 }
1602 
1603 /*
1604  * Remove a thread from the dispatcher queue if it is on it.
1605  * It is not an error if the thread is not found; we return whether
1606  * or not it was found so the caller can check.
1607  */
1608 int
1609 dispdeq(kthread_t *tp)
1610 {
1611         disp_t          *dp;
1612         dispq_t         *dq;
1613         kthread_t       *rp;
1614         kthread_t       *trp;
1615         kthread_t       **ptp;
1616         int             tpri;
1617 
1618         ASSERT(THREAD_LOCK_HELD(tp));
1619 
1620         if (tp->t_state != TS_RUN)
1621                 return (0);
1622 
1623         /*
1624          * The thread is "swapped" or is on the swap queue and
1625          * hence no longer on the run queue, so return true.
1626          */
1627         if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
1628                 return (1);
1629 
1630         tpri = DISP_PRIO(tp);
1631         dp = tp->t_disp_queue;
1632         ASSERT(tpri < dp->disp_npri);
1633         dq = &dp->disp_q[tpri];
1634         ptp = &dq->dq_first;
1635         rp = *ptp;
1636         trp = NULL;
1637 
1638         ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1639 
1640         /*
1641          * Search for thread in queue.
1642          * Double links would simplify this at the expense of disp/setrun.
1643          */
1644         while (rp != tp && rp != NULL) {
1645                 trp = rp;
1646                 ptp = &trp->t_link;
1647                 rp = trp->t_link;
1648         }
1649 
1650         if (rp == NULL) {
1651                 panic("dispdeq: thread not on queue");
1652         }
1653 
1654         DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
1655 
1656         /*
1657          * Found it so remove it from queue.
1658          */
1659         if ((*ptp = rp->t_link) == NULL)
1660                 dq->dq_last = trp;
1661 
1662         dp->disp_nrunnable--;
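             /*
              * If that was the last loaded, runnable thread at this priority,
              * clear the priority's bit in the active-queue bitmap and, if
              * necessary, recompute disp_maxrunpri (and cap
              * disp_max_unbound_pri) from the bitmap.
              */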
1663         if (--dq->dq_sruncnt == 0) {
1664                 dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
1665                 if (dp->disp_nrunnable == 0) {
1666                         dp->disp_max_unbound_pri = -1;
1667                         dp->disp_maxrunpri = -1;
1668                 } else if (tpri == dp->disp_maxrunpri) {
1669                         int ipri;
1670 
1671                         ipri = bt_gethighbit(dp->disp_qactmap,
1672                             dp->disp_maxrunpri >> BT_ULSHIFT);
1673                         if (ipri < dp->disp_max_unbound_pri)
1674                                 dp->disp_max_unbound_pri = ipri;
1675                         dp->disp_maxrunpri = ipri;
1676                 }
1677         }
1678         tp->t_link = NULL;
1679         THREAD_TRANSITION(tp);          /* put in intermediate state */
1680         return (1);
1681 }
1682 
1683 
1684 /*
1685  * dq_sruninc and dq_srundec are public functions for
1686  * incrementing/decrementing the sruncnts when a thread on
1687  * a dispatcher queue is made schedulable/unschedulable by
1688  * setting or clearing the TS_LOAD flag.
1689  *
1690  * The caller MUST hold the thread lock, and therefore the dispatcher
1691  * queue lock, so that the operation which changes the flag, the
1692  * operation that checks whether the thread is on a dispatch queue,
1693  * AND the call to this function are one atomic operation with
1694  * respect to interrupts.
1695  */
1696 
1697 /*
1698  * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
1699  */
1700 void
1701 dq_sruninc(kthread_t *t)
1702 {
1703         ASSERT(t->t_state == TS_RUN);
1704         ASSERT(t->t_schedflag & TS_LOAD);
1705 
1706         THREAD_TRANSITION(t);
1707         setfrontdq(t);
1708 }
1709 
1710 /*
1711  * See comment on calling conventions above.
1712  * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
1713  */
1714 void
1715 dq_srundec(kthread_t *t)
1716 {
1717         ASSERT(t->t_schedflag & TS_LOAD);
1718 
1719         (void) dispdeq(t);
1720         disp_swapped_enq(t);
1721 }
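
     /*
      * Illustrative sketch (hypothetical; the actual caller is sched, the
      * swapper) of the atomic sequence required by the calling convention
      * described above, for the dq_sruninc() case:
      *
      *      thread_lock(t);                 also locks t's dispatcher queue
      *      t->t_schedflag |= TS_LOAD;      the thread has been swapped in
      *      if (t->t_state == TS_RUN)
      *              dq_sruninc(t);          requeue it as loaded and runnable
      *      thread_unlock(t);
      */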
1722 
1723 /*
1724  * Change the dispatcher lock of the thread to the "swapped_lock"
1725  * and return with the thread lock still held.
1726  *
1727  * Called with thread_lock held, in transition state, and at high spl.
1728  */
1729 void
1730 disp_swapped_enq(kthread_t *tp)
1731 {
1732         ASSERT(THREAD_LOCK_HELD(tp));
1733         ASSERT(tp->t_schedflag & TS_LOAD);
1734 
1735         switch (tp->t_state) {
1736         case TS_RUN:
1737                 disp_lock_enter_high(&swapped_lock);
1738                 THREAD_SWAP(tp, &swapped_lock);     /* set TS_RUN state and lock */
1739                 break;
1740         case TS_ONPROC:
1741                 disp_lock_enter_high(&swapped_lock);
1742                 THREAD_TRANSITION(tp);
1743                 wake_sched_sec = 1;             /* tell clock to wake sched */
1744                 THREAD_SWAP(tp, &swapped_lock);     /* set TS_RUN state and lock */
1745                 break;
1746         default:
1747                 panic("disp_swapped: tp: %p bad t_state", (void *)tp);
1748         }
1749 }
1750 
1751 /*
1752  * This routine is called by setbackdq/setfrontdq if the thread is
1753  * not loaded, or is loaded but on the swap queue.
1754  *
1755  * Thread state TS_SLEEP implies that a swapped thread
1756  * has been woken up and needs to be swapped in by the swapper.
1757  *
1758  * Thread state TS_RUN implies that the priority of a swapped
1759  * thread is being increased by its scheduling class (e.g. ts_update).
1760  */
1761 static void
1762 disp_swapped_setrun(kthread_t *tp)
1763 {
1764         ASSERT(THREAD_LOCK_HELD(tp));
1765         ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
1766 
1767         switch (tp->t_state) {
1768         case TS_SLEEP:
1769                 disp_lock_enter_high(&swapped_lock);
1770                 /*
1771                  * Wake up sched immediately (i.e., next tick) if the
1772                  * thread priority is above maxclsyspri.
1773                  */
1774                 if (DISP_PRIO(tp) > maxclsyspri)
1775                         wake_sched = 1;
1776                 else
1777                         wake_sched_sec = 1;
1778                 THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
1779                 break;
1780         case TS_RUN:                            /* called from ts_update */
1781                 break;
1782         default:
1783                 panic("disp_swapped_setrun: tp: %p bad t_state", (void *)tp);
1784         }
1785 }
1786 
1787 /*
1788  *      Make a thread give up its processor.  Find the processor on
1789  *      which this thread is executing, and have that processor
1790  *      preempt.
1791  *
1792  *      We allow System Duty Cycle (SDC) threads to be preempted even if
1793  *      they are running at kernel priorities.  To implement this, we always
1794  *      set cpu_kprunrun; this ensures preempt() will be called.  Since SDC
1795  *      calls cpu_surrender() very often, we only preempt if there is anyone
1796  *      competing with us.
1797  */
1798 void
1799 cpu_surrender(kthread_t *tp)
1800 {
1801         cpu_t   *cpup;
1802         int     max_pri;
1803         int     max_run_pri;
1804         klwp_t  *lwp;
1805 
1806         ASSERT(THREAD_LOCK_HELD(tp));
1807 
1808         if (tp->t_state != TS_ONPROC)
1809                 return;
1810         cpup = tp->t_disp_queue->disp_cpu;        /* CPU thread dispatched to */
1811         max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
1812         max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
1813         if (max_pri < max_run_pri)
1814                 max_pri = max_run_pri;
1815 
1816         if (tp->t_cid == sysdccid) {
1817                 uint_t t_pri = DISP_PRIO(tp);
1818                 if (t_pri > max_pri)
1819                         return;         /* we are not competing w/ anyone */
1820                 cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
1821         } else {
1822                 cpup->cpu_runrun = 1;
1823                 if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
1824                         cpup->cpu_kprunrun = 1;
1825                 }
1826         }
1827 
1828         /*
1829          * Propagate cpu_runrun and cpu_kprunrun to global visibility.
1830          */
1831         membar_enter();
1832 
1833         DTRACE_SCHED1(surrender, kthread_t *, tp);
1834 
1835         /*
1836          * Make the target thread take an excursion through trap()
1837          * to do preempt() (unless we're already in trap or post_syscall,
1838          * calling cpu_surrender via CL_TRAPRET).
1839          */
1840         if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
1841             lwp->lwp_state != LWP_USER) {
1842                 aston(tp);
1843                 if (cpup != CPU)
1844                         poke_cpu(cpup->cpu_id);
1845         }
1846         TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
1847             "cpu_surrender:tid %p cpu %p", tp, cpup);
1848 }
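
     /*
      * Illustrative sketch (hypothetical caller; 'newpri' is made up) of how
      * a scheduling class typically uses cpu_surrender() after lowering the
      * priority of a thread that is currently on a processor:
      *
      *      thread_lock(tp);
      *      tp->t_pri = newpri;             class recomputed the priority
      *      cpu_surrender(tp);              no-op unless tp is TS_ONPROC
      *      thread_unlock(tp);
      */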
1849 
1850 /*
1851  * Commit to and ratify a scheduling decision
1852  */
1853 /*ARGSUSED*/
1854 static kthread_t *
1855 disp_ratify(kthread_t *tp, disp_t *kpq)
1856 {
1857         pri_t   tpri, maxpri;
1858         pri_t   maxkpri;
1859         cpu_t   *cpup;
1860 
1861         ASSERT(tp != NULL);
1862         /*
1863          * Commit to, then ratify scheduling decision
1864          */
1865         cpup = CPU;
1866         if (cpup->cpu_runrun != 0)
1867                 cpup->cpu_runrun = 0;
1868         if (cpup->cpu_kprunrun != 0)
1869                 cpup->cpu_kprunrun = 0;
1870         if (cpup->cpu_chosen_level != -1)
1871                 cpup->cpu_chosen_level = -1;
1872         membar_enter();
1873         tpri = DISP_PRIO(tp);
1874         maxpri = cpup->cpu_disp->disp_maxrunpri;
1875         maxkpri = kpq->disp_maxrunpri;
1876         if (maxpri < maxkpri)
1877                 maxpri = maxkpri;
1878         if (tpri < maxpri) {
1879                 /*
1880                  * We should have done better; put this one back and
1881                  * indicate that the caller should try again.
1882                  */
1883                 cpup->cpu_dispthread = curthread;    /* fixup dispthread */
1884                 cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
1885                 thread_lock_high(tp);
1886                 THREAD_TRANSITION(tp);
1887                 setfrontdq(tp);
1888                 thread_unlock_nopreempt(tp);
1889 
1890                 tp = NULL;
1891         }
1892         return (tp);
1893 }
1894 
1895 /*
1896  * See if there is any work on the dispatcher queues of other CPUs.
1897  * If there is, dequeue the best thread and return.
1898  */
1899 static kthread_t *
1900 disp_getwork(cpu_t *cp)
1901 {
1902         cpu_t           *ocp;           /* other CPU */
1903         cpu_t           *ocp_start;
1904         cpu_t           *tcp;           /* target local CPU */
1905         kthread_t       *tp;
1906         kthread_t       *retval = NULL;
1907         pri_t           maxpri;
1908         disp_t          *kpq;           /* kp queue for this partition */
1909         lpl_t           *lpl, *lpl_leaf;
1910         int             leafidx, startidx;
1911         hrtime_t        stealtime;
1912         lgrp_id_t       local_id;
1913 
1914         maxpri = -1;
1915         tcp = NULL;
1916 
1917         kpq = &cp->cpu_part->cp_kp_queue;
1918         while (kpq->disp_maxrunpri >= 0) {
1919                 /*
1920                  * Try to take a thread from the kp_queue.
1921                  */
1922                 tp = disp_getbest(kpq);
1923                 if (tp)
1924                         return (disp_ratify(tp, kpq));
1925         }
1926 
1927         kpreempt_disable();             /* protect the cpu_active list */
1928 
1929         /*
1930          * Try to find something to do on another CPU's run queue.
1931          * Loop through all other CPUs looking for the one with the highest
1932          * priority unbound thread.
1933          *
1934          * On NUMA machines, the partition's CPUs are consulted in order of
1935          * distance from the current CPU. This way, the first available
1936          * work found is also the closest, and will suffer the least
1937          * from being migrated.
1938          */
1939         lpl = lpl_leaf = cp->cpu_lpl;
1940         local_id = lpl_leaf->lpl_lgrpid;
1941         leafidx = startidx = 0;
1942 
1943         /*
1944          * This loop traverses the lpl hierarchy. Higher level lpls represent
1945          * broader levels of locality.
1946          */
1947         do {
1948                 /* This loop iterates over the lpl's leaves */
1949                 do {
1950                         if (lpl_leaf != cp->cpu_lpl)
1951                                 ocp = lpl_leaf->lpl_cpus;
1952                         else
1953                                 ocp = cp->cpu_next_lpl;
1954 
1955                         /* This loop iterates over the CPUs in the leaf */
1956                         ocp_start = ocp;
1957                         do {
1958                                 pri_t pri;
1959 
1960                                 ASSERT(CPU_ACTIVE(ocp));
1961 
1962                                 /*
1963                                  * End our stroll around this lpl if:
1964                                  *
1965                                  * - Something became runnable on the local
1966                                  *   queue...which also ends our stroll around
1967                                  *   the partition.
1968                                  *
1969                                  * - We happen across another idle CPU.
1970                                  *   Since it is patrolling the next portion
1971                                  *   of the lpl's list (assuming it's not
1972                                  *   halted, or busy servicing an interrupt),
1973                                  *   move to the next higher level of locality.
1974                                  */
1975                                 if (cp->cpu_disp->disp_nrunnable != 0) {
1976                                         kpreempt_enable();
1977                                         return (NULL);
1978                                 }
1979                                 if (ocp->cpu_dispatch_pri == -1) {
1980                                         if (ocp->cpu_disp_flags &
1981                                             CPU_DISP_HALTED ||
1982                                             ocp->cpu_intr_actv != 0)
1983                                                 continue;
1984                                         else
1985                                                 goto next_level;
1986                                 }
1987 
1988                                 /*
1989                                  * If there's only one thread and the CPU
1990                                  * is in the middle of a context switch,
1991                                  * or it's currently running the idle thread,
1992                                  * don't steal it.
1993                                  */
1994                                 if ((ocp->cpu_disp_flags &
1995                                     CPU_DISP_DONTSTEAL) &&
1996                                     ocp->cpu_disp->disp_nrunnable == 1)
1997                                         continue;
1998 
1999                                 pri = ocp->cpu_disp->disp_max_unbound_pri;
2000                                 if (pri > maxpri) {
2001                                         /*
2002                                          * Don't steal threads that we attempted
2003                                          * to steal recently until they're ready
2004                                          * to be stolen again.
2005                                          */
2006                                         stealtime = ocp->cpu_disp->disp_steal;
2007                                         if (stealtime == 0 ||
2008                                             stealtime - gethrtime() <= 0) {
2009                                                 maxpri = pri;
2010                                                 tcp = ocp;
2011                                         } else {
2012                                                 /*
2013                                                  * Don't update tcp, just set
2014                                                  * the retval to T_DONTSTEAL, so
2015                                                  * that if no acceptable CPUs
2016                                                  * are found the return value
2017                                                  * will be T_DONTSTEAL rather
2018                                                  * than NULL.
2019                                                  */
2020                                                 retval = T_DONTSTEAL;
2021                                         }
2022                                 }
2023                         } while ((ocp = ocp->cpu_next_lpl) != ocp_start);
2024 
2025                         /*
2026                          * Iterate to the next leaf lpl in the resource set
2027                          * at this level of locality. If we hit the end of
2028                          * the set, wrap back around to the beginning.
2029                          *
2030                          * Note: This iteration is NULL terminated for a reason;
2031                          * see lpl_topo_bootstrap() in lgrp.c for details.
2032                          */
2033                         if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
2034                                 leafidx = 0;
2035                                 lpl_leaf = lpl->lpl_rset[leafidx];
2036                         }
2037                 } while (leafidx != startidx);
2038 
2039 next_level:
2040                 /*
2041                  * Expand the search to include farther away CPUs (next
2042                  * locality level). The closer CPUs that have already been
2043                  * checked will be checked again. In doing so, idle CPUs
2044                  * will tend to be more aggressive about stealing from CPUs
2045                  * that are closer (since the closer CPUs will be considered
2046                  * more often).
2047                  * Begin at this level with the CPU's local leaf lpl.
2048                  */
2049                 if ((lpl = lpl->lpl_parent) != NULL) {
2050                         leafidx = startidx = lpl->lpl_id2rset[local_id];
2051                         lpl_leaf = lpl->lpl_rset[leafidx];
2052                 }
2053         } while (!tcp && lpl);
2054 
2055         kpreempt_enable();
2056 
2057         /*
2058          * If another queue looks good, and there is still nothing on
2059          * the local queue, try to transfer one or more threads
2060          * from it to our queue.
2061          */
2062         if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
2063                 tp = disp_getbest(tcp->cpu_disp);
2064                 if (tp == NULL || tp == T_DONTSTEAL)
2065                         return (tp);
2066                 return (disp_ratify(tp, kpq));
2067         }
2068         return (retval);
2069 }
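
     /*
      * For illustration, the search order used by disp_getwork() on a
      * two-level lgroup topology looks roughly like this (a sketch, not an
      * exhaustive description):
      *
      *      1. Take the best thread from the partition's kp queue, if it
      *         has anything runnable.
      *      2. Walk the other CPUs sharing cp's own leaf lpl.
      *      3. Move up to the parent lpl and walk each leaf in its resource
      *         set, starting again from cp's home leaf, so closer CPUs are
      *         reconsidered before more distant ones.
      */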
2070 
2071 
2072 /*
2073  * disp_fix_unbound_pri()
2074  *      Determines the maximum priority of unbound threads on the queue.
2075  *      The priority is kept for the queue, but is only increased, never
2076  *      reduced unless some CPU is looking for something on that queue.
2077  *
2078  *      The priority argument is the known upper limit.
2079  *
2080  *      Perhaps this should be kept accurately, but that probably means
2081  *      separate bitmaps for bound and unbound threads.  Since only idled
2082  *      CPUs will have to do this recalculation, it seems better this way.
2083  */
2084 static void
2085 disp_fix_unbound_pri(disp_t *dp, pri_t pri)
2086 {
2087         kthread_t       *tp;
2088         dispq_t         *dq;
2089         ulong_t         *dqactmap = dp->disp_qactmap;
2090         ulong_t         mapword;
2091         int             wx;
2092 
2093         ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
2094 
2095         ASSERT(pri >= 0);                    /* checked by caller */
2096 
2097         /*
2098          * Start the search at the next lowest priority below the supplied
2099          * priority.  This depends on the bitmap implementation.
2100          */
2101         do {
2102                 wx = pri >> BT_ULSHIFT;           /* index of word in map */
2103 
2104                 /*
2105                  * Form mask for all lower priorities in the word.
2106                  */
2107                 mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
2108 
2109                 /*
2110                  * Get next lower active priority.
2111                  */
2112                 if (mapword != 0) {
2113                         pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
2114                 } else if (wx > 0) {
2115                         pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
2116                         if (pri < 0)
2117                                 break;
2118                 } else {
2119                         pri = -1;
2120                         break;
2121                 }
2122 
2123                 /*
2124                  * Search the queue for unbound, runnable threads.
2125                  */
2126                 dq = &dp->disp_q[pri];
2127                 tp = dq->dq_first;
2128 
2129                 while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
2130                         tp = tp->t_link;
2131                 }
2132 
2133                 /*
2134                  * If a thread was found, set the priority and return.
2135                  */
2136         } while (tp == NULL);
2137 
2138         /*
2139          * pri holds the maximum unbound thread priority or -1.
2140          */
2141         if (dp->disp_max_unbound_pri != pri)
2142                 dp->disp_max_unbound_pri = pri;
2143 }
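
     /*
      * Worked example of the bitmap step above, assuming a 64-bit kernel
      * where BT_ULSHIFT is 6 (64 bits per bitmap word): for pri == 75,
      * wx is 75 >> 6 == 1 and (BT_BIW(75) - 1) masks off the bits of word 1
      * covering priorities 64..74.  If the highest bit set in the masked
      * word corresponds to priority 70, highbit() returns 7 and pri becomes
      * (1 << 6) + 7 - 1 == 70, whose queue is then searched for an unbound
      * thread.
      */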
2144 
2145 /*
2146  * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
2147  *      check if the CPU to which it was previously bound should have
2148  *      its disp_max_unbound_pri increased.
2149  */
2150 void
2151 disp_adjust_unbound_pri(kthread_t *tp)
2152 {
2153         disp_t *dp;
2154         pri_t tpri;
2155 
2156         ASSERT(THREAD_LOCK_HELD(tp));
2157 
2158         /*
2159          * Don't do anything if the thread is not bound, or
2160          * currently not runnable or swapped out.
2161          */
2162         if (tp->t_bound_cpu == NULL ||
2163             tp->t_state != TS_RUN ||
2164             tp->t_schedflag & TS_ON_SWAPQ)
2165                 return;
2166 
2167         tpri = DISP_PRIO(tp);
2168         dp = tp->t_bound_cpu->cpu_disp;
2169         ASSERT(tpri >= 0 && tpri < dp->disp_npri);
2170         if (tpri > dp->disp_max_unbound_pri)
2171                 dp->disp_max_unbound_pri = tpri;
2172 }
2173 
2174 /*
2175  * disp_getbest()
2176  *   De-queue the highest priority unbound runnable thread.
2177  *   Returns with the thread unlocked and onproc but at splhigh (like disp()).
2178  *   Returns NULL if nothing found.
2179  *   Returns T_DONTSTEAL if the thread was not stealable,
2180  *   so that the caller will try again later.
2181  *
2182  *   Passed a pointer to a dispatch queue not associated with this
2183  *   CPU.
2184  */
2185 static kthread_t *
2186 disp_getbest(disp_t *dp)
2187 {
2188         kthread_t       *tp;
2189         dispq_t         *dq;
2190         pri_t           pri;
2191         cpu_t           *cp, *tcp;
2192         boolean_t       allbound;
2193 
2194         disp_lock_enter(&dp->disp_lock);
2195 
2196         /*
2197          * If there is nothing to run, or the CPU is in the middle of a
2198          * context switch of the only thread, return NULL.
2199          */
2200         tcp = dp->disp_cpu;
2201         cp = CPU;
2202         pri = dp->disp_max_unbound_pri;
2203         if (pri == -1 ||
2204             (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
2205             tcp->cpu_disp->disp_nrunnable == 1)) {
2206                 disp_lock_exit_nopreempt(&dp->disp_lock);
2207                 return (NULL);
2208         }
2209 
2210         dq = &dp->disp_q[pri];
2211 
2212 
2213         /*
2214          * Assume that all threads are bound on this queue, and change it
2215          * later when we find out that it is not the case.
2216          */
2217         allbound = B_TRUE;
2218         for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
2219                 hrtime_t now, nosteal, rqtime;
2220 
2221                 /*
2222                  * Skip over bound threads which could be here even
2223                  * though disp_max_unbound_pri indicated this level.
2224                  */
2225                 if (tp->t_bound_cpu || tp->t_weakbound_cpu)
2226                         continue;
2227 
2228                 /*
2229                  * We've got some unbound threads on this queue, so turn
2230                  * the allbound flag off now.
2231                  */
2232                 allbound = B_FALSE;
2233 
2234                 /*
2235                  * The thread is a candidate for stealing from its run queue. We
2236                  * don't want to steal threads that became runnable just a
2237                  * moment ago. This improves CPU affinity for threads that get
2238                  * preempted for short periods of time and go back on the run
2239                  * queue.
2240                  *
2241                  * We want to let it stay on its run queue if it was only placed
2242                  * there recently and it was running on the same CPU before that
2243                  * to preserve its cache investment. For the thread to remain on
2244                  * its run queue, ALL of the following conditions must be
2245                  * satisfied:
2246                  *
2247                  * - the disp queue should not be the kernel preemption queue
2248                  * - delayed idle stealing should not be disabled
2249                  * - nosteal_nsec should be non-zero
2250                  * - it should run with user priority
2251                  * - it should be on the run queue of the CPU where it was
2252                  *   running before being placed on the run queue
2253                  * - it should be the only thread on the run queue (to prevent
2254                  *   extra scheduling latency for other threads)
2255                  * - it should sit on the run queue for less than per-chip
2256                  *   nosteal interval or global nosteal interval
2257                  * - in case of CPUs with shared cache it should sit in a run
2258                  *   queue of a CPU from a different chip
2259                  *
2260                  * The checks are arranged so that the ones that are faster are
2261                  * placed earlier.
2262                  */
2263                 if (tcp == NULL ||
2264                     pri >= minclsyspri ||
2265                     tp->t_cpu != tcp)
2266                         break;
2267 
2268                 /*
2269                  * Steal immediately if, due to the CMT processor architecture,
2270                  * migration between cp and tcp would incur no performance
2271                  * penalty.
2272                  */
2273                 if (pg_cmt_can_migrate(cp, tcp))
2274                         break;
2275 
2276                 nosteal = nosteal_nsec;
2277                 if (nosteal == 0)
2278                         break;
2279 
2280                 /*
2281                  * Calculate time spent sitting on run queue
2282                  */
2283                 now = gethrtime_unscaled();
2284                 rqtime = now - tp->t_waitrq;
2285                 scalehrtime(&rqtime);
2286 
2287                 /*
2288                  * Steal immediately if the time spent on this run queue is more
2289                  * than allowed nosteal delay.
2290                  *
2291                  * Negative rqtime check is needed here to avoid infinite
2292                  * stealing delays caused by unlikely but not impossible
2293                  * drifts between CPU times on different CPUs.
2294                  */
2295                 if (rqtime > nosteal || rqtime < 0)
2296                         break;
2297 
2298                 DTRACE_PROBE4(nosteal, kthread_t *, tp,
2299                     cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
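                     /*
                      * Convert 'now' to gethrtime() scale so that the
                      * disp_steal timestamp computed below is directly
                      * comparable with the gethrtime() check made in
                      * disp_getwork().
                      */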
2300                 scalehrtime(&now);
2301                 /*
2302                  * Calculate when this thread becomes stealable
2303                  */
2304                 now += (nosteal - rqtime);
2305 
2306                 /*
2307                  * Calculate time when some thread becomes stealable
2308                  */
2309                 if (now < dp->disp_steal)
2310                         dp->disp_steal = now;
2311         }
2312 
2313         /*
2314          * If there were no unbound threads on this queue, find the
2315          * priority level where they are and then return later. The value of
2316          * disp_max_unbound_pri is not always accurate because it isn't
2317          * reduced until another idle CPU looks for work.
2318          */
2319         if (allbound)
2320                 disp_fix_unbound_pri(dp, pri);
2321 
2322         /*
2323          * If we reached the end of the queue and found no unbound threads
2324          * then return NULL so that other CPUs will be considered.  If there
2325          * are unbound threads but they cannot yet be stolen, then
2326          * return T_DONTSTEAL and try again later.
2327          */
2328         if (tp == NULL) {
2329                 disp_lock_exit_nopreempt(&dp->disp_lock);
2330                 return (allbound ? NULL : T_DONTSTEAL);
2331         }
2332 
2333         /*
2334          * Found a runnable, unbound thread, so remove it from queue.
2335          * dispdeq() requires that we have the thread locked, and we do,
2336          * by virtue of holding the dispatch queue lock.  dispdeq() will
2337          * put the thread in transition state, thereby dropping the dispq
2338          * lock.
2339          */
2340 
2341 #ifdef DEBUG
2342         {
2343                 int     thread_was_on_queue;
2344 
2345                 thread_was_on_queue = dispdeq(tp);      /* drops disp_lock */
2346                 ASSERT(thread_was_on_queue);
2347         }
2348 
2349 #else /* DEBUG */
2350         (void) dispdeq(tp);                     /* drops disp_lock */
2351 #endif /* DEBUG */
2352 
2353         /*
2354          * Reset the disp_queue steal time; we do not know what the smallest
2355          * value across the queue is.
2356          */
2357         dp->disp_steal = 0;
2358 
2359         tp->t_schedflag |= TS_DONT_SWAP;
2360 
2361         /*
2362          * Setup thread to run on the current CPU.
2363          */
2364         tp->t_disp_queue = cp->cpu_disp;
2365 
2366         cp->cpu_dispthread = tp;             /* protected by spl only */
2367         cp->cpu_dispatch_pri = pri;
2368 
2369         /*
2370          * There can be a memory synchronization race between disp_getbest()
2371          * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
2372          * to preempt the current thread to run the enqueued thread while
2373          * disp_getbest() and disp_ratify() are changing the current thread
2374          * to the stolen thread. This may lead to a situation where
2375          * cpu_resched() tries to preempt the wrong thread and the
2376          * stolen thread continues to run on the CPU which has been tagged
2377          * for preemption.
2378          * Later the clock thread gets enqueued but doesn't get to run on the
2379          * CPU, causing the system to hang.
2380          *
2381          * To avoid this, grab and drop the disp_lock (which does a memory
2382          * barrier) to synchronize the execution of cpu_resched() with
2383          * disp_getbest() and disp_ratify(), and to synchronize the memory
2384          * reads and writes done by cpu_resched(), disp_getbest(), and
2385          * disp_ratify() with each other.  (See CR#6482861 for more
2386          * details.)
2387          */
2388         disp_lock_enter_high(&cp->cpu_disp->disp_lock);
2389         disp_lock_exit_high(&cp->cpu_disp->disp_lock);
2390 
2391         ASSERT(pri == DISP_PRIO(tp));
2392 
2393         DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);
2394 
2395         thread_onproc(tp, cp);                  /* set t_state to TS_ONPROC */
2396 
2397         /*
2398          * Return with spl high so that swtch() won't need to raise it.
2399          * The disp_lock was dropped by dispdeq().
2400          */
2401 
2402         return (tp);
2403 }
2404 
2405 /*
2406  * disp_bound_common() - common routine for higher level functions
2407  *      that check for bound threads under certain conditions.
2408  *      If 'threadlistsafe' is set then there is no need to acquire
2409  *      pidlock to stop the thread list from changing (e.g., if
2410  *      disp_bound_* is called with cpus paused).
2411  */
2412 static int
2413 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2414 {
2415         int             found = 0;
2416         kthread_t       *tp;
2417 
2418         ASSERT(flag);
2419 
2420         if (!threadlistsafe)
2421                 mutex_enter(&pidlock);
2422         tp = curthread;         /* faster than allthreads */
2423         do {
2424                 if (tp->t_state != TS_FREE) {
2425                         /*
2426                          * If an interrupt thread is busy, but the
2427                          * caller doesn't care (i.e. BOUND_INTR is off),
2428                          * then just ignore it and continue through.
2429                          */
2430                         if ((tp->t_flag & T_INTR_THREAD) &&
2431                             !(flag & BOUND_INTR))
2432                                 continue;
2433 
2434                         /*
2435                          * Skip the idle thread for the CPU
2436                          * we're about to set offline.
2437                          */
2438                         if (tp == cp->cpu_idle_thread)
2439                                 continue;
2440 
2441                         /*
2442                          * Skip the pause thread for the CPU
2443                          * we're about to set offline.
2444                          */
2445                         if (tp == cp->cpu_pause_thread)
2446                                 continue;
2447 
2448                         if ((flag & BOUND_CPU) &&
2449                             (tp->t_bound_cpu == cp ||
2450                             tp->t_bind_cpu == cp->cpu_id ||
2451                             tp->t_weakbound_cpu == cp)) {
2452                                 found = 1;
2453                                 break;
2454                         }
2455 
2456                         if ((flag & BOUND_PARTITION) &&
2457                             (tp->t_cpupart == cp->cpu_part)) {
2458                                 found = 1;
2459                                 break;
2460                         }
2461                 }
2462         } while ((tp = tp->t_next) != curthread && found == 0);
2463         if (!threadlistsafe)
2464                 mutex_exit(&pidlock);
2465         return (found);
2466 }
2467 
2468 /*
2469  * disp_bound_threads - return nonzero if threads are bound to the processor.
2470  *      Called infrequently.  Keep this simple.
2471  *      Includes threads that are asleep or stopped but not onproc.
2472  */
2473 int
2474 disp_bound_threads(cpu_t *cp, int threadlistsafe)
2475 {
2476         return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2477 }
2478 
2479 /*
2480  * disp_bound_anythreads - return nonzero if _any_ threads are bound
2481  * to the given processor, including interrupt threads.
2482  */
2483 int
2484 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2485 {
2486         return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2487 }
2488 
2489 /*
2490  * disp_bound_partition - return nonzero if threads are bound to the same
2491  * partition as the processor.
2492  *      Called infrequently.  Keep this simple.
2493  *      Includes threads that are asleep or stopped but not onproc.
2494  */
2495 int
2496 disp_bound_partition(cpu_t *cp, int threadlistsafe)
2497 {
2498         return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2499 }
2500 
2501 /*
2502  * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2503  * threads to other CPUs.
2504  */
2505 void
2506 disp_cpu_inactive(cpu_t *cp)
2507 {
2508         kthread_t       *tp;
2509         disp_t          *dp = cp->cpu_disp;
2510         dispq_t         *dq;
2511         pri_t           pri;
2512         int             wasonq;
2513 
2514         disp_lock_enter(&dp->disp_lock);
2515         while ((pri = dp->disp_max_unbound_pri) != -1) {
2516                 dq = &dp->disp_q[pri];
2517                 tp = dq->dq_first;
2518 
2519                 /*
2520                  * Skip over bound threads.
2521                  */
2522                 while (tp != NULL && tp->t_bound_cpu != NULL) {
2523                         tp = tp->t_link;
2524                 }
2525 
2526                 if (tp == NULL) {
2527                         /* disp_max_unbound_pri must be inaccurate, so fix it */
2528                         disp_fix_unbound_pri(dp, pri);
2529                         continue;
2530                 }
2531 
2532                 wasonq = dispdeq(tp);           /* drops disp_lock */
2533                 ASSERT(wasonq);
2534                 ASSERT(tp->t_weakbound_cpu == NULL);
2535 
2536                 setbackdq(tp);
2537                 /*
2538                  * Called from cpu_offline:
2539                  *
2540                  * cp has already been removed from the list of active cpus
2541                  * and tp->t_cpu has been changed so there is no risk of
2542                  * tp ending up back on cp.
2543                  *
2544                  * Called from cpupart_move_cpu:
2545                  *
2546                  * The cpu has moved to a new cpupart.  Any threads that
2547                  * were on its dispatch queues before the move remain
2548                  * in the old partition and can't run in the new partition.
2549                  */
2550                 ASSERT(tp->t_cpu != cp);
2551                 thread_unlock(tp);
2552 
2553                 disp_lock_enter(&dp->disp_lock);
2554         }
2555         disp_lock_exit(&dp->disp_lock);
2556 }
2557 
2558 /*
2559  * disp_lowpri_cpu - find CPU running the lowest priority thread.
2560  *      The hint passed in is used as a starting point so we don't favor
2561  *      CPU 0 or any other CPU.  The caller should pass in the most recently
2562  *      used CPU for the thread.
2563  *
2564  *      The lgroup and priority are used to determine the best CPU to run on
2565  *      in a NUMA machine.  The lgroup specifies which CPUs are closest while
2566  *      the thread priority will indicate whether the thread will actually run
2567  *      there.  To pick the best CPU, the CPUs inside and outside of the given
2568  *      lgroup which are running the lowest priority threads are found.  The
2569  *      remote CPU is chosen only if the thread will not run locally on a CPU
2570  *      within the lgroup, but will run on the remote CPU. If the thread
2571  *      cannot immediately run on any CPU, the best local CPU will be chosen.
2572  *
2573  *      The lpl specified also identifies the cpu partition from which
2574  *      disp_lowpri_cpu should select a CPU.
2575  *
2576  *      curcpu is used to indicate that disp_lowpri_cpu is being called on
2577  *      behalf of the current thread (curthread is looking for a new cpu).
2578  *      In this case, cpu_dispatch_pri for this thread's cpu should be
2579  *      ignored.
2580  *
2581  *      If a cpu is the target of an offline request then try to avoid it.
2582  *
2583  *      This function must be called at either high SPL, or with preemption
2584  *      disabled, so that the "hint" CPU cannot be removed from the online
2585  *      CPU list while we are traversing it.
2586  */
2587 cpu_t *
2588 disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
2589 {
2590         cpu_t   *bestcpu;
2591         cpu_t   *besthomecpu;
2592         cpu_t   *cp, *cpstart;
2593 
2594         pri_t   bestpri;
2595         pri_t   cpupri;
2596 
2597         klgrpset_t      done;
2598         klgrpset_t      cur_set;
2599 
2600         lpl_t           *lpl_iter, *lpl_leaf;
2601         int             i;
2602 
2603         /*
2604          * Scan for a CPU currently running the lowest priority thread.
2605          * We cannot take cpu_lock here because it is adaptive.
2606          * We do not require a lock on the CPU list.
2607          */
2608         ASSERT(hint != NULL);
2609         ASSERT(lpl != NULL);
2610         ASSERT(lpl->lpl_ncpu > 0);
2611 
2612         /*
2613          * First examine local CPUs. Note that it's possible the hint CPU
2614          * passed in is remote to the specified home lgroup. If our priority
2615          * isn't high enough to run immediately at home, then examine
2616          * CPUs remote to our home lgroup.
2617          * We would like to give preference to CPUs closest to "home".
2618          * If we can't find a CPU where we'll run at a given level
2619          * of locality, we expand our search to include the next level.
2620          */
2621         bestcpu = besthomecpu = NULL;
2622         klgrpset_clear(done);
2623         /* start with lpl we were passed */
2624 
2625         lpl_iter = lpl;
2626 
2627         do {
2628 
2629                 bestpri = SHRT_MAX;
2630                 klgrpset_clear(cur_set);
2631 
2632                 for (i = 0; i < lpl_iter->lpl_nrset; i++) {
2633                         lpl_leaf = lpl_iter->lpl_rset[i];
2634                         if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2635                                 continue;
2636 
2637                         klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2638 
2639                         if (hint->cpu_lpl == lpl_leaf)
2640                                 cp = cpstart = hint;
2641                         else
2642                                 cp = cpstart = lpl_leaf->lpl_cpus;
2643 
2644                         do {
2645                                 if (cp == curcpu)
2646                                         cpupri = -1;
2647                                 else if (cp == cpu_inmotion)
2648                                         cpupri = SHRT_MAX;
2649                                 else
2650                                         cpupri = cp->cpu_dispatch_pri;
2651                                 if (cp->cpu_disp->disp_maxrunpri > cpupri)
2652                                         cpupri = cp->cpu_disp->disp_maxrunpri;
2653                                 if (cp->cpu_chosen_level > cpupri)
2654                                         cpupri = cp->cpu_chosen_level;
2655                                 if (cpupri < bestpri) {
2656                                         if (CPU_IDLING(cpupri)) {
2657                                                 ASSERT((cp->cpu_flags &
2658                                                     CPU_QUIESCED) == 0);
2659                                                 return (cp);
2660                                         }
2661                                         bestcpu = cp;
2662                                         bestpri = cpupri;
2663                                 }
2664                         } while ((cp = cp->cpu_next_lpl) != cpstart);
2665                 }
2666 
2667                 if (bestcpu && (tpri > bestpri)) {
2668                         ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2669                         return (bestcpu);
2670                 }
2671                 if (besthomecpu == NULL)
2672                         besthomecpu = bestcpu;
2673                 /*
2674                  * Add the lgrps we just considered to the "done" set
2675                  */
2676                 klgrpset_or(done, cur_set);
2677 
2678         } while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2679 
2680         /*
2681          * The specified priority isn't high enough to run immediately
2682          * anywhere, so just return the best CPU from the home lgroup.
2683          */
2684         ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
2685         return (besthomecpu);
2686 }
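
     /*
      * For illustration (values are hypothetical): the "effective priority"
      * considered above for a candidate CPU is the maximum of its
      * cpu_dispatch_pri (treated as -1 if the CPU is curcpu, or SHRT_MAX if
      * it is cpu_inmotion), its disp_maxrunpri, and its cpu_chosen_level.
      * A thread of priority 60 would thus be sent to a CPU whose effective
      * priority is 59 or lower, and a CPU found to be idling is taken
      * immediately.
      */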
2687 
2688 /*
2689  * This routine provides the generic idle cpu function for all processors.
2690  * If a processor has some specific code to execute when idle (say, to stop
2691  * the pipeline and save power) then that routine should be defined in the
2692  * processor's specific code (module_xx.c) and the global variable idle_cpu
2693  * set to point to that function.
2694  */
2695 static void
2696 generic_idle_cpu(void)
2697 {
2698 }
2699 
2700 /*ARGSUSED*/
2701 static void
2702 generic_enq_thread(cpu_t *cpu, int bound)
2703 {
2704 }