1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25 /*
26 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
27 */
28
29 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
30 /* All Rights Reserved */
31
32
33 #include <sys/types.h>
34 #include <sys/param.h>
35 #include <sys/sysmacros.h>
36 #include <sys/signal.h>
37 #include <sys/user.h>
38 #include <sys/systm.h>
39 #include <sys/sysinfo.h>
40 #include <sys/var.h>
41 #include <sys/errno.h>
42 #include <sys/cmn_err.h>
43 #include <sys/debug.h>
44 #include <sys/inline.h>
45 #include <sys/disp.h>
46 #include <sys/class.h>
47 #include <sys/bitmap.h>
48 #include <sys/kmem.h>
49 #include <sys/cpuvar.h>
50 #include <sys/vtrace.h>
51 #include <sys/tnf.h>
52 #include <sys/cpupart.h>
53 #include <sys/lgrp.h>
54 #include <sys/pg.h>
55 #include <sys/cmt.h>
56 #include <sys/bitset.h>
57 #include <sys/schedctl.h>
58 #include <sys/atomic.h>
59 #include <sys/dtrace.h>
60 #include <sys/sdt.h>
61 #include <sys/archsystm.h>
62
63 #include <vm/as.h>
64
65 #define BOUND_CPU 0x1
66 #define BOUND_PARTITION 0x2
67 #define BOUND_INTR 0x4
68
69 /* Dispatch queue allocation structure and functions */
70 struct disp_queue_info {
71 disp_t *dp;
72 dispq_t *olddispq;
73 dispq_t *newdispq;
74 ulong_t *olddqactmap;
75 ulong_t *newdqactmap;
76 int oldnglobpris;
77 };
78 static void disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
79 disp_t *dp);
80 static void disp_dq_assign(struct disp_queue_info *dptr, int numpris);
81 static void disp_dq_free(struct disp_queue_info *dptr);
82
83 /* platform-specific routine to call when processor is idle */
84 static void generic_idle_cpu();
85 void (*idle_cpu)() = generic_idle_cpu;
86
87 /* routines invoked when a CPU enters/exits the idle loop */
88 static void idle_enter();
89 static void idle_exit();
90
91 /* platform-specific routine to call when thread is enqueued */
92 static void generic_enq_thread(cpu_t *, int);
93 void (*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
94
95 pri_t kpreemptpri; /* priority where kernel preemption applies */
96 pri_t upreemptpri = 0; /* priority where normal preemption applies */
97 pri_t intr_pri; /* interrupt thread priority base level */
98
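/*
 * Unbound threads whose priority is at or above kpqpri are placed on the
 * partition-wide kpreempt queue (see setkpdq()) rather than on a per-CPU
 * dispatch queue, so any CPU in the partition may run them.
 */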
99 #define KPQPRI -1 /* pri where cpu affinity is dropped for kpq */
100 pri_t kpqpri = KPQPRI; /* can be set in /etc/system */
101 disp_t cpu0_disp; /* boot CPU's dispatch queue */
102 disp_lock_t swapped_lock; /* lock swapped threads and swap queue */
103 int nswapped; /* total number of swapped threads */
104 void disp_swapped_enq(kthread_t *tp);
105 static void disp_swapped_setrun(kthread_t *tp);
106 static void cpu_resched(cpu_t *cp, pri_t tpri);
107
108 /*
109 * If this is set, only interrupt threads will cause kernel preemptions.
110 * This is done by changing the value of kpreemptpri. kpreemptpri
111 * will either be the max sysclass pri + 1 or the min interrupt pri.
112 */
113 int only_intr_kpreempt;
114
115 extern void set_idle_cpu(int cpun);
116 extern void unset_idle_cpu(int cpun);
117 static void setkpdq(kthread_t *tp, int borf);
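/* Values for the borf argument of setkpdq(): queue at the back or the front. */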
118 #define SETKP_BACK 0
119 #define SETKP_FRONT 1
120 /*
121 * Parameter that determines how recently a thread must have run
122 * on the CPU to be considered loosely-bound to that CPU to reduce
123 * cold cache effects. The interval is in clock ticks (lbolt units).
124 */
125 #define RECHOOSE_INTERVAL 3
126 volatile int rechoose_interval = RECHOOSE_INTERVAL;
127
128 /*
129 * Parameter that determines how long (in nanoseconds) a thread must
130 * be sitting on a run queue before it can be stolen by another CPU,
131 * to reduce migrations.
132 *
133 * nosteal_nsec should be set by the platform code, via
134 * cmp_set_nosteal_interval(), to an appropriate value; it is set to
135 * NOSTEAL_UNINITIALIZED here to indicate that it is uninitialized.
136 * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
137 *
138 */
139 #define NOSTEAL_UNINITIALIZED (-1)
140 hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
141 extern void cmp_set_nosteal_interval(void);
142
143 id_t defaultcid; /* system "default" class; see dispadmin(1M) */
144
145 disp_lock_t transition_lock; /* lock on transitioning threads */
146 disp_lock_t stop_lock; /* lock on stopped threads */
147
148 static void cpu_dispqalloc(int numpris);
149
150 /*
151 * This gets returned by disp_getwork/disp_getbest if we couldn't steal
152 * a thread because it was sitting on its run queue for a very short
153 * period of time.
154 */
155 #define T_DONTSTEAL (kthread_t *)(-1) /* returned by disp_getwork/getbest */
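/*
 * Note that T_DONTSTEAL is not a valid thread pointer; callers of
 * disp_getwork()/disp_getbest() must check for it before using the result.
 */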
156
157 static kthread_t *disp_getwork(cpu_t *to);
158 static kthread_t *disp_getbest(disp_t *from);
159 static kthread_t *disp_ratify(kthread_t *tp, disp_t *kpq);
160
161 void swtch_to(kthread_t *);
162
163 /*
164 * dispatcher and scheduler initialization
165 */
166
167 /*
168 * disp_setup - Common code to calculate and allocate dispatcher
169 * variables and structures based on the maximum priority.
170 */
171 static void
172 disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
173 {
174 pri_t newnglobpris;
175
176 ASSERT(MUTEX_HELD(&cpu_lock));
177
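/*
 * The global priority range covers every scheduling-class priority
 * (0 through maxglobpri) plus LOCK_LEVEL additional levels above them
 * that are used by interrupt threads (whose base level is intr_pri).
 */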
178 newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
179
180 if (newnglobpris > oldnglobpris) {
181 /*
182 * Allocate new kp queues for each CPU partition.
183 */
184 cpupart_kpqalloc(newnglobpris);
185
186 /*
187 * Allocate new dispatch queues for each CPU.
188 */
189 cpu_dispqalloc(newnglobpris);
190
191 /*
192 * compute new interrupt thread base priority
193 */
194 intr_pri = maxglobpri;
195 if (only_intr_kpreempt) {
196 kpreemptpri = intr_pri + 1;
197 if (kpqpri == KPQPRI)
198 kpqpri = kpreemptpri;
199 }
200 v.v_nglobpris = newnglobpris;
201 }
202 }
203
204 /*
205 * dispinit - Called to initialize all loaded classes and the
206 * dispatcher framework.
207 */
208 void
209 dispinit(void)
210 {
211 id_t cid;
212 pri_t maxglobpri;
213 pri_t cl_maxglobpri;
214
215 maxglobpri = -1;
216
217 /*
218 * Initialize transition lock, which will always be set.
219 */
220 DISP_LOCK_INIT(&transition_lock);
221 disp_lock_enter_high(&transition_lock);
222 DISP_LOCK_INIT(&stop_lock);
223
224 mutex_enter(&cpu_lock);
225 CPU->cpu_disp->disp_maxrunpri = -1;
226 CPU->cpu_disp->disp_max_unbound_pri = -1;
227
228 /*
229 * Initialize the default CPU partition.
230 */
231 cpupart_initialize_default();
232 /*
233 * Call the class specific initialization functions for
234 * all pre-installed schedulers.
235 *
236 * We pass the size of a class specific parameter
237 * buffer to each of the initialization functions
238 * to try to catch problems with backward compatibility
239 * of class modules.
240 *
241 * For example a new class module running on an old system
242 * which didn't provide sufficiently large parameter buffers
243 * would be bad news. Class initialization modules can check for
244 * this and take action if they detect a problem.
245 */
246
247 for (cid = 0; cid < nclass; cid++) {
248 sclass_t *sc;
249
250 sc = &sclass[cid];
251 if (SCHED_INSTALLED(sc)) {
252 cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
253 &sc->cl_funcs);
254 if (cl_maxglobpri > maxglobpri)
255 maxglobpri = cl_maxglobpri;
256 }
257 }
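/*
 * Kernel preemption applies above the highest system-class priority;
 * if only_intr_kpreempt is set, disp_setup() raises this further so
 * that only interrupt threads trigger kernel preemption.
 */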
258 kpreemptpri = (pri_t)v.v_maxsyspri + 1;
259 if (kpqpri == KPQPRI)
260 kpqpri = kpreemptpri;
261
262 ASSERT(maxglobpri >= 0);
263 disp_setup(maxglobpri, 0);
264
265 mutex_exit(&cpu_lock);
266
267 /*
268 * Platform specific sticky scheduler setup.
269 */
270 if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
271 cmp_set_nosteal_interval();
272
273 /*
274 * Get the default class ID; this may be later modified via
275 * dispadmin(1M). This will load the class (normally TS) and that will
276 * call disp_add(), which is why we had to drop cpu_lock first.
277 */
278 if (getcid(defaultclass, &defaultcid) != 0) {
279 cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
280 defaultclass);
281 }
282 }
283
284 /*
285 * disp_add - Called with class pointer to initialize the dispatcher
286 * for a newly loaded class.
287 */
288 void
289 disp_add(sclass_t *clp)
290 {
291 pri_t maxglobpri;
292 pri_t cl_maxglobpri;
293
294 mutex_enter(&cpu_lock);
295 /*
296 * Initialize the scheduler class.
297 */
298 maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
299 cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
300 if (cl_maxglobpri > maxglobpri)
301 maxglobpri = cl_maxglobpri;
302
303 /*
304 * Save old queue information. Since we're initializing a
305 * new scheduling class which has just been loaded,
306 * the size of the dispq may have changed. We need to handle
307 * that here.
308 */
309 disp_setup(maxglobpri, v.v_nglobpris);
310
311 mutex_exit(&cpu_lock);
312 }
313
314
315 /*
316 * For each CPU, allocate new dispatch queues
317 * with the stated number of priorities.
318 */
319 static void
320 cpu_dispqalloc(int numpris)
321 {
322 cpu_t *cpup;
323 struct disp_queue_info *disp_mem;
324 int i, num;
325
326 ASSERT(MUTEX_HELD(&cpu_lock));
327
328 disp_mem = kmem_zalloc(NCPU *
329 sizeof (struct disp_queue_info), KM_SLEEP);
330
331 /*
332 * This routine must allocate all of the memory before stopping
333 * the cpus because it must not sleep in kmem_alloc while the
334 * CPUs are stopped. Locks they hold will not be freed until they
335 * are restarted.
336 */
337 i = 0;
338 cpup = cpu_list;
339 do {
340 disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
341 i++;
342 cpup = cpup->cpu_next;
343 } while (cpup != cpu_list);
344 num = i;
345
346 pause_cpus(NULL, NULL);
347 for (i = 0; i < num; i++)
348 disp_dq_assign(&disp_mem[i], numpris);
349 start_cpus();
350
351 /*
352 * All of the memory must be freed after the CPUs are restarted because
353 * we cannot risk sleeping in kmem_free while the CPUs are stopped.
354 */
355 for (i = 0; i < num; i++)
356 disp_dq_free(&disp_mem[i]);
357
358 kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
359 }
360
361 static void
362 disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp)
363 {
364 dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
365 dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
366 sizeof (long), KM_SLEEP);
367 dptr->dp = dp;
368 }
369
370 static void
371 disp_dq_assign(struct disp_queue_info *dptr, int numpris)
372 {
373 disp_t *dp;
374
375 dp = dptr->dp;
376 dptr->olddispq = dp->disp_q;
377 dptr->olddqactmap = dp->disp_qactmap;
378 dptr->oldnglobpris = dp->disp_npri;
379
380 ASSERT(dptr->oldnglobpris < numpris);
381
382 if (dptr->olddispq != NULL) {
383 /*
384 * Use kcopy because bcopy is platform-specific
385 * and could block while we might have paused the cpus.
386 */
387 (void) kcopy(dptr->olddispq, dptr->newdispq,
388 dptr->oldnglobpris * sizeof (dispq_t));
389 (void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
390 ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
391 sizeof (long));
392 }
393 dp->disp_q = dptr->newdispq;
394 dp->disp_qactmap = dptr->newdqactmap;
395 dp->disp_q_limit = &dptr->newdispq[numpris];
396 dp->disp_npri = numpris;
397 }
398
399 static void
400 disp_dq_free(struct disp_queue_info *dptr)
401 {
402 if (dptr->olddispq != NULL)
403 kmem_free(dptr->olddispq,
404 dptr->oldnglobpris * sizeof (dispq_t));
405 if (dptr->olddqactmap != NULL)
406 kmem_free(dptr->olddqactmap,
407 ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
408 }
409
410 /*
411 * For a newly created CPU, initialize the dispatch queue.
412 * This is called before the CPU is known through cpu[] or on any lists.
413 */
414 void
415 disp_cpu_init(cpu_t *cp)
416 {
417 disp_t *dp;
418 dispq_t *newdispq;
419 ulong_t *newdqactmap;
420
421 ASSERT(MUTEX_HELD(&cpu_lock)); /* protect dispatcher queue sizes */
422
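/*
 * The boot CPU's disp_t (cpu0_disp) is statically allocated; every
 * other CPU gets a freshly allocated structure here.
 */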
423 if (cp == cpu0_disp.disp_cpu)
424 dp = &cpu0_disp;
425 else
426 dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
427 bzero(dp, sizeof (disp_t));
428 cp->cpu_disp = dp;
429 dp->disp_cpu = cp;
430 dp->disp_maxrunpri = -1;
431 dp->disp_max_unbound_pri = -1;
432 DISP_LOCK_INIT(&cp->cpu_thread_lock);
433 /*
434 * Allocate memory for the dispatcher queue headers
435 * and the active queue bitmap.
436 */
437 newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
438 newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
439 sizeof (long), KM_SLEEP);
440 dp->disp_q = newdispq;
441 dp->disp_qactmap = newdqactmap;
442 dp->disp_q_limit = &newdispq[v.v_nglobpris];
443 dp->disp_npri = v.v_nglobpris;
444 }
445
446 void
447 disp_cpu_fini(cpu_t *cp)
448 {
449 ASSERT(MUTEX_HELD(&cpu_lock));
450
451 disp_kp_free(cp->cpu_disp);
452 if (cp->cpu_disp != &cpu0_disp)
453 kmem_free(cp->cpu_disp, sizeof (disp_t));
454 }
455
456 /*
457 * Allocate new, larger kpreempt dispatch queue to replace the old one.
458 */
459 void
460 disp_kp_alloc(disp_t *dq, pri_t npri)
461 {
462 struct disp_queue_info mem_info;
463
464 if (npri > dq->disp_npri) {
465 /*
466 * Allocate memory for the new array.
467 */
468 disp_dq_alloc(&mem_info, npri, dq);
469
470 /*
471 * We need to copy the old structures to the new
472 * and free the old.
473 */
474 disp_dq_assign(&mem_info, npri);
475 disp_dq_free(&mem_info);
476 }
477 }
478
479 /*
480 * Free dispatch queue.
481 * Used for the kpreempt queues for a removed CPU partition and
482 * for the per-CPU queues of deleted CPUs.
483 */
484 void
485 disp_kp_free(disp_t *dq)
486 {
487 struct disp_queue_info mem_info;
488
489 mem_info.olddispq = dq->disp_q;
490 mem_info.olddqactmap = dq->disp_qactmap;
491 mem_info.oldnglobpris = dq->disp_npri;
492 disp_dq_free(&mem_info);
493 }
494
495 /*
496 * End dispatcher and scheduler initialization.
497 */
498
499 /*
500 * See if there's anything to do other than remain idle.
501 * Return non-zero if there is.
502 *
503 * This function must be called with high spl, or with
504 * kernel preemption disabled to prevent the partition's
505 * active cpu list from changing while being traversed.
506 *
507 * This is essentially a simpler version of disp_getwork()
508 * to be called by CPUs preparing to "halt".
509 */
510 int
511 disp_anywork(void)
512 {
513 cpu_t *cp = CPU;
514 cpu_t *ocp;
515 volatile int *local_nrunnable = &cp->cpu_disp->disp_nrunnable;
516
517 if (!(cp->cpu_flags & CPU_OFFLINE)) {
518 if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
519 return (1);
520
521 for (ocp = cp->cpu_next_part; ocp != cp;
522 ocp = ocp->cpu_next_part) {
523 ASSERT(CPU_ACTIVE(ocp));
524
525 /*
526 * Something has appeared on the local run queue.
527 */
528 if (*local_nrunnable > 0)
529 return (1);
530 /*
531 * If we encounter another idle CPU that will
532 * soon be trolling around through disp_anywork()
533 * terminate our walk here and let this other CPU
534 * patrol the next part of the list.
535 */
536 if (ocp->cpu_dispatch_pri == -1 &&
537 (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0)
538 return (0);
539 /*
540 * Work can be taken from another CPU if:
541 * - There is unbound work on the run queue
542 * - That work isn't a thread undergoing a
543 *   context switch on an otherwise empty queue.
544 * - The CPU isn't running the idle loop.
545 */
546 if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
547 !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
548 ocp->cpu_disp->disp_nrunnable == 1) &&
549 ocp->cpu_dispatch_pri != -1)
550 return (1);
551 }
552 }
553 return (0);
554 }
555
556 /*
557 * Called when CPU enters the idle loop
558 */
559 static void
560 idle_enter()
561 {
562 cpu_t *cp = CPU;
563
564 new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
565 CPU_STATS_ADDQ(cp, sys, idlethread, 1);
566 set_idle_cpu(cp->cpu_id); /* arch-dependent hook */
567 }
568
569 /*
570 * Called when CPU exits the idle loop
571 */
572 static void
573 idle_exit()
574 {
575 cpu_t *cp = CPU;
576
577 new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
578 unset_idle_cpu(cp->cpu_id); /* arch-dependent hook */
579 }
580
581 /*
582 * Idle loop.
583 */
584 void
585 idle()
586 {
587 struct cpu *cp = CPU; /* pointer to this CPU */
588 kthread_t *t; /* taken thread */
589
590 idle_enter();
591
592 /*
593 * Uniprocessor version of idle loop.
594 * Do this until notified that we're on an actual multiprocessor.
595 */
596 while (ncpus == 1) {
597 if (cp->cpu_disp->disp_nrunnable == 0) {
598 (*idle_cpu)();
599 continue;
600 }
601 idle_exit();
602 swtch();
603
604 idle_enter(); /* returned from swtch */
605 }
606
607 /*
608 * Multiprocessor idle loop.
609 */
610 for (;;) {
611 /*
612 * If CPU is completely quiesced by p_online(2), just wait
613 * here with minimal bus traffic until put online.
614 */
615 while (cp->cpu_flags & CPU_QUIESCED)
616 (*idle_cpu)();
617
618 if (cp->cpu_disp->disp_nrunnable != 0) {
619 idle_exit();
620 swtch();
621 } else {
622 if (cp->cpu_flags & CPU_OFFLINE)
623 continue;
624 if ((t = disp_getwork(cp)) == NULL) {
625 if (cp->cpu_chosen_level != -1) {
626 disp_t *dp = cp->cpu_disp;
627 disp_t *kpq;
628
629 disp_lock_enter(&dp->disp_lock);
630 /*
631 * Set kpq under lock to prevent
632 * migration between partitions.
633 */
634 kpq = &cp->cpu_part->cp_kp_queue;
635 if (kpq->disp_maxrunpri == -1)
636 cp->cpu_chosen_level = -1;
637 disp_lock_exit(&dp->disp_lock);
638 }
639 (*idle_cpu)();
640 continue;
641 }
642 /*
643 * If there was a thread but we couldn't steal
644 * it, then keep trying.
645 */
646 if (t == T_DONTSTEAL)
647 continue;
648 idle_exit();
649 swtch_to(t);
650 }
651 idle_enter(); /* returned from swtch/swtch_to */
652 }
653 }
654
655
656 /*
657 * Preempt the currently running thread in favor of the highest
658 * priority thread. The class of the current thread controls
659 * where it goes on the dispatcher queues. If panicking, turn
660 * preemption off.
661 */
662 void
663 preempt()
664 {
665 kthread_t *t = curthread;
666 klwp_t *lwp = ttolwp(curthread);
667
668 if (panicstr)
669 return;
670
671 TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
672
673 thread_lock(t);
674
675 if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
676 /*
677 * this thread has already been chosen to be run on
678 * another CPU. Clear kprunrun on this CPU since we're
679 * already headed for swtch().
680 */
681 CPU->cpu_kprunrun = 0;
682 thread_unlock_nopreempt(t);
683 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
684 } else {
685 if (lwp != NULL)
686 lwp->lwp_ru.nivcsw++;
687 CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
688 THREAD_TRANSITION(t);
689 CL_PREEMPT(t);
690 DTRACE_SCHED(preempt);
691 thread_unlock_nopreempt(t);
692
693 TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
694
695 swtch(); /* clears CPU->cpu_runrun via disp() */
696 }
697 }
698
699 extern kthread_t *thread_unpin();
700
701 /*
702 * disp() - find the highest priority thread for this processor to run, and
703 * set it in TS_ONPROC state so that resume() can be called to run it.
704 */
705 static kthread_t *
706 disp()
707 {
708 cpu_t *cpup;
709 disp_t *dp;
710 kthread_t *tp;
711 dispq_t *dq;
712 int maxrunword;
713 pri_t pri;
714 disp_t *kpq;
715
716 TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
717
718 cpup = CPU;
719 /*
720 * Find the highest priority loaded, runnable thread.
721 */
722 dp = cpup->cpu_disp;
723
724 reschedule:
725 /*
726 * If there is more important work on the global queue with a better
727 * priority than the maximum on this CPU, take it now.
728 */
729 kpq = &cpup->cpu_part->cp_kp_queue;
730 while ((pri = kpq->disp_maxrunpri) >= 0 &&
731 pri >= dp->disp_maxrunpri &&
732 (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
733 (tp = disp_getbest(kpq)) != NULL) {
734 if (disp_ratify(tp, kpq) != NULL) {
735 TRACE_1(TR_FAC_DISP, TR_DISP_END,
736 "disp_end:tid %p", tp);
737 return (tp);
738 }
739 }
740
741 disp_lock_enter(&dp->disp_lock);
742 pri = dp->disp_maxrunpri;
743
744 /*
745 * If there is nothing to run, look at what's runnable on other queues.
746 * Choose the idle thread if the CPU is quiesced.
747 * Note that CPUs that have the CPU_OFFLINE flag set can still run
748 * interrupt threads, which will be the only threads on the CPU's own
749 * queue, but cannot run threads from other queues.
750 */
751 if (pri == -1) {
752 if (!(cpup->cpu_flags & CPU_OFFLINE)) {
753 disp_lock_exit(&dp->disp_lock);
754 if ((tp = disp_getwork(cpup)) == NULL ||
755 tp == T_DONTSTEAL) {
756 tp = cpup->cpu_idle_thread;
757 (void) splhigh();
758 THREAD_ONPROC(tp, cpup);
759 cpup->cpu_dispthread = tp;
760 cpup->cpu_dispatch_pri = -1;
761 cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
762 cpup->cpu_chosen_level = -1;
763 }
764 } else {
765 disp_lock_exit_high(&dp->disp_lock);
766 tp = cpup->cpu_idle_thread;
767 THREAD_ONPROC(tp, cpup);
768 cpup->cpu_dispthread = tp;
769 cpup->cpu_dispatch_pri = -1;
770 cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
771 cpup->cpu_chosen_level = -1;
772 }
773 TRACE_1(TR_FAC_DISP, TR_DISP_END,
774 "disp_end:tid %p", tp);
775 return (tp);
776 }
777
778 dq = &dp->disp_q[pri];
779 tp = dq->dq_first;
780
781 ASSERT(tp != NULL);
782 ASSERT(tp->t_schedflag & TS_LOAD); /* thread must be swapped in */
783
784 DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
785
786 /*
787 * Found it so remove it from queue.
788 */
789 dp->disp_nrunnable--;
790 dq->dq_sruncnt--;
791 if ((dq->dq_first = tp->t_link) == NULL) {
792 ulong_t *dqactmap = dp->disp_qactmap;
793
794 ASSERT(dq->dq_sruncnt == 0);
795 dq->dq_last = NULL;
796
797 /*
798 * The queue is empty, so the corresponding bit needs to be
799 * turned off in dqactmap. If disp_nrunnable != 0, we just took
800 * the last runnable thread off the highest-priority queue,
801 * so recompute disp_maxrunpri.
802 */
803 maxrunword = pri >> BT_ULSHIFT;
804 dqactmap[maxrunword] &= ~BT_BIW(pri);
805
806 if (dp->disp_nrunnable == 0) {
807 dp->disp_max_unbound_pri = -1;
808 dp->disp_maxrunpri = -1;
809 } else {
810 int ipri;
811
812 ipri = bt_gethighbit(dqactmap, maxrunword);
813 dp->disp_maxrunpri = ipri;
814 if (ipri < dp->disp_max_unbound_pri)
815 dp->disp_max_unbound_pri = ipri;
816 }
817 } else {
818 tp->t_link = NULL;
819 }
820
821 /*
822 * Set TS_DONT_SWAP flag to prevent another processor from swapping
823 * out this thread before we have a chance to run it.
824 * While running, it is protected against swapping by t_lock.
825 */
826 tp->t_schedflag |= TS_DONT_SWAP;
827 cpup->cpu_dispthread = tp; /* protected by spl only */
828 cpup->cpu_dispatch_pri = pri;
829 ASSERT(pri == DISP_PRIO(tp));
830 thread_onproc(tp, cpup); /* set t_state to TS_ONPROC */
831 disp_lock_exit_high(&dp->disp_lock); /* drop run queue lock */
832
833 ASSERT(tp != NULL);
834 TRACE_1(TR_FAC_DISP, TR_DISP_END,
835 "disp_end:tid %p", tp);
836
837 if (disp_ratify(tp, kpq) == NULL)
838 goto reschedule;
839
840 return (tp);
841 }
842
843 /*
844 * swtch()
845 * Find best runnable thread and run it.
846 * Called with the current thread already switched to a new state,
847 * on a sleep queue, run queue, stopped, and not zombied.
848 * May be called at any spl level less than or equal to LOCK_LEVEL.
849 * Always drops spl to the base level (spl0()).
850 */
851 void
852 swtch()
853 {
854 kthread_t *t = curthread;
855 kthread_t *next;
856 cpu_t *cp;
857
858 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
859
860 if (t->t_flag & T_INTR_THREAD)
861 cpu_intr_swtch_enter(t);
862
863 if (t->t_intr != NULL) {
864 /*
865 * We are an interrupt thread. Set up and return
866 * the interrupted thread to be resumed.
867 */
868 (void) splhigh(); /* block other scheduler action */
869 cp = CPU; /* now protected against migration */
870 ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */
871 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
872 CPU_STATS_ADDQ(cp, sys, intrblk, 1);
873 next = thread_unpin();
874 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
875 resume_from_intr(next);
876 } else {
877 #ifdef DEBUG
878 if (t->t_state == TS_ONPROC &&
879 t->t_disp_queue->disp_cpu == CPU &&
880 t->t_preempt == 0) {
881 thread_lock(t);
882 ASSERT(t->t_state != TS_ONPROC ||
883 t->t_disp_queue->disp_cpu != CPU ||
884 t->t_preempt != 0); /* cannot migrate */
885 thread_unlock_nopreempt(t);
886 }
887 #endif /* DEBUG */
888 cp = CPU;
889 next = disp(); /* returns with spl high */
890 ASSERT(CPU_ON_INTR(cp) == 0); /* not called with PIL > 10 */
891
892 /* OK to steal anything left on run queue */
893 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
894
895 if (next != t) {
896 hrtime_t now;
897
898 now = gethrtime_unscaled();
899 pg_ev_thread_swtch(cp, now, t, next);
900
901 /*
902 * If t was previously in the TS_ONPROC state,
903 * setfrontdq and setbackdq won't have set its t_waitrq.
904 * Since we now finally know that we're switching away
905 * from this thread, set its t_waitrq if it is on a run
906 * queue.
907 */
908 if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
909 t->t_waitrq = now;
910 }
911
912 /*
913 * restore mstate of thread that we are switching to
914 */
915 restore_mstate(next);
916
917 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
918 cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt();
919 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
920
921 if (dtrace_vtime_active)
922 dtrace_vtime_switch(next);
923
924 resume(next);
925 /*
926 * The TR_RESUME_END and TR_SWTCH_END trace points
927 * appear at the end of resume(), because we may not
928 * return here
929 */
930 } else {
931 if (t->t_flag & T_INTR_THREAD)
932 cpu_intr_swtch_exit(t);
933 /*
934 * Threads that enqueue themselves on a run queue defer
935 * setting t_waitrq. It is then either set in swtch()
936 * when the CPU is actually yielded, or not at all if it
937 * is remaining on the CPU.
938 * There is however a window between where the thread
939 * placed itself on a run queue, and where it selects
940 * itself in disp(), where a third party (eg. clock()
941 * doing tick processing) may have re-enqueued this
942 * thread, setting t_waitrq in the process. We detect
943 * this race by noticing that despite switching to
944 * ourself, our t_waitrq has been set, and should be
945 * cleared.
946 */
947 if (t->t_waitrq != 0)
948 t->t_waitrq = 0;
949
950 pg_ev_thread_remain(cp, t);
951
952 DTRACE_SCHED(remain__cpu);
953 TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
954 (void) spl0();
955 }
956 }
957 }
958
959 /*
960 * swtch_from_zombie()
961 * Special case of swtch(), which allows checks for TS_ZOMB to be
962 * eliminated from normal resume.
963 * Find best runnable thread and run it.
964 * Called with the current thread zombied.
965 * Zombies cannot migrate, so CPU references are safe.
966 */
967 void
968 swtch_from_zombie()
969 {
970 kthread_t *next;
971 cpu_t *cpu = CPU;
972
973 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
974
975 ASSERT(curthread->t_state == TS_ZOMB);
976
977 next = disp(); /* returns with spl high */
978 ASSERT(CPU_ON_INTR(CPU) == 0); /* not called with PIL > 10 */
979 CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
980 ASSERT(next != curthread);
981 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
982
983 pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);
984
985 restore_mstate(next);
986
987 if (dtrace_vtime_active)
988 dtrace_vtime_switch(next);
989
990 resume_from_zombie(next);
991 /*
992 * The TR_RESUME_END and TR_SWTCH_END trace points
993 * appear at the end of resume(), because we certainly will not
994 * return here
995 */
996 }
997
998 #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
999
1000 /*
1001 * search_disp_queues()
1002 * Search the given dispatch queues for thread tp.
1003 * Return 1 if tp is found, otherwise return 0.
1004 */
1005 static int
1006 search_disp_queues(disp_t *dp, kthread_t *tp)
1007 {
1008 dispq_t *dq;
1009 dispq_t *eq;
1010
1011 disp_lock_enter_high(&dp->disp_lock);
1012
1013 for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
1014 kthread_t *rp;
1015
1016 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1017
1018 for (rp = dq->dq_first; rp; rp = rp->t_link)
1019 if (tp == rp) {
1020 disp_lock_exit_high(&dp->disp_lock);
1021 return (1);
1022 }
1023 }
1024 disp_lock_exit_high(&dp->disp_lock);
1025
1026 return (0);
1027 }
1028
1029 /*
1030 * thread_on_queue()
1031 * Search all per-CPU dispatch queues and all partition-wide kpreempt
1032 * queues for thread tp. Return 1 if tp is found, otherwise return 0.
1033 */
1034 static int
1035 thread_on_queue(kthread_t *tp)
1036 {
1037 cpu_t *cp;
1038 struct cpupart *part;
1039
1040 ASSERT(getpil() >= DISP_LEVEL);
1041
1042 /*
1043 * Search the per-CPU dispatch queues for tp.
1044 */
1045 cp = CPU;
1046 do {
1047 if (search_disp_queues(cp->cpu_disp, tp))
1048 return (1);
1049 } while ((cp = cp->cpu_next_onln) != CPU);
1050
1051 /*
1052 * Search the partition-wide kpreempt queues for tp.
1053 */
1054 part = CPU->cpu_part;
1055 do {
1056 if (search_disp_queues(&part->cp_kp_queue, tp))
1057 return (1);
1058 } while ((part = part->cp_next) != CPU->cpu_part);
1059
1060 return (0);
1061 }
1062
1063 #else
1064
1065 #define thread_on_queue(tp) 0 /* ASSERT must be !thread_on_queue */
1066
1067 #endif /* DEBUG */
1068
1069 /*
1070 * Like swtch(), but switches to a specified thread taken from another CPU.
1071 * Called with spl high.
1072 */
1073 void
1074 swtch_to(kthread_t *next)
1075 {
1076 cpu_t *cp = CPU;
1077 hrtime_t now;
1078
1079 TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
1080
1081 /*
1082 * Update context switch statistics.
1083 */
1084 CPU_STATS_ADDQ(cp, sys, pswitch, 1);
1085
1086 TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
1087
1088 now = gethrtime_unscaled();
1089 pg_ev_thread_swtch(cp, now, curthread, next);
1090
1091 /* OK to steal anything left on run queue */
1092 cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
1093
1094 /* record last execution time */
1095 cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt();
1096
1097 /*
1098 * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
1099 * won't have set its t_waitrq. Since we now finally know that we're
1100 * switching away from this thread, set its t_waitrq if it is on a run
1101 * queue.
1102 */
1103 if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
1104 curthread->t_waitrq = now;
1105 }
1106
1107 /* restore next thread to previously running microstate */
1108 restore_mstate(next);
1109
1110 if (dtrace_vtime_active)
1111 dtrace_vtime_switch(next);
1112
1113 resume(next);
1114 /*
1115 * The TR_RESUME_END and TR_SWTCH_END trace points
1116 * appear at the end of resume(), because we may not
1117 * return here
1118 */
1119 }
1120
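/* A cpu_dispatch_pri of -1 indicates the CPU is running its idle thread. */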
1121 #define CPU_IDLING(pri) ((pri) == -1)
1122
1123 static void
1124 cpu_resched(cpu_t *cp, pri_t tpri)
1125 {
1126 int call_poke_cpu = 0;
1127 pri_t cpupri = cp->cpu_dispatch_pri;
1128
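	/*
	 * cpu_runrun requests an ordinary (user-level) preemption, honored
	 * when the target thread next heads back toward user mode;
	 * cpu_kprunrun also requests kernel preemption. poke_cpu() is only
	 * needed to make a remote CPU notice the flags promptly.
	 */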
1129 if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
1130 TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1131 "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1132 if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1133 cp->cpu_runrun = 1;
1134 aston(cp->cpu_dispthread);
1135 if (tpri < kpreemptpri && cp != CPU)
1136 call_poke_cpu = 1;
1137 }
1138 if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1139 cp->cpu_kprunrun = 1;
1140 if (cp != CPU)
1141 call_poke_cpu = 1;
1142 }
1143 }
1144
1145 /*
1146 * Propagate cpu_runrun and cpu_kprunrun to global visibility.
1147 */
1148 membar_enter();
1149
1150 if (call_poke_cpu)
1151 poke_cpu(cp->cpu_id);
1152 }
1153
1154 /*
1155 * setbackdq() keeps runqs balanced such that the difference in length
1156 * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
1157 * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
1158 * must match. When per-thread TS_RUNQMATCH flag is set, setbackdq() will
1159 * try to keep runqs perfectly balanced regardless of the thread priority.
1160 */
1161 #define RUNQ_MATCH_PRI 16 /* pri below which queue lengths must match */
1162 #define RUNQ_MAX_DIFF 2 /* maximum runq length difference */
1163 #define RUNQ_LEN(cp, pri) ((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
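/*
 * For example, with RUNQ_MAX_DIFF == 2, an unbound priority-60 thread
 * (at or above RUNQ_MATCH_PRI, without TS_RUNQMATCH) is only moved when
 * the candidate CPU's queue for that priority is more than two entries
 * shorter; a priority-10 thread moves whenever the candidate's queue is
 * strictly shorter.
 */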
1164
1165 /*
1166 * Macro that evaluates to true if it is likely that the thread has cache
1167 * warmth. This is based on the amount of time that has elapsed since the
1168 * thread last ran. If that amount of time is less than "rechoose_interval"
1169 * ticks, then we decide that the thread has enough cache warmth to warrant
1170 * some affinity for t->t_cpu.
1171 */
1172 #define THREAD_HAS_CACHE_WARMTH(thread) \
1173 ((thread == curthread) || \
1174 ((ddi_get_lbolt() - thread->t_disp_time) <= rechoose_interval))
1175 /*
1176 * Put the specified thread on the back of the dispatcher
1177 * queue corresponding to its current priority.
1178 *
1179 * Called with the thread in transition, onproc or stopped state
1180 * and locked (transition implies locked) and at high spl.
1181 * Returns with the thread in TS_RUN state and still locked.
1182 */
1183 void
1184 setbackdq(kthread_t *tp)
1185 {
1186 dispq_t *dq;
1187 disp_t *dp;
1188 cpu_t *cp;
1189 pri_t tpri;
1190 int bound;
1191 boolean_t self;
1192
1193 ASSERT(THREAD_LOCK_HELD(tp));
1194 ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1195 ASSERT(!thread_on_queue(tp)); /* make sure tp isn't on a runq */
1196
1197 /*
1198 * If thread is "swapped" or on the swap queue don't
1199 * queue it, but wake sched.
1200 */
1201 if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1202 disp_swapped_setrun(tp);
1203 return;
1204 }
1205
1206 self = (tp == curthread);
1207
1208 if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1209 bound = 1;
1210 else
1211 bound = 0;
1212
1213 tpri = DISP_PRIO(tp);
1214 if (ncpus == 1)
1215 cp = tp->t_cpu;
1216 else if (!bound) {
1217 if (tpri >= kpqpri) {
1218 setkpdq(tp, SETKP_BACK);
1219 return;
1220 }
1221
1222 /*
1223 * We'll generally let this thread continue to run where
1224 * it last ran...but will consider migration if:
1225 * - The thread probably doesn't have much cache warmth.
1226 * - The CPU where it last ran is the target of an offline
1227 * request.
1228 * - The thread last ran outside its home lgroup.
1229 */
1230 if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
1231 (tp->t_cpu == cpu_inmotion)) {
1232 cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, NULL);
1233 } else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
1234 cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1235 self ? tp->t_cpu : NULL);
1236 } else {
1237 cp = tp->t_cpu;
1238 }
1239
1240 if (tp->t_cpupart == cp->cpu_part) {
1241 int qlen;
1242
1243 /*
1244 * Perform any CMT load balancing
1245 */
1246 cp = cmt_balance(tp, cp);
1247
1248 /*
1249 * Balance across the run queues
1250 */
1251 qlen = RUNQ_LEN(cp, tpri);
1252 if (tpri >= RUNQ_MATCH_PRI &&
1253 !(tp->t_schedflag & TS_RUNQMATCH))
1254 qlen -= RUNQ_MAX_DIFF;
1255 if (qlen > 0) {
1256 cpu_t *newcp;
1257
1258 if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1259 newcp = cp->cpu_next_part;
1260 } else if ((newcp = cp->cpu_next_lpl) == cp) {
1261 newcp = cp->cpu_next_part;
1262 }
1263
1264 if (RUNQ_LEN(newcp, tpri) < qlen) {
1265 DTRACE_PROBE3(runq__balance,
1266 kthread_t *, tp,
1267 cpu_t *, cp, cpu_t *, newcp);
1268 cp = newcp;
1269 }
1270 }
1271 } else {
1272 /*
1273 * Migrate to a cpu in the new partition.
1274 */
1275 cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1276 tp->t_lpl, tp->t_pri, NULL);
1277 }
1278 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1279 } else {
1280 /*
1281 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1282 * a short time until weak binding that existed when the
1283 * strong binding was established has dropped) so we must
1284 * favour weak binding over strong.
1285 */
1286 cp = tp->t_weakbound_cpu ?
1287 tp->t_weakbound_cpu : tp->t_bound_cpu;
1288 }
1289 /*
1290 * A thread that is ONPROC may be temporarily placed on the run queue
1291 * but then chosen to run again by disp. If the thread we're placing on
1292 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1293 * replacement process is actually scheduled in swtch(). In this
1294 * situation, curthread is the only thread that could be in the ONPROC
1295 * state.
1296 */
1297 if ((!self) && (tp->t_waitrq == 0)) {
1298 hrtime_t curtime;
1299
1300 curtime = gethrtime_unscaled();
1301 (void) cpu_update_pct(tp, curtime);
1302 tp->t_waitrq = curtime;
1303 } else {
1304 (void) cpu_update_pct(tp, gethrtime_unscaled());
1305 }
1306
1307 dp = cp->cpu_disp;
1308 disp_lock_enter_high(&dp->disp_lock);
1309
1310 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
1311 TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
1312 tpri, cp, tp);
1313
1314 #ifndef NPROBE
1315 /* Kernel probe */
1316 if (tnf_tracing_active)
1317 tnf_thread_queue(tp, cp, tpri);
1318 #endif /* NPROBE */
1319
1320 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1321
1322 THREAD_RUN(tp, &dp->disp_lock); /* set t_state to TS_RUN */
1323 tp->t_disp_queue = dp;
1324 tp->t_link = NULL;
1325
1326 dq = &dp->disp_q[tpri];
1327 dp->disp_nrunnable++;
1328 if (!bound)
1329 dp->disp_steal = 0;
1330 membar_enter();
1331
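	/*
	 * Append to the existing queue for this priority, or make this
	 * thread the first entry, in which case the active-queue bitmap and
	 * possibly disp_maxrunpri are updated and cpu_resched() notifies
	 * the target CPU.
	 */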
1332 if (dq->dq_sruncnt++ != 0) {
1333 ASSERT(dq->dq_first != NULL);
1334 dq->dq_last->t_link = tp;
1335 dq->dq_last = tp;
1336 } else {
1337 ASSERT(dq->dq_first == NULL);
1338 ASSERT(dq->dq_last == NULL);
1339 dq->dq_first = dq->dq_last = tp;
1340 BT_SET(dp->disp_qactmap, tpri);
1341 if (tpri > dp->disp_maxrunpri) {
1342 dp->disp_maxrunpri = tpri;
1343 membar_enter();
1344 cpu_resched(cp, tpri);
1345 }
1346 }
1347
1348 if (!bound && tpri > dp->disp_max_unbound_pri) {
1349 if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) {
1350 /*
1351 * If there are no other unbound threads on the
1352 * run queue, don't allow other CPUs to steal
1353 * this thread while we are in the middle of a
1354 * context switch. We may just switch to it
1355 * again right away. CPU_DISP_DONTSTEAL is cleared
1356 * in swtch and swtch_to.
1357 */
1358 cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1359 }
1360 dp->disp_max_unbound_pri = tpri;
1361 }
1362 (*disp_enq_thread)(cp, bound);
1363 }
1364
1365 /*
1366 * Put the specified thread on the front of the dispatcher
1367 * queue corresponding to its current priority.
1368 *
1369 * Called with the thread in transition, onproc or stopped state
1370 * and locked (transition implies locked) and at high spl.
1371 * Returns with the thread in TS_RUN state and still locked.
1372 */
1373 void
1374 setfrontdq(kthread_t *tp)
1375 {
1376 disp_t *dp;
1377 dispq_t *dq;
1378 cpu_t *cp;
1379 pri_t tpri;
1380 int bound;
1381
1382 ASSERT(THREAD_LOCK_HELD(tp));
1383 ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1384 ASSERT(!thread_on_queue(tp)); /* make sure tp isn't on a runq */
1385
1386 /*
1387 * If thread is "swapped" or on the swap queue don't
1388 * queue it, but wake sched.
1389 */
1390 if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1391 disp_swapped_setrun(tp);
1392 return;
1393 }
1394
1395 if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1396 bound = 1;
1397 else
1398 bound = 0;
1399
1400 tpri = DISP_PRIO(tp);
1401 if (ncpus == 1)
1402 cp = tp->t_cpu;
1403 else if (!bound) {
1404 if (tpri >= kpqpri) {
1405 setkpdq(tp, SETKP_FRONT);
1406 return;
1407 }
1408 cp = tp->t_cpu;
1409 if (tp->t_cpupart == cp->cpu_part) {
1410 /*
1411 * We'll generally let this thread continue to run
1412 * where it last ran, but will consider migration if:
1413 * - The thread last ran outside its home lgroup.
1414 * - The CPU where it last ran is the target of an
1415 * offline request (a thread_nomigrate() on the in
1416 * motion CPU relies on this when forcing a preempt).
1417 * - The thread isn't the highest priority thread where
1418 * it last ran, and it is considered not likely to
1419 * have significant cache warmth.
1420 */
1421 if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) ||
1422 (cp == cpu_inmotion)) {
1423 cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1424 (tp == curthread) ? cp : NULL);
1425 } else if ((tpri < cp->cpu_disp->disp_maxrunpri) &&
1426 (!THREAD_HAS_CACHE_WARMTH(tp))) {
1427 cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1428 NULL);
1429 }
1430 } else {
1431 /*
1432 * Migrate to a cpu in the new partition.
1433 */
1434 cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1435 tp->t_lpl, tp->t_pri, NULL);
1436 }
1437 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1438 } else {
1439 /*
1440 * It is possible that t_weakbound_cpu != t_bound_cpu (for
1441 * a short time until weak binding that existed when the
1442 * strong binding was established has dropped) so we must
1443 * favour weak binding over strong.
1444 */
1445 cp = tp->t_weakbound_cpu ?
1446 tp->t_weakbound_cpu : tp->t_bound_cpu;
1447 }
1448
1449 /*
1450 * A thread that is ONPROC may be temporarily placed on the run queue
1451 * but then chosen to run again by disp. If the thread we're placing on
1452 * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1453 * replacement process is actually scheduled in swtch(). In this
1454 * situation, curthread is the only thread that could be in the ONPROC
1455 * state.
1456 */
1457 if ((tp != curthread) && (tp->t_waitrq == 0)) {
1458 hrtime_t curtime;
1459
1460 curtime = gethrtime_unscaled();
1461 (void) cpu_update_pct(tp, curtime);
1462 tp->t_waitrq = curtime;
1463 } else {
1464 (void) cpu_update_pct(tp, gethrtime_unscaled());
1465 }
1466
1467 dp = cp->cpu_disp;
1468 disp_lock_enter_high(&dp->disp_lock);
1469
1470 TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1471 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);
1472
1473 #ifndef NPROBE
1474 /* Kernel probe */
1475 if (tnf_tracing_active)
1476 tnf_thread_queue(tp, cp, tpri);
1477 #endif /* NPROBE */
1478
1479 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1480
1481 THREAD_RUN(tp, &dp->disp_lock); /* set TS_RUN state and lock */
1482 tp->t_disp_queue = dp;
1483
1484 dq = &dp->disp_q[tpri];
1485 dp->disp_nrunnable++;
1486 if (!bound)
1487 dp->disp_steal = 0;
1488 membar_enter();
1489
1490 if (dq->dq_sruncnt++ != 0) {
1491 ASSERT(dq->dq_last != NULL);
1492 tp->t_link = dq->dq_first;
1493 dq->dq_first = tp;
1494 } else {
1495 ASSERT(dq->dq_last == NULL);
1496 ASSERT(dq->dq_first == NULL);
1497 tp->t_link = NULL;
1498 dq->dq_first = dq->dq_last = tp;
1499 BT_SET(dp->disp_qactmap, tpri);
1500 if (tpri > dp->disp_maxrunpri) {
1501 dp->disp_maxrunpri = tpri;
1502 membar_enter();
1503 cpu_resched(cp, tpri);
1504 }
1505 }
1506
1507 if (!bound && tpri > dp->disp_max_unbound_pri) {
1508 if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
1509 cp == CPU) {
1510 /*
1511 * If there are no other unbound threads on the
1512 * run queue, don't allow other CPUs to steal
1513 * this thread while we are in the middle of a
1514 * context switch. We may just switch to it
1515 * again right away. CPU_DISP_DONTSTEAL is cleared
1516 * in swtch and swtch_to.
1517 */
1518 cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1519 }
1520 dp->disp_max_unbound_pri = tpri;
1521 }
1522 (*disp_enq_thread)(cp, bound);
1523 }
1524
1525 /*
1526 * Put a high-priority unbound thread on the kp queue
1527 */
1528 static void
1529 setkpdq(kthread_t *tp, int borf)
1530 {
1531 dispq_t *dq;
1532 disp_t *dp;
1533 cpu_t *cp;
1534 pri_t tpri;
1535
1536 tpri = DISP_PRIO(tp);
1537
1538 dp = &tp->t_cpupart->cp_kp_queue;
1539 disp_lock_enter_high(&dp->disp_lock);
1540
1541 TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1542
1543 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1544 DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
1545 THREAD_RUN(tp, &dp->disp_lock); /* set t_state to TS_RUN */
1546 tp->t_disp_queue = dp;
1547 dp->disp_nrunnable++;
1548 dq = &dp->disp_q[tpri];
1549
1550 if (dq->dq_sruncnt++ != 0) {
1551 if (borf == SETKP_BACK) {
1552 ASSERT(dq->dq_first != NULL);
1553 tp->t_link = NULL;
1554 dq->dq_last->t_link = tp;
1555 dq->dq_last = tp;
1556 } else {
1557 ASSERT(dq->dq_last != NULL);
1558 tp->t_link = dq->dq_first;
1559 dq->dq_first = tp;
1560 }
1561 } else {
1562 if (borf == SETKP_BACK) {
1563 ASSERT(dq->dq_first == NULL);
1564 ASSERT(dq->dq_last == NULL);
1565 dq->dq_first = dq->dq_last = tp;
1566 } else {
1567 ASSERT(dq->dq_last == NULL);
1568 ASSERT(dq->dq_first == NULL);
1569 tp->t_link = NULL;
1570 dq->dq_first = dq->dq_last = tp;
1571 }
1572 BT_SET(dp->disp_qactmap, tpri);
1573 if (tpri > dp->disp_max_unbound_pri)
1574 dp->disp_max_unbound_pri = tpri;
1575 if (tpri > dp->disp_maxrunpri) {
1576 dp->disp_maxrunpri = tpri;
1577 membar_enter();
1578 }
1579 }
1580
1581 cp = tp->t_cpu;
1582 if (tp->t_cpupart != cp->cpu_part) {
1583 /* migrate to a cpu in the new partition */
1584 cp = tp->t_cpupart->cp_cpulist;
1585 }
1586 cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
1587 disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1588 ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1589
1590 #ifndef NPROBE
1591 /* Kernel probe */
1592 if (tnf_tracing_active)
1593 tnf_thread_queue(tp, cp, tpri);
1594 #endif /* NPROBE */
1595
1596 if (cp->cpu_chosen_level < tpri)
1597 cp->cpu_chosen_level = tpri;
1598 cpu_resched(cp, tpri);
1599 disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1600 (*disp_enq_thread)(cp, 0);
1601 }
1602
1603 /*
1604 * Remove a thread from the dispatcher queue if it is on one.
1605 * It is not an error if it is not found, but we return whether
1606 * or not it was found in case the caller wants to check.
1607 */
1608 int
1609 dispdeq(kthread_t *tp)
1610 {
1611 disp_t *dp;
1612 dispq_t *dq;
1613 kthread_t *rp;
1614 kthread_t *trp;
1615 kthread_t **ptp;
1616 int tpri;
1617
1618 ASSERT(THREAD_LOCK_HELD(tp));
1619
1620 if (tp->t_state != TS_RUN)
1621 return (0);
1622
1623 /*
1624 * The thread is "swapped" or is on the swap queue and
1625 * hence no longer on the run queue, so return true.
1626 */
1627 if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
1628 return (1);
1629
1630 tpri = DISP_PRIO(tp);
1631 dp = tp->t_disp_queue;
1632 ASSERT(tpri < dp->disp_npri);
1633 dq = &dp->disp_q[tpri];
1634 ptp = &dq->dq_first;
1635 rp = *ptp;
1636 trp = NULL;
1637
1638 ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1639
1640 /*
1641 * Search for thread in queue.
1642 * Double links would simplify this at the expense of disp/setrun.
1643 */
1644 while (rp != tp && rp != NULL) {
1645 trp = rp;
1646 ptp = &trp->t_link;
1647 rp = trp->t_link;
1648 }
1649
1650 if (rp == NULL) {
1651 panic("dispdeq: thread not on queue");
1652 }
1653
1654 DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
1655
1656 /*
1657 * Found it so remove it from queue.
1658 */
1659 if ((*ptp = rp->t_link) == NULL)
1660 dq->dq_last = trp;
1661
1662 dp->disp_nrunnable--;
1663 if (--dq->dq_sruncnt == 0) {
1664 dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
1665 if (dp->disp_nrunnable == 0) {
1666 dp->disp_max_unbound_pri = -1;
1667 dp->disp_maxrunpri = -1;
1668 } else if (tpri == dp->disp_maxrunpri) {
1669 int ipri;
1670
1671 ipri = bt_gethighbit(dp->disp_qactmap,
1672 dp->disp_maxrunpri >> BT_ULSHIFT);
1673 if (ipri < dp->disp_max_unbound_pri)
1674 dp->disp_max_unbound_pri = ipri;
1675 dp->disp_maxrunpri = ipri;
1676 }
1677 }
1678 tp->t_link = NULL;
1679 THREAD_TRANSITION(tp); /* put in intermediate state */
1680 return (1);
1681 }
1682
1683
1684 /*
1685 * dq_sruninc and dq_srundec are public functions for
1686 * incrementing/decrementing the sruncnts when a thread on
1687 * a dispatcher queue is made schedulable/unschedulable by
1688 * resetting the TS_LOAD flag.
1689 *
1690 * The caller MUST have the thread lock and therefore the dispatcher
1691 * queue lock so that the operation which changes
1692 * the flag, the operation that checks the status of the thread to
1693 * determine if it's on a disp queue AND the call to this function
1694 * are one atomic operation with respect to interrupts.
1695 */
1696
1697 /*
1698 * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
1699 */
1700 void
1701 dq_sruninc(kthread_t *t)
1702 {
1703 ASSERT(t->t_state == TS_RUN);
1704 ASSERT(t->t_schedflag & TS_LOAD);
1705
1706 THREAD_TRANSITION(t);
1707 setfrontdq(t);
1708 }
1709
1710 /*
1711 * See comment on calling conventions above.
1712 * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
1713 */
1714 void
1715 dq_srundec(kthread_t *t)
1716 {
1717 ASSERT(t->t_schedflag & TS_LOAD);
1718
1719 (void) dispdeq(t);
1720 disp_swapped_enq(t);
1721 }
1722
1723 /*
1724 * Change the dispatcher lock of thread to the "swapped_lock"
1725 * and return with thread lock still held.
1726 *
1727 * Called with thread_lock held, in transition state, and at high spl.
1728 */
1729 void
1730 disp_swapped_enq(kthread_t *tp)
1731 {
1732 ASSERT(THREAD_LOCK_HELD(tp));
1733 ASSERT(tp->t_schedflag & TS_LOAD);
1734
1735 switch (tp->t_state) {
1736 case TS_RUN:
1737 disp_lock_enter_high(&swapped_lock);
1738 THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */
1739 break;
1740 case TS_ONPROC:
1741 disp_lock_enter_high(&swapped_lock);
1742 THREAD_TRANSITION(tp);
1743 wake_sched_sec = 1; /* tell clock to wake sched */
1744 THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */
1745 break;
1746 default:
1747 panic("disp_swapped: tp: %p bad t_state", (void *)tp);
1748 }
1749 }
1750
1751 /*
1752 * This routine is called by setbackdq/setfrontdq if the thread is
1753 * not loaded or loaded and on the swap queue.
1754 *
1755 * Thread state TS_SLEEP implies that a swapped thread
1756 * has been woken up and needs to be swapped in by the swapper.
1757 *
1758 * Thread state TS_RUN implies that the priority of a swapped
1759 * thread is being increased by its scheduling class (e.g. ts_update).
1760 */
1761 static void
1762 disp_swapped_setrun(kthread_t *tp)
1763 {
1764 ASSERT(THREAD_LOCK_HELD(tp));
1765 ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
1766
1767 switch (tp->t_state) {
1768 case TS_SLEEP:
1769 disp_lock_enter_high(&swapped_lock);
1770 /*
1771 * Wakeup sched immediately (i.e., next tick) if the
1772 * thread priority is above maxclsyspri.
1773 */
1774 if (DISP_PRIO(tp) > maxclsyspri)
1775 wake_sched = 1;
1776 else
1777 wake_sched_sec = 1;
1778 THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
1779 break;
1780 case TS_RUN: /* called from ts_update */
1781 break;
1782 default:
1783 panic("disp_swapped_setrun: tp: %p bad t_state", (void *)tp);
1784 }
1785 }
1786
1787 /*
1788 * Make a thread give up its processor. Find the processor on
1789 * which this thread is executing, and have that processor
1790 * preempt.
1791 *
1792 * We allow System Duty Cycle (SDC) threads to be preempted even if
1793 * they are running at kernel priorities. To implement this, we always
1794 * set cpu_kprunrun; this ensures preempt() will be called. Since SDC
1795 * calls cpu_surrender() very often, we only preempt if there is anyone
1796 * competing with us.
1797 */
1798 void
1799 cpu_surrender(kthread_t *tp)
1800 {
1801 cpu_t *cpup;
1802 int max_pri;
1803 int max_run_pri;
1804 klwp_t *lwp;
1805
1806 ASSERT(THREAD_LOCK_HELD(tp));
1807
1808 if (tp->t_state != TS_ONPROC)
1809 return;
1810 cpup = tp->t_disp_queue->disp_cpu; /* CPU thread dispatched to */
1811 max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
1812 max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
1813 if (max_pri < max_run_pri)
1814 max_pri = max_run_pri;
1815
1816 if (tp->t_cid == sysdccid) {
1817 uint_t t_pri = DISP_PRIO(tp);
1818 if (t_pri > max_pri)
1819 return; /* we are not competing w/ anyone */
1820 cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
1821 } else {
1822 cpup->cpu_runrun = 1;
1823 if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
1824 cpup->cpu_kprunrun = 1;
1825 }
1826 }
1827
1828 /*
1829 * Propagate cpu_runrun and cpu_kprunrun to global visibility.
1830 */
1831 membar_enter();
1832
1833 DTRACE_SCHED1(surrender, kthread_t *, tp);
1834
1835 /*
1836 * Make the target thread take an excursion through trap()
1837 * to do preempt() (unless we're already in trap or post_syscall,
1838 * calling cpu_surrender via CL_TRAPRET).
1839 */
1840 if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
1841 lwp->lwp_state != LWP_USER) {
1842 aston(tp);
1843 if (cpup != CPU)
1844 poke_cpu(cpup->cpu_id);
1845 }
1846 TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
1847 "cpu_surrender:tid %p cpu %p", tp, cpup);
1848 }
1849
1850 /*
1851 * Commit to and ratify a scheduling decision
1852 */
1853 /*ARGSUSED*/
1854 static kthread_t *
1855 disp_ratify(kthread_t *tp, disp_t *kpq)
1856 {
1857 pri_t tpri, maxpri;
1858 pri_t maxkpri;
1859 cpu_t *cpup;
1860
1861 ASSERT(tp != NULL);
1862 /*
1863 * Commit to, then ratify scheduling decision
1864 */
1865 cpup = CPU;
1866 if (cpup->cpu_runrun != 0)
1867 cpup->cpu_runrun = 0;
1868 if (cpup->cpu_kprunrun != 0)
1869 cpup->cpu_kprunrun = 0;
1870 if (cpup->cpu_chosen_level != -1)
1871 cpup->cpu_chosen_level = -1;
1872 membar_enter();
1873 tpri = DISP_PRIO(tp);
1874 maxpri = cpup->cpu_disp->disp_maxrunpri;
1875 maxkpri = kpq->disp_maxrunpri;
1876 if (maxpri < maxkpri)
1877 maxpri = maxkpri;
1878 if (tpri < maxpri) {
1879 /*
1880 * We should have done better; put this thread back on the
1881 * queue and indicate that the caller should try again.
1882 */
1883 cpup->cpu_dispthread = curthread; /* fixup dispthread */
1884 cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
1885 thread_lock_high(tp);
1886 THREAD_TRANSITION(tp);
1887 setfrontdq(tp);
1888 thread_unlock_nopreempt(tp);
1889
1890 tp = NULL;
1891 }
1892 return (tp);
1893 }
1894
1895 /*
1896 * See if there is any work on the dispatcher queue for other CPUs.
1897 * If there is, dequeue the best thread and return.
1898 */
1899 static kthread_t *
1900 disp_getwork(cpu_t *cp)
1901 {
1902 cpu_t *ocp; /* other CPU */
1903 cpu_t *ocp_start;
1904 cpu_t *tcp; /* target local CPU */
1905 kthread_t *tp;
1906 kthread_t *retval = NULL;
1907 pri_t maxpri;
1908 disp_t *kpq; /* kp queue for this partition */
1909 lpl_t *lpl, *lpl_leaf;
1910 int leafidx, startidx;
1911 hrtime_t stealtime;
1912 lgrp_id_t local_id;
1913
1914 maxpri = -1;
1915 tcp = NULL;
1916
1917 kpq = &cp->cpu_part->cp_kp_queue;
1918 while (kpq->disp_maxrunpri >= 0) {
1919 /*
1920 * Try to take a thread from the kp_queue.
1921 */
1922 tp = (disp_getbest(kpq));
1923 if (tp)
1924 return (disp_ratify(tp, kpq));
1925 }
1926
1927 kpreempt_disable(); /* protect the cpu_active list */
1928
1929 /*
1930 * Try to find something to do on another CPU's run queue.
1931 * Loop through all other CPUs looking for the one with the highest
1932 * priority unbound thread.
1933 *
1934 * On NUMA machines, the partition's CPUs are consulted in order of
1935 * distance from the current CPU. This way, the first available
1936 * work found is also the closest, and will suffer the least
1937 * from being migrated.
1938 */
1939 lpl = lpl_leaf = cp->cpu_lpl;
1940 local_id = lpl_leaf->lpl_lgrpid;
1941 leafidx = startidx = 0;
1942
1943 /*
1944 * This loop traverses the lpl hierarchy. Higher level lpls represent
1945 * broader levels of locality
1946 */
1947 do {
1948 /* This loop iterates over the lpl's leaves */
1949 do {
1950 if (lpl_leaf != cp->cpu_lpl)
1951 ocp = lpl_leaf->lpl_cpus;
1952 else
1953 ocp = cp->cpu_next_lpl;
1954
1955 /* This loop iterates over the CPUs in the leaf */
1956 ocp_start = ocp;
1957 do {
1958 pri_t pri;
1959
1960 ASSERT(CPU_ACTIVE(ocp));
1961
1962 /*
1963 * End our stroll around this lpl if:
1964 *
1965 * - Something became runnable on the local
1966 * queue...which also ends our stroll around
1967 * the partition.
1968 *
1969 * - We happen across another idle CPU.
1970 * Since it is patrolling the next portion
1971 * of the lpl's list (assuming it's not
1972 * halted, or busy servicing an interrupt),
1973 * move to the next higher level of locality.
1974 */
1975 if (cp->cpu_disp->disp_nrunnable != 0) {
1976 kpreempt_enable();
1977 return (NULL);
1978 }
1979 if (ocp->cpu_dispatch_pri == -1) {
1980 if (ocp->cpu_disp_flags &
1981 CPU_DISP_HALTED ||
1982 ocp->cpu_intr_actv != 0)
1983 continue;
1984 else
1985 goto next_level;
1986 }
1987
1988 /*
1989 * If there's only one thread and the CPU
1990 * is in the middle of a context switch,
1991 * or it's currently running the idle thread,
1992 * don't steal it.
1993 */
1994 if ((ocp->cpu_disp_flags &
1995 CPU_DISP_DONTSTEAL) &&
1996 ocp->cpu_disp->disp_nrunnable == 1)
1997 continue;
1998
1999 pri = ocp->cpu_disp->disp_max_unbound_pri;
2000 if (pri > maxpri) {
2001 /*
2002 * Don't steal threads that we attempted
2003 * to steal recently until they're ready
2004 * to be stolen again.
2005 */
2006 stealtime = ocp->cpu_disp->disp_steal;
2007 if (stealtime == 0 ||
2008 stealtime - gethrtime() <= 0) {
2009 maxpri = pri;
2010 tcp = ocp;
2011 } else {
2012 /*
2013 * Don't update tcp, just set
2014 * the retval to T_DONTSTEAL, so
2015 * that if no acceptable CPUs
2016 * are found the return value
2017 * will be T_DONTSTEAL rather
2018  *                                      than NULL.
2019 */
2020 retval = T_DONTSTEAL;
2021 }
2022 }
2023 } while ((ocp = ocp->cpu_next_lpl) != ocp_start);
2024
2025 /*
2026 * Iterate to the next leaf lpl in the resource set
2027 * at this level of locality. If we hit the end of
2028 * the set, wrap back around to the beginning.
2029 *
2030  *              * Note: This iteration is NULL terminated for a reason;
2031 * see lpl_topo_bootstrap() in lgrp.c for details.
2032 */
2033 if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
2034 leafidx = 0;
2035 lpl_leaf = lpl->lpl_rset[leafidx];
2036 }
2037 } while (leafidx != startidx);
2038
2039 next_level:
2040 /*
2041 * Expand the search to include farther away CPUs (next
2042 * locality level). The closer CPUs that have already been
2043 * checked will be checked again. In doing so, idle CPUs
2044  *              will tend to be more aggressive about stealing from CPUs
2045 * that are closer (since the closer CPUs will be considered
2046 * more often).
2047  *              Begin at this level with the CPU's local leaf lpl.
2048 */
2049 if ((lpl = lpl->lpl_parent) != NULL) {
2050 leafidx = startidx = lpl->lpl_id2rset[local_id];
2051 lpl_leaf = lpl->lpl_rset[leafidx];
2052 }
2053 } while (!tcp && lpl);
2054
2055 kpreempt_enable();
2056
2057 /*
2058 * If another queue looks good, and there is still nothing on
2059 * the local queue, try to transfer one or more threads
2060 * from it to our queue.
2061 */
2062 if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
2063 tp = disp_getbest(tcp->cpu_disp);
2064 if (tp == NULL || tp == T_DONTSTEAL)
2065 return (tp);
2066 return (disp_ratify(tp, kpq));
2067 }
2068 return (retval);
2069 }
2070
2071
2072 /*
2073 * disp_fix_unbound_pri()
2074 * Determines the maximum priority of unbound threads on the queue.
2075 * The priority is kept for the queue, but is only increased, never
2076 * reduced unless some CPU is looking for something on that queue.
2077 *
2078 * The priority argument is the known upper limit.
2079 *
2080 * Perhaps this should be kept accurately, but that probably means
2081 * separate bitmaps for bound and unbound threads. Since only idled
2082 * CPUs will have to do this recalculation, it seems better this way.
2083 */
2084 static void
2085 disp_fix_unbound_pri(disp_t *dp, pri_t pri)
2086 {
2087 kthread_t *tp;
2088 dispq_t *dq;
2089 ulong_t *dqactmap = dp->disp_qactmap;
2090 ulong_t mapword;
2091 int wx;
2092
2093 ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
2094
2095 ASSERT(pri >= 0); /* checked by caller */
2096
2097 /*
2098 * Start the search at the next lowest priority below the supplied
2099 * priority. This depends on the bitmap implementation.
2100 */
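	/*
	 * As an illustration (assuming 64-bit map words, i.e. BT_ULSHIFT is
	 * 6): for a supplied priority of 70, wx is 1 and BT_BIW(70) - 1
	 * masks off the bits for priorities 64..69 in that word, so
	 * highbit() on the masked word yields the next lower active
	 * priority in the map.
	 */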
2101 do {
2102 wx = pri >> BT_ULSHIFT; /* index of word in map */
2103
2104 /*
2105 * Form mask for all lower priorities in the word.
2106 */
2107 mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
2108
2109 /*
2110 * Get next lower active priority.
2111 */
2112 if (mapword != 0) {
2113 pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
2114 } else if (wx > 0) {
2115 pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
2116 if (pri < 0)
2117 break;
2118 } else {
2119 pri = -1;
2120 break;
2121 }
2122
2123 /*
2124 * Search the queue for unbound, runnable threads.
2125 */
2126 dq = &dp->disp_q[pri];
2127 tp = dq->dq_first;
2128
2129 while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
2130 tp = tp->t_link;
2131 }
2132
2133 /*
2134 * If a thread was found, set the priority and return.
2135 */
2136 } while (tp == NULL);
2137
2138 /*
2139 * pri holds the maximum unbound thread priority or -1.
2140 */
2141 if (dp->disp_max_unbound_pri != pri)
2142 dp->disp_max_unbound_pri = pri;
2143 }
2144
2145 /*
2146 * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
2147  *     check if the CPU to which it was previously bound should have
2148 * its disp_max_unbound_pri increased.
2149 */
2150 void
2151 disp_adjust_unbound_pri(kthread_t *tp)
2152 {
2153 disp_t *dp;
2154 pri_t tpri;
2155
2156 ASSERT(THREAD_LOCK_HELD(tp));
2157
2158 /*
2159 * Don't do anything if the thread is not bound, or
2160 * currently not runnable or swapped out.
2161 */
2162 if (tp->t_bound_cpu == NULL ||
2163 tp->t_state != TS_RUN ||
2164 tp->t_schedflag & TS_ON_SWAPQ)
2165 return;
2166
2167 tpri = DISP_PRIO(tp);
2168 dp = tp->t_bound_cpu->cpu_disp;
2169 ASSERT(tpri >= 0 && tpri < dp->disp_npri);
2170 if (tpri > dp->disp_max_unbound_pri)
2171 dp->disp_max_unbound_pri = tpri;
2172 }
2173
2174 /*
2175 * disp_getbest()
2176 * De-queue the highest priority unbound runnable thread.
2177 * Returns with the thread unlocked and onproc but at splhigh (like disp()).
2178 * Returns NULL if nothing found.
2179  *   Returns T_DONTSTEAL if a thread was found but was not stealable,
2180  *   so that the caller will try again later.
2181  *
2182  *   Passed a pointer to a dispatch queue that is not associated with
2183  *   this CPU.
2184 */
2185 static kthread_t *
2186 disp_getbest(disp_t *dp)
2187 {
2188 kthread_t *tp;
2189 dispq_t *dq;
2190 pri_t pri;
2191 cpu_t *cp, *tcp;
2192 boolean_t allbound;
2193
2194 disp_lock_enter(&dp->disp_lock);
2195
2196 /*
2197 * If there is nothing to run, or the CPU is in the middle of a
2198 * context switch of the only thread, return NULL.
2199 */
2200 	tcp = dp->disp_cpu;	/* CPU owning this queue; NULL for the kp queue */
2201 	cp = CPU;		/* the stealing CPU */
2202 pri = dp->disp_max_unbound_pri;
2203 if (pri == -1 ||
2204 (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
2205 tcp->cpu_disp->disp_nrunnable == 1)) {
2206 disp_lock_exit_nopreempt(&dp->disp_lock);
2207 return (NULL);
2208 }
2209
2210 dq = &dp->disp_q[pri];
2211
2213 /*
2214 * Assume that all threads are bound on this queue, and change it
2215 * later when we find out that it is not the case.
2216 */
2217 allbound = B_TRUE;
2218 for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
2219 hrtime_t now, nosteal, rqtime;
2220
2221 /*
2222 * Skip over bound threads which could be here even
2223 * though disp_max_unbound_pri indicated this level.
2224 */
2225 if (tp->t_bound_cpu || tp->t_weakbound_cpu)
2226 continue;
2227
2228 /*
2229 * We've got some unbound threads on this queue, so turn
2230 * the allbound flag off now.
2231 */
2232 allbound = B_FALSE;
2233
2234 /*
2235 * The thread is a candidate for stealing from its run queue. We
2236 * don't want to steal threads that became runnable just a
2237 * moment ago. This improves CPU affinity for threads that get
2238 * preempted for short periods of time and go back on the run
2239 * queue.
2240 *
2241 * We want to let it stay on its run queue if it was only placed
2242 * there recently and it was running on the same CPU before that
2243 * to preserve its cache investment. For the thread to remain on
2244 * its run queue, ALL of the following conditions must be
2245 * satisfied:
2246 *
2247 * - the disp queue should not be the kernel preemption queue
2248 * - delayed idle stealing should not be disabled
2249 * - nosteal_nsec should be non-zero
2250 * - it should run with user priority
2251 * - it should be on the run queue of the CPU where it was
2252 * running before being placed on the run queue
2253 * - it should be the only thread on the run queue (to prevent
2254 * extra scheduling latency for other threads)
2255 * - it should sit on the run queue for less than per-chip
2256 * nosteal interval or global nosteal interval
2257 * - in case of CPUs with shared cache it should sit in a run
2258 * queue of a CPU from a different chip
2259 *
2260 * The checks are arranged so that the ones that are faster are
2261 * placed earlier.
2262 */
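		/*
		 * Fast checks: steal immediately if this is the kp queue
		 * (tcp is NULL), the thread runs at a kernel (non-user)
		 * priority, or it last ran on a CPU other than the one
		 * owning this queue.
		 */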
2263 if (tcp == NULL ||
2264 pri >= minclsyspri ||
2265 tp->t_cpu != tcp)
2266 break;
2267
2268 /*
2269 		 * Steal immediately if, due to the CMT processor architecture,
2270 		 * migration between cp and tcp would incur no performance
2271 		 * penalty.
2272 */
2273 if (pg_cmt_can_migrate(cp, tcp))
2274 break;
2275
2276 nosteal = nosteal_nsec;
2277 if (nosteal == 0)
2278 break;
2279
2280 /*
2281 * Calculate time spent sitting on run queue
2282 */
2283 now = gethrtime_unscaled();
2284 rqtime = now - tp->t_waitrq;
2285 scalehrtime(&rqtime);
2286
2287 /*
2288 * Steal immediately if the time spent on this run queue is more
2289 * than allowed nosteal delay.
2290 *
2291 * Negative rqtime check is needed here to avoid infinite
2292 * stealing delays caused by unlikely but not impossible
2293 * drifts between CPU times on different CPUs.
2294 */
2295 if (rqtime > nosteal || rqtime < 0)
2296 break;
2297
2298 DTRACE_PROBE4(nosteal, kthread_t *, tp,
2299 cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
2300 scalehrtime(&now);
2301 /*
2302 * Calculate when this thread becomes stealable
2303 */
2304 now += (nosteal - rqtime);
2305
2306 /*
2307 * Calculate time when some thread becomes stealable
2308 */
2309 if (now < dp->disp_steal)
2310 dp->disp_steal = now;
2311 }
2312
2313 /*
2314 	 * If there were no unbound threads on this queue, fix up
2315 	 * disp_max_unbound_pri before returning. The value of
2316 * disp_max_unbound_pri is not always accurate because it isn't
2317 * reduced until another idle CPU looks for work.
2318 */
2319 if (allbound)
2320 disp_fix_unbound_pri(dp, pri);
2321
2322 /*
2323 * If we reached the end of the queue and found no unbound threads
2324 * then return NULL so that other CPUs will be considered. If there
2325 * are unbound threads but they cannot yet be stolen, then
2326 * return T_DONTSTEAL and try again later.
2327 */
2328 if (tp == NULL) {
2329 disp_lock_exit_nopreempt(&dp->disp_lock);
2330 return (allbound ? NULL : T_DONTSTEAL);
2331 }
2332
2333 /*
2334 * Found a runnable, unbound thread, so remove it from queue.
2335 * dispdeq() requires that we have the thread locked, and we do,
2336 * by virtue of holding the dispatch queue lock. dispdeq() will
2337 * put the thread in transition state, thereby dropping the dispq
2338 * lock.
2339 */
2340
2341 #ifdef DEBUG
2342 {
2343 int thread_was_on_queue;
2344
2345 thread_was_on_queue = dispdeq(tp); /* drops disp_lock */
2346 ASSERT(thread_was_on_queue);
2347 }
2348
2349 #else /* DEBUG */
2350 (void) dispdeq(tp); /* drops disp_lock */
2351 #endif /* DEBUG */
2352
2353 /*
2354 	 * Reset the disp_queue steal time; we do not know what the smallest
2355 	 * value across the queue is.
2356 */
2357 dp->disp_steal = 0;
2358
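	/*
	 * Keep the stolen thread from being swapped out while we dispatch it.
	 */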
2359 tp->t_schedflag |= TS_DONT_SWAP;
2360
2361 /*
2362 * Setup thread to run on the current CPU.
2363 */
2364 tp->t_disp_queue = cp->cpu_disp;
2365
2366 cp->cpu_dispthread = tp; /* protected by spl only */
2367 cp->cpu_dispatch_pri = pri;
2368
2369 /*
2370 * There can be a memory synchronization race between disp_getbest()
2371 * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
2372 * to preempt the current thread to run the enqueued thread while
2373 * disp_getbest() and disp_ratify() are changing the current thread
2374 * to the stolen thread. This may lead to a situation where
2375 * cpu_resched() tries to preempt the wrong thread and the
2376 * stolen thread continues to run on the CPU which has been tagged
2377 * for preemption.
2378 * Later the clock thread gets enqueued but doesn't get to run on the
2379 * CPU causing the system to hang.
2380 *
2381 * To avoid this, grabbing and dropping the disp_lock (which does
2382 * a memory barrier) is needed to synchronize the execution of
2383 * cpu_resched() with disp_getbest() and disp_ratify() and
2384 * synchronize the memory read and written by cpu_resched(),
2385 * disp_getbest(), and disp_ratify() with each other.
2386 * (see CR#6482861 for more details).
2387 */
2388 disp_lock_enter_high(&cp->cpu_disp->disp_lock);
2389 disp_lock_exit_high(&cp->cpu_disp->disp_lock);
2390
2391 ASSERT(pri == DISP_PRIO(tp));
2392
2393 DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);
2394
2395 thread_onproc(tp, cp); /* set t_state to TS_ONPROC */
2396
2397 /*
2398 * Return with spl high so that swtch() won't need to raise it.
2399 * The disp_lock was dropped by dispdeq().
2400 */
2401
2402 return (tp);
2403 }
2404
2405 /*
2406 * disp_bound_common() - common routine for higher level functions
2407 * that check for bound threads under certain conditions.
2408 * If 'threadlistsafe' is set then there is no need to acquire
2409 * pidlock to stop the thread list from changing (eg, if
2410 * disp_bound_* is called with cpus paused).
2411 */
2412 static int
2413 disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2414 {
2415 int found = 0;
2416 kthread_t *tp;
2417
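	/* The caller must ask about at least one of the BOUND_* conditions. */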
2418 ASSERT(flag);
2419
2420 if (!threadlistsafe)
2421 mutex_enter(&pidlock);
2422 tp = curthread; /* faster than allthreads */
2423 do {
2424 if (tp->t_state != TS_FREE) {
2425 /*
2426 * If an interrupt thread is busy, but the
2427 * caller doesn't care (i.e. BOUND_INTR is off),
2428 * then just ignore it and continue through.
2429 */
2430 if ((tp->t_flag & T_INTR_THREAD) &&
2431 !(flag & BOUND_INTR))
2432 continue;
2433
2434 /*
2435 * Skip the idle thread for the CPU
2436 * we're about to set offline.
2437 */
2438 if (tp == cp->cpu_idle_thread)
2439 continue;
2440
2441 /*
2442 * Skip the pause thread for the CPU
2443 * we're about to set offline.
2444 */
2445 if (tp == cp->cpu_pause_thread)
2446 continue;
2447
2448 if ((flag & BOUND_CPU) &&
2449 (tp->t_bound_cpu == cp ||
2450 tp->t_bind_cpu == cp->cpu_id ||
2451 tp->t_weakbound_cpu == cp)) {
2452 found = 1;
2453 break;
2454 }
2455
2456 if ((flag & BOUND_PARTITION) &&
2457 (tp->t_cpupart == cp->cpu_part)) {
2458 found = 1;
2459 break;
2460 }
2461 }
2462 } while ((tp = tp->t_next) != curthread && found == 0);
2463 if (!threadlistsafe)
2464 mutex_exit(&pidlock);
2465 return (found);
2466 }
2467
2468 /*
2469 * disp_bound_threads - return nonzero if threads are bound to the processor.
2470 * Called infrequently. Keep this simple.
2471 * Includes threads that are asleep or stopped but not onproc.
2472 */
2473 int
2474 disp_bound_threads(cpu_t *cp, int threadlistsafe)
2475 {
2476 return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2477 }
2478
2479 /*
2480 * disp_bound_anythreads - return nonzero if _any_ threads are bound
2481 * to the given processor, including interrupt threads.
2482 */
2483 int
2484 disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2485 {
2486 return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2487 }
2488
2489 /*
2490 * disp_bound_partition - return nonzero if threads are bound to the same
2491 * partition as the processor.
2492 * Called infrequently. Keep this simple.
2493 * Includes threads that are asleep or stopped but not onproc.
2494 */
2495 int
2496 disp_bound_partition(cpu_t *cp, int threadlistsafe)
2497 {
2498 return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2499 }
2500
2501 /*
2502 * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2503 * threads to other CPUs.
2504 */
2505 void
2506 disp_cpu_inactive(cpu_t *cp)
2507 {
2508 kthread_t *tp;
2509 disp_t *dp = cp->cpu_disp;
2510 dispq_t *dq;
2511 pri_t pri;
2512 int wasonq;
2513
2514 disp_lock_enter(&dp->disp_lock);
2515 while ((pri = dp->disp_max_unbound_pri) != -1) {
2516 dq = &dp->disp_q[pri];
2517 tp = dq->dq_first;
2518
2519 /*
2520 * Skip over bound threads.
2521 */
2522 while (tp != NULL && tp->t_bound_cpu != NULL) {
2523 tp = tp->t_link;
2524 }
2525
2526 if (tp == NULL) {
2527 /* disp_max_unbound_pri must be inaccurate, so fix it */
2528 disp_fix_unbound_pri(dp, pri);
2529 continue;
2530 }
2531
2532 wasonq = dispdeq(tp); /* drops disp_lock */
2533 ASSERT(wasonq);
2534 ASSERT(tp->t_weakbound_cpu == NULL);
2535
2536 setbackdq(tp);
2537 /*
2538 * Called from cpu_offline:
2539 *
2540 * cp has already been removed from the list of active cpus
2541 * and tp->t_cpu has been changed so there is no risk of
2542 * tp ending up back on cp.
2543 *
2544 * Called from cpupart_move_cpu:
2545 *
2546 * The cpu has moved to a new cpupart. Any threads that
2547 	 * were on its dispatch queues before the move remain
2548 * in the old partition and can't run in the new partition.
2549 */
2550 ASSERT(tp->t_cpu != cp);
2551 thread_unlock(tp);
2552
2553 disp_lock_enter(&dp->disp_lock);
2554 }
2555 disp_lock_exit(&dp->disp_lock);
2556 }
2557
2558 /*
2559 * disp_lowpri_cpu - find CPU running the lowest priority thread.
2560 * The hint passed in is used as a starting point so we don't favor
2561 * CPU 0 or any other CPU. The caller should pass in the most recently
2562 * used CPU for the thread.
2563 *
2564 * The lgroup and priority are used to determine the best CPU to run on
2565 * in a NUMA machine. The lgroup specifies which CPUs are closest while
2566 * the thread priority will indicate whether the thread will actually run
2567 * there. To pick the best CPU, the CPUs inside and outside of the given
2568 * lgroup which are running the lowest priority threads are found. The
2569 * remote CPU is chosen only if the thread will not run locally on a CPU
2570 * within the lgroup, but will run on the remote CPU. If the thread
2571 * cannot immediately run on any CPU, the best local CPU will be chosen.
2572 *
2573 * The lpl specified also identifies the cpu partition from which
2574 * disp_lowpri_cpu should select a CPU.
2575 *
2576 * curcpu is used to indicate that disp_lowpri_cpu is being called on
2577  * behalf of the current thread (curthread is looking for a new CPU).
2578 * In this case, cpu_dispatch_pri for this thread's cpu should be
2579 * ignored.
2580 *
2581 * If a cpu is the target of an offline request then try to avoid it.
2582 *
2583 * This function must be called at either high SPL, or with preemption
2584 * disabled, so that the "hint" CPU cannot be removed from the online
2585 * CPU list while we are traversing it.
2586 */
2587 cpu_t *
2588 disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
2589 {
2590 cpu_t *bestcpu;
2591 cpu_t *besthomecpu;
2592 cpu_t *cp, *cpstart;
2593
2594 pri_t bestpri;
2595 pri_t cpupri;
2596
2597 klgrpset_t done;
2598 klgrpset_t cur_set;
2599
2600 lpl_t *lpl_iter, *lpl_leaf;
2601 int i;
2602
2603 /*
2604 * Scan for a CPU currently running the lowest priority thread.
2605 * Cannot get cpu_lock here because it is adaptive.
2606 	 * We do not require a lock on the CPU list.
2607 */
2608 ASSERT(hint != NULL);
2609 ASSERT(lpl != NULL);
2610 ASSERT(lpl->lpl_ncpu > 0);
2611
2612 /*
2613 * First examine local CPUs. Note that it's possible the hint CPU
2614 	 * passed in is remote to the specified home lgroup. If our priority
2615 	 * isn't high enough for us to run immediately at home, then
2616 	 * examine CPUs remote to our home lgroup.
2617 * We would like to give preference to CPUs closest to "home".
2618 * If we can't find a CPU where we'll run at a given level
2619 * of locality, we expand our search to include the next level.
2620 */
2621 bestcpu = besthomecpu = NULL;
2622 klgrpset_clear(done);
2623 /* start with lpl we were passed */
2624
2625 lpl_iter = lpl;
2626
2627 do {
2628
2629 bestpri = SHRT_MAX;
2630 klgrpset_clear(cur_set);
2631
2632 for (i = 0; i < lpl_iter->lpl_nrset; i++) {
2633 lpl_leaf = lpl_iter->lpl_rset[i];
2634 if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2635 continue;
2636
2637 klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2638
2639 if (hint->cpu_lpl == lpl_leaf)
2640 cp = cpstart = hint;
2641 else
2642 cp = cpstart = lpl_leaf->lpl_cpus;
2643
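			/*
			 * A CPU's effective priority is the highest of: its
			 * cpu_dispatch_pri (treated as idle for curcpu, and
			 * as SHRT_MAX for cpu_inmotion so an offline target
			 * is avoided), the highest priority queued on it,
			 * and cpu_chosen_level (a thread that has already
			 * chosen this CPU but may not be queued yet).
			 */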
2644 do {
2645 if (cp == curcpu)
2646 cpupri = -1;
2647 else if (cp == cpu_inmotion)
2648 cpupri = SHRT_MAX;
2649 else
2650 cpupri = cp->cpu_dispatch_pri;
2651 if (cp->cpu_disp->disp_maxrunpri > cpupri)
2652 cpupri = cp->cpu_disp->disp_maxrunpri;
2653 if (cp->cpu_chosen_level > cpupri)
2654 cpupri = cp->cpu_chosen_level;
2655 if (cpupri < bestpri) {
2656 if (CPU_IDLING(cpupri)) {
2657 ASSERT((cp->cpu_flags &
2658 CPU_QUIESCED) == 0);
2659 return (cp);
2660 }
2661 bestcpu = cp;
2662 bestpri = cpupri;
2663 }
2664 } while ((cp = cp->cpu_next_lpl) != cpstart);
2665 }
2666
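		/*
		 * If the thread's priority beats the lowest effective
		 * priority found at this level of locality, it can run
		 * there immediately.
		 */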
2667 if (bestcpu && (tpri > bestpri)) {
2668 ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2669 return (bestcpu);
2670 }
2671 if (besthomecpu == NULL)
2672 besthomecpu = bestcpu;
2673 /*
2674 * Add the lgrps we just considered to the "done" set
2675 */
2676 klgrpset_or(done, cur_set);
2677
2678 } while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2679
2680 /*
2681 * The specified priority isn't high enough to run immediately
2682 * anywhere, so just return the best CPU from the home lgroup.
2683 */
2684 ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
2685 return (besthomecpu);
2686 }
2687
2688 /*
2689 * This routine provides the generic idle cpu function for all processors.
2690 * If a processor has some specific code to execute when idle (say, to stop
2691 * the pipeline and save power) then that routine should be defined in the
2692  * processor-specific code (module_xx.c) and the global variable idle_cpu
2693 * set to that function.
2694 */
2695 static void
2696 generic_idle_cpu(void)
2697 {
2698 }
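
/*
 * A minimal sketch of how platform code might install its own idle
 * routine (my_idle_cpu and my_mach_init are illustrative names only,
 * not part of this file):
 *
 *	static void my_idle_cpu(void) { ... halt or throttle the CPU ... }
 *	void my_mach_init(void) { idle_cpu = my_idle_cpu; }
 */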
2699
2700 /*ARGSUSED*/
2701 static void
2702 generic_enq_thread(cpu_t *cpu, int bound)
2703 {
2704 }