1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26 */
27
28 #include <sys/types.h>
29 #include <sys/param.h>
30 #include <sys/sysmacros.h>
31 #include <sys/signal.h>
32 #include <sys/stack.h>
33 #include <sys/pcb.h>
34 #include <sys/user.h>
35 #include <sys/systm.h>
36 #include <sys/sysinfo.h>
37 #include <sys/errno.h>
38 #include <sys/cmn_err.h>
39 #include <sys/cred.h>
40 #include <sys/resource.h>
41 #include <sys/task.h>
42 #include <sys/project.h>
43 #include <sys/proc.h>
44 #include <sys/debug.h>
45 #include <sys/disp.h>
46 #include <sys/class.h>
47 #include <vm/seg_kmem.h>
48 #include <vm/seg_kp.h>
49 #include <sys/machlock.h>
50 #include <sys/kmem.h>
51 #include <sys/varargs.h>
52 #include <sys/turnstile.h>
53 #include <sys/poll.h>
54 #include <sys/vtrace.h>
55 #include <sys/callb.h>
56 #include <c2/audit.h>
57 #include <sys/tnf.h>
58 #include <sys/sobject.h>
59 #include <sys/cpupart.h>
60 #include <sys/pset.h>
61 #include <sys/door.h>
62 #include <sys/spl.h>
63 #include <sys/copyops.h>
64 #include <sys/rctl.h>
65 #include <sys/brand.h>
66 #include <sys/pool.h>
67 #include <sys/zone.h>
68 #include <sys/tsol/label.h>
69 #include <sys/tsol/tndb.h>
70 #include <sys/cpc_impl.h>
71 #include <sys/sdt.h>
72 #include <sys/reboot.h>
73 #include <sys/kdi.h>
74 #include <sys/schedctl.h>
75 #include <sys/waitq.h>
76 #include <sys/cpucaps.h>
77 #include <sys/kiconv.h>
78
79 struct kmem_cache *thread_cache; /* cache of free threads */
80 struct kmem_cache *lwp_cache; /* cache of free lwps */
81 struct kmem_cache *turnstile_cache; /* cache of free turnstiles */
82
83 /*
84 * allthreads is only for use by kmem_readers. All kernel loops can use
85 * the current thread as a start/end point.
86 */
87 kthread_t *allthreads = &t0; /* circular list of all threads */
88
89 static kcondvar_t reaper_cv; /* synchronization var */
90 kthread_t *thread_deathrow; /* circular list of reapable threads */
91 kthread_t *lwp_deathrow; /* circular list of reapable threads */
92 kmutex_t reaplock; /* protects lwp and thread deathrows */
93 int thread_reapcnt = 0; /* number of threads on deathrow */
94 int lwp_reapcnt = 0; /* number of lwps on deathrow */
95 int reaplimit = 16; /* delay reaping until reaplimit */
96
97 thread_free_lock_t *thread_free_lock;
98 /* protects a thread from the reaper while tick accounting examines it */
99
100 extern int nthread;
101
102 /* System Scheduling classes. */
103 id_t syscid; /* system scheduling class ID */
104 id_t sysdccid = CLASS_UNUSED; /* reset when SDC loads */
105
106 void *segkp_thread; /* cookie for segkp pool */
107
108 int lwp_cache_sz = 32;
109 int t_cache_sz = 8;
110 static kt_did_t next_t_id = 1;
111
112 /* Default mode for thread binding to CPUs and processor sets */
113 int default_binding_mode = TB_ALLHARD;
114
115 /*
116 * Min/Max stack sizes for stack size parameters
117 */
118 #define MAX_STKSIZE (32 * DEFAULTSTKSZ)
119 #define MIN_STKSIZE DEFAULTSTKSZ
120
121 /*
122 * default_stksize overrides lwp_default_stksize if it is set.
123 */
124 volatile int default_stksize;
125 volatile int lwp_default_stksize;
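/*
 * Illustrative note (not part of the original source): both of these are
 * plain integer tunables, so on a system where the defaults are too small
 * they would typically be overridden from /etc/system, e.g.:
 *
 *	set default_stksize = 0x8000
 *	set lwp_default_stksize = 0x8000
 *
 * The values are validated in thread_init() below: they must be a multiple
 * of PAGESIZE and fall between MIN_STKSIZE and MAX_STKSIZE.
 */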
126
127 static zone_key_t zone_thread_key;
128
129 unsigned int kmem_stackinfo; /* stackinfo feature on-off */
130 kmem_stkinfo_t *kmem_stkinfo_log; /* stackinfo circular log */
131 static kmutex_t kmem_stkinfo_lock; /* protects kmem_stkinfo_log */
132
133 /*
134 * forward declarations for internal thread specific data (tsd)
135 */
136 static void *tsd_realloc(void *, size_t, size_t);
137
138 void thread_reaper(void);
139
140 /* forward declarations for stackinfo feature */
141 static void stkinfo_begin(kthread_t *);
142 static void stkinfo_end(kthread_t *);
143 static size_t stkinfo_percent(caddr_t, caddr_t, caddr_t);
144
145 /*ARGSUSED*/
146 static int
147 turnstile_constructor(void *buf, void *cdrarg, int kmflags)
148 {
149 bzero(buf, sizeof (turnstile_t));
150 return (0);
151 }
152
153 /*ARGSUSED*/
154 static void
155 turnstile_destructor(void *buf, void *cdrarg)
156 {
157 turnstile_t *ts = buf;
158
159 ASSERT(ts->ts_free == NULL);
160 ASSERT(ts->ts_waiters == 0);
161 ASSERT(ts->ts_inheritor == NULL);
162 ASSERT(ts->ts_sleepq[0].sq_first == NULL);
163 ASSERT(ts->ts_sleepq[1].sq_first == NULL);
164 }
165
166 void
167 thread_init(void)
168 {
169 kthread_t *tp;
170 extern char sys_name[];
171 extern void idle();
172 struct cpu *cpu = CPU;
173 int i;
174 kmutex_t *lp;
175
176 mutex_init(&reaplock, NULL, MUTEX_SPIN, (void *)ipltospl(DISP_LEVEL));
177 thread_free_lock =
178 kmem_alloc(sizeof (thread_free_lock_t) * THREAD_FREE_NUM, KM_SLEEP);
179 for (i = 0; i < THREAD_FREE_NUM; i++) {
180 lp = &thread_free_lock[i].tf_lock;
181 mutex_init(lp, NULL, MUTEX_DEFAULT, NULL);
182 }
183
184 #if defined(__i386) || defined(__amd64)
185 thread_cache = kmem_cache_create("thread_cache", sizeof (kthread_t),
186 PTR24_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
187
188 /*
189 * "struct _klwp" includes a "struct pcb", which includes a
190 * "struct fpu", which needs to be 64-byte aligned on amd64
191 * (and even on i386) for xsave/xrstor.
192 */
193 lwp_cache = kmem_cache_create("lwp_cache", sizeof (klwp_t),
194 64, NULL, NULL, NULL, NULL, NULL, 0);
195 #else
196 /*
197 * Allocate thread structures from static_arena. This prevents
198 * issues where a thread tries to relocate its own thread
199 * structure and touches it after the mapping has been suspended.
200 */
201 thread_cache = kmem_cache_create("thread_cache", sizeof (kthread_t),
202 PTR24_ALIGN, NULL, NULL, NULL, NULL, static_arena, 0);
203
204 lwp_stk_cache_init();
205
206 lwp_cache = kmem_cache_create("lwp_cache", sizeof (klwp_t),
207 0, NULL, NULL, NULL, NULL, NULL, 0);
208 #endif
209
210 turnstile_cache = kmem_cache_create("turnstile_cache",
211 sizeof (turnstile_t), 0,
212 turnstile_constructor, turnstile_destructor, NULL, NULL, NULL, 0);
213
214 label_init();
215 cred_init();
216
217 /*
218 * Initialize various resource management facilities.
219 */
220 rctl_init();
221 cpucaps_init();
222 /*
223 * zone_init() should be called before project_init() so that the project
224 * ID for the first project is initialized correctly.
225 */
226 zone_init();
227 project_init();
228 brand_init();
229 kiconv_init();
230 task_init();
231 tcache_init();
232 pool_init();
233
234 curthread->t_ts = kmem_cache_alloc(turnstile_cache, KM_SLEEP);
235
236 /*
237 * Originally, we had two parameters to set default stack
238 * size: one for lwp's (lwp_default_stksize), and one for
239 * kernel-only threads (DEFAULTSTKSZ, a.k.a. _defaultstksz).
240 * Now we have a third parameter that overrides both if it is
241 * set to a legal stack size, called default_stksize.
242 */
243
244 if (default_stksize == 0) {
245 default_stksize = DEFAULTSTKSZ;
246 } else if (default_stksize % PAGESIZE != 0 ||
247 default_stksize > MAX_STKSIZE ||
248 default_stksize < MIN_STKSIZE) {
249 cmn_err(CE_WARN, "Illegal stack size. Using %d",
250 (int)DEFAULTSTKSZ);
251 default_stksize = DEFAULTSTKSZ;
252 } else {
253 lwp_default_stksize = default_stksize;
254 }
255
256 if (lwp_default_stksize == 0) {
257 lwp_default_stksize = default_stksize;
258 } else if (lwp_default_stksize % PAGESIZE != 0 ||
259 lwp_default_stksize > MAX_STKSIZE ||
260 lwp_default_stksize < MIN_STKSIZE) {
261 cmn_err(CE_WARN, "Illegal stack size. Using %d",
262 default_stksize);
263 lwp_default_stksize = default_stksize;
264 }
265
266 segkp_lwp = segkp_cache_init(segkp, lwp_cache_sz,
267 lwp_default_stksize,
268 (KPD_NOWAIT | KPD_HASREDZONE | KPD_LOCKED));
269
270 segkp_thread = segkp_cache_init(segkp, t_cache_sz,
271 default_stksize, KPD_HASREDZONE | KPD_LOCKED | KPD_NO_ANON);
272
273 (void) getcid(sys_name, &syscid);
274 curthread->t_cid = syscid; /* current thread is t0 */
275
276 /*
277 * Set up the first CPU's idle thread.
278 * It runs whenever the CPU has nothing worthwhile to do.
279 */
280 tp = thread_create(NULL, 0, idle, NULL, 0, &p0, TS_STOPPED, -1);
281 cpu->cpu_idle_thread = tp;
282 tp->t_preempt = 1;
283 tp->t_disp_queue = cpu->cpu_disp;
284 ASSERT(tp->t_disp_queue != NULL);
285 tp->t_bound_cpu = cpu;
286 tp->t_affinitycnt = 1;
287
288 /*
289 * Registering a thread in the callback table is usually
290 * done in the initialization code of the thread. In this
291 * case, we do it right after thread creation to avoid
292 * blocking the idle thread while it registers itself. It also
293 * avoids the possibility of reregistration in case a CPU
294 * restarts its idle thread.
295 */
296 CALLB_CPR_INIT_SAFE(tp, "idle");
297
298 /*
299 * Create the thread_reaper daemon. From this point on, exited
300 * threads will get reaped.
301 */
302 (void) thread_create(NULL, 0, (void (*)())thread_reaper,
303 NULL, 0, &p0, TS_RUN, minclsyspri);
304
305 /*
306 * Finish initializing the kernel memory allocator now that
307 * thread_create() is available.
308 */
309 kmem_thread_init();
310
311 if (boothowto & RB_DEBUG)
312 kdi_dvec_thravail();
313 }
314
315 /*
316 * Create a thread.
317 *
318 * thread_create() blocks for memory if necessary. It never fails.
319 *
320 * If stk is NULL, the thread is created at the base of the stack
321 * and cannot be swapped.
322 */
323 kthread_t *
324 thread_create(
325 caddr_t stk,
326 size_t stksize,
327 void (*proc)(),
328 void *arg,
329 size_t len,
330 proc_t *pp,
331 int state,
332 pri_t pri)
333 {
334 kthread_t *t;
335 extern struct classfuncs sys_classfuncs;
336 turnstile_t *ts;
337
338 /*
339 * Every thread keeps a turnstile around in case it needs to block.
340 * The only reason the turnstile is not simply part of the thread
341 * structure is that we may have to break the association whenever
342 * more than one thread blocks on a given synchronization object.
343 * From a memory-management standpoint, turnstiles are like the
344 * "attached mblks" that hang off dblks in the streams allocator.
345 */
346 ts = kmem_cache_alloc(turnstile_cache, KM_SLEEP);
347
348 if (stk == NULL) {
349 /*
350 * alloc both thread and stack in segkp chunk
351 */
352
353 if (stksize < default_stksize)
354 stksize = default_stksize;
355
356 if (stksize == default_stksize) {
357 stk = (caddr_t)segkp_cache_get(segkp_thread);
358 } else {
359 stksize = roundup(stksize, PAGESIZE);
360 stk = (caddr_t)segkp_get(segkp, stksize,
361 (KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED));
362 }
363
364 ASSERT(stk != NULL);
365
366 /*
367 * The machine-dependent mutex code may require that
368 * thread pointers (since they may be used for mutex owner
369 * fields) have certain alignment requirements.
370 * PTR24_ALIGN is the size of the alignment quanta.
371 * XXX - assumes stack grows toward low addresses.
372 */
373 if (stksize <= sizeof (kthread_t) + PTR24_ALIGN)
374 cmn_err(CE_PANIC, "thread_create: proposed stack size"
375 " too small to hold thread.");
376 #ifdef STACK_GROWTH_DOWN
377 stksize -= SA(sizeof (kthread_t) + PTR24_ALIGN - 1);
378 stksize &= -PTR24_ALIGN; /* make thread aligned */
379 t = (kthread_t *)(stk + stksize);
380 bzero(t, sizeof (kthread_t));
381 if (audit_active)
382 audit_thread_create(t);
383 t->t_stk = stk + stksize;
384 t->t_stkbase = stk;
385 #else /* stack grows to larger addresses */
386 stksize -= SA(sizeof (kthread_t));
387 t = (kthread_t *)(stk);
388 bzero(t, sizeof (kthread_t));
389 t->t_stk = stk + sizeof (kthread_t);
390 t->t_stkbase = stk + stksize + sizeof (kthread_t);
391 #endif /* STACK_GROWTH_DOWN */
392 t->t_flag |= T_TALLOCSTK;
393 t->t_swap = stk;
394 } else {
395 t = kmem_cache_alloc(thread_cache, KM_SLEEP);
396 bzero(t, sizeof (kthread_t));
397 ASSERT(((uintptr_t)t & (PTR24_ALIGN - 1)) == 0);
398 if (audit_active)
399 audit_thread_create(t);
400 /*
401 * Initialize t_stk to the kernel stack pointer to use
402 * upon entry to the kernel
403 */
404 #ifdef STACK_GROWTH_DOWN
405 t->t_stk = stk + stksize;
406 t->t_stkbase = stk;
407 #else
408 t->t_stk = stk; /* 3b2-like */
409 t->t_stkbase = stk + stksize;
410 #endif /* STACK_GROWTH_DOWN */
411 }
412
413 if (kmem_stackinfo != 0) {
414 stkinfo_begin(t);
415 }
416
417 t->t_ts = ts;
418
419 /*
420 * p_cred could be NULL if thread_create is called before cred_init
421 * is called in main().
422 */
423 mutex_enter(&pp->p_crlock);
424 if (pp->p_cred)
425 crhold(t->t_cred = pp->p_cred);
426 mutex_exit(&pp->p_crlock);
427 t->t_start = gethrestime_sec();
428 t->t_startpc = proc;
429 t->t_procp = pp;
430 t->t_clfuncs = &sys_classfuncs.thread;
431 t->t_cid = syscid;
432 t->t_pri = pri;
433 t->t_stime = ddi_get_lbolt();
434 t->t_schedflag = TS_LOAD | TS_DONT_SWAP;
435 t->t_bind_cpu = PBIND_NONE;
436 t->t_bindflag = (uchar_t)default_binding_mode;
437 t->t_bind_pset = PS_NONE;
438 t->t_plockp = &pp->p_lock;
439 t->t_copyops = NULL;
440 t->t_taskq = NULL;
441 t->t_anttime = 0;
442 t->t_hatdepth = 0;
443
444 t->t_dtrace_vtime = 1; /* assure vtimestamp is always non-zero */
445
446 CPU_STATS_ADDQ(CPU, sys, nthreads, 1);
447 #ifndef NPROBE
448 /* Kernel probe */
449 tnf_thread_create(t);
450 #endif /* NPROBE */
451 LOCK_INIT_CLEAR(&t->t_lock);
452
453 /*
454 * Callers who give us a NULL proc must do their own
455 * stack initialization, e.g. lwp_create().
456 */
457 if (proc != NULL) {
458 t->t_stk = thread_stk_init(t->t_stk);
459 thread_load(t, proc, arg, len);
460 }
461
462 /*
463 * Put a hold on project0. If this thread is actually in a
464 * different project, then t_proj will be changed later in
465 * lwp_create(). All kernel-only threads must be in project 0.
466 */
467 t->t_proj = project_hold(proj0p);
468
469 lgrp_affinity_init(&t->t_lgrp_affinity);
470
471 mutex_enter(&pidlock);
472 nthread++;
473 t->t_did = next_t_id++;
474 t->t_prev = curthread->t_prev;
475 t->t_next = curthread;
476
477 /*
478 * Add the thread to the list of all threads, and initialize
479 * its t_cpu pointer. We need to block preemption since
480 * cpu_offline walks the thread list looking for threads
481 * with t_cpu pointing to the CPU being offlined. We want
482 * to make sure that the list is consistent and that if t_cpu
483 * is set, the thread is on the list.
484 */
485 kpreempt_disable();
486 curthread->t_prev->t_next = t;
487 curthread->t_prev = t;
488
489 /*
490 * Threads should never have a NULL t_cpu pointer so assign it
491 * here. If the thread is being created with state TS_RUN a
492 * better CPU may be chosen when it is placed on the run queue.
493 *
494 * We need to keep kernel preemption disabled when setting all
495 * three fields to keep them in sync. Also, always create in
496 * the default partition since that's where kernel threads go
497 * (if this isn't a kernel thread, t_cpupart will be changed
498 * in lwp_create before setting the thread runnable).
499 */
500 t->t_cpupart = &cp_default;
501
502 /*
503 * For now, affiliate this thread with the root lgroup.
504 * Since the kernel does not (presently) allocate its memory
505 * in a locality aware fashion, the root is an appropriate home.
506 * If this thread is later associated with an lwp, it will have
507 * its lgroup re-assigned at that time.
508 */
509 lgrp_move_thread(t, &cp_default.cp_lgrploads[LGRP_ROOTID], 1);
510
511 /*
512 * Inherit the current cpu. If this cpu isn't part of the chosen
513 * lgroup, a new cpu will be chosen by cpu_choose when the thread
514 * is ready to run.
515 */
516 if (CPU->cpu_part == &cp_default)
517 t->t_cpu = CPU;
518 else
519 t->t_cpu = disp_lowpri_cpu(cp_default.cp_cpulist, t->t_lpl,
520 t->t_pri, NULL);
521
522 t->t_disp_queue = t->t_cpu->cpu_disp;
523 kpreempt_enable();
524
525 /*
526 * Initialize thread state and the dispatcher lock pointer.
527 * Need to hold onto pidlock to block allthreads walkers until
528 * the state is set.
529 */
530 switch (state) {
531 case TS_RUN:
532 curthread->t_oldspl = splhigh(); /* get dispatcher spl */
533 THREAD_SET_STATE(t, TS_STOPPED, &transition_lock);
534 CL_SETRUN(t);
535 thread_unlock(t);
536 break;
537
538 case TS_ONPROC:
539 THREAD_ONPROC(t, t->t_cpu);
540 break;
541
542 case TS_FREE:
543 /*
544 * Free state will be used for intr threads.
545 * The interrupt routine must set the thread dispatcher
546 * lock pointer (t_lockp) if starting on a CPU
547 * other than the current one.
548 */
549 THREAD_FREEINTR(t, CPU);
550 break;
551
552 case TS_STOPPED:
553 THREAD_SET_STATE(t, TS_STOPPED, &stop_lock);
554 break;
555
556 default: /* TS_SLEEP, TS_ZOMB or TS_TRANS */
557 cmn_err(CE_PANIC, "thread_create: invalid state %d", state);
558 }
559 mutex_exit(&pidlock);
560 return (t);
561 }
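/*
 * Illustrative sketch (not part of the original code): a typical kernel
 * caller creating a system daemon thread, as thread_init() does above for
 * the reaper.  The function name my_daemon is hypothetical.
 *
 *	static void my_daemon(void);
 *
 *	kthread_t *tp = thread_create(NULL, 0, (void (*)())my_daemon,
 *	    NULL, 0, &p0, TS_RUN, minclsyspri);
 *
 * Passing a NULL stack and zero size lets thread_create() carve both the
 * stack and the kthread_t out of a single segkp chunk, as described above.
 */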
562
563 /*
564 * Move thread to project0 and take care of project reference counters.
565 */
566 void
567 thread_rele(kthread_t *t)
568 {
569 kproject_t *kpj;
570
571 thread_lock(t);
572
573 ASSERT(t == curthread || t->t_state == TS_FREE || t->t_procp == &p0);
574 kpj = ttoproj(t);
575 t->t_proj = proj0p;
576
577 thread_unlock(t);
578
579 if (kpj != proj0p) {
580 project_rele(kpj);
581 (void) project_hold(proj0p);
582 }
583 }
584
585 void
586 thread_exit(void)
587 {
588 kthread_t *t = curthread;
589
590 if ((t->t_proc_flag & TP_ZTHREAD) != 0)
591 cmn_err(CE_PANIC, "thread_exit: zthread_exit() not called");
592
593 tsd_exit(); /* Clean up this thread's TSD */
594
595 kcpc_passivate(); /* clean up performance counter state */
596
597 /*
598 * No kernel thread should have called poll() without arranging
599 * to call pollcleanup() here.
600 */
601 ASSERT(t->t_pollstate == NULL);
602 ASSERT(t->t_schedctl == NULL);
603 if (t->t_door)
604 door_slam(); /* in case thread did an upcall */
605
606 #ifndef NPROBE
607 /* Kernel probe */
608 if (t->t_tnf_tpdp)
609 tnf_thread_exit();
610 #endif /* NPROBE */
611
612 thread_rele(t);
613 t->t_preempt++;
614
615 /*
616 * remove thread from the all threads list so that
617 * death-row can use the same pointers.
618 */
619 mutex_enter(&pidlock);
620 t->t_next->t_prev = t->t_prev;
621 t->t_prev->t_next = t->t_next;
622 ASSERT(allthreads != t); /* t0 never exits */
623 cv_broadcast(&t->t_joincv); /* wake up anyone in thread_join */
624 mutex_exit(&pidlock);
625
626 if (t->t_ctx != NULL)
627 exitctx(t);
628 if (t->t_procp->p_pctx != NULL)
629 exitpctx(t->t_procp);
630
631 if (kmem_stackinfo != 0) {
632 stkinfo_end(t);
633 }
634
635 t->t_state = TS_ZOMB; /* set zombie thread */
636
637 swtch_from_zombie(); /* give up the CPU */
638 /* NOTREACHED */
639 }
640
641 /*
642 * Check to see if the specified thread is active (defined as being on
643 * the thread list). This is certainly a slow way to do this; if there's
644 * ever a reason to speed it up, we could maintain a hash table of active
645 * threads indexed by their t_did.
646 */
647 static kthread_t *
648 did_to_thread(kt_did_t tid)
649 {
650 kthread_t *t;
651
652 ASSERT(MUTEX_HELD(&pidlock));
653 for (t = curthread->t_next; t != curthread; t = t->t_next) {
654 if (t->t_did == tid)
655 break;
656 }
657 if (t->t_did == tid)
658 return (t);
659 else
660 return (NULL);
661 }
662
663 /*
664 * Wait for specified thread to exit. Returns immediately if the thread
665 * could not be found, meaning that it has either already exited or never
666 * existed.
667 */
668 void
669 thread_join(kt_did_t tid)
670 {
671 kthread_t *t;
672
673 ASSERT(tid != curthread->t_did);
674 ASSERT(tid != t0.t_did);
675
676 mutex_enter(&pidlock);
677 /*
678 * Make sure we check that the thread is on the thread list
679 * before blocking on it; otherwise we could end up blocking on
680 * a cv that's already been freed. In other words, don't cache
681 * the thread pointer across calls to cv_wait.
682 *
683 * The choice of loop invariant means that whenever a thread
684 * is taken off the allthreads list, a cv_broadcast must be
685 * performed on that thread's t_joincv to wake up any waiters.
686 * The broadcast doesn't have to happen right away, but it
687 * shouldn't be postponed indefinitely (e.g., by doing it in
688 * thread_free which may only be executed when the deathrow
689 * queue is processed).
690 */
691 while (t = did_to_thread(tid))
692 cv_wait(&t->t_joincv, &pidlock);
693 mutex_exit(&pidlock);
694 }
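/*
 * Illustrative sketch (not part of the original code): since kthread
 * pointers must not be cached across exit, a creator that wants to wait
 * records the thread's t_did and joins on that.  my_worker is hypothetical.
 *
 *	kthread_t *tp = thread_create(NULL, 0, (void (*)())my_worker,
 *	    NULL, 0, &p0, TS_RUN, minclsyspri);
 *	kt_did_t tid = tp->t_did;
 *	...
 *	thread_join(tid);
 *
 * thread_join() returns once my_worker has been taken off the allthreads
 * list (or immediately if it already has).  The sketch assumes the worker
 * runs at least until the creator has sampled t_did.
 */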
695
696 void
697 thread_free_prevent(kthread_t *t)
698 {
699 kmutex_t *lp;
700
701 lp = &thread_free_lock[THREAD_FREE_HASH(t)].tf_lock;
702 mutex_enter(lp);
703 }
704
705 void
706 thread_free_allow(kthread_t *t)
707 {
708 kmutex_t *lp;
709
710 lp = &thread_free_lock[THREAD_FREE_HASH(t)].tf_lock;
711 mutex_exit(lp);
712 }
713
714 static void
715 thread_free_barrier(kthread_t *t)
716 {
717 kmutex_t *lp;
718
719 lp = &thread_free_lock[THREAD_FREE_HASH(t)].tf_lock;
720 mutex_enter(lp);
721 mutex_exit(lp);
722 }
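/*
 * Illustrative sketch (not part of the original code): the tick accounting
 * code pins a thread against thread_free() by bracketing its examination
 * with the pair above, roughly:
 *
 *	thread_free_prevent(t);
 *	(examine t's fields; t cannot be thread_free()d here)
 *	thread_free_allow(t);
 *
 * thread_free_barrier() is the reaper-side handshake: thread_free() grabs
 * and drops the same hash-selected lock so that any examiner that saw the
 * thread finishes before its memory is released.
 */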
723
724 void
725 thread_free(kthread_t *t)
726 {
727 boolean_t allocstk = (t->t_flag & T_TALLOCSTK);
728 klwp_t *lwp = t->t_lwp;
729 caddr_t swap = t->t_swap;
730
731 ASSERT(t != &t0 && t->t_state == TS_FREE);
732 ASSERT(t->t_door == NULL);
733 ASSERT(t->t_schedctl == NULL);
734 ASSERT(t->t_pollstate == NULL);
735
736 t->t_pri = 0;
737 t->t_pc = 0;
738 t->t_sp = 0;
739 t->t_wchan0 = NULL;
740 t->t_wchan = NULL;
741 if (t->t_cred != NULL) {
742 crfree(t->t_cred);
743 t->t_cred = 0;
744 }
745 if (t->t_pdmsg) {
746 kmem_free(t->t_pdmsg, strlen(t->t_pdmsg) + 1);
747 t->t_pdmsg = NULL;
748 }
749 if (audit_active)
750 audit_thread_free(t);
751 #ifndef NPROBE
752 if (t->t_tnf_tpdp)
753 tnf_thread_free(t);
754 #endif /* NPROBE */
755 if (t->t_cldata) {
756 CL_EXITCLASS(t->t_cid, (caddr_t *)t->t_cldata);
757 }
758 if (t->t_rprof != NULL) {
759 kmem_free(t->t_rprof, sizeof (*t->t_rprof));
760 t->t_rprof = NULL;
761 }
762 t->t_lockp = NULL; /* nothing should try to lock this thread now */
763 if (lwp)
764 lwp_freeregs(lwp, 0);
765 if (t->t_ctx)
766 freectx(t, 0);
767 t->t_stk = NULL;
768 if (lwp)
769 lwp_stk_fini(lwp);
770 lock_clear(&t->t_lock);
771
772 if (t->t_ts->ts_waiters > 0)
773 panic("thread_free: turnstile still active");
774
775 kmem_cache_free(turnstile_cache, t->t_ts);
776
777 free_afd(&t->t_activefd);
778
779 /*
780 * Barrier for the tick accounting code. The tick accounting code
781 * holds this lock to keep the thread from going away while it's
782 * looking at it.
783 */
784 thread_free_barrier(t);
785
786 ASSERT(ttoproj(t) == proj0p);
787 project_rele(ttoproj(t));
788
789 lgrp_affinity_free(&t->t_lgrp_affinity);
790
791 mutex_enter(&pidlock);
792 nthread--;
793 mutex_exit(&pidlock);
794
795 /*
796 * Free thread, lwp and stack. This needs to be done carefully, since
797 * if T_TALLOCSTK is set, the thread is part of the stack.
798 */
799 t->t_lwp = NULL;
800 t->t_swap = NULL;
801
802 if (swap) {
803 segkp_release(segkp, swap);
804 }
805 if (lwp) {
806 kmem_cache_free(lwp_cache, lwp);
807 }
808 if (!allocstk) {
809 kmem_cache_free(thread_cache, t);
810 }
811 }
812
813 /*
814 * Removes threads associated with the given zone from a deathrow queue.
815 * tp is a pointer to the head of the deathrow queue, and countp is a
816 * pointer to the current deathrow count. Returns a linked list of
817 * threads removed from the list.
818 */
819 static kthread_t *
820 thread_zone_cleanup(kthread_t **tp, int *countp, zoneid_t zoneid)
821 {
822 kthread_t *tmp, *list = NULL;
823 cred_t *cr;
824
825 ASSERT(MUTEX_HELD(&reaplock));
826 while (*tp != NULL) {
827 if ((cr = (*tp)->t_cred) != NULL && crgetzoneid(cr) == zoneid) {
828 tmp = *tp;
829 *tp = tmp->t_forw;
830 tmp->t_forw = list;
831 list = tmp;
832 (*countp)--;
833 } else {
834 tp = &(*tp)->t_forw;
835 }
836 }
837 return (list);
838 }
839
840 static void
841 thread_reap_list(kthread_t *t)
842 {
843 kthread_t *next;
844
845 while (t != NULL) {
846 next = t->t_forw;
847 thread_free(t);
848 t = next;
849 }
850 }
851
852 /* ARGSUSED */
853 static void
854 thread_zone_destroy(zoneid_t zoneid, void *unused)
855 {
856 kthread_t *t, *l;
857
858 mutex_enter(&reaplock);
859 /*
860 * Pull threads and lwps associated with zone off deathrow lists.
861 */
862 t = thread_zone_cleanup(&thread_deathrow, &thread_reapcnt, zoneid);
863 l = thread_zone_cleanup(&lwp_deathrow, &lwp_reapcnt, zoneid);
864 mutex_exit(&reaplock);
865
866 /*
867 * Guard against race condition in mutex_owner_running:
868 * thread=owner(mutex)
869 * <interrupt>
870 * thread exits mutex
871 * thread exits
872 * thread reaped
873 * thread struct freed
874 * cpu = thread->t_cpu <- BAD POINTER DEREFERENCE.
875 * A cross call to all cpus will cause the interrupt handler
876 * to reset the PC if it is in mutex_owner_running, refreshing
877 * stale thread pointers.
878 */
879 mutex_sync(); /* sync with mutex code */
880
881 /*
882 * Reap threads
883 */
884 thread_reap_list(t);
885
886 /*
887 * Reap lwps
888 */
889 thread_reap_list(l);
890 }
891
892 /*
893 * Clean up zombie threads that are on deathrow.
894 */
895 void
896 thread_reaper()
897 {
898 kthread_t *t, *l;
899 callb_cpr_t cprinfo;
900
901 /*
902 * Register callback to clean up threads when zone is destroyed.
903 */
904 zone_key_create(&zone_thread_key, NULL, NULL, thread_zone_destroy);
905
906 CALLB_CPR_INIT(&cprinfo, &reaplock, callb_generic_cpr, "t_reaper");
907 for (;;) {
908 mutex_enter(&reaplock);
909 while (thread_deathrow == NULL && lwp_deathrow == NULL) {
910 CALLB_CPR_SAFE_BEGIN(&cprinfo);
911 cv_wait(&reaper_cv, &reaplock);
912 CALLB_CPR_SAFE_END(&cprinfo, &reaplock);
913 }
914 /*
915 * mutex_sync() needs to be called when reaping, but
916 * not too often. We limit reaping rate to once
917 * per second. Reaplimit is max rate at which threads can
918 * be freed. Does not impact thread destruction/creation.
919 */
920 t = thread_deathrow;
921 l = lwp_deathrow;
922 thread_deathrow = NULL;
923 lwp_deathrow = NULL;
924 thread_reapcnt = 0;
925 lwp_reapcnt = 0;
926 mutex_exit(&reaplock);
927
928 /*
929 * Guard against race condition in mutex_owner_running:
930 * thread=owner(mutex)
931 * <interrupt>
932 * thread exits mutex
933 * thread exits
934 * thread reaped
935 * thread struct freed
936 * cpu = thread->t_cpu <- BAD POINTER DEREFERENCE.
937 * A cross call to all cpus will cause the interrupt handler
938 * to reset the PC if it is in mutex_owner_running, refreshing
939 * stale thread pointers.
940 */
941 mutex_sync(); /* sync with mutex code */
942 /*
943 * Reap threads
944 */
945 thread_reap_list(t);
946
947 /*
948 * Reap lwps
949 */
950 thread_reap_list(l);
951 delay(hz);
952 }
953 }
954
955 /*
956 * This is called by lwp_create(), etc., to move a thread from lwp_deathrow
957 * onto thread_deathrow. The thread's state has already been changed to
958 * TS_FREE to indicate that it is reapable, and its lwp and stack have
959 * already been claimed for reuse. The caller already holds the reaplock.
960 */
961 void
962 reapq_move_lq_to_tq(kthread_t *t)
963 {
964 ASSERT(t->t_state == TS_FREE);
965 ASSERT(MUTEX_HELD(&reaplock));
966 t->t_forw = thread_deathrow;
967 thread_deathrow = t;
968 thread_reapcnt++;
969 if (lwp_reapcnt + thread_reapcnt > reaplimit)
970 cv_signal(&reaper_cv); /* wake the reaper */
971 }
972
973 /*
974 * This is called by resume() to put a zombie thread onto deathrow.
975 * The thread's state is changed to TS_FREE to indicate that it is reapable.
976 * This is called from the idle thread so it must not block - just spin.
977 */
978 void
979 reapq_add(kthread_t *t)
980 {
981 mutex_enter(&reaplock);
982
983 /*
984 * lwp_deathrow contains threads with lwp linkage and
985 * swappable thread stacks which have the default stacksize.
986 * These threads' lwps and stacks may be reused by lwp_create().
987 *
988 * Anything else goes on thread_deathrow(), where it will eventually
989 * be thread_free()d.
990 */
991 if (t->t_flag & T_LWPREUSE) {
992 ASSERT(ttolwp(t) != NULL);
993 t->t_forw = lwp_deathrow;
994 lwp_deathrow = t;
995 lwp_reapcnt++;
996 } else {
997 t->t_forw = thread_deathrow;
998 thread_deathrow = t;
999 thread_reapcnt++;
1000 }
1001 if (lwp_reapcnt + thread_reapcnt > reaplimit)
1002 cv_signal(&reaper_cv); /* wake the reaper */
1003 t->t_state = TS_FREE;
1004 lock_clear(&t->t_lock);
1005
1006 /*
1007 * Before we return, we need to grab and drop the thread lock for
1008 * the dead thread. At this point, the current thread is the idle
1009 * thread, and the dead thread's CPU lock points to the current
1010 * CPU -- and we must grab and drop the lock to synchronize with
1011 * a racing thread walking a blocking chain that the zombie thread
1012 * was recently in. By this point, that blocking chain is (by
1013 * definition) stale: the dead thread is not holding any locks, and
1014 * is therefore not in any blocking chains -- but if we do not regrab
1015 * our lock before freeing the dead thread's data structures, the
1016 * thread walking the (stale) blocking chain will die on memory
1017 * corruption when it attempts to drop the dead thread's lock. We
1018 * only need do this once because there is no way for the dead thread
1019 * to ever again be on a blocking chain: once we have grabbed and
1020 * dropped the thread lock, we are guaranteed that anyone that could
1021 * have seen this thread in a blocking chain can no longer see it.
1022 */
1023 thread_lock(t);
1024 thread_unlock(t);
1025
1026 mutex_exit(&reaplock);
1027 }
1028
1029 /*
1030 * Install thread context ops for the current thread.
1031 */
1032 void
1033 installctx(
1034 kthread_t *t,
1035 void *arg,
1036 void (*save)(void *),
1037 void (*restore)(void *),
1038 void (*fork)(void *, void *),
1039 void (*lwp_create)(void *, void *),
1040 void (*exit)(void *),
1041 void (*free)(void *, int))
1042 {
1043 struct ctxop *ctx;
1044
1045 ctx = kmem_alloc(sizeof (struct ctxop), KM_SLEEP);
1046 ctx->save_op = save;
1047 ctx->restore_op = restore;
1048 ctx->fork_op = fork;
1049 ctx->lwp_create_op = lwp_create;
1050 ctx->exit_op = exit;
1051 ctx->free_op = free;
1052 ctx->arg = arg;
1053 ctx->next = t->t_ctx;
1054 t->t_ctx = ctx;
1055 }
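/*
 * Illustrative sketch (not part of the original code): a subsystem that
 * keeps per-thread hardware or software state typically installs save and
 * restore ops on curthread and later removes the identical tuple.  The
 * my_* names are hypothetical.
 *
 *	installctx(curthread, my_state, my_save, my_restore,
 *	    NULL, NULL, my_exit, my_free);
 *	...
 *	(void) removectx(curthread, my_state, my_save, my_restore,
 *	    NULL, NULL, my_exit, my_free);
 *
 * removectx() matches on every op pointer and on arg, so the same values
 * must be passed to both calls.
 */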
1056
1057 /*
1058 * Remove the thread context ops from a thread.
1059 */
1060 int
1061 removectx(
1062 kthread_t *t,
1063 void *arg,
1064 void (*save)(void *),
1065 void (*restore)(void *),
1066 void (*fork)(void *, void *),
1067 void (*lwp_create)(void *, void *),
1068 void (*exit)(void *),
1069 void (*free)(void *, int))
1070 {
1071 struct ctxop *ctx, *prev_ctx;
1072
1073 /*
1074 * The incoming kthread_t (which is the thread for which the
1075 * context ops will be removed) should be one of the following:
1076 *
1077 * a) the current thread,
1078 *
1079 * b) a thread of a process that's being forked (SIDL),
1080 *
1081 * c) a thread that belongs to the same process as the current
1082 * thread and for which the current thread is the agent thread,
1083 *
1084 * d) a thread that is TS_STOPPED, which (when curthread is not an
1085 * agent thread) indicates a thread being created as part of an
1086 * lwp creation.
1087 */
1088 ASSERT(t == curthread || ttoproc(t)->p_stat == SIDL ||
1089 ttoproc(t)->p_agenttp == curthread || t->t_state == TS_STOPPED);
1090
1091 /*
1092 * Serialize modifications to t->t_ctx to prevent the agent thread
1093 * and the target thread from racing with each other during lwp exit.
1094 */
1095 mutex_enter(&t->t_ctx_lock);
1096 prev_ctx = NULL;
1097 kpreempt_disable();
1098 for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next) {
1099 if (ctx->save_op == save && ctx->restore_op == restore &&
1100 ctx->fork_op == fork && ctx->lwp_create_op == lwp_create &&
1101 ctx->exit_op == exit && ctx->free_op == free &&
1102 ctx->arg == arg) {
1103 if (prev_ctx)
1104 prev_ctx->next = ctx->next;
1105 else
1106 t->t_ctx = ctx->next;
1107 mutex_exit(&t->t_ctx_lock);
1108 if (ctx->free_op != NULL)
1109 (ctx->free_op)(ctx->arg, 0);
1110 kmem_free(ctx, sizeof (struct ctxop));
1111 kpreempt_enable();
1112 return (1);
1113 }
1114 prev_ctx = ctx;
1115 }
1116 mutex_exit(&t->t_ctx_lock);
1117 kpreempt_enable();
1118
1119 return (0);
1120 }
1121
1122 void
1123 savectx(kthread_t *t)
1124 {
1125 struct ctxop *ctx;
1126
1127 ASSERT(t == curthread);
1128 for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next)
1129 if (ctx->save_op != NULL)
1130 (ctx->save_op)(ctx->arg);
1131 }
1132
1133 void
1134 restorectx(kthread_t *t)
1135 {
1136 struct ctxop *ctx;
1137
1138 ASSERT(t == curthread);
1139 for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next)
1140 if (ctx->restore_op != NULL)
1141 (ctx->restore_op)(ctx->arg);
1142 }
1143
1144 void
1145 forkctx(kthread_t *t, kthread_t *ct)
1146 {
1147 struct ctxop *ctx;
1148
1149 for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
1150 if (ctx->fork_op != NULL)
1151 (ctx->fork_op)(t, ct);
1152 }
1153
1154 /*
1155 * Note that this operator is only invoked via the _lwp_create
1156 * system call. The system may have other reasons to create lwps,
1157 * e.g. the agent lwp or the doors unreferenced lwp.
1158 */
1159 void
1160 lwp_createctx(kthread_t *t, kthread_t *ct)
1161 {
1162 struct ctxop *ctx;
1163
1164 for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
1165 if (ctx->lwp_create_op != NULL)
1166 (ctx->lwp_create_op)(t, ct);
1167 }
1168
1169 /*
1170 * exitctx is called from thread_exit() and lwp_exit() to perform any actions
1171 * needed when the thread/LWP leaves the processor for the last time. This
1172 * routine is not intended to deal with freeing memory; freectx() is used for
1173 * that purpose during thread_free(). This routine is provided to allow for
1174 * clean-up that can't wait until thread_free().
1175 */
1176 void
1177 exitctx(kthread_t *t)
1178 {
1179 struct ctxop *ctx;
1180
1181 for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
1182 if (ctx->exit_op != NULL)
1183 (ctx->exit_op)(t);
1184 }
1185
1186 /*
1187 * freectx is called from thread_free() and exec() to get
1188 * rid of old thread context ops.
1189 */
1190 void
1191 freectx(kthread_t *t, int isexec)
1192 {
1193 struct ctxop *ctx;
1194
1195 kpreempt_disable();
1196 while ((ctx = t->t_ctx) != NULL) {
1197 t->t_ctx = ctx->next;
1198 if (ctx->free_op != NULL)
1199 (ctx->free_op)(ctx->arg, isexec);
1200 kmem_free(ctx, sizeof (struct ctxop));
1201 }
1202 kpreempt_enable();
1203 }
1204
1205 /*
1206 * freectx_ctx is called from lwp_create() when lwp is reused from
1207 * lwp_deathrow and its thread structure is added to thread_deathrow.
1208 * The thread structure to which this ctx was attached may already have
1209 * been freed by the thread reaper, so free_op implementations shouldn't
1210 * rely on that thread structure still being around.
1211 */
1212 void
1213 freectx_ctx(struct ctxop *ctx)
1214 {
1215 struct ctxop *nctx;
1216
1217 ASSERT(ctx != NULL);
1218
1219 kpreempt_disable();
1220 do {
1221 nctx = ctx->next;
1222 if (ctx->free_op != NULL)
1223 (ctx->free_op)(ctx->arg, 0);
1224 kmem_free(ctx, sizeof (struct ctxop));
1225 } while ((ctx = nctx) != NULL);
1226 kpreempt_enable();
1227 }
1228
1229 /*
1230 * Set the thread running; arrange for it to be swapped in if necessary.
1231 */
1232 void
1233 setrun_locked(kthread_t *t)
1234 {
1235 ASSERT(THREAD_LOCK_HELD(t));
1236 if (t->t_state == TS_SLEEP) {
1237 /*
1238 * Take off sleep queue.
1239 */
1240 SOBJ_UNSLEEP(t->t_sobj_ops, t);
1241 } else if (t->t_state & (TS_RUN | TS_ONPROC)) {
1242 /*
1243 * Already on dispatcher queue.
1244 */
1245 return;
1246 } else if (t->t_state == TS_WAIT) {
1247 waitq_setrun(t);
1248 } else if (t->t_state == TS_STOPPED) {
1249 /*
1250 * All of the sending of SIGCONT (TC_XSTART) and /proc
1251 * (TC_PSTART) and lwp_continue() (TC_CSTART) must have
1252 * requested that the thread be run.
1253 * Just calling setrun() is not sufficient to set a stopped
1254 * thread running. TS_XSTART is always set if the thread
1255 * is not stopped by a jobcontrol stop signal.
1256 * TS_PSTART is always set if /proc is not controlling it.
1257 * TS_CSTART is always set if lwp_suspend() didn't stop it.
1258 * The thread won't be stopped unless one of these
1259 * three mechanisms did it.
1260 *
1261 * These flags must be set before calling setrun_locked(t).
1262 * They can't be passed as arguments because the streams
1263 * code calls setrun() indirectly and the mechanism for
1264 * doing so admits only one argument. Note that the
1265 * thread must be locked in order to change t_schedflags.
1266 */
1267 if ((t->t_schedflag & TS_ALLSTART) != TS_ALLSTART)
1268 return;
1269 /*
1270 * Process is no longer stopped (a thread is running).
1271 */
1272 t->t_whystop = 0;
1273 t->t_whatstop = 0;
1274 /*
1275 * Strictly speaking, we do not have to clear these
1276 * flags here; they are cleared on entry to stop().
1277 * However, they are confusing when doing kernel
1278 * debugging or when they are revealed by ps(1).
1279 */
1280 t->t_schedflag &= ~TS_ALLSTART;
1281 THREAD_TRANSITION(t); /* drop stopped-thread lock */
1282 ASSERT(t->t_lockp == &transition_lock);
1283 ASSERT(t->t_wchan0 == NULL && t->t_wchan == NULL);
1284 /*
1285 * Let the class put the process on the dispatcher queue.
1286 */
1287 CL_SETRUN(t);
1288 }
1289 }
1290
1291 void
1292 setrun(kthread_t *t)
1293 {
1294 thread_lock(t);
1295 setrun_locked(t);
1296 thread_unlock(t);
1297 }
1298
1299 /*
1300 * Unpin an interrupted thread.
1301 * When an interrupt occurs, the interrupt is handled on the stack
1302 * of an interrupt thread, taken from a pool linked to the CPU structure.
1303 *
1304 * When swtch() is switching away from an interrupt thread because it
1305 * blocked or was preempted, this routine is called to complete the
1306 * saving of the interrupted thread state, and returns the interrupted
1307 * thread pointer so it may be resumed.
1308 *
1309 * Called by swtch() only at high spl.
1310 */
1311 kthread_t *
1312 thread_unpin()
1313 {
1314 kthread_t *t = curthread; /* current thread */
1315 kthread_t *itp; /* interrupted thread */
1316 int i; /* interrupt level */
1317 extern int intr_passivate();
1318
1319 ASSERT(t->t_intr != NULL);
1320
1321 itp = t->t_intr; /* interrupted thread */
1322 t->t_intr = NULL; /* clear interrupt ptr */
1323
1324 /*
1325 * Get state from interrupt thread for the one
1326 * it interrupted.
1327 */
1328
1329 i = intr_passivate(t, itp);
1330
1331 TRACE_5(TR_FAC_INTR, TR_INTR_PASSIVATE,
1332 "intr_passivate:level %d curthread %p (%T) ithread %p (%T)",
1333 i, t, t, itp, itp);
1334
1335 /*
1336 * Dissociate the current thread from the interrupted thread's LWP.
1337 */
1338 t->t_lwp = NULL;
1339
1340 /*
1341 * Interrupt handlers above the level that spinlocks block must
1342 * not block.
1343 */
1344 #if DEBUG
1345 if (i < 0 || i > LOCK_LEVEL)
1346 cmn_err(CE_PANIC, "thread_unpin: ipl out of range %x", i);
1347 #endif
1348
1349 /*
1350 * Compute the CPU's base interrupt level based on the active
1351 * interrupts.
1352 */
1353 ASSERT(CPU->cpu_intr_actv & (1 << i));
1354 set_base_spl();
1355
1356 return (itp);
1357 }
1358
1359 /*
1360 * Create and initialize an interrupt thread and link it onto the
1361 * CPU's pool of interrupt threads.
1362 * Called at spl7() or better.
1363 */
1364 void
1365 thread_create_intr(struct cpu *cp)
1366 {
1367 kthread_t *tp;
1368
1369 tp = thread_create(NULL, 0,
1370 (void (*)())thread_create_intr, NULL, 0, &p0, TS_ONPROC, 0);
1371
1372 /*
1373 * Set the thread in the TS_FREE state. The state will change
1374 * to TS_ONPROC only while the interrupt is active. Think of these
1375 * as being on a private free list for the CPU. Being TS_FREE keeps
1376 * inactive interrupt threads out of debugger thread lists.
1377 *
1378 * We cannot call thread_create with TS_FREE because of the current
1379 * checks there for ONPROC. Fix this when thread_create takes flags.
1380 */
1381 THREAD_FREEINTR(tp, cp);
1382
1383 /*
1384 * Nobody should ever reference the credentials of an interrupt
1385 * thread so make it NULL to catch any such references.
1386 */
1387 tp->t_cred = NULL;
1388 tp->t_flag |= T_INTR_THREAD;
1389 tp->t_cpu = cp;
1390 tp->t_bound_cpu = cp;
1391 tp->t_disp_queue = cp->cpu_disp;
1392 tp->t_affinitycnt = 1;
1393 tp->t_preempt = 1;
1394
1395 /*
1396 * Don't make a user-requested binding on this thread so that
1397 * the processor can be offlined.
1398 */
1399 tp->t_bind_cpu = PBIND_NONE; /* no USER-requested binding */
1400 tp->t_bind_pset = PS_NONE;
1401
1402 #if defined(__i386) || defined(__amd64)
1403 tp->t_stk -= STACK_ALIGN;
1404 *(tp->t_stk) = 0; /* terminate intr thread stack */
1405 #endif
1406
1407 /*
1408 * Link onto CPU's interrupt pool.
1409 */
1410 tp->t_link = cp->cpu_intr_thread;
1411 cp->cpu_intr_thread = tp;
1412 }
1413
1414 /*
1415 * TSD -- THREAD SPECIFIC DATA
1416 */
1417 static kmutex_t tsd_mutex; /* linked list spin lock */
1418 static uint_t tsd_nkeys; /* size of destructor array */
1419 /* per-key destructor funcs */
1420 static void (**tsd_destructor)(void *);
1421 /* list of tsd_thread's */
1422 static struct tsd_thread *tsd_list;
1423
1424 /*
1425 * Default destructor
1426 * Needed because NULL destructor means that the key is unused
1427 */
1428 /* ARGSUSED */
1429 void
1430 tsd_defaultdestructor(void *value)
1431 {}
1432
1433 /*
1434 * Create a key (index into per thread array)
1435 * Locks out tsd_create, tsd_destroy, and tsd_exit
1436 * May allocate memory with lock held
1437 */
1438 void
1439 tsd_create(uint_t *keyp, void (*destructor)(void *))
1440 {
1441 int i;
1442 uint_t nkeys;
1443
1444 /*
1445 * if key is allocated, do nothing
1446 */
1447 mutex_enter(&tsd_mutex);
1448 if (*keyp) {
1449 mutex_exit(&tsd_mutex);
1450 return;
1451 }
1452 /*
1453 * find an unused key
1454 */
1455 if (destructor == NULL)
1456 destructor = tsd_defaultdestructor;
1457
1458 for (i = 0; i < tsd_nkeys; ++i)
1459 if (tsd_destructor[i] == NULL)
1460 break;
1461
1462 /*
1463 * if no unused keys, increase the size of the destructor array
1464 */
1465 if (i == tsd_nkeys) {
1466 if ((nkeys = (tsd_nkeys << 1)) == 0)
1467 nkeys = 1;
1468 tsd_destructor =
1469 (void (**)(void *))tsd_realloc((void *)tsd_destructor,
1470 (size_t)(tsd_nkeys * sizeof (void (*)(void *))),
1471 (size_t)(nkeys * sizeof (void (*)(void *))));
1472 tsd_nkeys = nkeys;
1473 }
1474
1475 /*
1476 * allocate the next available unused key
1477 */
1478 tsd_destructor[i] = destructor;
1479 *keyp = i + 1;
1480 mutex_exit(&tsd_mutex);
1481 }
1482
1483 /*
1484 * Destroy a key -- this is for unloadable modules
1485 *
1486 * Assumes that the caller is preventing tsd_set and tsd_get
1487 * Locks out tsd_create, tsd_destroy, and tsd_exit
1488 * May free memory with lock held
1489 */
1490 void
1491 tsd_destroy(uint_t *keyp)
1492 {
1493 uint_t key;
1494 struct tsd_thread *tsd;
1495
1496 /*
1497 * protect the key namespace and our destructor lists
1498 */
1499 mutex_enter(&tsd_mutex);
1500 key = *keyp;
1501 *keyp = 0;
1502
1503 ASSERT(key <= tsd_nkeys);
1504
1505 /*
1506 * if the key is valid
1507 */
1508 if (key != 0) {
1509 uint_t k = key - 1;
1510 /*
1511 * for every thread with TSD, call key's destructor
1512 */
1513 for (tsd = tsd_list; tsd; tsd = tsd->ts_next) {
1514 /*
1515 * no TSD for key in this thread
1516 */
1517 if (key > tsd->ts_nkeys)
1518 continue;
1519 /*
1520 * call destructor for key
1521 */
1522 if (tsd->ts_value[k] && tsd_destructor[k])
1523 (*tsd_destructor[k])(tsd->ts_value[k]);
1524 /*
1525 * reset value for key
1526 */
1527 tsd->ts_value[k] = NULL;
1528 }
1529 /*
1530 * actually free the key (NULL destructor == unused)
1531 */
1532 tsd_destructor[k] = NULL;
1533 }
1534
1535 mutex_exit(&tsd_mutex);
1536 }
1537
1538 /*
1539 * Quickly return the per thread value that was stored with the specified key
1540 * Assumes the caller is protecting key from tsd_create and tsd_destroy
1541 */
1542 void *
1543 tsd_get(uint_t key)
1544 {
1545 return (tsd_agent_get(curthread, key));
1546 }
1547
1548 /*
1549 * Set a per thread value indexed with the specified key
1550 */
1551 int
1552 tsd_set(uint_t key, void *value)
1553 {
1554 return (tsd_agent_set(curthread, key, value));
1555 }
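/*
 * Illustrative sketch (not part of the original code): the usual key life
 * cycle for a module using TSD.  my_key, my_dtor and my_data are
 * hypothetical.
 *
 *	static uint_t my_key;
 *
 *	tsd_create(&my_key, my_dtor);		(once, e.g. at _init())
 *	(void) tsd_set(my_key, my_data);	(per thread)
 *	my_data = tsd_get(my_key);
 *	tsd_destroy(&my_key);			(e.g. at _fini())
 *
 * A key value of zero means "not yet created", which is why tsd_create()
 * hands out index + 1 and tsd_set()/tsd_get() reject key == 0.
 */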
1556
1557 /*
1558 * Like tsd_get(), except that the agent lwp can get the tsd of
1559 * another thread in the same process (the agent thread only runs when the
1560 * process is completely stopped by /proc), or syslwp is creating a new lwp.
1561 */
1562 void *
1563 tsd_agent_get(kthread_t *t, uint_t key)
1564 {
1565 struct tsd_thread *tsd = t->t_tsd;
1566
1567 ASSERT(t == curthread ||
1568 ttoproc(t)->p_agenttp == curthread || t->t_state == TS_STOPPED);
1569
1570 if (key && tsd != NULL && key <= tsd->ts_nkeys)
1571 return (tsd->ts_value[key - 1]);
1572 return (NULL);
1573 }
1574
1575 /*
1576 * Like tsd_set(), except that the agent lwp can set the tsd of
1577 * another thread in the same process, or syslwp can set the tsd
1578 * of a thread it's in the middle of creating.
1579 *
1580 * Assumes the caller is protecting key from tsd_create and tsd_destroy
1581 * May lock out tsd_destroy (and tsd_create), may allocate memory with
1582 * lock held
1583 */
1584 int
1585 tsd_agent_set(kthread_t *t, uint_t key, void *value)
1586 {
1587 struct tsd_thread *tsd = t->t_tsd;
1588
1589 ASSERT(t == curthread ||
1590 ttoproc(t)->p_agenttp == curthread || t->t_state == TS_STOPPED);
1591
1592 if (key == 0)
1593 return (EINVAL);
1594 if (tsd == NULL)
1595 tsd = t->t_tsd = kmem_zalloc(sizeof (*tsd), KM_SLEEP);
1596 if (key <= tsd->ts_nkeys) {
1597 tsd->ts_value[key - 1] = value;
1598 return (0);
1599 }
1600
1601 ASSERT(key <= tsd_nkeys);
1602
1603 /*
1604 * lock out tsd_destroy()
1605 */
1606 mutex_enter(&tsd_mutex);
1607 if (tsd->ts_nkeys == 0) {
1608 /*
1609 * Link onto list of threads with TSD
1610 */
1611 if ((tsd->ts_next = tsd_list) != NULL)
1612 tsd_list->ts_prev = tsd;
1613 tsd_list = tsd;
1614 }
1615
1616 /*
1617 * Allocate thread local storage and set the value for key
1618 */
1619 tsd->ts_value = tsd_realloc(tsd->ts_value,
1620 tsd->ts_nkeys * sizeof (void *),
1621 key * sizeof (void *));
1622 tsd->ts_nkeys = key;
1623 tsd->ts_value[key - 1] = value;
1624 mutex_exit(&tsd_mutex);
1625
1626 return (0);
1627 }
1628
1629
1630 /*
1631 * Return the per thread value that was stored with the specified key
1632 * If necessary, create the key and the value
1633 * Assumes the caller is protecting *keyp from tsd_destroy
1634 */
1635 void *
1636 tsd_getcreate(uint_t *keyp, void (*destroy)(void *), void *(*allocate)(void))
1637 {
1638 void *value;
1639 uint_t key = *keyp;
1640 struct tsd_thread *tsd = curthread->t_tsd;
1641
1642 if (tsd == NULL)
1643 tsd = curthread->t_tsd = kmem_zalloc(sizeof (*tsd), KM_SLEEP);
1644 if (key && key <= tsd->ts_nkeys && (value = tsd->ts_value[key - 1]))
1645 return (value);
1646 if (key == 0)
1647 tsd_create(keyp, destroy);
1648 (void) tsd_set(*keyp, value = (*allocate)());
1649
1650 return (value);
1651 }
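/*
 * Illustrative sketch (not part of the original code): tsd_getcreate()
 * supports lazy, first-use allocation, so a caller can collapse the
 * create/set/get dance above into one call.  Names are hypothetical.
 *
 *	static uint_t my_key;
 *
 *	my_state_t *sp = tsd_getcreate(&my_key, my_dtor, my_alloc);
 *
 * The first call on a given thread creates the key (if needed), invokes
 * my_alloc() and caches the result; later calls simply return the cached
 * per-thread value.
 */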
1652
1653 /*
1654 * Called from thread_exit() to run the destructor function for each tsd
1655 * Locks out tsd_create and tsd_destroy
1656 * Assumes that the destructor *DOES NOT* use tsd
1657 */
1658 void
1659 tsd_exit(void)
1660 {
1661 int i;
1662 struct tsd_thread *tsd = curthread->t_tsd;
1663
1664 if (tsd == NULL)
1665 return;
1666
1667 if (tsd->ts_nkeys == 0) {
1668 kmem_free(tsd, sizeof (*tsd));
1669 curthread->t_tsd = NULL;
1670 return;
1671 }
1672
1673 /*
1674 * lock out tsd_create and tsd_destroy, call
1675 * the destructor, and mark the value as destroyed.
1676 */
1677 mutex_enter(&tsd_mutex);
1678
1679 for (i = 0; i < tsd->ts_nkeys; i++) {
1680 if (tsd->ts_value[i] && tsd_destructor[i])
1681 (*tsd_destructor[i])(tsd->ts_value[i]);
1682 tsd->ts_value[i] = NULL;
1683 }
1684
1685 /*
1686 * remove from linked list of threads with TSD
1687 */
1688 if (tsd->ts_next)
1689 tsd->ts_next->ts_prev = tsd->ts_prev;
1690 if (tsd->ts_prev)
1691 tsd->ts_prev->ts_next = tsd->ts_next;
1692 if (tsd_list == tsd)
1693 tsd_list = tsd->ts_next;
1694
1695 mutex_exit(&tsd_mutex);
1696
1697 /*
1698 * free up the TSD
1699 */
1700 kmem_free(tsd->ts_value, tsd->ts_nkeys * sizeof (void *));
1701 kmem_free(tsd, sizeof (struct tsd_thread));
1702 curthread->t_tsd = NULL;
1703 }
1704
1705 /*
1706 * realloc
1707 */
1708 static void *
1709 tsd_realloc(void *old, size_t osize, size_t nsize)
1710 {
1711 void *new;
1712
1713 new = kmem_zalloc(nsize, KM_SLEEP);
1714 if (old) {
1715 bcopy(old, new, osize);
1716 kmem_free(old, osize);
1717 }
1718 return (new);
1719 }
1720
1721 /*
1722 * Return non-zero if an interrupt is being serviced.
1723 */
1724 int
1725 servicing_interrupt()
1726 {
1727 int onintr = 0;
1728
1729 /* Are we an interrupt thread? */
1730 if (curthread->t_flag & T_INTR_THREAD)
1731 return (1);
1732 /* Are we servicing a high level interrupt? */
1733 if (CPU_ON_INTR(CPU)) {
1734 kpreempt_disable();
1735 onintr = CPU_ON_INTR(CPU);
1736 kpreempt_enable();
1737 }
1738 return (onintr);
1739 }
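/*
 * Illustrative sketch (not part of the original code): callers commonly use
 * this to decide whether blocking is permissible in shared code paths, for
 * example when picking kmem flags:
 *
 *	int kmflag = servicing_interrupt() ? KM_NOSLEEP : KM_SLEEP;
 *	buf = kmem_alloc(size, kmflag);
 *
 * This is only a sketch; whether sleeping is legal also depends on the
 * locks held by the caller.
 */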
1740
1741
1742 /*
1743 * Change the dispatch priority of a thread in the system.
1744 * Used when raising or lowering a thread's priority.
1745 * (E.g., priority inheritance)
1746 *
1747 * Since threads are queued according to their priority, we
1748 * must check the thread's state to determine whether it
1749 * is on a queue somewhere. If it is, we've got to:
1750 *
1751 * o Dequeue the thread.
1752 * o Change its effective priority.
1753 * o Enqueue the thread.
1754 *
1755 * Assumptions: The thread whose priority we wish to change
1756 * must be locked before we call thread_change_(e)pri().
1757 * The thread_change(e)pri() function doesn't drop the thread
1758 * lock--that must be done by its caller.
1759 */
1760 void
1761 thread_change_epri(kthread_t *t, pri_t disp_pri)
1762 {
1763 uint_t state;
1764
1765 ASSERT(THREAD_LOCK_HELD(t));
1766
1767 /*
1768 * If the inherited priority hasn't actually changed,
1769 * just return.
1770 */
1771 if (t->t_epri == disp_pri)
1772 return;
1773
1774 state = t->t_state;
1775
1776 /*
1777 * If it's not on a queue, change the priority with impunity.
1778 */
1779 if ((state & (TS_SLEEP | TS_RUN | TS_WAIT)) == 0) {
1780 t->t_epri = disp_pri;
1781 if (state == TS_ONPROC) {
1782 cpu_t *cp = t->t_disp_queue->disp_cpu;
1783
1784 if (t == cp->cpu_dispthread)
1785 cp->cpu_dispatch_pri = DISP_PRIO(t);
1786 }
1787 } else if (state == TS_SLEEP) {
1788 /*
1789 * Take the thread out of its sleep queue.
1790 * Change the inherited priority.
1791 * Re-enqueue the thread.
1792 * Each synchronization object exports a function
1793 * to do this in an appropriate manner.
1794 */
1795 SOBJ_CHANGE_EPRI(t->t_sobj_ops, t, disp_pri);
1796 } else if (state == TS_WAIT) {
1797 /*
1798 * Re-enqueue a thread on the wait queue if its
1799 * effective priority needs to change.
1800 */
1801 if (disp_pri != t->t_epri)
1802 waitq_change_pri(t, disp_pri);
1803 } else {
1804 /*
1805 * The thread is on a run queue.
1806 * Note: setbackdq() may not put the thread
1807 * back on the same run queue where it originally
1808 * resided.
1809 */
1810 (void) dispdeq(t);
1811 t->t_epri = disp_pri;
1812 setbackdq(t);
1813 }
1814 schedctl_set_cidpri(t);
1815 }
1816
1817 /*
1818 * Function: Change the t_pri field of a thread.
1819 * Side Effects: Adjust the thread ordering on a run queue
1820 * or sleep queue, if necessary.
1821 * Returns: 1 if the thread was on a run queue, else 0.
1822 */
1823 int
1824 thread_change_pri(kthread_t *t, pri_t disp_pri, int front)
1825 {
1826 uint_t state;
1827 int on_rq = 0;
1828
1829 ASSERT(THREAD_LOCK_HELD(t));
1830
1831 state = t->t_state;
1832 THREAD_WILLCHANGE_PRI(t, disp_pri);
1833
1834 /*
1835 * If it's not on a queue, change the priority with impunity.
1836 */
1837 if ((state & (TS_SLEEP | TS_RUN | TS_WAIT)) == 0) {
1838 t->t_pri = disp_pri;
1839
1840 if (state == TS_ONPROC) {
1841 cpu_t *cp = t->t_disp_queue->disp_cpu;
1842
1843 if (t == cp->cpu_dispthread)
1844 cp->cpu_dispatch_pri = DISP_PRIO(t);
1845 }
1846 } else if (state == TS_SLEEP) {
1847 /*
1848 * If the priority has changed, take the thread out of
1849 * its sleep queue and change the priority.
1850 * Re-enqueue the thread.
1851 * Each synchronization object exports a function
1852 * to do this in an appropriate manner.
1853 */
1854 if (disp_pri != t->t_pri)
1855 SOBJ_CHANGE_PRI(t->t_sobj_ops, t, disp_pri);
1856 } else if (state == TS_WAIT) {
1857 /*
1858 * Re-enqueue a thread on the wait queue if its
1859 * priority needs to change.
1860 */
1861 if (disp_pri != t->t_pri)
1862 waitq_change_pri(t, disp_pri);
1863 } else {
1864 /*
1865 * The thread is on a run queue.
1866 * Note: setbackdq() may not put the thread
1867 * back on the same run queue where it originally
1868 * resided.
1869 *
1870 * We still requeue the thread even if the priority
1871 * is unchanged to preserve round-robin (and other)
1872 * effects between threads of the same priority.
1873 */
1874 on_rq = dispdeq(t);
1875 ASSERT(on_rq);
1876 t->t_pri = disp_pri;
1877 if (front) {
1878 setfrontdq(t);
1879 } else {
1880 setbackdq(t);
1881 }
1882 }
1883 schedctl_set_cidpri(t);
1884 return (on_rq);
1885 }
1886
1887 /*
1888 * If the tunable kmem_stackinfo is set, fill the kernel thread stack
1889 * with a specific pattern.
1890 */
1891 static void
1892 stkinfo_begin(kthread_t *t)
1893 {
1894 caddr_t start; /* stack start */
1895 caddr_t end; /* stack end */
1896 uint64_t *ptr; /* pattern pointer */
1897
1898 /*
1899 * Stack grows up or down, see thread_create(),
1900 * compute stack memory area start and end (start < end).
1901 */
1902 if (t->t_stk > t->t_stkbase) {
1903 /* stack grows down */
1904 start = t->t_stkbase;
1905 end = t->t_stk;
1906 } else {
1907 /* stack grows up */
1908 start = t->t_stk;
1909 end = t->t_stkbase;
1910 }
1911
1912 /*
1913 * Stackinfo pattern size is 8 bytes. Ensure proper 8-byte
1914 * alignment for start and end within the stack area boundaries
1915 * (protection against corrupt t_stkbase/t_stk data).
1916 */
1917 if ((((uintptr_t)start) & 0x7) != 0) {
1918 start = (caddr_t)((((uintptr_t)start) & (~0x7)) + 8);
1919 }
1920 end = (caddr_t)(((uintptr_t)end) & (~0x7));
1921
1922 if ((end <= start) || (end - start) > (1024 * 1024)) {
1923 /* negative or stack size > 1 meg, assume bogus */
1924 return;
1925 }
1926
1927 /* fill stack area with a pattern (instead of zeros) */
1928 ptr = (uint64_t *)((void *)start);
1929 while (ptr < (uint64_t *)((void *)end)) {
1930 *ptr++ = KMEM_STKINFO_PATTERN;
1931 }
1932 }
1933
1934
1935 /*
1936 * If the tunable kmem_stackinfo is set, create the stackinfo log if it
1937 * doesn't already exist, compute the percentage of kernel stack actually
1938 * used, and record it in the log if it ranks among the highest seen so far.
1939 */
1940 static void
1941 stkinfo_end(kthread_t *t)
1942 {
1943 caddr_t start; /* stack start */
1944 caddr_t end; /* stack end */
1945 uint64_t *ptr; /* pattern pointer */
1946 size_t stksz; /* stack size */
1947 size_t smallest = 0;
1948 size_t percent = 0;
1949 uint_t index = 0;
1950 uint_t i;
1951 static size_t smallest_percent = (size_t)-1;
1952 static uint_t full = 0;
1953
1954 /* create the stackinfo log, if it doesn't already exist */
1955 mutex_enter(&kmem_stkinfo_lock);
1956 if (kmem_stkinfo_log == NULL) {
1957 kmem_stkinfo_log = (kmem_stkinfo_t *)
1958 kmem_zalloc(KMEM_STKINFO_LOG_SIZE *
1959 (sizeof (kmem_stkinfo_t)), KM_NOSLEEP);
1960 if (kmem_stkinfo_log == NULL) {
1961 mutex_exit(&kmem_stkinfo_lock);
1962 return;
1963 }
1964 }
1965 mutex_exit(&kmem_stkinfo_lock);
1966
1967 /*
1968 * Stack grows up or down, see thread_create(),
1969 * compute stack memory area start and end (start < end).
1970 */
1971 if (t->t_stk > t->t_stkbase) {
1972 /* stack grows down */
1973 start = t->t_stkbase;
1974 end = t->t_stk;
1975 } else {
1976 /* stack grows up */
1977 start = t->t_stk;
1978 end = t->t_stkbase;
1979 }
1980
1981 /* stack size as found in kthread_t */
1982 stksz = end - start;
1983
1984 /*
1985 * Stackinfo pattern size is 8 bytes. Ensure proper 8-byte
1986 * alignment for start and end within the stack area boundaries
1987 * (protection against corrupt t_stkbase/t_stk data).
1988 */
1989 if ((((uintptr_t)start) & 0x7) != 0) {
1990 start = (caddr_t)((((uintptr_t)start) & (~0x7)) + 8);
1991 }
1992 end = (caddr_t)(((uintptr_t)end) & (~0x7));
1993
1994 if ((end <= start) || (end - start) > (1024 * 1024)) {
1995 /* negative or stack size > 1 meg, assume bogus */
1996 return;
1997 }
1998
1999 /* search until no pattern in the stack */
2000 if (t->t_stk > t->t_stkbase) {
2001 /* stack grows down */
2002 #if defined(__i386) || defined(__amd64)
2003 /*
2004 * 6 longs are pushed onto the stack, see thread_load(). Skip
2005 * them, so that if the kthread has never run, percent is zero.
2006 * 8-byte alignment is preserved for a 32-bit kernel:
2007 * 6 x 4 = 24, and 24 is a multiple of 8.
2008 *
2009 */
2010 end -= (6 * sizeof (long));
2011 #endif
2012 ptr = (uint64_t *)((void *)start);
2013 while (ptr < (uint64_t *)((void *)end)) {
2014 if (*ptr != KMEM_STKINFO_PATTERN) {
2015 percent = stkinfo_percent(end,
2016 start, (caddr_t)ptr);
2017 break;
2018 }
2019 ptr++;
2020 }
2021 } else {
2022 /* stack grows up */
2023 ptr = (uint64_t *)((void *)end);
2024 ptr--;
2025 while (ptr >= (uint64_t *)((void *)start)) {
2026 if (*ptr != KMEM_STKINFO_PATTERN) {
2027 percent = stkinfo_percent(start,
2028 end, (caddr_t)ptr);
2029 break;
2030 }
2031 ptr--;
2032 }
2033 }
2034
2035 DTRACE_PROBE3(stack__usage, kthread_t *, t,
2036 size_t, stksz, size_t, percent);
2037
2038 if (percent == 0) {
2039 return;
2040 }
2041
2042 mutex_enter(&kmem_stkinfo_lock);
2043 if (full == KMEM_STKINFO_LOG_SIZE && percent < smallest_percent) {
2044 /*
2045 * The log is full and already contains the highest values
2046 */
2047 mutex_exit(&kmem_stkinfo_lock);
2048 return;
2049 }
2050
2051 /* keep a log of the highest used stack */
2052 for (i = 0; i < KMEM_STKINFO_LOG_SIZE; i++) {
2053 if (kmem_stkinfo_log[i].percent == 0) {
2054 index = i;
2055 full++;
2056 break;
2057 }
2058 if (smallest == 0) {
2059 smallest = kmem_stkinfo_log[i].percent;
2060 index = i;
2061 continue;
2062 }
2063 if (kmem_stkinfo_log[i].percent < smallest) {
2064 smallest = kmem_stkinfo_log[i].percent;
2065 index = i;
2066 }
2067 }
2068
2069 if (percent >= kmem_stkinfo_log[index].percent) {
2070 kmem_stkinfo_log[index].kthread = (caddr_t)t;
2071 kmem_stkinfo_log[index].t_startpc = (caddr_t)t->t_startpc;
2072 kmem_stkinfo_log[index].start = start;
2073 kmem_stkinfo_log[index].stksz = stksz;
2074 kmem_stkinfo_log[index].percent = percent;
2075 kmem_stkinfo_log[index].t_tid = t->t_tid;
2076 kmem_stkinfo_log[index].cmd[0] = '\0';
2077 if (t->t_tid != 0) {
2078 stksz = strlen((t->t_procp)->p_user.u_comm);
2079 if (stksz >= KMEM_STKINFO_STR_SIZE) {
2080 stksz = KMEM_STKINFO_STR_SIZE - 1;
2081 kmem_stkinfo_log[index].cmd[stksz] = '\0';
2082 } else {
2083 stksz += 1;
2084 }
2085 (void) memcpy(kmem_stkinfo_log[index].cmd,
2086 (t->t_procp)->p_user.u_comm, stksz);
2087 }
2088 if (percent < smallest_percent) {
2089 smallest_percent = percent;
2090 }
2091 }
2092 mutex_exit(&kmem_stkinfo_lock);
2093 }
2094
2095 /*
2096 * If the tunable kmem_stackinfo is set, compute the stack utilization percentage.
2097 */
2098 static size_t
2099 stkinfo_percent(caddr_t t_stk, caddr_t t_stkbase, caddr_t sp)
2100 {
2101 size_t percent;
2102 size_t s;
2103
2104 if (t_stk > t_stkbase) {
2105 /* stack grows down */
2106 if (sp > t_stk) {
2107 return (0);
2108 }
2109 if (sp < t_stkbase) {
2110 return (100);
2111 }
2112 percent = t_stk - sp + 1;
2113 s = t_stk - t_stkbase + 1;
2114 } else {
2115 /* stack grows up */
2116 if (sp < t_stk) {
2117 return (0);
2118 }
2119 if (sp > t_stkbase) {
2120 return (100);
2121 }
2122 percent = sp - t_stk + 1;
2123 s = t_stkbase - t_stk + 1;
2124 }
2125 percent = ((100 * percent) / s) + 1;
2126 if (percent > 100) {
2127 percent = 100;
2128 }
2129 return (percent);
2130 }
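/*
 * Worked example (not part of the original code), for a downward-growing
 * stack: with t_stk - t_stkbase == 0x2000 (8 KB) and the deepest
 * non-pattern word found 0x800 bytes below t_stk,
 *
 *	percent = t_stk - sp + 1       = 0x801  (2049)
 *	s       = t_stk - t_stkbase + 1 = 0x2001 (8193)
 *	result  = (100 * 2049) / 8193 + 1 = 26
 *
 * i.e. roughly a quarter of the stack was ever dirtied.
 */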