1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
  25  * Copyright (c) 2013, Joyent, Inc.  All rights reserved.
  26  */
  27 
  28 #include <sys/types.h>
  29 #include <sys/param.h>
  30 #include <sys/sysmacros.h>
  31 #include <sys/signal.h>
  32 #include <sys/stack.h>
  33 #include <sys/pcb.h>
  34 #include <sys/user.h>
  35 #include <sys/systm.h>
  36 #include <sys/sysinfo.h>
  37 #include <sys/errno.h>
  38 #include <sys/cmn_err.h>
  39 #include <sys/cred.h>
  40 #include <sys/resource.h>
  41 #include <sys/task.h>
  42 #include <sys/project.h>
  43 #include <sys/proc.h>
  44 #include <sys/debug.h>
  45 #include <sys/disp.h>
  46 #include <sys/class.h>
  47 #include <vm/seg_kmem.h>
  48 #include <vm/seg_kp.h>
  49 #include <sys/machlock.h>
  50 #include <sys/kmem.h>
  51 #include <sys/varargs.h>
  52 #include <sys/turnstile.h>
  53 #include <sys/poll.h>
  54 #include <sys/vtrace.h>
  55 #include <sys/callb.h>
  56 #include <c2/audit.h>
  57 #include <sys/tnf.h>
  58 #include <sys/sobject.h>
  59 #include <sys/cpupart.h>
  60 #include <sys/pset.h>
  61 #include <sys/door.h>
  62 #include <sys/spl.h>
  63 #include <sys/copyops.h>
  64 #include <sys/rctl.h>
  65 #include <sys/brand.h>
  66 #include <sys/pool.h>
  67 #include <sys/zone.h>
  68 #include <sys/tsol/label.h>
  69 #include <sys/tsol/tndb.h>
  70 #include <sys/cpc_impl.h>
  71 #include <sys/sdt.h>
  72 #include <sys/reboot.h>
  73 #include <sys/kdi.h>
  74 #include <sys/schedctl.h>
  75 #include <sys/waitq.h>
  76 #include <sys/cpucaps.h>
  77 #include <sys/kiconv.h>
  78 
  79 struct kmem_cache *thread_cache;        /* cache of free threads */
  80 struct kmem_cache *lwp_cache;           /* cache of free lwps */
  81 struct kmem_cache *turnstile_cache;     /* cache of free turnstiles */
  82 
  83 /*
  84  * allthreads is only for use by kmem_readers.  All kernel loops can use
  85  * the current thread as a start/end point.
  86  */
  87 kthread_t *allthreads = &t0;        /* circular list of all threads */
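/*
 * Illustrative sketch only: a typical walk of the thread list uses curthread
 * as both the start and end point and holds pidlock so the list cannot
 * change underneath it; see did_to_thread() below for a real example.
 *
 *      kthread_t *t;
 *
 *      mutex_enter(&pidlock);
 *      for (t = curthread->t_next; t != curthread; t = t->t_next) {
 *              ... examine t ...
 *      }
 *      mutex_exit(&pidlock);
 */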
  88 
  89 static kcondvar_t reaper_cv;            /* synchronization var */
  90 kthread_t       *thread_deathrow;       /* circular list of reapable threads */
  91 kthread_t       *lwp_deathrow;          /* circular list of reapable threads */
  92 kmutex_t        reaplock;               /* protects lwp and thread deathrows */
  93 int     thread_reapcnt = 0;             /* number of threads on deathrow */
  94 int     lwp_reapcnt = 0;                /* number of lwps on deathrow */
  95 int     reaplimit = 16;                 /* delay reaping until reaplimit */
  96 
  97 thread_free_lock_t      *thread_free_lock;
  98                                         /* protects tick thread from reaper */
  99 
 100 extern int nthread;
 101 
 102 /* System Scheduling classes. */
 103 id_t    syscid;                         /* system scheduling class ID */
 104 id_t    sysdccid = CLASS_UNUSED;        /* reset when SDC loads */
 105 
 106 void    *segkp_thread;                  /* cookie for segkp pool */
 107 
 108 int lwp_cache_sz = 32;
 109 int t_cache_sz = 8;
 110 static kt_did_t next_t_id = 1;
 111 
 112 /* Default mode for thread binding to CPUs and processor sets */
 113 int default_binding_mode = TB_ALLHARD;
 114 
 115 /*
 116  * Min/Max stack sizes for stack size parameters
 117  */
 118 #define MAX_STKSIZE     (32 * DEFAULTSTKSZ)
 119 #define MIN_STKSIZE     DEFAULTSTKSZ
 120 
 121 /*
 122  * default_stksize overrides lwp_default_stksize if it is set.
 123  */
 124 volatile int    default_stksize;
 125 volatile int    lwp_default_stksize;
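/*
 * Both tunables default to zero, in which case thread_init() falls back to
 * DEFAULTSTKSZ, but they may be overridden, e.g. from /etc/system.
 * thread_init() accepts an override only if it is a multiple of PAGESIZE and
 * lies within [MIN_STKSIZE, MAX_STKSIZE]; anything else is rejected with a
 * warning and the default is used instead.  A hypothetical /etc/system entry
 * (the value shown is only an example and is platform-dependent):
 *
 *      set lwp_default_stksize = 0x6000
 */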
 126 
 127 static zone_key_t zone_thread_key;
 128 
 129 unsigned int kmem_stackinfo;            /* stackinfo feature on-off */
 130 kmem_stkinfo_t *kmem_stkinfo_log;       /* stackinfo circular log */
 131 static kmutex_t kmem_stkinfo_lock;      /* protects kmem_stkinfo_log */
 132 
 133 /*
 134  * forward declarations for internal thread specific data (tsd)
 135  */
 136 static void *tsd_realloc(void *, size_t, size_t);
 137 
 138 void thread_reaper(void);
 139 
 140 /* forward declarations for stackinfo feature */
 141 static void stkinfo_begin(kthread_t *);
 142 static void stkinfo_end(kthread_t *);
 143 static size_t stkinfo_percent(caddr_t, caddr_t, caddr_t);
 144 
 145 /*ARGSUSED*/
 146 static int
 147 turnstile_constructor(void *buf, void *cdrarg, int kmflags)
 148 {
 149         bzero(buf, sizeof (turnstile_t));
 150         return (0);
 151 }
 152 
 153 /*ARGSUSED*/
 154 static void
 155 turnstile_destructor(void *buf, void *cdrarg)
 156 {
 157         turnstile_t *ts = buf;
 158 
 159         ASSERT(ts->ts_free == NULL);
 160         ASSERT(ts->ts_waiters == 0);
 161         ASSERT(ts->ts_inheritor == NULL);
 162         ASSERT(ts->ts_sleepq[0].sq_first == NULL);
 163         ASSERT(ts->ts_sleepq[1].sq_first == NULL);
 164 }
 165 
 166 void
 167 thread_init(void)
 168 {
 169         kthread_t *tp;
 170         extern char sys_name[];
 171         extern void idle();
 172         struct cpu *cpu = CPU;
 173         int i;
 174         kmutex_t *lp;
 175 
 176         mutex_init(&reaplock, NULL, MUTEX_SPIN, (void *)ipltospl(DISP_LEVEL));
 177         thread_free_lock =
 178             kmem_alloc(sizeof (thread_free_lock_t) * THREAD_FREE_NUM, KM_SLEEP);
 179         for (i = 0; i < THREAD_FREE_NUM; i++) {
 180                 lp = &thread_free_lock[i].tf_lock;
 181                 mutex_init(lp, NULL, MUTEX_DEFAULT, NULL);
 182         }
 183 
 184 #if defined(__i386) || defined(__amd64)
 185         thread_cache = kmem_cache_create("thread_cache", sizeof (kthread_t),
 186             PTR24_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
 187 
 188         /*
 189          * "struct _klwp" includes a "struct pcb", which includes a
 190          * "struct fpu", which needs to be 64-byte aligned on amd64
 191          * (and even on i386) for xsave/xrstor.
 192          */
 193         lwp_cache = kmem_cache_create("lwp_cache", sizeof (klwp_t),
 194             64, NULL, NULL, NULL, NULL, NULL, 0);
 195 #else
 196         /*
 197          * Allocate thread structures from static_arena.  This prevents
 198          * issues where a thread tries to relocate its own thread
 199          * structure and touches it after the mapping has been suspended.
 200          */
 201         thread_cache = kmem_cache_create("thread_cache", sizeof (kthread_t),
 202             PTR24_ALIGN, NULL, NULL, NULL, NULL, static_arena, 0);
 203 
 204         lwp_stk_cache_init();
 205 
 206         lwp_cache = kmem_cache_create("lwp_cache", sizeof (klwp_t),
 207             0, NULL, NULL, NULL, NULL, NULL, 0);
 208 #endif
 209 
 210         turnstile_cache = kmem_cache_create("turnstile_cache",
 211             sizeof (turnstile_t), 0,
 212             turnstile_constructor, turnstile_destructor, NULL, NULL, NULL, 0);
 213 
 214         label_init();
 215         cred_init();
 216 
 217         /*
 218          * Initialize various resource management facilities.
 219          */
 220         rctl_init();
 221         cpucaps_init();
 222         /*
 223          * zone_init() should be called before project_init() so that the
 224          * project ID for the first project is initialized correctly.
 225          */
 226         zone_init();
 227         project_init();
 228         brand_init();
 229         kiconv_init();
 230         task_init();
 231         tcache_init();
 232         pool_init();
 233 
 234         curthread->t_ts = kmem_cache_alloc(turnstile_cache, KM_SLEEP);
 235 
 236         /*
 237          * Originally, we had two parameters to set default stack
 238          * size: one for lwp's (lwp_default_stksize), and one for
 239          * kernel-only threads (DEFAULTSTKSZ, a.k.a. _defaultstksz).
 240          * Now we have a third parameter that overrides both if it is
 241          * set to a legal stack size, called default_stksize.
 242          */
 243 
 244         if (default_stksize == 0) {
 245                 default_stksize = DEFAULTSTKSZ;
 246         } else if (default_stksize % PAGESIZE != 0 ||
 247             default_stksize > MAX_STKSIZE ||
 248             default_stksize < MIN_STKSIZE) {
 249                 cmn_err(CE_WARN, "Illegal stack size. Using %d",
 250                     (int)DEFAULTSTKSZ);
 251                 default_stksize = DEFAULTSTKSZ;
 252         } else {
 253                 lwp_default_stksize = default_stksize;
 254         }
 255 
 256         if (lwp_default_stksize == 0) {
 257                 lwp_default_stksize = default_stksize;
 258         } else if (lwp_default_stksize % PAGESIZE != 0 ||
 259             lwp_default_stksize > MAX_STKSIZE ||
 260             lwp_default_stksize < MIN_STKSIZE) {
 261                 cmn_err(CE_WARN, "Illegal stack size. Using %d",
 262                     default_stksize);
 263                 lwp_default_stksize = default_stksize;
 264         }
 265 
 266         segkp_lwp = segkp_cache_init(segkp, lwp_cache_sz,
 267             lwp_default_stksize,
 268             (KPD_NOWAIT | KPD_HASREDZONE | KPD_LOCKED));
 269 
 270         segkp_thread = segkp_cache_init(segkp, t_cache_sz,
 271             default_stksize, KPD_HASREDZONE | KPD_LOCKED | KPD_NO_ANON);
 272 
 273         (void) getcid(sys_name, &syscid);
 274         curthread->t_cid = syscid;   /* current thread is t0 */
 275 
 276         /*
 277          * Set up the first CPU's idle thread.
 278          * It runs whenever the CPU has nothing worthwhile to do.
 279          */
 280         tp = thread_create(NULL, 0, idle, NULL, 0, &p0, TS_STOPPED, -1);
 281         cpu->cpu_idle_thread = tp;
 282         tp->t_preempt = 1;
 283         tp->t_disp_queue = cpu->cpu_disp;
 284         ASSERT(tp->t_disp_queue != NULL);
 285         tp->t_bound_cpu = cpu;
 286         tp->t_affinitycnt = 1;
 287 
 288         /*
 289          * Registering a thread in the callback table is usually
 290          * done in the initialization code of the thread. In this
 291          * case, we do it right after thread creation to avoid
 292          * blocking the idle thread while it registers itself. It also
 293          * avoids the possibility of reregistration in case a CPU
 294          * restarts its idle thread.
 295          */
 296         CALLB_CPR_INIT_SAFE(tp, "idle");
 297 
 298         /*
 299          * Create the thread_reaper daemon. From this point on, exited
 300          * threads will get reaped.
 301          */
 302         (void) thread_create(NULL, 0, (void (*)())thread_reaper,
 303             NULL, 0, &p0, TS_RUN, minclsyspri);
 304 
 305         /*
 306          * Finish initializing the kernel memory allocator now that
 307          * thread_create() is available.
 308          */
 309         kmem_thread_init();
 310 
 311         if (boothowto & RB_DEBUG)
 312                 kdi_dvec_thravail();
 313 }
 314 
 315 /*
 316  * Create a thread.
 317  *
 318  * thread_create() blocks for memory if necessary.  It never fails.
 319  *
 320  * If stk is NULL, the thread is created at the base of the stack
 321  * and cannot be swapped.
 322  */
 323 kthread_t *
 324 thread_create(
 325         caddr_t stk,
 326         size_t  stksize,
 327         void    (*proc)(),
 328         void    *arg,
 329         size_t  len,
 330         proc_t   *pp,
 331         int     state,
 332         pri_t   pri)
 333 {
 334         kthread_t *t;
 335         extern struct classfuncs sys_classfuncs;
 336         turnstile_t *ts;
 337 
 338         /*
 339          * Every thread keeps a turnstile around in case it needs to block.
 340          * The only reason the turnstile is not simply part of the thread
 341          * structure is that we may have to break the association whenever
 342          * more than one thread blocks on a given synchronization object.
 343          * From a memory-management standpoint, turnstiles are like the
 344          * "attached mblks" that hang off dblks in the streams allocator.
 345          */
 346         ts = kmem_cache_alloc(turnstile_cache, KM_SLEEP);
 347 
 348         if (stk == NULL) {
 349                 /*
 350                  * alloc both thread and stack in segkp chunk
 351                  */
 352 
 353                 if (stksize < default_stksize)
 354                         stksize = default_stksize;
 355 
 356                 if (stksize == default_stksize) {
 357                         stk = (caddr_t)segkp_cache_get(segkp_thread);
 358                 } else {
 359                         stksize = roundup(stksize, PAGESIZE);
 360                         stk = (caddr_t)segkp_get(segkp, stksize,
 361                             (KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED));
 362                 }
 363 
 364                 ASSERT(stk != NULL);
 365 
 366                 /*
 367                  * The machine-dependent mutex code may require that
 368                  * thread pointers (since they may be used for mutex owner
 369                  * fields) have certain alignment requirements.
 370                  * PTR24_ALIGN is the size of the alignment quanta.
 371                  * XXX - assumes stack grows toward low addresses.
 372                  */
 373                 if (stksize <= sizeof (kthread_t) + PTR24_ALIGN)
 374                         cmn_err(CE_PANIC, "thread_create: proposed stack size"
 375                             " too small to hold thread.");
 376 #ifdef STACK_GROWTH_DOWN
 377                 stksize -= SA(sizeof (kthread_t) + PTR24_ALIGN - 1);
 378                 stksize &= -PTR24_ALIGN;    /* make thread aligned */
 379                 t = (kthread_t *)(stk + stksize);
 380                 bzero(t, sizeof (kthread_t));
 381                 if (audit_active)
 382                         audit_thread_create(t);
 383                 t->t_stk = stk + stksize;
 384                 t->t_stkbase = stk;
 385 #else   /* stack grows to larger addresses */
 386                 stksize -= SA(sizeof (kthread_t));
 387                 t = (kthread_t *)(stk);
 388                 bzero(t, sizeof (kthread_t));
 389                 t->t_stk = stk + sizeof (kthread_t);
 390                 t->t_stkbase = stk + stksize + sizeof (kthread_t);
 391 #endif  /* STACK_GROWTH_DOWN */
 392                 t->t_flag |= T_TALLOCSTK;
 393                 t->t_swap = stk;
 394         } else {
 395                 t = kmem_cache_alloc(thread_cache, KM_SLEEP);
 396                 bzero(t, sizeof (kthread_t));
 397                 ASSERT(((uintptr_t)t & (PTR24_ALIGN - 1)) == 0);
 398                 if (audit_active)
 399                         audit_thread_create(t);
 400                 /*
 401                  * Initialize t_stk to the kernel stack pointer to use
 402                  * upon entry to the kernel
 403                  */
 404 #ifdef STACK_GROWTH_DOWN
 405                 t->t_stk = stk + stksize;
 406                 t->t_stkbase = stk;
 407 #else
 408                 t->t_stk = stk;                      /* 3b2-like */
 409                 t->t_stkbase = stk + stksize;
 410 #endif /* STACK_GROWTH_DOWN */
 411         }
 412 
 413         if (kmem_stackinfo != 0) {
 414                 stkinfo_begin(t);
 415         }
 416 
 417         t->t_ts = ts;
 418 
 419         /*
 420          * p_cred could be NULL if thread_create() is called before
 421          * cred_init() is called in main().
 422          */
 423         mutex_enter(&pp->p_crlock);
 424         if (pp->p_cred)
 425                 crhold(t->t_cred = pp->p_cred);
 426         mutex_exit(&pp->p_crlock);
 427         t->t_start = gethrestime_sec();
 428         t->t_startpc = proc;
 429         t->t_procp = pp;
 430         t->t_clfuncs = &sys_classfuncs.thread;
 431         t->t_cid = syscid;
 432         t->t_pri = pri;
 433         t->t_stime = ddi_get_lbolt();
 434         t->t_schedflag = TS_LOAD | TS_DONT_SWAP;
 435         t->t_bind_cpu = PBIND_NONE;
 436         t->t_bindflag = (uchar_t)default_binding_mode;
 437         t->t_bind_pset = PS_NONE;
 438         t->t_plockp = &pp->p_lock;
 439         t->t_copyops = NULL;
 440         t->t_taskq = NULL;
 441         t->t_anttime = 0;
 442         t->t_hatdepth = 0;
 443 
 444         t->t_dtrace_vtime = 1;       /* assure vtimestamp is always non-zero */
 445 
 446         CPU_STATS_ADDQ(CPU, sys, nthreads, 1);
 447 #ifndef NPROBE
 448         /* Kernel probe */
 449         tnf_thread_create(t);
 450 #endif /* NPROBE */
 451         LOCK_INIT_CLEAR(&t->t_lock);
 452 
 453         /*
 454          * Callers who give us a NULL proc must do their own
 455          * stack initialization.  e.g. lwp_create()
 456          */
 457         if (proc != NULL) {
 458                 t->t_stk = thread_stk_init(t->t_stk);
 459                 thread_load(t, proc, arg, len);
 460         }
 461 
 462         /*
 463          * Put a hold on project0. If this thread is actually in a
 464          * different project, then t_proj will be changed later in
 465          * lwp_create().  All kernel-only threads must be in project 0.
 466          */
 467         t->t_proj = project_hold(proj0p);
 468 
 469         lgrp_affinity_init(&t->t_lgrp_affinity);
 470 
 471         mutex_enter(&pidlock);
 472         nthread++;
 473         t->t_did = next_t_id++;
 474         t->t_prev = curthread->t_prev;
 475         t->t_next = curthread;
 476 
 477         /*
 478          * Add the thread to the list of all threads, and initialize
 479          * its t_cpu pointer.  We need to block preemption since
 480          * cpu_offline walks the thread list looking for threads
 481          * with t_cpu pointing to the CPU being offlined.  We want
 482          * to make sure that the list is consistent and that if t_cpu
 483          * is set, the thread is on the list.
 484          */
 485         kpreempt_disable();
 486         curthread->t_prev->t_next = t;
 487         curthread->t_prev = t;
 488 
 489         /*
 490          * Threads should never have a NULL t_cpu pointer so assign it
 491          * here.  If the thread is being created with state TS_RUN a
 492          * better CPU may be chosen when it is placed on the run queue.
 493          *
 494          * We need to keep kernel preemption disabled when setting all
 495          * three fields to keep them in sync.  Also, always create in
 496          * the default partition since that's where kernel threads go
 497          * (if this isn't a kernel thread, t_cpupart will be changed
 498          * in lwp_create before setting the thread runnable).
 499          */
 500         t->t_cpupart = &cp_default;
 501 
 502         /*
 503          * For now, affiliate this thread with the root lgroup.
 504          * Since the kernel does not (presently) allocate its memory
 505          * in a locality aware fashion, the root is an appropriate home.
 506          * If this thread is later associated with an lwp, it will have
 507          * its lgroup re-assigned at that time.
 508          */
 509         lgrp_move_thread(t, &cp_default.cp_lgrploads[LGRP_ROOTID], 1);
 510 
 511         /*
 512          * Inherit the current cpu.  If this cpu isn't part of the chosen
 513          * lgroup, a new cpu will be chosen by cpu_choose when the thread
 514          * is ready to run.
 515          */
 516         if (CPU->cpu_part == &cp_default)
 517                 t->t_cpu = CPU;
 518         else
 519                 t->t_cpu = disp_lowpri_cpu(cp_default.cp_cpulist, t->t_lpl,
 520                     t->t_pri, NULL);
 521 
 522         t->t_disp_queue = t->t_cpu->cpu_disp;
 523         kpreempt_enable();
 524 
 525         /*
 526          * Initialize thread state and the dispatcher lock pointer.
 527          * Need to hold onto pidlock to block allthreads walkers until
 528          * the state is set.
 529          */
 530         switch (state) {
 531         case TS_RUN:
 532                 curthread->t_oldspl = splhigh();     /* get dispatcher spl */
 533                 THREAD_SET_STATE(t, TS_STOPPED, &transition_lock);
 534                 CL_SETRUN(t);
 535                 thread_unlock(t);
 536                 break;
 537 
 538         case TS_ONPROC:
 539                 THREAD_ONPROC(t, t->t_cpu);
 540                 break;
 541 
 542         case TS_FREE:
 543                 /*
 544                  * Free state will be used for intr threads.
 545                  * The interrupt routine must set the thread dispatcher
 546                  * lock pointer (t_lockp) if starting on a CPU
 547                  * other than the current one.
 548                  */
 549                 THREAD_FREEINTR(t, CPU);
 550                 break;
 551 
 552         case TS_STOPPED:
 553                 THREAD_SET_STATE(t, TS_STOPPED, &stop_lock);
 554                 break;
 555 
 556         default:                        /* TS_SLEEP, TS_ZOMB or TS_TRANS */
 557                 cmn_err(CE_PANIC, "thread_create: invalid state %d", state);
 558         }
 559         mutex_exit(&pidlock);
 560         return (t);
 561 }
 562 
 563 /*
 564  * Move thread to project0 and take care of project reference counters.
 565  */
 566 void
 567 thread_rele(kthread_t *t)
 568 {
 569         kproject_t *kpj;
 570 
 571         thread_lock(t);
 572 
 573         ASSERT(t == curthread || t->t_state == TS_FREE || t->t_procp == &p0);
 574         kpj = ttoproj(t);
 575         t->t_proj = proj0p;
 576 
 577         thread_unlock(t);
 578 
 579         if (kpj != proj0p) {
 580                 project_rele(kpj);
 581                 (void) project_hold(proj0p);
 582         }
 583 }
 584 
 585 void
 586 thread_exit(void)
 587 {
 588         kthread_t *t = curthread;
 589 
 590         if ((t->t_proc_flag & TP_ZTHREAD) != 0)
 591                 cmn_err(CE_PANIC, "thread_exit: zthread_exit() not called");
 592 
 593         tsd_exit();             /* Clean up this thread's TSD */
 594 
 595         kcpc_passivate();       /* clean up performance counter state */
 596 
 597         /*
 598          * No kernel thread should have called poll() without arranging
 599          * to call pollcleanup() here.
 600          */
 601         ASSERT(t->t_pollstate == NULL);
 602         ASSERT(t->t_schedctl == NULL);
 603         if (t->t_door)
 604                 door_slam();    /* in case thread did an upcall */
 605 
 606 #ifndef NPROBE
 607         /* Kernel probe */
 608         if (t->t_tnf_tpdp)
 609                 tnf_thread_exit();
 610 #endif /* NPROBE */
 611 
 612         thread_rele(t);
 613         t->t_preempt++;
 614 
 615         /*
 616          * remove thread from the all threads list so that
 617          * death-row can use the same pointers.
 618          */
 619         mutex_enter(&pidlock);
 620         t->t_next->t_prev = t->t_prev;
 621         t->t_prev->t_next = t->t_next;
 622         ASSERT(allthreads != t);        /* t0 never exits */
 623         cv_broadcast(&t->t_joincv);      /* wake up anyone in thread_join */
 624         mutex_exit(&pidlock);
 625 
 626         if (t->t_ctx != NULL)
 627                 exitctx(t);
 628         if (t->t_procp->p_pctx != NULL)
 629                 exitpctx(t->t_procp);
 630 
 631         if (kmem_stackinfo != 0) {
 632                 stkinfo_end(t);
 633         }
 634 
 635         t->t_state = TS_ZOMB;        /* set zombie thread */
 636 
 637         swtch_from_zombie();    /* give up the CPU */
 638         /* NOTREACHED */
 639 }
 640 
 641 /*
 642  * Check to see if the specified thread is active (defined as being on
 643  * the thread list).  This is certainly a slow way to do this; if there's
 644  * ever a reason to speed it up, we could maintain a hash table of active
 645  * threads indexed by their t_did.
 646  */
 647 static kthread_t *
 648 did_to_thread(kt_did_t tid)
 649 {
 650         kthread_t *t;
 651 
 652         ASSERT(MUTEX_HELD(&pidlock));
 653         for (t = curthread->t_next; t != curthread; t = t->t_next) {
 654                 if (t->t_did == tid)
 655                         break;
 656         }
 657         if (t->t_did == tid)
 658                 return (t);
 659         else
 660                 return (NULL);
 661 }
 662 
 663 /*
 664  * Wait for specified thread to exit.  Returns immediately if the thread
 665  * could not be found, meaning that it has either already exited or never
 666  * existed.
 667  */
 668 void
 669 thread_join(kt_did_t tid)
 670 {
 671         kthread_t *t;
 672 
 673         ASSERT(tid != curthread->t_did);
 674         ASSERT(tid != t0.t_did);
 675 
 676         mutex_enter(&pidlock);
 677         /*
 678          * Make sure we check that the thread is on the thread list
 679          * before blocking on it; otherwise we could end up blocking on
 680          * a cv that's already been freed.  In other words, don't cache
 681          * the thread pointer across calls to cv_wait.
 682          *
 683          * The choice of loop invariant means that whenever a thread
 684          * is taken off the allthreads list, a cv_broadcast must be
 685          * performed on that thread's t_joincv to wake up any waiters.
 686          * The broadcast doesn't have to happen right away, but it
 687          * shouldn't be postponed indefinitely (e.g., by doing it in
 688          * thread_free(), which may only be executed when the deathrow
 689          * queue is processed).
 690          */
 691         while (t = did_to_thread(tid))
 692                 cv_wait(&t->t_joincv, &pidlock);
 693         mutex_exit(&pidlock);
 694 }
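
/*
 * Illustrative sketch (example_worker and example_arg are hypothetical): a
 * caller that must later wait for a kernel thread records the thread's
 * t_did at creation time and passes that id to thread_join(), since the
 * kthread_t pointer itself must not be cached across the thread's exit.
 *
 *      kthread_t *t;
 *      kt_did_t did;
 *
 *      t = thread_create(NULL, 0, example_worker, example_arg, 0, &p0,
 *          TS_RUN, minclsyspri);
 *      did = t->t_did;
 *      ...
 *      thread_join(did);
 */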
 695 
 696 void
 697 thread_free_prevent(kthread_t *t)
 698 {
 699         kmutex_t *lp;
 700 
 701         lp = &thread_free_lock[THREAD_FREE_HASH(t)].tf_lock;
 702         mutex_enter(lp);
 703 }
 704 
 705 void
 706 thread_free_allow(kthread_t *t)
 707 {
 708         kmutex_t *lp;
 709 
 710         lp = &thread_free_lock[THREAD_FREE_HASH(t)].tf_lock;
 711         mutex_exit(lp);
 712 }
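
/*
 * Illustrative sketch only: a consumer such as the tick accounting code
 * that must look at a thread which may be exiting brackets the access with
 * the pair above; thread_free() then stalls in thread_free_barrier() below
 * until the bracket is released.
 *
 *      thread_free_prevent(t);
 *      ... examine fields of t ...
 *      thread_free_allow(t);
 */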
 713 
 714 static void
 715 thread_free_barrier(kthread_t *t)
 716 {
 717         kmutex_t *lp;
 718 
 719         lp = &thread_free_lock[THREAD_FREE_HASH(t)].tf_lock;
 720         mutex_enter(lp);
 721         mutex_exit(lp);
 722 }
 723 
 724 void
 725 thread_free(kthread_t *t)
 726 {
 727         boolean_t allocstk = (t->t_flag & T_TALLOCSTK);
 728         klwp_t *lwp = t->t_lwp;
 729         caddr_t swap = t->t_swap;
 730 
 731         ASSERT(t != &t0 && t->t_state == TS_FREE);
 732         ASSERT(t->t_door == NULL);
 733         ASSERT(t->t_schedctl == NULL);
 734         ASSERT(t->t_pollstate == NULL);
 735 
 736         t->t_pri = 0;
 737         t->t_pc = 0;
 738         t->t_sp = 0;
 739         t->t_wchan0 = NULL;
 740         t->t_wchan = NULL;
 741         if (t->t_cred != NULL) {
 742                 crfree(t->t_cred);
 743                 t->t_cred = 0;
 744         }
 745         if (t->t_pdmsg) {
 746                 kmem_free(t->t_pdmsg, strlen(t->t_pdmsg) + 1);
 747                 t->t_pdmsg = NULL;
 748         }
 749         if (audit_active)
 750                 audit_thread_free(t);
 751 #ifndef NPROBE
 752         if (t->t_tnf_tpdp)
 753                 tnf_thread_free(t);
 754 #endif /* NPROBE */
 755         if (t->t_cldata) {
 756                 CL_EXITCLASS(t->t_cid, (caddr_t *)t->t_cldata);
 757         }
 758         if (t->t_rprof != NULL) {
 759                 kmem_free(t->t_rprof, sizeof (*t->t_rprof));
 760                 t->t_rprof = NULL;
 761         }
 762         t->t_lockp = NULL;   /* nothing should try to lock this thread now */
 763         if (lwp)
 764                 lwp_freeregs(lwp, 0);
 765         if (t->t_ctx)
 766                 freectx(t, 0);
 767         t->t_stk = NULL;
 768         if (lwp)
 769                 lwp_stk_fini(lwp);
 770         lock_clear(&t->t_lock);
 771 
 772         if (t->t_ts->ts_waiters > 0)
 773                 panic("thread_free: turnstile still active");
 774 
 775         kmem_cache_free(turnstile_cache, t->t_ts);
 776 
 777         free_afd(&t->t_activefd);
 778 
 779         /*
 780          * Barrier for the tick accounting code.  The tick accounting code
 781          * holds this lock to keep the thread from going away while it's
 782          * looking at it.
 783          */
 784         thread_free_barrier(t);
 785 
 786         ASSERT(ttoproj(t) == proj0p);
 787         project_rele(ttoproj(t));
 788 
 789         lgrp_affinity_free(&t->t_lgrp_affinity);
 790 
 791         mutex_enter(&pidlock);
 792         nthread--;
 793         mutex_exit(&pidlock);
 794 
 795         /*
 796          * Free thread, lwp and stack.  This needs to be done carefully, since
 797          * if T_TALLOCSTK is set, the thread is part of the stack.
 798          */
 799         t->t_lwp = NULL;
 800         t->t_swap = NULL;
 801 
 802         if (swap) {
 803                 segkp_release(segkp, swap);
 804         }
 805         if (lwp) {
 806                 kmem_cache_free(lwp_cache, lwp);
 807         }
 808         if (!allocstk) {
 809                 kmem_cache_free(thread_cache, t);
 810         }
 811 }
 812 
 813 /*
 814  * Removes threads associated with the given zone from a deathrow queue.
 815  * tp is a pointer to the head of the deathrow queue, and countp is a
 816  * pointer to the current deathrow count.  Returns a linked list of
 817  * threads removed from the list.
 818  */
 819 static kthread_t *
 820 thread_zone_cleanup(kthread_t **tp, int *countp, zoneid_t zoneid)
 821 {
 822         kthread_t *tmp, *list = NULL;
 823         cred_t *cr;
 824 
 825         ASSERT(MUTEX_HELD(&reaplock));
 826         while (*tp != NULL) {
 827                 if ((cr = (*tp)->t_cred) != NULL && crgetzoneid(cr) == zoneid) {
 828                         tmp = *tp;
 829                         *tp = tmp->t_forw;
 830                         tmp->t_forw = list;
 831                         list = tmp;
 832                         (*countp)--;
 833                 } else {
 834                         tp = &(*tp)->t_forw;
 835                 }
 836         }
 837         return (list);
 838 }
 839 
 840 static void
 841 thread_reap_list(kthread_t *t)
 842 {
 843         kthread_t *next;
 844 
 845         while (t != NULL) {
 846                 next = t->t_forw;
 847                 thread_free(t);
 848                 t = next;
 849         }
 850 }
 851 
 852 /* ARGSUSED */
 853 static void
 854 thread_zone_destroy(zoneid_t zoneid, void *unused)
 855 {
 856         kthread_t *t, *l;
 857 
 858         mutex_enter(&reaplock);
 859         /*
 860          * Pull threads and lwps associated with zone off deathrow lists.
 861          */
 862         t = thread_zone_cleanup(&thread_deathrow, &thread_reapcnt, zoneid);
 863         l = thread_zone_cleanup(&lwp_deathrow, &lwp_reapcnt, zoneid);
 864         mutex_exit(&reaplock);
 865 
 866         /*
 867          * Guard against race condition in mutex_owner_running:
 868          *      thread=owner(mutex)
 869          *      <interrupt>
 870          *                              thread exits mutex
 871          *                              thread exits
 872          *                              thread reaped
 873          *                              thread struct freed
 874          * cpu = thread->t_cpu <- BAD POINTER DEREFERENCE.
 875          * A cross call to all cpus will cause the interrupt handler
 876          * to reset the PC if it is in mutex_owner_running, refreshing
 877          * stale thread pointers.
 878          */
 879         mutex_sync();   /* sync with mutex code */
 880 
 881         /*
 882          * Reap threads
 883          */
 884         thread_reap_list(t);
 885 
 886         /*
 887          * Reap lwps
 888          */
 889         thread_reap_list(l);
 890 }
 891 
 892 /*
 893  * Clean up zombie threads that are on deathrow.
 894  */
 895 void
 896 thread_reaper()
 897 {
 898         kthread_t *t, *l;
 899         callb_cpr_t cprinfo;
 900 
 901         /*
 902          * Register callback to clean up threads when zone is destroyed.
 903          */
 904         zone_key_create(&zone_thread_key, NULL, NULL, thread_zone_destroy);
 905 
 906         CALLB_CPR_INIT(&cprinfo, &reaplock, callb_generic_cpr, "t_reaper");
 907         for (;;) {
 908                 mutex_enter(&reaplock);
 909                 while (thread_deathrow == NULL && lwp_deathrow == NULL) {
 910                         CALLB_CPR_SAFE_BEGIN(&cprinfo);
 911                         cv_wait(&reaper_cv, &reaplock);
 912                         CALLB_CPR_SAFE_END(&cprinfo, &reaplock);
 913                 }
 914                 /*
 915                  * mutex_sync() needs to be called when reaping, but
 916                  * not too often, so the reaping rate is limited to once per
 917                  * second.  reaplimit is the reapable-thread backlog that wakes
 918                  * the reaper; it does not impact thread destruction/creation.
 919                  */
 920                 t = thread_deathrow;
 921                 l = lwp_deathrow;
 922                 thread_deathrow = NULL;
 923                 lwp_deathrow = NULL;
 924                 thread_reapcnt = 0;
 925                 lwp_reapcnt = 0;
 926                 mutex_exit(&reaplock);
 927 
 928                 /*
 929                  * Guard against race condition in mutex_owner_running:
 930                  *      thread=owner(mutex)
 931                  *      <interrupt>
 932                  *                              thread exits mutex
 933                  *                              thread exits
 934                  *                              thread reaped
 935                  *                              thread struct freed
 936                  * cpu = thread->t_cpu <- BAD POINTER DEREFERENCE.
 937                  * A cross call to all cpus will cause the interrupt handler
 938                  * to reset the PC if it is in mutex_owner_running, refreshing
 939                  * stale thread pointers.
 940                  */
 941                 mutex_sync();   /* sync with mutex code */
 942                 /*
 943                  * Reap threads
 944                  */
 945                 thread_reap_list(t);
 946 
 947                 /*
 948                  * Reap lwps
 949                  */
 950                 thread_reap_list(l);
 951                 delay(hz);
 952         }
 953 }
 954 
 955 /*
 956  * This is called by lwp_create(), etc. to move a thread from lwp_deathrow
 957  * onto thread_deathrow.  The thread's state has already been changed to
 958  * TS_FREE to indicate that it is reapable.  The caller already holds the
 959  * reaplock, and the thread was already freed.
 960  */
 961 void
 962 reapq_move_lq_to_tq(kthread_t *t)
 963 {
 964         ASSERT(t->t_state == TS_FREE);
 965         ASSERT(MUTEX_HELD(&reaplock));
 966         t->t_forw = thread_deathrow;
 967         thread_deathrow = t;
 968         thread_reapcnt++;
 969         if (lwp_reapcnt + thread_reapcnt > reaplimit)
 970                 cv_signal(&reaper_cv);  /* wake the reaper */
 971 }
 972 
 973 /*
 974  * This is called by resume() to put a zombie thread onto deathrow.
 975  * The thread's state is changed to TS_FREE to indicate that it is reapable.
 976  * This is called from the idle thread, so it must not block; just spin.
 977  */
 978 void
 979 reapq_add(kthread_t *t)
 980 {
 981         mutex_enter(&reaplock);
 982 
 983         /*
 984          * lwp_deathrow contains threads with lwp linkage and
 985          * swappable thread stacks which have the default stacksize.
 986          * These threads' lwps and stacks may be reused by lwp_create().
 987          *
 988          * Anything else goes on thread_deathrow, where it will eventually
 989          * be thread_free()d.
 990          */
 991         if (t->t_flag & T_LWPREUSE) {
 992                 ASSERT(ttolwp(t) != NULL);
 993                 t->t_forw = lwp_deathrow;
 994                 lwp_deathrow = t;
 995                 lwp_reapcnt++;
 996         } else {
 997                 t->t_forw = thread_deathrow;
 998                 thread_deathrow = t;
 999                 thread_reapcnt++;
1000         }
1001         if (lwp_reapcnt + thread_reapcnt > reaplimit)
1002                 cv_signal(&reaper_cv);      /* wake the reaper */
1003         t->t_state = TS_FREE;
1004         lock_clear(&t->t_lock);
1005 
1006         /*
1007          * Before we return, we need to grab and drop the thread lock for
1008          * the dead thread.  At this point, the current thread is the idle
1009          * thread, and the dead thread's CPU lock points to the current
1010          * CPU -- and we must grab and drop the lock to synchronize with
1011          * a racing thread walking a blocking chain that the zombie thread
1012          * was recently in.  By this point, that blocking chain is (by
1013          * definition) stale:  the dead thread is not holding any locks, and
1014          * is therefore not in any blocking chains -- but if we do not regrab
1015          * our lock before freeing the dead thread's data structures, the
1016          * thread walking the (stale) blocking chain will die on memory
1017          * corruption when it attempts to drop the dead thread's lock.  We
1018          * only need do this once because there is no way for the dead thread
1019          * to ever again be on a blocking chain:  once we have grabbed and
1020          * dropped the thread lock, we are guaranteed that anyone that could
1021          * have seen this thread in a blocking chain can no longer see it.
1022          */
1023         thread_lock(t);
1024         thread_unlock(t);
1025 
1026         mutex_exit(&reaplock);
1027 }
1028 
1029 /*
1030  * Install thread context ops for the given thread.
1031  */
1032 void
1033 installctx(
1034         kthread_t *t,
1035         void    *arg,
1036         void    (*save)(void *),
1037         void    (*restore)(void *),
1038         void    (*fork)(void *, void *),
1039         void    (*lwp_create)(void *, void *),
1040         void    (*exit)(void *),
1041         void    (*free)(void *, int))
1042 {
1043         struct ctxop *ctx;
1044 
1045         ctx = kmem_alloc(sizeof (struct ctxop), KM_SLEEP);
1046         ctx->save_op = save;
1047         ctx->restore_op = restore;
1048         ctx->fork_op = fork;
1049         ctx->lwp_create_op = lwp_create;
1050         ctx->exit_op = exit;
1051         ctx->free_op = free;
1052         ctx->arg = arg;
1053         ctx->next = t->t_ctx;
1054         t->t_ctx = ctx;
1055 }
1056 
1057 /*
1058  * Remove the thread context ops from a thread.
1059  */
1060 int
1061 removectx(
1062         kthread_t *t,
1063         void    *arg,
1064         void    (*save)(void *),
1065         void    (*restore)(void *),
1066         void    (*fork)(void *, void *),
1067         void    (*lwp_create)(void *, void *),
1068         void    (*exit)(void *),
1069         void    (*free)(void *, int))
1070 {
1071         struct ctxop *ctx, *prev_ctx;
1072 
1073         /*
1074          * The incoming kthread_t (which is the thread for which the
1075          * context ops will be removed) should be one of the following:
1076          *
1077          * a) the current thread,
1078          *
1079          * b) a thread of a process that's being forked (SIDL),
1080          *
1081          * c) a thread that belongs to the same process as the current
1082          *    thread and for which the current thread is the agent thread,
1083          *
1084          * d) a thread that is TS_STOPPED which is indicative of it
1085          *    being (if curthread is not an agent) a thread being created
1086          *    as part of an lwp creation.
1087          */
1088         ASSERT(t == curthread || ttoproc(t)->p_stat == SIDL ||
1089             ttoproc(t)->p_agenttp == curthread || t->t_state == TS_STOPPED);
1090 
1091         /*
1092          * Serialize modifications to t->t_ctx to prevent the agent thread
1093          * and the target thread from racing with each other during lwp exit.
1094          */
1095         mutex_enter(&t->t_ctx_lock);
1096         prev_ctx = NULL;
1097         kpreempt_disable();
1098         for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next) {
1099                 if (ctx->save_op == save && ctx->restore_op == restore &&
1100                     ctx->fork_op == fork && ctx->lwp_create_op == lwp_create &&
1101                     ctx->exit_op == exit && ctx->free_op == free &&
1102                     ctx->arg == arg) {
1103                         if (prev_ctx)
1104                                 prev_ctx->next = ctx->next;
1105                         else
1106                                 t->t_ctx = ctx->next;
1107                         mutex_exit(&t->t_ctx_lock);
1108                         if (ctx->free_op != NULL)
1109                                 (ctx->free_op)(ctx->arg, 0);
1110                         kmem_free(ctx, sizeof (struct ctxop));
1111                         kpreempt_enable();
1112                         return (1);
1113                 }
1114                 prev_ctx = ctx;
1115         }
1116         mutex_exit(&t->t_ctx_lock);
1117         kpreempt_enable();
1118 
1119         return (0);
1120 }
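
/*
 * Illustrative sketch (the my_* callbacks and my_state argument are
 * hypothetical): a subsystem that maintains per-thread state installs an
 * operations vector and later removes it with exactly the same argument
 * and function pointers, since removectx() matches on all of them.
 *
 *      installctx(curthread, my_state, my_save, my_restore,
 *          NULL, NULL, NULL, my_free);
 *      ...
 *      (void) removectx(curthread, my_state, my_save, my_restore,
 *          NULL, NULL, NULL, my_free);
 */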
1121 
1122 void
1123 savectx(kthread_t *t)
1124 {
1125         struct ctxop *ctx;
1126 
1127         ASSERT(t == curthread);
1128         for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next)
1129                 if (ctx->save_op != NULL)
1130                         (ctx->save_op)(ctx->arg);
1131 }
1132 
1133 void
1134 restorectx(kthread_t *t)
1135 {
1136         struct ctxop *ctx;
1137 
1138         ASSERT(t == curthread);
1139         for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next)
1140                 if (ctx->restore_op != NULL)
1141                         (ctx->restore_op)(ctx->arg);
1142 }
1143 
1144 void
1145 forkctx(kthread_t *t, kthread_t *ct)
1146 {
1147         struct ctxop *ctx;
1148 
1149         for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
1150                 if (ctx->fork_op != NULL)
1151                         (ctx->fork_op)(t, ct);
1152 }
1153 
1154 /*
1155  * Note that this operator is only invoked via the _lwp_create
1156  * system call.  The system may have other reasons to create lwps,
1157  * e.g. the agent lwp or the doors unreferenced lwp.
1158  */
1159 void
1160 lwp_createctx(kthread_t *t, kthread_t *ct)
1161 {
1162         struct ctxop *ctx;
1163 
1164         for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
1165                 if (ctx->lwp_create_op != NULL)
1166                         (ctx->lwp_create_op)(t, ct);
1167 }
1168 
1169 /*
1170  * exitctx is called from thread_exit() and lwp_exit() to perform any actions
1171  * needed when the thread/LWP leaves the processor for the last time. This
1172  * routine is not intended to deal with freeing memory; freectx() is used for
1173  * that purpose during thread_free(). This routine is provided to allow for
1174  * clean-up that can't wait until thread_free().
1175  */
1176 void
1177 exitctx(kthread_t *t)
1178 {
1179         struct ctxop *ctx;
1180 
1181         for (ctx = t->t_ctx; ctx != NULL; ctx = ctx->next)
1182                 if (ctx->exit_op != NULL)
1183                         (ctx->exit_op)(t);
1184 }
1185 
1186 /*
1187  * freectx is called from thread_free() and exec() to get
1188  * rid of old thread context ops.
1189  */
1190 void
1191 freectx(kthread_t *t, int isexec)
1192 {
1193         struct ctxop *ctx;
1194 
1195         kpreempt_disable();
1196         while ((ctx = t->t_ctx) != NULL) {
1197                 t->t_ctx = ctx->next;
1198                 if (ctx->free_op != NULL)
1199                         (ctx->free_op)(ctx->arg, isexec);
1200                 kmem_free(ctx, sizeof (struct ctxop));
1201         }
1202         kpreempt_enable();
1203 }
1204 
1205 /*
1206  * freectx_ctx is called from lwp_create() when lwp is reused from
1207  * lwp_deathrow and its thread structure is added to thread_deathrow.
1208  * The thread structure to which this ctx was attached may already have
1209  * been freed by the thread reaper, so free_op implementations shouldn't
1210  * rely on that thread structure still being around.
1211  */
1212 void
1213 freectx_ctx(struct ctxop *ctx)
1214 {
1215         struct ctxop *nctx;
1216 
1217         ASSERT(ctx != NULL);
1218 
1219         kpreempt_disable();
1220         do {
1221                 nctx = ctx->next;
1222                 if (ctx->free_op != NULL)
1223                         (ctx->free_op)(ctx->arg, 0);
1224                 kmem_free(ctx, sizeof (struct ctxop));
1225         } while ((ctx = nctx) != NULL);
1226         kpreempt_enable();
1227 }
1228 
1229 /*
1230  * Set the thread running; arrange for it to be swapped in if necessary.
1231  */
1232 void
1233 setrun_locked(kthread_t *t)
1234 {
1235         ASSERT(THREAD_LOCK_HELD(t));
1236         if (t->t_state == TS_SLEEP) {
1237                 /*
1238                  * Take off sleep queue.
1239                  */
1240                 SOBJ_UNSLEEP(t->t_sobj_ops, t);
1241         } else if (t->t_state & (TS_RUN | TS_ONPROC)) {
1242                 /*
1243                  * Already on dispatcher queue.
1244                  */
1245                 return;
1246         } else if (t->t_state == TS_WAIT) {
1247                 waitq_setrun(t);
1248         } else if (t->t_state == TS_STOPPED) {
1249                 /*
1250                  * All of the sending of SIGCONT (TC_XSTART) and /proc
1251                  * (TC_PSTART) and lwp_continue() (TC_CSTART) must have
1252                  * requested that the thread be run.
1253                  * Just calling setrun() is not sufficient to set a stopped
1254                  * thread running.  TS_XSTART is always set if the thread
1255                  * is not stopped by a jobcontrol stop signal.
1256                  * TS_PSTART is always set if /proc is not controlling it.
1257                  * TS_CSTART is always set if lwp_suspend() didn't stop it.
1258                  * The thread won't be stopped unless one of these
1259                  * three mechanisms did it.
1260                  *
1261                  * These flags must be set before calling setrun_locked(t).
1262                  * They can't be passed as arguments because the streams
1263                  * code calls setrun() indirectly and the mechanism for
1264                  * doing so admits only one argument.  Note that the
1265                  * thread must be locked in order to change t_schedflags.
1266                  */
1267                 if ((t->t_schedflag & TS_ALLSTART) != TS_ALLSTART)
1268                         return;
1269                 /*
1270                  * Process is no longer stopped (a thread is running).
1271                  */
1272                 t->t_whystop = 0;
1273                 t->t_whatstop = 0;
1274                 /*
1275                  * Strictly speaking, we do not have to clear these
1276                  * flags here; they are cleared on entry to stop().
1277                  * However, they are confusing when doing kernel
1278                  * debugging or when they are revealed by ps(1).
1279                  */
1280                 t->t_schedflag &= ~TS_ALLSTART;
1281                 THREAD_TRANSITION(t);   /* drop stopped-thread lock */
1282                 ASSERT(t->t_lockp == &transition_lock);
1283                 ASSERT(t->t_wchan0 == NULL && t->t_wchan == NULL);
1284                 /*
1285                  * Let the class put the process on the dispatcher queue.
1286                  */
1287                 CL_SETRUN(t);
1288         }
1289 }
1290 
1291 void
1292 setrun(kthread_t *t)
1293 {
1294         thread_lock(t);
1295         setrun_locked(t);
1296         thread_unlock(t);
1297 }
1298 
1299 /*
1300  * Unpin an interrupted thread.
1301  *      When an interrupt occurs, the interrupt is handled on the stack
1302  *      of an interrupt thread, taken from a pool linked to the CPU structure.
1303  *
1304  *      When swtch() is switching away from an interrupt thread because it
1305  *      blocked or was preempted, this routine is called to complete the
1306  *      saving of the interrupted thread state, and returns the interrupted
1307  *      thread pointer so it may be resumed.
1308  *
1309  *      Called by swtch() only at high spl.
1310  */
1311 kthread_t *
1312 thread_unpin()
1313 {
1314         kthread_t       *t = curthread; /* current thread */
1315         kthread_t       *itp;           /* interrupted thread */
1316         int             i;              /* interrupt level */
1317         extern int      intr_passivate();
1318 
1319         ASSERT(t->t_intr != NULL);
1320 
1321         itp = t->t_intr;             /* interrupted thread */
1322         t->t_intr = NULL;            /* clear interrupt ptr */
1323 
1324         /*
1325          * Get state from interrupt thread for the one
1326          * it interrupted.
1327          */
1328 
1329         i = intr_passivate(t, itp);
1330 
1331         TRACE_5(TR_FAC_INTR, TR_INTR_PASSIVATE,
1332             "intr_passivate:level %d curthread %p (%T) ithread %p (%T)",
1333             i, t, t, itp, itp);
1334 
1335         /*
1336          * Dissociate the current thread from the interrupted thread's LWP.
1337          */
1338         t->t_lwp = NULL;
1339 
1340         /*
1341          * Interrupt handlers above the level that spinlocks block must
1342          * not block.
1343          */
1344 #if DEBUG
1345         if (i < 0 || i > LOCK_LEVEL)
1346                 cmn_err(CE_PANIC, "thread_unpin: ipl out of range %x", i);
1347 #endif
1348 
1349         /*
1350          * Compute the CPU's base interrupt level based on the active
1351          * interrupts.
1352          */
1353         ASSERT(CPU->cpu_intr_actv & (1 << i));
1354         set_base_spl();
1355 
1356         return (itp);
1357 }
1358 
1359 /*
1360  * Create and initialize an interrupt thread for the given CPU and link
1361  *      it onto the CPU's pool of interrupt threads.
1362  *      Called at spl7() or better.
1363  */
1364 void
1365 thread_create_intr(struct cpu *cp)
1366 {
1367         kthread_t *tp;
1368 
1369         tp = thread_create(NULL, 0,
1370             (void (*)())thread_create_intr, NULL, 0, &p0, TS_ONPROC, 0);
1371 
1372         /*
1373          * Set the thread in the TS_FREE state.  The state will change
1374          * to TS_ONPROC only while the interrupt is active.  Think of these
1375          * as being on a private free list for the CPU.  Being TS_FREE keeps
1376          * inactive interrupt threads out of debugger thread lists.
1377          *
1378          * We cannot call thread_create with TS_FREE because of the current
1379          * checks there for ONPROC.  Fix this when thread_create takes flags.
1380          */
1381         THREAD_FREEINTR(tp, cp);
1382 
1383         /*
1384          * Nobody should ever reference the credentials of an interrupt
1385          * thread so make it NULL to catch any such references.
1386          */
1387         tp->t_cred = NULL;
1388         tp->t_flag |= T_INTR_THREAD;
1389         tp->t_cpu = cp;
1390         tp->t_bound_cpu = cp;
1391         tp->t_disp_queue = cp->cpu_disp;
1392         tp->t_affinitycnt = 1;
1393         tp->t_preempt = 1;
1394 
1395         /*
1396          * Don't make a user-requested binding on this thread so that
1397          * the processor can be offlined.
1398          */
1399         tp->t_bind_cpu = PBIND_NONE; /* no USER-requested binding */
1400         tp->t_bind_pset = PS_NONE;
1401 
1402 #if defined(__i386) || defined(__amd64)
1403         tp->t_stk -= STACK_ALIGN;
1404         *(tp->t_stk) = 0;            /* terminate intr thread stack */
1405 #endif
1406 
1407         /*
1408          * Link onto CPU's interrupt pool.
1409          */
1410         tp->t_link = cp->cpu_intr_thread;
1411         cp->cpu_intr_thread = tp;
1412 }
1413 
1414 /*
1415  * TSD -- THREAD SPECIFIC DATA
1416  */
1417 static kmutex_t         tsd_mutex;       /* linked list spin lock */
1418 static uint_t           tsd_nkeys;       /* size of destructor array */
1419 /* per-key destructor funcs */
1420 static void             (**tsd_destructor)(void *);
1421 /* list of tsd_thread's */
1422 static struct tsd_thread        *tsd_list;
1423 
1424 /*
1425  * Default destructor
1426  *      Needed because NULL destructor means that the key is unused
1427  */
1428 /* ARGSUSED */
1429 void
1430 tsd_defaultdestructor(void *value)
1431 {}
1432 
1433 /*
1434  * Create a key (index into per thread array)
1435  *      Locks out tsd_create, tsd_destroy, and tsd_exit
1436  *      May allocate memory with lock held
1437  */
1438 void
1439 tsd_create(uint_t *keyp, void (*destructor)(void *))
1440 {
1441         int     i;
1442         uint_t  nkeys;
1443 
1444         /*
1445          * if key is allocated, do nothing
1446          */
1447         mutex_enter(&tsd_mutex);
1448         if (*keyp) {
1449                 mutex_exit(&tsd_mutex);
1450                 return;
1451         }
1452         /*
1453          * find an unused key
1454          */
1455         if (destructor == NULL)
1456                 destructor = tsd_defaultdestructor;
1457 
1458         for (i = 0; i < tsd_nkeys; ++i)
1459                 if (tsd_destructor[i] == NULL)
1460                         break;
1461 
1462         /*
1463          * if no unused keys, increase the size of the destructor array
1464          */
1465         if (i == tsd_nkeys) {
1466                 if ((nkeys = (tsd_nkeys << 1)) == 0)
1467                         nkeys = 1;
1468                 tsd_destructor =
1469                     (void (**)(void *))tsd_realloc((void *)tsd_destructor,
1470                     (size_t)(tsd_nkeys * sizeof (void (*)(void *))),
1471                     (size_t)(nkeys * sizeof (void (*)(void *))));
1472                 tsd_nkeys = nkeys;
1473         }
1474 
1475         /*
1476          * allocate the next available unused key
1477          */
1478         tsd_destructor[i] = destructor;
1479         *keyp = i + 1;
1480         mutex_exit(&tsd_mutex);
1481 }
1482 
1483 /*
1484  * Destroy a key -- this is for unloadable modules
1485  *
1486  * Assumes that the caller is preventing tsd_set and tsd_get
1487  * Locks out tsd_create, tsd_destroy, and tsd_exit
1488  * May free memory with lock held
1489  */
1490 void
1491 tsd_destroy(uint_t *keyp)
1492 {
1493         uint_t key;
1494         struct tsd_thread *tsd;
1495 
1496         /*
1497          * protect the key namespace and our destructor lists
1498          */
1499         mutex_enter(&tsd_mutex);
1500         key = *keyp;
1501         *keyp = 0;
1502 
1503         ASSERT(key <= tsd_nkeys);
1504 
1505         /*
1506          * if the key is valid
1507          */
1508         if (key != 0) {
1509                 uint_t k = key - 1;
1510                 /*
1511                  * for every thread with TSD, call key's destructor
1512                  */
1513                 for (tsd = tsd_list; tsd; tsd = tsd->ts_next) {
1514                         /*
1515                          * no TSD for key in this thread
1516                          */
1517                         if (key > tsd->ts_nkeys)
1518                                 continue;
1519                         /*
1520                          * call destructor for key
1521                          */
1522                         if (tsd->ts_value[k] && tsd_destructor[k])
1523                                 (*tsd_destructor[k])(tsd->ts_value[k]);
1524                         /*
1525                          * reset value for key
1526                          */
1527                         tsd->ts_value[k] = NULL;
1528                 }
1529                 /*
1530                  * actually free the key (NULL destructor == unused)
1531                  */
1532                 tsd_destructor[k] = NULL;
1533         }
1534 
1535         mutex_exit(&tsd_mutex);
1536 }
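
/*
 * Illustrative sketch (not part of the original code): how an unloadable
 * module might use tsd_create()/tsd_destroy() around a module-private key.
 * The names mod_tsd_key, mod_state_t, mod_state_dtor, mod_init and mod_fini
 * are hypothetical.  A zero key tells tsd_create() that no key has been
 * allocated yet; tsd_destroy() in the teardown path runs the destructor for
 * every thread still holding a value and marks the key unused again.
 */
#if 0	/* example only, not compiled */
static uint_t mod_tsd_key;		/* 0 means "no key allocated yet" */

typedef struct mod_state {
	int	ms_count;
} mod_state_t;

static void
mod_state_dtor(void *arg)
{
	kmem_free(arg, sizeof (mod_state_t));
}

static void
mod_init(void)
{
	tsd_create(&mod_tsd_key, mod_state_dtor);
}

static void
mod_fini(void)
{
	tsd_destroy(&mod_tsd_key);
}
#endif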
1537 
1538 /*
1539  * Quickly return the per thread value that was stored with the specified key
1540  * Assumes the caller is protecting key from tsd_create and tsd_destroy
1541  */
1542 void *
1543 tsd_get(uint_t key)
1544 {
1545         return (tsd_agent_get(curthread, key));
1546 }
1547 
1548 /*
1549  * Set a per thread value indexed with the specified key
1550  */
1551 int
1552 tsd_set(uint_t key, void *value)
1553 {
1554         return (tsd_agent_set(curthread, key, value));
1555 }
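
/*
 * Illustrative sketch (not part of the original code): the usual
 * lazy-allocation pattern built on tsd_get()/tsd_set().  mod_tsd_key and
 * mod_state_t are the hypothetical names from the sketch above, and the
 * key is assumed to have been set up with tsd_create().
 */
#if 0	/* example only, not compiled */
static mod_state_t *
mod_thread_state(void)
{
	mod_state_t *msp;

	if ((msp = tsd_get(mod_tsd_key)) == NULL) {
		msp = kmem_zalloc(sizeof (mod_state_t), KM_SLEEP);
		(void) tsd_set(mod_tsd_key, msp);
	}
	return (msp);
}
#endif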
1556 
1557 /*
 * Like tsd_get(), except that the agent lwp can get the tsd of
 * another thread in the same process (the agent thread only runs when the
 * process is completely stopped by /proc), or syslwp can get the tsd of a
 * thread it is in the middle of creating.
1561  */
1562 void *
1563 tsd_agent_get(kthread_t *t, uint_t key)
1564 {
1565         struct tsd_thread *tsd = t->t_tsd;
1566 
1567         ASSERT(t == curthread ||
1568             ttoproc(t)->p_agenttp == curthread || t->t_state == TS_STOPPED);
1569 
1570         if (key && tsd != NULL && key <= tsd->ts_nkeys)
1571                 return (tsd->ts_value[key - 1]);
1572         return (NULL);
1573 }
1574 
1575 /*
1576  * Like tsd_set(), except that the agent lwp can set the tsd of
1577  * another thread in the same process, or syslwp can set the tsd
1578  * of a thread it's in the middle of creating.
1579  *
1580  * Assumes the caller is protecting key from tsd_create and tsd_destroy
1581  * May lock out tsd_destroy (and tsd_create), may allocate memory with
1582  * lock held
1583  */
1584 int
1585 tsd_agent_set(kthread_t *t, uint_t key, void *value)
1586 {
1587         struct tsd_thread *tsd = t->t_tsd;
1588 
1589         ASSERT(t == curthread ||
1590             ttoproc(t)->p_agenttp == curthread || t->t_state == TS_STOPPED);
1591 
1592         if (key == 0)
1593                 return (EINVAL);
1594         if (tsd == NULL)
1595                 tsd = t->t_tsd = kmem_zalloc(sizeof (*tsd), KM_SLEEP);
1596         if (key <= tsd->ts_nkeys) {
1597                 tsd->ts_value[key - 1] = value;
1598                 return (0);
1599         }
1600 
1601         ASSERT(key <= tsd_nkeys);
1602 
1603         /*
1604          * lock out tsd_destroy()
1605          */
1606         mutex_enter(&tsd_mutex);
1607         if (tsd->ts_nkeys == 0) {
1608                 /*
1609                  * Link onto list of threads with TSD
1610                  */
1611                 if ((tsd->ts_next = tsd_list) != NULL)
1612                         tsd_list->ts_prev = tsd;
1613                 tsd_list = tsd;
1614         }
1615 
1616         /*
1617          * Allocate thread local storage and set the value for key
1618          */
1619         tsd->ts_value = tsd_realloc(tsd->ts_value,
1620             tsd->ts_nkeys * sizeof (void *),
1621             key * sizeof (void *));
1622         tsd->ts_nkeys = key;
1623         tsd->ts_value[key - 1] = value;
1624         mutex_exit(&tsd_mutex);
1625 
1626         return (0);
1627 }
1628 
1629 
1630 /*
1631  * Return the per thread value that was stored with the specified key
1632  *      If necessary, create the key and the value
1633  *      Assumes the caller is protecting *keyp from tsd_destroy
1634  */
1635 void *
1636 tsd_getcreate(uint_t *keyp, void (*destroy)(void *), void *(*allocate)(void))
1637 {
1638         void *value;
1639         uint_t key = *keyp;
1640         struct tsd_thread *tsd = curthread->t_tsd;
1641 
1642         if (tsd == NULL)
1643                 tsd = curthread->t_tsd = kmem_zalloc(sizeof (*tsd), KM_SLEEP);
1644         if (key && key <= tsd->ts_nkeys && (value = tsd->ts_value[key - 1]))
1645                 return (value);
1646         if (key == 0)
1647                 tsd_create(keyp, destroy);
1648         (void) tsd_set(*keyp, value = (*allocate)());
1649 
1650         return (value);
1651 }
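
/*
 * Illustrative sketch (not part of the original code): tsd_getcreate()
 * collapses the create/get/set pattern above into a single call, assuming
 * the caller protects the key from tsd_destroy().  mod_tsd_key, mod_state_t,
 * mod_state_dtor and mod_state_alloc are hypothetical; this is the same
 * mod_thread_state() helper from the earlier sketch, rewritten.
 */
#if 0	/* example only, not compiled */
static void *
mod_state_alloc(void)
{
	return (kmem_zalloc(sizeof (mod_state_t), KM_SLEEP));
}

static mod_state_t *
mod_thread_state(void)
{
	return (tsd_getcreate(&mod_tsd_key, mod_state_dtor, mod_state_alloc));
}
#endif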
1652 
1653 /*
1654  * Called from thread_exit() to run the destructor function for each tsd
1655  *      Locks out tsd_create and tsd_destroy
1656  *      Assumes that the destructor *DOES NOT* use tsd
1657  */
1658 void
1659 tsd_exit(void)
1660 {
1661         int i;
1662         struct tsd_thread *tsd = curthread->t_tsd;
1663 
1664         if (tsd == NULL)
1665                 return;
1666 
1667         if (tsd->ts_nkeys == 0) {
1668                 kmem_free(tsd, sizeof (*tsd));
1669                 curthread->t_tsd = NULL;
1670                 return;
1671         }
1672 
1673         /*
1674          * lock out tsd_create and tsd_destroy, call
1675          * the destructor, and mark the value as destroyed.
1676          */
1677         mutex_enter(&tsd_mutex);
1678 
1679         for (i = 0; i < tsd->ts_nkeys; i++) {
1680                 if (tsd->ts_value[i] && tsd_destructor[i])
1681                         (*tsd_destructor[i])(tsd->ts_value[i]);
1682                 tsd->ts_value[i] = NULL;
1683         }
1684 
1685         /*
1686          * remove from linked list of threads with TSD
1687          */
1688         if (tsd->ts_next)
1689                 tsd->ts_next->ts_prev = tsd->ts_prev;
1690         if (tsd->ts_prev)
1691                 tsd->ts_prev->ts_next = tsd->ts_next;
1692         if (tsd_list == tsd)
1693                 tsd_list = tsd->ts_next;
1694 
1695         mutex_exit(&tsd_mutex);
1696 
1697         /*
1698          * free up the TSD
1699          */
1700         kmem_free(tsd->ts_value, tsd->ts_nkeys * sizeof (void *));
1701         kmem_free(tsd, sizeof (struct tsd_thread));
1702         curthread->t_tsd = NULL;
1703 }
1704 
/*
 * realloc: allocate a larger zeroed buffer, copy over the old contents,
 * and free the old buffer.
 */
1708 static void *
1709 tsd_realloc(void *old, size_t osize, size_t nsize)
1710 {
1711         void *new;
1712 
1713         new = kmem_zalloc(nsize, KM_SLEEP);
1714         if (old) {
1715                 bcopy(old, new, osize);
1716                 kmem_free(old, osize);
1717         }
1718         return (new);
1719 }
1720 
1721 /*
1722  * Return non-zero if an interrupt is being serviced.
1723  */
1724 int
1725 servicing_interrupt()
1726 {
1727         int onintr = 0;
1728 
	/* Are we an interrupt thread? */
1730         if (curthread->t_flag & T_INTR_THREAD)
1731                 return (1);
1732         /* Are we servicing a high level interrupt? */
1733         if (CPU_ON_INTR(CPU)) {
1734                 kpreempt_disable();
1735                 onintr = CPU_ON_INTR(CPU);
1736                 kpreempt_enable();
1737         }
1738         return (onintr);
1739 }
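
/*
 * Illustrative sketch (not part of the original code): a common use of
 * servicing_interrupt() is to avoid sleeping allocations while running in
 * interrupt context.  mod_alloc_buf() is hypothetical.
 */
#if 0	/* example only, not compiled */
static void *
mod_alloc_buf(size_t size)
{
	int kmflag = servicing_interrupt() ? KM_NOSLEEP : KM_SLEEP;

	return (kmem_alloc(size, kmflag));
}
#endif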
1740 
1741 
1742 /*
1743  * Change the dispatch priority of a thread in the system.
1744  * Used when raising or lowering a thread's priority.
1745  * (E.g., priority inheritance)
1746  *
 * Since threads are queued according to their priority, we
 * must check the thread's state to determine whether it
 * is on a queue somewhere. If it is, we've got to:
1750  *
1751  *      o Dequeue the thread.
1752  *      o Change its effective priority.
1753  *      o Enqueue the thread.
1754  *
1755  * Assumptions: The thread whose priority we wish to change
1756  * must be locked before we call thread_change_(e)pri().
 * The thread_change_(e)pri() function doesn't drop the thread
1758  * lock--that must be done by its caller.
1759  */
1760 void
1761 thread_change_epri(kthread_t *t, pri_t disp_pri)
1762 {
1763         uint_t  state;
1764 
1765         ASSERT(THREAD_LOCK_HELD(t));
1766 
1767         /*
1768          * If the inherited priority hasn't actually changed,
1769          * just return.
1770          */
1771         if (t->t_epri == disp_pri)
1772                 return;
1773 
1774         state = t->t_state;
1775 
1776         /*
1777          * If it's not on a queue, change the priority with impunity.
1778          */
1779         if ((state & (TS_SLEEP | TS_RUN | TS_WAIT)) == 0) {
1780                 t->t_epri = disp_pri;
1781                 if (state == TS_ONPROC) {
1782                         cpu_t *cp = t->t_disp_queue->disp_cpu;
1783 
1784                         if (t == cp->cpu_dispthread)
1785                                 cp->cpu_dispatch_pri = DISP_PRIO(t);
1786                 }
1787         } else if (state == TS_SLEEP) {
1788                 /*
1789                  * Take the thread out of its sleep queue.
1790                  * Change the inherited priority.
1791                  * Re-enqueue the thread.
1792                  * Each synchronization object exports a function
1793                  * to do this in an appropriate manner.
1794                  */
1795                 SOBJ_CHANGE_EPRI(t->t_sobj_ops, t, disp_pri);
1796         } else if (state == TS_WAIT) {
1797                 /*
1798                  * Re-enqueue a thread on the wait queue if its
1799                  * effective priority needs to change.
1800                  */
1801                 if (disp_pri != t->t_epri)
1802                         waitq_change_pri(t, disp_pri);
1803         } else {
1804                 /*
1805                  * The thread is on a run queue.
1806                  * Note: setbackdq() may not put the thread
1807                  * back on the same run queue where it originally
1808                  * resided.
1809                  */
1810                 (void) dispdeq(t);
1811                 t->t_epri = disp_pri;
1812                 setbackdq(t);
1813         }
1814         schedctl_set_cidpri(t);
1815 }
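
/*
 * Illustrative sketch (not part of the original code): a caller such as the
 * priority-inheritance code is expected to hold the thread lock across the
 * call; thread_change_epri() does not drop it.  mod_inherit_pri() and
 * new_epri are hypothetical.
 */
#if 0	/* example only, not compiled */
static void
mod_inherit_pri(kthread_t *t, pri_t new_epri)
{
	thread_lock(t);
	thread_change_epri(t, new_epri);
	thread_unlock(t);
}
#endif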
1816 
1817 /*
1818  * Function: Change the t_pri field of a thread.
1819  * Side Effects: Adjust the thread ordering on a run queue
1820  *               or sleep queue, if necessary.
1821  * Returns: 1 if the thread was on a run queue, else 0.
1822  */
1823 int
1824 thread_change_pri(kthread_t *t, pri_t disp_pri, int front)
1825 {
1826         uint_t  state;
1827         int     on_rq = 0;
1828 
1829         ASSERT(THREAD_LOCK_HELD(t));
1830 
1831         state = t->t_state;
1832         THREAD_WILLCHANGE_PRI(t, disp_pri);
1833 
1834         /*
1835          * If it's not on a queue, change the priority with impunity.
1836          */
1837         if ((state & (TS_SLEEP | TS_RUN | TS_WAIT)) == 0) {
1838                 t->t_pri = disp_pri;
1839 
1840                 if (state == TS_ONPROC) {
1841                         cpu_t *cp = t->t_disp_queue->disp_cpu;
1842 
1843                         if (t == cp->cpu_dispthread)
1844                                 cp->cpu_dispatch_pri = DISP_PRIO(t);
1845                 }
1846         } else if (state == TS_SLEEP) {
1847                 /*
1848                  * If the priority has changed, take the thread out of
1849                  * its sleep queue and change the priority.
1850                  * Re-enqueue the thread.
1851                  * Each synchronization object exports a function
1852                  * to do this in an appropriate manner.
1853                  */
1854                 if (disp_pri != t->t_pri)
1855                         SOBJ_CHANGE_PRI(t->t_sobj_ops, t, disp_pri);
1856         } else if (state == TS_WAIT) {
1857                 /*
1858                  * Re-enqueue a thread on the wait queue if its
1859                  * priority needs to change.
1860                  */
1861                 if (disp_pri != t->t_pri)
1862                         waitq_change_pri(t, disp_pri);
1863         } else {
1864                 /*
1865                  * The thread is on a run queue.
1866                  * Note: setbackdq() may not put the thread
1867                  * back on the same run queue where it originally
1868                  * resided.
1869                  *
1870                  * We still requeue the thread even if the priority
1871                  * is unchanged to preserve round-robin (and other)
1872                  * effects between threads of the same priority.
1873                  */
1874                 on_rq = dispdeq(t);
1875                 ASSERT(on_rq);
1876                 t->t_pri = disp_pri;
1877                 if (front) {
1878                         setfrontdq(t);
1879                 } else {
1880                         setbackdq(t);
1881                 }
1882         }
1883         schedctl_set_cidpri(t);
1884         return (on_rq);
1885 }
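
/*
 * Illustrative sketch (not part of the original code): a scheduling class
 * might apply a new dispatch priority like this, passing front == 0 so the
 * thread is requeued at the back of its run queue.  mod_set_pri() and
 * newpri are hypothetical.
 */
#if 0	/* example only, not compiled */
static void
mod_set_pri(kthread_t *t, pri_t newpri)
{
	thread_lock(t);
	(void) thread_change_pri(t, newpri, 0);
	thread_unlock(t);
}
#endif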
1886 
1887 /*
 * Called when the tunable kmem_stackinfo is set: fill the kernel thread
 * stack with a specific pattern.
1890  */
1891 static void
1892 stkinfo_begin(kthread_t *t)
1893 {
1894         caddr_t start;  /* stack start */
1895         caddr_t end;    /* stack end  */
1896         uint64_t *ptr;  /* pattern pointer */
1897 
1898         /*
1899          * Stack grows up or down, see thread_create(),
1900          * compute stack memory area start and end (start < end).
1901          */
1902         if (t->t_stk > t->t_stkbase) {
1903                 /* stack grows down */
1904                 start = t->t_stkbase;
1905                 end = t->t_stk;
1906         } else {
1907                 /* stack grows up */
1908                 start = t->t_stk;
1909                 end = t->t_stkbase;
1910         }
1911 
1912         /*
	 * The stackinfo pattern size is 8 bytes.  Ensure proper 8-byte
	 * alignment of start and end within the stack area boundaries
1915          * (protection against corrupt t_stkbase/t_stk data).
1916          */
1917         if ((((uintptr_t)start) & 0x7) != 0) {
1918                 start = (caddr_t)((((uintptr_t)start) & (~0x7)) + 8);
1919         }
1920         end = (caddr_t)(((uintptr_t)end) & (~0x7));
1921 
1922         if ((end <= start) || (end - start) > (1024 * 1024)) {
1923                 /* negative or stack size > 1 meg, assume bogus */
1924                 return;
1925         }
1926 
1927         /* fill stack area with a pattern (instead of zeros) */
1928         ptr = (uint64_t *)((void *)start);
1929         while (ptr < (uint64_t *)((void *)end)) {
1930                 *ptr++ = KMEM_STKINFO_PATTERN;
1931         }
1932 }
1933 
1934 
1935 /*
 * Called when the tunable kmem_stackinfo is set: create the stackinfo log
 * if it doesn't already exist, compute the percentage of the kernel stack
 * actually used, and record it in the log if it ranks among the highest
 * percentages seen so far.
1939  */
1940 static void
1941 stkinfo_end(kthread_t *t)
1942 {
1943         caddr_t start;  /* stack start */
1944         caddr_t end;    /* stack end  */
1945         uint64_t *ptr;  /* pattern pointer */
1946         size_t stksz;   /* stack size */
1947         size_t smallest = 0;
1948         size_t percent = 0;
1949         uint_t index = 0;
1950         uint_t i;
1951         static size_t smallest_percent = (size_t)-1;
1952         static uint_t full = 0;
1953 
	/* create the stackinfo log, if it doesn't already exist */
1955         mutex_enter(&kmem_stkinfo_lock);
1956         if (kmem_stkinfo_log == NULL) {
1957                 kmem_stkinfo_log = (kmem_stkinfo_t *)
1958                     kmem_zalloc(KMEM_STKINFO_LOG_SIZE *
1959                     (sizeof (kmem_stkinfo_t)), KM_NOSLEEP);
1960                 if (kmem_stkinfo_log == NULL) {
1961                         mutex_exit(&kmem_stkinfo_lock);
1962                         return;
1963                 }
1964         }
1965         mutex_exit(&kmem_stkinfo_lock);
1966 
1967         /*
1968          * Stack grows up or down, see thread_create(),
1969          * compute stack memory area start and end (start < end).
1970          */
1971         if (t->t_stk > t->t_stkbase) {
1972                 /* stack grows down */
1973                 start = t->t_stkbase;
1974                 end = t->t_stk;
1975         } else {
1976                 /* stack grows up */
1977                 start = t->t_stk;
1978                 end = t->t_stkbase;
1979         }
1980 
1981         /* stack size as found in kthread_t */
1982         stksz = end - start;
1983 
1984         /*
	 * The stackinfo pattern size is 8 bytes.  Ensure proper 8-byte
	 * alignment of start and end within the stack area boundaries
1987          * (protection against corrupt t_stkbase/t_stk data).
1988          */
1989         if ((((uintptr_t)start) & 0x7) != 0) {
1990                 start = (caddr_t)((((uintptr_t)start) & (~0x7)) + 8);
1991         }
1992         end = (caddr_t)(((uintptr_t)end) & (~0x7));
1993 
1994         if ((end <= start) || (end - start) > (1024 * 1024)) {
1995                 /* negative or stack size > 1 meg, assume bogus */
1996                 return;
1997         }
1998 
1999         /* search until no pattern in the stack */
2000         if (t->t_stk > t->t_stkbase) {
2001                 /* stack grows down */
2002 #if defined(__i386) || defined(__amd64)
2003                 /*
		 * 6 longs are pushed on the stack, see thread_load().  Skip
		 * them, so if the kthread has never run, percent is zero.
		 * 8-byte alignment is preserved for a 32-bit kernel:
		 * 6 x 4 = 24, and 24 is a multiple of 8.
2008                  *
2009                  */
2010                 end -= (6 * sizeof (long));
2011 #endif
2012                 ptr = (uint64_t *)((void *)start);
2013                 while (ptr < (uint64_t *)((void *)end)) {
2014                         if (*ptr != KMEM_STKINFO_PATTERN) {
2015                                 percent = stkinfo_percent(end,
2016                                     start, (caddr_t)ptr);
2017                                 break;
2018                         }
2019                         ptr++;
2020                 }
2021         } else {
2022                 /* stack grows up */
2023                 ptr = (uint64_t *)((void *)end);
2024                 ptr--;
2025                 while (ptr >= (uint64_t *)((void *)start)) {
2026                         if (*ptr != KMEM_STKINFO_PATTERN) {
2027                                 percent = stkinfo_percent(start,
2028                                     end, (caddr_t)ptr);
2029                                 break;
2030                         }
2031                         ptr--;
2032                 }
2033         }
2034 
2035         DTRACE_PROBE3(stack__usage, kthread_t *, t,
2036             size_t, stksz, size_t, percent);
2037 
2038         if (percent == 0) {
2039                 return;
2040         }
2041 
2042         mutex_enter(&kmem_stkinfo_lock);
2043         if (full == KMEM_STKINFO_LOG_SIZE && percent < smallest_percent) {
2044                 /*
2045                  * The log is full and already contains the highest values
2046                  */
2047                 mutex_exit(&kmem_stkinfo_lock);
2048                 return;
2049         }
2050 
2051         /* keep a log of the highest used stack */
2052         for (i = 0; i < KMEM_STKINFO_LOG_SIZE; i++) {
2053                 if (kmem_stkinfo_log[i].percent == 0) {
2054                         index = i;
2055                         full++;
2056                         break;
2057                 }
2058                 if (smallest == 0) {
2059                         smallest = kmem_stkinfo_log[i].percent;
2060                         index = i;
2061                         continue;
2062                 }
2063                 if (kmem_stkinfo_log[i].percent < smallest) {
2064                         smallest = kmem_stkinfo_log[i].percent;
2065                         index = i;
2066                 }
2067         }
2068 
2069         if (percent >= kmem_stkinfo_log[index].percent) {
2070                 kmem_stkinfo_log[index].kthread = (caddr_t)t;
2071                 kmem_stkinfo_log[index].t_startpc = (caddr_t)t->t_startpc;
2072                 kmem_stkinfo_log[index].start = start;
2073                 kmem_stkinfo_log[index].stksz = stksz;
2074                 kmem_stkinfo_log[index].percent = percent;
2075                 kmem_stkinfo_log[index].t_tid = t->t_tid;
2076                 kmem_stkinfo_log[index].cmd[0] = '\0';
2077                 if (t->t_tid != 0) {
2078                         stksz = strlen((t->t_procp)->p_user.u_comm);
2079                         if (stksz >= KMEM_STKINFO_STR_SIZE) {
2080                                 stksz = KMEM_STKINFO_STR_SIZE - 1;
2081                                 kmem_stkinfo_log[index].cmd[stksz] = '\0';
2082                         } else {
2083                                 stksz += 1;
2084                         }
2085                         (void) memcpy(kmem_stkinfo_log[index].cmd,
2086                             (t->t_procp)->p_user.u_comm, stksz);
2087                 }
2088                 if (percent < smallest_percent) {
2089                         smallest_percent = percent;
2090                 }
2091         }
2092         mutex_exit(&kmem_stkinfo_lock);
2093 }
2094 
2095 /*
2096  * Tunable kmem_stackinfo is set, compute stack utilization percentage.
2097  */
2098 static size_t
2099 stkinfo_percent(caddr_t t_stk, caddr_t t_stkbase, caddr_t sp)
2100 {
2101         size_t percent;
2102         size_t s;
2103 
2104         if (t_stk > t_stkbase) {
2105                 /* stack grows down */
2106                 if (sp > t_stk) {
2107                         return (0);
2108                 }
2109                 if (sp < t_stkbase) {
2110                         return (100);
2111                 }
2112                 percent = t_stk - sp + 1;
2113                 s = t_stk - t_stkbase + 1;
2114         } else {
2115                 /* stack grows up */
2116                 if (sp < t_stk) {
2117                         return (0);
2118                 }
2119                 if (sp > t_stkbase) {
2120                         return (100);
2121                 }
2122                 percent = sp - t_stk + 1;
2123                 s = t_stkbase - t_stk + 1;
2124         }
2125         percent = ((100 * percent) / s) + 1;
2126         if (percent > 100) {
2127                 percent = 100;
2128         }
2129         return (percent);
2130 }
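
/*
 * Worked example (not part of the original code): for a downward-growing
 * stack with t_stk = 0x3000 and t_stkbase = 0x1000 (s = 0x2001 bytes), a
 * deepest non-pattern word at sp = 0x2800 gives 0x3000 - 0x2800 + 1 = 0x801
 * bytes used, so percent = ((100 * 0x801) / 0x2001) + 1 = 25 + 1 = 26.
 * The trailing "+ 1" keeps the result non-zero, which stkinfo_end() relies
 * on to tell a used log slot (percent != 0) from a free one.
 */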