1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2015, Joyent, Inc.
  25  */
  26 
  27 #include <sys/types.h>
  28 #include <sys/param.h>
  29 #include <sys/sysmacros.h>
  30 #include <sys/cred.h>
  31 #include <sys/proc.h>
  32 #include <sys/session.h>
  33 #include <sys/strsubr.h>
  34 #include <sys/user.h>
  35 #include <sys/priocntl.h>
  36 #include <sys/class.h>
  37 #include <sys/disp.h>
  38 #include <sys/procset.h>
  39 #include <sys/debug.h>
  40 #include <sys/kmem.h>
  41 #include <sys/errno.h>
  42 #include <sys/fx.h>
  43 #include <sys/fxpriocntl.h>
  44 #include <sys/cpuvar.h>
  45 #include <sys/systm.h>
  46 #include <sys/vtrace.h>
  47 #include <sys/schedctl.h>
  48 #include <sys/tnf_probe.h>
  49 #include <sys/sunddi.h>
  50 #include <sys/spl.h>
  51 #include <sys/modctl.h>
  52 #include <sys/policy.h>
  53 #include <sys/sdt.h>
  54 #include <sys/cpupart.h>
  55 #include <sys/cpucaps.h>
  56 
  57 static pri_t fx_init(id_t, int, classfuncs_t **);
  58 
  59 static struct sclass csw = {
  60         "FX",
  61         fx_init,
  62         0
  63 };
  64 
  65 static struct modlsched modlsched = {
  66         &mod_schedops, "Fixed priority sched class", &csw
  67 };
  68 
  69 static struct modlinkage modlinkage = {
  70         MODREV_1, (void *)&modlsched, NULL
  71 };
  72 
  73 
  74 #define FX_MAX_UNPRIV_PRI       0       /* maximum unprivileged priority */
  75 
  76 /*
  77  * The fxproc_t structures that have a registered callback vector
  78  * are also kept in an array of circular doubly linked lists. A hash on
  79  * the thread id (from ddi_get_kt_did()) is used to determine the list
  80  * on which each such fxproc structure should be placed. Each list has a
  81  * dummy "head" which is never removed, so the list is never empty.
  82  */
  83 
  84 #define FX_CB_LISTS 16          /* number of lists; must be a power of 2 */
  85 #define FX_CB_LIST_HASH(ktid)   ((uint_t)ktid & (FX_CB_LISTS - 1))
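/*
 * For example, with FX_CB_LISTS at 16 the hash keeps only the low four
 * bits of the thread id, so a ktid of 33 selects list 1 (33 & 15).
 */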
  86 
  87 /* Insert fxproc into callback list */
  88 #define FX_CB_LIST_INSERT(fxpp)                                         \
  89 {                                                                       \
  90         int index = FX_CB_LIST_HASH(fxpp->fx_ktid);                  \
  91         kmutex_t *lockp = &fx_cb_list_lock[index];                  \
  92         fxproc_t *headp = &fx_cb_plisthead[index];                  \
  93         mutex_enter(lockp);                                             \
  94         fxpp->fx_cb_next = headp->fx_cb_next;                             \
  95         fxpp->fx_cb_prev = headp;                                    \
  96         headp->fx_cb_next->fx_cb_prev = fxpp;                             \
  97         headp->fx_cb_next = fxpp;                                    \
  98         mutex_exit(lockp);                                              \
  99 }
 100 
 101 /*
 102  * Remove thread from callback list.
 103  */
 104 #define FX_CB_LIST_DELETE(fxpp)                                         \
 105 {                                                                       \
 106         int index = FX_CB_LIST_HASH(fxpp->fx_ktid);                  \
 107         kmutex_t *lockp = &fx_cb_list_lock[index];                  \
 108         mutex_enter(lockp);                                             \
 109         fxpp->fx_cb_prev->fx_cb_next = fxpp->fx_cb_next;               \
 110         fxpp->fx_cb_next->fx_cb_prev = fxpp->fx_cb_prev;               \
 111         mutex_exit(lockp);                                              \
 112 }
 113 
 114 #define FX_HAS_CB(fxpp) (fxpp->fx_callback != NULL)
 115 
 116 /* adjust pri to be between 0 and fx_maxumdpri */
 117 
 118 #define FX_ADJUST_PRI(pri)                                              \
 119 {                                                                       \
 120         if (pri < 0)                                                 \
 121                 pri = 0;                                                \
 122         else if (pri > fx_maxumdpri)                                         \
 123                 pri = fx_maxumdpri;                                     \
 124 }
 125 
 126 #define FX_ADJUST_QUANTUM(q)                                            \
 127 {                                                                       \
 128         if (q > INT_MAX)                                             \
 129                 q = INT_MAX;                                            \
 130         else if (q <= 0)                                             \
 131                 q = FX_TQINF;                                           \
 132 }
 133 
 134 #define FX_ISVALID(pri, quantum) \
 135         (((pri >= 0) || (pri == FX_CB_NOCHANGE)) &&                  \
 136             ((quantum >= 0) || (quantum == FX_NOCHANGE) ||           \
 137                 (quantum == FX_TQDEF) || (quantum == FX_TQINF)))
 138 
 139 
 140 static id_t     fx_cid;         /* fixed priority class ID */
 141 static fxdpent_t *fx_dptbl;     /* fixed priority disp parameter table */
 142 
 143 static pri_t    fx_maxupri = FXMAXUPRI;
 144 static pri_t    fx_maxumdpri;   /* max user mode fixed priority */
 145 
 146 static pri_t    fx_maxglobpri;  /* maximum global priority used by fx class */
 147 static kmutex_t fx_dptblock;    /* protects fixed priority dispatch table */
 148 
 149 
 150 static kmutex_t fx_cb_list_lock[FX_CB_LISTS];   /* protects list of fxprocs */
 151                                                 /* that have callbacks */
 152 static fxproc_t fx_cb_plisthead[FX_CB_LISTS];   /* dummy fxproc at head of */
 153                                                 /* list of fxprocs with */
 154                                                 /* callbacks */
 155 
 156 static int      fx_admin(caddr_t, cred_t *);
 157 static int      fx_getclinfo(void *);
 158 static int      fx_parmsin(void *);
 159 static int      fx_parmsout(void *, pc_vaparms_t *);
 160 static int      fx_vaparmsin(void *, pc_vaparms_t *);
 161 static int      fx_vaparmsout(void *, pc_vaparms_t *);
 162 static int      fx_getclpri(pcpri_t *);
 163 static int      fx_alloc(void **, int);
 164 static void     fx_free(void *);
 165 static int      fx_enterclass(kthread_t *, id_t, void *, cred_t *, void *);
 166 static void     fx_exitclass(void *);
 167 static int      fx_canexit(kthread_t *, cred_t *);
 168 static int      fx_fork(kthread_t *, kthread_t *, void *);
 169 static void     fx_forkret(kthread_t *, kthread_t *);
 170 static void     fx_parmsget(kthread_t *, void *);
 171 static int      fx_parmsset(kthread_t *, void *, id_t, cred_t *);
 172 static void     fx_stop(kthread_t *, int, int);
 173 static void     fx_exit(kthread_t *);
 174 static pri_t    fx_swapin(kthread_t *, int);
 175 static pri_t    fx_swapout(kthread_t *, int);
 176 static void     fx_trapret(kthread_t *);
 177 static void     fx_preempt(kthread_t *);
 178 static void     fx_setrun(kthread_t *);
 179 static void     fx_sleep(kthread_t *);
 180 static void     fx_tick(kthread_t *);
 181 static void     fx_wakeup(kthread_t *);
 182 static int      fx_donice(kthread_t *, cred_t *, int, int *);
 183 static int      fx_doprio(kthread_t *, cred_t *, int, int *);
 184 static pri_t    fx_globpri(kthread_t *);
 185 static void     fx_yield(kthread_t *);
 186 static void     fx_nullsys();
 187 
 188 extern fxdpent_t *fx_getdptbl(void);
 189 
 190 static void     fx_change_priority(kthread_t *, fxproc_t *);
 191 static fxproc_t *fx_list_lookup(kt_did_t);
 192 static void fx_list_release(fxproc_t *);
 193 
 194 
 195 static struct classfuncs fx_classfuncs = {
 196         /* class functions */
 197         fx_admin,
 198         fx_getclinfo,
 199         fx_parmsin,
 200         fx_parmsout,
 201         fx_vaparmsin,
 202         fx_vaparmsout,
 203         fx_getclpri,
 204         fx_alloc,
 205         fx_free,
 206 
 207         /* thread functions */
 208         fx_enterclass,
 209         fx_exitclass,
 210         fx_canexit,
 211         fx_fork,
 212         fx_forkret,
 213         fx_parmsget,
 214         fx_parmsset,
 215         fx_stop,
 216         fx_exit,
 217         fx_nullsys,     /* active */
 218         fx_nullsys,     /* inactive */
 219         fx_swapin,
 220         fx_swapout,
 221         fx_trapret,
 222         fx_preempt,
 223         fx_setrun,
 224         fx_sleep,
 225         fx_tick,
 226         fx_wakeup,
 227         fx_donice,
 228         fx_globpri,
 229         fx_nullsys,     /* set_process_group */
 230         fx_yield,
 231         fx_doprio,
 232 };
 233 
 234 
 235 int
 236 _init()
 237 {
 238         return (mod_install(&modlinkage));
 239 }
 240 
 241 int
 242 _fini()
 243 {
 244         return (EBUSY);
 245 }
 246 
 247 int
 248 _info(struct modinfo *modinfop)
 249 {
 250         return (mod_info(&modlinkage, modinfop));
 251 }
 252 
 253 /*
 254  * Fixed priority class initialization. Called by dispinit() at boot time.
 255  * We can ignore the clparmsz argument since we know that the smallest
 256  * possible parameter buffer is big enough for us.
 257  */
 258 /* ARGSUSED */
 259 static pri_t
 260 fx_init(id_t cid, int clparmsz, classfuncs_t **clfuncspp)
 261 {
 262         int i;
 263         extern pri_t fx_getmaxumdpri(void);
 264 
 265         fx_dptbl = fx_getdptbl();
 266         fx_maxumdpri = fx_getmaxumdpri();
 267         fx_maxglobpri = fx_dptbl[fx_maxumdpri].fx_globpri;
 268 
 269         fx_cid = cid;           /* Record our class ID */
 270 
 271         /*
 272          * Initialize the hash table for fxprocs with callbacks
 273          */
 274         for (i = 0; i < FX_CB_LISTS; i++) {
 275                 fx_cb_plisthead[i].fx_cb_next = fx_cb_plisthead[i].fx_cb_prev =
 276                     &fx_cb_plisthead[i];
 277         }
 278 
 279         /*
 280          * We're required to return a pointer to our classfuncs
 281          * structure and the highest global priority value we use.
 282          */
 283         *clfuncspp = &fx_classfuncs;
 284         return (fx_maxglobpri);
 285 }
 286 
 287 /*
 288  * Get or reset the fx_dptbl values per the user's request.
 289  */
 290 static int
 291 fx_admin(caddr_t uaddr, cred_t *reqpcredp)
 292 {
 293         fxadmin_t       fxadmin;
 294         fxdpent_t       *tmpdpp;
 295         int             userdpsz;
 296         int             i;
 297         size_t          fxdpsz;
 298 
 299         if (get_udatamodel() == DATAMODEL_NATIVE) {
 300                 if (copyin(uaddr, &fxadmin, sizeof (fxadmin_t)))
 301                         return (EFAULT);
 302         }
 303 #ifdef _SYSCALL32_IMPL
 304         else {
 305                 /* get fxadmin struct from ILP32 caller */
 306                 fxadmin32_t fxadmin32;
 307                 if (copyin(uaddr, &fxadmin32, sizeof (fxadmin32_t)))
 308                         return (EFAULT);
 309                 fxadmin.fx_dpents =
 310                     (struct fxdpent *)(uintptr_t)fxadmin32.fx_dpents;
 311                 fxadmin.fx_ndpents = fxadmin32.fx_ndpents;
 312                 fxadmin.fx_cmd = fxadmin32.fx_cmd;
 313         }
 314 #endif /* _SYSCALL32_IMPL */
 315 
 316         fxdpsz = (fx_maxumdpri + 1) * sizeof (fxdpent_t);
 317 
 318         switch (fxadmin.fx_cmd) {
 319         case FX_GETDPSIZE:
 320                 fxadmin.fx_ndpents = fx_maxumdpri + 1;
 321 
 322                 if (get_udatamodel() == DATAMODEL_NATIVE) {
 323                         if (copyout(&fxadmin, uaddr, sizeof (fxadmin_t)))
 324                                 return (EFAULT);
 325                 }
 326 #ifdef _SYSCALL32_IMPL
 327                 else {
 328                         /* return fxadmin struct to ILP32 caller */
 329                         fxadmin32_t fxadmin32;
 330                         fxadmin32.fx_dpents =
 331                             (caddr32_t)(uintptr_t)fxadmin.fx_dpents;
 332                         fxadmin32.fx_ndpents = fxadmin.fx_ndpents;
 333                         fxadmin32.fx_cmd = fxadmin.fx_cmd;
 334                         if (copyout(&fxadmin32, uaddr, sizeof (fxadmin32_t)))
 335                                 return (EFAULT);
 336                 }
 337 #endif /* _SYSCALL32_IMPL */
 338                 break;
 339 
 340         case FX_GETDPTBL:
 341                 userdpsz = MIN(fxadmin.fx_ndpents * sizeof (fxdpent_t),
 342                     fxdpsz);
 343                 if (copyout(fx_dptbl, fxadmin.fx_dpents, userdpsz))
 344                         return (EFAULT);
 345 
 346                 fxadmin.fx_ndpents = userdpsz / sizeof (fxdpent_t);
 347 
 348                 if (get_udatamodel() == DATAMODEL_NATIVE) {
 349                         if (copyout(&fxadmin, uaddr, sizeof (fxadmin_t)))
 350                                 return (EFAULT);
 351                 }
 352 #ifdef _SYSCALL32_IMPL
 353                 else {
 354                         /* return fxadmin struct to ILP32 callers */
 355                         fxadmin32_t fxadmin32;
 356                         fxadmin32.fx_dpents =
 357                             (caddr32_t)(uintptr_t)fxadmin.fx_dpents;
 358                         fxadmin32.fx_ndpents = fxadmin.fx_ndpents;
 359                         fxadmin32.fx_cmd = fxadmin.fx_cmd;
 360                         if (copyout(&fxadmin32, uaddr, sizeof (fxadmin32_t)))
 361                                 return (EFAULT);
 362                 }
 363 #endif /* _SYSCALL32_IMPL */
 364                 break;
 365 
 366         case FX_SETDPTBL:
 367                 /*
 368                  * We require that the requesting process has sufficient
 369                  * privileges. We also require that the table supplied by
 370                  * the user exactly match the current fx_dptbl in size.
 371                  */
 372                 if (secpolicy_dispadm(reqpcredp) != 0) {
 373                         return (EPERM);
 374                 }
 375                 if (fxadmin.fx_ndpents * sizeof (fxdpent_t) != fxdpsz) {
 376                         return (EINVAL);
 377                 }
 378 
 379                 /*
 380                  * We read the user supplied table into a temporary buffer
 381                  * where it is validated before being copied over the
 382                  * fx_dptbl.
 383                  */
 384                 tmpdpp = kmem_alloc(fxdpsz, KM_SLEEP);
 385                 if (copyin(fxadmin.fx_dpents, tmpdpp, fxdpsz)) {
 386                         kmem_free(tmpdpp, fxdpsz);
 387                         return (EFAULT);
 388                 }
 389                 for (i = 0; i < fxadmin.fx_ndpents; i++) {
 390 
 391                         /*
 392                          * Validate the user supplied values. All we are doing
 393                          * here is verifying that the values are within their
 394                          * allowable ranges and will not panic the system. We
 395                          * make no attempt to ensure that the resulting
 396                          * configuration makes sense or results in reasonable
 397                          * performance.
 398                          */
 399                         if (tmpdpp[i].fx_quantum <= 0 &&
 400                             tmpdpp[i].fx_quantum != FX_TQINF) {
 401                                 kmem_free(tmpdpp, fxdpsz);
 402                                 return (EINVAL);
 403                         }
 404                 }
 405 
 406                 /*
 407                  * Copy the user supplied values over the current fx_dptbl
 408                  * values. The fx_globpri member is read-only so we don't
 409                  * overwrite it.
 410                  */
 411                 mutex_enter(&fx_dptblock);
 412                 for (i = 0; i < fxadmin.fx_ndpents; i++) {
 413                         fx_dptbl[i].fx_quantum = tmpdpp[i].fx_quantum;
 414                 }
 415                 mutex_exit(&fx_dptblock);
 416                 kmem_free(tmpdpp, fxdpsz);
 417                 break;
 418 
 419         default:
 420                 return (EINVAL);
 421         }
 422         return (0);
 423 }
 424 
 425 /*
 426  * Initialize the fixed-priority class specific thread structure
 427  * (supplied in bufp) with the parameters given. Also move the thread
 428  * to the specified priority.
 429  */
 430 static int
 431 fx_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp,
 432     void *bufp)
 433 {
 434         fxkparms_t      *fxkparmsp = (fxkparms_t *)parmsp;
 435         fxproc_t        *fxpp;
 436         pri_t           reqfxupri;
 437         pri_t           reqfxuprilim;
 438 
 439         fxpp = (fxproc_t *)bufp;
 440         ASSERT(fxpp != NULL);
 441 
 442         /*
 443          * Initialize the fxproc structure.
 444          */
 445         fxpp->fx_flags = 0;
 446         fxpp->fx_callback = NULL;
 447         fxpp->fx_cookie = NULL;
 448 
 449         if (fxkparmsp == NULL) {
 450                 /*
 451                  * Use default values.
 452                  */
 453                 fxpp->fx_pri = fxpp->fx_uprilim = 0;
 454                 fxpp->fx_pquantum = fx_dptbl[fxpp->fx_pri].fx_quantum;
 455                 fxpp->fx_nice =  NZERO;
 456         } else {
 457                 /*
 458                  * Use supplied values.
 459                  */
 460 
 461                 if ((fxkparmsp->fx_cflags & FX_DOUPRILIM) == 0) {
 462                         reqfxuprilim = 0;
 463                 } else {
 464                         if (fxkparmsp->fx_uprilim > FX_MAX_UNPRIV_PRI &&
 465                             secpolicy_setpriority(reqpcredp) != 0)
 466                                 return (EPERM);
 467                         reqfxuprilim = fxkparmsp->fx_uprilim;
 468                         FX_ADJUST_PRI(reqfxuprilim);
 469                 }
 470 
 471                 if ((fxkparmsp->fx_cflags & FX_DOUPRI) == 0) {
 472                         reqfxupri = reqfxuprilim;
 473                 } else {
 474                         if (fxkparmsp->fx_upri > FX_MAX_UNPRIV_PRI &&
 475                             secpolicy_setpriority(reqpcredp) != 0)
 476                                 return (EPERM);
 477                         /*
 478                          * Set the user priority to the requested value
 479                          * or the upri limit, whichever is lower.
 480                          */
 481                         reqfxupri = fxkparmsp->fx_upri;
 482                         FX_ADJUST_PRI(reqfxupri);
 483 
 484                         if (reqfxupri > reqfxuprilim)
 485                                 reqfxupri = reqfxuprilim;
 486                 }
 487 
 488 
 489                 fxpp->fx_uprilim = reqfxuprilim;
 490                 fxpp->fx_pri = reqfxupri;
 491 
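                /*
                 * A worked example, assuming the stock values of NZERO (20)
                 * and fx_maxupri (FXMAXUPRI, 60): upri 0 maps to nice 20 and
                 * upri 60 maps to nice 0.
                 */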
 492                 fxpp->fx_nice = NZERO - (NZERO * reqfxupri) / fx_maxupri;
 493 
 494                 if (((fxkparmsp->fx_cflags & FX_DOTQ) == 0) ||
 495                     (fxkparmsp->fx_tqntm == FX_TQDEF)) {
 496                         fxpp->fx_pquantum = fx_dptbl[fxpp->fx_pri].fx_quantum;
 497                 } else {
 498                         if (secpolicy_setpriority(reqpcredp) != 0)
 499                                 return (EPERM);
 500 
 501                         if (fxkparmsp->fx_tqntm == FX_TQINF)
 502                                 fxpp->fx_pquantum = FX_TQINF;
 503                         else {
 504                                 fxpp->fx_pquantum = fxkparmsp->fx_tqntm;
 505                         }
 506                 }
 507 
 508         }
 509 
 510         fxpp->fx_timeleft = fxpp->fx_pquantum;
 511         cpucaps_sc_init(&fxpp->fx_caps);
 512         fxpp->fx_tp = t;
 513 
 514         thread_lock(t);                 /* get dispatcher lock on thread */
 515         t->t_clfuncs = &(sclass[cid].cl_funcs->thread);
 516         t->t_cid = cid;
 517         t->t_cldata = (void *)fxpp;
 518         t->t_schedflag &= ~TS_RUNQMATCH;
 519         fx_change_priority(t, fxpp);
 520         thread_unlock(t);
 521 
 522         return (0);
 523 }
 524 
 525 /*
 526  * The thread is exiting.
 527  */
 528 static void
 529 fx_exit(kthread_t *t)
 530 {
 531         fxproc_t *fxpp;
 532 
 533         thread_lock(t);
 534         fxpp = (fxproc_t *)(t->t_cldata);
 535 
 536         /*
 537          * A thread could be exiting in between clock ticks, so we need to
 538          * calculate how much CPU time it used since it was charged last time.
 539          *
 540          * CPU caps are not enforced on exiting processes - it is usually
 541          * desirable to exit as soon as possible to free resources.
 542          */
 543         (void) CPUCAPS_CHARGE(t, &fxpp->fx_caps, CPUCAPS_CHARGE_ONLY);
 544 
 545         if (FX_HAS_CB(fxpp)) {
 546                 FX_CB_EXIT(FX_CALLB(fxpp), fxpp->fx_cookie);
 547                 fxpp->fx_callback = NULL;
 548                 fxpp->fx_cookie = NULL;
 549                 thread_unlock(t);
 550                 FX_CB_LIST_DELETE(fxpp);
 551                 return;
 552         }
 553 
 554         thread_unlock(t);
 555 }
 556 
 557 /*
 558  * Exiting the class. Free fxproc structure of thread.
 559  */
 560 static void
 561 fx_exitclass(void *procp)
 562 {
 563         fxproc_t *fxpp = (fxproc_t *)procp;
 564 
 565         thread_lock(fxpp->fx_tp);
 566         if (FX_HAS_CB(fxpp)) {
 567 
 568                 FX_CB_EXIT(FX_CALLB(fxpp), fxpp->fx_cookie);
 569 
 570                 fxpp->fx_callback = NULL;
 571                 fxpp->fx_cookie = NULL;
 572                 thread_unlock(fxpp->fx_tp);
 573                 FX_CB_LIST_DELETE(fxpp);
 574         } else
 575                 thread_unlock(fxpp->fx_tp);
 576 
 577         kmem_free(fxpp, sizeof (fxproc_t));
 578 }
 579 
 580 /* ARGSUSED */
 581 static int
 582 fx_canexit(kthread_t *t, cred_t *cred)
 583 {
 584         /*
 585          * A thread can always leave the FX class.
 586          */
 587         return (0);
 588 }
 589 
 590 /*
 591  * Initialize fixed-priority class specific proc structure for a child.
 592  * Callbacks are not inherited upon fork.
 593  */
 594 static int
 595 fx_fork(kthread_t *t, kthread_t *ct, void *bufp)
 596 {
 597         fxproc_t        *pfxpp;         /* ptr to parent's fxproc structure */
 598         fxproc_t        *cfxpp;         /* ptr to child's fxproc structure */
 599 
 600         ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));
 601 
 602         cfxpp = (fxproc_t *)bufp;
 603         ASSERT(cfxpp != NULL);
 604         thread_lock(t);
 605         pfxpp = (fxproc_t *)t->t_cldata;
 606         /*
 607          * Initialize child's fxproc structure.
 608          */
 609         cfxpp->fx_timeleft = cfxpp->fx_pquantum = pfxpp->fx_pquantum;
 610         cfxpp->fx_pri = pfxpp->fx_pri;
 611         cfxpp->fx_uprilim = pfxpp->fx_uprilim;
 612         cfxpp->fx_nice = pfxpp->fx_nice;
 613         cfxpp->fx_callback = NULL;
 614         cfxpp->fx_cookie = NULL;
 615         cfxpp->fx_flags = pfxpp->fx_flags & ~(FXBACKQ);
 616         cpucaps_sc_init(&cfxpp->fx_caps);
 617 
 618         cfxpp->fx_tp = ct;
 619         ct->t_cldata = (void *)cfxpp;
 620         thread_unlock(t);
 621 
 622         /*
 623          * Callbacks are not inherited, so there is nothing to link here.
 624          */
 625         return (0);
 626 }
 627 
 628 
 629 /*
 630  * The child is placed at the back of the dispatcher queue and the parent
 631  * gives up the processor so that the child runs first after the fork.
 632  * This allows a child that execs immediately to break the shared use of
 633  * copy-on-write pages that have no disk home. The parent will then get
 634  * to steal them back rather than uselessly copying them.
 635  */
 636 static void
 637 fx_forkret(kthread_t *t, kthread_t *ct)
 638 {
 639         proc_t  *pp = ttoproc(t);
 640         proc_t  *cp = ttoproc(ct);
 641         fxproc_t *fxpp;
 642 
 643         ASSERT(t == curthread);
 644         ASSERT(MUTEX_HELD(&pidlock));
 645 
 646         /*
 647          * Grab the child's p_lock before dropping pidlock to ensure
 648          * the process does not disappear before we set it running.
 649          */
 650         mutex_enter(&cp->p_lock);
 651         continuelwps(cp);
 652         mutex_exit(&cp->p_lock);
 653 
 654         mutex_enter(&pp->p_lock);
 655         mutex_exit(&pidlock);
 656         continuelwps(pp);
 657 
 658         thread_lock(t);
 659         fxpp = (fxproc_t *)(t->t_cldata);
 660         t->t_pri = fx_dptbl[fxpp->fx_pri].fx_globpri;
 661         ASSERT(t->t_pri >= 0 && t->t_pri <= fx_maxglobpri);
 662         THREAD_TRANSITION(t);
 663         fx_setrun(t);
 664         thread_unlock(t);
 665         /*
 666          * Safe to drop p_lock now since it is safe to change
 667          * the scheduling class after this point.
 668          */
 669         mutex_exit(&pp->p_lock);
 670 
 671         swtch();
 672 }
 673 
 674 
 675 /*
 676  * Get information about the fixed-priority class into the buffer
 677  * pointed to by fxinfop. The maximum configured user priority
 678  * is the only information we supply.
 679  */
 680 static int
 681 fx_getclinfo(void *infop)
 682 {
 683         fxinfo_t *fxinfop = (fxinfo_t *)infop;
 684         fxinfop->fx_maxupri = fx_maxupri;
 685         return (0);
 686 }
 687 
 688 
 689 
 690 /*
 691  * Return the user mode scheduling priority range.
 692  */
 693 static int
 694 fx_getclpri(pcpri_t *pcprip)
 695 {
 696         pcprip->pc_clpmax = fx_maxupri;
 697         pcprip->pc_clpmin = 0;
 698         return (0);
 699 }
 700 
 701 
 702 static void
 703 fx_nullsys()
 704 {}
 705 
 706 
 707 /*
 708  * Get the fixed-priority parameters of thread t into the buffer
 709  * pointed to by parmsp.
 710  */
 711 static void
 712 fx_parmsget(kthread_t *t, void *parmsp)
 713 {
 714         fxproc_t *fxpp = (fxproc_t *)t->t_cldata;
 715         fxkparms_t *fxkparmsp = (fxkparms_t *)parmsp;
 716 
 717         fxkparmsp->fx_upri = fxpp->fx_pri;
 718         fxkparmsp->fx_uprilim = fxpp->fx_uprilim;
 719         fxkparmsp->fx_tqntm = fxpp->fx_pquantum;
 720 }
 721 
 722 
 723 
 724 /*
 725  * Check the validity of the fixed-priority parameters in the buffer
 726  * pointed to by fxparmsp.
 727  */
 728 static int
 729 fx_parmsin(void *parmsp)
 730 {
 731         fxparms_t       *fxparmsp = (fxparms_t *)parmsp;
 732         uint_t          cflags;
 733         longlong_t      ticks;
 734         /*
 735          * Check validity of parameters.
 736          */
 737 
 738         if ((fxparmsp->fx_uprilim > fx_maxupri ||
 739             fxparmsp->fx_uprilim < 0) &&
 740             fxparmsp->fx_uprilim != FX_NOCHANGE)
 741                 return (EINVAL);
 742 
 743         if ((fxparmsp->fx_upri > fx_maxupri ||
 744             fxparmsp->fx_upri < 0) &&
 745             fxparmsp->fx_upri != FX_NOCHANGE)
 746                 return (EINVAL);
 747 
 748         if ((fxparmsp->fx_tqsecs == 0 && fxparmsp->fx_tqnsecs == 0) ||
 749             fxparmsp->fx_tqnsecs >= NANOSEC)
 750                 return (EINVAL);
 751 
 752         cflags = (fxparmsp->fx_upri != FX_NOCHANGE ? FX_DOUPRI : 0);
 753 
 754         if (fxparmsp->fx_uprilim != FX_NOCHANGE) {
 755                 cflags |= FX_DOUPRILIM;
 756         }
 757 
 758         if (fxparmsp->fx_tqnsecs != FX_NOCHANGE)
 759                 cflags |= FX_DOTQ;
 760 
 761         /*
 762          * convert the buffer to kernel format.
 763          */
 764 
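        /*
         * A worked example, assuming the common default of hz = 100 (one
         * tick every 10ms): a request of 1 second plus 500000 nanoseconds
         * converts to 100 ticks plus 1 tick (rounded up), i.e. 101 ticks.
         */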
 765         if (fxparmsp->fx_tqnsecs >= 0) {
 766                 if ((ticks = SEC_TO_TICK((longlong_t)fxparmsp->fx_tqsecs) +
 767                     NSEC_TO_TICK_ROUNDUP(fxparmsp->fx_tqnsecs)) > INT_MAX)
 768                         return (ERANGE);
 769 
 770                 ((fxkparms_t *)fxparmsp)->fx_tqntm = (int)ticks;
 771         } else {
 772                 if ((fxparmsp->fx_tqnsecs != FX_NOCHANGE) &&
 773                     (fxparmsp->fx_tqnsecs != FX_TQINF) &&
 774                     (fxparmsp->fx_tqnsecs != FX_TQDEF))
 775                         return (EINVAL);
 776                 ((fxkparms_t *)fxparmsp)->fx_tqntm = fxparmsp->fx_tqnsecs;
 777         }
 778 
 779         ((fxkparms_t *)fxparmsp)->fx_cflags = cflags;
 780 
 781         return (0);
 782 }
 783 
 784 
 785 /*
 786  * Check the validity of the fixed-priority parameters in the pc_vaparms_t
 787  * structure vaparmsp and put them in the buffer pointed to by fxprmsp.
 788  * pc_vaparms_t contains (key, value) parameter pairs.
 789  */
 790 static int
 791 fx_vaparmsin(void *prmsp, pc_vaparms_t *vaparmsp)
 792 {
 793         uint_t          secs = 0;
 794         uint_t          cnt;
 795         int             nsecs = 0;
 796         int             priflag, secflag, nsecflag, limflag;
 797         longlong_t      ticks;
 798         fxkparms_t      *fxprmsp = (fxkparms_t *)prmsp;
 799         pc_vaparm_t     *vpp = &vaparmsp->pc_parms[0];
 800 
 801 
 802         /*
 803          * First check the validity of parameters and convert them
 804          * from the user supplied format to the internal format.
 805          */
 806         priflag = secflag = nsecflag = limflag = 0;
 807 
 808         fxprmsp->fx_cflags = 0;
 809 
 810         if (vaparmsp->pc_vaparmscnt > PC_VAPARMCNT)
 811                 return (EINVAL);
 812 
 813         for (cnt = 0; cnt < vaparmsp->pc_vaparmscnt; cnt++, vpp++) {
 814 
 815                 switch (vpp->pc_key) {
 816                 case FX_KY_UPRILIM:
 817                         if (limflag++)
 818                                 return (EINVAL);
 819                         fxprmsp->fx_cflags |= FX_DOUPRILIM;
 820                         fxprmsp->fx_uprilim = (pri_t)vpp->pc_parm;
 821                         if (fxprmsp->fx_uprilim > fx_maxupri ||
 822                             fxprmsp->fx_uprilim < 0)
 823                                 return (EINVAL);
 824                         break;
 825 
 826                 case FX_KY_UPRI:
 827                         if (priflag++)
 828                                 return (EINVAL);
 829                         fxprmsp->fx_cflags |= FX_DOUPRI;
 830                         fxprmsp->fx_upri = (pri_t)vpp->pc_parm;
 831                         if (fxprmsp->fx_upri > fx_maxupri ||
 832                             fxprmsp->fx_upri < 0)
 833                                 return (EINVAL);
 834                         break;
 835 
 836                 case FX_KY_TQSECS:
 837                         if (secflag++)
 838                                 return (EINVAL);
 839                         fxprmsp->fx_cflags |= FX_DOTQ;
 840                         secs = (uint_t)vpp->pc_parm;
 841                         break;
 842 
 843                 case FX_KY_TQNSECS:
 844                         if (nsecflag++)
 845                                 return (EINVAL);
 846                         fxprmsp->fx_cflags |= FX_DOTQ;
 847                         nsecs = (int)vpp->pc_parm;
 848                         break;
 849 
 850                 default:
 851                         return (EINVAL);
 852                 }
 853         }
 854 
 855         if (vaparmsp->pc_vaparmscnt == 0) {
 856                 /*
 857                  * Use default parameters.
 858                  */
 859                 fxprmsp->fx_upri = 0;
 860                 fxprmsp->fx_uprilim = 0;
 861                 fxprmsp->fx_tqntm = FX_TQDEF;
 862                 fxprmsp->fx_cflags = FX_DOUPRI | FX_DOUPRILIM | FX_DOTQ;
 863         } else if ((fxprmsp->fx_cflags & FX_DOTQ) != 0) {
 864                 if ((secs == 0 && nsecs == 0) || nsecs >= NANOSEC)
 865                         return (EINVAL);
 866 
 867                 if (nsecs >= 0) {
 868                         if ((ticks = SEC_TO_TICK((longlong_t)secs) +
 869                             NSEC_TO_TICK_ROUNDUP(nsecs)) > INT_MAX)
 870                                 return (ERANGE);
 871 
 872                         fxprmsp->fx_tqntm = (int)ticks;
 873                 } else {
 874                         if (nsecs != FX_TQINF && nsecs != FX_TQDEF)
 875                                 return (EINVAL);
 876                         fxprmsp->fx_tqntm = nsecs;
 877                 }
 878         }
 879 
 880         return (0);
 881 }
 882 
 883 
 884 /*
 885  * Convert the kernel-format time quantum back to user-visible form.
 886  */
 887 /* ARGSUSED */
 888 static int
 889 fx_parmsout(void *parmsp, pc_vaparms_t *vaparmsp)
 890 {
 891         register fxkparms_t     *fxkprmsp = (fxkparms_t *)parmsp;
 892 
 893         if (vaparmsp != NULL)
 894                 return (0);
 895 
 896         if (fxkprmsp->fx_tqntm < 0) {
 897                 /*
 898                  * Quantum field set to special value (e.g. FX_TQINF)
 899                  */
 900                 ((fxparms_t *)fxkprmsp)->fx_tqnsecs = fxkprmsp->fx_tqntm;
 901                 ((fxparms_t *)fxkprmsp)->fx_tqsecs = 0;
 902 
 903         } else {
 904                 /* Convert quantum from ticks to seconds-nanoseconds */
 905 
 906                 timestruc_t ts;
 907                 TICK_TO_TIMESTRUC(fxkprmsp->fx_tqntm, &ts);
 908                 ((fxparms_t *)fxkprmsp)->fx_tqsecs = ts.tv_sec;
 909                 ((fxparms_t *)fxkprmsp)->fx_tqnsecs = ts.tv_nsec;
 910         }
 911 
 912         return (0);
 913 }
 914 
 915 
 916 /*
 917  * Copy all selected fixed-priority class parameters to the user.
 918  * The parameters are specified by a key.
 919  */
 920 static int
 921 fx_vaparmsout(void *prmsp, pc_vaparms_t *vaparmsp)
 922 {
 923         fxkparms_t      *fxkprmsp = (fxkparms_t *)prmsp;
 924         timestruc_t     ts;
 925         uint_t          cnt;
 926         uint_t          secs;
 927         int             nsecs;
 928         int             priflag, secflag, nsecflag, limflag;
 929         pc_vaparm_t     *vpp = &vaparmsp->pc_parms[0];
 930 
 931         ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
 932 
 933         priflag = secflag = nsecflag = limflag = 0;
 934 
 935         if (vaparmsp->pc_vaparmscnt > PC_VAPARMCNT)
 936                 return (EINVAL);
 937 
 938         if (fxkprmsp->fx_tqntm < 0) {
 939                 /*
 940                  * Quantum field set to special value (e.g. FX_TQINF).
 941                  */
 942                 secs = 0;
 943                 nsecs = fxkprmsp->fx_tqntm;
 944         } else {
 945                 /*
 946                  * Convert quantum from ticks to seconds-nanoseconds.
 947                  */
 948                 TICK_TO_TIMESTRUC(fxkprmsp->fx_tqntm, &ts);
 949                 secs = ts.tv_sec;
 950                 nsecs = ts.tv_nsec;
 951         }
 952 
 953 
 954         for (cnt = 0; cnt < vaparmsp->pc_vaparmscnt; cnt++, vpp++) {
 955 
 956                 switch (vpp->pc_key) {
 957                 case FX_KY_UPRILIM:
 958                         if (limflag++)
 959                                 return (EINVAL);
 960                         if (copyout(&fxkprmsp->fx_uprilim,
 961                             (void *)(uintptr_t)vpp->pc_parm, sizeof (pri_t)))
 962                                 return (EFAULT);
 963                         break;
 964 
 965                 case FX_KY_UPRI:
 966                         if (priflag++)
 967                                 return (EINVAL);
 968                         if (copyout(&fxkprmsp->fx_upri,
 969                             (void *)(uintptr_t)vpp->pc_parm, sizeof (pri_t)))
 970                                 return (EFAULT);
 971                         break;
 972 
 973                 case FX_KY_TQSECS:
 974                         if (secflag++)
 975                                 return (EINVAL);
 976                         if (copyout(&secs,
 977                             (void *)(uintptr_t)vpp->pc_parm, sizeof (uint_t)))
 978                                 return (EFAULT);
 979                         break;
 980 
 981                 case FX_KY_TQNSECS:
 982                         if (nsecflag++)
 983                                 return (EINVAL);
 984                         if (copyout(&nsecs,
 985                             (void *)(uintptr_t)vpp->pc_parm, sizeof (int)))
 986                                 return (EFAULT);
 987                         break;
 988 
 989                 default:
 990                         return (EINVAL);
 991                 }
 992         }
 993 
 994         return (0);
 995 }
 996 
 997 /*
 998  * Set the scheduling parameters of thread tx to those specified in
 999  * the buffer pointed to by parmsp.
1000  */
1001 /* ARGSUSED */
1002 static int
1003 fx_parmsset(kthread_t *tx, void *parmsp, id_t reqpcid, cred_t *reqpcredp)
1004 {
1005         char            nice;
1006         pri_t           reqfxuprilim;
1007         pri_t           reqfxupri;
1008         fxkparms_t      *fxkparmsp = (fxkparms_t *)parmsp;
1009         fxproc_t        *fxpp;
1010 
1011 
1012         ASSERT(MUTEX_HELD(&(ttoproc(tx))->p_lock));
1013 
1014         thread_lock(tx);
1015         fxpp = (fxproc_t *)tx->t_cldata;
1016 
1017         if ((fxkparmsp->fx_cflags & FX_DOUPRILIM) == 0)
1018                 reqfxuprilim = fxpp->fx_uprilim;
1019         else
1020                 reqfxuprilim = fxkparmsp->fx_uprilim;
1021 
1022         /*
1023          * Basic permissions enforced by generic kernel code
1024          * for all classes require that a thread attempting
1025          * to change the scheduling parameters of a target
1026          * thread be privileged or have a real or effective
1027          * UID matching that of the target thread. We are not
1028          * called unless these basic permission checks have
1029  * already passed. The fixed-priority class additionally
1030  * requires that the calling thread be privileged if it
1031  * is attempting to raise the priority above its current
1032  * value. This may have been checked previously, but if our
1033  * caller passed us a non-NULL credential pointer we assume
1034  * it hasn't been and we check it here.
1035          */
1036 
1037         if ((reqpcredp != NULL) &&
1038             (reqfxuprilim > fxpp->fx_uprilim ||
1039             ((fxkparmsp->fx_cflags & FX_DOTQ) != 0)) &&
1040             secpolicy_raisepriority(reqpcredp) != 0) {
1041                 thread_unlock(tx);
1042                 return (EPERM);
1043         }
1044 
1045         FX_ADJUST_PRI(reqfxuprilim);
1046 
1047         if ((fxkparmsp->fx_cflags & FX_DOUPRI) == 0)
1048                 reqfxupri = fxpp->fx_pri;
1049         else
1050                 reqfxupri = fxkparmsp->fx_upri;
1051 
1052 
1053         /*
1054          * Make sure the user priority doesn't exceed the upri limit.
1055          */
1056         if (reqfxupri > reqfxuprilim)
1057                 reqfxupri = reqfxuprilim;
1058 
1059         /*
1060          * Set fx_nice to the nice value corresponding to the user
1061          * priority we are setting.  Note that setting the nice field
1062          * of the parameter struct won't affect upri or nice.
1063          */
1064 
1065         nice = NZERO - (reqfxupri * NZERO) / fx_maxupri;
1066 
1067         if (nice > NZERO)
1068                 nice = NZERO;
1069 
1070         fxpp->fx_uprilim = reqfxuprilim;
1071         fxpp->fx_pri = reqfxupri;
1072 
1073         if (fxkparmsp->fx_tqntm == FX_TQINF)
1074                 fxpp->fx_pquantum = FX_TQINF;
1075         else if (fxkparmsp->fx_tqntm == FX_TQDEF)
1076                 fxpp->fx_pquantum = fx_dptbl[fxpp->fx_pri].fx_quantum;
1077         else if ((fxkparmsp->fx_cflags & FX_DOTQ) != 0)
1078                 fxpp->fx_pquantum = fxkparmsp->fx_tqntm;
1079 
1080         fxpp->fx_nice = nice;
1081 
1082         fx_change_priority(tx, fxpp);
1083         thread_unlock(tx);
1084         return (0);
1085 }
1086 
1087 
1088 /*
1089  * Return the global scheduling priority corresponding to the
1090  * thread's current fixed-priority class user priority (fx_pri).
1091  */
1092 static pri_t
1093 fx_globpri(kthread_t *t)
1094 {
1095         fxproc_t *fxpp;
1096 
1097         ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));
1098 
1099         fxpp = (fxproc_t *)t->t_cldata;
1100         return (fx_dptbl[fxpp->fx_pri].fx_globpri);
1101 
1102 }
1103 
1104 /*
1105  * Arrange for thread to be placed in appropriate location
1106  * on dispatcher queue.
1107  *
1108  * This is called with the current thread in TS_ONPROC and locked.
1109  */
1110 static void
1111 fx_preempt(kthread_t *t)
1112 {
1113         fxproc_t        *fxpp = (fxproc_t *)(t->t_cldata);
1114 
1115         ASSERT(t == curthread);
1116         ASSERT(THREAD_LOCK_HELD(curthread));
1117 
1118         (void) CPUCAPS_CHARGE(t, &fxpp->fx_caps, CPUCAPS_CHARGE_ENFORCE);
1119 
1120         /*
1121          * Check to see if we're doing "preemption control" here.  If
1122          * we are, and if the user has requested that this thread not
1123          * be preempted, and if preemptions haven't been put off for
1124          * too long, let the preemption happen here but try to make
1125          * sure the thread is rescheduled as soon as possible.  We do
1126          * this by putting it on the front of the highest priority run
1127          * queue in the FX class.  If the preemption has been put off
1128          * for too long, clear the "nopreempt" bit and let the thread
1129          * be preempted.
1130          */
1131         if (t->t_schedctl && schedctl_get_nopreempt(t)) {
1132                 if (fxpp->fx_pquantum == FX_TQINF ||
1133                     fxpp->fx_timeleft > -SC_MAX_TICKS) {
1134                         DTRACE_SCHED1(schedctl__nopreempt, kthread_t *, t);
1135                         schedctl_set_yield(t, 1);
1136                         setfrontdq(t);
1137                         return;
1138                 } else {
1139                         schedctl_set_nopreempt(t, 0);
1140                         DTRACE_SCHED1(schedctl__preempt, kthread_t *, t);
1141                         TNF_PROBE_2(schedctl_preempt, "schedctl FX fx_preempt",
1142                             /* CSTYLED */, tnf_pid, pid, ttoproc(t)->p_pid,
1143                             tnf_lwpid, lwpid, t->t_tid);
1144                         /*
1145                          * Fall through and be preempted below.
1146                          */
1147                 }
1148         }
1149 
1150         if (FX_HAS_CB(fxpp)) {
1151                 clock_t new_quantum =  (clock_t)fxpp->fx_pquantum;
1152                 pri_t   newpri = fxpp->fx_pri;
1153                 FX_CB_PREEMPT(FX_CALLB(fxpp), fxpp->fx_cookie,
1154                     &new_quantum, &newpri);
1155                 FX_ADJUST_QUANTUM(new_quantum);
1156                 if ((int)new_quantum != fxpp->fx_pquantum) {
1157                         fxpp->fx_pquantum = (int)new_quantum;
1158                         fxpp->fx_timeleft = fxpp->fx_pquantum;
1159                 }
1160                 FX_ADJUST_PRI(newpri);
1161                 fxpp->fx_pri = newpri;
1162                 THREAD_CHANGE_PRI(t, fx_dptbl[fxpp->fx_pri].fx_globpri);
1163         }
1164 
1165         /*
1166          * This thread may be placed on wait queue by CPU Caps. In this case we
1167          * do not need to do anything until it is removed from the wait queue.
1168          */
1169         if (CPUCAPS_ENFORCE(t)) {
1170                 return;
1171         }
1172 
1173         if ((fxpp->fx_flags & (FXBACKQ)) == FXBACKQ) {
1174                 fxpp->fx_timeleft = fxpp->fx_pquantum;
1175                 fxpp->fx_flags &= ~FXBACKQ;
1176                 setbackdq(t);
1177         } else {
1178                 setfrontdq(t);
1179         }
1180 }
1181 
1182 static void
1183 fx_setrun(kthread_t *t)
1184 {
1185         fxproc_t *fxpp = (fxproc_t *)(t->t_cldata);
1186 
1187         ASSERT(THREAD_LOCK_HELD(t));    /* t should be in transition */
1188         fxpp->fx_flags &= ~FXBACKQ;
1189 
1190         if (t->t_disp_time != ddi_get_lbolt())
1191                 setbackdq(t);
1192         else
1193                 setfrontdq(t);
1194 }
1195 
1196 
1197 /*
1198  * Prepare thread for sleep. Charge CPU usage, notify any registered
1199  * callback, and time stamp the thread for the swapper.
1200  */
1201 static void
1202 fx_sleep(kthread_t *t)
1203 {
1204         fxproc_t        *fxpp = (fxproc_t *)(t->t_cldata);
1205 
1206         ASSERT(t == curthread);
1207         ASSERT(THREAD_LOCK_HELD(t));
1208 
1209         /*
1210          * Account for time spent on CPU before going to sleep.
1211          */
1212         (void) CPUCAPS_CHARGE(t, &fxpp->fx_caps, CPUCAPS_CHARGE_ENFORCE);
1213 
1214         if (FX_HAS_CB(fxpp)) {
1215                 FX_CB_SLEEP(FX_CALLB(fxpp), fxpp->fx_cookie);
1216         }
1217         t->t_stime = ddi_get_lbolt();                /* time stamp for the swapper */
1218 }
1219 
1220 
1221 /*
1222  * Return Values:
1223  *
1224  *      -1 if the thread is loaded or is not eligible to be swapped in.
1225  *
1226  * FX and RT threads are designed so that they don't swapout; however,
1227  * it is possible that while the thread is swapped out and in another class, it
1228  * can be changed to FX or RT.  Since these threads should be swapped in
1229  * as soon as they're runnable, rt_swapin returns SHRT_MAX, and fx_swapin
1230  * returns SHRT_MAX - 1, so that it gives deference to any swapped out
1231  * RT threads.
1232  */
1233 /* ARGSUSED */
1234 static pri_t
1235 fx_swapin(kthread_t *t, int flags)
1236 {
1237         pri_t   tpri = -1;
1238 
1239         ASSERT(THREAD_LOCK_HELD(t));
1240 
1241         if (t->t_state == TS_RUN && (t->t_schedflag & TS_LOAD) == 0) {
1242                 tpri = (pri_t)SHRT_MAX - 1;
1243         }
1244 
1245         return (tpri);
1246 }
1247 
1248 /*
1249  * Return Values
1250  *      -1 if the thread isn't loaded or is not eligible to be swapped out.
1251  */
1252 /* ARGSUSED */
1253 static pri_t
1254 fx_swapout(kthread_t *t, int flags)
1255 {
1256         ASSERT(THREAD_LOCK_HELD(t));
1257 
1258         return (-1);
1259 
1260 }
1261 
1262 /* ARGSUSED */
1263 static void
1264 fx_stop(kthread_t *t, int why, int what)
1265 {
1266         fxproc_t *fxpp = (fxproc_t *)(t->t_cldata);
1267 
1268         ASSERT(THREAD_LOCK_HELD(t));
1269 
1270         if (FX_HAS_CB(fxpp)) {
1271                 FX_CB_STOP(FX_CALLB(fxpp), fxpp->fx_cookie);
1272         }
1273 }
1274 
1275 /*
1276  * Check for time slice expiration.  If the time slice has expired,
1277  * set runrun to cause preemption.
1278  */
1279 static void
1280 fx_tick(kthread_t *t)
1281 {
1282         boolean_t call_cpu_surrender = B_FALSE;
1283         fxproc_t *fxpp;
1284 
1285         ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock));
1286 
1287         thread_lock(t);
1288 
1289         fxpp = (fxproc_t *)(t->t_cldata);
1290 
1291         if (FX_HAS_CB(fxpp)) {
1292                 clock_t new_quantum =  (clock_t)fxpp->fx_pquantum;
1293                 pri_t   newpri = fxpp->fx_pri;
1294                 FX_CB_TICK(FX_CALLB(fxpp), fxpp->fx_cookie,
1295                     &new_quantum, &newpri);
1296                 FX_ADJUST_QUANTUM(new_quantum);
1297                 if ((int)new_quantum != fxpp->fx_pquantum) {
1298                         fxpp->fx_pquantum = (int)new_quantum;
1299                         fxpp->fx_timeleft = fxpp->fx_pquantum;
1300                 }
1301                 FX_ADJUST_PRI(newpri);
1302                 if (newpri != fxpp->fx_pri) {
1303                         fxpp->fx_pri = newpri;
1304                         fx_change_priority(t, fxpp);
1305                 }
1306         }
1307 
1308         /*
1309          * Keep track of thread's project CPU usage.  Note that projects
1310          * get charged even when threads are running in the kernel.
1311          */
1312         call_cpu_surrender =  CPUCAPS_CHARGE(t, &fxpp->fx_caps,
1313             CPUCAPS_CHARGE_ENFORCE);
1314 
1315         if ((fxpp->fx_pquantum != FX_TQINF) &&
1316             (--fxpp->fx_timeleft <= 0)) {
1317                 pri_t   new_pri;
1318 
1319                 /*
1320                  * If we're doing preemption control and trying to
1321                  * avoid preempting this thread, just note that
1322                  * the thread should yield soon and let it keep
1323                  * running (unless it's been a while).
1324                  */
1325                 if (t->t_schedctl && schedctl_get_nopreempt(t)) {
1326                         if (fxpp->fx_timeleft > -SC_MAX_TICKS) {
1327                                 DTRACE_SCHED1(schedctl__nopreempt,
1328                                     kthread_t *, t);
1329                                 schedctl_set_yield(t, 1);
1330                                 thread_unlock_nopreempt(t);
1331                                 return;
1332                         }
1333                         TNF_PROBE_2(schedctl_failsafe,
1334                             "schedctl FX fx_tick", /* CSTYLED */,
1335                             tnf_pid, pid, ttoproc(t)->p_pid,
1336                             tnf_lwpid, lwpid, t->t_tid);
1337                 }
1338                 new_pri = fx_dptbl[fxpp->fx_pri].fx_globpri;
1339                 ASSERT(new_pri >= 0 && new_pri <= fx_maxglobpri);
1340                 /*
1341                  * When the priority of a thread is changed,
1342                  * it may be necessary to adjust its position
1343                  * on a sleep queue or dispatch queue. Even
1344                  * when the priority is not changed, we need
1345                  * to preserve round robin on dispatch queue.
1346                  * The function thread_change_pri accomplishes
1347                  * this.
1348                  */
1349                 if (thread_change_pri(t, new_pri, 0)) {
1350                         fxpp->fx_timeleft = fxpp->fx_pquantum;
1351                 } else {
1352                         call_cpu_surrender = B_TRUE;
1353                 }
1354         } else if (t->t_state == TS_ONPROC &&
1355             t->t_pri < t->t_disp_queue->disp_maxrunpri) {
1356                 call_cpu_surrender = B_TRUE;
1357         }
1358 
1359         if (call_cpu_surrender) {
1360                 fxpp->fx_flags |= FXBACKQ;
1361                 cpu_surrender(t);
1362         }
1363         thread_unlock_nopreempt(t);     /* clock thread can't be preempted */
1364 }
1365 
1366 
1367 static void
1368 fx_trapret(kthread_t *t)
1369 {
1370         cpu_t           *cp = CPU;
1371 
1372         ASSERT(THREAD_LOCK_HELD(t));
1373         ASSERT(t == curthread);
1374         ASSERT(cp->cpu_dispthread == t);
1375         ASSERT(t->t_state == TS_ONPROC);
1376 }
1377 
1378 
1379 /*
1380  * Threads waking up go to the back of their dispatch queue.
1381  */
1382 static void
1383 fx_wakeup(kthread_t *t)
1384 {
1385         fxproc_t        *fxpp = (fxproc_t *)(t->t_cldata);
1386 
1387         ASSERT(THREAD_LOCK_HELD(t));
1388 
1389         t->t_stime = ddi_get_lbolt();                /* time stamp for the swapper */
1390         if (FX_HAS_CB(fxpp)) {
1391                 clock_t new_quantum =  (clock_t)fxpp->fx_pquantum;
1392                 pri_t   newpri = fxpp->fx_pri;
1393                 FX_CB_WAKEUP(FX_CALLB(fxpp), fxpp->fx_cookie,
1394                     &new_quantum, &newpri);
1395                 FX_ADJUST_QUANTUM(new_quantum);
1396                 if ((int)new_quantum != fxpp->fx_pquantum) {
1397                         fxpp->fx_pquantum = (int)new_quantum;
1398                         fxpp->fx_timeleft = fxpp->fx_pquantum;
1399                 }
1400 
1401                 FX_ADJUST_PRI(newpri);
1402                 if (newpri != fxpp->fx_pri) {
1403                         fxpp->fx_pri = newpri;
1404                         THREAD_CHANGE_PRI(t, fx_dptbl[fxpp->fx_pri].fx_globpri);
1405                 }
1406         }
1407 
1408         fxpp->fx_flags &= ~FXBACKQ;
1409 
1410         if (t->t_disp_time != ddi_get_lbolt())
1411                 setbackdq(t);
1412         else
1413                 setfrontdq(t);
1414 }
1415 
1416 
1417 /*
1418  * When a thread yields, put it on the back of the run queue.
1419  */
1420 static void
1421 fx_yield(kthread_t *t)
1422 {
1423         fxproc_t        *fxpp = (fxproc_t *)(t->t_cldata);
1424 
1425         ASSERT(t == curthread);
1426         ASSERT(THREAD_LOCK_HELD(t));
1427 
1428         /*
1429          * Collect CPU usage spent before yielding CPU.
1430          */
1431         (void) CPUCAPS_CHARGE(t, &fxpp->fx_caps, CPUCAPS_CHARGE_ENFORCE);
1432 
1433         if (FX_HAS_CB(fxpp))  {
1434                 clock_t new_quantum =  (clock_t)fxpp->fx_pquantum;
1435                 pri_t   newpri = fxpp->fx_pri;
1436                 FX_CB_PREEMPT(FX_CALLB(fxpp), fxpp->fx_cookie,
1437                     &new_quantum, &newpri);
1438                 FX_ADJUST_QUANTUM(new_quantum);
1439                 if ((int)new_quantum != fxpp->fx_pquantum) {
1440                         fxpp->fx_pquantum = (int)new_quantum;
1441                         fxpp->fx_timeleft = fxpp->fx_pquantum;
1442                 }
1443                 FX_ADJUST_PRI(newpri);
1444                 fxpp->fx_pri = newpri;
1445                 THREAD_CHANGE_PRI(t, fx_dptbl[fxpp->fx_pri].fx_globpri);
1446         }
1447 
1448         /*
1449          * Clear the preemption control "yield" bit since the user is
1450          * doing a yield.
1451          */
1452         if (t->t_schedctl)
1453                 schedctl_set_yield(t, 0);
1454 
1455         if (fxpp->fx_timeleft <= 0) {
1456                 /*
1457                  * Time slice was artificially extended to avoid
1458                  * preemption, so pretend we're preempting it now.
1459                  */
1460                 DTRACE_SCHED1(schedctl__yield, int, -fxpp->fx_timeleft);
1461                 fxpp->fx_timeleft = fxpp->fx_pquantum;
1462                 THREAD_CHANGE_PRI(t, fx_dptbl[fxpp->fx_pri].fx_globpri);
1463                 ASSERT(t->t_pri >= 0 && t->t_pri <= fx_maxglobpri);
1464         }
1465 
1466         fxpp->fx_flags &= ~FXBACKQ;
1467         setbackdq(t);
1468 }
1469 
1470 /*
1471  * Increment the nice value of the specified thread by incr and
1472  * return the new value in *retvalp.
1473  */
1474 static int
1475 fx_donice(kthread_t *t, cred_t *cr, int incr, int *retvalp)
1476 {
1477         int             newnice;
1478         fxproc_t        *fxpp = (fxproc_t *)(t->t_cldata);
1479         fxkparms_t      fxkparms;
1480 
1481         ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock));
1482 
1483         /* If there's no change to priority, just return current setting */
1484         if (incr == 0) {
1485                 if (retvalp) {
1486                         *retvalp = fxpp->fx_nice - NZERO;
1487                 }
1488                 return (0);
1489         }
1490 
1491         if ((incr < 0 || incr > 2 * NZERO) &&
1492             secpolicy_raisepriority(cr) != 0)
1493                 return (EPERM);
1494 
1495         /*
1496          * Specifying a nice increment greater than the upper limit of
1497          * 2 * NZERO - 1 will result in the thread's nice value being
1498          * set to the upper limit.  We check for this before computing
1499          * the new value because otherwise we could get overflow
1500          * if a privileged user specified some ridiculous increment.
1501          */
1502         if (incr > 2 * NZERO - 1)
1503                 incr = 2 * NZERO - 1;
1504 
1505         newnice = fxpp->fx_nice + incr;
1506         if (newnice > NZERO)
1507                 newnice = NZERO;
1508         else if (newnice < 0)
1509                 newnice = 0;
1510 
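        /*
         * A worked example, assuming the stock NZERO (20) and fx_maxupri
         * (60): nice 0 maps back to upri 60 and nice 20 maps back to upri 0.
         */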
1511         fxkparms.fx_uprilim = fxkparms.fx_upri =
1512             -((newnice - NZERO) * fx_maxupri) / NZERO;
1513 
1514         fxkparms.fx_cflags = FX_DOUPRILIM | FX_DOUPRI;
1515 
1516         fxkparms.fx_tqntm = FX_TQDEF;
1517 
1518         /*
1519          * Reset the uprilim and upri values of the thread. Adjust
1520          * time quantum accordingly.
1521          */
1522 
1523         (void) fx_parmsset(t, (void *)&fxkparms, (id_t)0, (cred_t *)NULL);
1524 
1525         /*
1526          * Although fx_parmsset() already reset fx_nice, it may not have
1527          * been set to precisely the value calculated above, because
1528          * fx_parmsset() derives the nice value from the user priority
1529          * and the integer conversion from nice value to user priority
1530          * and back may have truncated.  Reset fx_nice to the value
1531          * calculated above.
1532          */
1533         fxpp->fx_nice = (char)newnice;
1534 
1535         if (retvalp)
1536                 *retvalp = newnice - NZERO;
1537 
1538         return (0);
1539 }
1540 
1541 /*
1542  * Increment the priority of the specified thread by incr and
1543  * return the new value in *retvalp.
1544  */
1545 static int
1546 fx_doprio(kthread_t *t, cred_t *cr, int incr, int *retvalp)
1547 {
1548         int             newpri;
1549         fxproc_t        *fxpp = (fxproc_t *)(t->t_cldata);
1550         fxkparms_t      fxkparms;
1551 
1552         ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock));
1553 
1554         /* If there's no change to priority, just return current setting */
1555         if (incr == 0) {
1556                 *retvalp = fxpp->fx_pri;
1557                 return (0);
1558         }
1559 
1560         newpri = fxpp->fx_pri + incr;
1561         if (newpri > fx_maxupri || newpri < 0)
1562                 return (EINVAL);
1563 
1564         *retvalp = newpri;
1565         fxkparms.fx_uprilim = fxkparms.fx_upri = newpri;
1566         fxkparms.fx_tqntm = FX_NOCHANGE;
1567         fxkparms.fx_cflags = FX_DOUPRILIM | FX_DOUPRI;
1568 
1569         /*
1570          * Reset the uprilim and upri values of the thread.
1571          */
1572         return (fx_parmsset(t, (void *)&fxkparms, (id_t)0, cr));
1573 }
1574 
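/*
 * Propagate a change in fx_pri to the thread's global dispatch priority.
 * A running thread may be forced to surrender its CPU; a thread sitting
 * on a run queue is repositioned at the new priority and given a fresh
 * time quantum.  Called with the thread lock held.
 */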
1575 static void
1576 fx_change_priority(kthread_t *t, fxproc_t *fxpp)
1577 {
1578         pri_t   new_pri;
1579 
1580         ASSERT(THREAD_LOCK_HELD(t));
1581         new_pri = fx_dptbl[fxpp->fx_pri].fx_globpri;
1582         ASSERT(new_pri >= 0 && new_pri <= fx_maxglobpri);
1583         t->t_cpri = fxpp->fx_pri;
1584         if (t == curthread || t->t_state == TS_ONPROC) {
1585                 /* curthread is always onproc */
1586                 cpu_t   *cp = t->t_disp_queue->disp_cpu;
1587                 THREAD_CHANGE_PRI(t, new_pri);
1588                 if (t == cp->cpu_dispthread)
1589                         cp->cpu_dispatch_pri = DISP_PRIO(t);
1590                 if (DISP_MUST_SURRENDER(t)) {
1591                         fxpp->fx_flags |= FXBACKQ;
1592                         cpu_surrender(t);
1593                 } else {
1594                         fxpp->fx_timeleft = fxpp->fx_pquantum;
1595                 }
1596         } else {
1597                 /*
1598                  * When the priority of a thread is changed,
1599                  * it may be necessary to adjust its position
1600                  * on a sleep queue or dispatch queue.
1601                  * The function thread_change_pri accomplishes
1602                  * this.
1603                  */
1604                 if (thread_change_pri(t, new_pri, 0)) {
1605                         /*
1606                          * The thread was on a run queue. Reset
1607                          * its CPU timeleft from the quantum
1608                          * associated with the new priority.
1609                          */
1610                         fxpp->fx_timeleft = fxpp->fx_pquantum;
1611                 } else {
1612                         fxpp->fx_flags |= FXBACKQ;
1613                 }
1614         }
1615 }
1616 
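/*
 * Allocate the class-specific data buffer (an fxproc_t) for a thread
 * entering the FX class.  Returns ENOMEM if the allocation fails.
 */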
1617 static int
1618 fx_alloc(void **p, int flag)
1619 {
1620         void *bufp;
1621 
1622         bufp = kmem_alloc(sizeof (fxproc_t), flag);
1623         if (bufp == NULL) {
1624                 return (ENOMEM);
1625         } else {
1626                 *p = bufp;
1627                 return (0);
1628         }
1629 }
1630 
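/*
 * Free the class-specific data buffer (an fxproc_t), if one was allocated.
 */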
1631 static void
1632 fx_free(void *bufp)
1633 {
1634         if (bufp)
1635                 kmem_free(bufp, sizeof (fxproc_t));
1636 }
1637 
1638 /*
1639  * Release the callback list mutex after successful lookup
1640  */
1641 void
1642 fx_list_release(fxproc_t *fxpp)
1643 {
1644         int index = FX_CB_LIST_HASH(fxpp->fx_ktid);
1645         kmutex_t *lockp = &fx_cb_list_lock[index];
1646         mutex_exit(lockp);
1647 }
1648 
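/*
 * Look up the fxproc_t with a registered callback for the given thread id.
 * On success the hash-chain lock is returned held and the caller must drop
 * it with fx_list_release(); NULL is returned if no match is found.
 */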
1649 fxproc_t *
1650 fx_list_lookup(kt_did_t ktid)
1651 {
1652         int index = FX_CB_LIST_HASH(ktid);
1653         kmutex_t *lockp = &fx_cb_list_lock[index];
1654         fxproc_t *fxpp;
1655 
1656         mutex_enter(lockp);
1657 
1658         for (fxpp = fx_cb_plisthead[index].fx_cb_next;
1659             fxpp != &fx_cb_plisthead[index]; fxpp = fxpp->fx_cb_next) {
1660                 if (fxpp->fx_tp->t_cid == fx_cid && fxpp->fx_ktid == ktid &&
1661                     fxpp->fx_callback != NULL) {
1662                         /*
1663                          * The caller is responsible for calling
1664                          * fx_list_release to drop the lock upon
1665                          * successful lookup
1666                          */
1667                         return (fxpp);
1668                 }
1669         }
1670         mutex_exit(lockp);
1671         return ((fxproc_t *)NULL);
1672 }
1673 
1674 
1675 /*
1676  * Register a callback set of routines for the current thread.
1677  * The thread should already be in the FX class.
1678  */
1679 int
1680 fx_register_callbacks(fx_callbacks_t *fx_callback, fx_cookie_t cookie,
1681         pri_t pri, clock_t quantum)
1682 {
1683 
1684         fxproc_t        *fxpp;
1685 
1686         if (fx_callback == NULL)
1687                 return (EINVAL);
1688 
1689         if (secpolicy_dispadm(CRED()) != 0)
1690                 return (EPERM);
1691 
1692         if (FX_CB_VERSION(fx_callback) != FX_CALLB_REV)
1693                 return (EINVAL);
1694 
1695         if (!FX_ISVALID(pri, quantum))
1696                 return (EINVAL);
1697 
1698         thread_lock(curthread);         /* get dispatcher lock on thread */
1699 
1700         if (curthread->t_cid != fx_cid) {
1701                 thread_unlock(curthread);
1702                 return (EINVAL);
1703         }
1704 
1705         fxpp = (fxproc_t *)(curthread->t_cldata);
1706         ASSERT(fxpp != NULL);
1707         if (FX_HAS_CB(fxpp)) {
1708                 thread_unlock(curthread);
1709                 return (EINVAL);
1710         }
1711 
1712         fxpp->fx_callback = fx_callback;
1713         fxpp->fx_cookie = cookie;
1714 
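        /*
         * Apply the requested priority and quantum.  A quantum of FX_TQDEF
         * selects the default quantum for the (clamped) new priority from
         * fx_dptbl, FX_TQINF requests an infinite quantum, and FX_NOCHANGE
         * (or FX_TQDEF when the priority is left unchanged) keeps the
         * current quantum.
         */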
1715         if (pri != FX_CB_NOCHANGE) {
1716                 fxpp->fx_pri = pri;
1717                 FX_ADJUST_PRI(fxpp->fx_pri);
1718                 if (quantum == FX_TQDEF) {
1719                         fxpp->fx_pquantum = fx_dptbl[fxpp->fx_pri].fx_quantum;
1720                 } else if (quantum == FX_TQINF) {
1721                         fxpp->fx_pquantum = FX_TQINF;
1722                 } else if (quantum != FX_NOCHANGE) {
1723                         FX_ADJUST_QUANTUM(quantum);
1724                         fxpp->fx_pquantum = quantum;
1725                 }
1726         } else if (quantum != FX_NOCHANGE && quantum != FX_TQDEF) {
1727                 if (quantum == FX_TQINF)
1728                         fxpp->fx_pquantum = FX_TQINF;
1729                 else {
1730                         FX_ADJUST_QUANTUM(quantum);
1731                         fxpp->fx_pquantum = quantum;
1732                 }
1733         }
1734 
1735         fxpp->fx_ktid = ddi_get_kt_did();
1736 
1737         fx_change_priority(curthread, fxpp);
1738 
1739         thread_unlock(curthread);
1740 
1741         /*
1742          * Link new structure into fxproc list.
1743          */
1744         FX_CB_LIST_INSERT(fxpp);
1745         return (0);
1746 }
1747 
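/*
 * A minimal usage sketch (identifiers hypothetical, not part of this file):
 * a kernel thread already running in the FX class might register a callback
 * vector and cookie, and later remove the registration from the same thread:
 *
 *	if (fx_register_callbacks(&my_fx_callbacks, my_cookie,
 *	    my_pri, my_quantum) == 0) {
 *		...
 *		(void) fx_unregister_callbacks();
 *	}
 */
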
1748 /* Unregister the callback set of routines for the current thread */
1749 int
1750 fx_unregister_callbacks()
1751 {
1752         fxproc_t        *fxpp;
1753 
1754         if ((fxpp = fx_list_lookup(ddi_get_kt_did())) == NULL) {
1755                 /*
1756                  * The thread does not have a registered callback.
1757                  */
1758                 return (EINVAL);
1759         }
1760 
1761         thread_lock(fxpp->fx_tp);
1762         fxpp->fx_callback = NULL;
1763         fxpp->fx_cookie = NULL;
1764         thread_unlock(fxpp->fx_tp);
1765         fx_list_release(fxpp);
1766 
1767         FX_CB_LIST_DELETE(fxpp);
1768         return (0);
1769 }
1770 
1771 /*
1772  * Modify the priority and/or quantum of a thread with a registered callback.
1773  */
1774 int
1775 fx_modify_priority(kt_did_t ktid, clock_t quantum, pri_t pri)
1776 {
1777         fxproc_t        *fxpp;
1778 
1779         if (!FX_ISVALID(pri, quantum))
1780                 return (EINVAL);
1781 
1782         if ((fxpp = fx_list_lookup(ktid)) == NULL) {
1783                 /*
1784                  * Either the thread has exited or it did not have a
1785                  * registered callback.
1786                  */
1787                 return (ESRCH);
1788         }
1789 
1790         thread_lock(fxpp->fx_tp);
1791 
1792         if (pri != FX_CB_NOCHANGE) {
1793                 fxpp->fx_pri = pri;
1794                 FX_ADJUST_PRI(fxpp->fx_pri);
1795                 if (quantum == FX_TQDEF) {
1796                         fxpp->fx_pquantum = fx_dptbl[fxpp->fx_pri].fx_quantum;
1797                 } else if (quantum == FX_TQINF) {
1798                         fxpp->fx_pquantum = FX_TQINF;
1799                 } else if (quantum != FX_NOCHANGE) {
1800                         FX_ADJUST_QUANTUM(quantum);
1801                         fxpp->fx_pquantum = quantum;
1802                 }
1803         } else if (quantum != FX_NOCHANGE && quantum != FX_TQDEF) {
1804                 if (quantum == FX_TQINF) {
1805                         fxpp->fx_pquantum = FX_TQINF;
1806                 } else {
1807                         FX_ADJUST_QUANTUM(quantum);
1808                         fxpp->fx_pquantum = quantum;
1809                 }
1810         }
1811 
1812         fx_change_priority(fxpp->fx_tp, fxpp);
1813 
1814         thread_unlock(fxpp->fx_tp);
1815         fx_list_release(fxpp);
1816         return (0);
1817 }
1818 
1819 
1820 /*
1821  * return an iblock cookie for mutex initialization to be used in callbacks
1822  */
1823 void *
1824 fx_get_mutex_cookie()
1825 {
1826         return ((void *)(uintptr_t)__ipltospl(DISP_LEVEL));
1827 }
1828 
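/*
 * Usage sketch (hypothetical, not part of this file): since the cookie
 * corresponds to DISP_LEVEL, a mutex intended for use from callback context
 * could be initialized as a spin mutex with it, e.g.
 *
 *	mutex_init(&my_cb_lock, NULL, MUTEX_SPIN, fx_get_mutex_cookie());
 */
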
1829 /*
1830  * return maximum relative priority
1831  */
1832 pri_t
1833 fx_get_maxpri()
1834 {
1835         return (fx_maxumdpri);
1836 }