re #13613 rb4516 Tunables needs volatile keyword
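
This change declares the rechoose_interval dispatcher tunable volatile so the
compiler must reload it on every use (for example inside the
THREAD_HAS_CACHE_WARMTH() macro further down) rather than caching the value in
a register, since the tunable may be patched at run time. The fragment below is
a minimal stand-alone sketch, not illumos code; the names poll_interval and
do_work are hypothetical and only illustrate the same compiler-reload behaviour.

    #include <stdio.h>

    /*
     * A run-time tunable.  The volatile qualifier forces the compiler to
     * reload poll_interval on every access instead of hoisting the load
     * out of the loop in do_work(), so a value patched from outside the
     * function (for example by a debugger) is observed.
     */
    volatile int poll_interval = 3;

    static void
    do_work(int iterations)
    {
            int i;

            for (i = 0; i < iterations; i++) {
                    /* poll_interval is re-read on every iteration */
                    if (i % poll_interval == 0)
                            printf("tick at %d\n", i);
            }
    }

    int
    main(void)
    {
            do_work(10);
            return (0);
    }

Without the qualifier an optimizing compiler is free to read the tunable once
and keep it in a register for the life of the loop, which is the behaviour the
rechoose_interval change below guards against.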
          --- old/usr/src/uts/common/disp/disp.c
          +++ new/usr/src/uts/common/disp/disp.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
  24   24   */
       25 +/*
       26 + * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
       27 + */
  25   28  
  26   29  /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  27   30  /*        All Rights Reserved   */
  28   31  
  29   32  
  30   33  #include <sys/types.h>
  31   34  #include <sys/param.h>
  32   35  #include <sys/sysmacros.h>
  33   36  #include <sys/signal.h>
  34   37  #include <sys/user.h>
  35   38  #include <sys/systm.h>
  36   39  #include <sys/sysinfo.h>
  37   40  #include <sys/var.h>
  38   41  #include <sys/errno.h>
  39   42  #include <sys/cmn_err.h>
  40   43  #include <sys/debug.h>
  41   44  #include <sys/inline.h>
  42   45  #include <sys/disp.h>
  43   46  #include <sys/class.h>
  44   47  #include <sys/bitmap.h>
  45   48  #include <sys/kmem.h>
  46   49  #include <sys/cpuvar.h>
  47   50  #include <sys/vtrace.h>
  48   51  #include <sys/tnf.h>
  49   52  #include <sys/cpupart.h>
  50   53  #include <sys/lgrp.h>
  51   54  #include <sys/pg.h>
  52   55  #include <sys/cmt.h>
  53   56  #include <sys/bitset.h>
  54   57  #include <sys/schedctl.h>
  55   58  #include <sys/atomic.h>
  56   59  #include <sys/dtrace.h>
  57   60  #include <sys/sdt.h>
  58   61  #include <sys/archsystm.h>
  59   62  
  60   63  #include <vm/as.h>
  61   64  
  62   65  #define BOUND_CPU       0x1
  63   66  #define BOUND_PARTITION 0x2
  64   67  #define BOUND_INTR      0x4
  65   68  
  66   69  /* Dispatch queue allocation structure and functions */
  67   70  struct disp_queue_info {
  68   71          disp_t  *dp;
  69   72          dispq_t *olddispq;
  70   73          dispq_t *newdispq;
  71   74          ulong_t *olddqactmap;
  72   75          ulong_t *newdqactmap;
  73   76          int     oldnglobpris;
  74   77  };
  75   78  static void     disp_dq_alloc(struct disp_queue_info *dptr, int numpris,
  76   79      disp_t *dp);
  77   80  static void     disp_dq_assign(struct disp_queue_info *dptr, int numpris);
  78   81  static void     disp_dq_free(struct disp_queue_info *dptr);
  79   82  
  80   83  /* platform-specific routine to call when processor is idle */
  81   84  static void     generic_idle_cpu();
  82   85  void            (*idle_cpu)() = generic_idle_cpu;
  83   86  
  84   87  /* routines invoked when a CPU enters/exits the idle loop */
  85   88  static void     idle_enter();
  86   89  static void     idle_exit();
  87   90  
  88   91  /* platform-specific routine to call when thread is enqueued */
  89   92  static void     generic_enq_thread(cpu_t *, int);
  90   93  void            (*disp_enq_thread)(cpu_t *, int) = generic_enq_thread;
  91   94  
  92   95  pri_t   kpreemptpri;            /* priority where kernel preemption applies */
  93   96  pri_t   upreemptpri = 0;        /* priority where normal preemption applies */
  94   97  pri_t   intr_pri;               /* interrupt thread priority base level */
  95   98  
  96   99  #define KPQPRI  -1              /* pri where cpu affinity is dropped for kpq */
  97  100  pri_t   kpqpri = KPQPRI;        /* can be set in /etc/system */
  98  101  disp_t  cpu0_disp;              /* boot CPU's dispatch queue */
  99  102  disp_lock_t     swapped_lock;   /* lock swapped threads and swap queue */
 100  103  int     nswapped;               /* total number of swapped threads */
 101  104  void    disp_swapped_enq(kthread_t *tp);
 102  105  static void     disp_swapped_setrun(kthread_t *tp);
 103  106  static void     cpu_resched(cpu_t *cp, pri_t tpri);
 104  107  
 105  108  /*
 106  109   * If this is set, only interrupt threads will cause kernel preemptions.
 107  110   * This is done by changing the value of kpreemptpri.  kpreemptpri
 108  111   * will either be the max sysclass pri + 1 or the min interrupt pri.
 109  112   */
 110  113  int     only_intr_kpreempt;
 111  114  
 112  115  extern void set_idle_cpu(int cpun);
 113  116  extern void unset_idle_cpu(int cpun);
 114  117  static void setkpdq(kthread_t *tp, int borf);
 115  118  #define SETKP_BACK      0
 116  119  #define SETKP_FRONT     1
 117  120  /*
 118  121   * Parameter that determines how recently a thread must have run
 119  122   * on the CPU to be considered loosely-bound to that CPU to reduce
 120  123   * cold cache effects.  The interval is in hertz.
 121  124   */
 122  125  #define RECHOOSE_INTERVAL 3
 123      -int     rechoose_interval = RECHOOSE_INTERVAL;
      126 +volatile int    rechoose_interval = RECHOOSE_INTERVAL;
 124  127  
 125  128  /*
 126  129   * Parameter that determines how long (in nanoseconds) a thread must
 127  130   * be sitting on a run queue before it can be stolen by another CPU
 128  131   * to reduce migrations.  The interval is in nanoseconds.
 129  132   *
 130  133   * The nosteal_nsec should be set by platform code cmp_set_nosteal_interval()
 131  134   * to an appropriate value.  nosteal_nsec is set to NOSTEAL_UNINITIALIZED
  132  135   * here, indicating it is uninitialized.
 133  136   * Setting nosteal_nsec to 0 effectively disables the nosteal 'protection'.
 134  137   *
 135  138   */
 136  139  #define NOSTEAL_UNINITIALIZED   (-1)
 137  140  hrtime_t nosteal_nsec = NOSTEAL_UNINITIALIZED;
 138  141  extern void cmp_set_nosteal_interval(void);
 139  142  
 140  143  id_t    defaultcid;     /* system "default" class; see dispadmin(1M) */
 141  144  
 142  145  disp_lock_t     transition_lock;        /* lock on transitioning threads */
 143  146  disp_lock_t     stop_lock;              /* lock on stopped threads */
 144  147  
 145  148  static void     cpu_dispqalloc(int numpris);
 146  149  
 147  150  /*
 148  151   * This gets returned by disp_getwork/disp_getbest if we couldn't steal
 149  152   * a thread because it was sitting on its run queue for a very short
 150  153   * period of time.
 151  154   */
 152  155  #define T_DONTSTEAL     (kthread_t *)(-1) /* returned by disp_getwork/getbest */
 153  156  
 154  157  static kthread_t        *disp_getwork(cpu_t *to);
 155  158  static kthread_t        *disp_getbest(disp_t *from);
 156  159  static kthread_t        *disp_ratify(kthread_t *tp, disp_t *kpq);
 157  160  
 158  161  void    swtch_to(kthread_t *);
 159  162  
 160  163  /*
 161  164   * dispatcher and scheduler initialization
 162  165   */
 163  166  
 164  167  /*
 165  168   * disp_setup - Common code to calculate and allocate dispatcher
 166  169   *              variables and structures based on the maximum priority.
 167  170   */
 168  171  static void
 169  172  disp_setup(pri_t maxglobpri, pri_t oldnglobpris)
 170  173  {
 171  174          pri_t   newnglobpris;
 172  175  
 173  176          ASSERT(MUTEX_HELD(&cpu_lock));
 174  177  
 175  178          newnglobpris = maxglobpri + 1 + LOCK_LEVEL;
 176  179  
 177  180          if (newnglobpris > oldnglobpris) {
 178  181                  /*
 179  182                   * Allocate new kp queues for each CPU partition.
 180  183                   */
 181  184                  cpupart_kpqalloc(newnglobpris);
 182  185  
 183  186                  /*
 184  187                   * Allocate new dispatch queues for each CPU.
 185  188                   */
 186  189                  cpu_dispqalloc(newnglobpris);
 187  190  
 188  191                  /*
 189  192                   * compute new interrupt thread base priority
 190  193                   */
 191  194                  intr_pri = maxglobpri;
 192  195                  if (only_intr_kpreempt) {
 193  196                          kpreemptpri = intr_pri + 1;
 194  197                          if (kpqpri == KPQPRI)
 195  198                                  kpqpri = kpreemptpri;
 196  199                  }
 197  200                  v.v_nglobpris = newnglobpris;
 198  201          }
 199  202  }
 200  203  
 201  204  /*
 202  205   * dispinit - Called to initialize all loaded classes and the
 203  206   *            dispatcher framework.
 204  207   */
 205  208  void
 206  209  dispinit(void)
 207  210  {
 208  211          id_t    cid;
 209  212          pri_t   maxglobpri;
 210  213          pri_t   cl_maxglobpri;
 211  214  
 212  215          maxglobpri = -1;
 213  216  
 214  217          /*
 215  218           * Initialize transition lock, which will always be set.
 216  219           */
 217  220          DISP_LOCK_INIT(&transition_lock);
 218  221          disp_lock_enter_high(&transition_lock);
 219  222          DISP_LOCK_INIT(&stop_lock);
 220  223  
 221  224          mutex_enter(&cpu_lock);
 222  225          CPU->cpu_disp->disp_maxrunpri = -1;
 223  226          CPU->cpu_disp->disp_max_unbound_pri = -1;
 224  227  
 225  228          /*
 226  229           * Initialize the default CPU partition.
 227  230           */
 228  231          cpupart_initialize_default();
 229  232          /*
 230  233           * Call the class specific initialization functions for
 231  234           * all pre-installed schedulers.
 232  235           *
 233  236           * We pass the size of a class specific parameter
 234  237           * buffer to each of the initialization functions
 235  238           * to try to catch problems with backward compatibility
 236  239           * of class modules.
 237  240           *
 238  241           * For example a new class module running on an old system
 239  242           * which didn't provide sufficiently large parameter buffers
 240  243           * would be bad news. Class initialization modules can check for
 241  244           * this and take action if they detect a problem.
 242  245           */
 243  246  
 244  247          for (cid = 0; cid < nclass; cid++) {
 245  248                  sclass_t        *sc;
 246  249  
 247  250                  sc = &sclass[cid];
 248  251                  if (SCHED_INSTALLED(sc)) {
 249  252                          cl_maxglobpri = sc->cl_init(cid, PC_CLPARMSZ,
 250  253                              &sc->cl_funcs);
 251  254                          if (cl_maxglobpri > maxglobpri)
 252  255                                  maxglobpri = cl_maxglobpri;
 253  256                  }
 254  257          }
 255  258          kpreemptpri = (pri_t)v.v_maxsyspri + 1;
 256  259          if (kpqpri == KPQPRI)
 257  260                  kpqpri = kpreemptpri;
 258  261  
 259  262          ASSERT(maxglobpri >= 0);
 260  263          disp_setup(maxglobpri, 0);
 261  264  
 262  265          mutex_exit(&cpu_lock);
 263  266  
 264  267          /*
 265  268           * Platform specific sticky scheduler setup.
 266  269           */
 267  270          if (nosteal_nsec == NOSTEAL_UNINITIALIZED)
 268  271                  cmp_set_nosteal_interval();
 269  272  
 270  273          /*
 271  274           * Get the default class ID; this may be later modified via
 272  275           * dispadmin(1M).  This will load the class (normally TS) and that will
 273  276           * call disp_add(), which is why we had to drop cpu_lock first.
 274  277           */
 275  278          if (getcid(defaultclass, &defaultcid) != 0) {
 276  279                  cmn_err(CE_PANIC, "Couldn't load default scheduling class '%s'",
 277  280                      defaultclass);
 278  281          }
 279  282  }
 280  283  
 281  284  /*
 282  285   * disp_add - Called with class pointer to initialize the dispatcher
 283  286   *            for a newly loaded class.
 284  287   */
 285  288  void
 286  289  disp_add(sclass_t *clp)
 287  290  {
 288  291          pri_t   maxglobpri;
 289  292          pri_t   cl_maxglobpri;
 290  293  
 291  294          mutex_enter(&cpu_lock);
 292  295          /*
 293  296           * Initialize the scheduler class.
 294  297           */
 295  298          maxglobpri = (pri_t)(v.v_nglobpris - LOCK_LEVEL - 1);
 296  299          cl_maxglobpri = clp->cl_init(clp - sclass, PC_CLPARMSZ, &clp->cl_funcs);
 297  300          if (cl_maxglobpri > maxglobpri)
 298  301                  maxglobpri = cl_maxglobpri;
 299  302  
 300  303          /*
 301  304           * Save old queue information.  Since we're initializing a
 302  305           * new scheduling class which has just been loaded, then
 303  306           * the size of the dispq may have changed.  We need to handle
 304  307           * that here.
 305  308           */
 306  309          disp_setup(maxglobpri, v.v_nglobpris);
 307  310  
 308  311          mutex_exit(&cpu_lock);
 309  312  }
 310  313  
 311  314  
 312  315  /*
 313  316   * For each CPU, allocate new dispatch queues
 314  317   * with the stated number of priorities.
 315  318   */
 316  319  static void
 317  320  cpu_dispqalloc(int numpris)
 318  321  {
 319  322          cpu_t   *cpup;
 320  323          struct disp_queue_info  *disp_mem;
 321  324          int i, num;
 322  325  
 323  326          ASSERT(MUTEX_HELD(&cpu_lock));
 324  327  
 325  328          disp_mem = kmem_zalloc(NCPU *
 326  329              sizeof (struct disp_queue_info), KM_SLEEP);
 327  330  
 328  331          /*
 329  332           * This routine must allocate all of the memory before stopping
 330  333           * the cpus because it must not sleep in kmem_alloc while the
 331  334           * CPUs are stopped.  Locks they hold will not be freed until they
 332  335           * are restarted.
 333  336           */
 334  337          i = 0;
 335  338          cpup = cpu_list;
 336  339          do {
 337  340                  disp_dq_alloc(&disp_mem[i], numpris, cpup->cpu_disp);
 338  341                  i++;
 339  342                  cpup = cpup->cpu_next;
 340  343          } while (cpup != cpu_list);
 341  344          num = i;
 342  345  
 343  346          pause_cpus(NULL, NULL);
 344  347          for (i = 0; i < num; i++)
 345  348                  disp_dq_assign(&disp_mem[i], numpris);
 346  349          start_cpus();
 347  350  
 348  351          /*
 349  352           * I must free all of the memory after starting the cpus because
 350  353           * I can not risk sleeping in kmem_free while the cpus are stopped.
 351  354           */
 352  355          for (i = 0; i < num; i++)
 353  356                  disp_dq_free(&disp_mem[i]);
 354  357  
 355  358          kmem_free(disp_mem, NCPU * sizeof (struct disp_queue_info));
 356  359  }
 357  360  
 358  361  static void
 359  362  disp_dq_alloc(struct disp_queue_info *dptr, int numpris, disp_t *dp)
 360  363  {
 361  364          dptr->newdispq = kmem_zalloc(numpris * sizeof (dispq_t), KM_SLEEP);
 362  365          dptr->newdqactmap = kmem_zalloc(((numpris / BT_NBIPUL) + 1) *
 363  366              sizeof (long), KM_SLEEP);
 364  367          dptr->dp = dp;
 365  368  }
 366  369  
 367  370  static void
 368  371  disp_dq_assign(struct disp_queue_info *dptr, int numpris)
 369  372  {
 370  373          disp_t  *dp;
 371  374  
 372  375          dp = dptr->dp;
 373  376          dptr->olddispq = dp->disp_q;
 374  377          dptr->olddqactmap = dp->disp_qactmap;
 375  378          dptr->oldnglobpris = dp->disp_npri;
 376  379  
 377  380          ASSERT(dptr->oldnglobpris < numpris);
 378  381  
 379  382          if (dptr->olddispq != NULL) {
 380  383                  /*
 381  384                   * Use kcopy because bcopy is platform-specific
 382  385                   * and could block while we might have paused the cpus.
 383  386                   */
 384  387                  (void) kcopy(dptr->olddispq, dptr->newdispq,
 385  388                      dptr->oldnglobpris * sizeof (dispq_t));
 386  389                  (void) kcopy(dptr->olddqactmap, dptr->newdqactmap,
 387  390                      ((dptr->oldnglobpris / BT_NBIPUL) + 1) *
 388  391                      sizeof (long));
 389  392          }
 390  393          dp->disp_q = dptr->newdispq;
 391  394          dp->disp_qactmap = dptr->newdqactmap;
 392  395          dp->disp_q_limit = &dptr->newdispq[numpris];
 393  396          dp->disp_npri = numpris;
 394  397  }
 395  398  
 396  399  static void
 397  400  disp_dq_free(struct disp_queue_info *dptr)
 398  401  {
 399  402          if (dptr->olddispq != NULL)
 400  403                  kmem_free(dptr->olddispq,
 401  404                      dptr->oldnglobpris * sizeof (dispq_t));
 402  405          if (dptr->olddqactmap != NULL)
 403  406                  kmem_free(dptr->olddqactmap,
 404  407                      ((dptr->oldnglobpris / BT_NBIPUL) + 1) * sizeof (long));
 405  408  }
 406  409  
 407  410  /*
 408  411   * For a newly created CPU, initialize the dispatch queue.
 409  412   * This is called before the CPU is known through cpu[] or on any lists.
 410  413   */
 411  414  void
 412  415  disp_cpu_init(cpu_t *cp)
 413  416  {
 414  417          disp_t  *dp;
 415  418          dispq_t *newdispq;
 416  419          ulong_t *newdqactmap;
 417  420  
 418  421          ASSERT(MUTEX_HELD(&cpu_lock));  /* protect dispatcher queue sizes */
 419  422  
 420  423          if (cp == cpu0_disp.disp_cpu)
 421  424                  dp = &cpu0_disp;
 422  425          else
 423  426                  dp = kmem_alloc(sizeof (disp_t), KM_SLEEP);
 424  427          bzero(dp, sizeof (disp_t));
 425  428          cp->cpu_disp = dp;
 426  429          dp->disp_cpu = cp;
 427  430          dp->disp_maxrunpri = -1;
 428  431          dp->disp_max_unbound_pri = -1;
 429  432          DISP_LOCK_INIT(&cp->cpu_thread_lock);
 430  433          /*
 431  434           * Allocate memory for the dispatcher queue headers
 432  435           * and the active queue bitmap.
 433  436           */
 434  437          newdispq = kmem_zalloc(v.v_nglobpris * sizeof (dispq_t), KM_SLEEP);
 435  438          newdqactmap = kmem_zalloc(((v.v_nglobpris / BT_NBIPUL) + 1) *
 436  439              sizeof (long), KM_SLEEP);
 437  440          dp->disp_q = newdispq;
 438  441          dp->disp_qactmap = newdqactmap;
 439  442          dp->disp_q_limit = &newdispq[v.v_nglobpris];
 440  443          dp->disp_npri = v.v_nglobpris;
 441  444  }
 442  445  
 443  446  void
 444  447  disp_cpu_fini(cpu_t *cp)
 445  448  {
 446  449          ASSERT(MUTEX_HELD(&cpu_lock));
 447  450  
 448  451          disp_kp_free(cp->cpu_disp);
 449  452          if (cp->cpu_disp != &cpu0_disp)
 450  453                  kmem_free(cp->cpu_disp, sizeof (disp_t));
 451  454  }
 452  455  
 453  456  /*
 454  457   * Allocate new, larger kpreempt dispatch queue to replace the old one.
 455  458   */
 456  459  void
 457  460  disp_kp_alloc(disp_t *dq, pri_t npri)
 458  461  {
 459  462          struct disp_queue_info  mem_info;
 460  463  
 461  464          if (npri > dq->disp_npri) {
 462  465                  /*
 463  466                   * Allocate memory for the new array.
 464  467                   */
 465  468                  disp_dq_alloc(&mem_info, npri, dq);
 466  469  
 467  470                  /*
 468  471                   * We need to copy the old structures to the new
 469  472                   * and free the old.
 470  473                   */
 471  474                  disp_dq_assign(&mem_info, npri);
 472  475                  disp_dq_free(&mem_info);
 473  476          }
 474  477  }
 475  478  
 476  479  /*
 477  480   * Free dispatch queue.
 478  481   * Used for the kpreempt queues for a removed CPU partition and
 479  482   * for the per-CPU queues of deleted CPUs.
 480  483   */
 481  484  void
 482  485  disp_kp_free(disp_t *dq)
 483  486  {
 484  487          struct disp_queue_info  mem_info;
 485  488  
 486  489          mem_info.olddispq = dq->disp_q;
 487  490          mem_info.olddqactmap = dq->disp_qactmap;
 488  491          mem_info.oldnglobpris = dq->disp_npri;
 489  492          disp_dq_free(&mem_info);
 490  493  }
 491  494  
 492  495  /*
 493  496   * End dispatcher and scheduler initialization.
 494  497   */
 495  498  
 496  499  /*
 497  500   * See if there's anything to do other than remain idle.
 498  501   * Return non-zero if there is.
 499  502   *
 500  503   * This function must be called with high spl, or with
 501  504   * kernel preemption disabled to prevent the partition's
 502  505   * active cpu list from changing while being traversed.
 503  506   *
 504  507   * This is essentially a simpler version of disp_getwork()
 505  508   * to be called by CPUs preparing to "halt".
 506  509   */
 507  510  int
 508  511  disp_anywork(void)
 509  512  {
 510  513          cpu_t           *cp = CPU;
 511  514          cpu_t           *ocp;
 512  515          volatile int    *local_nrunnable = &cp->cpu_disp->disp_nrunnable;
 513  516  
 514  517          if (!(cp->cpu_flags & CPU_OFFLINE)) {
 515  518                  if (CP_MAXRUNPRI(cp->cpu_part) >= 0)
 516  519                          return (1);
 517  520  
 518  521                  for (ocp = cp->cpu_next_part; ocp != cp;
 519  522                      ocp = ocp->cpu_next_part) {
 520  523                          ASSERT(CPU_ACTIVE(ocp));
 521  524  
 522  525                          /*
 523  526                           * Something has appeared on the local run queue.
 524  527                           */
 525  528                          if (*local_nrunnable > 0)
 526  529                                  return (1);
 527  530                          /*
 528  531                           * If we encounter another idle CPU that will
 529  532                           * soon be trolling around through disp_anywork()
 530  533                           * terminate our walk here and let this other CPU
 531  534                           * patrol the next part of the list.
 532  535                           */
 533  536                          if (ocp->cpu_dispatch_pri == -1 &&
 534  537                              (ocp->cpu_disp_flags & CPU_DISP_HALTED) == 0)
 535  538                                  return (0);
 536  539                          /*
 537  540                           * Work can be taken from another CPU if:
 538  541                           *      - There is unbound work on the run queue
 539  542                           *      - That work isn't a thread undergoing a
 540  543                           *      - context switch on an otherwise empty queue.
 541  544                           *      - The CPU isn't running the idle loop.
 542  545                           */
 543  546                          if (ocp->cpu_disp->disp_max_unbound_pri != -1 &&
 544  547                              !((ocp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
 545  548                              ocp->cpu_disp->disp_nrunnable == 1) &&
 546  549                              ocp->cpu_dispatch_pri != -1)
 547  550                                  return (1);
 548  551                  }
 549  552          }
 550  553          return (0);
 551  554  }
 552  555  
 553  556  /*
 554  557   * Called when CPU enters the idle loop
 555  558   */
 556  559  static void
 557  560  idle_enter()
 558  561  {
 559  562          cpu_t           *cp = CPU;
 560  563  
 561  564          new_cpu_mstate(CMS_IDLE, gethrtime_unscaled());
 562  565          CPU_STATS_ADDQ(cp, sys, idlethread, 1);
 563  566          set_idle_cpu(cp->cpu_id);       /* arch-dependent hook */
 564  567  }
 565  568  
 566  569  /*
 567  570   * Called when CPU exits the idle loop
 568  571   */
 569  572  static void
 570  573  idle_exit()
 571  574  {
 572  575          cpu_t           *cp = CPU;
 573  576  
 574  577          new_cpu_mstate(CMS_SYSTEM, gethrtime_unscaled());
 575  578          unset_idle_cpu(cp->cpu_id);     /* arch-dependent hook */
 576  579  }
 577  580  
 578  581  /*
 579  582   * Idle loop.
 580  583   */
 581  584  void
 582  585  idle()
 583  586  {
 584  587          struct cpu      *cp = CPU;              /* pointer to this CPU */
 585  588          kthread_t       *t;                     /* taken thread */
 586  589  
 587  590          idle_enter();
 588  591  
 589  592          /*
 590  593           * Uniprocessor version of idle loop.
 591  594           * Do this until notified that we're on an actual multiprocessor.
 592  595           */
 593  596          while (ncpus == 1) {
 594  597                  if (cp->cpu_disp->disp_nrunnable == 0) {
 595  598                          (*idle_cpu)();
 596  599                          continue;
 597  600                  }
 598  601                  idle_exit();
 599  602                  swtch();
 600  603  
 601  604                  idle_enter(); /* returned from swtch */
 602  605          }
 603  606  
 604  607          /*
 605  608           * Multiprocessor idle loop.
 606  609           */
 607  610          for (;;) {
 608  611                  /*
 609  612                   * If CPU is completely quiesced by p_online(2), just wait
 610  613                   * here with minimal bus traffic until put online.
 611  614                   */
 612  615                  while (cp->cpu_flags & CPU_QUIESCED)
 613  616                          (*idle_cpu)();
 614  617  
 615  618                  if (cp->cpu_disp->disp_nrunnable != 0) {
 616  619                          idle_exit();
 617  620                          swtch();
 618  621                  } else {
 619  622                          if (cp->cpu_flags & CPU_OFFLINE)
 620  623                                  continue;
 621  624                          if ((t = disp_getwork(cp)) == NULL) {
 622  625                                  if (cp->cpu_chosen_level != -1) {
 623  626                                          disp_t *dp = cp->cpu_disp;
 624  627                                          disp_t *kpq;
 625  628  
 626  629                                          disp_lock_enter(&dp->disp_lock);
 627  630                                          /*
 628  631                                           * Set kpq under lock to prevent
 629  632                                           * migration between partitions.
 630  633                                           */
 631  634                                          kpq = &cp->cpu_part->cp_kp_queue;
 632  635                                          if (kpq->disp_maxrunpri == -1)
 633  636                                                  cp->cpu_chosen_level = -1;
 634  637                                          disp_lock_exit(&dp->disp_lock);
 635  638                                  }
 636  639                                  (*idle_cpu)();
 637  640                                  continue;
 638  641                          }
 639  642                          /*
 640  643                           * If there was a thread but we couldn't steal
 641  644                           * it, then keep trying.
 642  645                           */
 643  646                          if (t == T_DONTSTEAL)
 644  647                                  continue;
 645  648                          idle_exit();
 646  649                          swtch_to(t);
 647  650                  }
 648  651                  idle_enter(); /* returned from swtch/swtch_to */
 649  652          }
 650  653  }
 651  654  
 652  655  
 653  656  /*
 654  657   * Preempt the currently running thread in favor of the highest
 655  658   * priority thread.  The class of the current thread controls
 656  659   * where it goes on the dispatcher queues. If panicking, turn
 657  660   * preemption off.
 658  661   */
 659  662  void
 660  663  preempt()
 661  664  {
 662  665          kthread_t       *t = curthread;
 663  666          klwp_t          *lwp = ttolwp(curthread);
 664  667  
 665  668          if (panicstr)
 666  669                  return;
 667  670  
 668  671          TRACE_0(TR_FAC_DISP, TR_PREEMPT_START, "preempt_start");
 669  672  
 670  673          thread_lock(t);
 671  674  
 672  675          if (t->t_state != TS_ONPROC || t->t_disp_queue != CPU->cpu_disp) {
 673  676                  /*
 674  677                   * this thread has already been chosen to be run on
 675  678                   * another CPU. Clear kprunrun on this CPU since we're
 676  679                   * already headed for swtch().
 677  680                   */
 678  681                  CPU->cpu_kprunrun = 0;
 679  682                  thread_unlock_nopreempt(t);
 680  683                  TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
 681  684          } else {
 682  685                  if (lwp != NULL)
 683  686                          lwp->lwp_ru.nivcsw++;
 684  687                  CPU_STATS_ADDQ(CPU, sys, inv_swtch, 1);
 685  688                  THREAD_TRANSITION(t);
 686  689                  CL_PREEMPT(t);
 687  690                  DTRACE_SCHED(preempt);
 688  691                  thread_unlock_nopreempt(t);
 689  692  
 690  693                  TRACE_0(TR_FAC_DISP, TR_PREEMPT_END, "preempt_end");
 691  694  
 692  695                  swtch();                /* clears CPU->cpu_runrun via disp() */
 693  696          }
 694  697  }
 695  698  
 696  699  extern kthread_t *thread_unpin();
 697  700  
 698  701  /*
 699  702   * disp() - find the highest priority thread for this processor to run, and
 700  703   * set it in TS_ONPROC state so that resume() can be called to run it.
 701  704   */
 702  705  static kthread_t *
 703  706  disp()
 704  707  {
 705  708          cpu_t           *cpup;
 706  709          disp_t          *dp;
 707  710          kthread_t       *tp;
 708  711          dispq_t         *dq;
 709  712          int             maxrunword;
 710  713          pri_t           pri;
 711  714          disp_t          *kpq;
 712  715  
 713  716          TRACE_0(TR_FAC_DISP, TR_DISP_START, "disp_start");
 714  717  
 715  718          cpup = CPU;
 716  719          /*
 717  720           * Find the highest priority loaded, runnable thread.
 718  721           */
 719  722          dp = cpup->cpu_disp;
 720  723  
 721  724  reschedule:
 722  725          /*
 723  726           * If there is more important work on the global queue with a better
 724  727           * priority than the maximum on this CPU, take it now.
 725  728           */
 726  729          kpq = &cpup->cpu_part->cp_kp_queue;
 727  730          while ((pri = kpq->disp_maxrunpri) >= 0 &&
 728  731              pri >= dp->disp_maxrunpri &&
 729  732              (cpup->cpu_flags & CPU_OFFLINE) == 0 &&
 730  733              (tp = disp_getbest(kpq)) != NULL) {
 731  734                  if (disp_ratify(tp, kpq) != NULL) {
 732  735                          TRACE_1(TR_FAC_DISP, TR_DISP_END,
 733  736                              "disp_end:tid %p", tp);
 734  737                          return (tp);
 735  738                  }
 736  739          }
 737  740  
 738  741          disp_lock_enter(&dp->disp_lock);
 739  742          pri = dp->disp_maxrunpri;
 740  743  
 741  744          /*
 742  745           * If there is nothing to run, look at what's runnable on other queues.
 743  746           * Choose the idle thread if the CPU is quiesced.
 744  747           * Note that CPUs that have the CPU_OFFLINE flag set can still run
 745  748           * interrupt threads, which will be the only threads on the CPU's own
 746  749           * queue, but cannot run threads from other queues.
 747  750           */
 748  751          if (pri == -1) {
 749  752                  if (!(cpup->cpu_flags & CPU_OFFLINE)) {
 750  753                          disp_lock_exit(&dp->disp_lock);
 751  754                          if ((tp = disp_getwork(cpup)) == NULL ||
 752  755                              tp == T_DONTSTEAL) {
 753  756                                  tp = cpup->cpu_idle_thread;
 754  757                                  (void) splhigh();
 755  758                                  THREAD_ONPROC(tp, cpup);
 756  759                                  cpup->cpu_dispthread = tp;
 757  760                                  cpup->cpu_dispatch_pri = -1;
 758  761                                  cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
 759  762                                  cpup->cpu_chosen_level = -1;
 760  763                          }
 761  764                  } else {
 762  765                          disp_lock_exit_high(&dp->disp_lock);
 763  766                          tp = cpup->cpu_idle_thread;
 764  767                          THREAD_ONPROC(tp, cpup);
 765  768                          cpup->cpu_dispthread = tp;
 766  769                          cpup->cpu_dispatch_pri = -1;
 767  770                          cpup->cpu_runrun = cpup->cpu_kprunrun = 0;
 768  771                          cpup->cpu_chosen_level = -1;
 769  772                  }
 770  773                  TRACE_1(TR_FAC_DISP, TR_DISP_END,
 771  774                      "disp_end:tid %p", tp);
 772  775                  return (tp);
 773  776          }
 774  777  
 775  778          dq = &dp->disp_q[pri];
 776  779          tp = dq->dq_first;
 777  780  
 778  781          ASSERT(tp != NULL);
 779  782          ASSERT(tp->t_schedflag & TS_LOAD);      /* thread must be swapped in */
 780  783  
 781  784          DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
 782  785  
 783  786          /*
 784  787           * Found it so remove it from queue.
 785  788           */
 786  789          dp->disp_nrunnable--;
 787  790          dq->dq_sruncnt--;
 788  791          if ((dq->dq_first = tp->t_link) == NULL) {
 789  792                  ulong_t *dqactmap = dp->disp_qactmap;
 790  793  
 791  794                  ASSERT(dq->dq_sruncnt == 0);
 792  795                  dq->dq_last = NULL;
 793  796  
 794  797                  /*
 795  798                   * The queue is empty, so the corresponding bit needs to be
  796  799                   * turned off in dqactmap.  If nrunnable != 0, we just took
  797  800                   * the last runnable thread off the highest queue, so
  798  801                   * recompute disp_maxrunpri.
 799  802                   */
 800  803                  maxrunword = pri >> BT_ULSHIFT;
 801  804                  dqactmap[maxrunword] &= ~BT_BIW(pri);
 802  805  
 803  806                  if (dp->disp_nrunnable == 0) {
 804  807                          dp->disp_max_unbound_pri = -1;
 805  808                          dp->disp_maxrunpri = -1;
 806  809                  } else {
 807  810                          int ipri;
 808  811  
 809  812                          ipri = bt_gethighbit(dqactmap, maxrunword);
 810  813                          dp->disp_maxrunpri = ipri;
 811  814                          if (ipri < dp->disp_max_unbound_pri)
 812  815                                  dp->disp_max_unbound_pri = ipri;
 813  816                  }
 814  817          } else {
 815  818                  tp->t_link = NULL;
 816  819          }
 817  820  
 818  821          /*
 819  822           * Set TS_DONT_SWAP flag to prevent another processor from swapping
 820  823           * out this thread before we have a chance to run it.
 821  824           * While running, it is protected against swapping by t_lock.
 822  825           */
 823  826          tp->t_schedflag |= TS_DONT_SWAP;
 824  827          cpup->cpu_dispthread = tp;              /* protected by spl only */
 825  828          cpup->cpu_dispatch_pri = pri;
 826  829          ASSERT(pri == DISP_PRIO(tp));
 827  830          thread_onproc(tp, cpup);                /* set t_state to TS_ONPROC */
 828  831          disp_lock_exit_high(&dp->disp_lock);    /* drop run queue lock */
 829  832  
 830  833          ASSERT(tp != NULL);
 831  834          TRACE_1(TR_FAC_DISP, TR_DISP_END,
 832  835              "disp_end:tid %p", tp);
 833  836  
 834  837          if (disp_ratify(tp, kpq) == NULL)
 835  838                  goto reschedule;
 836  839  
 837  840          return (tp);
 838  841  }
 839  842  
 840  843  /*
 841  844   * swtch()
 842  845   *      Find best runnable thread and run it.
 843  846   *      Called with the current thread already switched to a new state,
 844  847   *      on a sleep queue, run queue, stopped, and not zombied.
 845  848   *      May be called at any spl level less than or equal to LOCK_LEVEL.
 846  849   *      Always drops spl to the base level (spl0()).
 847  850   */
 848  851  void
 849  852  swtch()
 850  853  {
 851  854          kthread_t       *t = curthread;
 852  855          kthread_t       *next;
 853  856          cpu_t           *cp;
 854  857  
 855  858          TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
 856  859  
 857  860          if (t->t_flag & T_INTR_THREAD)
 858  861                  cpu_intr_swtch_enter(t);
 859  862  
 860  863          if (t->t_intr != NULL) {
 861  864                  /*
 862  865                   * We are an interrupt thread.  Setup and return
 863  866                   * the interrupted thread to be resumed.
 864  867                   */
 865  868                  (void) splhigh();       /* block other scheduler action */
 866  869                  cp = CPU;               /* now protected against migration */
 867  870                  ASSERT(CPU_ON_INTR(cp) == 0);   /* not called with PIL > 10 */
 868  871                  CPU_STATS_ADDQ(cp, sys, pswitch, 1);
 869  872                  CPU_STATS_ADDQ(cp, sys, intrblk, 1);
 870  873                  next = thread_unpin();
 871  874                  TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
 872  875                  resume_from_intr(next);
 873  876          } else {
 874  877  #ifdef  DEBUG
 875  878                  if (t->t_state == TS_ONPROC &&
 876  879                      t->t_disp_queue->disp_cpu == CPU &&
 877  880                      t->t_preempt == 0) {
 878  881                          thread_lock(t);
 879  882                          ASSERT(t->t_state != TS_ONPROC ||
 880  883                              t->t_disp_queue->disp_cpu != CPU ||
 881  884                              t->t_preempt != 0); /* cannot migrate */
 882  885                          thread_unlock_nopreempt(t);
 883  886                  }
 884  887  #endif  /* DEBUG */
 885  888                  cp = CPU;
 886  889                  next = disp();          /* returns with spl high */
 887  890                  ASSERT(CPU_ON_INTR(cp) == 0);   /* not called with PIL > 10 */
 888  891  
 889  892                  /* OK to steal anything left on run queue */
 890  893                  cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
 891  894  
 892  895                  if (next != t) {
 893  896                          hrtime_t now;
 894  897  
 895  898                          now = gethrtime_unscaled();
 896  899                          pg_ev_thread_swtch(cp, now, t, next);
 897  900  
 898  901                          /*
 899  902                           * If t was previously in the TS_ONPROC state,
 900  903                           * setfrontdq and setbackdq won't have set its t_waitrq.
 901  904                           * Since we now finally know that we're switching away
 902  905                           * from this thread, set its t_waitrq if it is on a run
 903  906                           * queue.
 904  907                           */
 905  908                          if ((t->t_state == TS_RUN) && (t->t_waitrq == 0)) {
 906  909                                  t->t_waitrq = now;
 907  910                          }
 908  911  
 909  912                          /*
 910  913                           * restore mstate of thread that we are switching to
 911  914                           */
 912  915                          restore_mstate(next);
 913  916  
 914  917                          CPU_STATS_ADDQ(cp, sys, pswitch, 1);
 915  918                          cp->cpu_last_swtch = t->t_disp_time = ddi_get_lbolt();
 916  919                          TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
 917  920  
 918  921                          if (dtrace_vtime_active)
 919  922                                  dtrace_vtime_switch(next);
 920  923  
 921  924                          resume(next);
 922  925                          /*
 923  926                           * The TR_RESUME_END and TR_SWTCH_END trace points
 924  927                           * appear at the end of resume(), because we may not
 925  928                           * return here
 926  929                           */
 927  930                  } else {
 928  931                          if (t->t_flag & T_INTR_THREAD)
 929  932                                  cpu_intr_swtch_exit(t);
 930  933                          /*
 931  934                           * Threads that enqueue themselves on a run queue defer
 932  935                           * setting t_waitrq. It is then either set in swtch()
 933  936                           * when the CPU is actually yielded, or not at all if it
 934  937                           * is remaining on the CPU.
 935  938                           * There is however a window between where the thread
 936  939                           * placed itself on a run queue, and where it selects
 937  940                           * itself in disp(), where a third party (eg. clock()
 938  941                           * doing tick processing) may have re-enqueued this
 939  942                           * thread, setting t_waitrq in the process. We detect
 940  943                           * this race by noticing that despite switching to
 941  944                           * ourself, our t_waitrq has been set, and should be
 942  945                           * cleared.
 943  946                           */
 944  947                          if (t->t_waitrq != 0)
 945  948                                  t->t_waitrq = 0;
 946  949  
 947  950                          pg_ev_thread_remain(cp, t);
 948  951  
 949  952                          DTRACE_SCHED(remain__cpu);
 950  953                          TRACE_0(TR_FAC_DISP, TR_SWTCH_END, "swtch_end");
 951  954                          (void) spl0();
 952  955                  }
 953  956          }
 954  957  }
 955  958  
 956  959  /*
 957  960   * swtch_from_zombie()
 958  961   *      Special case of swtch(), which allows checks for TS_ZOMB to be
 959  962   *      eliminated from normal resume.
 960  963   *      Find best runnable thread and run it.
 961  964   *      Called with the current thread zombied.
 962  965   *      Zombies cannot migrate, so CPU references are safe.
 963  966   */
 964  967  void
 965  968  swtch_from_zombie()
 966  969  {
 967  970          kthread_t       *next;
 968  971          cpu_t           *cpu = CPU;
 969  972  
 970  973          TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
 971  974  
 972  975          ASSERT(curthread->t_state == TS_ZOMB);
 973  976  
 974  977          next = disp();                  /* returns with spl high */
 975  978          ASSERT(CPU_ON_INTR(CPU) == 0);  /* not called with PIL > 10 */
 976  979          CPU_STATS_ADDQ(CPU, sys, pswitch, 1);
 977  980          ASSERT(next != curthread);
 978  981          TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
 979  982  
 980  983          pg_ev_thread_swtch(cpu, gethrtime_unscaled(), curthread, next);
 981  984  
 982  985          restore_mstate(next);
 983  986  
 984  987          if (dtrace_vtime_active)
 985  988                  dtrace_vtime_switch(next);
 986  989  
 987  990          resume_from_zombie(next);
 988  991          /*
 989  992           * The TR_RESUME_END and TR_SWTCH_END trace points
 990  993           * appear at the end of resume(), because we certainly will not
 991  994           * return here
 992  995           */
 993  996  }
 994  997  
 995  998  #if defined(DEBUG) && (defined(DISP_DEBUG) || defined(lint))
 996  999  
 997 1000  /*
 998 1001   * search_disp_queues()
 999 1002   *      Search the given dispatch queues for thread tp.
1000 1003   *      Return 1 if tp is found, otherwise return 0.
1001 1004   */
1002 1005  static int
1003 1006  search_disp_queues(disp_t *dp, kthread_t *tp)
1004 1007  {
1005 1008          dispq_t         *dq;
1006 1009          dispq_t         *eq;
1007 1010  
1008 1011          disp_lock_enter_high(&dp->disp_lock);
1009 1012  
1010 1013          for (dq = dp->disp_q, eq = dp->disp_q_limit; dq < eq; ++dq) {
1011 1014                  kthread_t       *rp;
1012 1015  
1013 1016                  ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1014 1017  
1015 1018                  for (rp = dq->dq_first; rp; rp = rp->t_link)
1016 1019                          if (tp == rp) {
1017 1020                                  disp_lock_exit_high(&dp->disp_lock);
1018 1021                                  return (1);
1019 1022                          }
1020 1023          }
1021 1024          disp_lock_exit_high(&dp->disp_lock);
1022 1025  
1023 1026          return (0);
1024 1027  }
1025 1028  
1026 1029  /*
1027 1030   * thread_on_queue()
1028 1031   *      Search all per-CPU dispatch queues and all partition-wide kpreempt
1029 1032   *      queues for thread tp. Return 1 if tp is found, otherwise return 0.
1030 1033   */
1031 1034  static int
1032 1035  thread_on_queue(kthread_t *tp)
1033 1036  {
1034 1037          cpu_t           *cp;
1035 1038          struct cpupart  *part;
1036 1039  
1037 1040          ASSERT(getpil() >= DISP_LEVEL);
1038 1041  
1039 1042          /*
1040 1043           * Search the per-CPU dispatch queues for tp.
1041 1044           */
1042 1045          cp = CPU;
1043 1046          do {
1044 1047                  if (search_disp_queues(cp->cpu_disp, tp))
1045 1048                          return (1);
1046 1049          } while ((cp = cp->cpu_next_onln) != CPU);
1047 1050  
1048 1051          /*
1049 1052           * Search the partition-wide kpreempt queues for tp.
1050 1053           */
1051 1054          part = CPU->cpu_part;
1052 1055          do {
1053 1056                  if (search_disp_queues(&part->cp_kp_queue, tp))
1054 1057                          return (1);
1055 1058          } while ((part = part->cp_next) != CPU->cpu_part);
1056 1059  
1057 1060          return (0);
1058 1061  }
1059 1062  
1060 1063  #else
1061 1064  
1062 1065  #define thread_on_queue(tp)     0       /* ASSERT must be !thread_on_queue */
1063 1066  
1064 1067  #endif  /* DEBUG */
1065 1068  
1066 1069  /*
1067 1070   * like swtch(), but switch to a specified thread taken from another CPU.
1068 1071   *      called with spl high..
1069 1072   */
1070 1073  void
1071 1074  swtch_to(kthread_t *next)
1072 1075  {
1073 1076          cpu_t                   *cp = CPU;
1074 1077          hrtime_t                now;
1075 1078  
1076 1079          TRACE_0(TR_FAC_DISP, TR_SWTCH_START, "swtch_start");
1077 1080  
1078 1081          /*
1079 1082           * Update context switch statistics.
1080 1083           */
1081 1084          CPU_STATS_ADDQ(cp, sys, pswitch, 1);
1082 1085  
1083 1086          TRACE_0(TR_FAC_DISP, TR_RESUME_START, "resume_start");
1084 1087  
1085 1088          now = gethrtime_unscaled();
1086 1089          pg_ev_thread_swtch(cp, now, curthread, next);
1087 1090  
1088 1091          /* OK to steal anything left on run queue */
1089 1092          cp->cpu_disp_flags &= ~CPU_DISP_DONTSTEAL;
1090 1093  
1091 1094          /* record last execution time */
1092 1095          cp->cpu_last_swtch = curthread->t_disp_time = ddi_get_lbolt();
1093 1096  
1094 1097          /*
1095 1098           * If t was previously in the TS_ONPROC state, setfrontdq and setbackdq
1096 1099           * won't have set its t_waitrq.  Since we now finally know that we're
1097 1100           * switching away from this thread, set its t_waitrq if it is on a run
1098 1101           * queue.
1099 1102           */
1100 1103          if ((curthread->t_state == TS_RUN) && (curthread->t_waitrq == 0)) {
1101 1104                  curthread->t_waitrq = now;
1102 1105          }
1103 1106  
1104 1107          /* restore next thread to previously running microstate */
1105 1108          restore_mstate(next);
1106 1109  
1107 1110          if (dtrace_vtime_active)
1108 1111                  dtrace_vtime_switch(next);
1109 1112  
1110 1113          resume(next);
1111 1114          /*
1112 1115           * The TR_RESUME_END and TR_SWTCH_END trace points
1113 1116           * appear at the end of resume(), because we may not
1114 1117           * return here
1115 1118           */
1116 1119  }
1117 1120  
1118 1121  #define CPU_IDLING(pri) ((pri) == -1)
1119 1122  
1120 1123  static void
1121 1124  cpu_resched(cpu_t *cp, pri_t tpri)
1122 1125  {
1123 1126          int     call_poke_cpu = 0;
1124 1127          pri_t   cpupri = cp->cpu_dispatch_pri;
1125 1128  
1126 1129          if (!CPU_IDLING(cpupri) && (cpupri < tpri)) {
1127 1130                  TRACE_2(TR_FAC_DISP, TR_CPU_RESCHED,
1128 1131                      "CPU_RESCHED:Tpri %d Cpupri %d", tpri, cpupri);
1129 1132                  if (tpri >= upreemptpri && cp->cpu_runrun == 0) {
1130 1133                          cp->cpu_runrun = 1;
1131 1134                          aston(cp->cpu_dispthread);
1132 1135                          if (tpri < kpreemptpri && cp != CPU)
1133 1136                                  call_poke_cpu = 1;
1134 1137                  }
1135 1138                  if (tpri >= kpreemptpri && cp->cpu_kprunrun == 0) {
1136 1139                          cp->cpu_kprunrun = 1;
1137 1140                          if (cp != CPU)
1138 1141                                  call_poke_cpu = 1;
1139 1142                  }
1140 1143          }
1141 1144  
1142 1145          /*
1143 1146           * Propagate cpu_runrun, and cpu_kprunrun to global visibility.
1144 1147           */
1145 1148          membar_enter();
1146 1149  
1147 1150          if (call_poke_cpu)
1148 1151                  poke_cpu(cp->cpu_id);
1149 1152  }
1150 1153  
1151 1154  /*
1152 1155   * setbackdq() keeps runqs balanced such that the difference in length
1153 1156   * between the chosen runq and the next one is no more than RUNQ_MAX_DIFF.
1154 1157   * For threads with priorities below RUNQ_MATCH_PRI levels, the runq's lengths
1155 1158   * must match.  When per-thread TS_RUNQMATCH flag is set, setbackdq() will
1156 1159   * try to keep runqs perfectly balanced regardless of the thread priority.
1157 1160   */
1158 1161  #define RUNQ_MATCH_PRI  16      /* pri below which queue lengths must match */
1159 1162  #define RUNQ_MAX_DIFF   2       /* maximum runq length difference */
1160 1163  #define RUNQ_LEN(cp, pri)       ((cp)->cpu_disp->disp_q[pri].dq_sruncnt)
1161 1164  
1162 1165  /*
1163 1166   * Macro that evaluates to true if it is likely that the thread has cache
1164 1167   * warmth. This is based on the amount of time that has elapsed since the
1165 1168   * thread last ran. If that amount of time is less than "rechoose_interval"
1166 1169   * ticks, then we decide that the thread has enough cache warmth to warrant
1167 1170   * some affinity for t->t_cpu.
1168 1171   */
1169 1172  #define THREAD_HAS_CACHE_WARMTH(thread) \
1170 1173          ((thread == curthread) ||       \
1171 1174          ((ddi_get_lbolt() - thread->t_disp_time) <= rechoose_interval))
1172 1175  /*
1173 1176   * Put the specified thread on the back of the dispatcher
1174 1177   * queue corresponding to its current priority.
1175 1178   *
1176 1179   * Called with the thread in transition, onproc or stopped state
1177 1180   * and locked (transition implies locked) and at high spl.
1178 1181   * Returns with the thread in TS_RUN state and still locked.
1179 1182   */
1180 1183  void
1181 1184  setbackdq(kthread_t *tp)
1182 1185  {
1183 1186          dispq_t *dq;
1184 1187          disp_t          *dp;
1185 1188          cpu_t           *cp;
1186 1189          pri_t           tpri;
1187 1190          int             bound;
1188 1191          boolean_t       self;
1189 1192  
1190 1193          ASSERT(THREAD_LOCK_HELD(tp));
1191 1194          ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1192 1195          ASSERT(!thread_on_queue(tp));   /* make sure tp isn't on a runq */
1193 1196  
1194 1197          /*
1195 1198           * If thread is "swapped" or on the swap queue don't
1196 1199           * queue it, but wake sched.
1197 1200           */
1198 1201          if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1199 1202                  disp_swapped_setrun(tp);
1200 1203                  return;
1201 1204          }
1202 1205  
1203 1206          self = (tp == curthread);
1204 1207  
1205 1208          if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1206 1209                  bound = 1;
1207 1210          else
1208 1211                  bound = 0;
1209 1212  
1210 1213          tpri = DISP_PRIO(tp);
1211 1214          if (ncpus == 1)
1212 1215                  cp = tp->t_cpu;
1213 1216          else if (!bound) {
1214 1217                  if (tpri >= kpqpri) {
1215 1218                          setkpdq(tp, SETKP_BACK);
1216 1219                          return;
1217 1220                  }
1218 1221  
1219 1222                  /*
1220 1223                   * We'll generally let this thread continue to run where
1221 1224                   * it last ran...but will consider migration if:
1222 1225                   * - We thread probably doesn't have much cache warmth.
1223 1226                   * - The CPU where it last ran is the target of an offline
1224 1227                   *   request.
1225 1228                   * - The thread last ran outside it's home lgroup.
1226 1229                   */
1227 1230                  if ((!THREAD_HAS_CACHE_WARMTH(tp)) ||
1228 1231                      (tp->t_cpu == cpu_inmotion)) {
1229 1232                          cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri, NULL);
1230 1233                  } else if (!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, tp->t_cpu)) {
1231 1234                          cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1232 1235                              self ? tp->t_cpu : NULL);
1233 1236                  } else {
1234 1237                          cp = tp->t_cpu;
1235 1238                  }
1236 1239  
1237 1240                  if (tp->t_cpupart == cp->cpu_part) {
1238 1241                          int     qlen;
1239 1242  
1240 1243                          /*
1241 1244                           * Perform any CMT load balancing
1242 1245                           */
1243 1246                          cp = cmt_balance(tp, cp);
1244 1247  
1245 1248                          /*
1246 1249                           * Balance across the run queues
1247 1250                           */
1248 1251                          qlen = RUNQ_LEN(cp, tpri);
1249 1252                          if (tpri >= RUNQ_MATCH_PRI &&
1250 1253                              !(tp->t_schedflag & TS_RUNQMATCH))
1251 1254                                  qlen -= RUNQ_MAX_DIFF;
1252 1255                          if (qlen > 0) {
1253 1256                                  cpu_t *newcp;
1254 1257  
1255 1258                                  if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) {
1256 1259                                          newcp = cp->cpu_next_part;
1257 1260                                  } else if ((newcp = cp->cpu_next_lpl) == cp) {
1258 1261                                          newcp = cp->cpu_next_part;
1259 1262                                  }
1260 1263  
1261 1264                                  if (RUNQ_LEN(newcp, tpri) < qlen) {
1262 1265                                          DTRACE_PROBE3(runq__balance,
1263 1266                                              kthread_t *, tp,
1264 1267                                              cpu_t *, cp, cpu_t *, newcp);
1265 1268                                          cp = newcp;
1266 1269                                  }
1267 1270                          }
1268 1271                  } else {
1269 1272                          /*
1270 1273                           * Migrate to a cpu in the new partition.
1271 1274                           */
1272 1275                          cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1273 1276                              tp->t_lpl, tp->t_pri, NULL);
1274 1277                  }
1275 1278                  ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1276 1279          } else {
1277 1280                  /*
1278 1281                   * It is possible that t_weakbound_cpu != t_bound_cpu (for
1279 1282                   * a short time until weak binding that existed when the
1280 1283                   * strong binding was established has dropped) so we must
1281 1284                   * favour weak binding over strong.
1282 1285                   */
1283 1286                  cp = tp->t_weakbound_cpu ?
1284 1287                      tp->t_weakbound_cpu : tp->t_bound_cpu;
1285 1288          }
1286 1289          /*
1287 1290           * A thread that is ONPROC may be temporarily placed on the run queue
1288 1291           * but then chosen to run again by disp.  If the thread we're placing on
1289 1292           * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1290 1293           * replacement process is actually scheduled in swtch().  In this
1291 1294           * situation, curthread is the only thread that could be in the ONPROC
1292 1295           * state.
1293 1296           */
1294 1297          if ((!self) && (tp->t_waitrq == 0)) {
1295 1298                  hrtime_t curtime;
1296 1299  
1297 1300                  curtime = gethrtime_unscaled();
1298 1301                  (void) cpu_update_pct(tp, curtime);
1299 1302                  tp->t_waitrq = curtime;
1300 1303          } else {
1301 1304                  (void) cpu_update_pct(tp, gethrtime_unscaled());
1302 1305          }
1303 1306  
1304 1307          dp = cp->cpu_disp;
1305 1308          disp_lock_enter_high(&dp->disp_lock);
1306 1309  
1307 1310          DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 0);
1308 1311          TRACE_3(TR_FAC_DISP, TR_BACKQ, "setbackdq:pri %d cpu %p tid %p",
1309 1312              tpri, cp, tp);
1310 1313  
1311 1314  #ifndef NPROBE
1312 1315          /* Kernel probe */
1313 1316          if (tnf_tracing_active)
1314 1317                  tnf_thread_queue(tp, cp, tpri);
1315 1318  #endif /* NPROBE */
1316 1319  
1317 1320          ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1318 1321  
1319 1322          THREAD_RUN(tp, &dp->disp_lock);         /* set t_state to TS_RUN */
1320 1323          tp->t_disp_queue = dp;
1321 1324          tp->t_link = NULL;
1322 1325  
1323 1326          dq = &dp->disp_q[tpri];
1324 1327          dp->disp_nrunnable++;
1325 1328          if (!bound)
1326 1329                  dp->disp_steal = 0;
1327 1330          membar_enter();
1328 1331  
1329 1332          if (dq->dq_sruncnt++ != 0) {
1330 1333                  ASSERT(dq->dq_first != NULL);
1331 1334                  dq->dq_last->t_link = tp;
1332 1335                  dq->dq_last = tp;
1333 1336          } else {
1334 1337                  ASSERT(dq->dq_first == NULL);
1335 1338                  ASSERT(dq->dq_last == NULL);
1336 1339                  dq->dq_first = dq->dq_last = tp;
1337 1340                  BT_SET(dp->disp_qactmap, tpri);
1338 1341                  if (tpri > dp->disp_maxrunpri) {
1339 1342                          dp->disp_maxrunpri = tpri;
1340 1343                          membar_enter();
1341 1344                          cpu_resched(cp, tpri);
1342 1345                  }
1343 1346          }
1344 1347  
1345 1348          if (!bound && tpri > dp->disp_max_unbound_pri) {
1346 1349                  if (self && dp->disp_max_unbound_pri == -1 && cp == CPU) {
1347 1350                          /*
1348 1351                           * If there are no other unbound threads on the
1349 1352                           * run queue, don't allow other CPUs to steal
1350 1353                           * this thread while we are in the middle of a
1351 1354                           * context switch. We may just switch to it
1352 1355                           * again right away. CPU_DISP_DONTSTEAL is cleared
1353 1356                           * in swtch and swtch_to.
1354 1357                           */
1355 1358                          cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1356 1359                  }
1357 1360                  dp->disp_max_unbound_pri = tpri;
1358 1361          }
1359 1362          (*disp_enq_thread)(cp, bound);
1360 1363  }
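
For illustration, here is a minimal userland sketch of the tail insertion that setbackdq() performs on a per-priority queue, together with the active-priority bitmap and the cached maximum runnable priority. The toy_* names, the 64-priority size, and the standalone main() are assumptions for this sketch only and are not part of disp.c.

    #include <stdio.h>
    #include <stddef.h>
    #include <stdint.h>

    #define TOY_NPRI        64              /* priorities 0 .. 63 */

    typedef struct toy_thread {
            struct toy_thread *t_link;      /* next thread at the same priority */
            int                t_pri;
    } toy_thread_t;

    typedef struct toy_dispq {
            toy_thread_t *dq_first;
            toy_thread_t *dq_last;
    } toy_dispq_t;

    typedef struct toy_disp {
            toy_dispq_t disp_q[TOY_NPRI];
            uint64_t    disp_qactmap;       /* bit i set => disp_q[i] non-empty */
            int         disp_maxrunpri;     /* highest runnable priority, or -1 */
    } toy_disp_t;

    static void
    toy_setbackdq(toy_disp_t *dp, toy_thread_t *tp)
    {
            toy_dispq_t *dq = &dp->disp_q[tp->t_pri];

            tp->t_link = NULL;
            if (dq->dq_first == NULL) {
                    /* Queue was empty: mark the priority active. */
                    dq->dq_first = dq->dq_last = tp;
                    dp->disp_qactmap |= UINT64_C(1) << tp->t_pri;
                    if (tp->t_pri > dp->disp_maxrunpri)
                            dp->disp_maxrunpri = tp->t_pri;
            } else {
                    /* Append at the tail, keeping FIFO order per priority. */
                    dq->dq_last->t_link = tp;
                    dq->dq_last = tp;
            }
    }

    int
    main(void)
    {
            toy_disp_t dp = { .disp_maxrunpri = -1 };
            toy_thread_t a = { NULL, 10 }, b = { NULL, 10 }, c = { NULL, 3 };

            toy_setbackdq(&dp, &a);
            toy_setbackdq(&dp, &b);
            toy_setbackdq(&dp, &c);
            printf("maxrunpri %d, pri 10 head==a: %d\n",
                dp.disp_maxrunpri, dp.disp_q[10].dq_first == &a);
            return (0);
    }
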
1361 1364  
1362 1365  /*
1363 1366   * Put the specified thread on the front of the dispatcher
1364 1367   * queue corresponding to its current priority.
1365 1368   *
1366 1369   * Called with the thread in transition, onproc or stopped state
1367 1370   * and locked (transition implies locked) and at high spl.
1368 1371   * Returns with the thread in TS_RUN state and still locked.
1369 1372   */
1370 1373  void
1371 1374  setfrontdq(kthread_t *tp)
1372 1375  {
1373 1376          disp_t          *dp;
1374 1377          dispq_t         *dq;
1375 1378          cpu_t           *cp;
1376 1379          pri_t           tpri;
1377 1380          int             bound;
1378 1381  
1379 1382          ASSERT(THREAD_LOCK_HELD(tp));
1380 1383          ASSERT((tp->t_schedflag & TS_ALLSTART) == 0);
1381 1384          ASSERT(!thread_on_queue(tp));   /* make sure tp isn't on a runq */
1382 1385  
1383 1386          /*
1384 1387           * If thread is "swapped" or on the swap queue don't
1385 1388           * queue it, but wake sched.
1386 1389           */
1387 1390          if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD) {
1388 1391                  disp_swapped_setrun(tp);
1389 1392                  return;
1390 1393          }
1391 1394  
1392 1395          if (tp->t_bound_cpu || tp->t_weakbound_cpu)
1393 1396                  bound = 1;
1394 1397          else
1395 1398                  bound = 0;
1396 1399  
1397 1400          tpri = DISP_PRIO(tp);
1398 1401          if (ncpus == 1)
1399 1402                  cp = tp->t_cpu;
1400 1403          else if (!bound) {
1401 1404                  if (tpri >= kpqpri) {
1402 1405                          setkpdq(tp, SETKP_FRONT);
1403 1406                          return;
1404 1407                  }
1405 1408                  cp = tp->t_cpu;
1406 1409                  if (tp->t_cpupart == cp->cpu_part) {
1407 1410                          /*
1408 1411                           * We'll generally let this thread continue to run
1409 1412                           * where it last ran, but will consider migration if:
1410 1413                           * - The thread last ran outside its home lgroup.
1411 1414                           * - The CPU where it last ran is the target of an
1412 1415                           *   offline request (a thread_nomigrate() on the in
1413 1416                           *   motion CPU relies on this when forcing a preempt).
1414 1417                           * - The thread isn't the highest priority thread where
1415 1418                           *   it last ran, and it is considered not likely to
1416 1419                           *   have significant cache warmth.
1417 1420                           */
1418 1421                          if ((!LGRP_CONTAINS_CPU(tp->t_lpl->lpl_lgrp, cp)) ||
1419 1422                              (cp == cpu_inmotion)) {
1420 1423                                  cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1421 1424                                      (tp == curthread) ? cp : NULL);
1422 1425                          } else if ((tpri < cp->cpu_disp->disp_maxrunpri) &&
1423 1426                              (!THREAD_HAS_CACHE_WARMTH(tp))) {
1424 1427                                  cp = disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, tpri,
1425 1428                                      NULL);
1426 1429                          }
1427 1430                  } else {
1428 1431                          /*
1429 1432                           * Migrate to a cpu in the new partition.
1430 1433                           */
1431 1434                          cp = disp_lowpri_cpu(tp->t_cpupart->cp_cpulist,
1432 1435                              tp->t_lpl, tp->t_pri, NULL);
1433 1436                  }
1434 1437                  ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1435 1438          } else {
1436 1439                  /*
1437 1440                   * It is possible that t_weakbound_cpu != t_bound_cpu (for
1438 1441                   * a short time until weak binding that existed when the
1439 1442                   * strong binding was established has dropped) so we must
1440 1443                   * favour weak binding over strong.
1441 1444                   */
1442 1445                  cp = tp->t_weakbound_cpu ?
1443 1446                      tp->t_weakbound_cpu : tp->t_bound_cpu;
1444 1447          }
1445 1448  
1446 1449          /*
1447 1450           * A thread that is ONPROC may be temporarily placed on the run queue
1448 1451           * but then chosen to run again by disp.  If the thread we're placing on
1449 1452           * the queue is in TS_ONPROC state, don't set its t_waitrq until a
1450 1453           * replacement process is actually scheduled in swtch().  In this
1451 1454           * situation, curthread is the only thread that could be in the ONPROC
1452 1455           * state.
1453 1456           */
1454 1457          if ((tp != curthread) && (tp->t_waitrq == 0)) {
1455 1458                  hrtime_t curtime;
1456 1459  
1457 1460                  curtime = gethrtime_unscaled();
1458 1461                  (void) cpu_update_pct(tp, curtime);
1459 1462                  tp->t_waitrq = curtime;
1460 1463          } else {
1461 1464                  (void) cpu_update_pct(tp, gethrtime_unscaled());
1462 1465          }
1463 1466  
1464 1467          dp = cp->cpu_disp;
1465 1468          disp_lock_enter_high(&dp->disp_lock);
1466 1469  
1467 1470          TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1468 1471          DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, 1);
1469 1472  
1470 1473  #ifndef NPROBE
1471 1474          /* Kernel probe */
1472 1475          if (tnf_tracing_active)
1473 1476                  tnf_thread_queue(tp, cp, tpri);
1474 1477  #endif /* NPROBE */
1475 1478  
1476 1479          ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1477 1480  
1478 1481          THREAD_RUN(tp, &dp->disp_lock);         /* set TS_RUN state and lock */
1479 1482          tp->t_disp_queue = dp;
1480 1483  
1481 1484          dq = &dp->disp_q[tpri];
1482 1485          dp->disp_nrunnable++;
1483 1486          if (!bound)
1484 1487                  dp->disp_steal = 0;
1485 1488          membar_enter();
1486 1489  
1487 1490          if (dq->dq_sruncnt++ != 0) {
1488 1491                  ASSERT(dq->dq_last != NULL);
1489 1492                  tp->t_link = dq->dq_first;
1490 1493                  dq->dq_first = tp;
1491 1494          } else {
1492 1495                  ASSERT(dq->dq_last == NULL);
1493 1496                  ASSERT(dq->dq_first == NULL);
1494 1497                  tp->t_link = NULL;
1495 1498                  dq->dq_first = dq->dq_last = tp;
1496 1499                  BT_SET(dp->disp_qactmap, tpri);
1497 1500                  if (tpri > dp->disp_maxrunpri) {
1498 1501                          dp->disp_maxrunpri = tpri;
1499 1502                          membar_enter();
1500 1503                          cpu_resched(cp, tpri);
1501 1504                  }
1502 1505          }
1503 1506  
1504 1507          if (!bound && tpri > dp->disp_max_unbound_pri) {
1505 1508                  if (tp == curthread && dp->disp_max_unbound_pri == -1 &&
1506 1509                      cp == CPU) {
1507 1510                          /*
1508 1511                           * If there are no other unbound threads on the
1509 1512                           * run queue, don't allow other CPUs to steal
1510 1513                           * this thread while we are in the middle of a
1511 1514                           * context switch. We may just switch to it
1512 1515                           * again right away. CPU_DISP_DONTSTEAL is cleared
1513 1516                           * in swtch and swtch_to.
1514 1517                           */
1515 1518                          cp->cpu_disp_flags |= CPU_DISP_DONTSTEAL;
1516 1519                  }
1517 1520                  dp->disp_max_unbound_pri = tpri;
1518 1521          }
1519 1522          (*disp_enq_thread)(cp, bound);
1520 1523  }
1521 1524  
1522 1525  /*
1523 1526   * Put a high-priority unbound thread on the kp queue
1524 1527   */
1525 1528  static void
1526 1529  setkpdq(kthread_t *tp, int borf)
1527 1530  {
1528 1531          dispq_t *dq;
1529 1532          disp_t  *dp;
1530 1533          cpu_t   *cp;
1531 1534          pri_t   tpri;
1532 1535  
1533 1536          tpri = DISP_PRIO(tp);
1534 1537  
1535 1538          dp = &tp->t_cpupart->cp_kp_queue;
1536 1539          disp_lock_enter_high(&dp->disp_lock);
1537 1540  
1538 1541          TRACE_2(TR_FAC_DISP, TR_FRONTQ, "frontq:pri %d tid %p", tpri, tp);
1539 1542  
1540 1543          ASSERT(tpri >= 0 && tpri < dp->disp_npri);
1541 1544          DTRACE_SCHED3(enqueue, kthread_t *, tp, disp_t *, dp, int, borf);
1542 1545          THREAD_RUN(tp, &dp->disp_lock);         /* set t_state to TS_RUN */
1543 1546          tp->t_disp_queue = dp;
1544 1547          dp->disp_nrunnable++;
1545 1548          dq = &dp->disp_q[tpri];
1546 1549  
1547 1550          if (dq->dq_sruncnt++ != 0) {
1548 1551                  if (borf == SETKP_BACK) {
1549 1552                          ASSERT(dq->dq_first != NULL);
1550 1553                          tp->t_link = NULL;
1551 1554                          dq->dq_last->t_link = tp;
1552 1555                          dq->dq_last = tp;
1553 1556                  } else {
1554 1557                          ASSERT(dq->dq_last != NULL);
1555 1558                          tp->t_link = dq->dq_first;
1556 1559                          dq->dq_first = tp;
1557 1560                  }
1558 1561          } else {
1559 1562                  if (borf == SETKP_BACK) {
1560 1563                          ASSERT(dq->dq_first == NULL);
1561 1564                          ASSERT(dq->dq_last == NULL);
1562 1565                          dq->dq_first = dq->dq_last = tp;
1563 1566                  } else {
1564 1567                          ASSERT(dq->dq_last == NULL);
1565 1568                          ASSERT(dq->dq_first == NULL);
1566 1569                          tp->t_link = NULL;
1567 1570                          dq->dq_first = dq->dq_last = tp;
1568 1571                  }
1569 1572                  BT_SET(dp->disp_qactmap, tpri);
1570 1573                  if (tpri > dp->disp_max_unbound_pri)
1571 1574                          dp->disp_max_unbound_pri = tpri;
1572 1575                  if (tpri > dp->disp_maxrunpri) {
1573 1576                          dp->disp_maxrunpri = tpri;
1574 1577                          membar_enter();
1575 1578                  }
1576 1579          }
1577 1580  
1578 1581          cp = tp->t_cpu;
1579 1582          if (tp->t_cpupart != cp->cpu_part) {
1580 1583                  /* migrate to a cpu in the new partition */
1581 1584                  cp = tp->t_cpupart->cp_cpulist;
1582 1585          }
1583 1586          cp = disp_lowpri_cpu(cp, tp->t_lpl, tp->t_pri, NULL);
1584 1587          disp_lock_enter_high(&cp->cpu_disp->disp_lock);
1585 1588          ASSERT((cp->cpu_flags & CPU_QUIESCED) == 0);
1586 1589  
1587 1590  #ifndef NPROBE
1588 1591          /* Kernel probe */
1589 1592          if (tnf_tracing_active)
1590 1593                  tnf_thread_queue(tp, cp, tpri);
1591 1594  #endif /* NPROBE */
1592 1595  
1593 1596          if (cp->cpu_chosen_level < tpri)
1594 1597                  cp->cpu_chosen_level = tpri;
1595 1598          cpu_resched(cp, tpri);
1596 1599          disp_lock_exit_high(&cp->cpu_disp->disp_lock);
1597 1600          (*disp_enq_thread)(cp, 0);
1598 1601  }
1599 1602  
1600 1603  /*
1601 1604   * Remove a thread from the dispatcher queue if it is on it.
1602 1605   * It is not an error if it is not found, but we return whether
1603 1606   * or not it was found in case the caller wants to check.
1604 1607   */
1605 1608  int
1606 1609  dispdeq(kthread_t *tp)
1607 1610  {
1608 1611          disp_t          *dp;
1609 1612          dispq_t         *dq;
1610 1613          kthread_t       *rp;
1611 1614          kthread_t       *trp;
1612 1615          kthread_t       **ptp;
1613 1616          int             tpri;
1614 1617  
1615 1618          ASSERT(THREAD_LOCK_HELD(tp));
1616 1619  
1617 1620          if (tp->t_state != TS_RUN)
1618 1621                  return (0);
1619 1622  
1620 1623          /*
1621 1624           * The thread is "swapped" or is on the swap queue and
1622 1625           * hence no longer on the run queue, so return true.
1623 1626           */
1624 1627          if ((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD)
1625 1628                  return (1);
1626 1629  
1627 1630          tpri = DISP_PRIO(tp);
1628 1631          dp = tp->t_disp_queue;
1629 1632          ASSERT(tpri < dp->disp_npri);
1630 1633          dq = &dp->disp_q[tpri];
1631 1634          ptp = &dq->dq_first;
1632 1635          rp = *ptp;
1633 1636          trp = NULL;
1634 1637  
1635 1638          ASSERT(dq->dq_last == NULL || dq->dq_last->t_link == NULL);
1636 1639  
1637 1640          /*
1638 1641           * Search for thread in queue.
1639 1642           * Double links would simplify this at the expense of disp/setrun.
1640 1643           */
1641 1644          while (rp != tp && rp != NULL) {
1642 1645                  trp = rp;
1643 1646                  ptp = &trp->t_link;
1644 1647                  rp = trp->t_link;
1645 1648          }
1646 1649  
1647 1650          if (rp == NULL) {
1648 1651                  panic("dispdeq: thread not on queue");
1649 1652          }
1650 1653  
1651 1654          DTRACE_SCHED2(dequeue, kthread_t *, tp, disp_t *, dp);
1652 1655  
1653 1656          /*
1654 1657           * Found it so remove it from queue.
1655 1658           */
1656 1659          if ((*ptp = rp->t_link) == NULL)
1657 1660                  dq->dq_last = trp;
1658 1661  
1659 1662          dp->disp_nrunnable--;
1660 1663          if (--dq->dq_sruncnt == 0) {
1661 1664                  dp->disp_qactmap[tpri >> BT_ULSHIFT] &= ~BT_BIW(tpri);
1662 1665                  if (dp->disp_nrunnable == 0) {
1663 1666                          dp->disp_max_unbound_pri = -1;
1664 1667                          dp->disp_maxrunpri = -1;
1665 1668                  } else if (tpri == dp->disp_maxrunpri) {
1666 1669                          int ipri;
1667 1670  
1668 1671                          ipri = bt_gethighbit(dp->disp_qactmap,
1669 1672                              dp->disp_maxrunpri >> BT_ULSHIFT);
1670 1673                          if (ipri < dp->disp_max_unbound_pri)
1671 1674                                  dp->disp_max_unbound_pri = ipri;
1672 1675                          dp->disp_maxrunpri = ipri;
1673 1676                  }
1674 1677          }
1675 1678          tp->t_link = NULL;
1676 1679          THREAD_TRANSITION(tp);          /* put in intermediate state */
1677 1680          return (1);
1678 1681  }
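
The removal above walks the singly linked queue while keeping a pointer to the previous link so the found thread can be unlinked and the tail pointer repaired. As a standalone illustration of the same pattern (hypothetical toy_* names, not the kernel's types):

    #include <stddef.h>

    typedef struct toy_node {
            struct toy_node *t_link;
    } toy_node_t;

    typedef struct toy_dq {
            toy_node_t *dq_first;
            toy_node_t *dq_last;
    } toy_dq_t;

    /*
     * Remove np from the queue if present.  Returns 1 if it was found and
     * unlinked, 0 otherwise.  Mirrors the ptp/trp walk in dispdeq().
     */
    static int
    toy_dispdeq(toy_dq_t *dq, toy_node_t *np)
    {
            toy_node_t **ptp = &dq->dq_first;  /* link pointing at candidate */
            toy_node_t  *trp = NULL;           /* trailing (previous) node */
            toy_node_t  *rp = dq->dq_first;

            while (rp != NULL && rp != np) {
                    trp = rp;
                    ptp = &trp->t_link;
                    rp = trp->t_link;
            }
            if (rp == NULL)
                    return (0);

            /* Unlink; if np was the tail, the tail becomes its predecessor. */
            if ((*ptp = rp->t_link) == NULL)
                    dq->dq_last = trp;
            rp->t_link = NULL;
            return (1);
    }
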
1679 1682  
1680 1683  
1681 1684  /*
1682 1685   * dq_sruninc and dq_srundec are public functions for
1683 1686   * incrementing/decrementing the sruncnts when a thread on
1684 1687   * a dispatcher queue is made schedulable/unschedulable by
1685 1688   * resetting the TS_LOAD flag.
1686 1689   *
1687 1690   * The caller MUST have the thread lock and therefore the dispatcher
1688 1691   * queue lock so that the operation which changes
1689 1692   * the flag, the operation that checks the status of the thread to
1690 1693   * determine if it's on a disp queue AND the call to this function
1691 1694   * are one atomic operation with respect to interrupts.
1692 1695   */
1693 1696  
1694 1697  /*
1695 1698   * Called by sched AFTER TS_LOAD flag is set on a swapped, runnable thread.
1696 1699   */
1697 1700  void
1698 1701  dq_sruninc(kthread_t *t)
1699 1702  {
1700 1703          ASSERT(t->t_state == TS_RUN);
1701 1704          ASSERT(t->t_schedflag & TS_LOAD);
1702 1705  
1703 1706          THREAD_TRANSITION(t);
1704 1707          setfrontdq(t);
1705 1708  }
1706 1709  
1707 1710  /*
1708 1711   * See comment on calling conventions above.
1709 1712   * Called by sched BEFORE TS_LOAD flag is cleared on a runnable thread.
1710 1713   */
1711 1714  void
1712 1715  dq_srundec(kthread_t *t)
1713 1716  {
1714 1717          ASSERT(t->t_schedflag & TS_LOAD);
1715 1718  
1716 1719          (void) dispdeq(t);
1717 1720          disp_swapped_enq(t);
1718 1721  }
1719 1722  
1720 1723  /*
1721 1724   * Change the dispatcher lock of thread to the "swapped_lock"
1722 1725   * and return with thread lock still held.
1723 1726   *
1724 1727   * Called with thread_lock held, in transition state, and at high spl.
1725 1728   */
1726 1729  void
1727 1730  disp_swapped_enq(kthread_t *tp)
1728 1731  {
1729 1732          ASSERT(THREAD_LOCK_HELD(tp));
1730 1733          ASSERT(tp->t_schedflag & TS_LOAD);
1731 1734  
1732 1735          switch (tp->t_state) {
1733 1736          case TS_RUN:
1734 1737                  disp_lock_enter_high(&swapped_lock);
1735 1738                  THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */
1736 1739                  break;
1737 1740          case TS_ONPROC:
1738 1741                  disp_lock_enter_high(&swapped_lock);
1739 1742                  THREAD_TRANSITION(tp);
1740 1743                  wake_sched_sec = 1;             /* tell clock to wake sched */
1741 1744                  THREAD_SWAP(tp, &swapped_lock); /* set TS_RUN state and lock */
1742 1745                  break;
1743 1746          default:
1744 1747                  panic("disp_swapped: tp: %p bad t_state", (void *)tp);
1745 1748          }
1746 1749  }
1747 1750  
1748 1751  /*
1749 1752   * This routine is called by setbackdq/setfrontdq if the thread is
1750 1753   * not loaded or loaded and on the swap queue.
1751 1754   *
1752 1755   * Thread state TS_SLEEP implies that a swapped thread
1753 1756   * has been woken up and needs to be swapped in by the swapper.
1754 1757   *
1755 1758   * Thread state TS_RUN implies that the priority of a swapped
1756 1759   * thread is being increased by the scheduling class (e.g. ts_update).
1757 1760   */
1758 1761  static void
1759 1762  disp_swapped_setrun(kthread_t *tp)
1760 1763  {
1761 1764          ASSERT(THREAD_LOCK_HELD(tp));
1762 1765          ASSERT((tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ)) != TS_LOAD);
1763 1766  
1764 1767          switch (tp->t_state) {
1765 1768          case TS_SLEEP:
1766 1769                  disp_lock_enter_high(&swapped_lock);
1767 1770                  /*
1768 1771                   * Wakeup sched immediately (i.e., next tick) if the
1769 1772                   * thread priority is above maxclsyspri.
1770 1773                   */
1771 1774                  if (DISP_PRIO(tp) > maxclsyspri)
1772 1775                          wake_sched = 1;
1773 1776                  else
1774 1777                          wake_sched_sec = 1;
1775 1778                  THREAD_RUN(tp, &swapped_lock); /* set TS_RUN state and lock */
1776 1779                  break;
1777 1780          case TS_RUN:                            /* called from ts_update */
1778 1781                  break;
1779 1782          default:
1780 1783                  panic("disp_swapped_setrun: tp: %p bad t_state", (void *)tp);
1781 1784          }
1782 1785  }
1783 1786  
1784 1787  /*
1785 1788   *      Make a thread give up its processor.  Find the processor on
1786 1789   *      which this thread is executing, and have that processor
1787 1790   *      preempt.
1788 1791   *
1789 1792   *      We allow System Duty Cycle (SDC) threads to be preempted even if
1790 1793   *      they are running at kernel priorities.  To implement this, we always
1791 1794   *      set cpu_kprunrun; this ensures preempt() will be called.  Since SDC
1792 1795   *      calls cpu_surrender() very often, we only preempt if there is anyone
1793 1796   *      competing with us.
1794 1797   */
1795 1798  void
1796 1799  cpu_surrender(kthread_t *tp)
1797 1800  {
1798 1801          cpu_t   *cpup;
1799 1802          int     max_pri;
1800 1803          int     max_run_pri;
1801 1804          klwp_t  *lwp;
1802 1805  
1803 1806          ASSERT(THREAD_LOCK_HELD(tp));
1804 1807  
1805 1808          if (tp->t_state != TS_ONPROC)
1806 1809                  return;
1807 1810          cpup = tp->t_disp_queue->disp_cpu;      /* CPU thread dispatched to */
1808 1811          max_pri = cpup->cpu_disp->disp_maxrunpri; /* best pri of that CPU */
1809 1812          max_run_pri = CP_MAXRUNPRI(cpup->cpu_part);
1810 1813          if (max_pri < max_run_pri)
1811 1814                  max_pri = max_run_pri;
1812 1815  
1813 1816          if (tp->t_cid == sysdccid) {
1814 1817                  uint_t t_pri = DISP_PRIO(tp);
1815 1818                  if (t_pri > max_pri)
1816 1819                          return;         /* we are not competing w/ anyone */
1817 1820                  cpup->cpu_runrun = cpup->cpu_kprunrun = 1;
1818 1821          } else {
1819 1822                  cpup->cpu_runrun = 1;
1820 1823                  if (max_pri >= kpreemptpri && cpup->cpu_kprunrun == 0) {
1821 1824                          cpup->cpu_kprunrun = 1;
1822 1825                  }
1823 1826          }
1824 1827  
1825 1828          /*
1826 1829           * Propagate cpu_runrun and cpu_kprunrun to global visibility.
1827 1830           */
1828 1831          membar_enter();
1829 1832  
1830 1833          DTRACE_SCHED1(surrender, kthread_t *, tp);
1831 1834  
1832 1835          /*
1833 1836           * Make the target thread take an excursion through trap()
1834 1837           * to do preempt() (unless we're already in trap or post_syscall,
1835 1838           * calling cpu_surrender via CL_TRAPRET).
1836 1839           */
1837 1840          if (tp != curthread || (lwp = tp->t_lwp) == NULL ||
1838 1841              lwp->lwp_state != LWP_USER) {
1839 1842                  aston(tp);
1840 1843                  if (cpup != CPU)
1841 1844                          poke_cpu(cpup->cpu_id);
1842 1845          }
1843 1846          TRACE_2(TR_FAC_DISP, TR_CPU_SURRENDER,
1844 1847              "cpu_surrender:tid %p cpu %p", tp, cpup);
1845 1848  }
1846 1849  
1847 1850  /*
1848 1851   * Commit to and ratify a scheduling decision
1849 1852   */
1850 1853  /*ARGSUSED*/
1851 1854  static kthread_t *
1852 1855  disp_ratify(kthread_t *tp, disp_t *kpq)
1853 1856  {
1854 1857          pri_t   tpri, maxpri;
1855 1858          pri_t   maxkpri;
1856 1859          cpu_t   *cpup;
1857 1860  
1858 1861          ASSERT(tp != NULL);
1859 1862          /*
1860 1863           * Commit to, then ratify scheduling decision
1861 1864           */
1862 1865          cpup = CPU;
1863 1866          if (cpup->cpu_runrun != 0)
1864 1867                  cpup->cpu_runrun = 0;
1865 1868          if (cpup->cpu_kprunrun != 0)
1866 1869                  cpup->cpu_kprunrun = 0;
1867 1870          if (cpup->cpu_chosen_level != -1)
1868 1871                  cpup->cpu_chosen_level = -1;
1869 1872          membar_enter();
1870 1873          tpri = DISP_PRIO(tp);
1871 1874          maxpri = cpup->cpu_disp->disp_maxrunpri;
1872 1875          maxkpri = kpq->disp_maxrunpri;
1873 1876          if (maxpri < maxkpri)
1874 1877                  maxpri = maxkpri;
1875 1878          if (tpri < maxpri) {
1876 1879                  /*
1877 1880                   * should have done better
1878 1881                   * put this one back and indicate to try again
1879 1882                   */
1880 1883                  cpup->cpu_dispthread = curthread;       /* fixup dispthread */
1881 1884                  cpup->cpu_dispatch_pri = DISP_PRIO(curthread);
1882 1885                  thread_lock_high(tp);
1883 1886                  THREAD_TRANSITION(tp);
1884 1887                  setfrontdq(tp);
1885 1888                  thread_unlock_nopreempt(tp);
1886 1889  
1887 1890                  tp = NULL;
1888 1891          }
1889 1892          return (tp);
1890 1893  }
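
The ratify step amounts to one comparison: keep the chosen thread only if nothing of higher priority has appeared on the local queue or the kernel-preemption queue in the meantime. A small hedged sketch of that check, using hypothetical names rather than the kernel's structures:

    /*
     * Returns 1 if a thread of priority tpri should keep the CPU, 0 if it
     * should be requeued because a better candidate appeared.
     */
    static int
    toy_ratify(int tpri, int local_maxrunpri, int kp_maxrunpri)
    {
            int maxpri = local_maxrunpri;

            if (maxpri < kp_maxrunpri)
                    maxpri = kp_maxrunpri;
            return (tpri >= maxpri);
    }
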
1891 1894  
1892 1895  /*
1893 1896   * See if there is any work on the dispatcher queue for other CPUs.
1894 1897   * If there is, dequeue the best thread and return.
1895 1898   */
1896 1899  static kthread_t *
1897 1900  disp_getwork(cpu_t *cp)
1898 1901  {
1899 1902          cpu_t           *ocp;           /* other CPU */
1900 1903          cpu_t           *ocp_start;
1901 1904          cpu_t           *tcp;           /* target local CPU */
1902 1905          kthread_t       *tp;
1903 1906          kthread_t       *retval = NULL;
1904 1907          pri_t           maxpri;
1905 1908          disp_t          *kpq;           /* kp queue for this partition */
1906 1909          lpl_t           *lpl, *lpl_leaf;
1907 1910          int             leafidx, startidx;
1908 1911          hrtime_t        stealtime;
1909 1912          lgrp_id_t       local_id;
1910 1913  
1911 1914          maxpri = -1;
1912 1915          tcp = NULL;
1913 1916  
1914 1917          kpq = &cp->cpu_part->cp_kp_queue;
1915 1918          while (kpq->disp_maxrunpri >= 0) {
1916 1919                  /*
1917 1920                   * Try to take a thread from the kp_queue.
1918 1921                   */
1919 1922                  tp = (disp_getbest(kpq));
1920 1923                  if (tp)
1921 1924                          return (disp_ratify(tp, kpq));
1922 1925          }
1923 1926  
1924 1927          kpreempt_disable();             /* protect the cpu_active list */
1925 1928  
1926 1929          /*
1927 1930           * Try to find something to do on another CPU's run queue.
1928 1931           * Loop through all other CPUs looking for the one with the highest
1929 1932           * priority unbound thread.
1930 1933           *
1931 1934           * On NUMA machines, the partition's CPUs are consulted in order of
1932 1935           * distance from the current CPU. This way, the first available
1933 1936           * work found is also the closest, and will suffer the least
1934 1937           * from being migrated.
1935 1938           */
1936 1939          lpl = lpl_leaf = cp->cpu_lpl;
1937 1940          local_id = lpl_leaf->lpl_lgrpid;
1938 1941          leafidx = startidx = 0;
1939 1942  
1940 1943          /*
1941 1944           * This loop traverses the lpl hierarchy. Higher level lpls represent
1942 1945           * broader levels of locality
1943 1946           */
1944 1947          do {
1945 1948                  /* This loop iterates over the lpl's leaves */
1946 1949                  do {
1947 1950                          if (lpl_leaf != cp->cpu_lpl)
1948 1951                                  ocp = lpl_leaf->lpl_cpus;
1949 1952                          else
1950 1953                                  ocp = cp->cpu_next_lpl;
1951 1954  
1952 1955                          /* This loop iterates over the CPUs in the leaf */
1953 1956                          ocp_start = ocp;
1954 1957                          do {
1955 1958                                  pri_t pri;
1956 1959  
1957 1960                                  ASSERT(CPU_ACTIVE(ocp));
1958 1961  
1959 1962                                  /*
1960 1963                                   * End our stroll around this lpl if:
1961 1964                                   *
1962 1965                                   * - Something became runnable on the local
1963 1966                                   *   queue...which also ends our stroll around
1964 1967                                   *   the partition.
1965 1968                                   *
1966 1969                                   * - We happen across another idle CPU.
1967 1970                                   *   Since it is patrolling the next portion
1968 1971                                   *   of the lpl's list (assuming it's not
1969 1972                                   *   halted, or busy servicing an interrupt),
1970 1973                                   *   move to the next higher level of locality.
1971 1974                                   */
1972 1975                                  if (cp->cpu_disp->disp_nrunnable != 0) {
1973 1976                                          kpreempt_enable();
1974 1977                                          return (NULL);
1975 1978                                  }
1976 1979                                  if (ocp->cpu_dispatch_pri == -1) {
1977 1980                                          if (ocp->cpu_disp_flags &
1978 1981                                              CPU_DISP_HALTED ||
1979 1982                                              ocp->cpu_intr_actv != 0)
1980 1983                                                  continue;
1981 1984                                          else
1982 1985                                                  goto next_level;
1983 1986                                  }
1984 1987  
1985 1988                                  /*
1986 1989                                   * If there's only one thread and the CPU
1987 1990                                   * is in the middle of a context switch,
1988 1991                                   * or it's currently running the idle thread,
1989 1992                                   * don't steal it.
1990 1993                                   */
1991 1994                                  if ((ocp->cpu_disp_flags &
1992 1995                                      CPU_DISP_DONTSTEAL) &&
1993 1996                                      ocp->cpu_disp->disp_nrunnable == 1)
1994 1997                                          continue;
1995 1998  
1996 1999                                  pri = ocp->cpu_disp->disp_max_unbound_pri;
1997 2000                                  if (pri > maxpri) {
1998 2001                                          /*
1999 2002                                           * Don't steal threads that we attempted
2000 2003                                           * to steal recently until they're ready
2001 2004                                           * to be stolen again.
2002 2005                                           */
2003 2006                                          stealtime = ocp->cpu_disp->disp_steal;
2004 2007                                          if (stealtime == 0 ||
2005 2008                                              stealtime - gethrtime() <= 0) {
2006 2009                                                  maxpri = pri;
2007 2010                                                  tcp = ocp;
2008 2011                                          } else {
2009 2012                                                  /*
2010 2013                                                   * Don't update tcp, just set
2011 2014                                                   * the retval to T_DONTSTEAL, so
2012 2015                                                   * that if no acceptable CPUs
2013 2016                                                   * are found the return value
2014 2017                                                   * will be T_DONTSTEAL rather
2015 2018                                                   * than NULL.
2016 2019                                                   */
2017 2020                                                  retval = T_DONTSTEAL;
2018 2021                                          }
2019 2022                                  }
2020 2023                          } while ((ocp = ocp->cpu_next_lpl) != ocp_start);
2021 2024  
2022 2025                          /*
2023 2026                           * Iterate to the next leaf lpl in the resource set
2024 2027                           * at this level of locality. If we hit the end of
2025 2028                           * the set, wrap back around to the beginning.
2026 2029                           *
2027 2030                           * Note: This iteration is NULL terminated for a reason;
2028 2031                           * see lpl_topo_bootstrap() in lgrp.c for details.
2029 2032                           */
2030 2033                          if ((lpl_leaf = lpl->lpl_rset[++leafidx]) == NULL) {
2031 2034                                  leafidx = 0;
2032 2035                                  lpl_leaf = lpl->lpl_rset[leafidx];
2033 2036                          }
2034 2037                  } while (leafidx != startidx);
2035 2038  
2036 2039  next_level:
2037 2040                  /*
2038 2041                   * Expand the search to include farther away CPUs (next
2039 2042                   * locality level). The closer CPUs that have already been
2040 2043                   * checked will be checked again. In doing so, idle CPUs
2041 2044                   * will tend to be more aggressive about stealing from CPUs
2042 2045                   * that are closer (since the closer CPUs will be considered
2043 2046                   * more often).
2044 2047                   * Begin at this level with the CPUs local leaf lpl.
2045 2048                   */
2046 2049                  if ((lpl = lpl->lpl_parent) != NULL) {
2047 2050                          leafidx = startidx = lpl->lpl_id2rset[local_id];
2048 2051                          lpl_leaf = lpl->lpl_rset[leafidx];
2049 2052                  }
2050 2053          } while (!tcp && lpl);
2051 2054  
2052 2055          kpreempt_enable();
2053 2056  
2054 2057          /*
2055 2058           * If another queue looks good, and there is still nothing on
2056 2059           * the local queue, try to transfer one or more threads
2057 2060           * from it to our queue.
2058 2061           */
2059 2062          if (tcp && cp->cpu_disp->disp_nrunnable == 0) {
2060 2063                  tp = disp_getbest(tcp->cpu_disp);
2061 2064                  if (tp == NULL || tp == T_DONTSTEAL)
2062 2065                          return (tp);
2063 2066                  return (disp_ratify(tp, kpq));
2064 2067          }
2065 2068          return (retval);
2066 2069  }
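
Ignoring the lpl locality ordering, the victim selection above reduces to scanning candidate run queues for the highest advertised unbound priority while skipping a CPU whose only runnable thread is marked don't-steal. A simplified sketch with hypothetical toy_* types:

    #include <stddef.h>

    typedef struct toy_rq {
            int disp_max_unbound_pri;       /* -1 when nothing unbound */
            int disp_nrunnable;
            int dontsteal;                  /* stands in for CPU_DISP_DONTSTEAL */
    } toy_rq_t;

    static toy_rq_t *
    toy_pick_victim(toy_rq_t *rqs, int nrqs)
    {
            toy_rq_t *best = NULL;
            int maxpri = -1;
            int i;

            for (i = 0; i < nrqs; i++) {
                    toy_rq_t *rq = &rqs[i];

                    /* Leave a CPU alone while it switches to its only thread. */
                    if (rq->dontsteal && rq->disp_nrunnable == 1)
                            continue;
                    if (rq->disp_max_unbound_pri > maxpri) {
                            maxpri = rq->disp_max_unbound_pri;
                            best = rq;
                    }
            }
            return (best);  /* NULL when no queue advertises unbound work */
    }
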
2067 2070  
2068 2071  
2069 2072  /*
2070 2073   * disp_fix_unbound_pri()
2071 2074   *      Determines the maximum priority of unbound threads on the queue.
2072 2075   *      The priority is kept for the queue, but is only increased, never
2073 2076   *      reduced unless some CPU is looking for something on that queue.
2074 2077   *
2075 2078   *      The priority argument is the known upper limit.
2076 2079   *
2077 2080   *      Perhaps this should be kept accurately, but that probably means
2078 2081   *      separate bitmaps for bound and unbound threads.  Since only idled
2079 2082   *      CPUs will have to do this recalculation, it seems better this way.
2080 2083   */
2081 2084  static void
2082 2085  disp_fix_unbound_pri(disp_t *dp, pri_t pri)
2083 2086  {
2084 2087          kthread_t       *tp;
2085 2088          dispq_t         *dq;
2086 2089          ulong_t         *dqactmap = dp->disp_qactmap;
2087 2090          ulong_t         mapword;
2088 2091          int             wx;
2089 2092  
2090 2093          ASSERT(DISP_LOCK_HELD(&dp->disp_lock));
2091 2094  
2092 2095          ASSERT(pri >= 0);                       /* checked by caller */
2093 2096  
2094 2097          /*
2095 2098           * Start the search at the next lowest priority below the supplied
2096 2099           * priority.  This depends on the bitmap implementation.
2097 2100           */
2098 2101          do {
2099 2102                  wx = pri >> BT_ULSHIFT;         /* index of word in map */
2100 2103  
2101 2104                  /*
2102 2105                   * Form mask for all lower priorities in the word.
2103 2106                   */
2104 2107                  mapword = dqactmap[wx] & (BT_BIW(pri) - 1);
2105 2108  
2106 2109                  /*
2107 2110                   * Get next lower active priority.
2108 2111                   */
2109 2112                  if (mapword != 0) {
2110 2113                          pri = (wx << BT_ULSHIFT) + highbit(mapword) - 1;
2111 2114                  } else if (wx > 0) {
2112 2115                          pri = bt_gethighbit(dqactmap, wx - 1); /* sign extend */
2113 2116                          if (pri < 0)
2114 2117                                  break;
2115 2118                  } else {
2116 2119                          pri = -1;
2117 2120                          break;
2118 2121                  }
2119 2122  
2120 2123                  /*
2121 2124                   * Search the queue for unbound, runnable threads.
2122 2125                   */
2123 2126                  dq = &dp->disp_q[pri];
2124 2127                  tp = dq->dq_first;
2125 2128  
2126 2129                  while (tp && (tp->t_bound_cpu || tp->t_weakbound_cpu)) {
2127 2130                          tp = tp->t_link;
2128 2131                  }
2129 2132  
2130 2133                  /*
2131 2134                   * If a thread was found, set the priority and return.
2132 2135                   */
2133 2136          } while (tp == NULL);
2134 2137  
2135 2138          /*
2136 2139           * pri holds the maximum unbound thread priority or -1.
2137 2140           */
2138 2141          if (dp->disp_max_unbound_pri != pri)
2139 2142                  dp->disp_max_unbound_pri = pri;
2140 2143  }
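
The bitmap walk above can be shown in isolation: starting from an upper-bound priority, mask off that bit and everything above it within its word, then fall back to lower words, returning the highest remaining set bit or -1. A userland sketch assuming 64-bit map words (hypothetical toy_* names in place of the kernel's BT_* macros and highbit()):

    #include <stdint.h>

    #define TOY_BT_SHIFT    6                       /* log2 of bits per word */
    #define TOY_BT_MASK     ((1 << TOY_BT_SHIFT) - 1)

    /* Position of the highest set bit, 1-based; 0 for a zero word. */
    static int
    toy_highbit(uint64_t w)
    {
            int b = 0;

            while (w != 0) {
                    b++;
                    w >>= 1;
            }
            return (b);
    }

    /*
     * Next active priority strictly below pri, or -1 if none.  map must
     * have at least (pri >> TOY_BT_SHIFT) + 1 words.
     */
    static int
    toy_next_lower_pri(const uint64_t *map, int pri)
    {
            int wx = pri >> TOY_BT_SHIFT;
            uint64_t word = map[wx] & ((UINT64_C(1) << (pri & TOY_BT_MASK)) - 1);

            if (word != 0)
                    return ((wx << TOY_BT_SHIFT) + toy_highbit(word) - 1);
            while (--wx >= 0) {
                    if (map[wx] != 0)
                            return ((wx << TOY_BT_SHIFT) +
                                toy_highbit(map[wx]) - 1);
            }
            return (-1);
    }
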
2141 2144  
2142 2145  /*
2143 2146   * disp_adjust_unbound_pri() - thread is becoming unbound, so we should
2144 2147   *      check if the CPU to which it was previously bound should have
2145 2148   *      its disp_max_unbound_pri increased.
2146 2149   */
2147 2150  void
2148 2151  disp_adjust_unbound_pri(kthread_t *tp)
2149 2152  {
2150 2153          disp_t *dp;
2151 2154          pri_t tpri;
2152 2155  
2153 2156          ASSERT(THREAD_LOCK_HELD(tp));
2154 2157  
2155 2158          /*
2156 2159           * Don't do anything if the thread is not bound, or
2157 2160           * currently not runnable or swapped out.
2158 2161           */
2159 2162          if (tp->t_bound_cpu == NULL ||
2160 2163              tp->t_state != TS_RUN ||
2161 2164              tp->t_schedflag & TS_ON_SWAPQ)
2162 2165                  return;
2163 2166  
2164 2167          tpri = DISP_PRIO(tp);
2165 2168          dp = tp->t_bound_cpu->cpu_disp;
2166 2169          ASSERT(tpri >= 0 && tpri < dp->disp_npri);
2167 2170          if (tpri > dp->disp_max_unbound_pri)
2168 2171                  dp->disp_max_unbound_pri = tpri;
2169 2172  }
2170 2173  
2171 2174  /*
2172 2175   * disp_getbest()
2173 2176   *   De-queue the highest priority unbound runnable thread.
2174 2177   *   Returns with the thread unlocked and onproc but at splhigh (like disp()).
2175 2178   *   Returns NULL if nothing found.
2176 2179   *   Returns T_DONTSTEAL if the thread was not stealable, so that
2177 2180   *   the caller will try again later.
2178 2181   *
2179 2182   *   Passed a pointer to a dispatch queue not associated with this CPU, and
2180 2183   *   its type.
2181 2184   */
2182 2185  static kthread_t *
2183 2186  disp_getbest(disp_t *dp)
2184 2187  {
2185 2188          kthread_t       *tp;
2186 2189          dispq_t         *dq;
2187 2190          pri_t           pri;
2188 2191          cpu_t           *cp, *tcp;
2189 2192          boolean_t       allbound;
2190 2193  
2191 2194          disp_lock_enter(&dp->disp_lock);
2192 2195  
2193 2196          /*
2194 2197           * If there is nothing to run, or the CPU is in the middle of a
2195 2198           * context switch of the only thread, return NULL.
2196 2199           */
2197 2200          tcp = dp->disp_cpu;
2198 2201          cp = CPU;
2199 2202          pri = dp->disp_max_unbound_pri;
2200 2203          if (pri == -1 ||
2201 2204              (tcp != NULL && (tcp->cpu_disp_flags & CPU_DISP_DONTSTEAL) &&
2202 2205              tcp->cpu_disp->disp_nrunnable == 1)) {
2203 2206                  disp_lock_exit_nopreempt(&dp->disp_lock);
2204 2207                  return (NULL);
2205 2208          }
2206 2209  
2207 2210          dq = &dp->disp_q[pri];
2208 2211  
2209 2212  
2210 2213          /*
2211 2214           * Assume that all threads are bound on this queue, and change it
2212 2215           * later when we find out that it is not the case.
2213 2216           */
2214 2217          allbound = B_TRUE;
2215 2218          for (tp = dq->dq_first; tp != NULL; tp = tp->t_link) {
2216 2219                  hrtime_t now, nosteal, rqtime;
2217 2220  
2218 2221                  /*
2219 2222                   * Skip over bound threads which could be here even
2220 2223                   * though disp_max_unbound_pri indicated this level.
2221 2224                   */
2222 2225                  if (tp->t_bound_cpu || tp->t_weakbound_cpu)
2223 2226                          continue;
2224 2227  
2225 2228                  /*
2226 2229                   * We've got some unbound threads on this queue, so turn
2227 2230                   * the allbound flag off now.
2228 2231                   */
2229 2232                  allbound = B_FALSE;
2230 2233  
2231 2234                  /*
2232 2235                   * The thread is a candidate for stealing from its run queue. We
2233 2236                   * don't want to steal threads that became runnable just a
2234 2237                   * moment ago. This improves CPU affinity for threads that get
2235 2238                   * preempted for short periods of time and go back on the run
2236 2239                   * queue.
2237 2240                   *
2238 2241                   * We want to let it stay on its run queue if it was only placed
2239 2242                   * there recently and it was running on the same CPU before that
2240 2243                   * to preserve its cache investment. For the thread to remain on
2241 2244                   * its run queue, ALL of the following conditions must be
2242 2245                   * satisfied:
2243 2246                   *
2244 2247                   * - the disp queue should not be the kernel preemption queue
2245 2248                   * - delayed idle stealing should not be disabled
2246 2249                   * - nosteal_nsec should be non-zero
2247 2250                   * - it should run with user priority
2248 2251                   * - it should be on the run queue of the CPU where it was
2249 2252                   *   running before being placed on the run queue
2250 2253                   * - it should be the only thread on the run queue (to prevent
2251 2254                   *   extra scheduling latency for other threads)
2252 2255                   * - it should sit on the run queue for less than per-chip
2253 2256                   *   nosteal interval or global nosteal interval
2254 2257                   * - in case of CPUs with shared cache it should sit in a run
2255 2258                   *   queue of a CPU from a different chip
2256 2259                   *
2257 2260                   * The checks are arranged so that the ones that are faster are
2258 2261                   * placed earlier.
2259 2262                   */
2260 2263                  if (tcp == NULL ||
2261 2264                      pri >= minclsyspri ||
2262 2265                      tp->t_cpu != tcp)
2263 2266                          break;
2264 2267  
2265 2268                  /*
2266 2269                   * Steal immediately if, due to the CMT processor
2267 2270                   * architecture, migration between cp and tcp would incur
2268 2271                   * no performance penalty.
2269 2272                   */
2270 2273                  if (pg_cmt_can_migrate(cp, tcp))
2271 2274                          break;
2272 2275  
2273 2276                  nosteal = nosteal_nsec;
2274 2277                  if (nosteal == 0)
2275 2278                          break;
2276 2279  
2277 2280                  /*
2278 2281                   * Calculate time spent sitting on run queue
2279 2282                   */
2280 2283                  now = gethrtime_unscaled();
2281 2284                  rqtime = now - tp->t_waitrq;
2282 2285                  scalehrtime(&rqtime);
2283 2286  
2284 2287                  /*
2285 2288                   * Steal immediately if the time spent on this run queue is more
2286 2289                   * than allowed nosteal delay.
2287 2290                   *
2288 2291                   * Negative rqtime check is needed here to avoid infinite
2289 2292                   * stealing delays caused by unlikely but not impossible
2290 2293                   * drifts between CPU times on different CPUs.
2291 2294                   */
2292 2295                  if (rqtime > nosteal || rqtime < 0)
2293 2296                          break;
2294 2297  
2295 2298                  DTRACE_PROBE4(nosteal, kthread_t *, tp,
2296 2299                      cpu_t *, tcp, cpu_t *, cp, hrtime_t, rqtime);
2297 2300                  scalehrtime(&now);
2298 2301                  /*
2299 2302                   * Calculate when this thread becomes stealable
2300 2303                   */
2301 2304                  now += (nosteal - rqtime);
2302 2305  
2303 2306                  /*
2304 2307                   * Calculate time when some thread becomes stealable
2305 2308                   */
2306 2309                  if (now < dp->disp_steal)
2307 2310                          dp->disp_steal = now;
2308 2311          }
2309 2312  
2310 2313          /*
2311 2314           * If there were no unbound threads on this queue, find the queue
2312 2315           * where they are and then return later. The value of
2313 2316           * disp_max_unbound_pri is not always accurate because it isn't
2314 2317           * reduced until another idle CPU looks for work.
2315 2318           */
2316 2319          if (allbound)
2317 2320                  disp_fix_unbound_pri(dp, pri);
2318 2321  
2319 2322          /*
2320 2323           * If we reached the end of the queue and found no unbound threads
2321 2324           * then return NULL so that other CPUs will be considered.  If there
2322 2325           * are unbound threads but they cannot yet be stolen, then
2323 2326           * return T_DONTSTEAL and try again later.
2324 2327           */
2325 2328          if (tp == NULL) {
2326 2329                  disp_lock_exit_nopreempt(&dp->disp_lock);
2327 2330                  return (allbound ? NULL : T_DONTSTEAL);
2328 2331          }
2329 2332  
2330 2333          /*
2331 2334           * Found a runnable, unbound thread, so remove it from queue.
2332 2335           * dispdeq() requires that we have the thread locked, and we do,
2333 2336           * by virtue of holding the dispatch queue lock.  dispdeq() will
2334 2337           * put the thread in transition state, thereby dropping the dispq
2335 2338           * lock.
2336 2339           */
2337 2340  
2338 2341  #ifdef DEBUG
2339 2342          {
2340 2343                  int     thread_was_on_queue;
2341 2344  
2342 2345                  thread_was_on_queue = dispdeq(tp);      /* drops disp_lock */
2343 2346                  ASSERT(thread_was_on_queue);
2344 2347          }
2345 2348  
2346 2349  #else /* DEBUG */
2347 2350          (void) dispdeq(tp);                     /* drops disp_lock */
2348 2351  #endif /* DEBUG */
2349 2352  
2350 2353          /*
2351 2354           * Reset the disp_queue steal time - we do not know what the
2352 2355           * smallest value across the queue is.
2353 2356           */
2354 2357          dp->disp_steal = 0;
2355 2358  
2356 2359          tp->t_schedflag |= TS_DONT_SWAP;
2357 2360  
2358 2361          /*
2359 2362           * Setup thread to run on the current CPU.
2360 2363           */
2361 2364          tp->t_disp_queue = cp->cpu_disp;
2362 2365  
2363 2366          cp->cpu_dispthread = tp;                /* protected by spl only */
2364 2367          cp->cpu_dispatch_pri = pri;
2365 2368  
2366 2369          /*
2367 2370           * There can be a memory synchronization race between disp_getbest()
2368 2371           * and disp_ratify() vs cpu_resched() where cpu_resched() is trying
2369 2372           * to preempt the current thread to run the enqueued thread while
2370 2373           * disp_getbest() and disp_ratify() are changing the current thread
2371 2374           * to the stolen thread. This may lead to a situation where
2372 2375           * cpu_resched() tries to preempt the wrong thread and the
2373 2376           * stolen thread continues to run on the CPU which has been tagged
2374 2377           * for preemption.
2375 2378           * Later the clock thread gets enqueued but doesn't get to run on the
2376 2379           * CPU causing the system to hang.
2377 2380           *
2378 2381           * To avoid this, grabbing and dropping the disp_lock (which does
2379 2382           * a memory barrier) is needed to synchronize the execution of
2380 2383           * cpu_resched() with disp_getbest() and disp_ratify() and
2381 2384           * synchronize the memory read and written by cpu_resched(),
2382 2385           * disp_getbest(), and disp_ratify() with each other.
2383 2386           *  (see CR#6482861 for more details).
2384 2387           */
2385 2388          disp_lock_enter_high(&cp->cpu_disp->disp_lock);
2386 2389          disp_lock_exit_high(&cp->cpu_disp->disp_lock);
2387 2390  
2388 2391          ASSERT(pri == DISP_PRIO(tp));
2389 2392  
2390 2393          DTRACE_PROBE3(steal, kthread_t *, tp, cpu_t *, tcp, cpu_t *, cp);
2391 2394  
2392 2395          thread_onproc(tp, cp);                  /* set t_state to TS_ONPROC */
2393 2396  
2394 2397          /*
2395 2398           * Return with spl high so that swtch() won't need to raise it.
2396 2399           * The disp_lock was dropped by dispdeq().
2397 2400           */
2398 2401  
2399 2402          return (tp);
2400 2403  }
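
The nosteal bookkeeping in the loop above amounts to simple interval arithmetic; below is a sketch in plain nanoseconds (the kernel converts between unscaled and scaled hrtime_t values), with hypothetical toy_* names and a simplified rule for recording the retry time.

    #include <stdint.h>

    typedef struct toy_dq {
            int64_t disp_steal;     /* earliest time to retry stealing, 0 = now */
    } toy_dq_t;

    /*
     * Decide whether a thread that went runnable at time waitrq may be
     * stolen at time now, given the nosteal interval.  When it may not,
     * remember the earliest moment at which a retry could succeed.
     * Returns 1 to steal, 0 to skip the thread for now.
     */
    static int
    toy_steal_ok(toy_dq_t *dq, int64_t now, int64_t waitrq, int64_t nosteal)
    {
            int64_t rqtime = now - waitrq;

            /* A zero interval disables delayed stealing altogether. */
            if (nosteal == 0)
                    return (1);

            /*
             * Steal once the thread has waited long enough; the rqtime < 0
             * case guards against clock drift between CPUs.
             */
            if (rqtime > nosteal || rqtime < 0)
                    return (1);

            /* Not yet: record when this thread becomes stealable. */
            if (dq->disp_steal == 0 || now + (nosteal - rqtime) < dq->disp_steal)
                    dq->disp_steal = now + (nosteal - rqtime);
            return (0);
    }
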
2401 2404  
2402 2405  /*
2403 2406   * disp_bound_common() - common routine for higher level functions
2404 2407   *      that check for bound threads under certain conditions.
2405 2408   *      If 'threadlistsafe' is set then there is no need to acquire
2406 2409   *      pidlock to stop the thread list from changing (eg, if
2407 2410   *      disp_bound_* is called with cpus paused).
2408 2411   */
2409 2412  static int
2410 2413  disp_bound_common(cpu_t *cp, int threadlistsafe, int flag)
2411 2414  {
2412 2415          int             found = 0;
2413 2416          kthread_t       *tp;
2414 2417  
2415 2418          ASSERT(flag);
2416 2419  
2417 2420          if (!threadlistsafe)
2418 2421                  mutex_enter(&pidlock);
2419 2422          tp = curthread;         /* faster than allthreads */
2420 2423          do {
2421 2424                  if (tp->t_state != TS_FREE) {
2422 2425                          /*
2423 2426                           * If an interrupt thread is busy, but the
2424 2427                           * caller doesn't care (i.e. BOUND_INTR is off),
2425 2428                           * then just ignore it and continue through.
2426 2429                           */
2427 2430                          if ((tp->t_flag & T_INTR_THREAD) &&
2428 2431                              !(flag & BOUND_INTR))
2429 2432                                  continue;
2430 2433  
2431 2434                          /*
2432 2435                           * Skip the idle thread for the CPU
2433 2436                           * we're about to set offline.
2434 2437                           */
2435 2438                          if (tp == cp->cpu_idle_thread)
2436 2439                                  continue;
2437 2440  
2438 2441                          /*
2439 2442                           * Skip the pause thread for the CPU
2440 2443                           * we're about to set offline.
2441 2444                           */
2442 2445                          if (tp == cp->cpu_pause_thread)
2443 2446                                  continue;
2444 2447  
2445 2448                          if ((flag & BOUND_CPU) &&
2446 2449                              (tp->t_bound_cpu == cp ||
2447 2450                              tp->t_bind_cpu == cp->cpu_id ||
2448 2451                              tp->t_weakbound_cpu == cp)) {
2449 2452                                  found = 1;
2450 2453                                  break;
2451 2454                          }
2452 2455  
2453 2456                          if ((flag & BOUND_PARTITION) &&
2454 2457                              (tp->t_cpupart == cp->cpu_part)) {
2455 2458                                  found = 1;
2456 2459                                  break;
2457 2460                          }
2458 2461                  }
2459 2462          } while ((tp = tp->t_next) != curthread && found == 0);
2460 2463          if (!threadlistsafe)
2461 2464                  mutex_exit(&pidlock);
2462 2465          return (found);
2463 2466  }
2464 2467  
2465 2468  /*
2466 2469   * disp_bound_threads - return nonzero if threads are bound to the processor.
2467 2470   *      Called infrequently.  Keep this simple.
2468 2471   *      Includes threads that are asleep or stopped but not onproc.
2469 2472   */
2470 2473  int
2471 2474  disp_bound_threads(cpu_t *cp, int threadlistsafe)
2472 2475  {
2473 2476          return (disp_bound_common(cp, threadlistsafe, BOUND_CPU));
2474 2477  }
2475 2478  
2476 2479  /*
2477 2480   * disp_bound_anythreads - return nonzero if _any_ threads are bound
2478 2481   * to the given processor, including interrupt threads.
2479 2482   */
2480 2483  int
2481 2484  disp_bound_anythreads(cpu_t *cp, int threadlistsafe)
2482 2485  {
2483 2486          return (disp_bound_common(cp, threadlistsafe, BOUND_CPU | BOUND_INTR));
2484 2487  }
2485 2488  
2486 2489  /*
2487 2490   * disp_bound_partition - return nonzero if threads are bound to the same
2488 2491   * partition as the processor.
2489 2492   *      Called infrequently.  Keep this simple.
2490 2493   *      Includes threads that are asleep or stopped but not onproc.
2491 2494   */
2492 2495  int
2493 2496  disp_bound_partition(cpu_t *cp, int threadlistsafe)
2494 2497  {
2495 2498          return (disp_bound_common(cp, threadlistsafe, BOUND_PARTITION));
2496 2499  }
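
A hedged usage sketch for the three predicates above: can_offline_cpu is a hypothetical wrapper, not part of this change, but code along the lines of the CPU offline path uses these checks before detaching a CPU, passing threadlistsafe == 0 when CPUs are not paused so that pidlock is taken internally.

    #include <sys/types.h>
    #include <sys/cpuvar.h>
    #include <sys/errno.h>

    /*
     * Hypothetical check: refuse to proceed while any thread, including an
     * interrupt thread, is still bound to the CPU being taken offline.
     */
    static int
    can_offline_cpu(cpu_t *cp)
    {
            /* cpus are not paused here, so let disp_bound_* take pidlock */
            if (disp_bound_anythreads(cp, 0))
                    return (EBUSY);
            return (0);
    }
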
2497 2500  
2498 2501  /*
2499 2502   * disp_cpu_inactive - make a CPU inactive by moving all of its unbound
2500 2503   * threads to other CPUs.
2501 2504   */
2502 2505  void
2503 2506  disp_cpu_inactive(cpu_t *cp)
2504 2507  {
2505 2508          kthread_t       *tp;
2506 2509          disp_t          *dp = cp->cpu_disp;
2507 2510          dispq_t         *dq;
2508 2511          pri_t           pri;
2509 2512          int             wasonq;
2510 2513  
2511 2514          disp_lock_enter(&dp->disp_lock);
2512 2515          while ((pri = dp->disp_max_unbound_pri) != -1) {
2513 2516                  dq = &dp->disp_q[pri];
2514 2517                  tp = dq->dq_first;
2515 2518  
2516 2519                  /*
2517 2520                   * Skip over bound threads.
2518 2521                   */
2519 2522                  while (tp != NULL && tp->t_bound_cpu != NULL) {
2520 2523                          tp = tp->t_link;
2521 2524                  }
2522 2525  
2523 2526                  if (tp == NULL) {
2524 2527                          /* disp_max_unbound_pri must be inaccurate, so fix it */
2525 2528                          disp_fix_unbound_pri(dp, pri);
2526 2529                          continue;
2527 2530                  }
2528 2531  
2529 2532                  wasonq = dispdeq(tp);           /* drops disp_lock */
2530 2533                  ASSERT(wasonq);
2531 2534                  ASSERT(tp->t_weakbound_cpu == NULL);
2532 2535  
2533 2536                  setbackdq(tp);
2534 2537                  /*
2535 2538                   * Called from cpu_offline:
2536 2539                   *
2537 2540                   * cp has already been removed from the list of active cpus
2538 2541                   * and tp->t_cpu has been changed so there is no risk of
2539 2542                   * tp ending up back on cp.
2540 2543                   *
2541 2544                   * Called from cpupart_move_cpu:
2542 2545                   *
2543 2546                   * The cpu has moved to a new cpupart.  Any threads that
2544 2547                   * were on its dispatch queues before the move remain
2545 2548                   * in the old partition and can't run in the new partition.
2546 2549                   */
2547 2550                  ASSERT(tp->t_cpu != cp);
2548 2551                  thread_unlock(tp);
2549 2552  
2550 2553                  disp_lock_enter(&dp->disp_lock);
2551 2554          }
2552 2555          disp_lock_exit(&dp->disp_lock);
2553 2556  }
2554 2557  
2555 2558  /*
2556 2559   * disp_lowpri_cpu - find CPU running the lowest priority thread.
2557 2560   *      The hint passed in is used as a starting point so we don't favor
2558 2561   *      CPU 0 or any other CPU.  The caller should pass in the most recently
2559 2562   *      used CPU for the thread.
2560 2563   *
2561 2564   *      The lgroup and priority are used to determine the best CPU to run on
2562 2565   *      in a NUMA machine.  The lgroup specifies which CPUs are closest while
2563 2566   *      the thread priority will indicate whether the thread will actually run
2564 2567   *      there.  To pick the best CPU, the CPUs inside and outside of the given
2565 2568   *      lgroup which are running the lowest priority threads are found.  The
2566 2569   *      remote CPU is chosen only if the thread will not run locally on a CPU
2567 2570   *      within the lgroup, but will run on the remote CPU. If the thread
2568 2571   *      cannot immediately run on any CPU, the best local CPU will be chosen.
2569 2572   *
2570 2573   *      The lpl specified also identifies the cpu partition from which
2571 2574   *      disp_lowpri_cpu should select a CPU.
2572 2575   *
2573 2576   *      curcpu is used to indicate that disp_lowpri_cpu is being called on
2574 2577   *      behalf of the current thread. (curthread is looking for a new cpu)
2575 2578   *      In this case, cpu_dispatch_pri for this thread's cpu should be
2576 2579   *      ignored.
2577 2580   *
2578 2581   *      If a cpu is the target of an offline request then try to avoid it.
2579 2582   *
2580 2583   *      This function must be called at either high SPL, or with preemption
2581 2584   *      disabled, so that the "hint" CPU cannot be removed from the online
2582 2585   *      CPU list while we are traversing it.
2583 2586   */
2584 2587  cpu_t *
2585 2588  disp_lowpri_cpu(cpu_t *hint, lpl_t *lpl, pri_t tpri, cpu_t *curcpu)
2586 2589  {
2587 2590          cpu_t   *bestcpu;
2588 2591          cpu_t   *besthomecpu;
2589 2592          cpu_t   *cp, *cpstart;
2590 2593  
2591 2594          pri_t   bestpri;
2592 2595          pri_t   cpupri;
2593 2596  
2594 2597          klgrpset_t      done;
2595 2598          klgrpset_t      cur_set;
2596 2599  
2597 2600          lpl_t           *lpl_iter, *lpl_leaf;
2598 2601          int             i;
2599 2602  
2600 2603          /*
2601 2604           * Scan for a CPU currently running the lowest priority thread.
2602 2605           * Cannot get cpu_lock here because it is adaptive.
2603 2606           * We do not require lock on CPU list.
2604 2607           */
2605 2608          ASSERT(hint != NULL);
2606 2609          ASSERT(lpl != NULL);
2607 2610          ASSERT(lpl->lpl_ncpu > 0);
2608 2611  
2609 2612          /*
2610 2613           * First examine local CPUs. Note that it's possible the hint CPU
2611 2614           * passed in is remote to the specified home lgroup. If our priority
2612 2615           * isn't high enough for us to run immediately at home, we then
2613 2616           * examine CPUs remote to our home lgroup.
2614 2617           * We would like to give preference to CPUs closest to "home".
2615 2618           * If we can't find a CPU where we'll run at a given level
2616 2619           * of locality, we expand our search to include the next level.
2617 2620           */
2618 2621          bestcpu = besthomecpu = NULL;
2619 2622          klgrpset_clear(done);
2620 2623          /* start with lpl we were passed */
2621 2624  
2622 2625          lpl_iter = lpl;
2623 2626  
2624 2627          do {
2625 2628  
2626 2629                  bestpri = SHRT_MAX;
2627 2630                  klgrpset_clear(cur_set);
2628 2631  
2629 2632                  for (i = 0; i < lpl_iter->lpl_nrset; i++) {
2630 2633                          lpl_leaf = lpl_iter->lpl_rset[i];
2631 2634                          if (klgrpset_ismember(done, lpl_leaf->lpl_lgrpid))
2632 2635                                  continue;
2633 2636  
2634 2637                          klgrpset_add(cur_set, lpl_leaf->lpl_lgrpid);
2635 2638  
2636 2639                          if (hint->cpu_lpl == lpl_leaf)
2637 2640                                  cp = cpstart = hint;
2638 2641                          else
2639 2642                                  cp = cpstart = lpl_leaf->lpl_cpus;
2640 2643  
2641 2644                          do {
2642 2645                                  if (cp == curcpu)
2643 2646                                          cpupri = -1;
2644 2647                                  else if (cp == cpu_inmotion)
2645 2648                                          cpupri = SHRT_MAX;
2646 2649                                  else
2647 2650                                          cpupri = cp->cpu_dispatch_pri;
2648 2651                                  if (cp->cpu_disp->disp_maxrunpri > cpupri)
2649 2652                                          cpupri = cp->cpu_disp->disp_maxrunpri;
2650 2653                                  if (cp->cpu_chosen_level > cpupri)
2651 2654                                          cpupri = cp->cpu_chosen_level;
2652 2655                                  if (cpupri < bestpri) {
2653 2656                                          if (CPU_IDLING(cpupri)) {
2654 2657                                                  ASSERT((cp->cpu_flags &
2655 2658                                                      CPU_QUIESCED) == 0);
2656 2659                                                  return (cp);
2657 2660                                          }
2658 2661                                          bestcpu = cp;
2659 2662                                          bestpri = cpupri;
2660 2663                                  }
2661 2664                          } while ((cp = cp->cpu_next_lpl) != cpstart);
2662 2665                  }
2663 2666  
2664 2667                  if (bestcpu && (tpri > bestpri)) {
2665 2668                          ASSERT((bestcpu->cpu_flags & CPU_QUIESCED) == 0);
2666 2669                          return (bestcpu);
2667 2670                  }
2668 2671                  if (besthomecpu == NULL)
2669 2672                          besthomecpu = bestcpu;
2670 2673                  /*
2671 2674                   * Add the lgrps we just considered to the "done" set
2672 2675                   */
2673 2676                  klgrpset_or(done, cur_set);
2674 2677  
2675 2678          } while ((lpl_iter = lpl_iter->lpl_parent) != NULL);
2676 2679  
2677 2680          /*
2678 2681           * The specified priority isn't high enough to run immediately
2679 2682           * anywhere, so just return the best CPU from the home lgroup.
2680 2683           */
2681 2684          ASSERT((besthomecpu->cpu_flags & CPU_QUIESCED) == 0);
2682 2685          return (besthomecpu);
2683 2686  }
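
For illustration, a hedged sketch of a typical call: choose_cpu_for is a hypothetical wrapper, not part of this change. The caller passes the thread's last CPU as the hint, its home lpl, and its dispatch priority, and passes NULL for curcpu because the thread being placed is not the current thread.

    #include <sys/thread.h>
    #include <sys/cpuvar.h>
    #include <sys/disp.h>

    /*
     * Hypothetical wrapper: pick a CPU for tp.  Per the contract above,
     * preemption must be disabled (or SPL raised) across the call so the
     * hint CPU cannot disappear from the online list.
     */
    static cpu_t *
    choose_cpu_for(kthread_t *tp)
    {
            return (disp_lowpri_cpu(tp->t_cpu, tp->t_lpl, DISP_PRIO(tp), NULL));
    }
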
2684 2687  
2685 2688  /*
2686 2689   * This routine provides the generic idle cpu function for all processors.
2687 2690   * If a processor has some specific code to execute when idle (say, to stop
2688 2691   * the pipeline and save power) then that routine should be defined in the
2689 2692   * processor's specific code (module_xx.c) and the global variable idle_cpu
2690 2693   * set to that function.
2691 2694   */
2692 2695  static void
2693 2696  generic_idle_cpu(void)
2694 2697  {
2695 2698  }
2696 2699  
2697 2700  /*ARGSUSED*/
2698 2701  static void
2699 2702  generic_enq_thread(cpu_t *cpu, int bound)
2700 2703  {
2701 2704  }
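
A hedged sketch of the override mechanism the comment above describes: the xx_* names are hypothetical, standing in for a platform's module_xx.c. The platform code supplies its own idle routine and points the idle_cpu hook at it during setup; generic_idle_cpu remains the default otherwise.

    /* hypothetical platform module (module_xx.c) */
    extern void (*idle_cpu)();      /* dispatcher idle hook (assumed extern decl) */

    static void
    xx_idle_cpu(void)
    {
            /* e.g. stop the pipeline / enter a low-power state until woken */
    }

    void
    xx_dispatcher_init(void)
    {
            idle_cpu = xx_idle_cpu;
    }
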
  
    | 2568 lines elided |