          --- old/usr/src/uts/common/syscall/poll.c
          +++ new/usr/src/uts/common/syscall/poll.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  24   24   * Use is subject to license terms.
  25   25   */
  26   26  
  27   27  /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T     */
  28   28  /*        All Rights Reserved   */
  29   29  
  30   30  /*
  31   31   * Copyright (c) 2012 by Delphix. All rights reserved.
  32   32   * Copyright 2016, Joyent, Inc.
  33   33   */
  34   34  
  35   35  /*
  36   36   * Portions of this source code were derived from Berkeley 4.3 BSD
  37   37   * under license from the Regents of the University of California.
  38   38   */
  39   39  
  40   40  #include <sys/param.h>
  41   41  #include <sys/isa_defs.h>
  42   42  #include <sys/types.h>
  43   43  #include <sys/sysmacros.h>
  44   44  #include <sys/user.h>
  45   45  #include <sys/systm.h>
  46   46  #include <sys/errno.h>
  47   47  #include <sys/time.h>
  48   48  #include <sys/vnode.h>
  49   49  #include <sys/file.h>
  50   50  #include <sys/mode.h>
  51   51  #include <sys/proc.h>
  52   52  #include <sys/uio.h>
  53   53  #include <sys/poll_impl.h>
  54   54  #include <sys/kmem.h>
  55   55  #include <sys/cmn_err.h>
  56   56  #include <sys/debug.h>
  57   57  #include <sys/bitmap.h>
  58   58  #include <sys/kstat.h>
  59   59  #include <sys/rctl.h>
  60   60  #include <sys/port_impl.h>
  61   61  #include <sys/schedctl.h>
  62   62  #include <sys/cpu.h>
  63   63  
  64   64  #define NPHLOCKS        64      /* Number of locks; must be power of 2 */
  65   65  #define PHLOCKADDR(php) &plocks[(((uintptr_t)(php)) >> 8) & (NPHLOCKS - 1)]
  66   66  #define PHLOCK(php)     PHLOCKADDR(php).pp_lock
  67   67  #define PH_ENTER(php)   mutex_enter(PHLOCK(php))
  68   68  #define PH_EXIT(php)    mutex_exit(PHLOCK(php))
  69   69  #define VALID_POLL_EVENTS       (POLLIN | POLLPRI | POLLOUT | POLLRDNORM \
  70   70          | POLLRDBAND | POLLWRBAND | POLLHUP | POLLERR | POLLNVAL)
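
A note on the PHLOCKADDR hash above: it discards the low 8 bits of the
pollhead address (largely constant due to allocator alignment) and masks
the result down to one of the NPHLOCKS buckets. A minimal user-space
sketch of the same index computation (names invented for illustration):

    #include <stdint.h>
    #include <stdio.h>

    #define NPHLOCKS 64     /* must be a power of 2, as in the kernel */

    /*
     * Same arithmetic as PHLOCKADDR: shift out the allocator-aligned low
     * bits, then mask down to a bucket index in [0, NPHLOCKS).
     */
    static unsigned int
    phlock_index(const void *php)
    {
            return ((unsigned int)((((uintptr_t)php) >> 8) & (NPHLOCKS - 1)));
    }

    int
    main(void)
    {
            int obj;

            printf("lock bucket: %u\n", phlock_index(&obj));
            return (0);
    }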
  71   71  
  72   72  /*
  73   73   * global counters to collect some stats
  74   74   */
  75   75  static struct {
  76   76          kstat_named_t   polllistmiss;   /* failed to find a cached poll list */
  77   77          kstat_named_t   pollcachehit;   /* list matched 100% w/ cached one */
  78   78          kstat_named_t   pollcachephit;  /* list matched < 100% w/ cached one */
  79   79          kstat_named_t   pollcachemiss;  /* every list entry is dif from cache */
  80   80          kstat_named_t   pollunlockfail; /* failed to perform pollunlock */
  81   81  } pollstats = {
  82   82          { "polllistmiss",       KSTAT_DATA_UINT64 },
  83   83          { "pollcachehit",       KSTAT_DATA_UINT64 },
  84   84          { "pollcachephit",      KSTAT_DATA_UINT64 },
  85   85          { "pollcachemiss",      KSTAT_DATA_UINT64 },
  86   86          { "pollunlockfail",     KSTAT_DATA_UINT64 }
  87   87  };
  88   88  
  89   89  kstat_named_t *pollstats_ptr = (kstat_named_t *)&pollstats;
  90   90  uint_t pollstats_ndata = sizeof (pollstats) / sizeof (kstat_named_t);
  91   91  
  92   92  struct pplock   {
  93   93          kmutex_t        pp_lock;
  94   94          short           pp_flag;
  95   95          kcondvar_t      pp_wait_cv;
  96   96          int32_t         pp_pad;         /* to a nice round 16 bytes */
  97   97  };
  98   98  
  99   99  static struct pplock plocks[NPHLOCKS];  /* Hash array of pollhead locks */
 100  100  
 101  101  /* Contention lock & list for preventing deadlocks in recursive /dev/poll. */
 102  102  static  kmutex_t        pollstate_contenders_lock;
 103  103  static  pollstate_t     *pollstate_contenders = NULL;
 104  104  
 105  105  #ifdef DEBUG
 106  106  static int pollchecksanity(pollstate_t *, nfds_t);
 107  107  static int pollcheckxref(pollstate_t *, int);
 108  108  static void pollcheckphlist(void);
 109  109  static int pollcheckrevents(pollstate_t *, int, int, int);
 110  110  static void checkpolldat(pollstate_t *);
 111  111  #endif  /* DEBUG */
 112  112  static int plist_chkdupfd(file_t *, polldat_t *, pollstate_t *, pollfd_t *, int,
 113  113      int *);
 114  114  
 115  115  /*
 116  116   * Data structure overview:
 117  117   * The per-thread poll state consists of
 118  118   *      one pollstate_t
 119  119   *      one pollcache_t
 120  120   *      one bitmap with one event bit per fd
 121  121   *      a (two-dimensional) hashed array of polldat_t structures - one entry
 122  122   *      per fd
 123  123   *
  124  124   * This conglomerate of data structures interacts with
 125  125   *      the pollhead which is used by VOP_POLL and pollwakeup
 126  126   *      (protected by the PHLOCK, cached array of plocks), and
 127  127   *      the fpollinfo list hanging off the fi_list which is used to notify
 128  128   *      poll when a cached fd is closed. This is protected by uf_lock.
 129  129   *
 130  130   * Invariants:
 131  131   *      pd_php (pollhead pointer) is set iff (if and only if) the polldat
 132  132   *      is on that pollhead. This is modified atomically under pc_lock.
 133  133   *
 134  134   *      pd_fp (file_t pointer) is set iff the thread is on the fpollinfo
 135  135   *      list for that open file.
 136  136   *      This is modified atomically under pc_lock.
 137  137   *
 138  138   *      pd_count is the sum (over all values of i) of pd_ref[i].xf_refcnt.
 139  139   *      Iff pd_ref[i].xf_refcnt >= 1 then
 140  140   *              ps_pcacheset[i].pcs_pollfd[pd_ref[i].xf_position].fd == pd_fd
 141  141   *      Iff pd_ref[i].xf_refcnt > 1 then
 142  142   *              In ps_pcacheset[i].pcs_pollfd between index
  143  143   *              pd_ref[i].xf_position and the end of the list
 144  144   *              there are xf_refcnt entries with .fd == pd_fd
 145  145   *
 146  146   * Locking design:
 147  147   * Whenever possible the design relies on the fact that the poll cache state
  148  148   * is per thread and thus self-synchronizing for both poll and exit.
  149  149   * The key interactions where other threads access the state are:
 150  150   *      pollwakeup (and polltime), and
 151  151   *      close cleaning up the cached references to an open file
 152  152   *
  153  153   * The two key locks in poll proper are ps_lock and pc_lock.
 154  154   *
 155  155   * The ps_lock is used for synchronization between poll, (lwp_)exit and close
  156  156   * to ensure that modifications to the pollcacheset structure are serialized.
 157  157   * This lock is held through most of poll() except where poll sleeps
 158  158   * since there is little need to handle closes concurrently with the execution
 159  159   * of poll.
 160  160   * The pc_lock protects most of the fields in pollcache structure and polldat
 161  161   * structures (which are accessed by poll, pollwakeup, and polltime)
 162  162   * with the exception of fields that are only modified when only one thread
 163  163   * can access this per-thread state.
 164  164   * Those exceptions occur in poll when first allocating the per-thread state,
 165  165   * when poll grows the number of polldat (never shrinks), and when
 166  166   * exit/pollcleanup has ensured that there are no references from either
  167  167   * pollheads or fpollinfo to the thread's poll state.
 168  168   *
  169  169   * The poll(2) system call is the only path in which ps_lock and pc_lock are
  170  170   * both held, in that order. It needs ps_lock to synchronize with close and
 171  171   * lwp_exit; and pc_lock with pollwakeup.
 172  172   *
  173  173   * The locking interaction between pc_lock and PHLOCK takes into account
 174  174   * that poll acquires these locks in the order of pc_lock and then PHLOCK
 175  175   * while pollwakeup does it in the reverse order. Thus pollwakeup implements
 176  176   * deadlock avoidance by dropping the locks and reacquiring them in the
 177  177   * reverse order. For this to work pollwakeup needs to prevent the thread
  178  178   * from exiting and freeing all of the poll related state. This is done
 179  179   * using
 180  180   *      the pc_no_exit lock
 181  181   *      the pc_busy counter
 182  182   *      the pc_busy_cv condition variable
 183  183   *
 184  184   * The locking interaction between pc_lock and uf_lock has similar
 185  185   * issues. Poll holds ps_lock and/or pc_lock across calls to getf/releasef
 186  186   * which acquire uf_lock. The poll cleanup in close needs to hold uf_lock
 187  187   * to prevent poll or exit from doing a delfpollinfo after which the thread
 188  188   * might exit. But the cleanup needs to acquire pc_lock when modifying
 189  189   * the poll cache state. The solution is to use pc_busy and do the close
 190  190   * cleanup in two phases:
 191  191   *      First close calls pollblockexit which increments pc_busy.
 192  192   *      This prevents the per-thread poll related state from being freed.
 193  193   *      Then close drops uf_lock and calls pollcacheclean.
 194  194   *      This routine can then acquire pc_lock and remove any references
 195  195   *      to the closing fd (as well as recording that it has been closed
 196  196   *      so that a POLLNVAL can be generated even if the fd is reused before
 197  197   *      poll has been woken up and checked getf() again).
 198  198   *
 199  199   * When removing a polled fd from poll cache, the fd is always removed
 200  200   * from pollhead list first and then from fpollinfo list, i.e.,
 201  201   * pollhead_delete() is called before delfpollinfo().
 202  202   *
 203  203   *
 204  204   * Locking hierarchy:
 205  205   *      pc_no_exit is a leaf level lock.
 206  206   *      ps_lock is held when acquiring pc_lock (except when pollwakeup
 207  207   *      acquires pc_lock).
 208  208   *      pc_lock might be held when acquiring PHLOCK (pollhead_insert/
 209  209   *      pollhead_delete)
  210  210   *      In practice pc_lock is always held (though this is not required)
 211  211   *      when acquiring PHLOCK (in polladd/pollhead_delete and pollwakeup called
 212  212   *      from pcache_clean_entry).
 213  213   *      pc_lock is held across addfpollinfo/delfpollinfo which acquire
 214  214   *      uf_lock.
 215  215   *      pc_lock is held across getf/releasef which acquire uf_lock.
 216  216   *      ps_lock might be held across getf/releasef which acquire uf_lock.
 217  217   *      pollwakeup tries to acquire pc_lock while holding PHLOCK
  218  218   *      but drops the locks and reacquires them in reverse order to avoid
 219  219   *      deadlock.
 220  220   *
 221  221   * Note also that there is deadlock avoidance support for VOP_POLL routines
 222  222   * and pollwakeup involving a file system or driver lock.
 223  223   * See below.
 224  224   */
 225  225  
 226  226  /*
 227  227   * Deadlock avoidance support for VOP_POLL() routines.  This is
 228  228   * sometimes necessary to prevent deadlock between polling threads
 229  229   * (which hold poll locks on entry to xx_poll(), then acquire foo)
 230  230   * and pollwakeup() threads (which hold foo, then acquire poll locks).
 231  231   *
 232  232   * pollunlock(*cookie) releases whatever poll locks the current thread holds,
 233  233   *      setting a cookie for use by pollrelock();
 234  234   *
 235  235   * pollrelock(cookie) reacquires previously dropped poll locks;
 236  236   *
 237  237   * polllock(php, mutex) does the common case: pollunlock(),
 238  238   *      acquire the problematic mutex, pollrelock().
 239  239   *
 240  240   * If polllock() or pollunlock() return non-zero, it indicates that a recursive
 241  241   * /dev/poll is in progress and pollcache locks cannot be dropped.  Callers
 242  242   * must handle this by indicating a POLLNVAL in the revents of the VOP_POLL.
 243  243   */
 244  244  int
 245  245  pollunlock(int *lockstate)
 246  246  {
 247  247          pollstate_t *ps = curthread->t_pollstate;
 248  248          pollcache_t *pcp;
 249  249  
 250  250          ASSERT(lockstate != NULL);
 251  251  
 252  252          /*
 253  253           * There is no way to safely perform a pollunlock() while in the depths
 254  254           * of a recursive /dev/poll operation.
 255  255           */
 256  256          if (ps != NULL && ps->ps_depth > 1) {
 257  257                  ps->ps_flags |= POLLSTATE_ULFAIL;
 258  258                  pollstats.pollunlockfail.value.ui64++;
 259  259                  return (-1);
 260  260          }
 261  261  
 262  262          /*
 263  263           * t_pollcache is set by /dev/poll and event ports (port_fd.c).
  264  264           * If pollrelock/pollunlock is called as a result of poll(2),
  265  265           * t_pollcache should be NULL.
 266  266           */
 267  267          if (curthread->t_pollcache == NULL)
 268  268                  pcp = ps->ps_pcache;
 269  269          else
 270  270                  pcp = curthread->t_pollcache;
 271  271  
 272  272          if (!mutex_owned(&pcp->pc_lock)) {
 273  273                  *lockstate = 0;
 274  274          } else {
 275  275                  *lockstate = 1;
 276  276                  mutex_exit(&pcp->pc_lock);
 277  277          }
 278  278          return (0);
 279  279  }
 280  280  
 281  281  void
 282  282  pollrelock(int lockstate)
 283  283  {
 284  284          pollstate_t *ps = curthread->t_pollstate;
 285  285          pollcache_t *pcp;
 286  286  
  287  287          /* Skip this whole ordeal if the pollcache wasn't locked to begin with */
 288  288          if (lockstate == 0)
 289  289                  return;
 290  290  
 291  291          /*
 292  292           * t_pollcache is set by /dev/poll and event ports (port_fd.c).
  293  293           * If pollrelock/pollunlock is called as a result of poll(2),
  294  294           * t_pollcache should be NULL.
 295  295           */
 296  296          if (curthread->t_pollcache == NULL)
 297  297                  pcp = ps->ps_pcache;
 298  298          else
 299  299                  pcp = curthread->t_pollcache;
 300  300  
 301  301          mutex_enter(&pcp->pc_lock);
 302  302  }
 303  303  
 304  304  /* ARGSUSED */
 305  305  int
 306  306  polllock(pollhead_t *php, kmutex_t *lp)
 307  307  {
 308  308          if (mutex_tryenter(lp) == 0) {
 309  309                  int state;
 310  310  
 311  311                  if (pollunlock(&state) != 0) {
 312  312                          return (-1);
 313  313                  }
 314  314                  mutex_enter(lp);
 315  315                  pollrelock(state);
 316  316          }
 317  317          return (0);
 318  318  }
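
To make the contract above concrete, the following is a hedged sketch of
how a driver poll entry point (in the style of chpoll(9E)) might use
polllock() and honor its failure mode. The xx_* names and the surrounding
logic are invented for illustration; only polllock(), mutex_exit(), and
POLLNVAL come from the interfaces discussed here.

    static kmutex_t xx_lock;                /* assumed driver state lock */
    static struct pollhead xx_pollhead;     /* assumed driver pollhead */

    static int
    xx_poll(dev_t dev, short events, int anyyet, short *reventsp,
        struct pollhead **phpp)
    {
            /*
             * Acquire the driver lock via polllock() so that a concurrent
             * pollwakeup() already holding xx_lock cannot deadlock with us.
             */
            if (polllock(&xx_pollhead, &xx_lock) != 0) {
                    /* Recursive /dev/poll; poll locks cannot be dropped. */
                    *reventsp = POLLNVAL;
                    return (0);
            }
            *reventsp = 0;          /* ... evaluate device state here ... */
            if (*reventsp == 0 && !anyyet)
                    *phpp = &xx_pollhead;
            mutex_exit(&xx_lock);
            return (0);
    }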
 319  319  
 320  320  int
 321  321  poll_copyin(pollstate_t *ps, pollfd_t *fds, nfds_t nfds)
 322  322  {
 323  323          pollfd_t *pollfdp;
 324  324          nfds_t old_nfds;
 325  325  
 326  326          /*
 327  327           * NOTE: for performance, buffers are saved across poll() calls.
 328  328           * The theory is that if a process polls heavily, it tends to poll
 329  329           * on the same set of descriptors.  Therefore, we only reallocate
 330  330           * buffers when nfds changes.  There is no hysteresis control,
 331  331           * because there is no data to suggest that this is necessary;
 332  332           * the penalty of reallocating is not *that* great in any event.
 333  333           */
 334  334          old_nfds = ps->ps_nfds;
 335  335          if (nfds != old_nfds) {
 336  336                  kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t));
 337  337                  pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP);
 338  338                  ps->ps_pollfd = pollfdp;
 339  339                  ps->ps_nfds = nfds;
 340  340          }
 341  341  
 342  342          pollfdp = ps->ps_pollfd;
 343  343          if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) {
 344  344                  return (EFAULT);
 345  345          }
 346  346  
 347  347          if (fds == NULL) {
 348  348                  /*
 349  349                   * If the process has page 0 mapped, then the copyin() above
 350  350                   * will succeed even if fds is NULL.  However, our cached
 351  351                   * poll lists are keyed by the address of the passed-in fds
 352  352                   * structure, and we use the value NULL to indicate an unused
 353  353                   * poll cache list entry.  As such, we elect not to support
 354  354                   * NULL as a valid (user) memory address and fail the poll()
 355  355                   * call.
 356  356                   */
 357  357                  return (EFAULT);
 358  358          }
 359  359          return (0);
 360  360  }
 361  361  
 362  362  int
 363  363  poll_common(pollstate_t *ps, pollfd_t *fds, nfds_t nfds, timespec_t *tsp,
 364  364      int *fdcnt)
 365  365  {
 366  366          kthread_t *t = curthread;
 367  367          hrtime_t deadline; /* hrtime value when we want to return */
 368  368          pollfd_t *pollfdp;
 369  369          pollcache_t *pcp;
 370  370          int error = 0;
 371  371          int cacheindex = 0;     /* which cache set is used */
 372  372  
 373  373          /*
 374  374           * Determine the precise future time of the requested timeout, if any.
 375  375           */
 376  376          if (tsp == NULL) {
 377  377                  deadline = -1;
 378  378          } else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) {
 379  379                  deadline = 0;
 380  380          } else if (tsp->tv_sec >= HRTIME_MAX/NANOSEC) {
 381  381                  /* Use an indefinite timeout if tv_sec would cause overflow */
 382  382                  deadline = -1;
 383  383          } else {
 384  384                  /*
 385  385                   * The above check, when combined with the protections offered
 386  386                   * by itimerspecfix (ensuring that neither field is negative
 387  387                   * and that tv_nsec represents less than a whole second), will
 388  388                   * prevent overflow during the conversion from timespec_t to
 389  389                   * uhrtime_t.
 390  390                   */
 391  391                  uhrtime_t utime = tsp->tv_sec * NANOSEC;
 392  392                  utime += tsp->tv_nsec;
 393  393  
 394  394                  /* They must wait at least a tick. */
 395  395                  utime = MAX(utime, nsec_per_tick);
 396  396  
 397  397                  /*
 398  398                   * Since utime has an upper bound of HRTIME_MAX, adding the
 399  399                   * gethrtime() result cannot incur an overflow as the unsigned
 400  400                   * type has an adequate bound.
 401  401                   */
 402  402                  utime += (uhrtime_t)gethrtime();
 403  403                  if (utime > HRTIME_MAX) {
 404  404                          deadline = -1;
 405  405                  } else {
 406  406                          deadline = (hrtime_t)utime;
 407  407                  }
 408  408          }
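
The same conversion can be restated in isolation. Assuming the inputs have
already been validated as itimerspecfix() guarantees (both fields
non-negative, tv_nsec below one second), the deadline arithmetic reduces to
the sketch below; the helper name and the now_ns parameter (standing in for
a non-negative gethrtime() result) are illustrative. Note the kernel
additionally rounds the wait up to at least one clock tick.

    #include <stdint.h>

    #define NANOSEC         1000000000LL
    #define HRTIME_MAX      INT64_MAX

    static int64_t
    poll_deadline(int64_t now_ns, int64_t tv_sec, long tv_nsec)
    {
            uint64_t utime;

            if (tv_sec >= HRTIME_MAX / NANOSEC)
                    return (-1);    /* would overflow: wait indefinitely */
            utime = (uint64_t)tv_sec * NANOSEC + (uint64_t)tv_nsec;
            /* Bounded by 2 * HRTIME_MAX, so this cannot wrap a uint64_t. */
            utime += (uint64_t)now_ns;
            return (utime > HRTIME_MAX ? -1 : (int64_t)utime);
    }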
 409  409  
 410  410          /*
 411  411           * Check to see if the caller just wants to use poll() as a timeout.
  412  412           * If so, bypass all the other stuff and simply sleep.
 413  413           */
 414  414          if (nfds == 0) {
 415  415                  *fdcnt = 0;
 416  416                  /*
 417  417                   * Sleep until we have passed the requested future
 418  418                   * time or until interrupted by a signal.
 419  419                   * Do not check for signals if we do not want to wait.
 420  420                   */
 421  421                  if (deadline != 0) {
 422  422                          mutex_enter(&t->t_delay_lock);
 423  423                          while ((error = cv_timedwait_sig_hrtime(&t->t_delay_cv,
 424  424                              &t->t_delay_lock, deadline)) > 0)
 425  425                                  continue;
 426  426                          mutex_exit(&t->t_delay_lock);
 427  427                          return ((error == 0) ? EINTR : 0);
 428  428                  }
 429  429                  return (0);
 430  430          }
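
Seen from user level, this branch is the classic poll-as-timer idiom: with
no descriptors to watch, poll() degenerates into the timed wait above. For
example:

    #include <poll.h>
    #include <stdio.h>

    int
    main(void)
    {
            /* No fds to watch: poll() simply sleeps for ~250 ms. */
            int rv = poll(NULL, 0, 250);

            printf("poll returned %d (0 = timeout, -1 = signal/error)\n", rv);
            return (0);
    }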
 431  431  
 432  432          VERIFY(ps != NULL);
 433  433          pollfdp = ps->ps_pollfd;
 434  434          VERIFY(pollfdp != NULL);
 435  435  
 436  436          /*
  437  437           * If this thread is polling for the first time, allocate ALL poll
  438  438           * cache data structures and cache the poll fd list. This
  439  439           * allocation is delayed until now because lwps polling 0 fds
  440  440           * (i.e. using poll() as a timeout) don't need this memory.
 441  441           */
 442  442          mutex_enter(&ps->ps_lock);
 443  443          pcp = ps->ps_pcache;
 444  444          ASSERT(pcp != NULL);
 445  445          if (pcp->pc_bitmap == NULL) {
 446  446                  pcache_create(pcp, nfds);
 447  447                  /*
 448  448                   * poll and cache this poll fd list in ps_pcacheset[0].
 449  449                   */
 450  450                  error = pcacheset_cache_list(ps, fds, fdcnt, cacheindex);
 451  451                  if (error || *fdcnt) {
 452  452                          mutex_exit(&ps->ps_lock);
 453  453                          return (error);
 454  454                  }
 455  455          } else {
 456  456                  pollcacheset_t  *pcset = ps->ps_pcacheset;
 457  457  
 458  458                  /*
 459  459                   * Not first time polling. Select a cached poll list by
 460  460                   * matching user pollfd list buffer address.
 461  461                   */
 462  462                  for (cacheindex = 0; cacheindex < ps->ps_nsets; cacheindex++) {
 463  463                          if (pcset[cacheindex].pcs_usradr == (uintptr_t)fds) {
 464  464                                  if ((++pcset[cacheindex].pcs_count) == 0) {
 465  465                                          /*
 466  466                                           * counter is wrapping around.
 467  467                                           */
 468  468                                          pcacheset_reset_count(ps, cacheindex);
 469  469                                  }
 470  470                                  /*
  471  471                                   * Examine and resolve any
  472  472                                   * differences between the current poll
  473  473                                   * list and the previously cached one.
  474  474                                   * If there is an error during resolve(),
  475  475                                   * the callee guarantees the consistency
  476  476                                   * of the cached poll list and cache content.
 477  477                                   */
 478  478                                  error = pcacheset_resolve(ps, nfds, fdcnt,
 479  479                                      cacheindex);
 480  480                                  if (error) {
 481  481                                          mutex_exit(&ps->ps_lock);
 482  482                                          return (error);
 483  483                                  }
 484  484                                  break;
 485  485                          }
 486  486  
 487  487                          /*
  488  488                           * Note that the pcs_usradr field of a used entry won't be
  489  489                           * NULL because it stores the address of the passed-in fds,
  490  490                           * and NULL fds are never cached (in that case it is either
  491  491                           * the special timeout case when nfds is 0, or the call
  492  492                           * fails directly).
 493  493                           */
 494  494                          if (pcset[cacheindex].pcs_usradr == NULL) {
 495  495                                  /*
 496  496                                   * found an unused entry. Use it to cache
 497  497                                   * this poll list.
 498  498                                   */
 499  499                                  error = pcacheset_cache_list(ps, fds, fdcnt,
 500  500                                      cacheindex);
 501  501                                  if (error || *fdcnt) {
 502  502                                          mutex_exit(&ps->ps_lock);
 503  503                                          return (error);
 504  504                                  }
 505  505                                  break;
 506  506                          }
 507  507                  }
 508  508                  if (cacheindex == ps->ps_nsets) {
 509  509                          /*
 510  510                           * We failed to find a matching cached poll fd list.
  511  511                           * Replace an old list.
 512  512                           */
 513  513                          pollstats.polllistmiss.value.ui64++;
 514  514                          cacheindex = pcacheset_replace(ps);
 515  515                          ASSERT(cacheindex < ps->ps_nsets);
 516  516                          pcset[cacheindex].pcs_usradr = (uintptr_t)fds;
 517  517                          error = pcacheset_resolve(ps, nfds, fdcnt, cacheindex);
 518  518                          if (error) {
 519  519                                  mutex_exit(&ps->ps_lock);
 520  520                                  return (error);
 521  521                          }
 522  522                  }
 523  523          }
 524  524  
 525  525          /*
 526  526           * Always scan the bitmap with the lock on the pollcache held.
 527  527           * This is to make sure that a wakeup does not come undetected.
 528  528           * If the lock is not held, a pollwakeup could have come for an
 529  529           * fd we already checked but before this thread sleeps, in which
 530  530           * case the wakeup is missed. Now we hold the pcache lock and
 531  531           * check the bitmap again. This will prevent wakeup from happening
 532  532           * while we hold pcache lock since pollwakeup() will also lock
 533  533           * the pcache before updating poll bitmap.
 534  534           */
 535  535          mutex_enter(&pcp->pc_lock);
 536  536          for (;;) {
 537  537                  pcp->pc_flag = 0;
 538  538                  error = pcache_poll(pollfdp, ps, nfds, fdcnt, cacheindex);
 539  539                  if (error || *fdcnt) {
 540  540                          mutex_exit(&pcp->pc_lock);
 541  541                          mutex_exit(&ps->ps_lock);
 542  542                          break;
 543  543                  }
 544  544  
 545  545                  /*
 546  546                   * If PC_POLLWAKE is set, a pollwakeup() was performed on
 547  547                   * one of the file descriptors.  This can happen only if
 548  548                   * one of the VOP_POLL() functions dropped pcp->pc_lock.
  549  549                   * The only current cases of this are in procfs (prpoll())
 550  550                   * and STREAMS (strpoll()).
 551  551                   */
 552  552                  if (pcp->pc_flag & PC_POLLWAKE)
 553  553                          continue;
 554  554  
 555  555                  /*
 556  556                   * If you get here, the poll of fds was unsuccessful.
 557  557                   * Wait until some fd becomes readable, writable, or gets
 558  558                   * an exception, or until a signal or a timeout occurs.
 559  559                   * Do not check for signals if we have a zero timeout.
 560  560                   */
 561  561                  mutex_exit(&ps->ps_lock);
 562  562                  if (deadline == 0) {
 563  563                          error = -1;
 564  564                  } else {
 565  565                          error = cv_timedwait_sig_hrtime(&pcp->pc_cv,
 566  566                              &pcp->pc_lock, deadline);
 567  567                  }
 568  568                  mutex_exit(&pcp->pc_lock);
 569  569                  /*
 570  570                   * If we have received a signal or timed out
 571  571                   * then break out and return.
 572  572                   */
 573  573                  if (error <= 0) {
 574  574                          error = (error == 0) ? EINTR : 0;
 575  575                          break;
 576  576                  }
 577  577                  /*
 578  578                   * We have not received a signal or timed out.
 579  579                   * Continue around and poll fds again.
 580  580                   */
 581  581                  mutex_enter(&ps->ps_lock);
 582  582                  mutex_enter(&pcp->pc_lock);
 583  583          }
 584  584  
 585  585          return (error);
 586  586  }
 587  587  
 588  588  /*
 589  589   * This is the system call trap that poll(),
 590  590   * select() and pselect() are built upon.
 591  591   * It is a private interface between libc and the kernel.
 592  592   */
 593  593  int
 594  594  pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp)
 595  595  {
 596  596          kthread_t *t = curthread;
 597  597          klwp_t *lwp = ttolwp(t);
 598  598          proc_t *p = ttoproc(t);
 599  599          timespec_t ts;
 600  600          timespec_t *tsp;
 601  601          k_sigset_t kset;
 602  602          pollstate_t *ps = NULL;
 603  603          pollfd_t *pollfdp = NULL;
 604  604          int error = 0, fdcnt = 0;
 605  605  
 606  606          /*
 607  607           * Copy in timeout
 608  608           */
 609  609          if (timeoutp == NULL) {
 610  610                  tsp = NULL;
 611  611          } else {
 612  612                  if (get_udatamodel() == DATAMODEL_NATIVE) {
 613  613                          if (copyin(timeoutp, &ts, sizeof (ts)))
 614  614                                  return (set_errno(EFAULT));
 615  615                  } else {
 616  616                          timespec32_t ts32;
 617  617  
 618  618                          if (copyin(timeoutp, &ts32, sizeof (ts32)))
 619  619                                  return (set_errno(EFAULT));
 620  620                          TIMESPEC32_TO_TIMESPEC(&ts, &ts32)
 621  621                  }
 622  622  
 623  623                  if (itimerspecfix(&ts))
 624  624                          return (set_errno(EINVAL));
 625  625                  tsp = &ts;
 626  626          }
 627  627  
 628  628          /*
 629  629           * Copy in and reset signal mask, if requested.
 630  630           */
 631  631          if (setp != NULL) {
 632  632                  sigset_t set;
 633  633  
 634  634                  if (copyin(setp, &set, sizeof (set)))
 635  635                          return (set_errno(EFAULT));
 636  636                  sigutok(&set, &kset);
 637  637  
 638  638                  mutex_enter(&p->p_lock);
 639  639                  schedctl_finish_sigblock(t);
 640  640                  lwp->lwp_sigoldmask = t->t_hold;
 641  641                  t->t_hold = kset;
 642  642                  t->t_flag |= T_TOMASK;
 643  643                  /*
 644  644                   * Call cv_reltimedwait_sig() just to check for signals.
 645  645                   * We will return immediately with either 0 or -1.
 646  646                   */
 647  647                  if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0,
 648  648                      TR_CLOCK_TICK)) {
 649  649                          mutex_exit(&p->p_lock);
 650  650                          error = EINTR;
 651  651                          goto pollout;
 652  652                  }
 653  653                  mutex_exit(&p->p_lock);
 654  654          }
 655  655  
 656  656          /*
 657  657           * Initialize pollstate and copy in pollfd data if present.
 658  658           * If nfds == 0, we will skip all of the copying and check steps and
 659  659           * proceed directly into poll_common to process the supplied timeout.
 660  660           */
 661  661          if (nfds != 0) {
 662  662                  if (nfds > p->p_fno_ctl) {
 663  663                          mutex_enter(&p->p_lock);
 664  664                          (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
 665  665                              p->p_rctls, p, RCA_SAFE);
 666  666                          mutex_exit(&p->p_lock);
 667  667                          error = EINVAL;
 668  668                          goto pollout;
 669  669                  }
 670  670  
 671  671                  /*
  672  672                   * Need to allocate memory for pollstate before anything else
  673  673                   * because the mutex and cv are created in this space.
 674  674                   */
 675  675                  ps = pollstate_create();
 676  676                  if (ps->ps_pcache == NULL)
 677  677                          ps->ps_pcache = pcache_alloc();
 678  678  
 679  679                  if ((error = poll_copyin(ps, fds, nfds)) != 0)
 680  680                          goto pollout;
 681  681                  pollfdp = ps->ps_pollfd;
 682  682          }
 683  683  
 684  684          /*
 685  685           * Perform the actual poll.
 686  686           */
 687  687          error = poll_common(ps, fds, nfds, tsp, &fdcnt);
 688  688  
 689  689  pollout:
 690  690          /*
 691  691           * If we changed the signal mask but we received no signal then restore
 692  692           * the signal mask.  Otherwise psig() will deal with the signal mask.
 693  693           */
 694  694          if (setp != NULL) {
 695  695                  mutex_enter(&p->p_lock);
 696  696                  if (lwp->lwp_cursig == 0) {
 697  697                          t->t_hold = lwp->lwp_sigoldmask;
 698  698                          t->t_flag &= ~T_TOMASK;
 699  699                  }
 700  700                  mutex_exit(&p->p_lock);
 701  701          }
 702  702  
 703  703          if (error)
 704  704                  return (set_errno(error));
 705  705          /*
 706  706           * Copy out the events and return the fdcnt to the user.
 707  707           */
 708  708          if (nfds != 0 && copyout(pollfdp, fds, nfds * sizeof (pollfd_t)))
 709  709                  return (set_errno(EFAULT));
 710  710  
 711  711  #ifdef DEBUG
 712  712          /*
 713  713           * Another sanity check:
 714  714           */
 715  715          if (fdcnt) {
 716  716                  int i, reventcnt = 0;
 717  717  
 718  718                  for (i = 0; i < nfds; i++) {
 719  719                          if (pollfdp[i].fd < 0) {
 720  720                                  ASSERT(pollfdp[i].revents == 0);
 721  721                                  continue;
 722  722                          }
 723  723                          if (pollfdp[i].revents) {
 724  724                                  reventcnt++;
 725  725                          }
 726  726                  }
 727  727                  ASSERT(fdcnt == reventcnt);
 728  728          } else {
 729  729                  int i;
 730  730  
 731  731                  for (i = 0; i < nfds; i++) {
 732  732                          ASSERT(pollfdp[i].revents == 0);
 733  733                  }
 734  734          }
 735  735  #endif  /* DEBUG */
 736  736  
 737  737          return (fdcnt);
 738  738  }
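
Because pollsys() takes a timespec_t where poll(2) takes milliseconds, the
libc side must translate before trapping. Below is a hedged sketch of such
a wrapper; the __pollsys symbol name and exact prototype are assumptions
made for illustration.

    #include <poll.h>
    #include <signal.h>
    #include <time.h>

    extern int __pollsys(struct pollfd *, nfds_t, const struct timespec *,
        const sigset_t *);

    int
    my_poll(struct pollfd *fds, nfds_t nfds, int timeout)
    {
            struct timespec ts, *tsp;

            if (timeout < 0) {
                    tsp = NULL;     /* negative timeout: wait indefinitely */
            } else {
                    ts.tv_sec = timeout / 1000;
                    ts.tv_nsec = (timeout % 1000) * 1000000;
                    tsp = &ts;
            }
            return (__pollsys(fds, nfds, tsp, NULL));
    }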
 739  739  
 740  740  /*
 741  741   * Clean up any state left around by poll(2). Called when a thread exits.
 742  742   */
 743  743  void
 744  744  pollcleanup()
 745  745  {
 746  746          pollstate_t *ps = curthread->t_pollstate;
 747  747          pollcache_t *pcp;
 748  748  
 749  749          if (ps == NULL)
 750  750                  return;
 751  751          pcp = ps->ps_pcache;
 752  752          /*
 753  753           * free up all cached poll fds
 754  754           */
 755  755          if (pcp == NULL) {
 756  756                  /* this pollstate is used by /dev/poll */
 757  757                  goto pollcleanout;
 758  758          }
 759  759  
 760  760          if (pcp->pc_bitmap != NULL) {
 761  761                  ASSERT(MUTEX_NOT_HELD(&ps->ps_lock));
 762  762                  /*
 763  763                   * a close lwp can race with us when cleaning up a polldat
  764  764           * entry. We hold the ps_lock when cleaning the hash table.
 765  765                   * Since this pollcache is going away anyway, there is no
 766  766                   * need to hold the pc_lock.
 767  767                   */
 768  768                  mutex_enter(&ps->ps_lock);
 769  769                  pcache_clean(pcp);
 770  770                  mutex_exit(&ps->ps_lock);
 771  771  #ifdef DEBUG
 772  772                  /*
 773  773                   * At this point, all fds cached by this lwp should be
 774  774                   * cleaned up. There should be no fd in fi_list still
  775  775                   * referencing this thread.
 776  776                   */
 777  777                  checkfpollinfo();       /* sanity check */
 778  778                  pollcheckphlist();      /* sanity check */
 779  779  #endif  /* DEBUG */
 780  780          }
 781  781          /*
  782  782           * Be sure no one is referencing this thread before exiting
 783  783           */
 784  784          mutex_enter(&pcp->pc_no_exit);
 785  785          ASSERT(pcp->pc_busy >= 0);
 786  786          while (pcp->pc_busy > 0)
 787  787                  cv_wait(&pcp->pc_busy_cv, &pcp->pc_no_exit);
 788  788          mutex_exit(&pcp->pc_no_exit);
 789  789  pollcleanout:
 790  790          pollstate_destroy(ps);
 791  791          curthread->t_pollstate = NULL;
 792  792  }
 793  793  
 794  794  /*
 795  795   * pollwakeup() - poke threads waiting in poll() for some event
 796  796   * on a particular object.
 797  797   *
 798  798   * The threads hanging off of the specified pollhead structure are scanned.
 799  799   * If their event mask matches the specified event(s), then pollnotify() is
 800  800   * called to poke the thread.
 801  801   *
 802  802   * Multiple events may be specified.  When POLLHUP or POLLERR are specified,
 803  803   * all waiting threads are poked.
 804  804   *
 805  805   * It is important that pollnotify() not drop the lock protecting the list
 806  806   * of threads.
 807  807   */
 808  808  void
 809  809  pollwakeup(pollhead_t *php, short events_arg)
 810  810  {
 811  811          polldat_t       *pdp;
 812  812          int             events = (ushort_t)events_arg;
 813  813          struct plist {
 814  814                  port_t *pp;
 815  815                  int     pevents;
 816  816                  struct plist *next;
 817  817                  };
 818  818          struct plist *plhead = NULL, *pltail = NULL;
 819  819  
 820  820  retry:
 821  821          PH_ENTER(php);
 822  822  
 823  823          for (pdp = php->ph_list; pdp; pdp = pdp->pd_next) {
 824  824                  if ((pdp->pd_events & events) ||
 825  825                      (events & (POLLHUP | POLLERR))) {
 826  826  
 827  827                          pollcache_t     *pcp;
 828  828  
 829  829                          if (pdp->pd_portev != NULL) {
 830  830                                  port_kevent_t   *pkevp = pdp->pd_portev;
 831  831                                  /*
 832  832                                   * Object (fd) is associated with an event port,
 833  833                                   * => send event notification to the port.
 834  834                                   */
 835  835                                  ASSERT(pkevp->portkev_source == PORT_SOURCE_FD);
 836  836                                  mutex_enter(&pkevp->portkev_lock);
 837  837                                  if (pkevp->portkev_flags & PORT_KEV_VALID) {
 838  838                                          int pevents;
 839  839  
 840  840                                          pkevp->portkev_flags &= ~PORT_KEV_VALID;
 841  841                                          pkevp->portkev_events |= events &
 842  842                                              (pdp->pd_events | POLLHUP |
 843  843                                              POLLERR);
 844  844                                          /*
 845  845                                           * portkev_lock mutex will be released
 846  846                                           * by port_send_event().
 847  847                                           */
 848  848                                          port_send_event(pkevp);
 849  849  
 850  850                                          /*
 851  851                                           * If we have some thread polling the
 852  852                                           * port's fd, add it to the list. They
 853  853                                           * will be notified later.
 854  854                                           * The port_pollwkup() will flag the
 855  855                                           * port_t so that it will not disappear
 856  856                                           * till port_pollwkdone() is called.
 857  857                                           */
 858  858                                          pevents =
 859  859                                              port_pollwkup(pkevp->portkev_port);
 860  860                                          if (pevents) {
 861  861                                                  struct plist *t;
 862  862                                                  t = kmem_zalloc(
 863  863                                                      sizeof (struct plist),
 864  864                                                      KM_SLEEP);
 865  865                                                  t->pp = pkevp->portkev_port;
 866  866                                                  t->pevents = pevents;
 867  867                                                  if (plhead == NULL) {
 868  868                                                          plhead = t;
 869  869                                                  } else {
 870  870                                                          pltail->next = t;
 871  871                                                  }
 872  872                                                  pltail = t;
 873  873                                          }
 874  874                                  } else {
 875  875                                          mutex_exit(&pkevp->portkev_lock);
 876  876                                  }
 877  877                                  continue;
 878  878                          }
 879  879  
 880  880                          pcp = pdp->pd_pcache;
 881  881  
 882  882                          /*
 883  883                           * Try to grab the lock for this thread. If
 884  884                           * we don't get it then we may deadlock so
 885  885                           * back out and restart all over again. Note
 886  886                           * that the failure rate is very very low.
 887  887                           */
 888  888                          if (mutex_tryenter(&pcp->pc_lock)) {
 889  889                                  pollnotify(pcp, pdp->pd_fd);
 890  890                                  mutex_exit(&pcp->pc_lock);
 891  891                          } else {
 892  892                                  /*
 893  893                                   * We are here because:
  894  894                                   *      1) This thread has been woken up
 895  895                                   *         and is trying to get out of poll().
 896  896                                   *      2) Some other thread is also here
 897  897                                   *         but with a different pollhead lock.
 898  898                                   *
 899  899                                   * So, we need to drop the lock on pollhead
 900  900                                   * because of (1) but we want to prevent
 901  901                                   * that thread from doing lwp_exit() or
 902  902                                   * devpoll close. We want to ensure that
  903  903                                   * the pollcache pointer remains valid.
 904  904                                   *
 905  905                                   * Solution: Grab the pcp->pc_no_exit lock,
 906  906                                   * increment the pc_busy counter, drop every
 907  907                                   * lock in sight. Get out of the way and wait
 908  908                                   * for type (2) threads to finish.
 909  909                                   */
 910  910  
 911  911                                  mutex_enter(&pcp->pc_no_exit);
 912  912                                  pcp->pc_busy++; /* prevents exit()'s */
 913  913                                  mutex_exit(&pcp->pc_no_exit);
 914  914  
 915  915                                  PH_EXIT(php);
 916  916                                  mutex_enter(&pcp->pc_lock);
 917  917                                  mutex_exit(&pcp->pc_lock);
 918  918                                  mutex_enter(&pcp->pc_no_exit);
 919  919                                  pcp->pc_busy--;
 920  920                                  if (pcp->pc_busy == 0) {
 921  921                                          /*
 922  922                                           * Wakeup the thread waiting in
 923  923                                           * thread_exit().
 924  924                                           */
 925  925                                          cv_signal(&pcp->pc_busy_cv);
 926  926                                  }
 927  927                                  mutex_exit(&pcp->pc_no_exit);
 928  928                                  goto retry;
 929  929                          }
 930  930                  }
 931  931          }
 932  932  
 933  933  
 934  934          /*
  935  935           * Event ports - If this php belongs to the port at the head of
  936  936           * the list, call port_pollwkdone() to release it. port_pollwkdone()
  937  937           * needs to be called before dropping the PH lock so that any new
  938  938           * thread attempting to poll this port is blocked. There can be
 939  939           * only one thread here in pollwakeup notifying this port's fd.
 940  940           */
 941  941          if (plhead != NULL && &plhead->pp->port_pollhd == php) {
 942  942                  struct plist *t;
 943  943                  port_pollwkdone(plhead->pp);
 944  944                  t = plhead;
 945  945                  plhead = plhead->next;
 946  946                  kmem_free(t, sizeof (struct plist));
 947  947          }
 948  948          PH_EXIT(php);
 949  949  
 950  950          /*
 951  951           * Event ports - Notify threads polling the event port's fd.
 952  952           * This is normally done in port_send_event() where it calls
 953  953           * pollwakeup() on the port. But, for PORT_SOURCE_FD source alone,
 954  954           * we do it here in pollwakeup() to avoid a recursive call.
 955  955           */
 956  956          if (plhead != NULL) {
 957  957                  php = &plhead->pp->port_pollhd;
 958  958                  events = plhead->pevents;
 959  959                  goto retry;
 960  960          }
 961  961  }
 962  962  
 963  963  /*
 964  964   * This function is called to inform a thread (or threads) that an event being
  965  965   * polled on has occurred.  The pollcache lock (pc_lock) should be held
 966  966   * on entry.
 967  967   */
 968  968  void
 969  969  pollnotify(pollcache_t *pcp, int fd)
 970  970  {
 971  971          ASSERT(fd < pcp->pc_mapsize);
 972  972          ASSERT(MUTEX_HELD(&pcp->pc_lock));
 973  973          BT_SET(pcp->pc_bitmap, fd);
 974  974          pcp->pc_flag |= PC_POLLWAKE;
 975  975          cv_broadcast(&pcp->pc_cv);
 976  976          pcache_wake_parents(pcp);
 977  977  }
 978  978  
 979  979  /*
  980  980   * Add a polldat entry to the pollhead ph_list. The polldat struct is used
  981  981   * by pollwakeup to wake sleeping pollers when polled events have happened.
 982  982   */
 983  983  void
 984  984  pollhead_insert(pollhead_t *php, polldat_t *pdp)
 985  985  {
 986  986          PH_ENTER(php);
 987  987          ASSERT(pdp->pd_next == NULL);
 988  988  #ifdef DEBUG
 989  989          {
 990  990                  /*
  991  991                   * the polldat should not already be on the list
 992  992                   */
 993  993                  polldat_t *wp;
 994  994                  for (wp = php->ph_list; wp; wp = wp->pd_next) {
 995  995                          ASSERT(wp != pdp);
 996  996                  }
 997  997          }
 998  998  #endif  /* DEBUG */
 999  999          pdp->pd_next = php->ph_list;
1000 1000          php->ph_list = pdp;
1001 1001          PH_EXIT(php);
1002 1002  }
1003 1003  
1004 1004  /*
1005 1005   * Delete the polldat entry from ph_list.
1006 1006   */
1007 1007  void
1008 1008  pollhead_delete(pollhead_t *php, polldat_t *pdp)
1009 1009  {
1010 1010          polldat_t *wp;
1011 1011          polldat_t **wpp;
1012 1012  
1013 1013          PH_ENTER(php);
1014 1014          for (wpp = &php->ph_list; (wp = *wpp) != NULL; wpp = &wp->pd_next) {
1015 1015                  if (wp == pdp) {
1016 1016                          *wpp = pdp->pd_next;
1017 1017                          pdp->pd_next = NULL;
1018 1018                          break;
1019 1019                  }
1020 1020          }
1021 1021  #ifdef DEBUG
1022 1022          /* assert that pdp is no longer in the list */
1023 1023          for (wp = *wpp; wp; wp = wp->pd_next) {
1024 1024                  ASSERT(wp != pdp);
1025 1025          }
1026 1026  #endif  /* DEBUG */
1027 1027          PH_EXIT(php);
1028 1028  }
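
The walk above uses the pointer-to-pointer unlink idiom: wpp always
addresses the link field that must be rewritten, so deleting the head
element needs no special case. The same idiom in a self-contained
user-space form:

    struct node {
            struct node *next;
    };

    /* Unlink target from the singly-linked list at *headp, if present. */
    static void
    list_unlink(struct node **headp, struct node *target)
    {
            struct node **npp;

            for (npp = headp; *npp != NULL; npp = &(*npp)->next) {
                    if (*npp == target) {
                            *npp = target->next;    /* rewrite owning link */
                            target->next = NULL;
                            break;
                    }
            }
    }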
1029 1029  
1030 1030  /*
 1031 1031   * Walk through the poll fd lists to see if they are identical. This is an
 1032 1032   * expensive operation and should not be done more than once for each poll()
 1033 1033   * call.
 1034 1034   *
 1035 1035   * As an optimization (i.e., not having to go through the lists more than
 1036 1036   * once), this routine also clears the revents field of pollfd in 'current'.
 1037 1037   * Zeroing out the revents field of each entry in the current poll list is
 1038 1038   * required by the poll man page.
1039 1039   *
 1040 1040   * Since the events field of the cached list has illegal poll events filtered
 1041 1041   * out, the same filtering is applied to the current list before comparison.
1042 1042   *
1043 1043   * The routine stops when it detects a meaningful difference, or when it
1044 1044   * exhausts the lists.
1045 1045   */
1046 1046  int
1047 1047  pcacheset_cmp(pollfd_t *current, pollfd_t *cached, pollfd_t *newlist, int n)
1048 1048  {
1049 1049          int    ix;
1050 1050  
1051 1051          for (ix = 0; ix < n; ix++) {
1052 1052                  /* Prefetch 64 bytes worth of 8-byte elements */
1053 1053                  if ((ix & 0x7) == 0) {
 1054 1054                          prefetch_write_many((caddr_t)&current[ix + 8]);
1055 1055                          prefetch_write_many((caddr_t)&cached[ix + 8]);
1056 1056                  }
1057 1057                  if (current[ix].fd == cached[ix].fd) {
1058 1058                          /*
 1059 1059                           * Filter out invalid poll events while we are
 1060 1060                           * inside the loop.
1061 1061                           */
1062 1062                          if (current[ix].events & ~VALID_POLL_EVENTS) {
1063 1063                                  current[ix].events &= VALID_POLL_EVENTS;
1064 1064                                  if (newlist != NULL)
1065 1065                                          newlist[ix].events = current[ix].events;
1066 1066                          }
1067 1067                          if (current[ix].events == cached[ix].events) {
1068 1068                                  current[ix].revents = 0;
1069 1069                                  continue;
1070 1070                          }
1071 1071                  }
1072 1072                  if ((current[ix].fd < 0) && (cached[ix].fd < 0)) {
1073 1073                          current[ix].revents = 0;
1074 1074                          continue;
1075 1075                  }
1076 1076                  return (ix);
1077 1077          }
1078 1078          return (ix);
1079 1079  }
1080 1080  
1081 1081  /*
1082 1082   * This routine returns a pointer to a cached poll fd entry, or NULL if it
1083 1083   * does not find it in the hash table.
1084 1084   */
1085 1085  polldat_t *
1086 1086  pcache_lookup_fd(pollcache_t *pcp, int fd)
1087 1087  {
1088 1088          int hashindex;
1089 1089          polldat_t *pdp;
1090 1090  
1091 1091          hashindex = POLLHASH(pcp->pc_hashsize, fd);
1092 1092          pdp = pcp->pc_hash[hashindex];
1093 1093          while (pdp != NULL) {
1094 1094                  if (pdp->pd_fd == fd)
1095 1095                          break;
1096 1096                  pdp = pdp->pd_hashnext;
1097 1097          }
1098 1098          return (pdp);
1099 1099  }
1100 1100  
1101 1101  polldat_t *
1102 1102  pcache_alloc_fd(int nsets)
1103 1103  {
1104 1104          polldat_t *pdp;
1105 1105  
1106 1106          pdp = kmem_zalloc(sizeof (polldat_t), KM_SLEEP);
1107 1107          if (nsets > 0) {
1108 1108                  pdp->pd_ref = kmem_zalloc(sizeof (xref_t) * nsets, KM_SLEEP);
1109 1109                  pdp->pd_nsets = nsets;
1110 1110          }
1111 1111          return (pdp);
1112 1112  }
1113 1113  
1114 1114  /*
 1115 1115   * This routine inserts a polldat into the pollcache's hash table. It
1116 1116   * may be necessary to grow the size of the hash table.
1117 1117   */
1118 1118  void
1119 1119  pcache_insert_fd(pollcache_t *pcp, polldat_t *pdp, nfds_t nfds)
1120 1120  {
1121 1121          int hashindex;
1122 1122          int fd;
1123 1123  
1124 1124          if ((pcp->pc_fdcount > pcp->pc_hashsize * POLLHASHTHRESHOLD) ||
1125 1125              (nfds > pcp->pc_hashsize * POLLHASHTHRESHOLD)) {
1126 1126                  pcache_grow_hashtbl(pcp, nfds);
1127 1127          }
1128 1128          fd = pdp->pd_fd;
1129 1129          hashindex = POLLHASH(pcp->pc_hashsize, fd);
1130 1130          pdp->pd_hashnext = pcp->pc_hash[hashindex];
1131 1131          pcp->pc_hash[hashindex] = pdp;
1132 1132          pcp->pc_fdcount++;
1133 1133  
1134 1134  #ifdef DEBUG
1135 1135          {
1136 1136                  /*
1137 1137                   * same fd should not appear on a hash list twice
1138 1138                   */
1139 1139                  polldat_t *pdp1;
1140 1140                  for (pdp1 = pdp->pd_hashnext; pdp1; pdp1 = pdp1->pd_hashnext) {
1141 1141                          ASSERT(pdp->pd_fd != pdp1->pd_fd);
1142 1142                  }
1143 1143          }
1144 1144  #endif  /* DEBUG */
1145 1145  }
1146 1146  
1147 1147  /*
1148 1148   * Grow the hash table -- either double the table size or round it to the
 1149 1149   * nearest multiple of POLLHASHCHUNKSZ, whichever is bigger. Rehash all the
1150 1150   * elements on the hash table.
1151 1151   */
1152 1152  void
1153 1153  pcache_grow_hashtbl(pollcache_t *pcp, nfds_t nfds)
1154 1154  {
1155 1155          int     oldsize;
1156 1156          polldat_t **oldtbl;
1157 1157          polldat_t *pdp, *pdp1;
1158 1158          int     i;
1159 1159  #ifdef DEBUG
1160 1160          int     count = 0;
1161 1161  #endif
1162 1162  
1163 1163          ASSERT(pcp->pc_hashsize % POLLHASHCHUNKSZ == 0);
1164 1164          oldsize = pcp->pc_hashsize;
1165 1165          oldtbl = pcp->pc_hash;
1166 1166          if (nfds > pcp->pc_hashsize * POLLHASHINC) {
1167 1167                  pcp->pc_hashsize = (nfds + POLLHASHCHUNKSZ - 1) &
1168 1168                      ~(POLLHASHCHUNKSZ - 1);
1169 1169          } else {
1170 1170                  pcp->pc_hashsize = pcp->pc_hashsize * POLLHASHINC;
1171 1171          }
1172 1172          pcp->pc_hash = kmem_zalloc(pcp->pc_hashsize * sizeof (polldat_t *),
1173 1173              KM_SLEEP);
1174 1174          /*
1175 1175           * rehash existing elements
1176 1176           */
1177 1177          pcp->pc_fdcount = 0;
1178 1178          for (i = 0; i < oldsize; i++) {
1179 1179                  pdp = oldtbl[i];
1180 1180                  while (pdp != NULL) {
1181 1181                          pdp1 = pdp->pd_hashnext;
1182 1182                          pcache_insert_fd(pcp, pdp, nfds);
1183 1183                          pdp = pdp1;
1184 1184  #ifdef DEBUG
1185 1185                          count++;
1186 1186  #endif
1187 1187                  }
1188 1188          }
1189 1189          kmem_free(oldtbl, oldsize * sizeof (polldat_t *));
1190 1190          ASSERT(pcp->pc_fdcount == count);
1191 1191  }
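
The sizing rule above can be exercised on its own: double the table, or round nfds up to a multiple of the chunk size, whichever is bigger. A minimal sketch, assuming a power-of-two chunk (which the mask arithmetic requires) and stand-in values for POLLHASHCHUNKSZ and POLLHASHINC; the real values live in poll_impl.h.

#include <stdio.h>

#define	CHUNKSZ	256	/* stand-in for POLLHASHCHUNKSZ; a power of 2 */
#define	INC	2	/* stand-in for POLLHASHINC */

/* compute a new hash size the way pcache_grow_hashtbl() does */
static unsigned int
new_hashsize(unsigned int hashsize, unsigned int nfds)
{
        if (nfds > hashsize * INC)
                return ((nfds + CHUNKSZ - 1) & ~(CHUNKSZ - 1));
        return (hashsize * INC);
}

int
main(void)
{
        /* modest growth just multiplies by INC ... */
        printf("%u\n", new_hashsize(256, 300));		/* prints 512 */
        /* ... a large nfds jumps to a CHUNKSZ-rounded size instead */
        printf("%u\n", new_hashsize(256, 5000));	/* prints 5120 */
        return (0);
}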
1192 1192  
1193 1193  void
1194 1194  pcache_grow_map(pollcache_t *pcp, int fd)
1195 1195  {
1196 1196          int     newsize;
1197 1197          ulong_t *newmap;
1198 1198  
1199 1199          /*
1200 1200           * grow to the nearest multiple of POLLMAPCHUNK, assuming POLLMAPCHUNK
1201 1201           * is a power of 2.
1202 1202           */
1203 1203          newsize = (fd + POLLMAPCHUNK) & ~(POLLMAPCHUNK - 1);
1204 1204          newmap = kmem_zalloc((newsize / BT_NBIPUL) * sizeof (ulong_t),
1205 1205              KM_SLEEP);
1206 1206          /*
1207 1207           * don't want pollwakeup to set a bit while growing the bitmap.
1208 1208           */
1209 1209          ASSERT(mutex_owned(&pcp->pc_lock) == 0);
1210 1210          mutex_enter(&pcp->pc_lock);
1211 1211          bcopy(pcp->pc_bitmap, newmap,
1212 1212              (pcp->pc_mapsize / BT_NBIPUL) * sizeof (ulong_t));
1213 1213          kmem_free(pcp->pc_bitmap,
1214 1214              (pcp->pc_mapsize / BT_NBIPUL) * sizeof (ulong_t));
1215 1215          pcp->pc_bitmap = newmap;
1216 1216          pcp->pc_mapsize = newsize;
1217 1217          mutex_exit(&pcp->pc_lock);
1218 1218  }
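
Two pieces of arithmetic do the work here. The round-up uses fd + POLLMAPCHUNK rather than fd + POLLMAPCHUNK - 1, so the new size is always strictly greater than fd and bit fd fits in the map; and each ulong_t word of the bitmap holds BT_NBIPUL bits. Below is a hedged userland sketch of both, with assumed stand-in macros rather than the real <sys/bitmap.h> ones.

#include <stdio.h>
#include <string.h>
#include <limits.h>

#define	MAPCHUNK	64	/* stand-in for POLLMAPCHUNK; a power of 2 */
#define	NBIPUL		(CHAR_BIT * sizeof (unsigned long))	/* BT_NBIPUL */

/* BT_SET/BT_TEST analogues: one bit per fd, NBIPUL bits per word */
#define	SET(map, fd)	((map)[(fd) / NBIPUL] |= 1UL << ((fd) % NBIPUL))
#define	TEST(map, fd)	(((map)[(fd) / NBIPUL] >> ((fd) % NBIPUL)) & 1UL)

int
main(void)
{
        int fd = 64;
        /* fd + MAPCHUNK (not fd + MAPCHUNK - 1): newsize is always > fd */
        int newsize = (fd + MAPCHUNK) & ~(MAPCHUNK - 1);
        unsigned long map[newsize / NBIPUL];	/* C99 VLA, for the sketch */

        memset(map, 0, sizeof (map));
        printf("fd %d -> map of %d bits, %zu bits per word\n",
            fd, newsize, NBIPUL);
        SET(map, fd);
        printf("bit for fd %d is %s\n", fd, TEST(map, fd) ? "set" : "clear");
        return (0);
}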
1219 1219  
1220 1220  /*
1221 1221   * remove all the references from the pollhead and fpollinfo lists.
1222 1222   */
1223 1223  void
1224 1224  pcache_clean(pollcache_t *pcp)
1225 1225  {
1226 1226          int i;
1227 1227          polldat_t **hashtbl;
1228 1228          polldat_t *pdp;
1229 1229  
1230 1230          ASSERT(MUTEX_HELD(&curthread->t_pollstate->ps_lock));
1231 1231          hashtbl = pcp->pc_hash;
1232 1232          for (i = 0; i < pcp->pc_hashsize; i++) {
1233 1233                  for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) {
1234 1234                          if (pdp->pd_php != NULL) {
1235 1235                                  pollhead_delete(pdp->pd_php, pdp);
1236 1236                                  pdp->pd_php = NULL;
1237 1237                          }
1238 1238                          if (pdp->pd_fp != NULL) {
1239 1239                                  delfpollinfo(pdp->pd_fd);
1240 1240                                  pdp->pd_fp = NULL;
1241 1241                          }
1242 1242                  }
1243 1243          }
1244 1244  }
1245 1245  
1246 1246  void
1247 1247  pcacheset_invalidate(pollstate_t *ps, polldat_t *pdp)
1248 1248  {
1249 1249          int     i;
1250 1250          int     fd = pdp->pd_fd;
1251 1251  
1252 1252          /*
1253 1253           * we come here because of an earlier close() on this cached poll fd.
1254 1254           */
1255 1255          ASSERT(pdp->pd_fp == NULL);
1256 1256          ASSERT(MUTEX_HELD(&ps->ps_lock));
1257 1257          pdp->pd_events = 0;
1258 1258          for (i = 0; i < ps->ps_nsets; i++) {
1259 1259                  xref_t          *refp;
1260 1260                  pollcacheset_t  *pcsp;
1261 1261  
1262 1262                  ASSERT(pdp->pd_ref != NULL);
1263 1263                  refp = &pdp->pd_ref[i];
1264 1264                  if (refp->xf_refcnt) {
1265 1265                          ASSERT(refp->xf_position >= 0);
1266 1266                          pcsp = &ps->ps_pcacheset[i];
1267 1267                          if (refp->xf_refcnt == 1) {
1268 1268                                  pcsp->pcs_pollfd[refp->xf_position].fd = -1;
1269 1269                                  refp->xf_refcnt = 0;
1270 1270                                  pdp->pd_count--;
1271 1271                          } else if (refp->xf_refcnt > 1) {
1272 1272                                  int     j;
1273 1273  
1274 1274                                  /*
1275 1275                                   * turn off every appearance in pcs_pollfd list
1276 1276                                   */
1277 1277                                  for (j = refp->xf_position;
1278 1278                                      j < pcsp->pcs_nfds; j++) {
1279 1279                                          if (pcsp->pcs_pollfd[j].fd == fd) {
1280 1280                                                  pcsp->pcs_pollfd[j].fd = -1;
1281 1281                                                  refp->xf_refcnt--;
1282 1282                                                  pdp->pd_count--;
1283 1283                                          }
1284 1284                                  }
1285 1285                          }
1286 1286                          ASSERT(refp->xf_refcnt == 0);
1287 1287                          refp->xf_position = POLLPOSINVAL;
1288 1288                  }
1289 1289          }
1290 1290          ASSERT(pdp->pd_count == 0);
1291 1291  }
1292 1292  
1293 1293  /*
1294 1294   * Insert poll fd into the pollcache, and add poll registration.
1295 1295   * This routine is called after getf() and before releasef(). So the vnode
1296 1296   * can not disappear even if we block here.
1297 1297   * If there is an error, the polled fd is not cached.
1298 1298   */
1299 1299  int
1300 1300  pcache_insert(pollstate_t *ps, file_t *fp, pollfd_t *pollfdp, int *fdcntp,
1301 1301      ssize_t pos, int which)
1302 1302  {
1303 1303          pollcache_t     *pcp = ps->ps_pcache;
1304 1304          polldat_t       *pdp;
1305 1305          int             error;
1306 1306          int             fd;
1307 1307          pollhead_t      *memphp = NULL;
1308 1308          xref_t          *refp;
1309 1309          int             newpollfd = 0;
1310 1310  
1311 1311          ASSERT(MUTEX_HELD(&ps->ps_lock));
1312 1312          /*
1313 1313           * The poll caching uses the existing VOP_POLL interface. If there
1314 1314           * are no polled events, we want the polled device to set its
1315 1315           * "someone is sleeping in poll" flag. When the polled events happen
1316 1316           * later, the driver will call pollwakeup(). We achieve this by
1317 1317           * always passing 0 in the third parameter ("anyyet") when calling
1318 1318           * VOP_POLL. This parameter is not looked at by drivers when the
1319 1319           * polled events exist. If a driver chooses to ignore this parameter
1320 1320           * and call pollwakeup whenever the polled events happen, that will
1321 1321           * be OK too.
1322 1322           */
1323 1323          ASSERT(curthread->t_pollcache == NULL);
1324 1324          error = VOP_POLL(fp->f_vnode, pollfdp->events, 0, &pollfdp->revents,
1325 1325              &memphp, NULL);
1326 1326          if (error) {
1327 1327                  return (error);
1328 1328          }
1329 1329          if (pollfdp->revents) {
1330 1330                  (*fdcntp)++;
1331 1331          }
1332 1332          /*
1333 1333           * polling the underlying device succeeded. Now we can cache it.
1334 1334           * A close can't come in here because we have not done a releasef()
1335 1335           * yet.
1336 1336           */
1337 1337          fd = pollfdp->fd;
1338 1338          pdp = pcache_lookup_fd(pcp, fd);
1339 1339          if (pdp == NULL) {
1340 1340                  ASSERT(ps->ps_nsets > 0);
1341 1341                  pdp = pcache_alloc_fd(ps->ps_nsets);
1342 1342                  newpollfd = 1;
1343 1343          }
1344 1344          /*
1345 1345           * If this entry was used to cache a poll fd which was closed, and
1346 1346           * this entry has not been cleaned, do it now.
1347 1347           */
1348 1348          if ((pdp->pd_count > 0) && (pdp->pd_fp == NULL)) {
1349 1349                  pcacheset_invalidate(ps, pdp);
1350 1350                  ASSERT(pdp->pd_next == NULL);
1351 1351          }
1352 1352          if (pdp->pd_count == 0) {
1353 1353                  pdp->pd_fd = fd;
1354 1354                  pdp->pd_fp = fp;
1355 1355                  addfpollinfo(fd);
1356 1356                  pdp->pd_thread = curthread;
1357 1357                  pdp->pd_pcache = pcp;
1358 1358                  /*
1359 1359                   * the entry is never used or cleared by removing a cached
1360 1360                   * pollfd (pcache_delete_fd). So all the fields should be clear.
1361 1361                   */
1362 1362                  ASSERT(pdp->pd_next == NULL);
1363 1363          }
1364 1364  
1365 1365          /*
1366 1366           * A polled fd is considered cached. So there should be a fpollinfo
1367 1367           * entry on uf_fpollinfo list.
1368 1368           */
1369 1369          ASSERT(infpollinfo(fd));
1370 1370          /*
1371 1371           * If there is an inconsistency, we want to know it here.
1372 1372           */
1373 1373          ASSERT(pdp->pd_fp == fp);
1374 1374  
1375 1375          /*
1376 1376           * XXX pd_events is a union of all polled events on this fd, possibly
1377 1377           * by different threads. Unless this is a new first poll(), pd_events
1378 1378           * never shrinks. If an event is no longer polled by a process, there
1379 1379           * is no way to cancel that event. In that case, poll degrades to its
1380 1380           * old form -- polling on this fd every time poll() is called. The
1381 1381           * assumption is an app always polls the same type of events.
1382 1382           */
1383 1383          pdp->pd_events |= pollfdp->events;
1384 1384  
1385 1385          pdp->pd_count++;
1386 1386          /*
1387 1387           * There is not much special handling for multiple appearances of
1388 1388           * the same fd other than xf_position always recording the first
1389 1389           * appearance in the poll list. If this is called from
1390 1390           * pcacheset_cache_list, a VOP_POLL is called on every pollfd entry;
1391 1391           * therefore each revents and fdcnt should be set correctly. If this
1392 1392           * is called from pcacheset_resolve, we don't care about fdcnt here.
1393 1393           * Pollreadmap will pick up the right count and handle the revents
1394 1394           * field of each pollfd entry.
1395 1395           */
1396 1396          ASSERT(pdp->pd_ref != NULL);
1397 1397          refp = &pdp->pd_ref[which];
1398 1398          if (refp->xf_refcnt == 0) {
1399 1399                  refp->xf_position = pos;
1400 1400          } else {
1401 1401                  /*
1402 1402                   * xf_position records the fd's first appearance in the poll list
1403 1403                   */
1404 1404                  if (pos < refp->xf_position) {
1405 1405                          refp->xf_position = pos;
1406 1406                  }
1407 1407          }
1408 1408          ASSERT(pollfdp->fd == ps->ps_pollfd[refp->xf_position].fd);
1409 1409          refp->xf_refcnt++;
1410 1410          if (fd >= pcp->pc_mapsize) {
1411 1411                  pcache_grow_map(pcp, fd);
1412 1412          }
1413 1413          if (fd > pcp->pc_mapend) {
1414 1414                  pcp->pc_mapend = fd;
1415 1415          }
1416 1416          if (newpollfd != 0) {
1417 1417                  pcache_insert_fd(ps->ps_pcache, pdp, ps->ps_nfds);
1418 1418          }
1419 1419          if (memphp) {
1420 1420                  if (pdp->pd_php == NULL) {
1421 1421                          pollhead_insert(memphp, pdp);
1422 1422                          pdp->pd_php = memphp;
1423 1423                  } else {
1424 1424                          if (memphp != pdp->pd_php) {
1425 1425                                  /*
1426 1426                                   * layered devices (e.g. console driver)
1427 1427                                   * may change the vnode and thus the pollhead
1428 1428                                   * pointer out from underneath us.
1429 1429                                   */
1430 1430                                  pollhead_delete(pdp->pd_php, pdp);
1431 1431                                  pollhead_insert(memphp, pdp);
1432 1432                                  pdp->pd_php = memphp;
1433 1433                          }
1434 1434                  }
1435 1435          }
1436 1436          /*
1437 1437           * Since there is a considerable window between VOP_POLL and when
1438 1438           * we actually put the polldat struct on the pollhead list, we could
1439 1439           * miss a pollwakeup. In the case of polling additional events, we
1440 1440           * don't update the events until after VOP_POLL. So we could miss
1441 1441           * a pollwakeup there too. So we always set the bit here just to be
1442 1442           * safe. The real performance gain is in subsequent pcache_poll.
1443 1443           */
1444 1444          mutex_enter(&pcp->pc_lock);
1445 1445          BT_SET(pcp->pc_bitmap, fd);
1446 1446          mutex_exit(&pcp->pc_lock);
1447 1447          return (0);
1448 1448  }
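
The xf_refcnt/xf_position bookkeeping above is easiest to see with a duplicated fd. A minimal sketch, using hypothetical types, of what repeated pcache_insert() calls effectively record for one cached set: the number of appearances and the first (lowest) position in the poll list.

#include <stdio.h>

typedef struct xref {		/* hypothetical xref_t analogue */
        int	xf_refcnt;	/* appearances of the fd in this set */
        int	xf_position;	/* first (lowest) appearance */
} xref_t;

#define	POSINVAL	(-1)	/* POLLPOSINVAL analogue */

int
main(void)
{
        int fds[] = { 4, 7, 4, 9, 4 };	/* fd 4 appears three times */
        int nfds = 5, i;
        xref_t ref = { 0, POSINVAL };

        for (i = 0; i < nfds; i++) {
                if (fds[i] != 4)
                        continue;
                /* only the first (lowest) position is recorded */
                if (ref.xf_refcnt == 0 || i < ref.xf_position)
                        ref.xf_position = i;
                ref.xf_refcnt++;
        }
        printf("fd 4: xf_refcnt %d, xf_position %d\n",
            ref.xf_refcnt, ref.xf_position);	/* prints 3 and 0 */
        return (0);
}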
1449 1449  
1450 1450  /*
1451 1451   * The entry is not really deleted. The fields are cleared so that the
1452 1452   * entry is no longer useful, but it will remain in the hash table for reuse
1453 1453   * later. It will be freed when the polling lwp exits.
1454 1454   */
1455 1455  int
1456 1456  pcache_delete_fd(pollstate_t *ps, int fd, size_t pos, int which, uint_t cevent)
1457 1457  {
1458 1458          pollcache_t     *pcp = ps->ps_pcache;
1459 1459          polldat_t       *pdp;
1460 1460          xref_t          *refp;
1461 1461  
1462 1462          ASSERT(fd < pcp->pc_mapsize);
1463 1463          ASSERT(MUTEX_HELD(&ps->ps_lock));
1464 1464  
1465 1465          pdp = pcache_lookup_fd(pcp, fd);
1466 1466          ASSERT(pdp != NULL);
1467 1467          ASSERT(pdp->pd_count > 0);
1468 1468          ASSERT(pdp->pd_ref != NULL);
1469 1469          refp = &pdp->pd_ref[which];
1470 1470          if (pdp->pd_count == 1) {
1471 1471                  pdp->pd_events = 0;
1472 1472                  refp->xf_position = POLLPOSINVAL;
1473 1473                  ASSERT(refp->xf_refcnt == 1);
1474 1474                  refp->xf_refcnt = 0;
1475 1475                  if (pdp->pd_php) {
1476 1476                          /*
1477 1477                           * It is possible for a wakeup thread to get ahead
1478 1478                           * of the following pollhead_delete and set the bit in
1479 1479                           * of the following pollhead_delete and set the bit
1480 1480                           * in the bitmap. It is OK because the bit will be cleared
1481 1481                           */
1482 1482                          pollhead_delete(pdp->pd_php, pdp);
1483 1483                          pdp->pd_php = NULL;
1484 1484                  }
1485 1485                  pdp->pd_count = 0;
1486 1486                  if (pdp->pd_fp != NULL) {
1487 1487                          pdp->pd_fp = NULL;
1488 1488                          delfpollinfo(fd);
1489 1489                  }
1490 1490                  mutex_enter(&pcp->pc_lock);
1491 1491                  BT_CLEAR(pcp->pc_bitmap, fd);
1492 1492                  mutex_exit(&pcp->pc_lock);
1493 1493                  return (0);
1494 1494          }
1495 1495          if ((cevent & POLLCLOSED) == POLLCLOSED) {
1496 1496                  /*
1497 1497                   * The fd cached here has been closed. This is the first
1498 1498                   * pcache_delete_fd called after the close. Clean up the
1499 1499                   * entire entry.
1500 1500                   */
1501 1501                  pcacheset_invalidate(ps, pdp);
1502 1502                  ASSERT(pdp->pd_php == NULL);
1503 1503                  mutex_enter(&pcp->pc_lock);
1504 1504                  BT_CLEAR(pcp->pc_bitmap, fd);
1505 1505                  mutex_exit(&pcp->pc_lock);
1506 1506                  return (0);
1507 1507          }
1508 1508  #ifdef DEBUG
1509 1509          if (getf(fd) != NULL) {
1510 1510                  ASSERT(infpollinfo(fd));
1511 1511                  releasef(fd);
1512 1512          }
1513 1513  #endif  /* DEBUG */
1514 1514          pdp->pd_count--;
1515 1515          ASSERT(refp->xf_refcnt > 0);
1516 1516          if (--refp->xf_refcnt == 0) {
1517 1517                  refp->xf_position = POLLPOSINVAL;
1518 1518          } else {
1519 1519                  ASSERT(pos >= refp->xf_position);
1520 1520                  if (pos == refp->xf_position) {
1521 1521                          /*
1522 1522                           * The xref position is no longer valid.
1523 1523                           * Reset it to a special value and let the
1524 1524                           * caller know it needs to call updatexref()
1525 1525                           * with a new xf_position value.
1526 1526                           */
1527 1527                          refp->xf_position = POLLPOSTRANS;
1528 1528                          return (1);
1529 1529                  }
1530 1530          }
1531 1531          return (0);
1532 1532  }
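
When this routine returns 1, the deleted reference was the first appearance of a duplicated fd and xf_position has been parked at the POLLPOSTRANS sentinel; the caller (see pcacheset_resolve() later in this file) must locate the next surviving appearance and pass it to pcache_update_xref(). A minimal sketch of that caller-side scan, with hypothetical names.

#include <stdio.h>

/* find the next appearance of fd after position 'pos' in a cached list */
static int
next_position(const int *cached, int nfds, int pos, int fd)
{
        int i;

        for (i = pos + 1; i < nfds; i++) {
                if (cached[i] == fd)
                        return (i);
        }
        return (-1);	/* refcnt > 1 means this should not happen */
}

int
main(void)
{
        int cached[] = { 5, 3, 5, 8 };

        /* entry 0 for fd 5 was just deleted; re-anchor the xref at 2 */
        printf("new xf_position: %d\n", next_position(cached, 4, 0, 5));
        return (0);
}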
1533 1533  
1534 1534  void
1535 1535  pcache_update_xref(pollcache_t *pcp, int fd, ssize_t pos, int which)
1536 1536  {
1537 1537          polldat_t       *pdp;
1538 1538  
1539 1539          pdp = pcache_lookup_fd(pcp, fd);
1540 1540          ASSERT(pdp != NULL);
1541 1541          ASSERT(pdp->pd_ref != NULL);
1542 1542          pdp->pd_ref[which].xf_position = pos;
1543 1543  }
1544 1544  
1545 1545  #ifdef DEBUG
1546 1546  /*
1547 1547   * For each polled fd, it's either in the bitmap or cached in
1548 1548   * the pcache hash table. If this routine returns 0, something is wrong.
1549 1549   */
1550 1550  static int
1551 1551  pollchecksanity(pollstate_t *ps, nfds_t nfds)
1552 1552  {
1553 1553          int             i;
1554 1554          int             fd;
1555 1555          pollcache_t     *pcp = ps->ps_pcache;
1556 1556          polldat_t       *pdp;
1557 1557          pollfd_t        *pollfdp = ps->ps_pollfd;
1558 1558          file_t          *fp;
1559 1559  
1560 1560          ASSERT(MUTEX_HELD(&ps->ps_lock));
1561 1561          for (i = 0; i < nfds; i++) {
1562 1562                  fd = pollfdp[i].fd;
1563 1563                  if (fd < 0) {
1564 1564                          ASSERT(pollfdp[i].revents == 0);
1565 1565                          continue;
1566 1566                  }
1567 1567                  if (pollfdp[i].revents == POLLNVAL)
1568 1568                          continue;
1569 1569                  if ((fp = getf(fd)) == NULL)
1570 1570                          continue;
1571 1571                  pdp = pcache_lookup_fd(pcp, fd);
1572 1572                  ASSERT(pdp != NULL);
1573 1573                  ASSERT(infpollinfo(fd));
1574 1574                  ASSERT(pdp->pd_fp == fp);
1575 1575                  releasef(fd);
1576 1576                  if (BT_TEST(pcp->pc_bitmap, fd))
1577 1577                          continue;
1578 1578                  if (pdp->pd_php == NULL)
1579 1579                          return (0);
1580 1580          }
1581 1581          return (1);
1582 1582  }
1583 1583  #endif  /* DEBUG */
1584 1584  
1585 1585  /*
1586 1586   * resolve the difference between the current poll list and a cached one.
1587 1587   */
1588 1588  int
1589 1589  pcacheset_resolve(pollstate_t *ps, nfds_t nfds, int *fdcntp, int which)
1590 1590  {
1591 1591          int             i;
1592 1592          pollcache_t     *pcp = ps->ps_pcache;
1593 1593          pollfd_t        *newlist = NULL;
1594 1594          pollfd_t        *current = ps->ps_pollfd;
1595 1595          pollfd_t        *cached;
1596 1596          pollcacheset_t  *pcsp;
1597 1597          int             common;
1598 1598          int             count = 0;
1599 1599          int             offset;
1600 1600          int             remain;
1601 1601          int             fd;
1602 1602          file_t          *fp;
1603 1603          int             fdcnt = 0;
1604 1604          int             cnt = 0;
1605 1605          nfds_t          old_nfds;
1606 1606          int             error = 0;
1607 1607          int             mismatch = 0;
1608 1608  
1609 1609          ASSERT(MUTEX_HELD(&ps->ps_lock));
1610 1610  #ifdef DEBUG
1611 1611          checkpolldat(ps);
1612 1612  #endif
1613 1613          pcsp = &ps->ps_pcacheset[which];
1614 1614          old_nfds = pcsp->pcs_nfds;
1615 1615          common = (nfds > old_nfds) ? old_nfds : nfds;
1616 1616          if (nfds != old_nfds) {
1617 1617                  /*
1618 1618                   * The length of the poll list has changed. Allocate a new
1619 1619                   * pollfd list.
1620 1620                   */
1621 1621                  newlist = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP);
1622 1622                  bcopy(current, newlist, sizeof (pollfd_t) * nfds);
1623 1623          }
1624 1624          /*
1625 1625           * Compare the overlapping part of the current fd list with the
1626 1626           * cached one. Whenever a difference is found, resolve it.
1627 1627           * The comparison is done on the current poll list and the
1628 1628           * cached list. But we may be setting up the newlist to be the
1629 1629           * cached list for the next poll.
1630 1630           */
1631 1631          cached = pcsp->pcs_pollfd;
1632 1632          remain = common;
1633 1633  
1634 1634          while (count < common) {
1635 1635                  int     tmpfd;
1636 1636                  pollfd_t *np;
1637 1637  
1638 1638                  np = (newlist != NULL) ? &newlist[count] : NULL;
1639 1639                  offset = pcacheset_cmp(&current[count], &cached[count], np,
1640 1640                      remain);
1641 1641                  /*
1642 1642                   * Collect stats. If the lists match in full on the first
1643 1643                   * comparison, it's a hit. Otherwise, it's a partial hit or miss.
1644 1644                   */
1645 1645                  if ((count == 0) && (offset == common)) {
1646 1646                          pollstats.pollcachehit.value.ui64++;
1647 1647                  } else {
1648 1648                          mismatch++;
1649 1649                  }
1650 1650                  count += offset;
1651 1651                  if (offset < remain) {
1652 1652                          ASSERT(count < common);
1653 1653                          ASSERT((current[count].fd != cached[count].fd) ||
1654 1654                              (current[count].events != cached[count].events));
1655 1655                          /*
1656 1656                           * Filter out invalid events.
1657 1657                           */
1658 1658                          if (current[count].events & ~VALID_POLL_EVENTS) {
1659 1659                                  if (newlist != NULL) {
1660 1660                                          newlist[count].events =
1661 1661                                              current[count].events &=
1662 1662                                              VALID_POLL_EVENTS;
1663 1663                                  } else {
1664 1664                                          current[count].events &=
1665 1665                                              VALID_POLL_EVENTS;
1666 1666                                  }
1667 1667                          }
1668 1668                          /*
1669 1669                           * when resolving a difference, we always remove the
1670 1670                           * old fd from the cache before inserting the new one.
1671 1671                           */
1672 1672                          if (cached[count].fd >= 0) {
1673 1673                                  tmpfd = cached[count].fd;
1674 1674                                  if (pcache_delete_fd(ps, tmpfd, count, which,
1675 1675                                      (uint_t)cached[count].events)) {
1676 1676                                          /*
1677 1677                                           * This should be rare but needed for
1678 1678                                           * correctness.
1679 1679                                           *
1680 1680                                           * The first appearance in the cached
1681 1681                                           * list is being "turned off". The same
1682 1682                                           * fd appears more than once in the cached
1683 1683                                           * poll list. Find the next one on the
1684 1684                                           * list and update the cached
1685 1685                                           * xf_position field.
1686 1686                                           */
1687 1687                                          for (i = count + 1; i < old_nfds; i++) {
1688 1688                                                  if (cached[i].fd == tmpfd) {
1689 1689                                                          pcache_update_xref(pcp,
1690 1690                                                              tmpfd, (ssize_t)i,
1691 1691                                                              which);
1692 1692                                                          break;
1693 1693                                                  }
1694 1694                                          }
1695 1695                                          ASSERT(i <= old_nfds);
1696 1696                                  }
1697 1697                                  /*
1698 1698                                   * In case a new cache list is allocated,
1699 1699                                   * need to keep both cache lists in sync
1700 1700                                   * b/c the new one can be freed if we have
1701 1701                                   * an error later.
1702 1702                                   */
1703 1703                                  cached[count].fd = -1;
1704 1704                                  if (newlist != NULL) {
1705 1705                                          newlist[count].fd = -1;
1706 1706                                  }
1707 1707                          }
1708 1708                          if ((tmpfd = current[count].fd) >= 0) {
1709 1709                                  /*
1710 1710                                   * add to the cached fd tbl and bitmap.
1711 1711                                   */
1712 1712                                  if ((fp = getf(tmpfd)) == NULL) {
1713 1713                                          current[count].revents = POLLNVAL;
1714 1714                                          if (newlist != NULL) {
1715 1715                                                  newlist[count].fd = -1;
1716 1716                                          }
1717 1717                                          cached[count].fd = -1;
1718 1718                                          fdcnt++;
1719 1719                                  } else {
1720 1720                                          /*
1721 1721                                           * Here we don't care about the
1722 1722                                           * fdcnt. We will examine the bitmap
1723 1723                                           * later and pick up the correct
1724 1724                                           * fdcnt there. So we never bother
1725 1725                                           * to check value of 'cnt'.
1726 1726                                           */
1727 1727                                          error = pcache_insert(ps, fp,
1728 1728                                              &current[count], &cnt,
1729 1729                                              (ssize_t)count, which);
1730 1730                                          /*
1731 1731                                           * if no error, we want to do releasef
1732 1732                                           * after we updated cache poll list
1733 1733                                           * entry so that close() won't race
1734 1734                                           * us.
1735 1735                                           */
1736 1736                                          if (error) {
1737 1737                                                  /*
1738 1738                                                   * If we encountered an error,
1739 1739                                                   * we have invalidated an
1740 1740                                                   * entry in cached poll list
1741 1741                                                   * (in pcache_delete_fd() above)
1742 1742                                                   * but failed to add one here.
1743 1743                                                   * This is OK b/c what's in the
1744 1744                                                   * cached list is consistent
1745 1745                                                   * with content of cache.
1746 1746                                                   * It will not have any ill
1747 1747                                                   * effect on next poll().
1748 1748                                                   */
1749 1749                                                  releasef(tmpfd);
1750 1750                                                  if (newlist != NULL) {
1751 1751                                                          kmem_free(newlist,
1752 1752                                                              nfds *
1753 1753                                                              sizeof (pollfd_t));
1754 1754                                                  }
1755 1755                                                  return (error);
1756 1756                                          }
1757 1757                                          /*
1758 1758                                           * If we have allocated a new(temp)
1759 1759                                           * cache list, we need to keep both
1760 1760                                           * in sync b/c the new one can be freed
1761 1761                                           * if we have an error later.
1762 1762                                           */
1763 1763                                          if (newlist != NULL) {
1764 1764                                                  newlist[count].fd =
1765 1765                                                      current[count].fd;
1766 1766                                                  newlist[count].events =
1767 1767                                                      current[count].events;
1768 1768                                          }
1769 1769                                          cached[count].fd = current[count].fd;
1770 1770                                          cached[count].events =
1771 1771                                              current[count].events;
1772 1772                                          releasef(tmpfd);
1773 1773                                  }
1774 1774                          } else {
1775 1775                                  current[count].revents = 0;
1776 1776                          }
1777 1777                          count++;
1778 1778                          remain = common - count;
1779 1779                  }
1780 1780          }
1781 1781          if (mismatch != 0) {
1782 1782                  if (mismatch == common) {
1783 1783                          pollstats.pollcachemiss.value.ui64++;
1784 1784                  } else {
1785 1785                          pollstats.pollcachephit.value.ui64++;
1786 1786                  }
1787 1787          }
1788 1788          /*
1789 1789           * take care of the non-overlapping part of the list
1790 1790           */
1791 1791          if (nfds > old_nfds) {
1792 1792                  ASSERT(newlist != NULL);
1793 1793                  for (i = old_nfds; i < nfds; i++) {
1794 1794                          /* filter out invalid events */
1795 1795                          if (current[i].events & ~VALID_POLL_EVENTS) {
1796 1796                                  newlist[i].events = current[i].events =
1797 1797                                      current[i].events & VALID_POLL_EVENTS;
1798 1798                          }
1799 1799                          if ((fd = current[i].fd) < 0) {
1800 1800                                  current[i].revents = 0;
1801 1801                                  continue;
1802 1802                          }
1803 1803                          /*
1804 1804                           * add to the cached fd tbl and bitmap.
1805 1805                           */
1806 1806                          if ((fp = getf(fd)) == NULL) {
1807 1807                                  current[i].revents = POLLNVAL;
1808 1808                                  newlist[i].fd = -1;
1809 1809                                  fdcnt++;
1810 1810                                  continue;
1811 1811                          }
1812 1812                          /*
1813 1813                           * Here we don't care about the
1814 1814                           * fdcnt. We will examine the bitmap
1815 1815                           * later and pick up the correct
1816 1816                           * fdcnt there. So we never bother to
1817 1817                           * check 'cnt'.
1818 1818                           */
1819 1819                          error = pcache_insert(ps, fp, &current[i], &cnt,
1820 1820                              (ssize_t)i, which);
1821 1821                          releasef(fd);
1822 1822                          if (error) {
1823 1823                                  /*
1824 1824                                   * Here we are halfway through adding newly
1825 1825                                   * polled fds. Undo enough to keep the cache
1826 1826                                   * list consistent with the cache content.
1827 1827                                   */
1828 1828                                  pcacheset_remove_list(ps, current, old_nfds,
1829 1829                                      i, which, 0);
1830 1830                                  kmem_free(newlist, nfds * sizeof (pollfd_t));
1831 1831                                  return (error);
1832 1832                          }
1833 1833                  }
1834 1834          }
1835 1835          if (old_nfds > nfds) {
1836 1836                  /*
1837 1837                   * remove the fd's which are no longer polled.
1838 1838                   */
1839 1839                  pcacheset_remove_list(ps, pcsp->pcs_pollfd, nfds, old_nfds,
1840 1840                      which, 1);
1841 1841          }
1842 1842          /*
1843 1843           * Set difference resolved. Update nfds and the cached list
1844 1844           * in the pollstate struct.
1845 1845           */
1846 1846          if (newlist != NULL) {
1847 1847                  kmem_free(pcsp->pcs_pollfd, old_nfds * sizeof (pollfd_t));
1848 1848                  /*
1849 1849                   * By now, the pollfd revents fields should
1850 1850                   * all be zeroed.
1851 1851                   */
1852 1852                  pcsp->pcs_pollfd = newlist;
1853 1853                  pcsp->pcs_nfds = nfds;
1854 1854          }
1855 1855          ASSERT(*fdcntp == 0);
1856 1856          *fdcntp = fdcnt;
1857 1857          /*
1858 1858           * By now for every fd in pollfdp, one of the following should be
1859 1859           * true. Otherwise we will miss a polled event.
1860 1860           *
1861 1861           * 1. the bit corresponding to the fd in bitmap is set. So VOP_POLL
1862 1862           *    will be called on this fd in next poll.
1863 1863           * 2. the fd is cached in the pcache (i.e. pd_php is set). So
1864 1864           *    pollnotify will happen.
1865 1865           */
1866 1866          ASSERT(pollchecksanity(ps, nfds));
1867 1867          /*
1868 1868           * make sure cross reference between cached poll lists and cached
1869 1869           * poll fds are correct.
1870 1870           */
1871 1871          ASSERT(pollcheckxref(ps, which));
1872 1872          /*
1873 1873           * ensure each polldat in the pollcache references a polled fd in
1874 1874           * pollcacheset.
1875 1875           */
1876 1876  #ifdef DEBUG
1877 1877          checkpolldat(ps);
1878 1878  #endif
1879 1879          return (0);
1880 1880  }
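
pcacheset_cmp() (defined earlier in this file) supplies the 'offset' used by the resolve loop above: the length of the run over which the current and cached lists agree. Below is a simplified model of that contract under assumed types; the real routine also zeroes revents along the way and keeps the new cache list in step, which this sketch omits.

#include <stdio.h>

typedef struct pfd {	/* pollfd analogue: fd plus requested events */
        int	fd;
        short	events;
} pfd_t;

/* length of the matching prefix of current[] versus cached[] */
static int
match_run(const pfd_t *current, const pfd_t *cached, int remain)
{
        int n;

        for (n = 0; n < remain; n++) {
                if (current[n].fd != cached[n].fd ||
                    current[n].events != cached[n].events)
                        break;
        }
        return (n);
}

int
main(void)
{
        pfd_t cur[] = { { 3, 1 }, { 4, 1 }, { 9, 4 } };
        pfd_t old[] = { { 3, 1 }, { 4, 1 }, { 7, 4 } };

        /* two entries agree, so only index 2 needs to be resolved */
        printf("matching run: %d\n", match_run(cur, old, 3));
        return (0);
}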
1881 1881  
1882 1882  #ifdef DEBUG
1883 1883  static int
1884 1884  pollscanrevents(pollcache_t *pcp, pollfd_t *pollfdp, nfds_t nfds)
1885 1885  {
1886 1886          int i;
1887 1887          int reventcnt = 0;
1888 1888  
1889 1889          for (i = 0; i < nfds; i++) {
1890 1890                  if (pollfdp[i].fd < 0) {
1891 1891                          ASSERT(pollfdp[i].revents == 0);
1892 1892                          continue;
1893 1893                  }
1894 1894                  if (pollfdp[i].revents) {
1895 1895                          reventcnt++;
1896 1896                  }
1897 1897                  if (pollfdp[i].revents && (pollfdp[i].revents != POLLNVAL)) {
1898 1898                          ASSERT(BT_TEST(pcp->pc_bitmap, pollfdp[i].fd));
1899 1899                  }
1900 1900          }
1901 1901          return (reventcnt);
1902 1902  }
1903 1903  #endif  /* DEBUG */
1904 1904  
1905 1905  /*
1906 1906   * read the bitmap and poll on fds corresponding to the '1' bits. The ps_lock
1907 1907   * is held upon entry.
1908 1908   */
1909 1909  int
1910 1910  pcache_poll(pollfd_t *pollfdp, pollstate_t *ps, nfds_t nfds, int *fdcntp,
1911 1911      int which)
1912 1912  {
1913 1913          int             i;
1914 1914          pollcache_t     *pcp;
1915 1915          int             fd;
1916 1916          int             begin, end, done;
1917 1917          pollhead_t      *php;
1918 1918          int             fdcnt;
1919 1919          int             error = 0;
1920 1920          file_t          *fp;
1921 1921          polldat_t       *pdp;
1922 1922          xref_t          *refp;
1923 1923          int             entry;
1924 1924  
1925 1925          pcp = ps->ps_pcache;
1926 1926          ASSERT(MUTEX_HELD(&ps->ps_lock));
1927 1927          ASSERT(MUTEX_HELD(&pcp->pc_lock));
1928 1928  retry:
1929 1929          done = 0;
1930 1930          begin = 0;
1931 1931          fdcnt = 0;
1932 1932          end = pcp->pc_mapend;
1933 1933          while ((fdcnt < nfds) && !done) {
1934 1934                  php = NULL;
1935 1935                  /*
1936 1936                   * only poll fds which may have events
1937 1937                   */
1938 1938                  fd = bt_getlowbit(pcp->pc_bitmap, begin, end);
1939 1939                  ASSERT(fd <= end);
1940 1940                  if (fd >= 0) {
1941 1941                          ASSERT(pollcheckrevents(ps, begin, fd, which));
1942 1942                          /*
1943 1943                           * adjust map pointers for next round
1944 1944                           */
1945 1945                          if (fd == end) {
1946 1946                                  done = 1;
1947 1947                          } else {
1948 1948                                  begin = fd + 1;
1949 1949                          }
1950 1950                          /*
1951 1951                           * A bitmap caches poll state information of
1952 1952                           * multiple poll lists. Call VOP_POLL only if
1953 1953                           * the bit corresponds to an fd in this poll
1954 1954                           * list.
1955 1955                           */
1956 1956                          pdp = pcache_lookup_fd(pcp, fd);
1957 1957                          ASSERT(pdp != NULL);
1958 1958                          ASSERT(pdp->pd_ref != NULL);
1959 1959                          refp = &pdp->pd_ref[which];
1960 1960                          if (refp->xf_refcnt == 0)
1961 1961                                  continue;
1962 1962                          entry = refp->xf_position;
1963 1963                          ASSERT((entry >= 0) && (entry < nfds));
1964 1964                          ASSERT(pollfdp[entry].fd == fd);
1965 1965                          /*
1966 1966                           * Being in this routine implies that we have
1967 1967                           * successfully polled this fd in the past.
1968 1968                           * Check whether this fd was closed while we were
1969 1969                           * blocked in poll. This ensures that we don't
1970 1970                           * miss a close on the fd in case the fd is
1971 1971                           * reused.
1972 1972                           */
1973 1973                          if (pdp->pd_fp == NULL) {
1974 1974                                  ASSERT(pdp->pd_count > 0);
1975 1975                                  pollfdp[entry].revents = POLLNVAL;
1976 1976                                  fdcnt++;
1977 1977                                  if (refp->xf_refcnt > 1) {
1978 1978                                          /*
1979 1979                                           * this fd appeared multiple times
1980 1980                                           * in the poll list. Find all of them.
1981 1981                                           */
1982 1982                                          for (i = entry + 1; i < nfds; i++) {
1983 1983                                                  if (pollfdp[i].fd == fd) {
1984 1984                                                          pollfdp[i].revents =
1985 1985                                                              POLLNVAL;
1986 1986                                                          fdcnt++;
1987 1987                                                  }
1988 1988                                          }
1989 1989                                  }
1990 1990                                  pcacheset_invalidate(ps, pdp);
1991 1991                                  continue;
1992 1992                          }
1993 1993                          /*
1994 1994                           * We can be here polling a device that is being
1995 1995                           * closed (i.e. the file pointer is set to NULL,
1996 1996                           * but pollcacheclean has not happened yet).
1997 1997                           */
1998 1998                          if ((fp = getf(fd)) == NULL) {
1999 1999                                  pollfdp[entry].revents = POLLNVAL;
2000 2000                                  fdcnt++;
2001 2001                                  if (refp->xf_refcnt > 1) {
2002 2002                                          /*
2003 2003                                           * this fd appeared multiple times
2004 2004                                           * in the poll list. Find all of them.
2005 2005                                           */
2006 2006                                          for (i = entry + 1; i < nfds; i++) {
2007 2007                                                  if (pollfdp[i].fd == fd) {
2008 2008                                                          pollfdp[i].revents =
2009 2009                                                              POLLNVAL;
2010 2010                                                          fdcnt++;
2011 2011                                                  }
2012 2012                                          }
2013 2013                                  }
2014 2014                                  continue;
2015 2015                          }
2016 2016                          ASSERT(pdp->pd_fp == fp);
2017 2017                          ASSERT(infpollinfo(fd));
2018 2018                          /*
2019 2019                           * Since we no longer hold poll head lock across
2020 2020                           * VOP_POLL, the pollunlock logic can be simplified.
2021 2021                           */
2022 2022                          ASSERT(pdp->pd_php == NULL ||
2023 2023                              MUTEX_NOT_HELD(PHLOCK(pdp->pd_php)));
2024 2024                          /*
2025 2025                           * underlying file systems may set a "pollpending"
2026 2026                           * flag when they see that the poll may block.
2027 2027                           * Pollwakeup() is called by the wakeup thread if
2028 2028                           * pollpending is set. Pass a 0 fdcnt so that the
2029 2029                           * underlying file system will set the "pollpending"
2030 2030                           * flag when there are no polled events.
2031 2031                           *
2032 2032                           * Use pollfdp[].events for actual polling because
2033 2033                           * the pd_events is union of all cached poll events
2034 2034                           * on this fd. The events parameter also affects
2035 2035                           * how the polled device sets the "poll pending"
2036 2036                           * flag.
2037 2037                           */
2038 2038                          ASSERT(curthread->t_pollcache == NULL);
2039 2039                          error = VOP_POLL(fp->f_vnode, pollfdp[entry].events, 0,
2040 2040                              &pollfdp[entry].revents, &php, NULL);
2041 2041                          /*
2042 2042                           * releasef only after we are completely done with
2043 2043                           * this cached poll entry, to prevent close() from
2044 2044                           * coming in and clearing this entry.
2045 2045                           */
2046 2046                          if (error) {
2047 2047                                  releasef(fd);
2048 2048                                  break;
2049 2049                          }
2050 2050                          /*
2051 2051                           * layered devices (e.g. console driver)
2052 2052                           * may change the vnode and thus the pollhead
2053 2053                           * pointer out from underneath us.
2054 2054                           */
2055 2055                          if (php != NULL && pdp->pd_php != NULL &&
2056 2056                              php != pdp->pd_php) {
2057 2057                                  releasef(fd);
2058 2058                                  pollhead_delete(pdp->pd_php, pdp);
2059 2059                                  pdp->pd_php = php;
2060 2060                                  pollhead_insert(php, pdp);
2061 2061                                  /*
2062 2062                                   * We could have missed a wakeup on the new
2063 2063                                   * target device. Make sure the new target
2064 2064                                   * gets polled once.
2065 2065                                   */
2066 2066                                  BT_SET(pcp->pc_bitmap, fd);
2067 2067                                  goto retry;
2068 2068                          }
2069 2069  
2070 2070                          if (pollfdp[entry].revents) {
2071 2071                                  ASSERT(refp->xf_refcnt >= 1);
2072 2072                                  fdcnt++;
2073 2073                                  if (refp->xf_refcnt > 1) {
2074 2074                                          /*
2075 2075                                           * this fd appeared multiple times
2076 2076                                           * in the poll list. This is rare but
2077 2077                                           * we have to look at all of them for
2078 2078                                           * correctness.
2079 2079                                           */
2080 2080                                          error = plist_chkdupfd(fp, pdp, ps,
2081 2081                                              pollfdp, entry, &fdcnt);
2082 2082                                          if (error > 0) {
2083 2083                                                  releasef(fd);
2084 2084                                                  break;
2085 2085                                          }
2086 2086                                          if (error < 0) {
2087 2087                                                  goto retry;
2088 2088                                          }
2089 2089                                  }
2090 2090                                  releasef(fd);
2091 2091                          } else {
2092 2092                                  /*
2093 2093                                   * VOP_POLL didn't return any revents. We can
2094 2094                                   * clear the bit in the bitmap only if we have the
2095 2095                                   * pollhead ptr cached and no other cached
2096 2096                                   * entry is polling different events on this fd.
2097 2097                                   * VOP_POLL may have dropped the ps_lock. Make
2098 2098                                   * sure pollwakeup has not happened before
2099 2099                                   * clearing the bit.
2100 2100                                   */
2101 2101                                  if ((pdp->pd_php != NULL) &&
2102 2102                                      (pollfdp[entry].events == pdp->pd_events) &&
2103 2103                                      ((pcp->pc_flag & PC_POLLWAKE) == 0)) {
2104 2104                                          BT_CLEAR(pcp->pc_bitmap, fd);
2105 2105                                  }
2106 2106                                  /*
2107 2107                                   * if the fd can be cached now but not before,
2108 2108                                   * do it now.
2109 2109                                   */
2110 2110                                  if ((pdp->pd_php == NULL) && (php != NULL)) {
2111 2111                                          pdp->pd_php = php;
2112 2112                                          pollhead_insert(php, pdp);
2113 2113                                          /*
2114 2114                                           * We are inserting a polldat struct for
2115 2115                                           * the first time. We may have missed a
2116 2116                                           * wakeup on this device. Re-poll once.
2117 2117                                           * This should be a rare event.
2118 2118                                           */
2119 2119                                          releasef(fd);
2120 2120                                          goto retry;
2121 2121                                  }
2122 2122                                  if (refp->xf_refcnt > 1) {
2123 2123                                          /*
2124 2124                                           * this fd appeared multiple times
2125 2125                                           * in the poll list. This is rare but
2126 2126                                           * we have to look at all of them for
2127 2127                                           * correctness.
2128 2128                                           */
2129 2129                                          error = plist_chkdupfd(fp, pdp, ps,
2130 2130                                              pollfdp, entry, &fdcnt);
2131 2131                                          if (error > 0) {
2132 2132                                                  releasef(fd);
2133 2133                                                  break;
2134 2134                                          }
2135 2135                                          if (error < 0) {
2136 2136                                                  goto retry;
2137 2137                                          }
2138 2138                                  }
2139 2139                                  releasef(fd);
2140 2140                          }
2141 2141                  } else {
2142 2142                          done = 1;
2143 2143                          ASSERT(pollcheckrevents(ps, begin, end + 1, which));
2144 2144                  }
2145 2145          }
2146 2146          if (!error) {
2147 2147                  ASSERT(*fdcntp + fdcnt == pollscanrevents(pcp, pollfdp, nfds));
2148 2148                  *fdcntp += fdcnt;
2149 2149          }
2150 2150          return (error);
2151 2151  }
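
bt_getlowbit(), from the common bitmap code, is what makes this scan cheap: it returns the index of the lowest set bit within [begin, end], or -1 if none. Below is a naive bit-at-a-time sketch of the same contract (the kernel version works a word at a time), together with the visit-each-set-bit loop shape used above; the names are stand-ins, not the kernel API.

#include <stdio.h>
#include <limits.h>

#define	NBIPUL	(CHAR_BIT * sizeof (unsigned long))

/* lowest set bit index in map within [begin, end], or -1 if none */
static int
getlowbit(const unsigned long *map, int begin, int end)
{
        int i;

        for (i = begin; i <= end; i++) {
                if ((map[i / NBIPUL] >> (i % NBIPUL)) & 1UL)
                        return (i);
        }
        return (-1);
}

int
main(void)
{
        unsigned long map[1] = { 0 };
        int fd, end = NBIPUL - 1;

        map[0] |= 1UL << 5;	/* pretend fds 5 and 20 may have events */
        map[0] |= 1UL << 20;

        /* visit each set bit once, mirroring the pcache_poll loop */
        for (fd = getlowbit(map, 0, end); fd >= 0;
            fd = (fd == end) ? -1 : getlowbit(map, fd + 1, end))
                printf("poll fd %d\n", fd);
        return (0);
}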
2152 2152  
2153 2153  /*
2154 2154   * Going through the poll list without much locking. Poll all fds and
2155 2155   * cache all valid fds in the pollcache.
2156 2156   */
2157 2157  int
2158 2158  pcacheset_cache_list(pollstate_t *ps, pollfd_t *fds, int *fdcntp, int which)
2159 2159  {
2160 2160          pollfd_t        *pollfdp = ps->ps_pollfd;
2161 2161          pollcacheset_t  *pcacheset = ps->ps_pcacheset;
2162 2162          pollfd_t        *newfdlist;
2163 2163          int             i;
2164 2164          int             fd;
2165 2165          file_t          *fp;
2166 2166          int             error = 0;
2167 2167  
2168 2168          ASSERT(MUTEX_HELD(&ps->ps_lock));
2169 2169          ASSERT(which < ps->ps_nsets);
2170 2170          ASSERT(pcacheset != NULL);
2171 2171          ASSERT(pcacheset[which].pcs_pollfd == NULL);
2172 2172          newfdlist = kmem_alloc(ps->ps_nfds * sizeof (pollfd_t), KM_SLEEP);
2173 2173          /*
2174 2174           * cache the new poll list in the pollcacheset.
2175 2175           */
2176 2176          bcopy(pollfdp, newfdlist, sizeof (pollfd_t) * ps->ps_nfds);
2177 2177  
2178 2178          pcacheset[which].pcs_pollfd = newfdlist;
2179 2179          pcacheset[which].pcs_nfds = ps->ps_nfds;
2180 2180          pcacheset[which].pcs_usradr = (uintptr_t)fds;
2181 2181  
2182 2182          /*
2183 2183           * We have saved a copy of the current poll fd list in one pollcacheset.
2184 2184           * The 'revents' field of the new list is not yet set to 0. Looping
2185 2185           * through the new list just to do that is expensive. We do it
2186 2186           * while polling the list.
2187 2187           */
2188 2188          for (i = 0; i < ps->ps_nfds; i++) {
2189 2189                  fd = pollfdp[i].fd;
2190 2190                  /*
2191 2191                   * We also filter out the illegal poll events in the event
2192 2192                   * field for the cached poll list/set.
2193 2193                   */
2194 2194                  if (pollfdp[i].events & ~VALID_POLL_EVENTS) {
2195 2195                          newfdlist[i].events = pollfdp[i].events =
2196 2196                              pollfdp[i].events & VALID_POLL_EVENTS;
2197 2197                  }
2198 2198                  if (fd < 0) {
2199 2199                          pollfdp[i].revents = 0;
2200 2200                          continue;
2201 2201                  }
2202 2202                  if ((fp = getf(fd)) == NULL) {
2203 2203                          pollfdp[i].revents = POLLNVAL;
2204 2204                          /*
2205 2205                           * invalidate this cache entry in the cached poll list
2206 2206                           */
2207 2207                          newfdlist[i].fd = -1;
2208 2208                          (*fdcntp)++;
2209 2209                          continue;
2210 2210                  }
2211 2211                  /*
2212 2212                   * cache this fd.
2213 2213                   */
2214 2214                  error = pcache_insert(ps, fp, &pollfdp[i], fdcntp, (ssize_t)i,
2215 2215                      which);
2216 2216                  releasef(fd);
2217 2217                  if (error) {
2218 2218                          /*
2219 2219                           * Here we are halfway through caching a new
2220 2220                           * poll list. Undo everything.
2221 2221                           */
2222 2222                          pcacheset_remove_list(ps, pollfdp, 0, i, which, 0);
2223 2223                          kmem_free(newfdlist, ps->ps_nfds * sizeof (pollfd_t));
2224 2224                          pcacheset[which].pcs_pollfd = NULL;
2225 2225                          pcacheset[which].pcs_usradr = NULL;
2226 2226                          break;
2227 2227                  }
2228 2228          }
2229 2229          return (error);
2230 2230  }
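
/*
 * Illustrative sketch (editorial, not part of the original source): the
 * masking step above simply drops any bits outside VALID_POLL_EVENTS
 * before the entry is cached.  For example, with a hypothetical stray
 * bit 0x8000:
 *
 *	short ev = POLLIN | POLLOUT | 0x8000;
 *	ev &= VALID_POLL_EVENTS;	// now just POLLIN | POLLOUT
 *
 * Likewise, an fd that fails getf() is reported exactly once (revents
 * set to POLLNVAL, *fdcntp incremented) and invalidated in the cached
 * copy (fd = -1) so a later poll() does not re-report it from the cache.
 */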
2231 2231  
2232 2232  /*
2233 2233   * Called by pollcacheclean() to set the cached fp to NULL. It also sets the
2234 2234   * polled events in pcacheset entries to the special event POLLCLOSED. Do a
2235 2235   * pollwakeup to wake any sleeping poller, then remove the polldat from the
2236 2236   * driver. The routine is called with ps_lock held.
2237 2237   */
2238 2238  void
2239 2239  pcache_clean_entry(pollstate_t *ps, int fd)
2240 2240  {
2241 2241          pollcache_t     *pcp;
2242 2242          polldat_t       *pdp;
2243 2243          int             i;
2244 2244  
2245 2245          ASSERT(ps != NULL);
2246 2246          ASSERT(MUTEX_HELD(&ps->ps_lock));
2247 2247          pcp = ps->ps_pcache;
2248 2248          ASSERT(pcp);
2249 2249          pdp = pcache_lookup_fd(pcp, fd);
2250 2250          ASSERT(pdp != NULL);
2251 2251          /*
2252 2252           * The corresponding fpollinfo in fi_list has been removed by
2253 2253           * a close on this fd. Reset the cached fp pointer here.
2254 2254           */
2255 2255          pdp->pd_fp = NULL;
2256 2256          /*
2257 2257           * XXX - This routine also touches data in pcacheset struct.
2258 2258           *
2259 2259           * Set the event in the cached poll lists to POLLCLOSED. This
2260 2260           * invalidates the cached poll fd entry in that poll list, which
2261 2261           * will force a removal of this cached entry in the next poll().
2262 2262           * The cleanup is done at removal time.
2263 2263           */
2264 2264          ASSERT(pdp->pd_ref != NULL);
2265 2265          for (i = 0; i < ps->ps_nsets; i++) {
2266 2266                  xref_t          *refp;
2267 2267                  pollcacheset_t  *pcsp;
2268 2268  
2269 2269                  refp = &pdp->pd_ref[i];
2270 2270                  if (refp->xf_refcnt) {
2271 2271                          ASSERT(refp->xf_position >= 0);
2272 2272                          pcsp = &ps->ps_pcacheset[i];
2273 2273                          if (refp->xf_refcnt == 1) {
2274 2274                                  pcsp->pcs_pollfd[refp->xf_position].events =
2275 2275                                      (short)POLLCLOSED;
2276 2276                          }
2277 2277                          if (refp->xf_refcnt > 1) {
2278 2278                                  int     j;
2279 2279                                  /*
2280 2280                                   * mark every matching entry in pcs_pollfd
2281 2281                                   */
2282 2282                                  for (j = refp->xf_position;
2283 2283                                      j < pcsp->pcs_nfds; j++) {
2284 2284                                          if (pcsp->pcs_pollfd[j].fd == fd) {
2285 2285                                                  pcsp->pcs_pollfd[j].events =
2286 2286                                                      (short)POLLCLOSED;
2287 2287                                          }
2288 2288                                  }
2289 2289                          }
2290 2290                  }
2291 2291          }
2292 2292          if (pdp->pd_php) {
2293 2293                  pollwakeup(pdp->pd_php, POLLHUP);
2294 2294                  pollhead_delete(pdp->pd_php, pdp);
2295 2295                  pdp->pd_php = NULL;
2296 2296          }
2297 2297  }
2298 2298  
2299 2299  void
2300 2300  pcache_wake_parents(pollcache_t *pcp)
2301 2301  {
2302 2302          pcachelink_t *pl, *pln;
2303 2303  
2304 2304          ASSERT(MUTEX_HELD(&pcp->pc_lock));
2305 2305  
2306 2306          for (pl = pcp->pc_parents; pl != NULL; pl = pln) {
2307 2307                  mutex_enter(&pl->pcl_lock);
2308 2308                  if (pl->pcl_state == PCL_VALID) {
2309 2309                          ASSERT(pl->pcl_parent_pc != NULL);
2310 2310                          cv_broadcast(&pl->pcl_parent_pc->pc_cv);
2311 2311                  }
2312 2312                  pln = pl->pcl_parent_next;
2313 2313                  mutex_exit(&pl->pcl_lock);
2314 2314          }
2315 2315  }
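
/*
 * Editorial note (illustrative): pcl_parent_next is captured while
 * pcl_lock is still held, so the walk above remains safe even if the
 * link is torn down once the lock is dropped.  The general shape:
 *
 *	for (pl = head; pl != NULL; pl = pln) {
 *		mutex_enter(&pl->pcl_lock);
 *		...				// act on pl
 *		pln = pl->pcl_parent_next;	// capture next under lock
 *		mutex_exit(&pl->pcl_lock);
 *	}
 */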
2316 2316  
2317 2317  /*
2318 2318   * Initialize the thread's pollstate structure.
2319 2319   * It will persist for the life of the thread, until it calls pollcleanup().
2320 2320   */
2321 2321  pollstate_t *
2322 2322  pollstate_create()
2323 2323  {
2324 2324          pollstate_t *ps = curthread->t_pollstate;
2325 2325  
2326 2326          if (ps == NULL) {
2327 2327                  /*
2328 2328                   * This is the first time this thread has ever polled, so we
2329 2329                   * have to create its pollstate structure.
2330 2330                   */
2331 2331                  ps = kmem_zalloc(sizeof (pollstate_t), KM_SLEEP);
2332 2332                  ps->ps_nsets = POLLFDSETS;
2333 2333                  ps->ps_pcacheset = pcacheset_create(ps->ps_nsets);
2334 2334                  curthread->t_pollstate = ps;
2335 2335          } else {
2336 2336                  ASSERT(ps->ps_depth == 0);
2337 2337                  ASSERT(ps->ps_flags == 0);
2338 2338                  ASSERT(ps->ps_pc_stack[0] == 0);
2339 2339          }
2340 2340          return (ps);
2341 2341  }
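
/*
 * Editorial sketch of a hypothetical caller (not original source): poll
 * entry points obtain the per-thread pollstate lazily, then serialize on
 * ps_lock before touching the cached poll lists:
 *
 *	pollstate_t *ps = pollstate_create();	// cannot fail (KM_SLEEP)
 *	mutex_enter(&ps->ps_lock);
 *	...					// use ps_pollfd, ps_pcacheset
 *	mutex_exit(&ps->ps_lock);
 */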
2342 2342  
2343 2343  void
2344 2344  pollstate_destroy(pollstate_t *ps)
2345 2345  {
2346 2346          if (ps->ps_pollfd != NULL) {
2347 2347                  kmem_free(ps->ps_pollfd, ps->ps_nfds * sizeof (pollfd_t));
2348 2348                  ps->ps_pollfd = NULL;
2349 2349          }
2350 2350          if (ps->ps_pcache != NULL) {
2351 2351                  pcache_destroy(ps->ps_pcache);
2352 2352                  ps->ps_pcache = NULL;
2353 2353          }
2354 2354          pcacheset_destroy(ps->ps_pcacheset, ps->ps_nsets);
2355 2355          ps->ps_pcacheset = NULL;
2356 2356          if (ps->ps_dpbuf != NULL) {
2357 2357                  kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize);
2358 2358                  ps->ps_dpbuf = NULL;
2359 2359          }
2360 2360          mutex_destroy(&ps->ps_lock);
2361 2361          kmem_free(ps, sizeof (pollstate_t));
2362 2362  }
2363 2363  
2364 2364  static int
2365 2365  pollstate_contend(pollstate_t *ps, pollcache_t *pcp)
2366 2366  {
2367 2367          pollstate_t *rem, *next;
2368 2368          pollcache_t *desired_pc;
2369 2369          int result = 0, depth_total;
2370 2370  
2371 2371          mutex_enter(&pollstate_contenders_lock);
2372 2372          /*
2373 2373           * There is a small chance that the pollcache of interest became
2374 2374           * available while we were waiting on the contenders lock.
2375 2375           */
2376 2376          if (mutex_tryenter(&pcp->pc_lock) != 0) {
2377 2377                  goto out;
2378 2378          }
2379 2379  
2380 2380          /*
2381 2381           * Walk the list of contended pollstates, searching for evidence of a
2382 2382           * deadlock condition.
2383 2383           */
2384 2384          depth_total = ps->ps_depth;
2385 2385          desired_pc = pcp;
2386 2386          for (rem = pollstate_contenders; rem != NULL; rem = next) {
2387 2387                  int i, j;
2388 2388                  next = rem->ps_contend_nextp;
2389 2389  
2390 2390                  /* Is this pollstate holding the pollcache of interest? */
2391 2391                  for (i = 0; i < rem->ps_depth; i++) {
2392 2392                          if (rem->ps_pc_stack[i] != desired_pc) {
2393 2393                                  continue;
2394 2394                          }
2395 2395  
2396 2396                          /*
2397 2397                           * The remote pollstate holds the pollcache lock we
2398 2398                           * desire.  If it is waiting on a pollcache we hold,
2399 2399                           * then we can report the obvious deadlock.
2400 2400                           */
2401 2401                          ASSERT(rem->ps_contend_pc != NULL);
2402 2402                          for (j = 0; j < ps->ps_depth; j++) {
2403 2403                                  if (rem->ps_contend_pc == ps->ps_pc_stack[j]) {
2404 2404                                          rem->ps_flags |= POLLSTATE_STALEMATE;
2405 2405                                          result = -1;
2406 2406                                          goto out;
2407 2407                                  }
2408 2408                          }
2409 2409  
2410 2410                          /*
2411 2411                           * The remote pollstate is not blocking on a pollcache
2412 2412                           * which would deadlock against us.  That pollcache
2413 2413                           * may, however, be held by a pollstate which would
2414 2414                           * result in a deadlock.
2415 2415                           *
2416 2416                           * To detect such a condition, we continue walking
2417 2417                           * through the list using the pollcache blocking the
2418 2418                           * remote thread as our new search target.
2419 2419                           *
2420 2420                           * Return to the front of pollstate_contenders since it
2421 2421                           * is not ordered to guarantee complete dependency
2422 2422                           * traversal.  The below depth tracking places an upper
2423 2423                           * bound on iterations.
2424 2424                           */
2425 2425                          desired_pc = rem->ps_contend_pc;
2426 2426                          next = pollstate_contenders;
2427 2427  
2428 2428                          /*
2429 2429                           * The recursion depth of the remote pollstate is used
2430 2430                           * to calculate a final depth for the local /dev/poll
2431 2431                           * recursion, since those locks will be acquired
2432 2432                           * eventually.  If that value exceeds the defined
2433 2433                           * limit, we can report the failure now instead of
2434 2434                           * recursing to that failure depth.
2435 2435                           */
2436 2436                          depth_total += (rem->ps_depth - i);
2437 2437                          if (depth_total >= POLLMAXDEPTH) {
2438 2438                                  result = -1;
2439 2439                                  goto out;
2440 2440                          }
2441 2441                  }
2442 2442          }
2443 2443  
2444 2444          /*
2445 2445           * No deadlock partner was found.  The only course of action is to
2446 2446           * record ourselves as a contended pollstate and wait for the pollcache
2447 2447           * mutex to become available.
2448 2448           */
2449 2449          ps->ps_contend_pc = pcp;
2450 2450          ps->ps_contend_nextp = pollstate_contenders;
2451 2451          ps->ps_contend_pnextp = &pollstate_contenders;
2452 2452          if (pollstate_contenders != NULL) {
2453 2453                  pollstate_contenders->ps_contend_pnextp =
2454 2454                      &ps->ps_contend_nextp;
2455 2455          }
2456 2456          pollstate_contenders = ps;
2457 2457  
2458 2458          mutex_exit(&pollstate_contenders_lock);
2459 2459          mutex_enter(&pcp->pc_lock);
2460 2460          mutex_enter(&pollstate_contenders_lock);
2461 2461  
2462 2462          /*
2463 2463           * Our acquisition of the pollcache mutex may be due to another thread
2464 2464           * giving up in the face of deadlock with us.  If that is the case,
2465 2465           * we too should report the failure.
2466 2466           */
2467 2467          if ((ps->ps_flags & POLLSTATE_STALEMATE) != 0) {
2468 2468                  result = -1;
2469 2469                  ps->ps_flags &= ~POLLSTATE_STALEMATE;
2470 2470                  mutex_exit(&pcp->pc_lock);
2471 2471          }
2472 2472  
2473 2473          /* Remove ourselves from the contenders list. */
2474 2474          if (ps->ps_contend_nextp != NULL) {
2475 2475                  ps->ps_contend_nextp->ps_contend_pnextp =
2476 2476                      ps->ps_contend_pnextp;
2477 2477          }
2478 2478          *ps->ps_contend_pnextp = ps->ps_contend_nextp;
2479 2479          ps->ps_contend_pc = NULL;
2480 2480          ps->ps_contend_nextp = NULL;
2481 2481          ps->ps_contend_pnextp = NULL;
2482 2482  
2483 2483  out:
2484 2484          mutex_exit(&pollstate_contenders_lock);
2485 2485          return (result);
2486 2486  }
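
/*
 * Editorial worked example (illustrative): the simplest deadlock this
 * routine detects involves two threads and two pollcaches:
 *
 *	thread A: holds pcA->pc_lock, wants pcB->pc_lock
 *	thread B: holds pcB->pc_lock, wants pcA->pc_lock
 *
 * When A runs pollstate_contend(), it finds B on pollstate_contenders
 * holding the desired pcB (in ps_pc_stack) while contending on pcA
 * (ps_contend_pc), which A already holds.  A therefore flags B with
 * POLLSTATE_STALEMATE and returns -1; once A unwinds and releases pcA,
 * B acquires the lock, notices the flag, and reports failure as well.
 */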
2487 2487  
2488 2488  int
2489 2489  pollstate_enter(pollcache_t *pcp)
2490 2490  {
2491 2491          pollstate_t *ps = curthread->t_pollstate;
2492 2492          int i;
2493 2493  
2494 2494          if (ps == NULL) {
2495 2495                  /*
2496 2496                   * The thread pollstate may not be initialized if VOP_POLL is
2497 2497                   * called on a recursion-enabled /dev/poll handle from outside
2498 2498                   * the poll() or /dev/poll codepaths.
2499 2499                   */
2500 2500                  return (PSE_FAIL_POLLSTATE);
2501 2501          }
2502 2502          if (ps->ps_depth >= POLLMAXDEPTH) {
2503 2503                  return (PSE_FAIL_DEPTH);
2504 2504          }
2505 2505          /*
2506 2506           * Check the desired pollcache against pollcaches we already have
2507 2507           * locked.  Such a loop is the simplest deadlock scenario.
2508 2508           */
2509 2509          for (i = 0; i < ps->ps_depth; i++) {
2510 2510                  if (ps->ps_pc_stack[i] == pcp) {
2511 2511                          return (PSE_FAIL_LOOP);
2512 2512                  }
2513 2513          }
2514 2514          ASSERT(ps->ps_pc_stack[i] == NULL);
2515 2515  
2516 2516          if (ps->ps_depth == 0) {
2517 2517                  /* Locking the initial pollcache requires no caution */
2518 2518                  mutex_enter(&pcp->pc_lock);
2519 2519          } else if (mutex_tryenter(&pcp->pc_lock) == 0) {
2520 2520                  if (pollstate_contend(ps, pcp) != 0) {
2521 2521                          /* This pollcache cannot safely be locked. */
2522 2522                          return (PSE_FAIL_DEADLOCK);
2523 2523                  }
2524 2524          }
2525 2525  
2526 2526          ps->ps_pc_stack[ps->ps_depth++] = pcp;
2527 2527          return (PSE_SUCCESS);
2528 2528  }
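
/*
 * Editorial sketch of a hypothetical caller (not original source):
 * pollstate_enter() and pollstate_exit() must be strictly paired, and
 * the lock is held only on PSE_SUCCESS; the error mapping below is
 * purely illustrative:
 *
 *	if (pollstate_enter(pcp) != PSE_SUCCESS)
 *		return (EINVAL);	// pc_lock was NOT acquired
 *	...				// pcp->pc_lock is held here
 *	pollstate_exit(pcp);
 */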
2529 2529  
2530 2530  void
2531 2531  pollstate_exit(pollcache_t *pcp)
2532 2532  {
2533 2533          pollstate_t *ps = curthread->t_pollstate;
2534 2534  
2535 2535          VERIFY(ps != NULL);
2536 2536          VERIFY(ps->ps_pc_stack[ps->ps_depth - 1] == pcp);
2537 2537  
2538 2538          mutex_exit(&pcp->pc_lock);
2539 2539          ps->ps_pc_stack[--ps->ps_depth] = NULL;
2540 2540          VERIFY(ps->ps_depth >= 0);
2541 2541  }
2542 2542  
2543 2543  
2544 2544  /*
2545 2545   * We are holding the appropriate uf_lock entering this routine.
2546 2546   * Bump up the pc_busy count to prevent the thread from exiting.
2547 2547   */
2548 2548  void
2549 2549  pollblockexit(fpollinfo_t *fpip)
2550 2550  {
2551 2551          for (; fpip; fpip = fpip->fp_next) {
2552 2552                  pollcache_t *pcp = fpip->fp_thread->t_pollstate->ps_pcache;
2553 2553  
2554 2554                  mutex_enter(&pcp->pc_no_exit);
2555 2555                  pcp->pc_busy++;  /* prevents exit()'s */
2556 2556                  mutex_exit(&pcp->pc_no_exit);
2557 2557          }
2558 2558  }
2559 2559  
2560 2560  /*
2561 2561   * Complete phase 2 of cached poll fd cleanup. Call pcache_clean_entry to mark
2562 2562   * the pcacheset events field POLLCLOSED to force the next poll() to remove
2563 2563   * this cache entry. We can't clean up the polldat entry here because an lwp
2564 2564   * blocked in poll() needs the info to return. Wake up anyone blocked in poll
2565 2565   * and let the exiting lwp go. No lock is held upon entry, so it's OK for
2566 2566   * pcache_clean_entry to call pollwakeup().
2567 2567   */
2568 2568  void
2569 2569  pollcacheclean(fpollinfo_t *fip, int fd)
2570 2570  {
2571 2571          struct fpollinfo        *fpip, *fpip2;
2572 2572  
2573 2573          fpip = fip;
2574 2574          while (fpip) {
2575 2575                  pollstate_t *ps = fpip->fp_thread->t_pollstate;
2576 2576                  pollcache_t *pcp = ps->ps_pcache;
2577 2577  
2578 2578                  mutex_enter(&ps->ps_lock);
2579 2579                  pcache_clean_entry(ps, fd);
2580 2580                  mutex_exit(&ps->ps_lock);
2581 2581                  mutex_enter(&pcp->pc_no_exit);
2582 2582                  pcp->pc_busy--;
2583 2583                  if (pcp->pc_busy == 0) {
2584 2584                          /*
2585 2585                           * Wakeup the thread waiting in
2586 2586                           * thread_exit().
2587 2587                           */
2588 2588                          cv_signal(&pcp->pc_busy_cv);
2589 2589                  }
2590 2590                  mutex_exit(&pcp->pc_no_exit);
2591 2591  
2592 2592                  fpip2 = fpip;
2593 2593                  fpip = fpip->fp_next;
2594 2594                  kmem_free(fpip2, sizeof (fpollinfo_t));
2595 2595          }
2596 2596  }
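
/*
 * Editorial outline (illustrative; the exact call sites live in the
 * close path, outside this file): pollblockexit() and pollcacheclean()
 * form a reference-count handshake around thread exit:
 *
 *	pollblockexit(fpip);		// pc_busy++ for each cached thread
 *	...				// tear down fd state safely
 *	pollcacheclean(fpip, fd);	// pc_busy--, cv_signal at zero
 *
 * A thread blocked in thread_exit() waits on pc_busy_cv until pc_busy
 * drops to zero.
 */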
2597 2597  
2598 2598  /*
2599 2599   * One of the cache lines' counters is wrapping around. Reset all cache line
2600 2600   * counters to zero except the one at 'index'. This is simplistic, but works
2601 2601   * effectively.
2602 2602   */
2603 2603  void
2604 2604  pcacheset_reset_count(pollstate_t *ps, int index)
2605 2605  {
2606 2606          int     i;
2607 2607  
2608 2608          ASSERT(MUTEX_HELD(&ps->ps_lock));
2609 2609          for (i = 0; i < ps->ps_nsets; i++) {
2610 2610                  if (ps->ps_pcacheset[i].pcs_pollfd != NULL) {
2611 2611                          ps->ps_pcacheset[i].pcs_count = 0;
2612 2612                  }
2613 2613          }
2614 2614          ps->ps_pcacheset[index].pcs_count = 1;
2615 2615  }
2616 2616  
2617 2617  /*
2618 2618   * This routine implements the poll cache list replacement policy.
2619 2619   * It currently chooses the "least used" entry.
2620 2620   */
2621 2621  int
2622 2622  pcacheset_replace(pollstate_t *ps)
2623 2623  {
2624 2624          int i;
2625 2625          int index = 0;
2626 2626  
2627 2627          ASSERT(MUTEX_HELD(&ps->ps_lock));
2628 2628          for (i = 1; i < ps->ps_nsets; i++) {
2629 2629                  if (ps->ps_pcacheset[index].pcs_count >
2630 2630                      ps->ps_pcacheset[i].pcs_count) {
2631 2631                          index = i;
2632 2632                  }
2633 2633          }
2634 2634          ps->ps_pcacheset[index].pcs_count = 0;
2635 2635          return (index);
2636 2636  }
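
/*
 * Editorial worked example (illustrative): with ps_nsets == 3 and usage
 * counts pcs_count = {5, 2, 7}, the scan above settles on index 1, the
 * least-used set, zeroes its count, and returns 1 so the caller can
 * overwrite that cache line.
 */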
2637 2637  
2638 2638  /*
2639 2639   * This routine is called by strclose to remove any remaining polldat structs
2640 2640   * on the pollhead list of the device being closed. There are two reasons why
2641 2641   * the polldat structures still remain on the pollhead list:
2642 2642   *
2643 2643   * (1) The layered device (e.g. the console driver).
2644 2644   * In this case, the existence of a polldat implies that the thread putting
2645 2645   * the polldat on this list has not exited yet. Before the thread exits, it
2646 2646   * will have to hold this pollhead lock to remove the polldat. So holding the
2647 2647   * pollhead lock here effectively prevents the thread which put the polldat
2648 2648   * on this list from exiting.
2649 2649   *
2650 2650   * (2) /dev/poll.
2651 2651   * When a polled fd is cached in /dev/poll, its polldat will remain on the
2652 2652   * pollhead list if the process has not done a POLLREMOVE before closing the
2653 2653   * polled fd. We just unlink it here.
2654 2654   */
2655 2655  void
2656 2656  pollhead_clean(pollhead_t *php)
2657 2657  {
2658 2658          polldat_t       *pdp;
2659 2659  
2660 2660          /*
2661 2661           * In case (1), while we must prevent the thread in question from
2662 2662           * exiting, we must also obey the proper locking order, i.e.
2663 2663           * (ps_lock -> phlock).
2664 2664           */
2665 2665          PH_ENTER(php);
2666 2666          while (php->ph_list != NULL) {
2667 2667                  pollstate_t     *ps;
2668 2668                  pollcache_t     *pcp;
2669 2669  
2670 2670                  pdp = php->ph_list;
2671 2671                  ASSERT(pdp->pd_php == php);
2672 2672                  if (pdp->pd_thread == NULL) {
2673 2673                          /*
2674 2674                           * This is case (2). Since the ph_lock is sufficient
2675 2675                           * to synchronize this lwp with any other /dev/poll
2676 2676                           * lwp, just unlink the polldat.
2677 2677                           */
2678 2678                          php->ph_list = pdp->pd_next;
2679 2679                          pdp->pd_php = NULL;
2680 2680                          pdp->pd_next = NULL;
2681 2681                          continue;
2682 2682                  }
2683 2683                  ps = pdp->pd_thread->t_pollstate;
2684 2684                  ASSERT(ps != NULL);
2685 2685                  pcp = pdp->pd_pcache;
2686 2686                  ASSERT(pcp != NULL);
2687 2687                  mutex_enter(&pcp->pc_no_exit);
2688 2688                  pcp->pc_busy++;  /* prevents exit()'s */
2689 2689                  mutex_exit(&pcp->pc_no_exit);
2690 2690                  /*
2691 2691                   * Now get the locks in proper order to avoid deadlock.
2692 2692                   */
2693 2693                  PH_EXIT(php);
2694 2694                  mutex_enter(&ps->ps_lock);
2695 2695                  /*
2696 2696                   * While we dropped the pollhead lock, the element could
2697 2697                   * have been taken off the list already.
2698 2698                   */
2699 2699                  PH_ENTER(php);
2700 2700                  if (pdp->pd_php == php) {
2701 2701                          ASSERT(pdp == php->ph_list);
2702 2702                          php->ph_list = pdp->pd_next;
2703 2703                          pdp->pd_php = NULL;
2704 2704                          pdp->pd_next = NULL;
2705 2705                  }
2706 2706                  PH_EXIT(php);
2707 2707                  mutex_exit(&ps->ps_lock);
2708 2708                  mutex_enter(&pcp->pc_no_exit);
2709 2709                  pcp->pc_busy--;
2710 2710                  if (pcp->pc_busy == 0) {
2711 2711                          /*
2712 2712                           * Wakeup the thread waiting in
2713 2713                           * thread_exit().
2714 2714                           */
2715 2715                          cv_signal(&pcp->pc_busy_cv);
2716 2716                  }
2717 2717                  mutex_exit(&pcp->pc_no_exit);
2718 2718                  PH_ENTER(php);
2719 2719          }
2720 2720          PH_EXIT(php);
2721 2721  }
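
/*
 * Editorial note (illustrative): the dance above enforces the documented
 * lock order (ps_lock -> phlock) while keeping the target thread alive:
 *
 *	PH_ENTER(php);			// find pdp, bump pc_busy
 *	PH_EXIT(php);
 *	mutex_enter(&ps->ps_lock);	// correct order from here on
 *	PH_ENTER(php);			// revalidate: pdp may be gone
 *
 * The revalidation (pdp->pd_php == php) is required because the polldat
 * can be unlinked during the window in which no pollhead lock is held.
 */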
2722 2722  
2723 2723  /*
2724 2724   * pcacheset_remove_list() is called to clean up a partially cached 'current'
2725 2725   * list or to remove a partial list which is no longer cached. The flag value
2726 2726   * of 1 indicates the second case.
2727 2727   */
2728 2728  void
2729 2729  pcacheset_remove_list(pollstate_t *ps, pollfd_t *pollfdp, int start, int end,
2730 2730      int cacheindex, int flag)
2731 2731  {
2732 2732          int i;
2733 2733  
2734 2734          ASSERT(MUTEX_HELD(&ps->ps_lock));
2735 2735          for (i = start; i < end; i++) {
2736 2736                  if ((pollfdp[i].fd >= 0) &&
2737 2737                      (flag || !(pollfdp[i].revents & POLLNVAL))) {
2738 2738                          if (pcache_delete_fd(ps, pollfdp[i].fd, i, cacheindex,
2739 2739                              (uint_t)pollfdp[i].events)) {
2740 2740                                  int j;
2741 2741                                  int fd = pollfdp[i].fd;
2742 2742  
2743 2743                                  for (j = i + 1; j < end; j++) {
2744 2744                                          if (pollfdp[j].fd == fd) {
2745 2745                                                  pcache_update_xref(
2746 2746                                                      ps->ps_pcache, fd,
2747 2747                                                      (ssize_t)j, cacheindex);
2748 2748                                                  break;
2749 2749                                          }
2750 2750                                  }
2751 2751                                  ASSERT(j <= end);
2752 2752                          }
2753 2753                  }
2754 2754          }
2755 2755  }
2756 2756  
2757 2757  #ifdef DEBUG
2758 2758  
2759 2759  #include <sys/strsubr.h>
2760 2760  /*
2761 2761   * Make sure curthread is not on anyone's pollhead list anymore.
2762 2762   */
2763 2763  static void
2764 2764  pollcheckphlist()
2765 2765  {
2766 2766          int i;
2767 2767          file_t *fp;
2768 2768          uf_entry_t *ufp;
2769 2769          uf_info_t *fip = P_FINFO(curproc);
2770 2770          struct stdata *stp;
2771 2771          polldat_t *pdp;
2772 2772  
2773 2773          mutex_enter(&fip->fi_lock);
2774 2774          for (i = 0; i < fip->fi_nfiles; i++) {
2775 2775                  UF_ENTER(ufp, fip, i);
2776 2776                  if ((fp = ufp->uf_file) != NULL) {
2777 2777                          if ((stp = fp->f_vnode->v_stream) != NULL) {
2778 2778                                  PH_ENTER(&stp->sd_pollist);
2779 2779                                  pdp = stp->sd_pollist.ph_list;
2780 2780                                  while (pdp) {
2781 2781                                          ASSERT(pdp->pd_thread != curthread);
2782 2782                                          pdp = pdp->pd_next;
2783 2783                                  }
2784 2784                                  PH_EXIT(&stp->sd_pollist);
2785 2785                          }
2786 2786                  }
2787 2787                  UF_EXIT(ufp);
2788 2788          }
2789 2789          mutex_exit(&fip->fi_lock);
2790 2790  }
2791 2791  
2792 2792  /*
2793 2793   * For a resolved set's poll list, the xref info in the pcache should be
2794 2794   * consistent with this poll list.
2795 2795   */
2796 2796  static int
2797 2797  pollcheckxref(pollstate_t *ps, int cacheindex)
2798 2798  {
2799 2799          pollfd_t *pollfdp = ps->ps_pcacheset[cacheindex].pcs_pollfd;
2800 2800          pollcache_t *pcp = ps->ps_pcache;
2801 2801          polldat_t *pdp;
2802 2802          int     i;
2803 2803          xref_t  *refp;
2804 2804  
2805 2805          for (i = 0; i < ps->ps_pcacheset[cacheindex].pcs_nfds; i++) {
2806 2806                  if (pollfdp[i].fd < 0) {
2807 2807                          continue;
2808 2808                  }
2809 2809                  pdp = pcache_lookup_fd(pcp, pollfdp[i].fd);
2810 2810                  ASSERT(pdp != NULL);
2811 2811                  ASSERT(pdp->pd_ref != NULL);
2812 2812                  refp = &pdp->pd_ref[cacheindex];
2813 2813                  if (refp->xf_position >= 0) {
2814 2814                          ASSERT(refp->xf_refcnt >= 1);
2815 2815                          ASSERT(pollfdp[refp->xf_position].fd == pdp->pd_fd);
2816 2816                          if (refp->xf_refcnt > 1) {
2817 2817                                  int     j;
2818 2818                                  int     count = 0;
2819 2819  
2820 2820                                  for (j = refp->xf_position;
2821 2821                                      j < ps->ps_pcacheset[cacheindex].pcs_nfds;
2822 2822                                      j++) {
2823 2823                                          if (pollfdp[j].fd == pdp->pd_fd) {
2824 2824                                                  count++;
2825 2825                                          }
2826 2826                                  }
2827 2827                                  ASSERT(count == refp->xf_refcnt);
2828 2828                          }
2829 2829                  }
2830 2830          }
2831 2831          return (1);
2832 2832  }
2833 2833  
2834 2834  /*
2835 2835   * For every cached pollfd, its polldat struct should be consistent with
2836 2836   * what is in the pcacheset lists.
2837 2837   */
2838 2838  static void
2839 2839  checkpolldat(pollstate_t *ps)
2840 2840  {
2841 2841          pollcache_t     *pcp = ps->ps_pcache;
2842 2842          polldat_t       **hashtbl;
2843 2843          int             i;
2844 2844  
2845 2845          hashtbl = pcp->pc_hash;
2846 2846          for (i = 0; i < pcp->pc_hashsize; i++) {
2847 2847                  polldat_t       *pdp;
2848 2848  
2849 2849                  for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) {
2850 2850                          ASSERT(pdp->pd_ref != NULL);
2851 2851                          if (pdp->pd_count > 0) {
2852 2852                                  xref_t          *refp;
2853 2853                                  int             j;
2854 2854                                  pollcacheset_t  *pcsp;
2855 2855                                  pollfd_t        *pollfd;
2856 2856  
2857 2857                                  for (j = 0; j < ps->ps_nsets; j++) {
2858 2858                                          refp = &pdp->pd_ref[j];
2859 2859                                          if (refp->xf_refcnt > 0) {
2860 2860                                                  pcsp = &ps->ps_pcacheset[j];
2861 2861                                  ASSERT(refp->xf_position < pcsp->pcs_nfds);
2862 2862                                                  pollfd = pcsp->pcs_pollfd;
2863 2863                          ASSERT(pdp->pd_fd == pollfd[refp->xf_position].fd);
2864 2864                                          }
2865 2865                                  }
2866 2866                          }
2867 2867                  }
2868 2868          }
2869 2869  }
2870 2870  
2871 2871  /*
2872 2872   * Every wfd element on ph_list must have a corresponding fpollinfo on the
2873 2873   * uf_fpollinfo list. This is a variation of infpollinfo() that holds no locks.
2874 2874   */
2875 2875  void
2876 2876  checkwfdlist(vnode_t *vp, fpollinfo_t *fpip)
2877 2877  {
2878 2878          stdata_t *stp;
2879 2879          polldat_t *pdp;
2880 2880          fpollinfo_t *fpip2;
2881 2881  
2882 2882          if ((stp = vp->v_stream) == NULL) {
2883 2883                  return;
2884 2884          }
2885 2885          PH_ENTER(&stp->sd_pollist);
2886 2886          for (pdp = stp->sd_pollist.ph_list; pdp; pdp = pdp->pd_next) {
2887 2887                  if (pdp->pd_thread != NULL &&
2888 2888                      pdp->pd_thread->t_procp == curthread->t_procp) {
2889 2889                          for (fpip2 = fpip; fpip2; fpip2 = fpip2->fp_next) {
2890 2890                                  if (pdp->pd_thread == fpip2->fp_thread) {
2891 2891                                          break;
2892 2892                                  }
2893 2893                          }
2894 2894                          ASSERT(fpip2 != NULL);
2895 2895                  }
2896 2896          }
2897 2897          PH_EXIT(&stp->sd_pollist);
2898 2898  }
2899 2899  
2900 2900  /*
2901 2901   * For each cached fd whose bit is not set in the bitmap, its revents field
2902 2902   * in the current poll list should be 0.
2903 2903   */
2904 2904  static int
2905 2905  pollcheckrevents(pollstate_t *ps, int begin, int end, int cacheindex)
2906 2906  {
2907 2907          pollcache_t     *pcp = ps->ps_pcache;
2908 2908          pollfd_t        *pollfdp = ps->ps_pollfd;
2909 2909          int             i;
2910 2910  
2911 2911          for (i = begin; i < end; i++) {
2912 2912                  polldat_t       *pdp;
2913 2913  
2914 2914                  ASSERT(!BT_TEST(pcp->pc_bitmap, i));
2915 2915                  pdp = pcache_lookup_fd(pcp, i);
2916 2916                  if (pdp && pdp->pd_fp != NULL) {
2917 2917                          xref_t *refp;
2918 2918                          int entry;
2919 2919  
2920 2920                          ASSERT(pdp->pd_ref != NULL);
2921 2921                          refp = &pdp->pd_ref[cacheindex];
2922 2922                          if (refp->xf_refcnt == 0) {
2923 2923                                  continue;
2924 2924                          }
2925 2925                          entry = refp->xf_position;
2926 2926                          ASSERT(entry >= 0);
2927 2927                          ASSERT(pollfdp[entry].revents == 0);
2928 2928                          if (refp->xf_refcnt > 1) {
2929 2929                                  int j;
2930 2930  
2931 2931                                  for (j = entry + 1; j < ps->ps_nfds; j++) {
2932 2932                                          if (pollfdp[j].fd == i) {
2933 2933                                                  ASSERT(pollfdp[j].revents == 0);
2934 2934                                          }
2935 2935                                  }
2936 2936                          }
2937 2937                  }
2938 2938          }
2939 2939          return (1);
2940 2940  }
2941 2941  
2942 2942  #endif  /* DEBUG */
2943 2943  
2944 2944  pollcache_t *
2945 2945  pcache_alloc()
2946 2946  {
2947 2947          return (kmem_zalloc(sizeof (pollcache_t), KM_SLEEP));
2948 2948  }
2949 2949  
2950 2950  void
2951 2951  pcache_create(pollcache_t *pcp, nfds_t nfds)
2952 2952  {
2953 2953          size_t  mapsize;
2954 2954  
2955 2955          /*
2956 2956           * allocate enough bits for the poll fd list
2957 2957           */
2958 2958          if ((mapsize = POLLMAPCHUNK) <= nfds) {
2959 2959                  mapsize = (nfds + POLLMAPCHUNK - 1) & ~(POLLMAPCHUNK - 1);
2960 2960          }
2961 2961          pcp->pc_bitmap = kmem_zalloc((mapsize / BT_NBIPUL) * sizeof (ulong_t),
2962 2962              KM_SLEEP);
2963 2963          pcp->pc_mapsize = mapsize;
2964 2964          /*
2965 2965           * The hash size is at least POLLHASHCHUNKSZ. If the user polls a
2966 2966           * large number of fds to start with, allocate a bigger hash table
2967 2967           * (to the nearest multiple of POLLHASHCHUNKSZ) because dynamically
2968 2968           * growing a hash table is expensive.
2969 2969           */
2970 2970          if (nfds < POLLHASHCHUNKSZ) {
2971 2971                  pcp->pc_hashsize = POLLHASHCHUNKSZ;
2972 2972          } else {
2973 2973                  pcp->pc_hashsize = (nfds + POLLHASHCHUNKSZ - 1) &
2974 2974                      ~(POLLHASHCHUNKSZ - 1);
2975 2975          }
2976 2976          pcp->pc_hash = kmem_zalloc(pcp->pc_hashsize * sizeof (polldat_t *),
2977 2977              KM_SLEEP);
2978 2978  }
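
/*
 * Editorial worked example (assumes the chunk sizes are powers of two,
 * which the mask arithmetic requires): (n + C - 1) & ~(C - 1) rounds n
 * up to the next multiple of C.  E.g. with a hypothetical C == 256 and
 * n == 300:
 *
 *	(300 + 255) & ~255  ==  555 & ~255  ==  512
 *
 * so a request for 300 fds yields a 512-slot hash table.
 */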
2979 2979  
2980 2980  void
2981 2981  pcache_destroy(pollcache_t *pcp)
2982 2982  {
2983 2983          polldat_t       **hashtbl;
2984 2984          int i;
2985 2985  
2986 2986          hashtbl = pcp->pc_hash;
2987 2987          for (i = 0; i < pcp->pc_hashsize; i++) {
2988 2988                  if (hashtbl[i] != NULL) {
2989 2989                          polldat_t *pdp, *pdp2;
2990 2990  
2991 2991                          pdp = hashtbl[i];
2992 2992                          while (pdp != NULL) {
2993 2993                                  pdp2 = pdp->pd_hashnext;
2994 2994                                  if (pdp->pd_ref != NULL) {
2995 2995                                          kmem_free(pdp->pd_ref, sizeof (xref_t) *
2996 2996                                              pdp->pd_nsets);
2997 2997                                  }
2998 2998                                  kmem_free(pdp, sizeof (polldat_t));
2999 2999                                  pdp = pdp2;
3000 3000                                  pcp->pc_fdcount--;
3001 3001                          }
3002 3002                  }
3003 3003          }
3004 3004          ASSERT(pcp->pc_fdcount == 0);
3005 3005          kmem_free(pcp->pc_hash, sizeof (polldat_t *) * pcp->pc_hashsize);
3006 3006          kmem_free(pcp->pc_bitmap,
3007 3007              sizeof (ulong_t) * (pcp->pc_mapsize/BT_NBIPUL));
3008 3008          mutex_destroy(&pcp->pc_no_exit);
3009 3009          mutex_destroy(&pcp->pc_lock);
3010 3010          cv_destroy(&pcp->pc_cv);
3011 3011          cv_destroy(&pcp->pc_busy_cv);
3012 3012          kmem_free(pcp, sizeof (pollcache_t));
3013 3013  }
3014 3014  
3015 3015  pollcacheset_t *
3016 3016  pcacheset_create(int nsets)
3017 3017  {
3018 3018          return (kmem_zalloc(sizeof (pollcacheset_t) * nsets, KM_SLEEP));
3019 3019  }
3020 3020  
3021 3021  void
3022 3022  pcacheset_destroy(pollcacheset_t *pcsp, int nsets)
3023 3023  {
3024 3024          int i;
3025 3025  
3026 3026          for (i = 0; i < nsets; i++) {
3027 3027                  if (pcsp[i].pcs_pollfd != NULL) {
3028 3028                          kmem_free(pcsp[i].pcs_pollfd, pcsp[i].pcs_nfds *
3029 3029                              sizeof (pollfd_t));
3030 3030                  }
3031 3031          }
3032 3032          kmem_free(pcsp, sizeof (pollcacheset_t) * nsets);
3033 3033  }
3034 3034  
3035 3035  /*
3036 3036   * Check each duplicated poll fd in the poll list. It may be necessary to
3037 3037   * VOP_POLL the same fd again using different poll events. getf() has been
3038 3038   * done by the caller. This routine returns 0 if it can successfully process
3039 3039   * the entire poll fd list. It returns -1 if the underlying vnode has changed
3040 3040   * during a VOP_POLL, in which case the caller has to repoll. It returns a
3041 3041   * positive value if VOP_POLL failed.
3042 3042   */
3043 3043  static int
3044 3044  plist_chkdupfd(file_t *fp, polldat_t *pdp, pollstate_t *psp, pollfd_t *pollfdp,
3045 3045      int entry, int *fdcntp)
3046 3046  {
3047 3047          int     i;
3048 3048          int     fd;
3049 3049          nfds_t  nfds = psp->ps_nfds;
3050 3050  
3051 3051          fd = pollfdp[entry].fd;
3052 3052          for (i = entry + 1; i < nfds; i++) {
3053 3053                  if (pollfdp[i].fd == fd) {
3054 3054                          if (pollfdp[i].events == pollfdp[entry].events) {
3055 3055                                  if ((pollfdp[i].revents =
3056 3056                                      pollfdp[entry].revents) != 0) {
3057 3057                                          (*fdcntp)++;
3058 3058                                  }
3059 3059                          } else {
3060 3060  
3061 3061                                  int     error;
3062 3062                                  pollhead_t *php;
3063 3063                                  pollcache_t *pcp = psp->ps_pcache;
3064 3064  
3065 3065                                  /*
3066 3066                                   * The events are different. VOP_POLL on this
3067 3067                                   * fd so that we don't miss any revents.
3068 3068                                   */
3069 3069                                  php = NULL;
3070 3070                                  ASSERT(curthread->t_pollcache == NULL);
3071 3071                                  error = VOP_POLL(fp->f_vnode,
3072 3072                                      pollfdp[i].events, 0,
3073 3073                                      &pollfdp[i].revents, &php, NULL);
3074 3074                                  if (error) {
3075 3075                                          return (error);
3076 3076                                  }
3077 3077                                  /*
3078 3078                                   * Layered devices (e.g. the console driver)
3079 3079                                   * may change the vnode and thus the pollhead
3080 3080                                   * pointer out from underneath us.
3081 3081                                   */
3082 3082                                  if (php != NULL && pdp->pd_php != NULL &&
3083 3083                                      php != pdp->pd_php) {
3084 3084                                          pollhead_delete(pdp->pd_php, pdp);
3085 3085                                          pdp->pd_php = php;
3086 3086                                          pollhead_insert(php, pdp);
3087 3087                                          /*
3088 3088                                           * We could have missed a wakeup on the
3089 3089                                           * new target device. Make sure the new
3090 3090                                           * target gets polled once.
3091 3091                                           */
3092 3092                                          BT_SET(pcp->pc_bitmap, fd);
3093 3093                                          return (-1);
3094 3094                                  }
3095 3095                                  if (pollfdp[i].revents) {
3096 3096                                          (*fdcntp)++;
3097 3097                                  }
3098 3098                          }
3099 3099                  }
3100 3100          }
3101 3101          return (0);
3102 3102  }
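
/*
 * Editorial worked example (illustrative): given a poll list where fd 4
 * appears twice with different interest sets,
 *
 *	pollfdp[0] = (pollfd_t){ .fd = 4, .events = POLLIN };
 *	pollfdp[2] = (pollfd_t){ .fd = 4, .events = POLLOUT };
 *
 * entry 0's revents cannot simply be copied to entry 2, so the routine
 * issues a fresh VOP_POLL for entry 2 with POLLOUT.  Identical interest
 * sets, by contrast, just propagate revents and bump *fdcntp when the
 * copied revents is nonzero.
 */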
  