1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
  28 /*        All Rights Reserved   */
  29 
  30 /*
  31  * Copyright (c) 2012 by Delphix. All rights reserved.
  32  * Copyright 2015, Joyent, Inc.
  33  */
  34 
  35 /*
  36  * Portions of this source code were derived from Berkeley 4.3 BSD
  37  * under license from the Regents of the University of California.
  38  */
  39 
  40 #include <sys/param.h>
  41 #include <sys/isa_defs.h>
  42 #include <sys/types.h>
  43 #include <sys/sysmacros.h>
  44 #include <sys/user.h>
  45 #include <sys/systm.h>
  46 #include <sys/errno.h>
  47 #include <sys/time.h>
  48 #include <sys/vnode.h>
  49 #include <sys/file.h>
  50 #include <sys/mode.h>
  51 #include <sys/proc.h>
  52 #include <sys/uio.h>
  53 #include <sys/poll_impl.h>
  54 #include <sys/kmem.h>
  55 #include <sys/cmn_err.h>
  56 #include <sys/debug.h>
  57 #include <sys/bitmap.h>
  58 #include <sys/kstat.h>
  59 #include <sys/rctl.h>
  60 #include <sys/port_impl.h>
  61 #include <sys/schedctl.h>
  62 #include <sys/cpu.h>
  63 
  64 #define NPHLOCKS        64      /* Number of locks; must be power of 2 */
  65 #define PHLOCKADDR(php) &plocks[(((uintptr_t)(php)) >> 8) & (NPHLOCKS - 1)]
  66 #define PHLOCK(php)     PHLOCKADDR(php).pp_lock
  67 #define PH_ENTER(php)   mutex_enter(PHLOCK(php))
  68 #define PH_EXIT(php)    mutex_exit(PHLOCK(php))
  69 #define VALID_POLL_EVENTS       (POLLIN | POLLPRI | POLLOUT | POLLRDNORM \
  70         | POLLRDBAND | POLLWRBAND | POLLHUP | POLLERR | POLLNVAL)
  71 
  72 /*
  73  * global counters to collect some stats
  74  */
  75 static struct {
  76         kstat_named_t   polllistmiss;   /* failed to find a cached poll list */
  77         kstat_named_t   pollcachehit;   /* list matched 100% w/ cached one */
  78         kstat_named_t   pollcachephit;  /* list matched < 100% w/ cached one */
  79         kstat_named_t   pollcachemiss;  /* every list entry is dif from cache */
  80         kstat_named_t   pollunlockfail; /* failed to perform pollunlock */
  81 } pollstats = {
  82         { "polllistmiss",       KSTAT_DATA_UINT64 },
  83         { "pollcachehit",       KSTAT_DATA_UINT64 },
  84         { "pollcachephit",      KSTAT_DATA_UINT64 },
  85         { "pollcachemiss",      KSTAT_DATA_UINT64 },
  86         { "pollunlockfail",     KSTAT_DATA_UINT64 }
  87 };
  88 
  89 kstat_named_t *pollstats_ptr = (kstat_named_t *)&pollstats;
  90 uint_t pollstats_ndata = sizeof (pollstats) / sizeof (kstat_named_t);
  91 
  92 struct pplock   {
  93         kmutex_t        pp_lock;
  94         short           pp_flag;
  95         kcondvar_t      pp_wait_cv;
  96         int32_t         pp_pad;         /* to a nice round 16 bytes */
  97 };
  98 
  99 static struct pplock plocks[NPHLOCKS];  /* Hash array of pollhead locks */
 100 
 101 /* Contention lock & list for preventing deadlocks in recursive /dev/poll. */
 102 static  kmutex_t        pollstate_contenders_lock;
 103 static  pollstate_t     *pollstate_contenders = NULL;
 104 
 105 #ifdef DEBUG
 106 static int pollchecksanity(pollstate_t *, nfds_t);
 107 static int pollcheckxref(pollstate_t *, int);
 108 static void pollcheckphlist(void);
 109 static int pollcheckrevents(pollstate_t *, int, int, int);
 110 static void checkpolldat(pollstate_t *);
 111 #endif  /* DEBUG */
 112 static int plist_chkdupfd(file_t *, polldat_t *, pollstate_t *, pollfd_t *, int,
 113     int *);
 114 
 115 /*
 116  * Data structure overview:
 117  * The per-thread poll state consists of
 118  *      one pollstate_t
 119  *      one pollcache_t
 120  *      one bitmap with one event bit per fd
 121  *      a (two-dimensional) hashed array of polldat_t structures - one entry
 122  *      per fd
 123  *
 124  * This conglomerate of data structures interacts with
 125  *      the pollhead which is used by VOP_POLL and pollwakeup
 126  *      (protected by the PHLOCK, cached array of plocks), and
 127  *      the fpollinfo list hanging off the fi_list which is used to notify
 128  *      poll when a cached fd is closed. This is protected by uf_lock.
 129  *
 130  * Invariants:
 131  *      pd_php (pollhead pointer) is set iff (if and only if) the polldat
 132  *      is on that pollhead. This is modified atomically under pc_lock.
 133  *
 134  *      pd_fp (file_t pointer) is set iff the thread is on the fpollinfo
 135  *      list for that open file.
 136  *      This is modified atomically under pc_lock.
 137  *
 138  *      pd_count is the sum (over all values of i) of pd_ref[i].xf_refcnt.
 139  *      Iff pd_ref[i].xf_refcnt >= 1 then
 140  *              ps_pcacheset[i].pcs_pollfd[pd_ref[i].xf_position].fd == pd_fd
 141  *      Iff pd_ref[i].xf_refcnt > 1 then
 142  *              In ps_pcacheset[i].pcs_pollfd between index
 143  *              pd_ref[i].xf_position and the end of the list
 144  *              there are xf_refcnt entries with .fd == pd_fd
 145  *
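 * The pd_count invariant, expressed as an illustrative assertion over a
 * polldat_t *pdp (a sketch, not code from elsewhere in this file):
 *
 *      int i, sum = 0;
 *      for (i = 0; i < pdp->pd_nsets; i++)
 *              sum += pdp->pd_ref[i].xf_refcnt;
 *      ASSERT(sum == pdp->pd_count);
 *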
 146  * Locking design:
 147  * Whenever possible the design relies on the fact that the poll cache state
 148  * is per-thread and thus, for both poll and exit, self-synchronizing.
 149  * The key interactions where other threads access this state are:
 150  *      pollwakeup (and polltime), and
 151  *      close cleaning up the cached references to an open file
 152  *
 153  * The two key locks in poll proper are ps_lock and pc_lock.
 154  *
 155  * The ps_lock is used for synchronization between poll, (lwp_)exit and close
 156  * to ensure that modifications to pollcacheset structure are serialized.
 157  * This lock is held through most of poll() except where poll sleeps
 158  * since there is little need to handle closes concurrently with the execution
 159  * of poll.
 160  * The pc_lock protects most of the fields in pollcache structure and polldat
 161  * structures (which are accessed by poll, pollwakeup, and polltime)
 162  * with the exception of fields that are only modified when only one thread
 163  * can access this per-thread state.
 164  * Those exceptions occur in poll when first allocating the per-thread state,
 165  * when poll grows the number of polldat (never shrinks), and when
 166  * exit/pollcleanup has ensured that there are no references from either
 167  * pollheads or fpollinfo to the threads poll state.
 168  *
 169  * The poll(2) system call is the only path on which ps_lock and pc_lock are
 170  * both held, in that order. It needs ps_lock to synchronize with close and
 171  * lwp_exit, and pc_lock to synchronize with pollwakeup.
 172  *
 173  * The locking interaction between pc_lock and PHLOCK takes into account
 174  * that poll acquires these locks in the order of pc_lock and then PHLOCK
 175  * while pollwakeup does it in the reverse order. Thus pollwakeup implements
 176  * deadlock avoidance by dropping the locks and reacquiring them in the
 177  * reverse order. For this to work pollwakeup needs to prevent the thread
 178  * from exiting and freeing all of the poll related state. This is done
 179  * using
 180  *      the pc_no_exit lock
 181  *      the pc_busy counter
 182  *      the pc_busy_cv condition variable
 183  *
 184  * The locking interaction between pc_lock and uf_lock has similar
 185  * issues. Poll holds ps_lock and/or pc_lock across calls to getf/releasef
 186  * which acquire uf_lock. The poll cleanup in close needs to hold uf_lock
 187  * to prevent poll or exit from doing a delfpollinfo after which the thread
 188  * might exit. But the cleanup needs to acquire pc_lock when modifying
 189  * the poll cache state. The solution is to use pc_busy and do the close
 190  * cleanup in two phases:
 191  *      First close calls pollblockexit which increments pc_busy.
 192  *      This prevents the per-thread poll related state from being freed.
 193  *      Then close drops uf_lock and calls pollcacheclean.
 194  *      This routine can then acquire pc_lock and remove any references
 195  *      to the closing fd (as well as recording that it has been closed
 196  *      so that a POLLNVAL can be generated even if the fd is reused before
 197  *      poll has been woken up and checked getf() again).
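 *
 *      In outline (a sketch of the sequence just described; the argument
 *      names are assumed):
 *              pollblockexit(fpip);            increments pc_busy
 *              ... uf_lock dropped ...
 *              pollcacheclean(fpip, fd);       takes pc_lock, scrubs the fd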
 198  *
 199  * When removing a polled fd from poll cache, the fd is always removed
 200  * from pollhead list first and then from fpollinfo list, i.e.,
 201  * pollhead_delete() is called before delfpollinfo().
 202  *
 203  *
 204  * Locking hierarchy:
 205  *      pc_no_exit is a leaf level lock.
 206  *      ps_lock is held when acquiring pc_lock (except when pollwakeup
 207  *      acquires pc_lock).
 208  *      pc_lock is held when acquiring PHLOCK (in polladd/pollhead_insert/
 209  *      pollhead_delete, and in pollwakeup called from pcache_clean_entry).
 210  *      Holding pc_lock there is current practice, but it is not strictly
 211  *      required.
 213  *      pc_lock is held across addfpollinfo/delfpollinfo which acquire
 214  *      uf_lock.
 215  *      pc_lock is held across getf/releasef which acquire uf_lock.
 216  *      ps_lock might be held across getf/releasef which acquire uf_lock.
 217  *      pollwakeup tries to acquire pc_lock while holding PHLOCK
 218  *      but drops the locks and reacquires them in reverse order to avoid
 219  *      deadlock.
 220  *
 221  * Note also that there is deadlock avoidance support for VOP_POLL routines
 222  * and pollwakeup involving a file system or driver lock.
 223  * See below.
 224  */
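
/*
 * The documented common-case lock ordering on the poll(2) path, as a
 * minimal illustrative sketch (see poll_common() below for the real
 * sequence):
 *
 *      mutex_enter(&ps->ps_lock);
 *      mutex_enter(&pcp->pc_lock);
 *      ... scan the bitmap, calling VOP_POLL() where needed ...
 *      mutex_exit(&pcp->pc_lock);
 *      mutex_exit(&ps->ps_lock);
 */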
 225 
 226 /*
 227  * Deadlock avoidance support for VOP_POLL() routines.  This is
 228  * sometimes necessary to prevent deadlock between polling threads
 229  * (which hold poll locks on entry to xx_poll(), then acquire foo)
 230  * and pollwakeup() threads (which hold foo, then acquire poll locks).
 231  *
 232  * pollunlock(*cookie) releases whatever poll locks the current thread holds,
 233  *      setting a cookie for use by pollrelock();
 234  *
 235  * pollrelock(cookie) reacquires previously dropped poll locks;
 236  *
 237  * polllock(php, mutex) does the common case: pollunlock(),
 238  *      acquire the problematic mutex, pollrelock().
 239  *
 240  * If polllock() or pollunlock() return non-zero, it indicates that a recursive
 241  * /dev/poll is in progress and pollcache locks cannot be dropped.  Callers
 242  * must handle this by indicating a POLLNVAL in the revents of the VOP_POLL.
 243  */
 244 int
 245 pollunlock(int *lockstate)
 246 {
 247         pollstate_t *ps = curthread->t_pollstate;
 248         pollcache_t *pcp;
 249 
 250         ASSERT(lockstate != NULL);
 251 
 252         /*
 253          * There is no way to safely perform a pollunlock() while in the depths
 254          * of a recursive /dev/poll operation.
 255          */
 256         if (ps != NULL && ps->ps_depth > 1) {
 257                 ps->ps_flags |= POLLSTATE_ULFAIL;
 258                 pollstats.pollunlockfail.value.ui64++;
 259                 return (-1);
 260         }
 261 
 262         /*
 263          * t_pollcache is set by /dev/poll and event ports (port_fd.c).
 264          * If pollrelock/pollunlock is called as a result of poll(2),
 265          * the t_pollcache should be NULL.
 266          */
 267         if (curthread->t_pollcache == NULL)
 268                 pcp = ps->ps_pcache;
 269         else
 270                 pcp = curthread->t_pollcache;
 271 
 272         if (!mutex_owned(&pcp->pc_lock)) {
 273                 *lockstate = 0;
 274         } else {
 275                 *lockstate = 1;
 276                 mutex_exit(&pcp->pc_lock);
 277         }
 278         return (0);
 279 }
 280 
 281 void
 282 pollrelock(int lockstate)
 283 {
 284         pollstate_t *ps = curthread->t_pollstate;
 285         pollcache_t *pcp;
 286 
 287         /* Skip this whole ordeal if the pollcache was not locked to begin with */
 288         if (lockstate == 0)
 289                 return;
 290 
 291         /*
 292          * t_pollcache is set by /dev/poll and event ports (port_fd.c).
 293          * If pollrelock/pollunlock is called as a result of poll(2),
 294          * the t_pollcache should be NULL.
 295          */
 296         if (curthread->t_pollcache == NULL)
 297                 pcp = ps->ps_pcache;
 298         else
 299                 pcp = curthread->t_pollcache;
 300 
 301         mutex_enter(&pcp->pc_lock);
 302 }
 303 
 304 /* ARGSUSED */
 305 int
 306 polllock(pollhead_t *php, kmutex_t *lp)
 307 {
 308         if (mutex_tryenter(lp) == 0) {
 309                 int state;
 310 
 311                 if (pollunlock(&state) != 0) {
 312                         return (-1);
 313                 }
 314                 mutex_enter(lp);
 315                 pollrelock(state);
 316         }
 317         return (0);
 318 }
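
/*
 * A minimal sketch (hypothetical driver state and lock names) of how a
 * driver's poll entry point might use polllock() when it must take a
 * lock that its wakeup path also holds:
 *
 *      if (polllock(&xsp->xx_pollhead, &xsp->xx_mutex) != 0) {
 *              *reventsp = POLLNVAL;   (recursive /dev/poll in progress)
 *              return (0);
 *      }
 *      ... examine device state, set *reventsp and *phpp ...
 *      mutex_exit(&xsp->xx_mutex);
 */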
 319 
 320 static int
 321 poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
 322 {
 323         kthread_t *t = curthread;
 324         klwp_t *lwp = ttolwp(t);
 325         proc_t *p = ttoproc(t);
 326         int fdcnt = 0;
 327         int i;
 328         hrtime_t deadline; /* hrtime value when we want to return */
 329         pollfd_t *pollfdp;
 330         pollstate_t *ps;
 331         pollcache_t *pcp;
 332         int error = 0;
 333         nfds_t old_nfds;
 334         int cacheindex = 0;     /* which cache set is used */
 335 
 336         /*
 337          * Determine the precise future time of the requested timeout, if any.
 338          */
 339         if (tsp == NULL) {
 340                 deadline = -1;
 341         } else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) {
 342                 deadline = 0;
 343         } else {
 344                 /* They must wait at least a tick. */
 345                 deadline = ((hrtime_t)tsp->tv_sec * NANOSEC) + tsp->tv_nsec;
 346                 deadline = MAX(deadline, nsec_per_tick);
 347                 deadline += gethrtime();
 348         }
 349 
 350         /*
 351          * Reset our signal mask, if requested.
 352          */
 353         if (ksetp != NULL) {
 354                 mutex_enter(&p->p_lock);
 355                 schedctl_finish_sigblock(t);
 356                 lwp->lwp_sigoldmask = t->t_hold;
 357                 t->t_hold = *ksetp;
 358                 t->t_flag |= T_TOMASK;
 359                 /*
 360                  * Call cv_reltimedwait_sig() just to check for signals.
 361                  * We will return immediately with either 0 or -1.
 362                  */
 363                 if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0,
 364                     TR_CLOCK_TICK)) {
 365                         mutex_exit(&p->p_lock);
 366                         error = EINTR;
 367                         goto pollout;
 368                 }
 369                 mutex_exit(&p->p_lock);
 370         }
 371 
 372         /*
 373          * Check to see if the caller just wants to use poll() as a timeout.
 374          * If so, bypass all the fd processing and simply sleep.
 375          */
 376         if (nfds == 0) {
 377                 /*
 378                  * Sleep until we have passed the requested future
 379                  * time or until interrupted by a signal.
 380                  * Do not check for signals if we do not want to wait.
 381                  */
 382                 if (deadline != 0) {
 383                         mutex_enter(&t->t_delay_lock);
 384                         while ((error = cv_timedwait_sig_hrtime(&t->t_delay_cv,
 385                             &t->t_delay_lock, deadline)) > 0)
 386                                 continue;
 387                         mutex_exit(&t->t_delay_lock);
 388                         error = (error == 0) ? EINTR : 0;
 389                 }
 390                 goto pollout;
 391         }
 392 
 393         if (nfds > p->p_fno_ctl) {
 394                 mutex_enter(&p->p_lock);
 395                 (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
 396                     p->p_rctls, p, RCA_SAFE);
 397                 mutex_exit(&p->p_lock);
 398                 error = EINVAL;
 399                 goto pollout;
 400         }
 401 
 402         /*
 403          * Need to allocate memory for pollstate before anything else because
 404          * the mutex and cv are created in this space.
 405          */
 406         ps = pollstate_create();
 407 
 408         if (ps->ps_pcache == NULL)
 409                 ps->ps_pcache = pcache_alloc();
 410         pcp = ps->ps_pcache;
 411 
 412         /*
 413          * NOTE: for performance, buffers are saved across poll() calls.
 414          * The theory is that if a process polls heavily, it tends to poll
 415          * on the same set of descriptors.  Therefore, we only reallocate
 416          * buffers when nfds changes.  There is no hysteresis control,
 417          * because there is no data to suggest that this is necessary;
 418          * the penalty of reallocating is not *that* great in any event.
 419          */
 420         old_nfds = ps->ps_nfds;
 421         if (nfds != old_nfds) {
 422 
 423                 kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t));
 424                 pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP);
 425                 ps->ps_pollfd = pollfdp;
 426                 ps->ps_nfds = nfds;
 427         }
 428 
 429         pollfdp = ps->ps_pollfd;
 430         if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) {
 431                 error = EFAULT;
 432                 goto pollout;
 433         }
 434 
 435         if (fds == NULL) {
 436                 /*
 437                  * If the process has page 0 mapped, then the copyin() above
 438                  * will succeed even if fds is NULL.  However, our cached
 439                  * poll lists are keyed by the address of the passed-in fds
 440                  * structure, and we use the value NULL to indicate an unused
 441                  * poll cache list entry.  As such, we elect not to support
 442                  * NULL as a valid (user) memory address and fail the poll()
 443                  * call.
 444                  */
 445                 error = EINVAL;
 446                 goto pollout;
 447         }
 448 
 449         /*
 450          * If this thread polls for the first time, allocate ALL poll
 451          * cache data structures and cache the poll fd list. This
 452          * allocation is delayed until now because lwps polling 0 fds
 453          * (i.e. using poll() as a timeout) don't need this memory.
 454          */
 455         mutex_enter(&ps->ps_lock);
 456         pcp = ps->ps_pcache;
 457         ASSERT(pcp != NULL);
 458         if (pcp->pc_bitmap == NULL) {
 459                 pcache_create(pcp, nfds);
 460                 /*
 461                  * poll and cache this poll fd list in ps_pcacheset[0].
 462                  */
 463                 error = pcacheset_cache_list(ps, fds, &fdcnt, cacheindex);
 464                 if (fdcnt || error) {
 465                         mutex_exit(&ps->ps_lock);
 466                         goto pollout;
 467                 }
 468         } else {
 469                 pollcacheset_t  *pcset = ps->ps_pcacheset;
 470 
 471                 /*
 472                  * Not first time polling. Select a cached poll list by
 473                  * matching user pollfd list buffer address.
 474                  */
 475                 for (cacheindex = 0; cacheindex < ps->ps_nsets; cacheindex++) {
 476                         if (pcset[cacheindex].pcs_usradr == (uintptr_t)fds) {
 477                                 if ((++pcset[cacheindex].pcs_count) == 0) {
 478                                         /*
 479                                          * counter is wrapping around.
 480                                          */
 481                                         pcacheset_reset_count(ps, cacheindex);
 482                                 }
 483                                 /*
 484                                  * Examine and resolve any differences
 485                                  * between the current poll list and the
 486                                  * previously cached one. If there is an
 487                                  * error during resolution, the callee
 488                                  * guarantees the consistency of the
 489                                  * cached poll list and cache content.
 490                                  */
 491                                 error = pcacheset_resolve(ps, nfds, &fdcnt,
 492                                     cacheindex);
 493                                 if (error) {
 494                                         mutex_exit(&ps->ps_lock);
 495                                         goto pollout;
 496                                 }
 497                                 break;
 498                         }
 499 
 500                         /*
 501                          * Note that the pcs_usradr field of a used entry
 502                          * won't be NULL, because it stores the address of
 503                          * the passed-in fds and NULL fds are never cached
 504                          * (a NULL fds is either the special timeout case
 505                          * when nfds is 0, or fails directly above).
 506                          */
 507                         if (pcset[cacheindex].pcs_usradr == NULL) {
 508                                 /*
 509                                  * found an unused entry. Use it to cache
 510                                  * this poll list.
 511                                  */
 512                                 error = pcacheset_cache_list(ps, fds, &fdcnt,
 513                                     cacheindex);
 514                                 if (fdcnt || error) {
 515                                         mutex_exit(&ps->ps_lock);
 516                                         goto pollout;
 517                                 }
 518                                 break;
 519                         }
 520                 }
 521                 if (cacheindex == ps->ps_nsets) {
 522                         /*
 523                          * We failed to find a matching cached poll fd list.
 524                          * replace an old list.
 525                          */
 526                         pollstats.polllistmiss.value.ui64++;
 527                         cacheindex = pcacheset_replace(ps);
 528                         ASSERT(cacheindex < ps->ps_nsets);
 529                         pcset[cacheindex].pcs_usradr = (uintptr_t)fds;
 530                         error = pcacheset_resolve(ps, nfds, &fdcnt, cacheindex);
 531                         if (error) {
 532                                 mutex_exit(&ps->ps_lock);
 533                                 goto pollout;
 534                         }
 535                 }
 536         }
 537 
 538         /*
 539          * Always scan the bitmap with the lock on the pollcache held.
 540          * This is to make sure that a wakeup does not go undetected.
 541          * If the lock is not held, a pollwakeup could have come for an
 542          * fd we already checked but before this thread sleeps, in which
 543          * case the wakeup is missed. Now we hold the pcache lock and
 544          * check the bitmap again. This will prevent wakeup from happening
 545          * while we hold pcache lock since pollwakeup() will also lock
 546          * the pcache before updating poll bitmap.
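         *
         * Without pc_lock held across the check-and-sleep, a wakeup could
         * be lost to an interleaving like the following (illustrative):
         *
         *      poller                          pollwakeup()
         *      scan bitmap: no bits set
         *                                      BT_SET(bitmap, fd)
         *                                      cv_broadcast(&pcp->pc_cv)
         *      cv_timedwait_sig_hrtime()       broadcast already missed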
 547          */
 548         mutex_enter(&pcp->pc_lock);
 549         for (;;) {
 550                 pcp->pc_flag = 0;
 551                 error = pcache_poll(pollfdp, ps, nfds, &fdcnt, cacheindex);
 552                 if (fdcnt || error) {
 553                         mutex_exit(&pcp->pc_lock);
 554                         mutex_exit(&ps->ps_lock);
 555                         break;
 556                 }
 557 
 558                 /*
 559                  * If PC_POLLWAKE is set, a pollwakeup() was performed on
 560                  * one of the file descriptors.  This can happen only if
 561                  * one of the VOP_POLL() functions dropped pcp->pc_lock.
 562                  * The only current cases of this are in procfs (prpoll())
 563                  * and STREAMS (strpoll()).
 564                  */
 565                 if (pcp->pc_flag & PC_POLLWAKE)
 566                         continue;
 567 
 568                 /*
 569                  * If you get here, the poll of fds was unsuccessful.
 570                  * Wait until some fd becomes readable, writable, or gets
 571                  * an exception, or until a signal or a timeout occurs.
 572                  * Do not check for signals if we have a zero timeout.
 573                  */
 574                 mutex_exit(&ps->ps_lock);
 575                 if (deadline == 0) {
 576                         error = -1;
 577                 } else {
 578                         error = cv_timedwait_sig_hrtime(&pcp->pc_cv,
 579                             &pcp->pc_lock, deadline);
 580                 }
 581                 mutex_exit(&pcp->pc_lock);
 582                 /*
 583                  * If we have received a signal or timed out
 584                  * then break out and return.
 585                  */
 586                 if (error <= 0) {
 587                         error = (error == 0) ? EINTR : 0;
 588                         break;
 589                 }
 590                 /*
 591                  * We have not received a signal or timed out.
 592                  * Continue around and poll fds again.
 593                  */
 594                 mutex_enter(&ps->ps_lock);
 595                 mutex_enter(&pcp->pc_lock);
 596         }
 597 
 598 pollout:
 599         /*
 600          * If we changed the signal mask but we received
 601          * no signal then restore the signal mask.
 602          * Otherwise psig() will deal with the signal mask.
 603          */
 604         if (ksetp != NULL) {
 605                 mutex_enter(&p->p_lock);
 606                 if (lwp->lwp_cursig == 0) {
 607                         t->t_hold = lwp->lwp_sigoldmask;
 608                         t->t_flag &= ~T_TOMASK;
 609                 }
 610                 mutex_exit(&p->p_lock);
 611         }
 612 
 613         if (error)
 614                 return (set_errno(error));
 615 
 616         /*
 617          * Copy out the events and return the fdcnt to the user.
 618          */
 619         if (nfds != 0 &&
 620             copyout(pollfdp, fds, nfds * sizeof (pollfd_t)))
 621                 return (set_errno(EFAULT));
 622 
 623 #ifdef DEBUG
 624         /*
 625          * Another sanity check:
 626          */
 627         if (fdcnt) {
 628                 int     reventcnt = 0;
 629 
 630                 for (i = 0; i < nfds; i++) {
 631                         if (pollfdp[i].fd < 0) {
 632                                 ASSERT(pollfdp[i].revents == 0);
 633                                 continue;
 634                         }
 635                         if (pollfdp[i].revents) {
 636                                 reventcnt++;
 637                         }
 638                 }
 639                 ASSERT(fdcnt == reventcnt);
 640         } else {
 641                 for (i = 0; i < nfds; i++) {
 642                         ASSERT(pollfdp[i].revents == 0);
 643                 }
 644         }
 645 #endif  /* DEBUG */
 646 
 647         return (fdcnt);
 648 }
 649 
 650 /*
 651  * This is the system call trap that poll(),
 652  * select() and pselect() are built upon.
 653  * It is a private interface between libc and the kernel.
 654  */
 655 int
 656 pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp)
 657 {
 658         timespec_t ts;
 659         timespec_t *tsp;
 660         sigset_t set;
 661         k_sigset_t kset;
 662         k_sigset_t *ksetp;
 663         model_t datamodel = get_udatamodel();
 664 
 665         if (timeoutp == NULL)
 666                 tsp = NULL;
 667         else {
 668                 if (datamodel == DATAMODEL_NATIVE) {
 669                         if (copyin(timeoutp, &ts, sizeof (ts)))
 670                                 return (set_errno(EFAULT));
 671                 } else {
 672                         timespec32_t ts32;
 673 
 674                         if (copyin(timeoutp, &ts32, sizeof (ts32)))
 675                                 return (set_errno(EFAULT));
 676                         TIMESPEC32_TO_TIMESPEC(&ts, &ts32)
 677                 }
 678 
 679                 if (itimerspecfix(&ts))
 680                         return (set_errno(EINVAL));
 681                 tsp = &ts;
 682         }
 683 
 684         if (setp == NULL)
 685                 ksetp = NULL;
 686         else {
 687                 if (copyin(setp, &set, sizeof (set)))
 688                         return (set_errno(EFAULT));
 689                 sigutok(&set, &kset);
 690                 ksetp = &kset;
 691         }
 692 
 693         return (poll_common(fds, nfds, tsp, ksetp));
 694 }
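
/*
 * A minimal illustrative sketch (not libc's actual code) of how a classic
 * poll(2) call with a millisecond timeout could map onto this trap:
 *
 *      timespec_t ts;
 *
 *      ts.tv_sec = timeout_ms / 1000;
 *      ts.tv_nsec = (timeout_ms % 1000) * 1000000;
 *      return (pollsys(fds, nfds, timeout_ms < 0 ? NULL : &ts, NULL));
 */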
 695 
 696 /*
 697  * Clean up any state left around by poll(2). Called when a thread exits.
 698  */
 699 void
 700 pollcleanup()
 701 {
 702         pollstate_t *ps = curthread->t_pollstate;
 703         pollcache_t *pcp;
 704 
 705         if (ps == NULL)
 706                 return;
 707         pcp = ps->ps_pcache;
 708         /*
 709          * free up all cached poll fds
 710          */
 711         if (pcp == NULL) {
 712                 /* this pollstate is used by /dev/poll */
 713                 goto pollcleanout;
 714         }
 715 
 716         if (pcp->pc_bitmap != NULL) {
 717                 ASSERT(MUTEX_NOT_HELD(&ps->ps_lock));
 718                 /*
 719                  * A closing lwp can race with us when cleaning up a polldat
 720                  * entry. We hold the ps_lock when cleaning the hash table.
 721                  * Since this pollcache is going away anyway, there is no
 722                  * need to hold the pc_lock.
 723                  */
 724                 mutex_enter(&ps->ps_lock);
 725                 pcache_clean(pcp);
 726                 mutex_exit(&ps->ps_lock);
 727 #ifdef DEBUG
 728                 /*
 729                  * At this point, all fds cached by this lwp should be
 730                  * cleaned up. There should be no fd in fi_list still
 731                  * referencing this thread.
 732                  */
 733                 checkfpollinfo();       /* sanity check */
 734                 pollcheckphlist();      /* sanity check */
 735 #endif  /* DEBUG */
 736         }
 737         /*
 738          * Be sure no one is referencing this thread before exiting.
 739          */
 740         mutex_enter(&pcp->pc_no_exit);
 741         ASSERT(pcp->pc_busy >= 0);
 742         while (pcp->pc_busy > 0)
 743                 cv_wait(&pcp->pc_busy_cv, &pcp->pc_no_exit);
 744         mutex_exit(&pcp->pc_no_exit);
 745 pollcleanout:
 746         pollstate_destroy(ps);
 747         curthread->t_pollstate = NULL;
 748 }
 749 
 750 /*
 751  * pollwakeup() - poke threads waiting in poll() for some event
 752  * on a particular object.
 753  *
 754  * The threads hanging off of the specified pollhead structure are scanned.
 755  * If their event mask matches the specified event(s), then pollnotify() is
 756  * called to poke the thread.
 757  *
 758  * Multiple events may be specified.  When POLLHUP or POLLERR are specified,
 759  * all waiting threads are poked.
 760  *
 761  * It is important that pollnotify() not drop the lock protecting the list
 762  * of threads.
 763  */
 764 void
 765 pollwakeup(pollhead_t *php, short events_arg)
 766 {
 767         polldat_t       *pdp;
 768         int             events = (ushort_t)events_arg;
 769         struct plist {
 770                 port_t *pp;
 771                 int     pevents;
 772                 struct plist *next;
 773                 };
 774         struct plist *plhead = NULL, *pltail = NULL;
 775 
 776 retry:
 777         PH_ENTER(php);
 778 
 779         for (pdp = php->ph_list; pdp; pdp = pdp->pd_next) {
 780                 if ((pdp->pd_events & events) ||
 781                     (events & (POLLHUP | POLLERR))) {
 782 
 783                         pollcache_t     *pcp;
 784 
 785                         if (pdp->pd_portev != NULL) {
 786                                 port_kevent_t   *pkevp = pdp->pd_portev;
 787                                 /*
 788                                  * The object (fd) is associated with an event
 789                                  * port, so send the event notification to it.
 790                                  */
 791                                 ASSERT(pkevp->portkev_source == PORT_SOURCE_FD);
 792                                 mutex_enter(&pkevp->portkev_lock);
 793                                 if (pkevp->portkev_flags & PORT_KEV_VALID) {
 794                                         int pevents;
 795 
 796                                         pkevp->portkev_flags &= ~PORT_KEV_VALID;
 797                                         pkevp->portkev_events |= events &
 798                                             (pdp->pd_events | POLLHUP |
 799                                             POLLERR);
 800                                         /*
 801                                          * portkev_lock mutex will be released
 802                                          * by port_send_event().
 803                                          */
 804                                         port_send_event(pkevp);
 805 
 806                                         /*
 807                                          * If any threads are polling the
 808                                          * port's fd, add the port to the
 809                                          * list; they will be notified later.
 810                                          * port_pollwkup() flags the port_t
 811                                          * so that it will not disappear
 812                                          * until port_pollwkdone() is called.
 813                                          */
 814                                         pevents =
 815                                             port_pollwkup(pkevp->portkev_port);
 816                                         if (pevents) {
 817                                                 struct plist *t;
 818                                                 t = kmem_zalloc(
 819                                                     sizeof (struct plist),
 820                                                     KM_SLEEP);
 821                                                 t->pp = pkevp->portkev_port;
 822                                                 t->pevents = pevents;
 823                                                 if (plhead == NULL) {
 824                                                         plhead = t;
 825                                                 } else {
 826                                                         pltail->next = t;
 827                                                 }
 828                                                 pltail = t;
 829                                         }
 830                                 } else {
 831                                         mutex_exit(&pkevp->portkev_lock);
 832                                 }
 833                                 continue;
 834                         }
 835 
 836                         pcp = pdp->pd_pcache;
 837 
 838                         /*
 839                          * Try to grab the lock for this thread. If
 840                          * we don't get it then we may deadlock, so
 841                          * back out and restart all over again. Note
 842                          * that the failure rate is very, very low.
 843                          */
 844                         if (mutex_tryenter(&pcp->pc_lock)) {
 845                                 pollnotify(pcp, pdp->pd_fd);
 846                                 mutex_exit(&pcp->pc_lock);
 847                         } else {
 848                                 /*
 849                                  * We are here because:
 850                                  *      1) This thread has been woken up
 851                                  *         and is trying to get out of poll().
 852                                  *      2) Some other thread is also here
 853                                  *         but with a different pollhead lock.
 854                                  *
 855                                  * So, we need to drop the lock on pollhead
 856                                  * because of (1) but we want to prevent
 857                                  * that thread from doing lwp_exit() or
 858                                  * devpoll close. We want to ensure that
 859                                  * the pollcache pointer is still valid.
 860                                  *
 861                                  * Solution: Grab the pcp->pc_no_exit lock,
 862                                  * increment the pc_busy counter, drop every
 863                                  * lock in sight. Get out of the way and wait
 864                                  * for type (2) threads to finish.
 865                                  */
 866 
 867                                 mutex_enter(&pcp->pc_no_exit);
 868                                 pcp->pc_busy++;      /* prevents exit()'s */
 869                                 mutex_exit(&pcp->pc_no_exit);
 870 
 871                                 PH_EXIT(php);
 872                                 mutex_enter(&pcp->pc_lock);
 873                                 mutex_exit(&pcp->pc_lock);
 874                                 mutex_enter(&pcp->pc_no_exit);
 875                                 pcp->pc_busy--;
 876                                 if (pcp->pc_busy == 0) {
 877                                         /*
 878                                          * Wakeup the thread waiting in
 879                                          * thread_exit().
 880                                          */
 881                                         cv_signal(&pcp->pc_busy_cv);
 882                                 }
 883                                 mutex_exit(&pcp->pc_no_exit);
 884                                 goto retry;
 885                         }
 886                 }
 887         }
 888 
 889 
 890         /*
 891          * Event ports - If this php is the pollhead of the first port on
 892          * the list, call port_pollwkdone() to release it. port_pollwkdone()
 893          * needs to be called before dropping the PH lock so that any new
 894          * threads attempting to poll this port are blocked. There can be
 895          * only one thread here in pollwakeup notifying this port's fd.
 896          */
 897         if (plhead != NULL && &plhead->pp->port_pollhd == php) {
 898                 struct plist *t;
 899                 port_pollwkdone(plhead->pp);
 900                 t = plhead;
 901                 plhead = plhead->next;
 902                 kmem_free(t, sizeof (struct plist));
 903         }
 904         PH_EXIT(php);
 905 
 906         /*
 907          * Event ports - Notify threads polling the event port's fd.
 908          * This is normally done in port_send_event() where it calls
 909          * pollwakeup() on the port. But, for PORT_SOURCE_FD source alone,
 910          * we do it here in pollwakeup() to avoid a recursive call.
 911          */
 912         if (plhead != NULL) {
 913                 php = &plhead->pp->port_pollhd;
 914                 events = plhead->pevents;
 915                 goto retry;
 916         }
 917 }
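
/*
 * A minimal sketch (hypothetical driver state and names) of the producer
 * side of the above: once new data is ready, the driver wakes any pollers
 * parked on its pollhead.
 *
 *      mutex_enter(&xsp->xx_mutex);
 *      xsp->xx_ready = B_TRUE;
 *      mutex_exit(&xsp->xx_mutex);
 *      pollwakeup(&xsp->xx_pollhead, POLLIN | POLLRDNORM);
 */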
 918 
 919 /*
 920  * This function is called to inform a thread (or threads) that an event being
 921  * polled on has occurred.  The pollcache lock (pc_lock) of the target
 922  * thread should be held on entry.
 923  */
 924 void
 925 pollnotify(pollcache_t *pcp, int fd)
 926 {
 927         ASSERT(fd < pcp->pc_mapsize);
 928         ASSERT(MUTEX_HELD(&pcp->pc_lock));
 929         BT_SET(pcp->pc_bitmap, fd);
 930         pcp->pc_flag |= PC_POLLWAKE;
 931         cv_broadcast(&pcp->pc_cv);
 932         pcache_wake_parents(pcp);
 933 }
 934 
 935 /*
 936  * Add a polldat entry to the pollhead ph_list. The polldat struct is used
 937  * by pollwakeup to wake sleeping pollers when polled events have happened.
 938  */
 939 void
 940 pollhead_insert(pollhead_t *php, polldat_t *pdp)
 941 {
 942         PH_ENTER(php);
 943         ASSERT(pdp->pd_next == NULL);
 944 #ifdef DEBUG
 945         {
 946                 /*
 947                  * the polldat should not already be on the list
 948                  */
 949                 polldat_t *wp;
 950                 for (wp = php->ph_list; wp; wp = wp->pd_next) {
 951                         ASSERT(wp != pdp);
 952                 }
 953         }
 954 #endif  /* DEBUG */
 955         pdp->pd_next = php->ph_list;
 956         php->ph_list = pdp;
 957         PH_EXIT(php);
 958 }
 959 
 960 /*
 961  * Delete the polldat entry from ph_list.
 962  */
 963 void
 964 pollhead_delete(pollhead_t *php, polldat_t *pdp)
 965 {
 966         polldat_t *wp;
 967         polldat_t **wpp;
 968 
 969         PH_ENTER(php);
 970         for (wpp = &php->ph_list; (wp = *wpp) != NULL; wpp = &wp->pd_next) {
 971                 if (wp == pdp) {
 972                         *wpp = pdp->pd_next;
 973                         pdp->pd_next = NULL;
 974                         break;
 975                 }
 976         }
 977 #ifdef DEBUG
 978         /* assert that pdp is no longer in the list */
 979         for (wp = *wpp; wp; wp = wp->pd_next) {
 980                 ASSERT(wp != pdp);
 981         }
 982 #endif  /* DEBUG */
 983         PH_EXIT(php);
 984 }
 985 
 986 /*
 987  * walk through the poll fd lists to see if they are identical. This is an
 988  * expensive operation and should not be done more than once for each poll()
 989  * call.
 990  *
 991  * As an optimization (i.e., not having to go through the lists more than
 992  * once), this routine also clears the revents field of each pollfd in
 993  * 'current'. Zeroing out the revents field of each entry in the current
 994  * poll list is required by the poll man page.
 995  *
 996  * Since the events field of the cached list has illegal poll events filtered
 997  * out, the same filtering is applied to the current list before comparison.
 998  *
 999  * The routine stops when it detects a meaningful difference, or when it
1000  * exhausts the lists.
1001  */
1002 int
1003 pcacheset_cmp(pollfd_t *current, pollfd_t *cached, pollfd_t *newlist, int n)
1004 {
1005         int    ix;
1006 
1007         for (ix = 0; ix < n; ix++) {
1008                 /* Prefetch 64 bytes worth of 8-byte elements */
1009                 if ((ix & 0x7) == 0) {
1010                         prefetch_write_many((caddr_t)&current[ix + 8]);
1011                         prefetch_write_many((caddr_t)&cached[ix + 8]);
1012                 }
1013                 if (current[ix].fd == cached[ix].fd) {
1014                         /*
1015                          * Filter out invalid poll events while we are
1016                          * inside the loop.
1017                          */
1018                         if (current[ix].events & ~VALID_POLL_EVENTS) {
1019                                 current[ix].events &= VALID_POLL_EVENTS;
1020                                 if (newlist != NULL)
1021                                         newlist[ix].events = current[ix].events;
1022                         }
1023                         if (current[ix].events == cached[ix].events) {
1024                                 current[ix].revents = 0;
1025                                 continue;
1026                         }
1027                 }
1028                 if ((current[ix].fd < 0) && (cached[ix].fd < 0)) {
1029                         current[ix].revents = 0;
1030                         continue;
1031                 }
1032                 return (ix);
1033         }
1034         return (ix);
1035 }
1036 
1037 /*
1038  * This routine returns a pointer to a cached poll fd entry, or NULL if it
1039  * does not find it in the hash table.
1040  */
1041 polldat_t *
1042 pcache_lookup_fd(pollcache_t *pcp, int fd)
1043 {
1044         int hashindex;
1045         polldat_t *pdp;
1046 
1047         hashindex = POLLHASH(pcp->pc_hashsize, fd);
1048         pdp = pcp->pc_hash[hashindex];
1049         while (pdp != NULL) {
1050                 if (pdp->pd_fd == fd)
1051                         break;
1052                 pdp = pdp->pd_hashnext;
1053         }
1054         return (pdp);
1055 }
1056 
1057 polldat_t *
1058 pcache_alloc_fd(int nsets)
1059 {
1060         polldat_t *pdp;
1061 
1062         pdp = kmem_zalloc(sizeof (polldat_t), KM_SLEEP);
1063         if (nsets > 0) {
1064                 pdp->pd_ref = kmem_zalloc(sizeof (xref_t) * nsets, KM_SLEEP);
1065                 pdp->pd_nsets = nsets;
1066         }
1067         return (pdp);
1068 }
1069 
1070 /*
1071  * This routine inserts a polldat into the pollcache's hash table. It
1072  * may be necessary to grow the size of the hash table.
1073  */
1074 void
1075 pcache_insert_fd(pollcache_t *pcp, polldat_t *pdp, nfds_t nfds)
1076 {
1077         int hashindex;
1078         int fd;
1079 
1080         if ((pcp->pc_fdcount > pcp->pc_hashsize * POLLHASHTHRESHOLD) ||
1081             (nfds > pcp->pc_hashsize * POLLHASHTHRESHOLD)) {
1082                 pcache_grow_hashtbl(pcp, nfds);
1083         }
1084         fd = pdp->pd_fd;
1085         hashindex = POLLHASH(pcp->pc_hashsize, fd);
1086         pdp->pd_hashnext = pcp->pc_hash[hashindex];
1087         pcp->pc_hash[hashindex] = pdp;
1088         pcp->pc_fdcount++;
1089 
1090 #ifdef DEBUG
1091         {
1092                 /*
1093                  * same fd should not appear on a hash list twice
1094                  */
1095                 polldat_t *pdp1;
1096                 for (pdp1 = pdp->pd_hashnext; pdp1; pdp1 = pdp1->pd_hashnext) {
1097                         ASSERT(pdp->pd_fd != pdp1->pd_fd);
1098                 }
1099         }
1100 #endif  /* DEBUG */
1101 }
1102 
1103 /*
1104  * Grow the hash table -- either multiply the table size by POLLHASHINC or
1105  * round nfds up to the nearest multiple of POLLHASHCHUNKSZ, whichever is
1106  * bigger. Rehash all the existing elements into the new table.
1107  */
1108 void
1109 pcache_grow_hashtbl(pollcache_t *pcp, nfds_t nfds)
1110 {
1111         int     oldsize;
1112         polldat_t **oldtbl;
1113         polldat_t *pdp, *pdp1;
1114         int     i;
1115 #ifdef DEBUG
1116         int     count = 0;
1117 #endif
1118 
1119         ASSERT(pcp->pc_hashsize % POLLHASHCHUNKSZ == 0);
1120         oldsize = pcp->pc_hashsize;
1121         oldtbl = pcp->pc_hash;
1122         if (nfds > pcp->pc_hashsize * POLLHASHINC) {
1123                 pcp->pc_hashsize = (nfds + POLLHASHCHUNKSZ - 1) &
1124                     ~(POLLHASHCHUNKSZ - 1);
1125         } else {
1126                 pcp->pc_hashsize = pcp->pc_hashsize * POLLHASHINC;
1127         }
1128         pcp->pc_hash = kmem_zalloc(pcp->pc_hashsize * sizeof (polldat_t *),
1129             KM_SLEEP);
1130         /*
1131          * rehash existing elements
1132          */
1133         pcp->pc_fdcount = 0;
1134         for (i = 0; i < oldsize; i++) {
1135                 pdp = oldtbl[i];
1136                 while (pdp != NULL) {
1137                         pdp1 = pdp->pd_hashnext;
1138                         pcache_insert_fd(pcp, pdp, nfds);
1139                         pdp = pdp1;
1140 #ifdef DEBUG
1141                         count++;
1142 #endif
1143                 }
1144         }
1145         kmem_free(oldtbl, oldsize * sizeof (polldat_t *));
1146         ASSERT(pcp->pc_fdcount == count);
1147 }
1148 
1149 void
1150 pcache_grow_map(pollcache_t *pcp, int fd)
1151 {
1152         int     newsize;
1153         ulong_t *newmap;
1154 
1155         /*
1156          * Grow to the nearest multiple of POLLMAPCHUNK, assuming
1157          * POLLMAPCHUNK is a power of 2.
1158          */
1159         newsize = (fd + POLLMAPCHUNK) & ~(POLLMAPCHUNK - 1);
1160         newmap = kmem_zalloc((newsize / BT_NBIPUL) * sizeof (ulong_t),
1161             KM_SLEEP);
1162         /*
1163          * We don't want pollwakeup to set a bit while we grow the bitmap.
1164          */
1165         ASSERT(mutex_owned(&pcp->pc_lock) == 0);
1166         mutex_enter(&pcp->pc_lock);
1167         bcopy(pcp->pc_bitmap, newmap,
1168             (pcp->pc_mapsize / BT_NBIPUL) * sizeof (ulong_t));
1169         kmem_free(pcp->pc_bitmap,
1170             (pcp->pc_mapsize / BT_NBIPUL) * sizeof (ulong_t));
1171         pcp->pc_bitmap = newmap;
1172         pcp->pc_mapsize = newsize;
1173         mutex_exit(&pcp->pc_lock);
1174 }
1175 
1176 /*
1177  * Remove all the references from the pollhead and fpollinfo lists.
1178  */
1179 void
1180 pcache_clean(pollcache_t *pcp)
1181 {
1182         int i;
1183         polldat_t **hashtbl;
1184         polldat_t *pdp;
1185 
1186         ASSERT(MUTEX_HELD(&curthread->t_pollstate->ps_lock));
1187         hashtbl = pcp->pc_hash;
1188         for (i = 0; i < pcp->pc_hashsize; i++) {
1189                 for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) {
1190                         if (pdp->pd_php != NULL) {
1191                                 pollhead_delete(pdp->pd_php, pdp);
1192                                 pdp->pd_php = NULL;
1193                         }
1194                         if (pdp->pd_fp != NULL) {
1195                                 delfpollinfo(pdp->pd_fd);
1196                                 pdp->pd_fp = NULL;
1197                         }
1198                 }
1199         }
1200 }
1201 
1202 void
1203 pcacheset_invalidate(pollstate_t *ps, polldat_t *pdp)
1204 {
1205         int     i;
1206         int     fd = pdp->pd_fd;
1207 
1208         /*
1209          * We come here because of an earlier close() on this cached poll fd.
1210          */
1211         ASSERT(pdp->pd_fp == NULL);
1212         ASSERT(MUTEX_HELD(&ps->ps_lock));
1213         pdp->pd_events = 0;
1214         for (i = 0; i < ps->ps_nsets; i++) {
1215                 xref_t          *refp;
1216                 pollcacheset_t  *pcsp;
1217 
1218                 ASSERT(pdp->pd_ref != NULL);
1219                 refp = &pdp->pd_ref[i];
1220                 if (refp->xf_refcnt) {
1221                         ASSERT(refp->xf_position >= 0);
1222                         pcsp = &ps->ps_pcacheset[i];
1223                         if (refp->xf_refcnt == 1) {
1224                                 pcsp->pcs_pollfd[refp->xf_position].fd = -1;
1225                                 refp->xf_refcnt = 0;
1226                                 pdp->pd_count--;
1227                         } else if (refp->xf_refcnt > 1) {
1228                                 int     j;
1229 
1230                                 /*
1231                                  * turn off every appearance in pcs_pollfd list
1232                                  */
1233                                 for (j = refp->xf_position;
1234                                     j < pcsp->pcs_nfds; j++) {
1235                                         if (pcsp->pcs_pollfd[j].fd == fd) {
1236                                                 pcsp->pcs_pollfd[j].fd = -1;
1237                                                 refp->xf_refcnt--;
1238                                                 pdp->pd_count--;
1239                                         }
1240                                 }
1241                         }
1242                         ASSERT(refp->xf_refcnt == 0);
1243                         refp->xf_position = POLLPOSINVAL;
1244                 }
1245         }
1246         ASSERT(pdp->pd_count == 0);
1247 }
1248 
1249 /*
1250  * Insert poll fd into the pollcache, and add poll registration.
1251  * This routine is called after getf() and before releasef(). So the vnode
1252  * cannot disappear even if we block here.
1253  * If there is an error, the polled fd is not cached.
1254  */
1255 int
1256 pcache_insert(pollstate_t *ps, file_t *fp, pollfd_t *pollfdp, int *fdcntp,
1257     ssize_t pos, int which)
1258 {
1259         pollcache_t     *pcp = ps->ps_pcache;
1260         polldat_t       *pdp;
1261         int             error;
1262         int             fd;
1263         pollhead_t      *memphp = NULL;
1264         xref_t          *refp;
1265         int             newpollfd = 0;
1266 
1267         ASSERT(MUTEX_HELD(&ps->ps_lock));
1268         /*
1269          * The poll caching uses the existing VOP_POLL interface. If there
1270          * are no polled events, we want the polled device to set its
1271          * "someone is sleeping in poll" flag. When the polled events happen
1272          * later, the driver will call pollwakeup(). We achieve this by
1273          * always passing 0 in the third parameter ("anyyet") when calling
1274          * VOP_POLL. This parameter is not looked at by drivers when the
1275          * polled events exist. If a driver chooses to ignore this parameter
1276          * and call pollwakeup whenever the polled events happen, that will
1277          * be OK too.
1278          */
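        /*
         * From the driver side, the contract described above reduces to the
         * classic chpoll pattern (a sketch; the xx_ names are assumed):
         *
         *      if ((*reventsp = (xx_ready_events & events)) == 0 && !anyyet)
         *              *phpp = &xsp->xx_pollhead;
         */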
1279         ASSERT(curthread->t_pollcache == NULL);
1280         error = VOP_POLL(fp->f_vnode, pollfdp->events, 0, &pollfdp->revents,
1281             &memphp, NULL);
1282         if (error) {
1283                 return (error);
1284         }
1285         if (pollfdp->revents) {
1286                 (*fdcntp)++;
1287         }
1288         /*
1289          * polling the underlying device succeeded. Now we can cache it.
1290          * A close can't come in here because we have not done a releasef()
1291          * yet.
1292          */
1293         fd = pollfdp->fd;
1294         pdp = pcache_lookup_fd(pcp, fd);
1295         if (pdp == NULL) {
1296                 ASSERT(ps->ps_nsets > 0);
1297                 pdp = pcache_alloc_fd(ps->ps_nsets);
1298                 newpollfd = 1;
1299         }
1300         /*
1301          * If this entry was used to cache a poll fd which was closed, and
1302          * this entry has not yet been cleaned, clean it now.
1303          */
1304         if ((pdp->pd_count > 0) && (pdp->pd_fp == NULL)) {
1305                 pcacheset_invalidate(ps, pdp);
1306                 ASSERT(pdp->pd_next == NULL);
1307         }
1308         if (pdp->pd_count == 0) {
1309                 pdp->pd_fd = fd;
1310                 pdp->pd_fp = fp;
1311                 addfpollinfo(fd);
1312                 pdp->pd_thread = curthread;
1313                 pdp->pd_pcache = pcp;
1314                 /*
1315                  * The entry was either never used or was cleared when a cached
1316                  * pollfd was removed (pcache_delete_fd), so the fields are clear.
1317                  */
1318                 ASSERT(pdp->pd_next == NULL);
1319         }
1320 
1321         /*
1322          * A polled fd is considered cached. So there should be a fpollinfo
1323          * entry on uf_fpollinfo list.
1324          */
1325         ASSERT(infpollinfo(fd));
1326         /*
1327          * If there is an inconsistency, we want to know it here.
1328          */
1329         ASSERT(pdp->pd_fp == fp);
1330 
1331         /*
1332          * XXX pd_events is a union of all polled events on this fd, possibly
1333          * by different threads. Unless this is a new first poll(), pd_events
1334          * never shrinks. If an event is no longer polled by a process, there
1335          * is no way to cancel that event. In that case, poll degrades to its
1336          * old form -- polling on this fd every time poll() is called. The
1337          * assumption is that an app always polls the same type of events.
1338          */
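             /*
              * Example: if one thread polls POLLIN and another polls POLLOUT
              * on the same fd, pd_events accumulates (POLLIN | POLLOUT) and
              * stays that way for as long as the fd remains cached.
              */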
1339         pdp->pd_events |= pollfdp->events;
1340 
1341         pdp->pd_count++;
1342         /*
1343          * There is not much special handling for multiple appearances of the
1344          * same fd other than xf_position always recording the first
1345          * appearance in the poll list. If this is called from pcacheset_cache_list,
1346          * a VOP_POLL is called on every pollfd entry; therefore each
1347          * revents and fdcnt should be set correctly. If this is called from
1348          * pcacheset_resolve, we don't care about fdcnt here. Pollreadmap will
1349          * pick up the right count and handle the revents field of each pollfd
1350          * entry.
1351          */
1352         ASSERT(pdp->pd_ref != NULL);
1353         refp = &pdp->pd_ref[which];
1354         if (refp->xf_refcnt == 0) {
1355                 refp->xf_position = pos;
1356         } else {
1357                 /*
1358                  * xf_position records the fd's first appearance in the poll list
1359                  */
1360                 if (pos < refp->xf_position) {
1361                         refp->xf_position = pos;
1362                 }
1363         }
1364         ASSERT(pollfdp->fd == ps->ps_pollfd[refp->xf_position].fd);
1365         refp->xf_refcnt++;
1366         if (fd >= pcp->pc_mapsize) {
1367                 pcache_grow_map(pcp, fd);
1368         }
1369         if (fd > pcp->pc_mapend) {
1370                 pcp->pc_mapend = fd;
1371         }
1372         if (newpollfd != 0) {
1373                 pcache_insert_fd(ps->ps_pcache, pdp, ps->ps_nfds);
1374         }
1375         if (memphp) {
1376                 if (pdp->pd_php == NULL) {
1377                         pollhead_insert(memphp, pdp);
1378                         pdp->pd_php = memphp;
1379                 } else {
1380                         if (memphp != pdp->pd_php) {
1381                                 /*
1382                                  * layered devices (e.g. console driver)
1383                                  * may change the vnode and thus the pollhead
1384                                  * pointer out from underneath us.
1385                                  */
1386                                 pollhead_delete(pdp->pd_php, pdp);
1387                                 pollhead_insert(memphp, pdp);
1388                                 pdp->pd_php = memphp;
1389                         }
1390                 }
1391         }
1392         /*
1393          * Since there is a considerable window between VOP_POLL and when
1394          * we actually put the polldat struct on the pollhead list, we could
1395          * miss a pollwakeup. In the case of polling additional events, we
1396          * don't update the events until after VOP_POLL, so we could miss a
1397          * pollwakeup there too. Therefore we always set the bit here just to
1398          * be safe. The real performance gain is in subsequent pcache_poll.
1399          */
1400         mutex_enter(&pcp->pc_lock);
1401         BT_SET(pcp->pc_bitmap, fd);
1402         mutex_exit(&pcp->pc_lock);
1403         return (0);
1404 }
1405 
1406 /*
1407  * The entry is not really deleted. The fields are cleared so that the
1408  * entry is no longer useful, but it will remain in the hash table for reuse
1409  * later. It will be freed when the polling lwp exits.
1410  */
1411 int
1412 pcache_delete_fd(pollstate_t *ps, int fd, size_t pos, int which, uint_t cevent)
1413 {
1414         pollcache_t     *pcp = ps->ps_pcache;
1415         polldat_t       *pdp;
1416         xref_t          *refp;
1417 
1418         ASSERT(fd < pcp->pc_mapsize);
1419         ASSERT(MUTEX_HELD(&ps->ps_lock));
1420 
1421         pdp = pcache_lookup_fd(pcp, fd);
1422         ASSERT(pdp != NULL);
1423         ASSERT(pdp->pd_count > 0);
1424         ASSERT(pdp->pd_ref != NULL);
1425         refp = &pdp->pd_ref[which];
1426         if (pdp->pd_count == 1) {
1427                 pdp->pd_events = 0;
1428                 refp->xf_position = POLLPOSINVAL;
1429                 ASSERT(refp->xf_refcnt == 1);
1430                 refp->xf_refcnt = 0;
1431                 if (pdp->pd_php) {
1432                         /*
1433                          * It is possible for a wakeup thread to get ahead
1434                          * of the following pollhead_delete and set the bit in
1435                          * the bitmap.  It is OK because the bit will be
1436                          * cleared here anyway.
1437                          */
1438                         pollhead_delete(pdp->pd_php, pdp);
1439                         pdp->pd_php = NULL;
1440                 }
1441                 pdp->pd_count = 0;
1442                 if (pdp->pd_fp != NULL) {
1443                         pdp->pd_fp = NULL;
1444                         delfpollinfo(fd);
1445                 }
1446                 mutex_enter(&pcp->pc_lock);
1447                 BT_CLEAR(pcp->pc_bitmap, fd);
1448                 mutex_exit(&pcp->pc_lock);
1449                 return (0);
1450         }
1451         if ((cevent & POLLCLOSED) == POLLCLOSED) {
1452                 /*
1453                  * The fd cached here has been closed. This is the first
1454                  * pcache_delete_fd() call after the close. Clean up the
1455                  * entire entry.
1456                  */
1457                 pcacheset_invalidate(ps, pdp);
1458                 ASSERT(pdp->pd_php == NULL);
1459                 mutex_enter(&pcp->pc_lock);
1460                 BT_CLEAR(pcp->pc_bitmap, fd);
1461                 mutex_exit(&pcp->pc_lock);
1462                 return (0);
1463         }
1464 #ifdef DEBUG
1465         if (getf(fd) != NULL) {
1466                 ASSERT(infpollinfo(fd));
1467                 releasef(fd);
1468         }
1469 #endif  /* DEBUG */
1470         pdp->pd_count--;
1471         ASSERT(refp->xf_refcnt > 0);
1472         if (--refp->xf_refcnt == 0) {
1473                 refp->xf_position = POLLPOSINVAL;
1474         } else {
1475                 ASSERT(pos >= refp->xf_position);
1476                 if (pos == refp->xf_position) {
1477                         /*
1478                          * The xref position is no longer valid.
1479                          * Reset it to a special value and let the
1480                          * caller know it needs to call pcache_update_xref()
1481                          * with a new xf_position value.
1482                          */
1483                         refp->xf_position = POLLPOSTRANS;
1484                         return (1);
1485                 }
1486         }
1487         return (0);
1488 }
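
     /*
      * A sketch of the contract above (see pcacheset_resolve() for the
      * real usage): when pcache_delete_fd() returns 1, the caller must
      * locate the fd's next appearance in its cached list and refresh
      * the cross reference:
      *
      *      if (pcache_delete_fd(ps, fd, pos, which, events)) {
      *              (find the next i with cached[i].fd == fd)
      *              pcache_update_xref(pcp, fd, (ssize_t)i, which);
      *      }
      */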
1489 
1490 void
1491 pcache_update_xref(pollcache_t *pcp, int fd, ssize_t pos, int which)
1492 {
1493         polldat_t       *pdp;
1494 
1495         pdp = pcache_lookup_fd(pcp, fd);
1496         ASSERT(pdp != NULL);
1497         ASSERT(pdp->pd_ref != NULL);
1498         pdp->pd_ref[which].xf_position = pos;
1499 }
1500 
1501 #ifdef DEBUG
1502 /*
1503  * For each polled fd, it's either in the bitmap or cached in the
1504  * pcache hash table. If this routine returns 0, something is wrong.
1505  */
1506 static int
1507 pollchecksanity(pollstate_t *ps, nfds_t nfds)
1508 {
1509         int             i;
1510         int             fd;
1511         pollcache_t     *pcp = ps->ps_pcache;
1512         polldat_t       *pdp;
1513         pollfd_t        *pollfdp = ps->ps_pollfd;
1514         file_t          *fp;
1515 
1516         ASSERT(MUTEX_HELD(&ps->ps_lock));
1517         for (i = 0; i < nfds; i++) {
1518                 fd = pollfdp[i].fd;
1519                 if (fd < 0) {
1520                         ASSERT(pollfdp[i].revents == 0);
1521                         continue;
1522                 }
1523                 if (pollfdp[i].revents == POLLNVAL)
1524                         continue;
1525                 if ((fp = getf(fd)) == NULL)
1526                         continue;
1527                 pdp = pcache_lookup_fd(pcp, fd);
1528                 ASSERT(pdp != NULL);
1529                 ASSERT(infpollinfo(fd));
1530                 ASSERT(pdp->pd_fp == fp);
1531                 releasef(fd);
1532                 if (BT_TEST(pcp->pc_bitmap, fd))
1533                         continue;
1534                 if (pdp->pd_php == NULL)
1535                         return (0);
1536         }
1537         return (1);
1538 }
1539 #endif  /* DEBUG */
1540 
1541 /*
1542  * Resolve the difference between the current poll list and a cached one.
1543  */
1544 int
1545 pcacheset_resolve(pollstate_t *ps, nfds_t nfds, int *fdcntp, int which)
1546 {
1547         int             i;
1548         pollcache_t     *pcp = ps->ps_pcache;
1549         pollfd_t        *newlist = NULL;
1550         pollfd_t        *current = ps->ps_pollfd;
1551         pollfd_t        *cached;
1552         pollcacheset_t  *pcsp;
1553         int             common;
1554         int             count = 0;
1555         int             offset;
1556         int             remain;
1557         int             fd;
1558         file_t          *fp;
1559         int             fdcnt = 0;
1560         int             cnt = 0;
1561         nfds_t          old_nfds;
1562         int             error = 0;
1563         int             mismatch = 0;
1564 
1565         ASSERT(MUTEX_HELD(&ps->ps_lock));
1566 #ifdef DEBUG
1567         checkpolldat(ps);
1568 #endif
1569         pcsp = &ps->ps_pcacheset[which];
1570         old_nfds = pcsp->pcs_nfds;
1571         common = (nfds > old_nfds) ? old_nfds : nfds;
1572         if (nfds != old_nfds) {
1573                 /*
1574                  * The length of the poll list has changed; allocate a new
1575                  * pollfd list.
1576                  */
1577                 newlist = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP);
1578                 bcopy(current, newlist, sizeof (pollfd_t) * nfds);
1579         }
1580         /*
1581          * Compare the overlapping part of the current fd list with the
1582          * cached one. Whenever a difference is found, resolve it.
1583          * The comparison is done on the current poll list and the
1584          * cached list. But we may be setting up the newlist to be the
1585          * cached list for next poll.
1586          */
1587         cached = pcsp->pcs_pollfd;
1588         remain = common;
1589 
1590         while (count < common) {
1591                 int     tmpfd;
1592                 pollfd_t *np;
1593 
1594                 np = (newlist != NULL) ? &newlist[count] : NULL;
1595                 offset = pcacheset_cmp(&current[count], &cached[count], np,
1596                     remain);
1597                 /*
1598                  * Collect stats. If the lists match completely on the first
1599                  * comparison, it's a hit. Otherwise, it's a partial hit or miss.
1600                  */
1601                 if ((count == 0) && (offset == common)) {
1602                         pollstats.pollcachehit.value.ui64++;
1603                 } else {
1604                         mismatch++;
1605                 }
1606                 count += offset;
1607                 if (offset < remain) {
1608                         ASSERT(count < common);
1609                         ASSERT((current[count].fd != cached[count].fd) ||
1610                             (current[count].events != cached[count].events));
1611                         /*
1612                          * Filter out invalid events.
1613                          */
1614                         if (current[count].events & ~VALID_POLL_EVENTS) {
1615                                 if (newlist != NULL) {
1616                                         newlist[count].events =
1617                                             current[count].events &=
1618                                             VALID_POLL_EVENTS;
1619                                 } else {
1620                                         current[count].events &=
1621                                             VALID_POLL_EVENTS;
1622                                 }
1623                         }
1624                         /*
1625                          * When resolving a difference, we always remove the
1626                          * old fd from the cache before inserting the new one.
1627                          */
1628                         if (cached[count].fd >= 0) {
1629                                 tmpfd = cached[count].fd;
1630                                 if (pcache_delete_fd(ps, tmpfd, count, which,
1631                                     (uint_t)cached[count].events)) {
1632                                         /*
1633                                          * This should be rare but needed for
1634                                          * correctness.
1635                                          *
1636                                          * The first appearance in the cached
1637                                          * list is being "turned off". The same
1638                                          * fd appears more than once in the
1639                                          * cached poll list. Find the next one
1640                                          * on the list and update the cached
1641                                          * xf_position field.
1642                                          */
1643                                         for (i = count + 1; i < old_nfds; i++) {
1644                                                 if (cached[i].fd == tmpfd) {
1645                                                         pcache_update_xref(pcp,
1646                                                             tmpfd, (ssize_t)i,
1647                                                             which);
1648                                                         break;
1649                                                 }
1650                                         }
1651                                         ASSERT(i <= old_nfds);
1652                                 }
1653                                 /*
1654                                  * In case a new cache list has been allocated,
1655                                  * we need to keep both cache lists in sync
1656                                  * because the new one can be freed if we hit
1657                                  * an error later.
1658                                  */
1659                                 cached[count].fd = -1;
1660                                 if (newlist != NULL) {
1661                                         newlist[count].fd = -1;
1662                                 }
1663                         }
1664                         if ((tmpfd = current[count].fd) >= 0) {
1665                                 /*
1666                                  * add to the cached fd tbl and bitmap.
1667                                  */
1668                                 if ((fp = getf(tmpfd)) == NULL) {
1669                                         current[count].revents = POLLNVAL;
1670                                         if (newlist != NULL) {
1671                                                 newlist[count].fd = -1;
1672                                         }
1673                                         cached[count].fd = -1;
1674                                         fdcnt++;
1675                                 } else {
1676                                         /*
1677                                          * Here we don't care about the
1678                                          * fdcnt. We will examine the bitmap
1679                                          * later and pick up the correct
1680                                          * fdcnt there. So we never bother
1681                                          * to check the value of 'cnt'.
1682                                          */
1683                                         error = pcache_insert(ps, fp,
1684                                             &current[count], &cnt,
1685                                             (ssize_t)count, which);
1686                                         /*
1687                                          * If there is no error, we want to do
1688                                          * the releasef() after we have updated
1689                                          * the cached poll list entry, so that
1690                                          * close() won't race with us.
1691                                          */
1692                                         if (error) {
1693                                                 /*
1694                                                  * If we encountered an error,
1695                                                  * we have invalidated an
1696                                                  * entry in cached poll list
1697                                                  * (in pcache_delete_fd() above)
1698                                                  * but failed to add one here.
1699                                                  * This is OK because what's in
1700                                                  * the cached list is consistent
1701                                                  * with the content of the cache.
1702                                                  * It will not have any ill
1703                                                  * effect on the next poll().
1704                                                  */
1705                                                 releasef(tmpfd);
1706                                                 if (newlist != NULL) {
1707                                                         kmem_free(newlist,
1708                                                             nfds *
1709                                                             sizeof (pollfd_t));
1710                                                 }
1711                                                 return (error);
1712                                         }
1713                                         /*
1714                                          * If we have allocated a new (temporary)
1715                                          * cache list, we need to keep both
1716                                          * in sync because the new one can be
1717                                          * freed if we hit an error later.
1718                                          */
1719                                         if (newlist != NULL) {
1720                                                 newlist[count].fd =
1721                                                     current[count].fd;
1722                                                 newlist[count].events =
1723                                                     current[count].events;
1724                                         }
1725                                         cached[count].fd = current[count].fd;
1726                                         cached[count].events =
1727                                             current[count].events;
1728                                         releasef(tmpfd);
1729                                 }
1730                         } else {
1731                                 current[count].revents = 0;
1732                         }
1733                         count++;
1734                         remain = common - count;
1735                 }
1736         }
1737         if (mismatch != 0) {
1738                 if (mismatch == common) {
1739                         pollstats.pollcachemiss.value.ui64++;
1740                 } else {
1741                         pollstats.pollcachephit.value.ui64++;
1742                 }
1743         }
1744         /*
1745          * Take care of the non-overlapping part of the list.
1746          */
1747         if (nfds > old_nfds) {
1748                 ASSERT(newlist != NULL);
1749                 for (i = old_nfds; i < nfds; i++) {
1750                         /* filter out invalid events */
1751                         if (current[i].events & ~VALID_POLL_EVENTS) {
1752                                 newlist[i].events = current[i].events =
1753                                     current[i].events & VALID_POLL_EVENTS;
1754                         }
1755                         if ((fd = current[i].fd) < 0) {
1756                                 current[i].revents = 0;
1757                                 continue;
1758                         }
1759                         /*
1760                          * add to the cached fd tbl and bitmap.
1761                          */
1762                         if ((fp = getf(fd)) == NULL) {
1763                                 current[i].revents = POLLNVAL;
1764                                 newlist[i].fd = -1;
1765                                 fdcnt++;
1766                                 continue;
1767                         }
1768                         /*
1769                          * Here we don't care about the
1770                          * fdcnt. We will examine the bitmap
1771                          * later and pick up the correct
1772                          * fdcnt there. So we never bother to
1773                          * check 'cnt'.
1774                          */
1775                         error = pcache_insert(ps, fp, &current[i], &cnt,
1776                             (ssize_t)i, which);
1777                         releasef(fd);
1778                         if (error) {
1779                                 /*
1780                                  * Here we are halfway through adding the newly
1781                                  * polled fds. Undo enough to keep the cache
1782                                  * list consistent with the cache content.
1783                                  */
1784                                 pcacheset_remove_list(ps, current, old_nfds,
1785                                     i, which, 0);
1786                                 kmem_free(newlist, nfds * sizeof (pollfd_t));
1787                                 return (error);
1788                         }
1789                 }
1790         }
1791         if (old_nfds > nfds) {
1792                 /*
1793                  * Remove the fds which are no longer polled.
1794                  */
1795                 pcacheset_remove_list(ps, pcsp->pcs_pollfd, nfds, old_nfds,
1796                     which, 1);
1797         }
1798         /*
1799          * The set difference has been resolved. Update nfds and the cached
1800          * list in the pollstate struct.
1801          */
1802         if (newlist != NULL) {
1803                 kmem_free(pcsp->pcs_pollfd, old_nfds * sizeof (pollfd_t));
1804                 /*
1805                  * By now, the pollfd.revents fields should
1806                  * all be zeroed.
1807                  */
1808                 pcsp->pcs_pollfd = newlist;
1809                 pcsp->pcs_nfds = nfds;
1810         }
1811         ASSERT(*fdcntp == 0);
1812         *fdcntp = fdcnt;
1813         /*
1814          * By now, for every fd in pollfdp, one of the following should be
1815          * true; otherwise we will miss a polled event.
1816          *
1817          * 1. The bit corresponding to the fd in the bitmap is set, so VOP_POLL
1818          *    will be called on this fd in the next poll.
1819          * 2. The fd is cached in the pcache (i.e. pd_php is set), so
1820          *    pollnotify will happen.
1821          */
1822         ASSERT(pollchecksanity(ps, nfds));
1823         /*
1824          * Make sure the cross references between the cached poll lists and
1825          * the cached poll fds are correct.
1826          */
1827         ASSERT(pollcheckxref(ps, which));
1828         /*
1829          * Ensure that each polldat in the pollcache references a polled fd
1830          * in the pollcacheset.
1831          */
1832 #ifdef DEBUG
1833         checkpolldat(ps);
1834 #endif
1835         return (0);
1836 }
1837 
1838 #ifdef DEBUG
1839 static int
1840 pollscanrevents(pollcache_t *pcp, pollfd_t *pollfdp, nfds_t nfds)
1841 {
1842         int i;
1843         int reventcnt = 0;
1844 
1845         for (i = 0; i < nfds; i++) {
1846                 if (pollfdp[i].fd < 0) {
1847                         ASSERT(pollfdp[i].revents == 0);
1848                         continue;
1849                 }
1850                 if (pollfdp[i].revents) {
1851                         reventcnt++;
1852                 }
1853                 if (pollfdp[i].revents && (pollfdp[i].revents != POLLNVAL)) {
1854                         ASSERT(BT_TEST(pcp->pc_bitmap, pollfdp[i].fd));
1855                 }
1856         }
1857         return (reventcnt);
1858 }
1859 #endif  /* DEBUG */
1860 
1861 /*
1862  * Read the bitmap and poll the fds corresponding to the '1' bits. The ps_lock
1863  * is held upon entry.
1864  */
1865 int
1866 pcache_poll(pollfd_t *pollfdp, pollstate_t *ps, nfds_t nfds, int *fdcntp,
1867     int which)
1868 {
1869         int             i;
1870         pollcache_t     *pcp;
1871         int             fd;
1872         int             begin, end, done;
1873         pollhead_t      *php;
1874         int             fdcnt;
1875         int             error = 0;
1876         file_t          *fp;
1877         polldat_t       *pdp;
1878         xref_t          *refp;
1879         int             entry;
1880 
1881         pcp = ps->ps_pcache;
1882         ASSERT(MUTEX_HELD(&ps->ps_lock));
1883         ASSERT(MUTEX_HELD(&pcp->pc_lock));
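             /*
              * The retry label restarts the bitmap scan from the beginning.
              * It is taken whenever a wakeup may have been missed (a pollhead
              * changed or was inserted for the first time) or when
              * plist_chkdupfd() indicates that a rescan is needed.
              */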
1884 retry:
1885         done = 0;
1886         begin = 0;
1887         fdcnt = 0;
1888         end = pcp->pc_mapend;
1889         while ((fdcnt < nfds) && !done) {
1890                 php = NULL;
1891                 /*
1892                  * only poll fds which may have events
1893                  */
1894                 fd = bt_getlowbit(pcp->pc_bitmap, begin, end);
1895                 ASSERT(fd <= end);
1896                 if (fd >= 0) {
1897                         ASSERT(pollcheckrevents(ps, begin, fd, which));
1898                         /*
1899                          * adjust map pointers for next round
1900                          */
1901                         if (fd == end) {
1902                                 done = 1;
1903                         } else {
1904                                 begin = fd + 1;
1905                         }
1906                         /*
1907                          * A bitmap caches poll state information of
1908                          * multiple poll lists. Call VOP_POLL only if
1909                          * the bit corresponds to an fd in this poll
1910                          * list.
1911                          */
1912                         pdp = pcache_lookup_fd(pcp, fd);
1913                         ASSERT(pdp != NULL);
1914                         ASSERT(pdp->pd_ref != NULL);
1915                         refp = &pdp->pd_ref[which];
1916                         if (refp->xf_refcnt == 0)
1917                                 continue;
1918                         entry = refp->xf_position;
1919                         ASSERT((entry >= 0) && (entry < nfds));
1920                         ASSERT(pollfdp[entry].fd == fd);
1921                         /*
1922                          * Being in this routine implies that we have
1923                          * successfully polled this fd in the past.
1924                          * Check to see whether this fd was closed while we
1925                          * were blocked in poll. This ensures that we don't
1926                          * miss a close on the fd in case this fd is
1927                          * reused.
1928                          */
1929                         if (pdp->pd_fp == NULL) {
1930                                 ASSERT(pdp->pd_count > 0);
1931                                 pollfdp[entry].revents = POLLNVAL;
1932                                 fdcnt++;
1933                                 if (refp->xf_refcnt > 1) {
1934                                         /*
1935                                          * This fd appeared multiple times
1936                                          * in the poll list. Find all of them.
1937                                          */
1938                                         for (i = entry + 1; i < nfds; i++) {
1939                                                 if (pollfdp[i].fd == fd) {
1940                                                         pollfdp[i].revents =
1941                                                             POLLNVAL;
1942                                                         fdcnt++;
1943                                                 }
1944                                         }
1945                                 }
1946                                 pcacheset_invalidate(ps, pdp);
1947                                 continue;
1948                         }
1949                         /*
1950                          * We can be here polling a device that is being
1951                          * closed (i.e. the file pointer is set to NULL,
1952                          * but pollcacheclean has not happened yet).
1953                          */
1954                         if ((fp = getf(fd)) == NULL) {
1955                                 pollfdp[entry].revents = POLLNVAL;
1956                                 fdcnt++;
1957                                 if (refp->xf_refcnt > 1) {
1958                                         /*
1959                                          * This fd appeared multiple times
1960                                          * in the poll list. Find all of them.
1961                                          */
1962                                         for (i = entry + 1; i < nfds; i++) {
1963                                                 if (pollfdp[i].fd == fd) {
1964                                                         pollfdp[i].revents =
1965                                                             POLLNVAL;
1966                                                         fdcnt++;
1967                                                 }
1968                                         }
1969                                 }
1970                                 continue;
1971                         }
1972                         ASSERT(pdp->pd_fp == fp);
1973                         ASSERT(infpollinfo(fd));
1974                         /*
1975                          * Since we no longer hold the pollhead lock across
1976                          * VOP_POLL, the pollunlock logic can be simplified.
1977                          */
1978                         ASSERT(pdp->pd_php == NULL ||
1979                             MUTEX_NOT_HELD(PHLOCK(pdp->pd_php)));
1980                         /*
1981                          * The underlying file system may set a "pollpending"
1982                          * flag when it sees that the poll may block. Pollwakeup()
1983                          * is called by the wakeup thread if pollpending is set.
1984                          * Pass 0 as the third ("anyyet") parameter so that the
1985                          * underlying file system will set the "pollpending"
1986                          * flag when there are no polled events.
1987                          *
1988                          * Use pollfdp[].events for the actual polling because
1989                          * pd_events is the union of all cached poll events
1990                          * on this fd. The events parameter also affects
1991                          * how the polled device sets the "pollpending"
1992                          * flag.
1993                          */
1994                         ASSERT(curthread->t_pollcache == NULL);
1995                         error = VOP_POLL(fp->f_vnode, pollfdp[entry].events, 0,
1996                             &pollfdp[entry].revents, &php, NULL);
1997                         /*
1998                          * Do the releasef() only after we are completely done
1999                          * with this cached poll entry, to prevent close()
2000                          * from coming in and clearing this entry.
2001                          */
2002                         if (error) {
2003                                 releasef(fd);
2004                                 break;
2005                         }
2006                         /*
2007                          * layered devices (e.g. console driver)
2008                          * may change the vnode and thus the pollhead
2009                          * pointer out from underneath us.
2010                          */
2011                         if (php != NULL && pdp->pd_php != NULL &&
2012                             php != pdp->pd_php) {
2013                                 releasef(fd);
2014                                 pollhead_delete(pdp->pd_php, pdp);
2015                                 pdp->pd_php = php;
2016                                 pollhead_insert(php, pdp);
2017                                 /*
2018                                  * We could have missed a wakeup on the new
2019                                  * target device. Make sure the new target
2020                                  * gets polled once.
2021                                  */
2022                                 BT_SET(pcp->pc_bitmap, fd);
2023                                 goto retry;
2024                         }
2025 
2026                         if (pollfdp[entry].revents) {
2027                                 ASSERT(refp->xf_refcnt >= 1);
2028                                 fdcnt++;
2029                                 if (refp->xf_refcnt > 1) {
2030                                         /*
2031                                          * This fd appeared multiple times
2032                                          * in the poll list. This is rare but
2033                                          * we have to look at all of them for
2034                                          * correctness.
2035                                          */
2036                                         error = plist_chkdupfd(fp, pdp, ps,
2037                                             pollfdp, entry, &fdcnt);
2038                                         if (error > 0) {
2039                                                 releasef(fd);
2040                                                 break;
2041                                         }
2042                                         if (error < 0) {
2043                                                 goto retry;
2044                                         }
2045                                 }
2046                                 releasef(fd);
2047                         } else {
2048                                 /*
2049                                 * VOP_POLL didn't return any revents. We can
2050                                 * clear the bit in the bitmap only if we have
2051                                 * the pollhead ptr cached and no other cached
2052                                 * entry is polling different events on this fd.
2053                                 * VOP_POLL may have dropped the ps_lock. Make
2054                                 * sure a pollwakeup has not happened before
2055                                 * clearing the bit.
2056                                  */
2057                                 if ((pdp->pd_php != NULL) &&
2058                                     (pollfdp[entry].events == pdp->pd_events) &&
2059                                     ((pcp->pc_flag & PC_POLLWAKE) == 0)) {
2060                                         BT_CLEAR(pcp->pc_bitmap, fd);
2061                                 }
2062                                 /*
2063                                  * If the fd can be cached now but could not
2064                                  * be before, do it now.
2065                                  */
2066                                 if ((pdp->pd_php == NULL) && (php != NULL)) {
2067                                         pdp->pd_php = php;
2068                                         pollhead_insert(php, pdp);
2069                                         /*
2070                                          * We are inserting a polldat struct for
2071                                          * the first time. We may have missed a
2072                                          * wakeup on this device. Re-poll once.
2073                                          * This should be a rare event.
2074                                          */
2075                                         releasef(fd);
2076                                         goto retry;
2077                                 }
2078                                 if (refp->xf_refcnt > 1) {
2079                                         /*
2080                                          * This fd appeared multiple times
2081                                          * in the poll list. This is rare but
2082                                          * we have to look at all of them for
2083                                          * correctness.
2084                                          */
2085                                         error = plist_chkdupfd(fp, pdp, ps,
2086                                             pollfdp, entry, &fdcnt);
2087                                         if (error > 0) {
2088                                                 releasef(fd);
2089                                                 break;
2090                                         }
2091                                         if (error < 0) {
2092                                                 goto retry;
2093                                         }
2094                                 }
2095                                 releasef(fd);
2096                         }
2097                 } else {
2098                         done = 1;
2099                         ASSERT(pollcheckrevents(ps, begin, end + 1, which));
2100                 }
2101         }
2102         if (!error) {
2103                 ASSERT(*fdcntp + fdcnt == pollscanrevents(pcp, pollfdp, nfds));
2104                 *fdcntp += fdcnt;
2105         }
2106         return (error);
2107 }
2108 
2109 /*
2110  * Go through the poll list without much locking. Poll all fds and
2111  * cache all valid fds in the pollcache.
2112  */
2113 int
2114 pcacheset_cache_list(pollstate_t *ps, pollfd_t *fds, int *fdcntp, int which)
2115 {
2116         pollfd_t        *pollfdp = ps->ps_pollfd;
2117         pollcacheset_t  *pcacheset = ps->ps_pcacheset;
2118         pollfd_t        *newfdlist;
2119         int             i;
2120         int             fd;
2121         file_t          *fp;
2122         int             error = 0;
2123 
2124         ASSERT(MUTEX_HELD(&ps->ps_lock));
2125         ASSERT(which < ps->ps_nsets);
2126         ASSERT(pcacheset != NULL);
2127         ASSERT(pcacheset[which].pcs_pollfd == NULL);
2128         newfdlist  = kmem_alloc(ps->ps_nfds * sizeof (pollfd_t), KM_SLEEP);
2129         /*
2130          * Cache the new poll list in the pollcacheset.
2131          */
2132         bcopy(pollfdp, newfdlist, sizeof (pollfd_t) * ps->ps_nfds);
2133 
2134         pcacheset[which].pcs_pollfd = newfdlist;
2135         pcacheset[which].pcs_nfds = ps->ps_nfds;
2136         pcacheset[which].pcs_usradr = (uintptr_t)fds;
2137 
2138         /*
2139          * We have saved a copy of the current poll fd list in one pollcacheset.
2140          * The 'revents' fields of the new list are not yet set to 0. Looping
2141          * through the new list just to do that would be expensive; we do it
2142          * while polling the list.
2143          */
2144         for (i = 0; i < ps->ps_nfds; i++) {
2145                 fd = pollfdp[i].fd;
2146                 /*
2147                  * We also filter out the illegal poll events in the events
2148                  * field for the cached poll list/set.
2149                  */
2150                 if (pollfdp[i].events & ~VALID_POLL_EVENTS) {
2151                         newfdlist[i].events = pollfdp[i].events =
2152                             pollfdp[i].events & VALID_POLL_EVENTS;
2153                 }
2154                 if (fd < 0) {
2155                         pollfdp[i].revents = 0;
2156                         continue;
2157                 }
2158                 if ((fp = getf(fd)) == NULL) {
2159                         pollfdp[i].revents = POLLNVAL;
2160                         /*
2161                          * invalidate this cache entry in the cached poll list
2162                          */
2163                         newfdlist[i].fd = -1;
2164                         (*fdcntp)++;
2165                         continue;
2166                 }
2167                 /*
2168                  * cache this fd.
2169                  */
2170                 error = pcache_insert(ps, fp, &pollfdp[i], fdcntp, (ssize_t)i,
2171                     which);
2172                 releasef(fd);
2173                 if (error) {
2174                         /*
2175                          * Here we are halfway through caching a new
2176                          * poll list. Undo everything.
2177                          */
2178                         pcacheset_remove_list(ps, pollfdp, 0, i, which, 0);
2179                         kmem_free(newfdlist, ps->ps_nfds * sizeof (pollfd_t));
2180                         pcacheset[which].pcs_pollfd = NULL;
2181                         pcacheset[which].pcs_usradr = (uintptr_t)NULL;
2182                         break;
2183                 }
2184         }
2185         return (error);
2186 }
2187 
2188 /*
2189  * Called by pollcacheclean() to set the fp to NULL. It also sets the polled
2190  * events in pcacheset entries to the special event 'POLLCLOSED'. Do a
2191  * pollwakeup to wake any sleeping poller, then remove the polldat from the
2192  * driver. The routine is called with ps_lock held.
2193  */
2194 void
2195 pcache_clean_entry(pollstate_t *ps, int fd)
2196 {
2197         pollcache_t     *pcp;
2198         polldat_t       *pdp;
2199         int             i;
2200 
2201         ASSERT(ps != NULL);
2202         ASSERT(MUTEX_HELD(&ps->ps_lock));
2203         pcp = ps->ps_pcache;
2204         ASSERT(pcp);
2205         pdp = pcache_lookup_fd(pcp, fd);
2206         ASSERT(pdp != NULL);
2207         /*
2208          * the corresponding fpollinfo in fi_list has been removed by
2209          * a close on this fd. Reset the cached fp ptr here.
2210          */
2211         pdp->pd_fp = NULL;
2212         /*
2213          * XXX - This routine also touches data in pcacheset struct.
2214          *
2215          * Set the events in the cached poll lists to POLLCLOSED. This
2216          * invalidates the cached poll fd entry in that poll list, which will
2217          * force a removal of this cached entry in the next poll(). The
2218          * cleanup is done at removal time.
2219          */
2220         ASSERT(pdp->pd_ref != NULL);
2221         for (i = 0; i < ps->ps_nsets; i++) {
2222                 xref_t          *refp;
2223                 pollcacheset_t  *pcsp;
2224 
2225                 refp = &pdp->pd_ref[i];
2226                 if (refp->xf_refcnt) {
2227                         ASSERT(refp->xf_position >= 0);
2228                         pcsp = &ps->ps_pcacheset[i];
2229                         if (refp->xf_refcnt == 1) {
2230                                 pcsp->pcs_pollfd[refp->xf_position].events =
2231                                     (short)POLLCLOSED;
2232                         }
2233                         if (refp->xf_refcnt > 1) {
2234                                 int     j;
2235                                 /*
2236                                  * mark every matching entry in pcs_pollfd
2237                                  */
2238                                 for (j = refp->xf_position;
2239                                     j < pcsp->pcs_nfds; j++) {
2240                                         if (pcsp->pcs_pollfd[j].fd == fd) {
2241                                                 pcsp->pcs_pollfd[j].events =
2242                                                     (short)POLLCLOSED;
2243                                         }
2244                                 }
2245                         }
2246                 }
2247         }
2248         if (pdp->pd_php) {
2249                 pollwakeup(pdp->pd_php, POLLHUP);
2250                 pollhead_delete(pdp->pd_php, pdp);
2251                 pdp->pd_php = NULL;
2252         }
2253 }
2254 
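     /*
      * Wake any pollers sleeping on parent pollcaches that are validly
      * linked to this one, so that recursive /dev/poll callers re-evaluate
      * this cache. The caller must hold pc_lock, as asserted below.
      */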
2255 void
2256 pcache_wake_parents(pollcache_t *pcp)
2257 {
2258         pcachelink_t *pl, *pln;
2259 
2260         ASSERT(MUTEX_HELD(&pcp->pc_lock));
2261 
2262         for (pl = pcp->pc_parents; pl != NULL; pl = pln) {
2263                 mutex_enter(&pl->pcl_lock);
2264                 if (pl->pcl_state == PCL_VALID) {
2265                         ASSERT(pl->pcl_parent_pc != NULL);
2266                         cv_broadcast(&pl->pcl_parent_pc->pc_cv);
2267                 }
2268                 pln = pl->pcl_parent_next;
2269                 mutex_exit(&pl->pcl_lock);
2270         }
2271 }
2272 
2273 /*
2274  * Initialize the thread's pollstate structure.
2275  * It will persist for the life of the thread, until it calls pollcleanup().
2276  */
2277 pollstate_t *
2278 pollstate_create()
2279 {
2280         pollstate_t *ps = curthread->t_pollstate;
2281 
2282         if (ps == NULL) {
2283                 /*
2284                  * This is the first time this thread has ever polled, so we
2285                  * have to create its pollstate structure.
2286                  */
2287                 ps = kmem_zalloc(sizeof (pollstate_t), KM_SLEEP);
2288                 ps->ps_nsets = POLLFDSETS;
2289                 ps->ps_pcacheset = pcacheset_create(ps->ps_nsets);
2290                 curthread->t_pollstate = ps;
2291         } else {
2292                 ASSERT(ps->ps_depth == 0);
2293                 ASSERT(ps->ps_flags == 0);
2294                 ASSERT(ps->ps_pc_stack[0] == 0);
2295         }
2296         return (ps);
2297 }
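
     /*
      * Typical usage (an illustrative sketch, not a verbatim caller from
      * this file): callers obtain the per-thread state and then serialize
      * on ps_lock before touching the cache:
      *
      *      pollstate_t *ps = pollstate_create();
      *
      *      mutex_enter(&ps->ps_lock);
      *      (work with ps->ps_pcache and ps->ps_pcacheset)
      *      mutex_exit(&ps->ps_lock);
      */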
2298 
2299 void
2300 pollstate_destroy(pollstate_t *ps)
2301 {
2302         if (ps->ps_pollfd != NULL) {
2303                 kmem_free(ps->ps_pollfd, ps->ps_nfds * sizeof (pollfd_t));
2304                 ps->ps_pollfd = NULL;
2305         }
2306         if (ps->ps_pcache != NULL) {
2307                 pcache_destroy(ps->ps_pcache);
2308                 ps->ps_pcache = NULL;
2309         }
2310         pcacheset_destroy(ps->ps_pcacheset, ps->ps_nsets);
2311         ps->ps_pcacheset = NULL;
2312         if (ps->ps_dpbuf != NULL) {
2313                 kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize);
2314                 ps->ps_dpbuf = NULL;
2315         }
2316         mutex_destroy(&ps->ps_lock);
2317         kmem_free(ps, sizeof (pollstate_t));
2318 }
2319 
2320 static int
2321 pollstate_contend(pollstate_t *ps, pollcache_t *pcp)
2322 {
2323         pollstate_t *rem, *next;
2324         pollcache_t *desired_pc;
2325         int result = 0, depth_total;
2326 
2327         mutex_enter(&pollstate_contenders_lock);
2328         /*
2329          * There is a small chance that the pollcache of interest became
2330          * available while we were waiting on the contenders lock.
2331          */
2332         if (mutex_tryenter(&pcp->pc_lock) != 0) {
2333                 goto out;
2334         }
2335 
2336         /*
2337          * Walk the list of contended pollstates, searching for evidence of a
2338          * deadlock condition.
2339          */
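             /*
              * Illustrative example: thread A holds pc1 and blocks on pc2
              * while thread B holds pc2 and blocks on pc1. Whichever thread
              * walks the contenders list next discovers the cycle and gives
              * up instead of blocking forever.
              */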
2340         depth_total = ps->ps_depth;
2341         desired_pc = pcp;
2342         for (rem = pollstate_contenders; rem != NULL; rem = next) {
2343                 int i, j;
2344                 next = rem->ps_contend_nextp;
2345 
2346                 /* Is this pollstate holding the pollcache of interest? */
2347                 for (i = 0; i < rem->ps_depth; i++) {
2348                         if (rem->ps_pc_stack[i] != desired_pc) {
2349                                 continue;
2350                         }
2351 
2352                         /*
2353                          * The remote pollstate holds the pollcache lock we
2354                          * desire.  If it is waiting on a pollcache we hold,
2355                          * then we can report the obvious deadlock.
2356                          */
2357                         ASSERT(rem->ps_contend_pc != NULL);
2358                         for (j = 0; j < ps->ps_depth; j++) {
2359                                 if (rem->ps_contend_pc == ps->ps_pc_stack[j]) {
2360                                         rem->ps_flags |= POLLSTATE_STALEMATE;
2361                                         result = -1;
2362                                         goto out;
2363                                 }
2364                         }
2365 
2366                         /*
2367                          * The remote pollstate is not blocking on a pollcache
2368                          * which would deadlock against us.  That pollcache
2369                          * may, however, be held by a pollstate which would
2370                          * result in a deadlock.
2371                          *
2372                          * To detect such a condition, we continue walking
2373                          * through the list using the pollcache blocking the
2374                          * remote thread as our new search target.
2375                          *
2376                          * Return to the front of pollstate_contenders since it
2377                          * is not ordered to guarantee complete dependency
2378                          * traversal.  The below depth tracking places an upper
2379                          * bound on iterations.
2380                          */
2381                         desired_pc = rem->ps_contend_pc;
2382                         next = pollstate_contenders;
2383 
2384                         /*
2385                          * The recursion depth of the remote pollstate is used
2386                          * to calculate a final depth for the local /dev/poll
2387                          * recursion, since those locks will be acquired
2388                          * eventually.  If that value exceeds the defined
2389                          * limit, we can report the failure now instead of
2390                          * recursing to that failure depth.
2391                          */
2392                         depth_total += (rem->ps_depth - i);
2393                         if (depth_total >= POLLMAXDEPTH) {
2394                                 result = -1;
2395                                 goto out;
2396                         }
2397                 }
2398         }
2399 
2400         /*
2401          * No deadlock partner was found.  The only course of action is to
2402          * record ourself as a contended pollstate and wait for the pollcache
2403          * mutex to become available.
2404          */
2405         ps->ps_contend_pc = pcp;
2406         ps->ps_contend_nextp = pollstate_contenders;
2407         ps->ps_contend_pnextp = &pollstate_contenders;
2408         if (pollstate_contenders != NULL) {
2409                 pollstate_contenders->ps_contend_pnextp =
2410                     &ps->ps_contend_nextp;
2411         }
2412         pollstate_contenders = ps;
2413 
2414         mutex_exit(&pollstate_contenders_lock);
2415         mutex_enter(&pcp->pc_lock);
2416         mutex_enter(&pollstate_contenders_lock);
2417 
2418         /*
2419          * Our acquisition of the pollcache mutex may be due to another thread
2420          * giving up in the face of deadlock with us.  If that is the case,
2421          * we too should report the failure.
2422          */
2423         if ((ps->ps_flags & POLLSTATE_STALEMATE) != 0) {
2424                 result = -1;
2425                 ps->ps_flags &= ~POLLSTATE_STALEMATE;
2426                 mutex_exit(&pcp->pc_lock);
2427         }
2428 
2429         /* Remove ourself from the contenders list. */
2430         if (ps->ps_contend_nextp != NULL) {
2431                 ps->ps_contend_nextp->ps_contend_pnextp =
2432                     ps->ps_contend_pnextp;
2433         }
2434         *ps->ps_contend_pnextp = ps->ps_contend_nextp;
2435         ps->ps_contend_pc = NULL;
2436         ps->ps_contend_nextp = NULL;
2437         ps->ps_contend_pnextp = NULL;
2438 
2439 out:
2440         mutex_exit(&pollstate_contenders_lock);
2441         return (result);
2442 }
2443 
2444 int
2445 pollstate_enter(pollcache_t *pcp)
2446 {
2447         pollstate_t *ps = curthread->t_pollstate;
2448         int i;
2449 
2450         if (ps == NULL) {
2451                 /*
2452                  * The thread pollstate may not be initialized if VOP_POLL is
2453                  * called on a recursion-enabled /dev/poll handle from outside
2454                  * the poll() or /dev/poll codepaths.
2455                  */
2456                 return (PSE_FAIL_POLLSTATE);
2457         }
2458         if (ps->ps_depth >= POLLMAXDEPTH) {
2459                 return (PSE_FAIL_DEPTH);
2460         }
2461         /*
2462          * Check the desired pollcache against the pollcaches we already have
2463          * locked.  Such a loop is the simplest deadlock scenario.
2464          */
2465         for (i = 0; i < ps->ps_depth; i++) {
2466                 if (ps->ps_pc_stack[i] == pcp) {
2467                         return (PSE_FAIL_LOOP);
2468                 }
2469         }
2470         ASSERT(ps->ps_pc_stack[i] == NULL);
2471 
2472         if (ps->ps_depth == 0) {
2473                 /* Locking the initial pollcache requires no caution. */
2474                 mutex_enter(&pcp->pc_lock);
2475         } else if (mutex_tryenter(&pcp->pc_lock) == 0) {
2476                 if (pollstate_contend(ps, pcp) != 0) {
2477                         /* This pollcache cannot safely be locked. */
2478                         return (PSE_FAIL_DEADLOCK);
2479                 }
2480         }
2481 
2482         ps->ps_pc_stack[ps->ps_depth++] = pcp;
2483         return (PSE_SUCCESS);
2484 }
2485 
2486 void
2487 pollstate_exit(pollcache_t *pcp)
2488 {
2489         pollstate_t *ps = curthread->t_pollstate;
2490 
2491         VERIFY(ps != NULL);
2492         VERIFY(ps->ps_pc_stack[ps->ps_depth - 1] == pcp);
2493 
2494         mutex_exit(&pcp->pc_lock);
2495         ps->ps_pc_stack[--ps->ps_depth] = NULL;
2496         VERIFY(ps->ps_depth >= 0);
2497 }
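
     /*
      * A sketch of the expected pairing (hypothetical caller): recursive
      * pollcache access is bracketed by pollstate_enter()/pollstate_exit(),
      * and any PSE_FAIL_* result means the cache must not be touched:
      *
      *      if (pollstate_enter(pcp) != PSE_SUCCESS)
      *              return (error);         (caller-specific handling)
      *
      *      (scan the pollcache)
      *      pollstate_exit(pcp);
      */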
2498 
2499 
2500 /*
2501  * We are holding the appropriate uf_lock entering this routine.
2502  * Bump up the pc_busy count to prevent the thread from exiting.
2503  */
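     /*
      * Each pc_busy increment taken here is balanced by a decrement in
      * pollcacheclean(), which signals pc_busy_cv once the count drops to
      * zero so that the thread waiting in thread_exit() may proceed.
      */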
2504 void
2505 pollblockexit(fpollinfo_t *fpip)
2506 {
2507         for (; fpip; fpip = fpip->fp_next) {
2508                 pollcache_t *pcp = fpip->fp_thread->t_pollstate->ps_pcache;
2509 
2510                 mutex_enter(&pcp->pc_no_exit);
2511                 pcp->pc_busy++;  /* prevents exit()'s */
2512                 mutex_exit(&pcp->pc_no_exit);
2513         }
2514 }
2515 
2516 /*
2517  * Complete phase 2 of cached poll fd cleanup. Call pcache_clean_entry to mark
2518  * the pcacheset events field POLLCLOSED, forcing the next poll() to remove
2519  * this cache entry. We can't clean up the polldat entry here because an
2520  * lwp blocked in poll() needs the info to return. Wake up anyone blocked in
2521  * poll and let the exiting lwp go. No lock is held upon entry, so it's OK for
2522  * pcache_clean_entry to call pollwakeup().
2523  */
2524 void
2525 pollcacheclean(fpollinfo_t *fip, int fd)
2526 {
2527         struct fpollinfo        *fpip, *fpip2;
2528 
2529         fpip = fip;
2530         while (fpip) {
2531                 pollstate_t *ps = fpip->fp_thread->t_pollstate;
2532                 pollcache_t *pcp = ps->ps_pcache;
2533 
2534                 mutex_enter(&ps->ps_lock);
2535                 pcache_clean_entry(ps, fd);
2536                 mutex_exit(&ps->ps_lock);
2537                 mutex_enter(&pcp->pc_no_exit);
2538                 pcp->pc_busy--;
2539                 if (pcp->pc_busy == 0) {
2540                         /*
2541                          * Wake up the thread waiting in
2542                          * thread_exit().
2543                          */
2544                         cv_signal(&pcp->pc_busy_cv);
2545                 }
2546                 mutex_exit(&pcp->pc_no_exit);
2547 
2548                 fpip2 = fpip;
2549                 fpip = fpip->fp_next;
2550                 kmem_free(fpip2, sizeof (fpollinfo_t));
2551         }
2552 }
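
     /*
      * For context, a sketch (not code from this file) of the other side
      * of the pc_busy handshake: the exiting lwp presumably waits for all
      * pollblockexit()/pollcacheclean() pairs to drain before tearing down
      * its pollcache, along these lines:
      *
      *         mutex_enter(&pcp->pc_no_exit);
      *         while (pcp->pc_busy > 0)
      *                 cv_wait(&pcp->pc_busy_cv, &pcp->pc_no_exit);
      *         mutex_exit(&pcp->pc_no_exit);
      */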
2553 
2554 /*
2555  * One of the cache line counters is wrapping around. Reset all cache line
2556  * counters to zero except the one at "index". This is simplistic, but
2557  * probably works effectively.
2558  */
2559 void
2560 pcacheset_reset_count(pollstate_t *ps, int index)
2561 {
2562         int     i;
2563 
2564         ASSERT(MUTEX_HELD(&ps->ps_lock));
2565         for (i = 0; i < ps->ps_nsets; i++) {
2566                 if (ps->ps_pcacheset[i].pcs_pollfd != NULL) {
2567                         ps->ps_pcacheset[i].pcs_count = 0;
2568                 }
2569         }
2570         ps->ps_pcacheset[index].pcs_count = 1;
2571 }
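
     /*
      * An illustrative example: if the per-set use counts are {7, 42, 3}
      * and the counter of set 1 is the one about to wrap, the counts
      * become {0, 1, 0} -- relative usage history is lost, but the wrap
      * is avoided.
      */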
2572 
2573 /*
2574  * This routine implements the poll cache list replacement policy.
2575  * It currently chooses the "least used" set.
2576  */
2577 int
2578 pcacheset_replace(pollstate_t *ps)
2579 {
2580         int i;
2581         int index = 0;
2582 
2583         ASSERT(MUTEX_HELD(&ps->ps_lock));
2584         for (i = 1; i < ps->ps_nsets; i++) {
2585                 if (ps->ps_pcacheset[index].pcs_count >
2586                     ps->ps_pcacheset[i].pcs_count) {
2587                         index = i;
2588                 }
2589         }
2590         ps->ps_pcacheset[index].pcs_count = 0;
2591         return (index);
2592 }
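
     /*
      * An illustrative example: with per-set use counts {5, 2, 9}, the
      * scan above settles on index 1 (the smallest count), zeroes that
      * count, and returns 1 so the caller can recycle that cache set.
      */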
2593 
2594 /*
2595  * This routine is called by strclose to remove any remaining polldat
2596  * structs on the pollhead list of the device being closed. There are two
2597  * reasons why the polldat structures may still remain on the pollhead list:
2598  *
2599  * (1) A layered device (e.g. the console driver).
2600  * In this case, the existence of a polldat implies that the thread putting
2601  * the polldat on this list has not exited yet. Before the thread exits, it
2602  * will have to hold this pollhead lock to remove the polldat. So holding the
2603  * pollhead lock here effectively prevents the thread which put the polldat
2604  * on this list from exiting.
2605  *
2606  * (2) /dev/poll.
2607  * When a polled fd is cached in /dev/poll, its polldat will remain on the
2608  * pollhead list if the process has not done a POLLREMOVE before closing the
2609  * polled fd. We just unlink it here.
2610  */
2611 void
2612 pollhead_clean(pollhead_t *php)
2613 {
2614         polldat_t       *pdp;
2615 
2616         /*
2617          * In case (1), while we must prevent the thread in question from
2618          * exiting, we must also obey the proper locking order, i.e.
2619          * (ps_lock -> phlock).
2620          */
2621         PH_ENTER(php);
2622         while (php->ph_list != NULL) {
2623                 pollstate_t     *ps;
2624                 pollcache_t     *pcp;
2625 
2626                 pdp = php->ph_list;
2627                 ASSERT(pdp->pd_php == php);
2628                 if (pdp->pd_thread == NULL) {
2629                         /*
2630                          * This is case(2). Since the ph_lock is sufficient
2631                          * to synchronize this lwp with any other /dev/poll
2632                          * lwp, just unlink the polldat.
2633                          */
2634                         php->ph_list = pdp->pd_next;
2635                         pdp->pd_php = NULL;
2636                         pdp->pd_next = NULL;
2637                         continue;
2638                 }
2639                 ps = pdp->pd_thread->t_pollstate;
2640                 ASSERT(ps != NULL);
2641                 pcp = pdp->pd_pcache;
2642                 ASSERT(pcp != NULL);
2643                 mutex_enter(&pcp->pc_no_exit);
2644                 pcp->pc_busy++;  /* prevents exit()'s */
2645                 mutex_exit(&pcp->pc_no_exit);
2646                 /*
2647                  * Now get the locks in proper order to avoid deadlock.
2648                  */
2649                 PH_EXIT(php);
2650                 mutex_enter(&ps->ps_lock);
2651                 /*
2652                  * While the pollhead lock was dropped, the element could
2653                  * have been taken off the list already.
2654                  */
2655                 PH_ENTER(php);
2656                 if (pdp->pd_php == php) {
2657                         ASSERT(pdp == php->ph_list);
2658                         php->ph_list = pdp->pd_next;
2659                         pdp->pd_php = NULL;
2660                         pdp->pd_next = NULL;
2661                 }
2662                 PH_EXIT(php);
2663                 mutex_exit(&ps->ps_lock);
2664                 mutex_enter(&pcp->pc_no_exit);
2665                 pcp->pc_busy--;
2666                 if (pcp->pc_busy == 0) {
2667                         /*
2668                          * Wake up the thread waiting in
2669                          * thread_exit().
2670                          */
2671                         cv_signal(&pcp->pc_busy_cv);
2672                 }
2673                 mutex_exit(&pcp->pc_no_exit);
2674                 PH_ENTER(php);
2675         }
2676         PH_EXIT(php);
2677 }
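
     /*
      * To summarize the lock dance in case (1) above (a sketch, not
      * additional code): starting from the pollhead side, the proper
      * ps_lock -> phlock order is restored by
      *
      *         PH_EXIT(php);
      *         mutex_enter(&ps->ps_lock);
      *         PH_ENTER(php);
      *
      * with the pc_busy count pinning the polldat's owning thread in the
      * meantime, and the recheck of pd_php guarding against the entry
      * having been unlinked while the pollhead lock was dropped.
      */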
2678 
2679 /*
2680  * pcacheset_remove_list() is called to clean up a partially cached
2681  * 'current' list, or to remove a partial list which is no longer cached.
2682  * A flag value of 1 indicates the second case.
2683  */
2684 void
2685 pcacheset_remove_list(pollstate_t *ps, pollfd_t *pollfdp, int start, int end,
2686     int cacheindex, int flag)
2687 {
2688         int i;
2689 
2690         ASSERT(MUTEX_HELD(&ps->ps_lock));
2691         for (i = start; i < end; i++) {
2692                 if ((pollfdp[i].fd >= 0) &&
2693                     (flag || !(pollfdp[i].revents & POLLNVAL))) {
2694                         if (pcache_delete_fd(ps, pollfdp[i].fd, i, cacheindex,
2695                             (uint_t)pollfdp[i].events)) {
2696                                 int j;
2697                                 int fd = pollfdp[i].fd;
2698 
2699                                 for (j = i + 1; j < end; j++) {
2700                                         if (pollfdp[j].fd == fd) {
2701                                                 pcache_update_xref(
2702                                                     ps->ps_pcache, fd,
2703                                                     (ssize_t)j, cacheindex);
2704                                                 break;
2705                                         }
2706                                 }
2707                                 ASSERT(j <= end);
2708                         }
2709                 }
2710         }
2711 }
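
     /*
      * An illustrative example: if positions i..i+2 hold {fd=4, fd=7,
      * fd=4} and the fd=4 entry at position i is deleted, a nonzero
      * return from pcache_delete_fd() indicates that fd 4 is still
      * referenced by this cache set; the inner loop then finds the
      * surviving duplicate at position i+2 and repoints the xref there
      * via pcache_update_xref().
      */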
2712 
2713 #ifdef DEBUG
2714 
2715 #include <sys/strsubr.h>
2716 /*
2717  * make sure curthread is not on anyone's pollhead list any more.
2718  */
2719 static void
2720 pollcheckphlist()
2721 {
2722         int i;
2723         file_t *fp;
2724         uf_entry_t *ufp;
2725         uf_info_t *fip = P_FINFO(curproc);
2726         struct stdata *stp;
2727         polldat_t *pdp;
2728 
2729         mutex_enter(&fip->fi_lock);
2730         for (i = 0; i < fip->fi_nfiles; i++) {
2731                 UF_ENTER(ufp, fip, i);
2732                 if ((fp = ufp->uf_file) != NULL) {
2733                         if ((stp = fp->f_vnode->v_stream) != NULL) {
2734                                 PH_ENTER(&stp->sd_pollist);
2735                                 pdp = stp->sd_pollist.ph_list;
2736                                 while (pdp) {
2737                                         ASSERT(pdp->pd_thread != curthread);
2738                                         pdp = pdp->pd_next;
2739                                 }
2740                                 PH_EXIT(&stp->sd_pollist);
2741                         }
2742                 }
2743                 UF_EXIT(ufp);
2744         }
2745         mutex_exit(&fip->fi_lock);
2746 }
2747 
2748 /*
2749  * For a resolved pcacheset poll list, the xref info in the pcache should
2750  * be consistent with that poll list.
2751  */
2752 static int
2753 pollcheckxref(pollstate_t *ps, int cacheindex)
2754 {
2755         pollfd_t *pollfdp = ps->ps_pcacheset[cacheindex].pcs_pollfd;
2756         pollcache_t *pcp = ps->ps_pcache;
2757         polldat_t *pdp;
2758         int     i;
2759         xref_t  *refp;
2760 
2761         for (i = 0; i < ps->ps_pcacheset[cacheindex].pcs_nfds; i++) {
2762                 if (pollfdp[i].fd < 0) {
2763                         continue;
2764                 }
2765                 pdp = pcache_lookup_fd(pcp, pollfdp[i].fd);
2766                 ASSERT(pdp != NULL);
2767                 ASSERT(pdp->pd_ref != NULL);
2768                 refp = &pdp->pd_ref[cacheindex];
2769                 if (refp->xf_position >= 0) {
2770                         ASSERT(refp->xf_refcnt >= 1);
2771                         ASSERT(pollfdp[refp->xf_position].fd == pdp->pd_fd);
2772                         if (refp->xf_refcnt > 1) {
2773                                 int     j;
2774                                 int     count = 0;
2775 
2776                                 for (j = refp->xf_position;
2777                                     j < ps->ps_pcacheset[cacheindex].pcs_nfds;
2778                                     j++) {
2779                                         if (pollfdp[j].fd == pdp->pd_fd) {
2780                                                 count++;
2781                                         }
2782                                 }
2783                                 ASSERT(count == refp->xf_refcnt);
2784                         }
2785                 }
2786         }
2787         return (1);
2788 }
2789 
2790 /*
2791  * For every cached pollfd, its polldat struct should be consistent with
2792  * what is in the pcacheset lists.
2793  */
2794 static void
2795 checkpolldat(pollstate_t *ps)
2796 {
2797         pollcache_t     *pcp = ps->ps_pcache;
2798         polldat_t       **hashtbl;
2799         int             i;
2800 
2801         hashtbl = pcp->pc_hash;
2802         for (i = 0; i < pcp->pc_hashsize; i++) {
2803                 polldat_t       *pdp;
2804 
2805                 for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) {
2806                         ASSERT(pdp->pd_ref != NULL);
2807                         if (pdp->pd_count > 0) {
2808                                 xref_t          *refp;
2809                                 int             j;
2810                                 pollcacheset_t  *pcsp;
2811                                 pollfd_t        *pollfd;
2812 
2813                                 for (j = 0; j < ps->ps_nsets; j++) {
2814                                         refp = &pdp->pd_ref[j];
2815                                         if (refp->xf_refcnt > 0) {
2816                                                 pcsp = &ps->ps_pcacheset[j];
2817                                 ASSERT(refp->xf_position < pcsp->pcs_nfds);
2818                                                 pollfd = pcsp->pcs_pollfd;
2819                         ASSERT(pdp->pd_fd == pollfd[refp->xf_position].fd);
2820                                         }
2821                                 }
2822                         }
2823                 }
2824         }
2825 }
2826 
2827 /*
2828  * every wfd element on ph_list must have a corresponding fpollinfo on the
2829  * uf_fpollinfo list. This is a variation of infpollinfo() w/o holding locks.
2830  */
2831 void
2832 checkwfdlist(vnode_t *vp, fpollinfo_t *fpip)
2833 {
2834         stdata_t *stp;
2835         polldat_t *pdp;
2836         fpollinfo_t *fpip2;
2837 
2838         if ((stp = vp->v_stream) == NULL) {
2839                 return;
2840         }
2841         PH_ENTER(&stp->sd_pollist);
2842         for (pdp = stp->sd_pollist.ph_list; pdp; pdp = pdp->pd_next) {
2843                 if (pdp->pd_thread != NULL &&
2844                     pdp->pd_thread->t_procp == curthread->t_procp) {
2845                         for (fpip2 = fpip; fpip2; fpip2 = fpip2->fp_next) {
2846                                 if (pdp->pd_thread == fpip2->fp_thread) {
2847                                         break;
2848                                 }
2849                         }
2850                         ASSERT(fpip2 != NULL);
2851                 }
2852         }
2853         PH_EXIT(&stp->sd_pollist);
2854 }
2855 
2856 /*
2857  * For each cached fd whose bit is not set in the bitmap, its revents field
2858  * in the current poll list should be 0.
2859  */
2860 static int
2861 pollcheckrevents(pollstate_t *ps, int begin, int end, int cacheindex)
2862 {
2863         pollcache_t     *pcp = ps->ps_pcache;
2864         pollfd_t        *pollfdp = ps->ps_pollfd;
2865         int             i;
2866 
2867         for (i = begin; i < end; i++) {
2868                 polldat_t       *pdp;
2869 
2870                 ASSERT(!BT_TEST(pcp->pc_bitmap, i));
2871                 pdp = pcache_lookup_fd(pcp, i);
2872                 if (pdp && pdp->pd_fp != NULL) {
2873                         xref_t *refp;
2874                         int entry;
2875 
2876                         ASSERT(pdp->pd_ref != NULL);
2877                         refp = &pdp->pd_ref[cacheindex];
2878                         if (refp->xf_refcnt == 0) {
2879                                 continue;
2880                         }
2881                         entry = refp->xf_position;
2882                         ASSERT(entry >= 0);
2883                         ASSERT(pollfdp[entry].revents == 0);
2884                         if (refp->xf_refcnt > 1) {
2885                                 int j;
2886 
2887                                 for (j = entry + 1; j < ps->ps_nfds; j++) {
2888                                         if (pollfdp[j].fd == i) {
2889                                                 ASSERT(pollfdp[j].revents == 0);
2890                                         }
2891                                 }
2892                         }
2893                 }
2894         }
2895         return (1);
2896 }
2897 
2898 #endif  /* DEBUG */
2899 
2900 pollcache_t *
2901 pcache_alloc()
2902 {
2903         return (kmem_zalloc(sizeof (pollcache_t), KM_SLEEP));
2904 }
2905 
2906 void
2907 pcache_create(pollcache_t *pcp, nfds_t nfds)
2908 {
2909         size_t  mapsize;
2910 
2911         /*
2912          * allocate enough bits for the poll fd list
2913          */
2914         if ((mapsize = POLLMAPCHUNK) <= nfds) {
2915                 mapsize = (nfds + POLLMAPCHUNK - 1) & ~(POLLMAPCHUNK - 1);
2916         }
2917         pcp->pc_bitmap = kmem_zalloc((mapsize / BT_NBIPUL) * sizeof (ulong_t),
2918             KM_SLEEP);
2919         pcp->pc_mapsize = mapsize;
2920         /*
2921  * The hash size is at least POLLHASHCHUNKSZ. If the user polls a large
2922  * number of fds to start with, allocate a bigger hash table (to the
2923          * nearest multiple of POLLHASHCHUNKSZ) because dynamically growing a
2924          * hash table is expensive.
2925          */
2926         if (nfds < POLLHASHCHUNKSZ) {
2927                 pcp->pc_hashsize = POLLHASHCHUNKSZ;
2928         } else {
2929                 pcp->pc_hashsize = (nfds + POLLHASHCHUNKSZ - 1) &
2930                     ~(POLLHASHCHUNKSZ - 1);
2931         }
2932         pcp->pc_hash = kmem_zalloc(pcp->pc_hashsize * sizeof (polldat_t *),
2933             KM_SLEEP);
2934 }
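
     /*
      * A sizing example (illustrative; assumes POLLMAPCHUNK is 2048 and
      * POLLHASHCHUNKSZ is 256, their values at the time of writing): for
      * nfds = 3000, the bitmap is rounded up to 4096 bits
      * ((3000 + 2047) & ~2047) and the hash table to 3072 buckets
      * ((3000 + 255) & ~255). Both roundings rely on the chunk sizes
      * being powers of two.
      */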
2935 
2936 void
2937 pcache_destroy(pollcache_t *pcp)
2938 {
2939         polldat_t       **hashtbl;
2940         int i;
2941 
2942         hashtbl = pcp->pc_hash;
2943         for (i = 0; i < pcp->pc_hashsize; i++) {
2944                 if (hashtbl[i] != NULL) {
2945                         polldat_t *pdp, *pdp2;
2946 
2947                         pdp = hashtbl[i];
2948                         while (pdp != NULL) {
2949                                 pdp2 = pdp->pd_hashnext;
2950                                 if (pdp->pd_ref != NULL) {
2951                                         kmem_free(pdp->pd_ref, sizeof (xref_t) *
2952                                             pdp->pd_nsets);
2953                                 }
2954                                 kmem_free(pdp, sizeof (polldat_t));
2955                                 pdp = pdp2;
2956                                 pcp->pc_fdcount--;
2957                         }
2958                 }
2959         }
2960         ASSERT(pcp->pc_fdcount == 0);
2961         kmem_free(pcp->pc_hash, sizeof (polldat_t *) * pcp->pc_hashsize);
2962         kmem_free(pcp->pc_bitmap,
2963             sizeof (ulong_t) * (pcp->pc_mapsize/BT_NBIPUL));
2964         mutex_destroy(&pcp->pc_no_exit);
2965         mutex_destroy(&pcp->pc_lock);
2966         cv_destroy(&pcp->pc_cv);
2967         cv_destroy(&pcp->pc_busy_cv);
2968         kmem_free(pcp, sizeof (pollcache_t));
2969 }
2970 
2971 pollcacheset_t *
2972 pcacheset_create(int nsets)
2973 {
2974         return (kmem_zalloc(sizeof (pollcacheset_t) * nsets, KM_SLEEP));
2975 }
2976 
2977 void
2978 pcacheset_destroy(pollcacheset_t *pcsp, int nsets)
2979 {
2980         int i;
2981 
2982         for (i = 0; i < nsets; i++) {
2983                 if (pcsp[i].pcs_pollfd != NULL) {
2984                         kmem_free(pcsp[i].pcs_pollfd, pcsp[i].pcs_nfds *
2985                             sizeof (pollfd_t));
2986                 }
2987         }
2988         kmem_free(pcsp, sizeof (pollcacheset_t) * nsets);
2989 }
2990 
2991 /*
2992  * Check each duplicated poll fd in the poll list. It may be necessary to
2993  * VOP_POLL the same fd again using different poll events. getf() has been
2994  * done by the caller. This routine returns 0 if it can successfully process
2995  * the entire poll fd list. It returns -1 if the underlying vnode has changed
2996  * during a VOP_POLL, in which case the caller has to repoll. It returns a
2997  * positive value if VOP_POLL failed.
2998  */
2999 static int
3000 plist_chkdupfd(file_t *fp, polldat_t *pdp, pollstate_t *psp, pollfd_t *pollfdp,
3001     int entry, int *fdcntp)
3002 {
3003         int     i;
3004         int     fd;
3005         nfds_t  nfds = psp->ps_nfds;
3006 
3007         fd = pollfdp[entry].fd;
3008         for (i = entry + 1; i < nfds; i++) {
3009                 if (pollfdp[i].fd == fd) {
3010                         if (pollfdp[i].events == pollfdp[entry].events) {
3011                                 if ((pollfdp[i].revents =
3012                                     pollfdp[entry].revents) != 0) {
3013                                         (*fdcntp)++;
3014                                 }
3015                         } else {
3017                                 int     error;
3018                                 pollhead_t *php;
3019                                 pollcache_t *pcp = psp->ps_pcache;
3020 
3021                                 /*
3022                                  * the events are different. VOP_POLL on this
3023                                  * fd so that we don't miss any revents.
3024                                  */
3025                                 php = NULL;
3026                                 ASSERT(curthread->t_pollcache == NULL);
3027                                 error = VOP_POLL(fp->f_vnode,
3028                                     pollfdp[i].events, 0,
3029                                     &pollfdp[i].revents, &php, NULL);
3030                                 if (error) {
3031                                         return (error);
3032                                 }
3033                                 /*
3034                                  * layered devices (e.g. the console driver)
3035                                  * may change the vnode and thus the pollhead
3036                                  * pointer out from underneath us.
3037                                  */
3038                                 if (php != NULL && pdp->pd_php != NULL &&
3039                                     php != pdp->pd_php) {
3040                                         pollhead_delete(pdp->pd_php, pdp);
3041                                         pdp->pd_php = php;
3042                                         pollhead_insert(php, pdp);
3043                                         /*
3044                                          * We could have missed a wakeup on the
3045                                          * new target device. Make sure the new
3046                                          * target gets polled once.
3047                                          */
3048                                         BT_SET(pcp->pc_bitmap, fd);
3049                                         return (-1);
3050                                 }
3051                                 if (pollfdp[i].revents) {
3052                                         (*fdcntp)++;
3053                                 }
3054                         }
3055                 }
3056         }
3057         return (0);
3058 }
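
     /*
      * An illustrative example: given a poll list {fd=5/POLLIN,
      * fd=5/POLLIN, fd=5/POLLOUT} with entry == 0, the first duplicate
      * simply inherits entry 0's revents, while the POLLOUT duplicate
      * gets its own VOP_POLL so that a write-side-only event is not
      * missed. If that VOP_POLL reveals that the pollhead has moved (a
      * layered device), the -1 return makes the caller repoll, with fd
      * 5's bit set in the bitmap so the new target is polled once.
      */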