Print this page
OS-5566 ppoll timeout calculation can overflow
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Alex Wilson <alex.wilson@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
OS-4656 nested epoll does not mimic Linux behavior
Reviewed by: Bryan Cantrill <bryan@joyent.com>
OS-5162 poll/select yield improper EINTR when nfds and timeout are 0
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Joshua M. Clulow <jmc@joyent.com>
OS-4830 lxbrand convert select/poll to IKE
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>


  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
  28 /*        All Rights Reserved   */
  29 
  30 /*
  31  * Copyright (c) 2012 by Delphix. All rights reserved.
  32  * Copyright 2015, Joyent, Inc.
  33  */
  34 
  35 /*
  36  * Portions of this source code were derived from Berkeley 4.3 BSD
  37  * under license from the Regents of the University of California.
  38  */
  39 
  40 #include <sys/param.h>
  41 #include <sys/isa_defs.h>
  42 #include <sys/types.h>
  43 #include <sys/sysmacros.h>
  44 #include <sys/user.h>
  45 #include <sys/systm.h>
  46 #include <sys/errno.h>
  47 #include <sys/time.h>
  48 #include <sys/vnode.h>
  49 #include <sys/file.h>
  50 #include <sys/mode.h>
  51 #include <sys/proc.h>
  52 #include <sys/uio.h>


 300 
 301         mutex_enter(&pcp->pc_lock);
 302 }
 303 
 304 /* ARGSUSED */
 305 int
 306 polllock(pollhead_t *php, kmutex_t *lp)
 307 {
 308         if (mutex_tryenter(lp) == 0) {
 309                 int state;
 310 
 311                 if (pollunlock(&state) != 0) {
 312                         return (-1);
 313                 }
 314                 mutex_enter(lp);
 315                 pollrelock(state);
 316         }
 317         return (0);
 318 }
 319 
 320 static int
 321 poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
 322 {











































 323         kthread_t *t = curthread;
 324         klwp_t *lwp = ttolwp(t);
 325         proc_t *p = ttoproc(t);
 326         int fdcnt = 0;
 327         int i;
 328         hrtime_t deadline; /* hrtime value when we want to return */
 329         pollfd_t *pollfdp;
 330         pollstate_t *ps;
 331         pollcache_t *pcp;
 332         int error = 0;
 333         nfds_t old_nfds;
 334         int cacheindex = 0;     /* which cache set is used */
 335 
 336         /*
 337          * Determine the precise future time of the requested timeout, if any.
 338          */
 339         if (tsp == NULL) {
 340                 deadline = -1;
 341         } else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) {
 342                 deadline = 0;



 343         } else {










 344                 /* They must wait at least a tick. */
 345                 deadline = ((hrtime_t)tsp->tv_sec * NANOSEC) + tsp->tv_nsec;
 346                 deadline = MAX(deadline, nsec_per_tick);
 347                 deadline += gethrtime();
 348         }
 349 
 350         /*
 351          * Reset our signal mask, if requested.


 352          */
 353         if (ksetp != NULL) {
 354                 mutex_enter(&p->p_lock);
 355                 schedctl_finish_sigblock(t);
 356                 lwp->lwp_sigoldmask = t->t_hold;
 357                 t->t_hold = *ksetp;
 358                 t->t_flag |= T_TOMASK;
 359                 /*
 360                  * Call cv_reltimedwait_sig() just to check for signals.
 361                  * We will return immediately with either 0 or -1.
 362                  */
 363                 if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0,
 364                     TR_CLOCK_TICK)) {
 365                         mutex_exit(&p->p_lock);
 366                         error = EINTR;
 367                         goto pollout;
 368                 }
 369                 mutex_exit(&p->p_lock);
 370         }
 371 
 372         /*
 373          * Check to see if the caller just wants to use poll() as a timeout.
 374          * If yes then bypass all the other stuff and make him sleep.
 375          */
 376         if (nfds == 0) {

 377                 /*
 378                  * Sleep until we have passed the requested future
 379                  * time or until interrupted by a signal.
 380                  * Do not check for signals if we do not want to wait.
 381                  */
 382                 if (deadline != 0) {
 383                         mutex_enter(&t->t_delay_lock);
 384                         while ((error = cv_timedwait_sig_hrtime(&t->t_delay_cv,
 385                             &t->t_delay_lock, deadline)) > 0)
 386                                 continue;
 387                         mutex_exit(&t->t_delay_lock);
 388                         error = (error == 0) ? EINTR : 0;
 389                 }
 390                 goto pollout;
 391         }
 392 
 393         if (nfds > p->p_fno_ctl) {
 394                 mutex_enter(&p->p_lock);
 395                 (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
 396                     p->p_rctls, p, RCA_SAFE);
 397                 mutex_exit(&p->p_lock);
 398                 error = EINVAL;
 399                 goto pollout;
 400         }
 401 
 402         /*
 403          * Need to allocate memory for pollstate before anything because
 404          * the mutex and cv are created in this space
 405          */
 406         ps = pollstate_create();
 407 
 408         if (ps->ps_pcache == NULL)
 409                 ps->ps_pcache = pcache_alloc();
 410         pcp = ps->ps_pcache;
 411 
 412         /*
 413          * NOTE: for performance, buffers are saved across poll() calls.
 414          * The theory is that if a process polls heavily, it tends to poll
 415          * on the same set of descriptors.  Therefore, we only reallocate
 416          * buffers when nfds changes.  There is no hysteresis control,
 417          * because there is no data to suggest that this is necessary;
 418          * the penalty of reallocating is not *that* great in any event.
 419          */
 420         old_nfds = ps->ps_nfds;
 421         if (nfds != old_nfds) {
 422 
 423                 kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t));
 424                 pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP);
 425                 ps->ps_pollfd = pollfdp;
 426                 ps->ps_nfds = nfds;
 427         }
 428 
 429         pollfdp = ps->ps_pollfd;
 430         if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) {
 431                 error = EFAULT;
 432                 goto pollout;
 433         }
 434 
 435         if (fds == NULL) {
 436                 /*
 437                  * If the process has page 0 mapped, then the copyin() above
 438                  * will succeed even if fds is NULL.  However, our cached
 439                  * poll lists are keyed by the address of the passed-in fds
 440                  * structure, and we use the value NULL to indicate an unused
 441                  * poll cache list entry.  As such, we elect not to support
 442                  * NULL as a valid (user) memory address and fail the poll()
 443                  * call.
 444                  */
 445                 error = EINVAL;
 446                 goto pollout;
 447         }
 448 
 449         /*
 450          * If this thread polls for the first time, allocate ALL poll
 451          * cache data structures and cache the poll fd list. This
 452          * allocation is delayed till now because lwp's polling 0 fd
 453          * (i.e. using poll as timeout()) don't need this memory.
 454          */
 455         mutex_enter(&ps->ps_lock);
 456         pcp = ps->ps_pcache;
 457         ASSERT(pcp != NULL);
 458         if (pcp->pc_bitmap == NULL) {
 459                 pcache_create(pcp, nfds);
 460                 /*
 461                  * poll and cache this poll fd list in ps_pcacheset[0].
 462                  */
 463                 error = pcacheset_cache_list(ps, fds, &fdcnt, cacheindex);
 464                 if (fdcnt || error) {
 465                         mutex_exit(&ps->ps_lock);
 466                         goto pollout;
 467                 }
 468         } else {
 469                 pollcacheset_t  *pcset = ps->ps_pcacheset;
 470 
 471                 /*
 472                  * Not first time polling. Select a cached poll list by
 473                  * matching user pollfd list buffer address.
 474                  */
 475                 for (cacheindex = 0; cacheindex < ps->ps_nsets; cacheindex++) {
 476                         if (pcset[cacheindex].pcs_usradr == (uintptr_t)fds) {
 477                                 if ((++pcset[cacheindex].pcs_count) == 0) {
 478                                         /*
 479                                          * counter is wrapping around.
 480                                          */
 481                                         pcacheset_reset_count(ps, cacheindex);
 482                                 }
 483                                 /*
 484                                  * examine and resolve possible
 485                                  * difference of the current poll
 486                                  * list and previously cached one.
 487                                  * If there is an error during resolve(),
 488                                  * the callee will guarantee the consistency
 489                                  * of cached poll list and cache content.
 490                                  */
 491                                 error = pcacheset_resolve(ps, nfds, &fdcnt,
 492                                     cacheindex);
 493                                 if (error) {
 494                                         mutex_exit(&ps->ps_lock);
 495                                         goto pollout;
 496                                 }
 497                                 break;
 498                         }
 499 
 500                         /*
 501          * Note that pcs_usradr field of a used entry won't be
 502                          * NULL because it stores the address of passed-in fds,
 503                          * and NULL fds will not be cached (Then it is either
 504                          * the special timeout case when nfds is 0 or it returns
 505                          * failure directly).
 506                          */
 507                         if (pcset[cacheindex].pcs_usradr == NULL) {
 508                                 /*
 509                                  * found an unused entry. Use it to cache
 510                                  * this poll list.
 511                                  */
 512                                 error = pcacheset_cache_list(ps, fds, &fdcnt,
 513                                     cacheindex);
 514                                 if (fdcnt || error) {
 515                                         mutex_exit(&ps->ps_lock);
 516                                         goto pollout;
 517                                 }
 518                                 break;
 519                         }
 520                 }
 521                 if (cacheindex == ps->ps_nsets) {
 522                         /*
 523                          * We failed to find a matching cached poll fd list.
 524                          * replace an old list.
 525                          */
 526                         pollstats.polllistmiss.value.ui64++;
 527                         cacheindex = pcacheset_replace(ps);
 528                         ASSERT(cacheindex < ps->ps_nsets);
 529                         pcset[cacheindex].pcs_usradr = (uintptr_t)fds;
 530                         error = pcacheset_resolve(ps, nfds, &fdcnt, cacheindex);
 531                         if (error) {
 532                                 mutex_exit(&ps->ps_lock);
 533                                 goto pollout;
 534                         }
 535                 }
 536         }
 537 
 538         /*
 539          * Always scan the bitmap with the lock on the pollcache held.
 540          * This is to make sure that a wakeup does not come undetected.
 541          * If the lock is not held, a pollwakeup could have come for an
 542          * fd we already checked but before this thread sleeps, in which
 543          * case the wakeup is missed. Now we hold the pcache lock and
 544          * check the bitmap again. This will prevent wakeup from happening
 545          * while we hold pcache lock since pollwakeup() will also lock
 546          * the pcache before updating poll bitmap.
 547          */
 548         mutex_enter(&pcp->pc_lock);
 549         for (;;) {
 550                 pcp->pc_flag = 0;
 551                 error = pcache_poll(pollfdp, ps, nfds, &fdcnt, cacheindex);
 552                 if (fdcnt || error) {
 553                         mutex_exit(&pcp->pc_lock);
 554                         mutex_exit(&ps->ps_lock);
 555                         break;
 556                 }
 557 
 558                 /*
 559                  * If PC_POLLWAKE is set, a pollwakeup() was performed on
 560                  * one of the file descriptors.  This can happen only if
 561                  * one of the VOP_POLL() functions dropped pcp->pc_lock.
 562                  * The only current cases of this is in procfs (prpoll())
 563                  * and STREAMS (strpoll()).
 564                  */
 565                 if (pcp->pc_flag & PC_POLLWAKE)
 566                         continue;
 567 
 568                 /*
 569                  * If you get here, the poll of fds was unsuccessful.
 570                  * Wait until some fd becomes readable, writable, or gets
 571                  * an exception, or until a signal or a timeout occurs.
 572                  * Do not check for signals if we have a zero timeout.


 578                         error = cv_timedwait_sig_hrtime(&pcp->pc_cv,
 579                             &pcp->pc_lock, deadline);
 580                 }
 581                 mutex_exit(&pcp->pc_lock);
 582                 /*
 583                  * If we have received a signal or timed out
 584                  * then break out and return.
 585                  */
 586                 if (error <= 0) {
 587                         error = (error == 0) ? EINTR : 0;
 588                         break;
 589                 }
 590                 /*
 591                  * We have not received a signal or timed out.
 592                  * Continue around and poll fds again.
 593                  */
 594                 mutex_enter(&ps->ps_lock);
 595                 mutex_enter(&pcp->pc_lock);
 596         }
 597 








































































































 598 pollout:
 599         /*
 600          * If we changed the signal mask but we received
 601          * no signal then restore the signal mask.
 602          * Otherwise psig() will deal with the signal mask.
 603          */
 604         if (ksetp != NULL) {
 605                 mutex_enter(&p->p_lock);
 606                 if (lwp->lwp_cursig == 0) {
 607                         t->t_hold = lwp->lwp_sigoldmask;
 608                         t->t_flag &= ~T_TOMASK;
 609                 }
 610                 mutex_exit(&p->p_lock);
 611         }
 612 
 613         if (error)
 614                 return (set_errno(error));
 615 
 616         /*
 617          * Copy out the events and return the fdcnt to the user.
 618          */
 619         if (nfds != 0 &&
 620             copyout(pollfdp, fds, nfds * sizeof (pollfd_t)))
 621                 return (set_errno(EFAULT));
 622 
 623 #ifdef DEBUG
 624         /*
 625          * Another sanity check:
 626          */
 627         if (fdcnt) {
 628                 int     reventcnt = 0;
 629 
 630                 for (i = 0; i < nfds; i++) {
 631                         if (pollfdp[i].fd < 0) {
 632                                 ASSERT(pollfdp[i].revents == 0);
 633                                 continue;
 634                         }
 635                         if (pollfdp[i].revents) {
 636                                 reventcnt++;
 637                         }
 638                 }
 639                 ASSERT(fdcnt == reventcnt);
 640         } else {


 641                 for (i = 0; i < nfds; i++) {
 642                         ASSERT(pollfdp[i].revents == 0);
 643                 }
 644         }
 645 #endif  /* DEBUG */
 646 
 647         return (fdcnt);
 648 }
 649 
 650 /*
 651  * This is the system call trap that poll(),
 652  * select() and pselect() are built upon.
 653  * It is a private interface between libc and the kernel.
 654  */
 655 int
 656 pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp)
 657 {
 658         timespec_t ts;
 659         timespec_t *tsp;
 660         sigset_t set;
 661         k_sigset_t kset;
 662         k_sigset_t *ksetp;
 663         model_t datamodel = get_udatamodel();
 664 
 665         if (timeoutp == NULL)
 666                 tsp = NULL;
 667         else {
 668                 if (datamodel == DATAMODEL_NATIVE) {
 669                         if (copyin(timeoutp, &ts, sizeof (ts)))
 670                                 return (set_errno(EFAULT));
 671                 } else {
 672                         timespec32_t ts32;
 673 
 674                         if (copyin(timeoutp, &ts32, sizeof (ts32)))
 675                                 return (set_errno(EFAULT));
 676                         TIMESPEC32_TO_TIMESPEC(&ts, &ts32)
 677                 }
 678 
 679                 if (itimerspecfix(&ts))
 680                         return (set_errno(EINVAL));
 681                 tsp = &ts;
 682         }
 683 
 684         if (setp == NULL)
 685                 ksetp = NULL;
 686         else {
 687                 if (copyin(setp, &set, sizeof (set)))
 688                         return (set_errno(EFAULT));
 689                 sigutok(&set, &kset);
 690                 ksetp = &kset;
 691         }
 692 
 693         return (poll_common(fds, nfds, tsp, ksetp));
 694 }
 695 
 696 /*
 697  * Clean up any state left around by poll(2). Called when a thread exits.
 698  */
 699 void
 700 pollcleanup()
 701 {
 702         pollstate_t *ps = curthread->t_pollstate;
 703         pollcache_t *pcp;
 704 
 705         if (ps == NULL)
 706                 return;
 707         pcp = ps->ps_pcache;
 708         /*
 709          * free up all cached poll fds
 710          */
 711         if (pcp == NULL) {
 712                 /* this pollstate is used by /dev/poll */
 713                 goto pollcleanout;
 714         }
 715 




  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26 
  27 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
  28 /*        All Rights Reserved   */
  29 
  30 /*
  31  * Copyright (c) 2012 by Delphix. All rights reserved.
  32  * Copyright 2016, Joyent, Inc.
  33  */
  34 
  35 /*
  36  * Portions of this source code were derived from Berkeley 4.3 BSD
  37  * under license from the Regents of the University of California.
  38  */
  39 
  40 #include <sys/param.h>
  41 #include <sys/isa_defs.h>
  42 #include <sys/types.h>
  43 #include <sys/sysmacros.h>
  44 #include <sys/user.h>
  45 #include <sys/systm.h>
  46 #include <sys/errno.h>
  47 #include <sys/time.h>
  48 #include <sys/vnode.h>
  49 #include <sys/file.h>
  50 #include <sys/mode.h>
  51 #include <sys/proc.h>
  52 #include <sys/uio.h>


 300 
 301         mutex_enter(&pcp->pc_lock);
 302 }
 303 
/*
 * Take `lp` on behalf of a poll operation.
 *
 * If `lp` is immediately available it is simply taken.  Otherwise this
 * thread's poll lock state is dropped via pollunlock() (saved in
 * `state`) before blocking on `lp`, and restored with pollrelock()
 * once `lp` is held.
 * NOTE(review): the precise locks released/restored are defined by
 * pollunlock()/pollrelock() elsewhere in this file — confirm there.
 *
 * Returns 0 with `lp` held, or -1 if pollunlock() failed, in which
 * case `lp` has NOT been acquired.
 */
/* ARGSUSED */
int
polllock(pollhead_t *php, kmutex_t *lp)
{
	if (mutex_tryenter(lp) == 0) {
		int state;

		if (pollunlock(&state) != 0) {
			/* Could not shed poll locks; fail without taking lp. */
			return (-1);
		}
		mutex_enter(lp);
		pollrelock(state);
	}
	return (0);
}
 319 
/*
 * Copy the caller's pollfd array into the pollstate's kernel buffer,
 * (re)allocating that buffer only when `nfds` differs from the cached
 * size.  On success ps->ps_pollfd holds the nfds entries and 0 is
 * returned; otherwise an errno value is returned.
 */
int
poll_copyin(pollstate_t *ps, pollfd_t *fds, nfds_t nfds)
{
	pollfd_t *pollfdp;
	nfds_t old_nfds;

	/*
	 * NOTE: for performance, buffers are saved across poll() calls.
	 * The theory is that if a process polls heavily, it tends to poll
	 * on the same set of descriptors.  Therefore, we only reallocate
	 * buffers when nfds changes.  There is no hysteresis control,
	 * because there is no data to suggest that this is necessary;
	 * the penalty of reallocating is not *that* great in any event.
	 */
	old_nfds = ps->ps_nfds;
	if (nfds != old_nfds) {
		/* kmem_free of a NULL/0-sized cached buffer: first call case. */
		kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t));
		pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP);
		ps->ps_pollfd = pollfdp;
		ps->ps_nfds = nfds;
	}

	pollfdp = ps->ps_pollfd;
	if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) {
		return (EFAULT);
	}

	if (fds == NULL) {
		/*
		 * If the process has page 0 mapped, then the copyin() above
		 * will succeed even if fds is NULL.  However, our cached
		 * poll lists are keyed by the address of the passed-in fds
		 * structure, and we use the value NULL to indicate an unused
		 * poll cache list entry.  As such, we elect not to support
		 * NULL as a valid (user) memory address and fail the poll()
		 * call.
		 *
		 * NOTE(review): the pre-refactor code returned EINVAL for
		 * this case; confirm the change to EFAULT is intentional.
		 */
		return (EFAULT);
	}
	return (0);
}
 361 
 362 int
 363 poll_common(pollstate_t *ps, pollfd_t *fds, nfds_t nfds, timespec_t *tsp,
 364     int *fdcnt)
 365 {
 366         kthread_t *t = curthread;




 367         hrtime_t deadline; /* hrtime value when we want to return */
 368         pollfd_t *pollfdp;

 369         pollcache_t *pcp;
 370         int error = 0;

 371         int cacheindex = 0;     /* which cache set is used */
 372 
 373         /*
 374          * Determine the precise future time of the requested timeout, if any.
 375          */
 376         if (tsp == NULL) {
 377                 deadline = -1;
 378         } else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) {
 379                 deadline = 0;
 380         } else if (tsp->tv_sec >= HRTIME_MAX/NANOSEC) {
 381                 /* Use an indefinite timeout if tv_sec would cause overflow */
 382                 deadline = -1;
 383         } else {
 384                 /*
 385                  * The above check, when combined with the protections offered
 386                  * by itimerspecfix (ensuring that neither field is negative
 387                  * and that tv_nsec represents less than a whole second), will
 388                  * prevent overflow during the conversion from timespec_t to
 389                  * uhrtime_t.
 390                  */
 391                 uhrtime_t utime = tsp->tv_sec * NANOSEC;
 392                 utime += tsp->tv_nsec;
 393 
 394                 /* They must wait at least a tick. */
 395                 utime = MAX(utime, nsec_per_tick);



 396 
 397                 /*
 398                  * Since utime has an upper bound of HRTIME_MAX, adding the
 399                  * gethrtime() result cannot incur an overflow as the unsigned
 400                  * type has an adequate bound.
 401                  */
 402                 utime += (uhrtime_t)gethrtime();
 403                 if (utime > HRTIME_MAX) {
 404                         deadline = -1;
 405                 } else {
 406                         deadline = (hrtime_t)utime;










 407                 }

 408         }
 409 
 410         /*
 411          * Check to see if the caller just wants to use poll() as a timeout.
 412          * If yes then bypass all the other stuff and make him sleep.
 413          */
 414         if (nfds == 0) {
 415                 *fdcnt = 0;
 416                 /*
 417                  * Sleep until we have passed the requested future
 418                  * time or until interrupted by a signal.
 419                  * Do not check for signals if we do not want to wait.
 420                  */
 421                 if (deadline != 0) {
 422                         mutex_enter(&t->t_delay_lock);
 423                         while ((error = cv_timedwait_sig_hrtime(&t->t_delay_cv,
 424                             &t->t_delay_lock, deadline)) > 0)
 425                                 continue;
 426                         mutex_exit(&t->t_delay_lock);
 427                         return ((error == 0) ? EINTR : 0);
 428                 }
 429                 return (0);
 430         }
 431 
 432         VERIFY(ps != NULL);



































 433         pollfdp = ps->ps_pollfd;
 434         VERIFY(pollfdp != NULL);



 435 

 436         /*













 437          * If this thread polls for the first time, allocate ALL poll
 438          * cache data structures and cache the poll fd list. This
 439          * allocation is delayed till now because lwp's polling 0 fd
 440          * (i.e. using poll as timeout()) don't need this memory.
 441          */
 442         mutex_enter(&ps->ps_lock);
 443         pcp = ps->ps_pcache;
 444         ASSERT(pcp != NULL);
 445         if (pcp->pc_bitmap == NULL) {
 446                 pcache_create(pcp, nfds);
 447                 /*
 448                  * poll and cache this poll fd list in ps_pcacheset[0].
 449                  */
 450                 error = pcacheset_cache_list(ps, fds, fdcnt, cacheindex);
 451                 if (error || *fdcnt) {
 452                         mutex_exit(&ps->ps_lock);
 453                         return (error);
 454                 }
 455         } else {
 456                 pollcacheset_t  *pcset = ps->ps_pcacheset;
 457 
 458                 /*
 459                  * Not first time polling. Select a cached poll list by
 460                  * matching user pollfd list buffer address.
 461                  */
 462                 for (cacheindex = 0; cacheindex < ps->ps_nsets; cacheindex++) {
 463                         if (pcset[cacheindex].pcs_usradr == (uintptr_t)fds) {
 464                                 if ((++pcset[cacheindex].pcs_count) == 0) {
 465                                         /*
 466                                          * counter is wrapping around.
 467                                          */
 468                                         pcacheset_reset_count(ps, cacheindex);
 469                                 }
 470                                 /*
 471                                  * examine and resolve possible
 472                                  * difference of the current poll
 473                                  * list and previously cached one.
 474                                  * If there is an error during resolve(),
 475                                  * the callee will guarantee the consistency
 476                                  * of cached poll list and cache content.
 477                                  */
 478                                 error = pcacheset_resolve(ps, nfds, fdcnt,
 479                                     cacheindex);
 480                                 if (error) {
 481                                         mutex_exit(&ps->ps_lock);
 482                                         return (error);
 483                                 }
 484                                 break;
 485                         }
 486 
 487                         /*
 488                          * Note that pcs_usradr field of an used entry won't be
 489                          * NULL because it stores the address of passed-in fds,
 490                          * and NULL fds will not be cached (Then it is either
 491                          * the special timeout case when nfds is 0 or it returns
 492                          * failure directly).
 493                          */
 494                         if (pcset[cacheindex].pcs_usradr == NULL) {
 495                                 /*
 496                                  * found an unused entry. Use it to cache
 497                                  * this poll list.
 498                                  */
 499                                 error = pcacheset_cache_list(ps, fds, fdcnt,
 500                                     cacheindex);
 501                                 if (error || *fdcnt) {
 502                                         mutex_exit(&ps->ps_lock);
 503                                         return (error);
 504                                 }
 505                                 break;
 506                         }
 507                 }
 508                 if (cacheindex == ps->ps_nsets) {
 509                         /*
 510                          * We failed to find a matching cached poll fd list.
 511                          * replace an old list.
 512                          */
 513                         pollstats.polllistmiss.value.ui64++;
 514                         cacheindex = pcacheset_replace(ps);
 515                         ASSERT(cacheindex < ps->ps_nsets);
 516                         pcset[cacheindex].pcs_usradr = (uintptr_t)fds;
 517                         error = pcacheset_resolve(ps, nfds, fdcnt, cacheindex);
 518                         if (error) {
 519                                 mutex_exit(&ps->ps_lock);
 520                                 return (error);
 521                         }
 522                 }
 523         }
 524 
 525         /*
 526          * Always scan the bitmap with the lock on the pollcache held.
 527          * This is to make sure that a wakeup does not come undetected.
 528          * If the lock is not held, a pollwakeup could have come for an
 529          * fd we already checked but before this thread sleeps, in which
 530          * case the wakeup is missed. Now we hold the pcache lock and
 531          * check the bitmap again. This will prevent wakeup from happening
 532          * while we hold pcache lock since pollwakeup() will also lock
 533          * the pcache before updating poll bitmap.
 534          */
 535         mutex_enter(&pcp->pc_lock);
 536         for (;;) {
 537                 pcp->pc_flag = 0;
 538                 error = pcache_poll(pollfdp, ps, nfds, fdcnt, cacheindex);
 539                 if (error || *fdcnt) {
 540                         mutex_exit(&pcp->pc_lock);
 541                         mutex_exit(&ps->ps_lock);
 542                         break;
 543                 }
 544 
 545                 /*
 546                  * If PC_POLLWAKE is set, a pollwakeup() was performed on
 547                  * one of the file descriptors.  This can happen only if
 548                  * one of the VOP_POLL() functions dropped pcp->pc_lock.
 549                  * The only current cases of this is in procfs (prpoll())
 550                  * and STREAMS (strpoll()).
 551                  */
 552                 if (pcp->pc_flag & PC_POLLWAKE)
 553                         continue;
 554 
 555                 /*
 556                  * If you get here, the poll of fds was unsuccessful.
 557                  * Wait until some fd becomes readable, writable, or gets
 558                  * an exception, or until a signal or a timeout occurs.
 559                  * Do not check for signals if we have a zero timeout.


 565                         error = cv_timedwait_sig_hrtime(&pcp->pc_cv,
 566                             &pcp->pc_lock, deadline);
 567                 }
 568                 mutex_exit(&pcp->pc_lock);
 569                 /*
 570                  * If we have received a signal or timed out
 571                  * then break out and return.
 572                  */
 573                 if (error <= 0) {
 574                         error = (error == 0) ? EINTR : 0;
 575                         break;
 576                 }
 577                 /*
 578                  * We have not received a signal or timed out.
 579                  * Continue around and poll fds again.
 580                  */
 581                 mutex_enter(&ps->ps_lock);
 582                 mutex_enter(&pcp->pc_lock);
 583         }
 584 
 585         return (error);
 586 }
 587 
 588 /*
 589  * This is the system call trap that poll(),
 590  * select() and pselect() are built upon.
 591  * It is a private interface between libc and the kernel.
 592  */
 593 int
 594 pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp)
 595 {
 596         kthread_t *t = curthread;
 597         klwp_t *lwp = ttolwp(t);
 598         proc_t *p = ttoproc(t);
 599         timespec_t ts;
 600         timespec_t *tsp;
 601         k_sigset_t kset;
 602         pollstate_t *ps = NULL;
 603         pollfd_t *pollfdp = NULL;
 604         int error = 0, fdcnt = 0;
 605 
 606         /*
 607          * Copy in timeout
 608          */
 609         if (timeoutp == NULL) {
 610                 tsp = NULL;
 611         } else {
                /*
                 * Non-native (32-bit) callers supply a timespec32_t;
                 * widen it to the native timespec_t before validating.
                 */
 612                 if (get_udatamodel() == DATAMODEL_NATIVE) {
 613                         if (copyin(timeoutp, &ts, sizeof (ts)))
 614                                 return (set_errno(EFAULT));
 615                 } else {
 616                         timespec32_t ts32;
 617 
 618                         if (copyin(timeoutp, &ts32, sizeof (ts32)))
 619                                 return (set_errno(EFAULT));
 620                         TIMESPEC32_TO_TIMESPEC(&ts, &ts32)
 621                 }
 622 
                /*
                 * itimerspecfix() rejects invalid timeout values (the
                 * non-zero return taken here), so poll_common() only ever
                 * sees a sane tsp.
                 */
 623                 if (itimerspecfix(&ts))
 624                         return (set_errno(EINVAL));
 625                 tsp = &ts;
 626         }
 627 
 628         /*
 629          * Copy in and reset signal mask, if requested.
 630          */
 631         if (setp != NULL) {
 632                 sigset_t set;
 633 
 634                 if (copyin(setp, &set, sizeof (set)))
 635                         return (set_errno(EFAULT));
 636                 sigutok(&set, &kset);
 637 
                /*
                 * Swap in the caller's mask under p_lock, remembering the
                 * old one; T_TOMASK tells the signal code to restore
                 * lwp_sigoldmask when a signal is taken.
                 */
 638                 mutex_enter(&p->p_lock);
 639                 schedctl_finish_sigblock(t);
 640                 lwp->lwp_sigoldmask = t->t_hold;
 641                 t->t_hold = kset;
 642                 t->t_flag |= T_TOMASK;
 643                 /*
 644                  * Call cv_reltimedwait_sig() just to check for signals.
 645                  * We will return immediately with either 0 or -1.
 646                  */
 647                 if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0,
 648                     TR_CLOCK_TICK)) {
 649                         mutex_exit(&p->p_lock);
 650                         error = EINTR;
 651                         goto pollout;
 652                 }
 653                 mutex_exit(&p->p_lock);
 654         }
 655 
 656         /*
 657          * Initialize pollstate and copy in pollfd data if present.
 658          * If nfds == 0, we will skip all of the copying and check steps and
 659          * proceed directly into poll_common to process the supplied timeout.
 660          */
 661         if (nfds != 0) {
                /*
                 * Enforce the per-process open-file resource control;
                 * rctl_action() reports the violation before we fail
                 * with EINVAL.
                 */
 662                 if (nfds > p->p_fno_ctl) {
 663                         mutex_enter(&p->p_lock);
 664                         (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
 665                             p->p_rctls, p, RCA_SAFE);
 666                         mutex_exit(&p->p_lock);
 667                         error = EINVAL;
 668                         goto pollout;
 669                 }
 670 
 671                 /*
 672                  * Need to allocate memory for pollstate before anything
 673                  * because the mutex and cv are created in this space
 674                  */
 675                 ps = pollstate_create();
 676                 if (ps->ps_pcache == NULL)
 677                         ps->ps_pcache = pcache_alloc();
 678 
 679                 if ((error = poll_copyin(ps, fds, nfds)) != 0)
 680                         goto pollout;
 681                 pollfdp = ps->ps_pollfd;
 682         }
 683 
 684         /*
 685          * Perform the actual poll.
 686          */
 687         error = poll_common(ps, fds, nfds, tsp, &fdcnt);
 688 
 689 pollout:
 690         /*
 691          * If we changed the signal mask but we received no signal then restore
 692          * the signal mask.  Otherwise psig() will deal with the signal mask.

 693          */
 694         if (setp != NULL) {
 695                 mutex_enter(&p->p_lock);
 696                 if (lwp->lwp_cursig == 0) {
 697                         t->t_hold = lwp->lwp_sigoldmask;
 698                         t->t_flag &= ~T_TOMASK;
 699                 }
 700                 mutex_exit(&p->p_lock);
 701         }
 702 
 703         if (error)
 704                 return (set_errno(error));

 705         /*
 706          * Copy out the events and return the fdcnt to the user.
 707          */
        /*
         * The whole array is written back so the user sees every revents
         * field.  pollfdp is only dereferenced when nfds != 0, in which
         * case poll_copyin() succeeded and set it above.
         */
 708         if (nfds != 0 && copyout(pollfdp, fds, nfds * sizeof (pollfd_t)))

 709                 return (set_errno(EFAULT));
 710 
 711 #ifdef DEBUG
 712         /*
 713          * Another sanity check:
 714          */
        /*
         * DEBUG-only cross-check: the fd count reported by poll_common()
         * must match the number of entries with non-zero revents (entries
         * with fd < 0 are ignored and must report no events).
         */
 715         if (fdcnt) {
 716                 int i, reventcnt = 0;
 717 
 718                 for (i = 0; i < nfds; i++) {
 719                         if (pollfdp[i].fd < 0) {
 720                                 ASSERT(pollfdp[i].revents == 0);
 721                                 continue;
 722                         }
 723                         if (pollfdp[i].revents) {
 724                                 reventcnt++;
 725                         }
 726                 }
 727                 ASSERT(fdcnt == reventcnt);
 728         } else {
 729                 int i;
 730 
 731                 for (i = 0; i < nfds; i++) {
 732                         ASSERT(pollfdp[i].revents == 0);
 733                 }
 734         }
 735 #endif  /* DEBUG */
 736 
 737         return (fdcnt);
 738 }
 739 














































 740 /*
 741  * Clean up any state left around by poll(2). Called when a thread exits.
 742  */
 743 void
 744 pollcleanup()
 745 {
 746         pollstate_t *ps = curthread->t_pollstate;
 747         pollcache_t *pcp;
 748 
 749         if (ps == NULL)
 750                 return;
 751         pcp = ps->ps_pcache;
 752         /*
 753          * free up all cached poll fds
 754          */
 755         if (pcp == NULL) {
 756                 /* this pollstate is used by /dev/poll */
 757                 goto pollcleanout;
 758         }
 759