OS-5566 ppoll timeout calculation can overflow
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Alex Wilson <alex.wilson@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
OS-4656 nested epoll does not mimic Linux behavior
Reviewed by: Bryan Cantrill <bryan@joyent.com>
OS-5162 poll/select yield improper EINTR when nfds and timeout are 0
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Joshua M. Clulow <jmc@joyent.com>
OS-4830 lxbrand convert select/poll to IKE
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>

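For context, the OS-5566 bug can be triggered from userland with nothing more than a very large (but valid) timeout. The sketch below is illustrative only and is not part of this changeset; it uses the Linux-style ppoll() interface (as emulated by lx), with values chosen so that tv_sec * NANOSEC lands near the signed 64-bit limit. Before this fix, adding gethrtime() to that product could wrap the deadline, so a request to sleep for centuries could come back immediately.

#define _GNU_SOURCE
#include <poll.h>
#include <time.h>

int
main(void)
{
	/* ~292 years: valid input, but tv_sec * 10^9 is near INT64_MAX */
	struct timespec ts = { .tv_sec = 9223372036LL, .tv_nsec = 0 };

	/* with the overflow, this could return at once instead of blocking */
	return (ppoll(NULL, 0, &ts, NULL));
}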
*** 27,37 ****
  /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
  /*	  All Rights Reserved	*/
  
  /*
   * Copyright (c) 2012 by Delphix. All rights reserved.
!  * Copyright 2015, Joyent, Inc.
   */
  
  /*
   * Portions of this source code were derived from Berkeley 4.3 BSD
   * under license from the Regents of the University of California.
--- 27,37 ----
  /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
  /*	  All Rights Reserved	*/
  
  /*
   * Copyright (c) 2012 by Delphix. All rights reserved.
!  * Copyright 2016, Joyent, Inc.
   */
  
  /*
   * Portions of this source code were derived from Berkeley 4.3 BSD
   * under license from the Regents of the University of California.
*** 315,381 ****
  		pollrelock(state);
  	}
  	return (0);
  }
  
! static int
! poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
  {
  	kthread_t *t = curthread;
- 	klwp_t *lwp = ttolwp(t);
- 	proc_t *p = ttoproc(t);
- 	int fdcnt = 0;
- 	int i;
  	hrtime_t deadline; /* hrtime value when we want to return */
  	pollfd_t *pollfdp;
- 	pollstate_t *ps;
  	pollcache_t *pcp;
  	int error = 0;
- 	nfds_t old_nfds;
  	int cacheindex = 0;	/* which cache set is used */
  
  	/*
  	 * Determine the precise future time of the requested timeout, if any.
  	 */
  	if (tsp == NULL) {
  		deadline = -1;
  	} else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) {
  		deadline = 0;
  	} else {
  		/* They must wait at least a tick. */
! 		deadline = ((hrtime_t)tsp->tv_sec * NANOSEC) + tsp->tv_nsec;
! 		deadline = MAX(deadline, nsec_per_tick);
! 		deadline += gethrtime();
! 	}
  
  	/*
! 	 * Reset our signal mask, if requested.
  	 */
! 	if (ksetp != NULL) {
! 		mutex_enter(&p->p_lock);
! 		schedctl_finish_sigblock(t);
! 		lwp->lwp_sigoldmask = t->t_hold;
! 		t->t_hold = *ksetp;
! 		t->t_flag |= T_TOMASK;
! 		/*
! 		 * Call cv_reltimedwait_sig() just to check for signals.
! 		 * We will return immediately with either 0 or -1.
! 		 */
! 		if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0,
! 		    TR_CLOCK_TICK)) {
! 			mutex_exit(&p->p_lock);
! 			error = EINTR;
! 			goto pollout;
  		}
- 		mutex_exit(&p->p_lock);
  	}
  
  	/*
! 	 * Check to see if this guy just wants to use poll() as a timeout.
  	 * If yes then bypass all the other stuff and make him sleep.
  	 */
  	if (nfds == 0) {
  		/*
  		 * Sleep until we have passed the requested future
  		 * time or until interrupted by a signal.
  		 * Do not check for signals if we do not want to wait.
  		 */
--- 315,420 ----
  		pollrelock(state);
  	}
  	return (0);
  }
  
! int
! poll_copyin(pollstate_t *ps, pollfd_t *fds, nfds_t nfds)
  {
+ 	pollfd_t *pollfdp;
+ 	nfds_t old_nfds;
+ 
+ 	/*
+ 	 * NOTE: for performance, buffers are saved across poll() calls.
+ 	 * The theory is that if a process polls heavily, it tends to poll
+ 	 * on the same set of descriptors.  Therefore, we only reallocate
+ 	 * buffers when nfds changes.  There is no hysteresis control,
+ 	 * because there is no data to suggest that this is necessary;
+ 	 * the penalty of reallocating is not *that* great in any event.
+ 	 */
+ 	old_nfds = ps->ps_nfds;
+ 	if (nfds != old_nfds) {
+ 		kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t));
+ 		pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP);
+ 		ps->ps_pollfd = pollfdp;
+ 		ps->ps_nfds = nfds;
+ 	}
+ 
+ 	pollfdp = ps->ps_pollfd;
+ 	if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) {
+ 		return (EFAULT);
+ 	}
+ 
+ 	if (fds == NULL) {
+ 		/*
+ 		 * If the process has page 0 mapped, then the copyin() above
+ 		 * will succeed even if fds is NULL.  However, our cached
+ 		 * poll lists are keyed by the address of the passed-in fds
+ 		 * structure, and we use the value NULL to indicate an unused
+ 		 * poll cache list entry.  As such, we elect not to support
+ 		 * NULL as a valid (user) memory address and fail the poll()
+ 		 * call.
+ 		 */
+ 		return (EFAULT);
+ 	}
+ 	return (0);
+ }
+ 
+ int
+ poll_common(pollstate_t *ps, pollfd_t *fds, nfds_t nfds, timespec_t *tsp,
+     int *fdcnt)
+ {
  	kthread_t *t = curthread;
  	hrtime_t deadline; /* hrtime value when we want to return */
  	pollfd_t *pollfdp;
  	pollcache_t *pcp;
  	int error = 0;
  	int cacheindex = 0;	/* which cache set is used */
  
  	/*
  	 * Determine the precise future time of the requested timeout, if any.
  	 */
  	if (tsp == NULL) {
  		deadline = -1;
  	} else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) {
  		deadline = 0;
+ 	} else if (tsp->tv_sec >= HRTIME_MAX/NANOSEC) {
+ 		/* Use an indefinite timeout if tv_sec would cause overflow */
+ 		deadline = -1;
  	} else {
+ 		/*
+ 		 * The above check, when combined with the protections offered
+ 		 * by itimerspecfix (ensuring that neither field is negative
+ 		 * and that tv_nsec represents less than a whole second), will
+ 		 * prevent overflow during the conversion from timespec_t to
+ 		 * uhrtime_t.
+ 		 */
+ 		uhrtime_t utime = tsp->tv_sec * NANOSEC;
+ 		utime += tsp->tv_nsec;
+ 
  		/* They must wait at least a tick. */
! 		utime = MAX(utime, nsec_per_tick);
  
  		/*
! 		 * Since utime has an upper bound of HRTIME_MAX, adding the
! 		 * gethrtime() result cannot incur an overflow as the unsigned
! 		 * type has an adequate bound.
  		 */
! 		utime += (uhrtime_t)gethrtime();
! 		if (utime > HRTIME_MAX) {
! 			deadline = -1;
! 		} else {
! 			deadline = (hrtime_t)utime;
  		}
  	}
  
  	/*
! 	 * Check to see if the caller just wants to use poll() as a timeout.
  	 * If yes then bypass all the other stuff and make him sleep.
  	 */
  	if (nfds == 0) {
+ 		*fdcnt = 0;
  		/*
  		 * Sleep until we have passed the requested future
  		 * time or until interrupted by a signal.
  		 * Do not check for signals if we do not want to wait.
  		 */
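The overflow guard in the hunk above stands alone well enough to sketch outside the kernel. The following is a minimal illustration of the same pattern, with NANOSEC, HRTIME_MAX, and the hrtime typedefs as stand-ins for the kernel definitions (deadline_from_timespec is a made-up name): tv_sec is rejected up front if the multiplication alone would overflow, and the remaining arithmetic is done in an unsigned type with enough headroom that the post-addition comparison against HRTIME_MAX is sound.

#include <stdint.h>
#include <time.h>

#define	NANOSEC		1000000000LL
#define	HRTIME_MAX	INT64_MAX	/* stand-in for the kernel bound */

typedef int64_t hrtime_t;
typedef uint64_t uhrtime_t;

/*
 * Convert a validated relative timespec (both fields non-negative,
 * tv_nsec < NANOSEC, as itimerspecfix guarantees) into an absolute
 * deadline, where -1 means "wait indefinitely".  'now' stands in for
 * gethrtime().
 */
static hrtime_t
deadline_from_timespec(const struct timespec *tsp, hrtime_t now)
{
	uhrtime_t utime;

	if (tsp->tv_sec >= HRTIME_MAX / NANOSEC)
		return (-1);	/* tv_sec alone would overflow */

	/* Safe: tv_sec is bounded above and tv_nsec < NANOSEC. */
	utime = (uhrtime_t)tsp->tv_sec * NANOSEC + (uhrtime_t)tsp->tv_nsec;

	/*
	 * Both addends fit in 63 bits, so the unsigned sum cannot wrap;
	 * it can only exceed HRTIME_MAX, which is checked explicitly.
	 */
	utime += (uhrtime_t)now;
	return (utime > HRTIME_MAX ? (hrtime_t)-1 : (hrtime_t)utime);
}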
*** 383,454 ****
  			mutex_enter(&t->t_delay_lock);
  			while ((error = cv_timedwait_sig_hrtime(&t->t_delay_cv,
  			    &t->t_delay_lock, deadline)) > 0)
  				continue;
  			mutex_exit(&t->t_delay_lock);
! 			error = (error == 0) ? EINTR : 0;
  		}
! 		goto pollout;
  	}
  
! 	if (nfds > p->p_fno_ctl) {
! 		mutex_enter(&p->p_lock);
! 		(void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
! 		    p->p_rctls, p, RCA_SAFE);
! 		mutex_exit(&p->p_lock);
! 		error = EINVAL;
! 		goto pollout;
! 	}
! 
! 	/*
! 	 * Need to allocate memory for pollstate before anything because
! 	 * the mutex and cv are created in this space
! 	 */
! 	ps = pollstate_create();
! 
! 	if (ps->ps_pcache == NULL)
! 		ps->ps_pcache = pcache_alloc();
! 	pcp = ps->ps_pcache;
! 
! 	/*
! 	 * NOTE: for performance, buffers are saved across poll() calls.
! 	 * The theory is that if a process polls heavily, it tends to poll
! 	 * on the same set of descriptors.  Therefore, we only reallocate
! 	 * buffers when nfds changes.  There is no hysteresis control,
! 	 * because there is no data to suggest that this is necessary;
! 	 * the penalty of reallocating is not *that* great in any event.
! 	 */
! 	old_nfds = ps->ps_nfds;
! 	if (nfds != old_nfds) {
! 
! 		kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t));
! 		pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP);
! 		ps->ps_pollfd = pollfdp;
! 		ps->ps_nfds = nfds;
! 	}
! 
! 	pollfdp = ps->ps_pollfd;
! 	if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) {
! 		error = EFAULT;
! 		goto pollout;
! 	}
- 
- 	if (fds == NULL) {
- 		/*
- 		 * If the process has page 0 mapped, then the copyin() above
- 		 * will succeed even if fds is NULL.  However, our cached
- 		 * poll lists are keyed by the address of the passed-in fds
- 		 * structure, and we use the value NULL to indicate an unused
- 		 * poll cache list entry.  As such, we elect not to support
- 		 * NULL as a valid (user) memory address and fail the poll()
- 		 * call.
- 		 */
- 		error = EINVAL;
- 		goto pollout;
- 	}
- 
  	/*
  	 * If this thread polls for the first time, allocate ALL poll
  	 * cache data structures and cache the poll fd list. This
  	 * allocation is delayed till now because lwp's polling 0 fd
  	 * (i.e. using poll as timeout()) don't need this memory.
  	 */
--- 422,441 ----
  			mutex_enter(&t->t_delay_lock);
  			while ((error = cv_timedwait_sig_hrtime(&t->t_delay_cv,
  			    &t->t_delay_lock, deadline)) > 0)
  				continue;
  			mutex_exit(&t->t_delay_lock);
! 			return ((error == 0) ? EINTR : 0);
  		}
! 		return (0);
  	}
  
! 	VERIFY(ps != NULL);
  	pollfdp = ps->ps_pollfd;
! 	VERIFY(pollfdp != NULL);
  
  	/*
  	 * If this thread polls for the first time, allocate ALL poll
  	 * cache data structures and cache the poll fd list. This
  	 * allocation is delayed till now because lwp's polling 0 fd
  	 * (i.e. using poll as timeout()) don't need this memory.
  	 */
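The nfds == 0 short-circuit above is the poll-as-timeout path, which is also where OS-5162 lived (a zero nfds with a zero timeout wrongly produced EINTR). A minimal userland sketch of that usage pattern, not taken from the changeset:

#include <poll.h>
#include <stdio.h>

int
main(void)
{
	/* No descriptors at all: the kernel only sleeps until the deadline. */
	int r = poll(NULL, 0, 250);	/* roughly a 250ms sleep */

	if (r == 0)
		(void) printf("timeout elapsed\n");
	else
		perror("poll");		/* e.g. EINTR if a signal arrived */
	return (0);
}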
*** 458,471 ****
  	if (pcp->pc_bitmap == NULL) {
  		pcache_create(pcp, nfds);
  		/*
  		 * poll and cache this poll fd list in ps_pcacheset[0].
  		 */
! 		error = pcacheset_cache_list(ps, fds, &fdcnt, cacheindex);
! 		if (fdcnt || error) {
  			mutex_exit(&ps->ps_lock);
! 			goto pollout;
  		}
  	} else {
  		pollcacheset_t *pcset = ps->ps_pcacheset;
  
  		/*
--- 445,458 ----
  	if (pcp->pc_bitmap == NULL) {
  		pcache_create(pcp, nfds);
  		/*
  		 * poll and cache this poll fd list in ps_pcacheset[0].
  		 */
! 		error = pcacheset_cache_list(ps, fds, fdcnt, cacheindex);
! 		if (error || *fdcnt) {
  			mutex_exit(&ps->ps_lock);
! 			return (error);
  		}
  	} else {
  		pollcacheset_t *pcset = ps->ps_pcacheset;
  
  		/*
*** 486,500 ****
  			 * list and previously cached one.
  			 * If there is an error during resolve(),
  			 * the callee will guarantee the consistency
  			 * of cached poll list and cache content.
  			 */
! 			error = pcacheset_resolve(ps, nfds, &fdcnt, cacheindex);
  			if (error) {
  				mutex_exit(&ps->ps_lock);
! 				goto pollout;
  			}
  			break;
  		}
  
  		/*
--- 473,487 ----
  			 * list and previously cached one.
  			 * If there is an error during resolve(),
  			 * the callee will guarantee the consistency
  			 * of cached poll list and cache content.
  			 */
! 			error = pcacheset_resolve(ps, nfds, fdcnt, cacheindex);
  			if (error) {
  				mutex_exit(&ps->ps_lock);
! 				return (error);
  			}
  			break;
  		}
  
  		/*
*** 507,521 ****
  		if (pcset[cacheindex].pcs_usradr == NULL) {
  			/*
  			 * found an unused entry. Use it to cache
  			 * this poll list.
  			 */
! 			error = pcacheset_cache_list(ps, fds, &fdcnt,
  			    cacheindex);
! 			if (fdcnt || error) {
  				mutex_exit(&ps->ps_lock);
! 				goto pollout;
  			}
  			break;
  		}
  	}
  	if (cacheindex == ps->ps_nsets) {
--- 494,508 ----
  		if (pcset[cacheindex].pcs_usradr == NULL) {
  			/*
  			 * found an unused entry. Use it to cache
  			 * this poll list.
  			 */
! 			error = pcacheset_cache_list(ps, fds, fdcnt,
  			    cacheindex);
! 			if (error || *fdcnt) {
  				mutex_exit(&ps->ps_lock);
! 				return (error);
  			}
  			break;
  		}
  	}
  	if (cacheindex == ps->ps_nsets) {
*** 525,538 ****
  			 */
  			pollstats.polllistmiss.value.ui64++;
  			cacheindex = pcacheset_replace(ps);
  			ASSERT(cacheindex < ps->ps_nsets);
  			pcset[cacheindex].pcs_usradr = (uintptr_t)fds;
! 			error = pcacheset_resolve(ps, nfds, &fdcnt, cacheindex);
  			if (error) {
  				mutex_exit(&ps->ps_lock);
! 				goto pollout;
  			}
  		}
  	}
  
  	/*
--- 512,525 ----
  			 */
  			pollstats.polllistmiss.value.ui64++;
  			cacheindex = pcacheset_replace(ps);
  			ASSERT(cacheindex < ps->ps_nsets);
  			pcset[cacheindex].pcs_usradr = (uintptr_t)fds;
! 			error = pcacheset_resolve(ps, nfds, fdcnt, cacheindex);
  			if (error) {
  				mutex_exit(&ps->ps_lock);
! 				return (error);
  			}
  		}
  	}
  
  	/*
*** 546,557 ****
  	 * the pcache before updating poll bitmap.
  	 */
  	mutex_enter(&pcp->pc_lock);
  	for (;;) {
  		pcp->pc_flag = 0;
! 		error = pcache_poll(pollfdp, ps, nfds, &fdcnt, cacheindex);
! 		if (fdcnt || error) {
  			mutex_exit(&pcp->pc_lock);
  			mutex_exit(&ps->ps_lock);
  			break;
  		}
--- 533,544 ----
  	 * the pcache before updating poll bitmap.
  	 */
  	mutex_enter(&pcp->pc_lock);
  	for (;;) {
  		pcp->pc_flag = 0;
! 		error = pcache_poll(pollfdp, ps, nfds, fdcnt, cacheindex);
! 		if (error || *fdcnt) {
  			mutex_exit(&pcp->pc_lock);
  			mutex_exit(&ps->ps_lock);
  			break;
  		}
*** 593,609 ****
  		 */
  		mutex_enter(&ps->ps_lock);
  		mutex_enter(&pcp->pc_lock);
  	}
  
  pollout:
  	/*
! 	 * If we changed the signal mask but we received
! 	 * no signal then restore the signal mask.
! 	 * Otherwise psig() will deal with the signal mask.
  	 */
! 	if (ksetp != NULL) {
  		mutex_enter(&p->p_lock);
  		if (lwp->lwp_cursig == 0) {
  			t->t_hold = lwp->lwp_sigoldmask;
  			t->t_flag &= ~T_TOMASK;
  		}
--- 580,699 ----
  		 */
  		mutex_enter(&ps->ps_lock);
  		mutex_enter(&pcp->pc_lock);
  	}
+ 	return (error);
+ }
+ 
+ /*
+  * This is the system call trap that poll(),
+  * select() and pselect() are built upon.
+  * It is a private interface between libc and the kernel.
+  */
+ int
+ pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp)
+ {
+ 	kthread_t *t = curthread;
+ 	klwp_t *lwp = ttolwp(t);
+ 	proc_t *p = ttoproc(t);
+ 	timespec_t ts;
+ 	timespec_t *tsp;
+ 	k_sigset_t kset;
+ 	pollstate_t *ps = NULL;
+ 	pollfd_t *pollfdp = NULL;
+ 	int error = 0, fdcnt = 0;
+ 
+ 	/*
+ 	 * Copy in timeout
+ 	 */
+ 	if (timeoutp == NULL) {
+ 		tsp = NULL;
+ 	} else {
+ 		if (get_udatamodel() == DATAMODEL_NATIVE) {
+ 			if (copyin(timeoutp, &ts, sizeof (ts)))
+ 				return (set_errno(EFAULT));
+ 		} else {
+ 			timespec32_t ts32;
+ 
+ 			if (copyin(timeoutp, &ts32, sizeof (ts32)))
+ 				return (set_errno(EFAULT));
+ 			TIMESPEC32_TO_TIMESPEC(&ts, &ts32)
+ 		}
+ 
+ 		if (itimerspecfix(&ts))
+ 			return (set_errno(EINVAL));
+ 		tsp = &ts;
+ 	}
+ 
+ 	/*
+ 	 * Copy in and reset signal mask, if requested.
+ 	 */
+ 	if (setp != NULL) {
+ 		sigset_t set;
+ 
+ 		if (copyin(setp, &set, sizeof (set)))
+ 			return (set_errno(EFAULT));
+ 		sigutok(&set, &kset);
+ 
+ 		mutex_enter(&p->p_lock);
+ 		schedctl_finish_sigblock(t);
+ 		lwp->lwp_sigoldmask = t->t_hold;
+ 		t->t_hold = kset;
+ 		t->t_flag |= T_TOMASK;
+ 		/*
+ 		 * Call cv_reltimedwait_sig() just to check for signals.
+ 		 * We will return immediately with either 0 or -1.
+ 		 */
+ 		if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0,
+ 		    TR_CLOCK_TICK)) {
+ 			mutex_exit(&p->p_lock);
+ 			error = EINTR;
+ 			goto pollout;
+ 		}
+ 		mutex_exit(&p->p_lock);
+ 	}
+ 
+ 	/*
+ 	 * Initialize pollstate and copy in pollfd data if present.
+ 	 * If nfds == 0, we will skip all of the copying and check steps and
+ 	 * proceed directly into poll_common to process the supplied timeout.
+ 	 */
+ 	if (nfds != 0) {
+ 		if (nfds > p->p_fno_ctl) {
+ 			mutex_enter(&p->p_lock);
+ 			(void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
+ 			    p->p_rctls, p, RCA_SAFE);
+ 			mutex_exit(&p->p_lock);
+ 			error = EINVAL;
+ 			goto pollout;
+ 		}
+ 
+ 		/*
+ 		 * Need to allocate memory for pollstate before anything
+ 		 * because the mutex and cv are created in this space
+ 		 */
+ 		ps = pollstate_create();
+ 		if (ps->ps_pcache == NULL)
+ 			ps->ps_pcache = pcache_alloc();
+ 
+ 		if ((error = poll_copyin(ps, fds, nfds)) != 0)
+ 			goto pollout;
+ 		pollfdp = ps->ps_pollfd;
+ 	}
+ 
+ 	/*
+ 	 * Perform the actual poll.
+ 	 */
+ 	error = poll_common(ps, fds, nfds, tsp, &fdcnt);
+ 
  pollout:
  	/*
! 	 * If we changed the signal mask but we received no signal then restore
! 	 * the signal mask.  Otherwise psig() will deal with the signal mask.
  	 */
! 	if (setp != NULL) {
  		mutex_enter(&p->p_lock);
  		if (lwp->lwp_cursig == 0) {
  			t->t_hold = lwp->lwp_sigoldmask;
  			t->t_flag &= ~T_TOMASK;
  		}
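The rebuilt pollsys() above keeps the signal-mask handling at the syscall boundary: the mask is swapped in before the wait and restored afterward unless a signal became pending. From userland this is the pselect()/ppoll() contract, where the supplied mask is in effect only for the duration of the wait. A sketch of that contract using standard POSIX calls (wait_for_stdin is a made-up helper):

#include <sys/select.h>
#include <signal.h>
#include <unistd.h>

int
wait_for_stdin(void)
{
	sigset_t blockset, waitmask;
	fd_set rfds;

	/* Keep SIGINT blocked everywhere except inside the wait. */
	(void) sigemptyset(&blockset);
	(void) sigaddset(&blockset, SIGINT);
	(void) sigprocmask(SIG_BLOCK, &blockset, &waitmask);
	(void) sigdelset(&waitmask, SIGINT);

	FD_ZERO(&rfds);
	FD_SET(STDIN_FILENO, &rfds);

	/* Returns -1 with errno == EINTR if SIGINT arrives during the wait. */
	return (pselect(STDIN_FILENO + 1, &rfds, NULL, NULL, NULL,
	    &waitmask));
}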
*** 610,633 ****
  		mutex_exit(&p->p_lock);
  	}
  
  	if (error)
  		return (set_errno(error));
- 
  	/*
  	 * Copy out the events and return the fdcnt to the user.
  	 */
! 	if (nfds != 0 &&
! 	    copyout(pollfdp, fds, nfds * sizeof (pollfd_t)))
  		return (set_errno(EFAULT));
  
  #ifdef DEBUG
  	/*
  	 * Another sanity check:
  	 */
  	if (fdcnt) {
! 		int reventcnt = 0;
  
  		for (i = 0; i < nfds; i++) {
  			if (pollfdp[i].fd < 0) {
  				ASSERT(pollfdp[i].revents == 0);
  				continue;
--- 700,721 ----
  		mutex_exit(&p->p_lock);
  	}
  
  	if (error)
  		return (set_errno(error));
  	/*
  	 * Copy out the events and return the fdcnt to the user.
  	 */
! 	if (nfds != 0 && copyout(pollfdp, fds, nfds * sizeof (pollfd_t)))
  		return (set_errno(EFAULT));
  
  #ifdef DEBUG
  	/*
  	 * Another sanity check:
  	 */
  	if (fdcnt) {
! 		int i, reventcnt = 0;
  
  		for (i = 0; i < nfds; i++) {
  			if (pollfdp[i].fd < 0) {
  				ASSERT(pollfdp[i].revents == 0);
  				continue;
*** 636,700 ****
  				reventcnt++;
  			}
  		}
  		ASSERT(fdcnt == reventcnt);
  	} else {
  		for (i = 0; i < nfds; i++) {
  			ASSERT(pollfdp[i].revents == 0);
  		}
  	}
  #endif	/* DEBUG */
  
  	return (fdcnt);
  }
  
- /*
-  * This is the system call trap that poll(),
-  * select() and pselect() are built upon.
-  * It is a private interface between libc and the kernel.
-  */
- int
- pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp)
- {
- 	timespec_t ts;
- 	timespec_t *tsp;
- 	sigset_t set;
- 	k_sigset_t kset;
- 	k_sigset_t *ksetp;
- 	model_t datamodel = get_udatamodel();
- 
- 	if (timeoutp == NULL)
- 		tsp = NULL;
- 	else {
- 		if (datamodel == DATAMODEL_NATIVE) {
- 			if (copyin(timeoutp, &ts, sizeof (ts)))
- 				return (set_errno(EFAULT));
- 		} else {
- 			timespec32_t ts32;
- 
- 			if (copyin(timeoutp, &ts32, sizeof (ts32)))
- 				return (set_errno(EFAULT));
- 			TIMESPEC32_TO_TIMESPEC(&ts, &ts32)
- 		}
- 
- 		if (itimerspecfix(&ts))
- 			return (set_errno(EINVAL));
- 		tsp = &ts;
- 	}
- 
- 	if (setp == NULL)
- 		ksetp = NULL;
- 	else {
- 		if (copyin(setp, &set, sizeof (set)))
- 			return (set_errno(EFAULT));
- 		sigutok(&set, &kset);
- 		ksetp = &kset;
- 	}
- 
- 	return (poll_common(fds, nfds, tsp, ksetp));
- }
- 
  /*
   * Clean up any state left around by poll(2). Called when a thread exits.
   */
  void
  pollcleanup()
--- 724,744 ----
  				reventcnt++;
  			}
  		}
  		ASSERT(fdcnt == reventcnt);
  	} else {
+ 		int i;
+ 
  		for (i = 0; i < nfds; i++) {
  			ASSERT(pollfdp[i].revents == 0);
  		}
  	}
  #endif	/* DEBUG */
  
  	return (fdcnt);
  }
  
  /*
   * Clean up any state left around by poll(2). Called when a thread exits.
   */
  void
  pollcleanup()