Print this page
OS-5566 ppoll timeout calculation can overflow
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Alex Wilson <alex.wilson@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
OS-4656 nested epoll does not mimic Linux behavior
Reviewed by: Bryan Cantrill <bryan@joyent.com>
OS-5162 poll/select yield improper EINTR when nfds and timeout are 0
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Joshua M. Clulow <jmc@joyent.com>
OS-4830 lxbrand convert select/poll to IKE
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
        
@@ -27,11 +27,11 @@
 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T     */
 /*        All Rights Reserved   */
 
 /*
  * Copyright (c) 2012 by Delphix. All rights reserved.
- * Copyright 2015, Joyent, Inc.
+ * Copyright 2016, Joyent, Inc.
  */
 
 /*
  * Portions of this source code were derived from Berkeley 4.3 BSD
  * under license from the Regents of the University of California.
@@ -315,67 +315,106 @@
                 pollrelock(state);
         }
         return (0);
 }
 
-static int
-poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
+int
+poll_copyin(pollstate_t *ps, pollfd_t *fds, nfds_t nfds)
 {
+        pollfd_t *pollfdp;
+        nfds_t old_nfds;
+
+        /*
+         * NOTE: for performance, buffers are saved across poll() calls.
+         * The theory is that if a process polls heavily, it tends to poll
+         * on the same set of descriptors.  Therefore, we only reallocate
+         * buffers when nfds changes.  There is no hysteresis control,
+         * because there is no data to suggest that this is necessary;
+         * the penalty of reallocating is not *that* great in any event.
+         */
+        old_nfds = ps->ps_nfds;
+        if (nfds != old_nfds) {
+                kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t));
+                pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP);
+                ps->ps_pollfd = pollfdp;
+                ps->ps_nfds = nfds;
+        }
+
+        pollfdp = ps->ps_pollfd;
+        if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) {
+                return (EFAULT);
+        }
+
+        if (fds == NULL) {
+                /*
+                 * If the process has page 0 mapped, then the copyin() above
+                 * will succeed even if fds is NULL.  However, our cached
+                 * poll lists are keyed by the address of the passed-in fds
+                 * structure, and we use the value NULL to indicate an unused
+                 * poll cache list entry.  As such, we elect not to support
+                 * NULL as a valid (user) memory address and fail the poll()
+                 * call.
+                 */
+                return (EFAULT);
+        }
+        return (0);
+}
+
+int
+poll_common(pollstate_t *ps, pollfd_t *fds, nfds_t nfds, timespec_t *tsp,
+    int *fdcnt)
+{
         kthread_t *t = curthread;
-        klwp_t *lwp = ttolwp(t);
-        proc_t *p = ttoproc(t);
-        int fdcnt = 0;
-        int i;
         hrtime_t deadline; /* hrtime value when we want to return */
         pollfd_t *pollfdp;
-        pollstate_t *ps;
         pollcache_t *pcp;
         int error = 0;
-        nfds_t old_nfds;
         int cacheindex = 0;     /* which cache set is used */
 
         /*
          * Determine the precise future time of the requested timeout, if any.
          */
         if (tsp == NULL) {
                 deadline = -1;
         } else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) {
                 deadline = 0;
+        } else if (tsp->tv_sec >= HRTIME_MAX/NANOSEC) {
+                /* Use an indefinite timeout if tv_sec would cause overflow */
+                deadline = -1;
         } else {
+                /*
+                 * The above check, when combined with the protections offered
+                 * by itimerspecfix (ensuring that neither field is negative
+                 * and that tv_nsec represents less than a whole second), will
+                 * prevent overflow during the conversion from timespec_t to
+                 * uhrtime_t.
+                 */
+                uhrtime_t utime = tsp->tv_sec * NANOSEC;
+                utime += tsp->tv_nsec;
+
                 /* They must wait at least a tick. */
-                deadline = ((hrtime_t)tsp->tv_sec * NANOSEC) + tsp->tv_nsec;
-                deadline = MAX(deadline, nsec_per_tick);
-                deadline += gethrtime();
-        }
+                utime = MAX(utime, nsec_per_tick);
 
         /*
-         * Reset our signal mask, if requested.
+                 * Both utime and the gethrtime() result are bounded by
+                 * HRTIME_MAX, so their sum cannot wrap the unsigned type; a
+                 * sum beyond HRTIME_MAX falls back to an indefinite timeout.
          */
-        if (ksetp != NULL) {
-                mutex_enter(&p->p_lock);
-                schedctl_finish_sigblock(t);
-                lwp->lwp_sigoldmask = t->t_hold;
-                t->t_hold = *ksetp;
-                t->t_flag |= T_TOMASK;
-                /*
-                 * Call cv_reltimedwait_sig() just to check for signals.
-                 * We will return immediately with either 0 or -1.
-                 */
-                if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0,
-                    TR_CLOCK_TICK)) {
-                        mutex_exit(&p->p_lock);
-                        error = EINTR;
-                        goto pollout;
+                utime += (uhrtime_t)gethrtime();
+                if (utime > HRTIME_MAX) {
+                        deadline = -1;
+                } else {
+                        deadline = (hrtime_t)utime;
                 }
-                mutex_exit(&p->p_lock);
         }
 
         /*
-         * Check to see if this guy just wants to use poll() as a timeout.
+         * Check to see if the caller just wants to use poll() as a timeout.
          * If yes then bypass all the other stuff and make him sleep.
          */
         if (nfds == 0) {
+                *fdcnt = 0;
                 /*
                  * Sleep until we have passed the requested future
                  * time or until interrupted by a signal.
                  * Do not check for signals if we do not want to wait.
                  */
@@ -383,72 +422,20 @@
                         mutex_enter(&t->t_delay_lock);
                         while ((error = cv_timedwait_sig_hrtime(&t->t_delay_cv,
                             &t->t_delay_lock, deadline)) > 0)
                                 continue;
                         mutex_exit(&t->t_delay_lock);
-                        error = (error == 0) ? EINTR : 0;
+                        return ((error == 0) ? EINTR : 0);
                 }
-                goto pollout;
+                return (0);
         }
 
-        if (nfds > p->p_fno_ctl) {
-                mutex_enter(&p->p_lock);
-                (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
-                    p->p_rctls, p, RCA_SAFE);
-                mutex_exit(&p->p_lock);
-                error = EINVAL;
-                goto pollout;
-        }
-
-        /*
-         * Need to allocate memory for pollstate before anything because
-         * the mutex and cv are created in this space
-         */
-        ps = pollstate_create();
-
-        if (ps->ps_pcache == NULL)
-                ps->ps_pcache = pcache_alloc();
-        pcp = ps->ps_pcache;
-
-        /*
-         * NOTE: for performance, buffers are saved across poll() calls.
-         * The theory is that if a process polls heavily, it tends to poll
-         * on the same set of descriptors.  Therefore, we only reallocate
-         * buffers when nfds changes.  There is no hysteresis control,
-         * because there is no data to suggest that this is necessary;
-         * the penalty of reallocating is not *that* great in any event.
-         */
-        old_nfds = ps->ps_nfds;
-        if (nfds != old_nfds) {
-
-                kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t));
-                pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP);
-                ps->ps_pollfd = pollfdp;
-                ps->ps_nfds = nfds;
-        }
-
+        VERIFY(ps != NULL);
         pollfdp = ps->ps_pollfd;
-        if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) {
-                error = EFAULT;
-                goto pollout;
-        }
+        VERIFY(pollfdp != NULL);
 
-        if (fds == NULL) {
                 /*
-                 * If the process has page 0 mapped, then the copyin() above
-                 * will succeed even if fds is NULL.  However, our cached
-                 * poll lists are keyed by the address of the passed-in fds
-                 * structure, and we use the value NULL to indicate an unused
-                 * poll cache list entry.  As such, we elect not to support
-                 * NULL as a valid (user) memory address and fail the poll()
-                 * call.
-                 */
-                error = EINVAL;
-                goto pollout;
-        }
-
-        /*
          * If this thread polls for the first time, allocate ALL poll
          * cache data structures and cache the poll fd list. This
          * allocation is delayed till now because lwp's polling 0 fd
          * (i.e. using poll as timeout()) don't need this memory.
          */
@@ -458,14 +445,14 @@
         if (pcp->pc_bitmap == NULL) {
                 pcache_create(pcp, nfds);
                 /*
                  * poll and cache this poll fd list in ps_pcacheset[0].
                  */
-                error = pcacheset_cache_list(ps, fds, &fdcnt, cacheindex);
-                if (fdcnt || error) {
+                error = pcacheset_cache_list(ps, fds, fdcnt, cacheindex);
+                if (error || *fdcnt) {
                         mutex_exit(&ps->ps_lock);
-                        goto pollout;
+                        return (error);
                 }
         } else {
                 pollcacheset_t  *pcset = ps->ps_pcacheset;
 
                 /*
@@ -486,15 +473,15 @@
                                  * list and previously cached one.
                                  * If there is an error during resolve(),
                                  * the callee will guarantee the consistency
                                  * of cached poll list and cache content.
                                  */
-                                error = pcacheset_resolve(ps, nfds, &fdcnt,
+                                error = pcacheset_resolve(ps, nfds, fdcnt,
                                     cacheindex);
                                 if (error) {
                                         mutex_exit(&ps->ps_lock);
-                                        goto pollout;
+                                        return (error);
                                 }
                                 break;
                         }
 
                         /*
@@ -507,15 +494,15 @@
                         if (pcset[cacheindex].pcs_usradr == NULL) {
                                 /*
                                  * found an unused entry. Use it to cache
                                  * this poll list.
                                  */
-                                error = pcacheset_cache_list(ps, fds, &fdcnt,
+                                error = pcacheset_cache_list(ps, fds, fdcnt,
                                     cacheindex);
-                                if (fdcnt || error) {
+                                if (error || *fdcnt) {
                                         mutex_exit(&ps->ps_lock);
-                                        goto pollout;
+                                        return (error);
                                 }
                                 break;
                         }
                 }
                 if (cacheindex == ps->ps_nsets) {
@@ -525,14 +512,14 @@
                          */
                         pollstats.polllistmiss.value.ui64++;
                         cacheindex = pcacheset_replace(ps);
                         ASSERT(cacheindex < ps->ps_nsets);
                         pcset[cacheindex].pcs_usradr = (uintptr_t)fds;
-                        error = pcacheset_resolve(ps, nfds, &fdcnt, cacheindex);
+                        error = pcacheset_resolve(ps, nfds, fdcnt, cacheindex);
                         if (error) {
                                 mutex_exit(&ps->ps_lock);
-                                goto pollout;
+                                return (error);
                         }
                 }
         }
 
         /*
@@ -546,12 +533,12 @@
          * the pcache before updating poll bitmap.
          */
         mutex_enter(&pcp->pc_lock);
         for (;;) {
                 pcp->pc_flag = 0;
-                error = pcache_poll(pollfdp, ps, nfds, &fdcnt, cacheindex);
-                if (fdcnt || error) {
+                error = pcache_poll(pollfdp, ps, nfds, fdcnt, cacheindex);
+                if (error || *fdcnt) {
                         mutex_exit(&pcp->pc_lock);
                         mutex_exit(&ps->ps_lock);
                         break;
                 }
 
@@ -593,17 +580,120 @@
                  */
                 mutex_enter(&ps->ps_lock);
                 mutex_enter(&pcp->pc_lock);
         }
 
+        return (error);
+}
+
+/*
+ * This is the system call trap that poll(),
+ * select() and pselect() are built upon.
+ * It is a private interface between libc and the kernel.
+ */
+int
+pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp)
+{
+        kthread_t *t = curthread;
+        klwp_t *lwp = ttolwp(t);
+        proc_t *p = ttoproc(t);
+        timespec_t ts;
+        timespec_t *tsp;
+        k_sigset_t kset;
+        pollstate_t *ps = NULL;
+        pollfd_t *pollfdp = NULL;
+        int error = 0, fdcnt = 0;
+
+        /*
+         * Copy in timeout
+         */
+        if (timeoutp == NULL) {
+                tsp = NULL;
+        } else {
+                if (get_udatamodel() == DATAMODEL_NATIVE) {
+                        if (copyin(timeoutp, &ts, sizeof (ts)))
+                                return (set_errno(EFAULT));
+                } else {
+                        timespec32_t ts32;
+
+                        if (copyin(timeoutp, &ts32, sizeof (ts32)))
+                                return (set_errno(EFAULT));
+                        TIMESPEC32_TO_TIMESPEC(&ts, &ts32)
+                }
+
+                if (itimerspecfix(&ts))
+                        return (set_errno(EINVAL));
+                tsp = &ts;
+        }
+
+        /*
+         * Copy in and reset signal mask, if requested.
+         */
+        if (setp != NULL) {
+                sigset_t set;
+
+                if (copyin(setp, &set, sizeof (set)))
+                        return (set_errno(EFAULT));
+                sigutok(&set, &kset);
+
+                mutex_enter(&p->p_lock);
+                schedctl_finish_sigblock(t);
+                lwp->lwp_sigoldmask = t->t_hold;
+                t->t_hold = kset;
+                t->t_flag |= T_TOMASK;
+                /*
+                 * Call cv_reltimedwait_sig() just to check for signals.
+                 * We will return immediately with either 0 or -1.
+                 */
+                if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0,
+                    TR_CLOCK_TICK)) {
+                        mutex_exit(&p->p_lock);
+                        error = EINTR;
+                        goto pollout;
+                }
+                mutex_exit(&p->p_lock);
+        }
+
+        /*
+         * Initialize pollstate and copy in pollfd data if present.
+         * If nfds == 0, we will skip all of the copying and check steps and
+         * proceed directly into poll_common to process the supplied timeout.
+         */
+        if (nfds != 0) {
+                if (nfds > p->p_fno_ctl) {
+                        mutex_enter(&p->p_lock);
+                        (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
+                            p->p_rctls, p, RCA_SAFE);
+                        mutex_exit(&p->p_lock);
+                        error = EINVAL;
+                        goto pollout;
+                }
+
+                /*
+                 * Need to allocate memory for pollstate before anything
+                 * because the mutex and cv are created in this space
+                 */
+                ps = pollstate_create();
+                if (ps->ps_pcache == NULL)
+                        ps->ps_pcache = pcache_alloc();
+
+                if ((error = poll_copyin(ps, fds, nfds)) != 0)
+                        goto pollout;
+                pollfdp = ps->ps_pollfd;
+        }
+
+        /*
+         * Perform the actual poll.
+         */
+        error = poll_common(ps, fds, nfds, tsp, &fdcnt);
+
 pollout:
         /*
-         * If we changed the signal mask but we received
-         * no signal then restore the signal mask.
-         * Otherwise psig() will deal with the signal mask.
+         * If we changed the signal mask but we received no signal then restore
+         * the signal mask.  Otherwise psig() will deal with the signal mask.
          */
-        if (ksetp != NULL) {
+        if (setp != NULL) {
                 mutex_enter(&p->p_lock);
                 if (lwp->lwp_cursig == 0) {
                         t->t_hold = lwp->lwp_sigoldmask;
                         t->t_flag &= ~T_TOMASK;
                 }
@@ -610,24 +700,22 @@
                 mutex_exit(&p->p_lock);
         }
 
         if (error)
                 return (set_errno(error));
-
         /*
          * Copy out the events and return the fdcnt to the user.
          */
-        if (nfds != 0 &&
-            copyout(pollfdp, fds, nfds * sizeof (pollfd_t)))
+        if (nfds != 0 && copyout(pollfdp, fds, nfds * sizeof (pollfd_t)))
                 return (set_errno(EFAULT));
 
 #ifdef DEBUG
         /*
          * Another sanity check:
          */
         if (fdcnt) {
-                int     reventcnt = 0;
+                int i, reventcnt = 0;
 
                 for (i = 0; i < nfds; i++) {
                         if (pollfdp[i].fd < 0) {
                                 ASSERT(pollfdp[i].revents == 0);
                                 continue;
@@ -636,65 +724,21 @@
                                 reventcnt++;
                         }
                 }
                 ASSERT(fdcnt == reventcnt);
         } else {
+                int i;
+
                 for (i = 0; i < nfds; i++) {
                         ASSERT(pollfdp[i].revents == 0);
                 }
         }
 #endif  /* DEBUG */
 
         return (fdcnt);
 }
 
-/*
- * This is the system call trap that poll(),
- * select() and pselect() are built upon.
- * It is a private interface between libc and the kernel.
- */
-int
-pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp)
-{
-        timespec_t ts;
-        timespec_t *tsp;
-        sigset_t set;
-        k_sigset_t kset;
-        k_sigset_t *ksetp;
-        model_t datamodel = get_udatamodel();
-
-        if (timeoutp == NULL)
-                tsp = NULL;
-        else {
-                if (datamodel == DATAMODEL_NATIVE) {
-                        if (copyin(timeoutp, &ts, sizeof (ts)))
-                                return (set_errno(EFAULT));
-                } else {
-                        timespec32_t ts32;
-
-                        if (copyin(timeoutp, &ts32, sizeof (ts32)))
-                                return (set_errno(EFAULT));
-                        TIMESPEC32_TO_TIMESPEC(&ts, &ts32)
-                }
-
-                if (itimerspecfix(&ts))
-                        return (set_errno(EINVAL));
-                tsp = &ts;
-        }
-
-        if (setp == NULL)
-                ksetp = NULL;
-        else {
-                if (copyin(setp, &set, sizeof (set)))
-                        return (set_errno(EFAULT));
-                sigutok(&set, &kset);
-                ksetp = &kset;
-        }
-
-        return (poll_common(fds, nfds, tsp, ksetp));
-}
-
 /*
  * Clean up any state left around by poll(2). Called when a thread exits.
  */
 void
 pollcleanup()