1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright 2016 Joyent, Inc.
  14  */
  15 
  16 /*
  17  * The illumos kernel provides two clock backends: CLOCK_REALTIME, the
  18  * adjustable system wall clock; and CLOCK_HIGHRES, the monotonically
  19  * increasing time source that is not subject to drift or adjustment.  By
  20  * contrast, the Linux kernel is furnished with an overabundance of narrowly
  21  * differentiated clock types.
  22  *
  23  * Fortunately, most of the commonly used Linux clock types are either similar
  24  * enough to the native clock backends that they can be directly mapped, or
  25  * represent queries to the per-process and per-LWP microstate counters.
  26  *
  27  * CLOCK_BOOTTIME is identical to CLOCK_MONOTONIC, except that it takes into
  28  * account time that the system is suspended. Since that is uninteresting to
  29  * us, we treat it the same.
  30  */
  31 
  32 #include <sys/time.h>
  33 #include <sys/systm.h>
  34 #include <sys/cmn_err.h>
  35 #include <sys/brand.h>
  36 #include <sys/lx_brand.h>
  37 #include <sys/lx_impl.h>
  38 #include <lx_signum.h>
  39 
  40 /*
  41  * From "uts/common/os/timer.c":
  42  */
  43 extern int clock_settime(clockid_t, timespec_t *);
  44 extern int clock_gettime(clockid_t, timespec_t *);
  45 extern int clock_getres(clockid_t, timespec_t *);
  46 extern int nanosleep(timespec_t *, timespec_t *);
  47 
  48 
  49 static int lx_emul_clock_getres(clockid_t, timespec_t *);
  50 static int lx_emul_clock_gettime(clockid_t, timespec_t *);
  51 static int lx_emul_clock_settime(clockid_t, timespec_t *);
  52 
  53 typedef struct lx_clock_backend {
  54         clockid_t lclk_ntv_id;
  55         int (*lclk_clock_getres)(clockid_t, timespec_t *);
  56         int (*lclk_clock_gettime)(clockid_t, timespec_t *);
  57         int (*lclk_clock_settime)(clockid_t, timespec_t *);
  58 } lx_clock_backend_t;
  59 
  60 /*
  61  * NOTE: The Linux man pages state this structure is obsolete and is
  62  * unsupported, so it is declared here for sizing purposes only.
  63  */
  64 struct lx_timezone {
  65         int tz_minuteswest;     /* minutes W of Greenwich */
  66         int tz_dsttime;         /* type of dst correction */
  67 };
  68 
  69 /*
  70  * Use the native clock_* system call implementation, but with a translated
  71  * clock identifier:
  72  */
  73 #define NATIVE(ntv_id)                                                  \
  74         { ntv_id, clock_getres, clock_gettime, clock_settime }
  75 
  76 /*
  77  * This backend is not supported, so we provide an emulation handler:
  78  */
  79 #define EMUL(ntv_id)                                                    \
  80         { ntv_id, lx_emul_clock_getres, lx_emul_clock_gettime,          \
  81             lx_emul_clock_settime }
  82 
  83 static lx_clock_backend_t lx_clock_backends[] = {
  84         NATIVE(CLOCK_REALTIME),         /* LX_CLOCK_REALTIME */
  85         NATIVE(CLOCK_HIGHRES),          /* LX_CLOCK_MONOTONIC */
  86         EMUL(CLOCK_PROCESS_CPUTIME_ID), /* LX_CLOCK_PROCESS_CPUTIME_ID */
  87         EMUL(CLOCK_THREAD_CPUTIME_ID),  /* LX_CLOCK_THREAD_CPUTIME_ID */
  88         NATIVE(CLOCK_HIGHRES),          /* LX_CLOCK_MONOTONIC_RAW */
  89         NATIVE(CLOCK_REALTIME),         /* LX_CLOCK_REALTIME_COARSE */
  90         NATIVE(CLOCK_HIGHRES),          /* LX_CLOCK_MONOTONIC_COARSE */
  91         NATIVE(CLOCK_HIGHRES)           /* LX_CLOCK_BOOTTIME */
  92 };
  93 
  94 #define LX_CLOCK_MAX \
  95         (sizeof (lx_clock_backends) / sizeof (lx_clock_backends[0]))
  96 #define LX_CLOCK_BACKEND(clk) (((clk) < LX_CLOCK_MAX && (clk) >= 0) ? \
  97         &lx_clock_backends[(clk)] : NULL)
  98 
  99 /*
 100  * Linux defines the size of the sigevent structure to be 64 bytes.  In order
 101  * to meet that definition, the trailing union includes a member which pads it
 102  * out to the desired length for the given architecture.
 103  */
 104 #define LX_SIGEV_PAD_SIZE       ((64 - \
 105         (sizeof (int) * 2 + sizeof (union sigval))) / sizeof (int))
 106 
 107 typedef struct {
 108         union sigval    lx_sigev_value;
 109         int             lx_sigev_signo;
 110         int             lx_sigev_notify;
 111         union {
 112                 int     lx_pad[LX_SIGEV_PAD_SIZE];
 113                 int     lx_tid;
 114                 struct {
 115                         void (*lx_notify_function)(union sigval);
 116                         void *lx_notify_attribute;
 117                 } lx_sigev_thread;
 118         } lx_sigev_un;
 119 } lx_sigevent_t;
 120 
 121 
 122 #ifdef _SYSCALL32_IMPL
 123 
 124 #define LX_SIGEV32_PAD_SIZE     ((64 - \
 125         (sizeof (int) * 2 + sizeof (union sigval32))) / sizeof (int))
 126 
 127 typedef struct {
 128         union sigval32  lx_sigev_value;
 129         int             lx_sigev_signo;
 130         int             lx_sigev_notify;
 131         union {
 132                 int     lx_pad[LX_SIGEV32_PAD_SIZE];
 133                 int     lx_tid;
 134                 struct {
 135                         caddr32_t lx_notify_function;
 136                         caddr32_t lx_notify_attribute;
 137                 } lx_sigev_thread;
 138         } lx_sigev_un;
 139 } lx_sigevent32_t;
 140 
 141 #endif /* _SYSCALL32_IMPL */
 142 
 143 #define LX_SIGEV_SIGNAL         0
 144 #define LX_SIGEV_NONE           1
 145 #define LX_SIGEV_THREAD         2
 146 #define LX_SIGEV_THREAD_ID      4
 147 
 148 /*
 149  * Access private SIGEV_THREAD_ID callback state in itimer_t
 150  */
 151 #define LX_SIGEV_THREAD_ID_LPID(it)     ((it)->it_cb_data[0])
 152 #define LX_SIGEV_THREAD_ID_TID(it)      ((it)->it_cb_data[1])
 153 
 154 
 155 /* ARGSUSED */
 156 static int
 157 lx_emul_clock_settime(clockid_t clock, timespec_t *tp)
 158 {
 159         return (set_errno(EINVAL));
 160 }
 161 
 162 static int
 163 lx_emul_clock_gettime(clockid_t clock, timespec_t *tp)
 164 {
 165         timespec_t t;
 166 
 167         switch (clock) {
 168         case CLOCK_PROCESS_CPUTIME_ID: {
 169                 proc_t *p = ttoproc(curthread);
 170                 hrtime_t snsecs, unsecs;
 171 
 172                 /*
 173                  * Based on getrusage() in "rusagesys.c":
 174                  */
 175                 mutex_enter(&p->p_lock);
 176                 unsecs = mstate_aggr_state(p, LMS_USER);
 177                 snsecs = mstate_aggr_state(p, LMS_SYSTEM);
 178                 mutex_exit(&p->p_lock);
 179 
 180                 hrt2ts(unsecs + snsecs, &t);
 181                 break;
 182         }
 183 
 184         case CLOCK_THREAD_CPUTIME_ID: {
 185                 klwp_t *lwp = ttolwp(curthread);
 186                 struct mstate *ms = &lwp->lwp_mstate;
 187                 hrtime_t snsecs, unsecs;
 188 
 189                 /*
 190                  * Based on getrusage_lwp() in "rusagesys.c":
 191                  */
 192                 unsecs = ms->ms_acct[LMS_USER];
 193                 snsecs = ms->ms_acct[LMS_SYSTEM] + ms->ms_acct[LMS_TRAP];
 194 
 195                 scalehrtime(&unsecs);
 196                 scalehrtime(&snsecs);
 197 
 198                 hrt2ts(unsecs + snsecs, &t);
 199                 break;
 200         }
 201 
 202         default:
 203                 return (set_errno(EINVAL));
 204         }
 205 
 206 #if defined(_SYSCALL32_IMPL)
 207         if (get_udatamodel() != DATAMODEL_NATIVE) {
 208                 timespec32_t t32;
 209 
 210                 if (TIMESPEC_OVERFLOW(&t)) {
 211                         return (set_errno(EOVERFLOW));
 212                 }
 213                 TIMESPEC_TO_TIMESPEC32(&t32, &t);
 214 
 215                 if (copyout(&t32, tp, sizeof (t32)) != 0) {
 216                         return (set_errno(EFAULT));
 217                 }
 218 
 219                 return (0);
 220         }
 221 #endif
 222 
 223         if (copyout(&t, tp, sizeof (t)) != 0) {
 224                 return (set_errno(EFAULT));
 225         }
 226 
 227         return (0);
 228 }
 229 
 230 static int
 231 lx_emul_clock_getres(clockid_t clock, timespec_t *tp)
 232 {
 233         timespec_t t;
 234 
 235         if (tp == NULL) {
 236                 return (0);
 237         }
 238 
 239         switch (clock) {
 240         case CLOCK_PROCESS_CPUTIME_ID:
 241         case CLOCK_THREAD_CPUTIME_ID:
 242                 /*
 243                  * These clock backends return microstate accounting values for
 244                  * the LWP or the entire process.  The Linux kernel claims they
 245                  * have nanosecond resolution; so will we.
 246                  */
 247                 t.tv_sec = 0;
 248                 t.tv_nsec = 1;
 249                 break;
 250 
 251         default:
 252                 return (set_errno(EINVAL));
 253         }
 254 
 255 #if defined(_SYSCALL32_IMPL)
 256         if (get_udatamodel() != DATAMODEL_NATIVE) {
 257                 timespec32_t t32;
 258 
 259                 if (TIMESPEC_OVERFLOW(&t)) {
 260                         return (set_errno(EOVERFLOW));
 261                 }
 262                 TIMESPEC_TO_TIMESPEC32(&t32, &t);
 263 
 264                 if (copyout(&t32, tp, sizeof (t32)) != 0) {
 265                         return (set_errno(EFAULT));
 266                 }
 267 
 268                 return (0);
 269         }
 270 #endif
 271 
 272         if (copyout(&t, tp, sizeof (t)) != 0) {
 273                 return (set_errno(EFAULT));
 274         }
 275 
 276         return (0);
 277 }
 278 
 279 static void
 280 lx_clock_unsupported(int clock)
 281 {
 282         char buf[100];
 283 
 284         (void) snprintf(buf, sizeof (buf), "unsupported clock: %d", clock);
 285         lx_unsupported(buf);
 286 }
 287 
 288 long
 289 lx_clock_settime(int clock, timespec_t *tp)
 290 {
 291         lx_clock_backend_t *backend;
 292 
 293         if ((backend = LX_CLOCK_BACKEND(clock)) == NULL) {
 294                 lx_clock_unsupported(clock);
 295                 return (set_errno(EINVAL));
 296         }
 297 
 298         return (backend->lclk_clock_settime(backend->lclk_ntv_id, tp));
 299 }
 300 
 301 long
 302 lx_clock_gettime(int clock, timespec_t *tp)
 303 {
 304         lx_clock_backend_t *backend;
 305 
 306         if ((backend = LX_CLOCK_BACKEND(clock)) == NULL) {
 307                 lx_clock_unsupported(clock);
 308                 return (set_errno(EINVAL));
 309         }
 310 
 311         return (backend->lclk_clock_gettime(backend->lclk_ntv_id, tp));
 312 }
 313 
 314 long
 315 lx_clock_getres(int clock, timespec_t *tp)
 316 {
 317         lx_clock_backend_t *backend;
 318 
 319         if ((backend = LX_CLOCK_BACKEND(clock)) == NULL) {
 320                 lx_clock_unsupported(clock);
 321                 return (set_errno(EINVAL));
 322         }
 323 
 324         /*
 325          * It is important this check is performed after the clock
 326          * check. Both glibc and musl, in their clock_getcpuclockid(),
 327          * use clock_getres() with a NULL tp to validate a clock
 328          * value. Performing the tp check before the clock check could
 329          * indicate a valid clock to libc when it shouldn't.
 330          */
 331         if (tp == NULL) {
 332                 return (0);
 333         }
 334 
 335         return (backend->lclk_clock_getres(backend->lclk_ntv_id, tp));
 336 }
 337 
 338 static int
 339 lx_ltos_sigev(lx_sigevent_t *lev, struct sigevent *sev)
 340 {
 341         bzero(sev, sizeof (*sev));
 342 
 343         switch (lev->lx_sigev_notify) {
 344         case LX_SIGEV_NONE:
 345                 sev->sigev_notify = SIGEV_NONE;
 346                 break;
 347 
 348         case LX_SIGEV_SIGNAL:
 349         case LX_SIGEV_THREAD_ID:
 350                 sev->sigev_notify = SIGEV_SIGNAL;
 351                 break;
 352 
 353         case LX_SIGEV_THREAD:
 354                 /*
 355                  * Just as in illumos, SIGEV_THREAD handling is performed in
 356                  * userspace with the help of SIGEV_SIGNAL/SIGEV_THREAD_ID.
 357                  *
 358                  * It's not expected to make an appearance in the syscall.
 359                  */
 360         default:
 361                 return (EINVAL);
 362         }
 363 
 364         sev->sigev_signo = lx_ltos_signo(lev->lx_sigev_signo, 0);
 365         sev->sigev_value = lev->lx_sigev_value;
 366 
 367         /* Ensure SIGEV_SIGNAL has a valid signo to work with. */
 368         if (sev->sigev_notify == SIGEV_SIGNAL && sev->sigev_signo == 0) {
 369                 return (EINVAL);
 370         }
 371         return (0);
 372 }
 373 
 374 static int
 375 lx_sigev_copyin(lx_sigevent_t *userp, lx_sigevent_t *levp)
 376 {
 377 #ifdef _SYSCALL32_IMPL
 378         if (get_udatamodel() != DATAMODEL_NATIVE) {
 379                 lx_sigevent32_t lev32;
 380 
 381                 if (copyin(userp, &lev32, sizeof (lev32)) != 0) {
 382                         return (EFAULT);
 383                 }
 384                 levp->lx_sigev_value.sival_int = lev32.lx_sigev_value.sival_int;
 385                 levp->lx_sigev_signo = lev32.lx_sigev_signo;
 386                 levp->lx_sigev_notify = lev32.lx_sigev_notify;
 387                 levp->lx_sigev_un.lx_tid = lev32.lx_sigev_un.lx_tid;
 388         } else
 389 #endif /* _SYSCALL32_IMPL */
 390         {
 391                 if (copyin(userp, levp, sizeof (lx_sigevent_t)) != 0) {
 392                         return (EFAULT);
 393                 }
 394         }
 395         return (0);
 396 }
 397 
 398 static void
 399 lx_sigev_thread_fire(itimer_t *it)
 400 {
 401         proc_t *p = it->it_proc;
 402         pid_t lpid = (pid_t)LX_SIGEV_THREAD_ID_LPID(it);
 403         id_t tid = (id_t)LX_SIGEV_THREAD_ID_TID(it);
 404         lwpdir_t *ld;
 405 
 406         ASSERT(MUTEX_HELD(&it->it_mutex));
 407         ASSERT(it->it_pending == 0);
 408         ASSERT(it->it_flags & IT_SIGNAL);
 409         ASSERT(MUTEX_HELD(&p->p_lock));
 410 
 411         ld = lwp_hash_lookup(p, tid);
 412         if (ld != NULL) {
 413                 lx_lwp_data_t *lwpd;
 414                 kthread_t *t;
 415 
 416                 t = ld->ld_entry->le_thread;
 417                 lwpd = ttolxlwp(t);
 418                 if (lwpd != NULL && lwpd->br_pid == lpid) {
 419                         /*
 420                          * A thread matching the LX pid is still present in the
 421                          * process.  Send a targeted signal as requested.
 422                          */
 423                         it->it_pending = 1;
 424                         mutex_exit(&it->it_mutex);
 425                         sigaddqa(p, t, it->it_sigq);
 426                         return;
 427                 }
 428         }
 429 
 430         mutex_exit(&it->it_mutex);
 431 }
 432 
 433 long
 434 lx_timer_create(int clock, lx_sigevent_t *sevp, timer_t *tidp)
 435 {
 436         int error;
 437         lx_sigevent_t lev;
 438         struct sigevent sev;
 439         clock_backend_t *backend = NULL;
 440         proc_t *p = curproc;
 441         itimer_t *itp;
 442         timer_t tid;
 443 
 444         if (clock == -2) {
 445                 /*
 446                  * A change was made to the old userspace timer emulation to
 447                  * handle this specific clock ID for MapR.  It was wrongly
 448                  * mapped to CLOCK_REALTIME rather than CLOCK_THREAD_CPUTIME_ID
 449                  * which it maps to.  Until the CLOCK_*_CPUTIME_ID timers can
 450                  * be emulated, the admittedly incorrect mapping will remain.
 451                  */
 452                 backend = clock_get_backend(CLOCK_REALTIME);
 453         } else {
 454                 lx_clock_backend_t *lback = LX_CLOCK_BACKEND(clock);
 455 
 456                 if (lback != NULL) {
 457                         backend = clock_get_backend(lback->lclk_ntv_id);
 458                 }
 459         }
 460         if (backend == NULL) {
 461                 return (set_errno(EINVAL));
 462         }
 463 
 464         /* We have to convert the Linux sigevent layout to the illumos layout */
 465         if (sevp != NULL) {
 466                 if ((error = lx_sigev_copyin(sevp, &lev)) != 0) {
 467                         return (set_errno(error));
 468                 }
 469                 if ((error = lx_ltos_sigev(&lev, &sev)) != 0) {
 470                         return (set_errno(error));
 471                 }
 472         } else {
 473                 bzero(&sev, sizeof (sev));
 474                 sev.sigev_notify = SIGEV_SIGNAL;
 475                 sev.sigev_signo = SIGALRM;
 476         }
 477 
 478         if ((error = timer_setup(backend, &sev, NULL, &itp, &tid)) != 0) {
 479                 return (set_errno(error));
 480         }
 481 
 482         /*
 483          * The SIGEV_THREAD_ID notification method in Linux allows the caller
 484          * to target a specific thread to receive the signal.  The IT_CALLBACK
 485          * timer functionality is used to fulfill this need.  After translating
 486          * the LX pid to a SunOS thread ID (ensuring it exists in the current
 487          * process), those IDs are attached to the timer along with the custom
 488          * lx_sigev_thread_fire callback.  This targets the signal notification
 489          * properly when the timer fires.
 490          */
 491         if (lev.lx_sigev_notify == LX_SIGEV_THREAD_ID) {
 492                 pid_t lpid, spid;
 493                 id_t stid;
 494 
 495                 lpid = (pid_t)lev.lx_sigev_un.lx_tid;
 496                 if (lx_lpid_to_spair(lpid, &spid, &stid) != 0 ||
 497                     spid != curproc->p_pid) {
 498                         error = EINVAL;
 499                         goto err;
 500                 }
 501 
 502                 itp->it_flags |= IT_CALLBACK;
 503                 itp->it_cb_func = lx_sigev_thread_fire;
 504                 LX_SIGEV_THREAD_ID_LPID(itp) = lpid;
 505                 LX_SIGEV_THREAD_ID_TID(itp) = stid;
 506         }
 507 
 508         /*
 509          * When the sigevent is not specified, its sigev_value field is
 510          * expected to be populated with the timer ID.
 511          */
 512         if (sevp == NULL) {
 513                 itp->it_sigq->sq_info.si_value.sival_int = tid;
 514         }
 515 
 516         if (copyout(&tid, tidp, sizeof (timer_t)) != 0) {
 517                 error = EFAULT;
 518                 goto err;
 519         }
 520 
 521         timer_release(p, itp);
 522         return (0);
 523 
 524 err:
 525         timer_delete_grabbed(p, tid, itp);
 526         return (set_errno(error));
 527 }
 528 
 529 long
 530 lx_gettimeofday(struct timeval *tvp, struct lx_timezone *tzp)
 531 {
 532         struct lx_timezone tz;
 533 
 534         bzero(&tz, sizeof (tz));
 535 
 536         /*
 537          * We want to be similar to libc which just does a fasttrap to
 538          * gethrestime and simply converts that result. We follow how uniqtime
 539          * does the conversion but we can't use that code since it does some
 540          * extra work which can cause the result to bounce around based on which
 541          * CPU we run on.
 542          */
 543         if (tvp != NULL) {
 544                 struct timeval tv;
 545                 timestruc_t ts;
 546                 int usec, nsec;
 547 
 548                 gethrestime(&ts);
 549                 nsec = ts.tv_nsec;
 550                 usec = nsec + (nsec >> 2);
 551                 usec = nsec + (usec >> 1);
 552                 usec = nsec + (usec >> 2);
 553                 usec = nsec + (usec >> 4);
 554                 usec = nsec - (usec >> 3);
 555                 usec = nsec + (usec >> 2);
 556                 usec = nsec + (usec >> 3);
 557                 usec = nsec + (usec >> 4);
 558                 usec = nsec + (usec >> 1);
 559                 usec = nsec + (usec >> 6);
 560                 usec = usec >> 10;
 561 
 562                 tv.tv_sec = ts.tv_sec;
 563                 tv.tv_usec = usec;
 564 
 565                 if (get_udatamodel() == DATAMODEL_NATIVE) {
 566                         if (copyout(&tv, tvp, sizeof (tv)) != 0)
 567                                 return (set_errno(EFAULT));
 568                 }
 569 #ifdef _SYSCALL32_IMPL
 570                 else {
 571                         struct timeval32 tv32;
 572 
 573                         if (TIMEVAL_OVERFLOW(&tv))
 574                                 return (set_errno(EOVERFLOW));
 575                         TIMEVAL_TO_TIMEVAL32(&tv32, &tv);
 576 
 577                         if (copyout(&tv32, tvp, sizeof (tv32)))
 578                                 return (set_errno(EFAULT));
 579                 }
 580 #endif
 581         }
 582 
 583         /*
 584          * The Linux man page states use of the second parameter is obsolete,
 585          * but gettimeofday(2) should still return EFAULT if it is set
 586          * to a bad non-NULL pointer (sigh...)
 587          */
 588         if (tzp != NULL && copyout(&tz, tzp, sizeof (tz)) != 0)
 589                 return (set_errno(EFAULT));
 590 
 591         return (0);
 592 }
 593 
 594 /*
 595  * On Linux a bad buffer will set errno to EFAULT, and on Illumos the failure
 596  * mode is documented as "undefined."
 597  */
 598 long
 599 lx_time(time_t *tp)
 600 {
 601         timestruc_t ts;
 602         struct timeval tv;
 603 
 604         gethrestime(&ts);
 605         tv.tv_sec = ts.tv_sec;
 606         tv.tv_usec = 0;
 607 
 608         if (get_udatamodel() == DATAMODEL_NATIVE) {
 609                 if (tp != NULL &&
 610                     copyout(&tv.tv_sec, tp, sizeof (tv.tv_sec)) != 0)
 611                         return (set_errno(EFAULT));
 612 
 613                 return (tv.tv_sec);
 614         }
 615 #ifdef _SYSCALL32_IMPL
 616         else {
 617                 struct timeval32 tv32;
 618 
 619                 if (TIMEVAL_OVERFLOW(&tv))
 620                         return (set_errno(EOVERFLOW));
 621                 TIMEVAL_TO_TIMEVAL32(&tv32, &tv);
 622 
 623                 if (tp != NULL &&
 624                     copyout(&tv32.tv_sec, tp, sizeof (tv32.tv_sec)))
 625                         return (set_errno(EFAULT));
 626 
 627                 return (tv32.tv_sec);
 628         }
 629 #endif /* _SYSCALL32_IMPL */
 630         /* NOTREACHED */
 631 }
 632 
 633 long
 634 lx_nanosleep(timespec_t *rqtp, timespec_t *rmtp)
 635 {
 636         return (nanosleep(rqtp, rmtp));
 637 }