Print this page
    
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/os/exec.c
          +++ new/usr/src/uts/common/os/exec.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   */
  25   25  
  26   26  /*      Copyright (c) 1988 AT&T */
  27   27  /*        All Rights Reserved   */
  28   28  /*
  29   29   * Copyright 2015, Joyent, Inc.  All rights reserved.
  30   30   */
  31   31  
  32   32  #include <sys/types.h>
  33   33  #include <sys/param.h>
  34   34  #include <sys/sysmacros.h>
  35   35  #include <sys/systm.h>
  36   36  #include <sys/signal.h>
  37   37  #include <sys/cred_impl.h>
  38   38  #include <sys/policy.h>
  39   39  #include <sys/user.h>
  40   40  #include <sys/errno.h>
  41   41  #include <sys/file.h>
  42   42  #include <sys/vfs.h>
  43   43  #include <sys/vnode.h>
  44   44  #include <sys/mman.h>
  45   45  #include <sys/acct.h>
  46   46  #include <sys/cpuvar.h>
  47   47  #include <sys/proc.h>
  48   48  #include <sys/cmn_err.h>
  49   49  #include <sys/debug.h>
  50   50  #include <sys/pathname.h>
  51   51  #include <sys/vm.h>
  52   52  #include <sys/lgrp.h>
  53   53  #include <sys/vtrace.h>
  54   54  #include <sys/exec.h>
  55   55  #include <sys/exechdr.h>
  56   56  #include <sys/kmem.h>
  57   57  #include <sys/prsystm.h>
  58   58  #include <sys/modctl.h>
  59   59  #include <sys/vmparam.h>
  60   60  #include <sys/door.h>
  61   61  #include <sys/schedctl.h>
  62   62  #include <sys/utrap.h>
  63   63  #include <sys/systeminfo.h>
  64   64  #include <sys/stack.h>
  65   65  #include <sys/rctl.h>
  66   66  #include <sys/dtrace.h>
  67   67  #include <sys/lwpchan_impl.h>
  68   68  #include <sys/pool.h>
  69   69  #include <sys/sdt.h>
  70   70  #include <sys/brand.h>
  71   71  #include <sys/klpd.h>
  72   72  #include <sys/random.h>
  73   73  
  74   74  #include <c2/audit.h>
  75   75  
  76   76  #include <vm/hat.h>
  77   77  #include <vm/anon.h>
  78   78  #include <vm/as.h>
  79   79  #include <vm/seg.h>
  80   80  #include <vm/seg_vn.h>
   81   81  
            /*
             * Flag bits describing how credentials must be adjusted across an
             * exec.  gexec() obtains them as the return value of execsetid()
             * and ORs in additional bits itself.
             */
   82   82  #define PRIV_RESET              0x01    /* needs to reset privs */
   83   83  #define PRIV_SETID              0x02    /* needs to change uids */
   84   84  #define PRIV_SETUGID            0x04    /* is setuid/setgid/forced privs */
   85   85  #define PRIV_INCREASE           0x08    /* child runs with more privs */
   86   86  #define MAC_FLAGS               0x10    /* need to adjust MAC flags */
   87   87  #define PRIV_FORCED             0x20    /* has forced privileges */
   88   88  
            /* Prototypes for file-local (static) helpers. */
   89   89  static int execsetid(struct vnode *, struct vattr *, uid_t *, uid_t *,
   90   90      priv_set_t *, cred_t *, const char *);
   91   91  static int hold_execsw(struct execsw *);
   92   92  
   93   93  uint_t auxv_hwcap = 0;  /* auxv AT_SUN_HWCAP value; determined on the fly */
   94   94  uint_t auxv_hwcap_2 = 0;        /* AT_SUN_HWCAP2 */
   95   95  #if defined(_SYSCALL32_IMPL)
   96   96  uint_t auxv_hwcap32 = 0;        /* 32-bit version of auxv_hwcap */
   97   97  uint_t auxv_hwcap32_2 = 0;      /* 32-bit version of auxv_hwcap2 */
   98   98  #endif
   99   99  
            /*
             * p_flag bits that gexec() saves and clears at level 0 so that they
             * can be restored if the exec encounters an error.
             */
  100  100  #define PSUIDFLAGS              (SNOCD|SUGID)
  101  101  #define RANDOM_LEN      16      /* 16 bytes for AT_RANDOM aux entry */
 102  102  
 103  103  /*
 104  104   * exece() - system call wrapper around exec_common()
 105  105   */
 106  106  int
 107  107  exece(const char *fname, const char **argp, const char **envp)
 108  108  {
 109  109          int error;
 110  110  
 111  111          error = exec_common(fname, argp, envp, EBA_NONE);
 112  112          return (error ? (set_errno(error)) : 0);
 113  113  }
  114  114  
            /*
             * exec_common() - common body of exec().
             *
             * Looks up the executable named by fname, hands it to gexec(), and on
             * success resets the per-process and per-lwp state that must not carry
             * over to the new program (signal stack state, saved resource limits,
             * profiling state, close-on-exec files, lwp directory and tid hash).
             *
             *   fname         path of the executable; read with pn_get() as a
             *                 UIO_USERSPACE address
             *   argp, envp    forwarded untouched to gexec() inside a struct execa
             *   brand_action  compared against EBA_NONE and EBA_NATIVE to decide
             *                 whether the process is branded before the exec or
             *                 unbranded after it
             *
             * Returns 0 on success and a non-zero error value otherwise.  Every
             * failure path restores the caller's saved signal hold mask and calls
             * prexecend() before returning.
             */
  115  115  int
  116  116  exec_common(const char *fname, const char **argp, const char **envp,
  117  117      int brand_action)
  118  118  {
  119  119          vnode_t *vp = NULL, *dir = NULL, *tmpvp = NULL;
  120  120          proc_t *p = ttoproc(curthread);
  121  121          klwp_t *lwp = ttolwp(curthread);
  122  122          struct user *up = PTOU(p);
  123  123          long execsz;            /* temporary count of exec size */
  124  124          int i;
  125  125          int error;
  126  126          char exec_file[MAXCOMLEN+1];
  127  127          struct pathname pn;
  128  128          struct pathname resolvepn;
  129  129          struct uarg args;
  130  130          struct execa ua;
  131  131          k_sigset_t savedmask;
  132  132          lwpdir_t *lwpdir = NULL;
  133  133          tidhash_t *tidhash;
  134  134          lwpdir_t *old_lwpdir = NULL;
  135  135          uint_t old_lwpdir_sz;
  136  136          tidhash_t *old_tidhash;
  137  137          uint_t old_tidhash_sz;
  138  138          ret_tidhash_t *ret_tidhash;
  139  139          lwpent_t *lep;
  140  140          boolean_t brandme = B_FALSE;
  141  141  
  142  142          /*
  143  143           * exec() is not supported for the /proc agent lwp.
  144  144           */
  145  145          if (curthread == p->p_agenttp)
  146  146                  return (ENOTSUP);
  147  147  
  148  148          if (brand_action != EBA_NONE) {
  149  149                  /*
  150  150                   * Brand actions are not supported for processes that are not
  151  151                   * running in a branded zone.
  152  152                   */
  153  153                  if (!ZONE_IS_BRANDED(p->p_zone))
  154  154                          return (ENOTSUP);
  155  155  
  156  156                  if (brand_action == EBA_NATIVE) {
  157  157                          /* Only branded processes can be unbranded */
  158  158                          if (!PROC_IS_BRANDED(p))
  159  159                                  return (ENOTSUP);
  160  160                  } else {
  161  161                          /* Only unbranded processes can be branded */
  162  162                          if (PROC_IS_BRANDED(p))
  163  163                                  return (ENOTSUP);
  164  164                          brandme = B_TRUE;
  165  165                  }
  166  166          } else {
  167  167                  /*
  168  168                   * If this is a native zone, or if the process is already
  169  169                   * branded, then we don't need to do anything.  If this is
  170  170                   * a native process in a branded zone, we need to brand the
  171  171                   * process as it exec()s the new binary.
  172  172                   */
  173  173                  if (ZONE_IS_BRANDED(p->p_zone) && !PROC_IS_BRANDED(p))
  174  174                          brandme = B_TRUE;
  175  175          }
  176  176  
  177  177          /*
  178  178           * Inform /proc that an exec() has started.
  179  179           * Hold signals that are ignored by default so that we will
  180  180           * not be interrupted by a signal that will be ignored after
  181  181           * successful completion of gexec().
  182  182           */
  183  183          mutex_enter(&p->p_lock);
  184  184          prexecstart();
  185  185          schedctl_finish_sigblock(curthread);
  186  186          savedmask = curthread->t_hold;
  187  187          sigorset(&curthread->t_hold, &ignoredefault);
  188  188          mutex_exit(&p->p_lock);
  189  189  
  190  190          /*
  191  191           * Look up path name and remember last component for later.
  192  192           * To help coreadm expand its %d token, we attempt to save
  193  193           * the directory containing the executable in p_execdir. The
  194  194           * first call to lookuppn() may fail and return EINVAL because
  195  195           * dirvpp is non-NULL. In that case, we make a second call to
  196  196           * lookuppn() with dirvpp set to NULL; p_execdir will be NULL,
  197  197           * but coreadm is allowed to expand %d to the empty string and
  198  198           * there are other cases in which that failure may occur.
  199  199           */
  200  200          if ((error = pn_get((char *)fname, UIO_USERSPACE, &pn)) != 0)
  201  201                  goto out;
  202  202          pn_alloc(&resolvepn);
  203  203          if ((error = lookuppn(&pn, &resolvepn, FOLLOW, &dir, &vp)) != 0) {
  204  204                  pn_free(&resolvepn);
  205  205                  pn_free(&pn);
  206  206                  if (error != EINVAL)
  207  207                          goto out;
  208  208  
                            /*
                             * Retry from scratch, this time without asking for the
                             * parent directory vnode.
                             */
  209  209                  dir = NULL;
  210  210                  if ((error = pn_get((char *)fname, UIO_USERSPACE, &pn)) != 0)
  211  211                          goto out;
  212  212                  pn_alloc(&resolvepn);
  213  213                  if ((error = lookuppn(&pn, &resolvepn, FOLLOW, NULLVPP,
  214  214                      &vp)) != 0) {
  215  215                          pn_free(&resolvepn);
  216  216                          pn_free(&pn);
  217  217                          goto out;
  218  218                  }
  219  219          }
  220  220          if (vp == NULL) {
  221  221                  if (dir != NULL)
  222  222                          VN_RELE(dir);
  223  223                  error = ENOENT;
  224  224                  pn_free(&resolvepn);
  225  225                  pn_free(&pn);
  226  226                  goto out;
  227  227          }
  228  228  
  229  229          if ((error = secpolicy_basic_exec(CRED(), vp)) != 0) {
  230  230                  if (dir != NULL)
  231  231                          VN_RELE(dir);
  232  232                  pn_free(&resolvepn);
  233  233                  pn_free(&pn);
  234  234                  VN_RELE(vp);
  235  235                  goto out;
  236  236          }
  237  237  
  238  238          /*
  239  239           * We do not allow executing files in attribute directories.
  240  240           * We test this by determining whether the resolved path
  241  241           * contains a "/" when we're in an attribute directory;
  242  242           * only if the pathname does not contain a "/" the resolved path
  243  243           * points to a file in the current working (attribute) directory.
  244  244           */
  245  245          if ((p->p_user.u_cdir->v_flag & V_XATTRDIR) != 0 &&
  246  246              strchr(resolvepn.pn_path, '/') == NULL) {
  247  247                  if (dir != NULL)
  248  248                          VN_RELE(dir);
  249  249                  error = EACCES;
  250  250                  pn_free(&resolvepn);
  251  251                  pn_free(&pn);
  252  252                  VN_RELE(vp);
  253  253                  goto out;
  254  254          }
  255  255  
                    /*
                     * exec_file is zeroed over MAXCOMLEN+1 bytes and at most MAXCOMLEN
                     * bytes are copied into it, so it is always NUL-terminated.
                     */
  256  256          bzero(exec_file, MAXCOMLEN+1);
  257  257          (void) strncpy(exec_file, pn.pn_path, MAXCOMLEN);
  258  258          bzero(&args, sizeof (args));
  259  259          args.pathname = resolvepn.pn_path;
  260  260          /* don't free resolvepn until we are done with args */
  261  261          pn_free(&pn);
  262  262  
  263  263          /*
  264  264           * If we're running in a profile shell, then call pfexecd.
  265  265           */
  266  266          if ((CR_FLAGS(p->p_cred) & PRIV_PFEXEC) != 0) {
  267  267                  error = pfexec_call(p->p_cred, &resolvepn, &args.pfcred,
  268  268                      &args.scrubenv);
  269  269  
  270  270                  /* Returning errno in case we're not allowed to execute. */
                            /*
                             * Only a positive value is treated as a failure here.
                             * NOTE(review): the meaning of a negative return from
                             * pfexec_call() is not visible in this function - confirm.
                             */
  271  271                  if (error > 0) {
  272  272                          if (dir != NULL)
  273  273                                  VN_RELE(dir);
  274  274                          pn_free(&resolvepn);
  275  275                          VN_RELE(vp);
  276  276                          goto out;
  277  277                  }
  278  278  
  279  279                  /* Don't change the credentials when using old ptrace. */
  280  280                  if (args.pfcred != NULL &&
  281  281                      (p->p_proc_flag & P_PR_PTRACE) != 0) {
  282  282                          crfree(args.pfcred);
  283  283                          args.pfcred = NULL;
  284  284                          args.scrubenv = B_FALSE;
  285  285                  }
  286  286          }
  287  287  
  288  288          /*
  289  289           * Specific exec handlers, or policies determined via
  290  290           * /etc/system may override the historical default.
  291  291           */
  292  292          args.stk_prot = PROT_ZFOD;
  293  293          args.dat_prot = PROT_ZFOD;
  294  294  
  295  295          CPU_STATS_ADD_K(sys, sysexec, 1);
  296  296          DTRACE_PROC1(exec, char *, args.pathname);
  297  297  
  298  298          ua.fname = fname;
  299  299          ua.argp = argp;
  300  300          ua.envp = envp;
  301  301  
  302  302          /* If necessary, brand this process/lwp before we start the exec. */
  303  303          if (brandme) {
  304  304                  void *brand_data = NULL;
  305  305  
  306  306                  /*
  307  307                   * Process branding may fail if multiple LWPs are present and
  308  308                   * holdlwps() cannot complete successfully.
  309  309                   */
  310  310                  error = brand_setbrand(p, B_TRUE);
  311  311  
  312  312                  if (error == 0 && BROP(p)->b_lwpdata_alloc != NULL) {
  313  313                          brand_data = BROP(p)->b_lwpdata_alloc(p);
  314  314                          if (brand_data == NULL) {
                                            /*
                                             * Force the failure branch below; the literal
                                             * 1 is what the caller will see as the error.
                                             * NOTE(review): brand_setbrand() has already
                                             * succeeded here, yet this path does not call
                                             * brand_clearbrand() the way the gexec()
                                             * failure path does - confirm intent, and
                                             * confirm 1 is the intended errno value.
                                             */
  315  315                                  error = 1;
  316  316                          }
  317  317                  }
  318  318  
  319  319                  if (error == 0) {
  320  320                          mutex_enter(&p->p_lock);
  321  321                          BROP(p)->b_initlwp(lwp, brand_data);
  322  322                          mutex_exit(&p->p_lock);
  323  323                  } else {
  324  324                          VN_RELE(vp);
  325  325                          if (dir != NULL) {
  326  326                                  VN_RELE(dir);
  327  327                          }
  328  328                          pn_free(&resolvepn);
  329  329                          goto fail;
  330  330                  }
  331  331          }
  332  332  
                    /*
                     * brand_action is passed by address, so gexec() may update the
                     * value that is tested against EBA_NATIVE further down.
                     */
  333  333          if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz,
  334  334              exec_file, p->p_cred, &brand_action)) != 0) {
                            /* Undo the branding performed above before bailing out. */
  335  335                  if (brandme) {
  336  336                          BROP(p)->b_freelwp(lwp);
  337  337                          brand_clearbrand(p, B_TRUE);
  338  338                  }
  339  339                  VN_RELE(vp);
  340  340                  if (dir != NULL)
  341  341                          VN_RELE(dir);
  342  342                  pn_free(&resolvepn);
  343  343                  goto fail;
  344  344          }
  345  345  
  346  346          /*
  347  347           * Free floating point registers (sun4u only)
  348  348           */
  349  349          ASSERT(lwp != NULL);
  350  350          lwp_freeregs(lwp, 1);
  351  351  
  352  352          /*
  353  353           * Free thread and process context ops.
  354  354           */
  355  355          if (curthread->t_ctx)
  356  356                  freectx(curthread, 1);
  357  357          if (p->p_pctx)
  358  358                  freepctx(p, 1);
  359  359  
  360  360          /*
  361  361           * Remember file name for accounting; clear any cached DTrace predicate.
  362  362           */
  363  363          up->u_acflag &= ~AFORK;
  364  364          bcopy(exec_file, up->u_comm, MAXCOMLEN+1);
  365  365          curthread->t_predcache = NULL;
  366  366  
  367  367          /*
  368  368           * Clear contract template state
  369  369           */
  370  370          lwp_ctmpl_clear(lwp, B_TRUE);
  371  371  
  372  372          /*
  373  373           * Save the directory in which we found the executable for expanding
  374  374           * the %d token used in core file patterns.
  375  375           */
  376  376          mutex_enter(&p->p_lock);
  377  377          tmpvp = p->p_execdir;
  378  378          p->p_execdir = dir;
  379  379          if (p->p_execdir != NULL)
  380  380                  VN_HOLD(p->p_execdir);
  381  381          mutex_exit(&p->p_lock);
  382  382  
                    /* The previous p_execdir is released after p_lock is dropped. */
  383  383          if (tmpvp != NULL)
  384  384                  VN_RELE(tmpvp);
  385  385  
  386  386          /*
  387  387           * Reset stack state to the user stack, clear set of signals
  388  388           * caught on the signal stack, and reset list of signals that
  389  389           * restart system calls; the new program's environment should
  390  390           * not be affected by detritus from the old program.  Any
  391  391           * pending held signals remain held, so don't clear t_hold.
  392  392           */
  393  393          mutex_enter(&p->p_lock);
  394  394          DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp,
  395  395              uintptr_t, lwp->lwp_oldcontext, uintptr_t, 0);
  396  396          lwp->lwp_oldcontext = 0;
  397  397          lwp->lwp_ustack = 0;
  398  398          lwp->lwp_old_stk_ctl = 0;
  399  399          sigemptyset(&up->u_signodefer);
  400  400          sigemptyset(&up->u_sigonstack);
  401  401          sigemptyset(&up->u_sigresethand);
  402  402          lwp->lwp_sigaltstack.ss_sp = 0;
  403  403          lwp->lwp_sigaltstack.ss_size = 0;
  404  404          lwp->lwp_sigaltstack.ss_flags = SS_DISABLE;
  405  405  
  406  406          /*
  407  407           * Make saved resource limit == current resource limit.
  408  408           */
  409  409          for (i = 0; i < RLIM_NLIMITS; i++) {
  410  410                  /*CONSTCOND*/
  411  411                  if (RLIM_SAVED(i)) {
  412  412                          (void) rctl_rlimit_get(rctlproc_legacy[i], p,
  413  413                              &up->u_saved_rlimit[i]);
  414  414                  }
  415  415          }
  416  416  
  417  417          /*
  418  418           * If the action was to catch the signal, then the action
  419  419           * must be reset to SIG_DFL.
  420  420           */
  421  421          sigdefault(p);
  422  422          p->p_flag &= ~(SNOWAIT|SJCTL);
  423  423          p->p_flag |= (SEXECED|SMSACCT|SMSFORK);
  424  424          up->u_signal[SIGCLD - 1] = SIG_DFL;
  425  425  
  426  426          /*
  427  427           * Delete the dot4 sigqueues/signotifies.
  428  428           */
  429  429          sigqfree(p);
  430  430  
  431  431          mutex_exit(&p->p_lock);
  432  432  
                    /* Clear the profiling state under its own lock, p_pflock. */
  433  433          mutex_enter(&p->p_pflock);
  434  434          p->p_prof.pr_base = NULL;
  435  435          p->p_prof.pr_size = 0;
  436  436          p->p_prof.pr_off = 0;
  437  437          p->p_prof.pr_scale = 0;
  438  438          p->p_prof.pr_samples = 0;
  439  439          mutex_exit(&p->p_pflock);
  440  440  
  441  441          ASSERT(curthread->t_schedctl == NULL);
  442  442  
  443  443  #if defined(__sparc)
  444  444          if (p->p_utraps != NULL)
  445  445                  utrap_free(p);
  446  446  #endif  /* __sparc */
  447  447  
  448  448          /*
  449  449           * Close all close-on-exec files.
  450  450           */
  451  451          close_exec(P_FINFO(p));
  452  452          TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up);
  453  453  
  454  454          /* Unbrand ourself if necessary. */
  455  455          if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE)) {
  456  456                  BROP(p)->b_freelwp(lwp);
  457  457                  brand_clearbrand(p, B_FALSE);
  458  458          }
  459  459  
  460  460          setregs(&args);
  461  461  
  462  462          /* Mark this as an executable vnode */
  463  463          mutex_enter(&vp->v_lock);
  464  464          vp->v_flag |= VVMEXEC;
  465  465          mutex_exit(&vp->v_lock);
  466  466  
                    /*
                     * Drop the references taken by the lookup; p_execdir holds its own
                     * reference on dir, taken with VN_HOLD() above.
                     */
  467  467          VN_RELE(vp);
  468  468          if (dir != NULL)
  469  469                  VN_RELE(dir);
  470  470          pn_free(&resolvepn);
  471  471  
  472  472          /*
  473  473           * Allocate a new lwp directory and lwpid hash table if necessary.
  474  474           */
                    /*
                     * Nothing is allocated when this thread is already tid 1 and the
                     * directory already has two slots.  Otherwise the current thread's
                     * existing directory entry is reused if the process has a
                     * directory, and a fresh entry is allocated if it does not.
                     * tidhash and lep are only assigned inside this branch and are only
                     * read below when lwpdir != NULL.
                     */
  475  475          if (curthread->t_tid != 1 || p->p_lwpdir_sz != 2) {
  476  476                  lwpdir = kmem_zalloc(2 * sizeof (lwpdir_t), KM_SLEEP);
  477  477                  lwpdir->ld_next = lwpdir + 1;
  478  478                  tidhash = kmem_zalloc(2 * sizeof (tidhash_t), KM_SLEEP);
  479  479                  if (p->p_lwpdir != NULL)
  480  480                          lep = p->p_lwpdir[curthread->t_dslot].ld_entry;
  481  481                  else
  482  482                          lep = kmem_zalloc(sizeof (*lep), KM_SLEEP);
  483  483          }
  484  484  
  485  485          if (PROC_IS_BRANDED(p))
  486  486                  BROP(p)->b_exec();
  487  487  
  488  488          mutex_enter(&p->p_lock);
  489  489          prbarrier(p);
  490  490  
  491  491          /*
  492  492           * Reset lwp id to the default value of 1.
  493  493           * This is a single-threaded process now
  494  494           * and lwp #1 is lwp_wait()able by default.
  495  495           * The t_unpark flag should not be inherited.
  496  496           */
  497  497          ASSERT(p->p_lwpcnt == 1 && p->p_zombcnt == 0);
  498  498          curthread->t_tid = 1;
  499  499          kpreempt_disable();
  500  500          ASSERT(curthread->t_lpl != NULL);
  501  501          p->p_t1_lgrpid = curthread->t_lpl->lpl_lgrpid;
  502  502          kpreempt_enable();
  503  503          if (p->p_tr_lgrpid != LGRP_NONE && p->p_tr_lgrpid != p->p_t1_lgrpid) {
  504  504                  lgrp_update_trthr_migrations(1);
  505  505          }
  506  506          curthread->t_unpark = 0;
  507  507          curthread->t_proc_flag |= TP_TWAIT;
  508  508          curthread->t_proc_flag &= ~TP_DAEMON;   /* daemons shouldn't exec */
  509  509          p->p_lwpdaemon = 0;                     /* but oh well ... */
  510  510          p->p_lwpid = 1;
  511  511  
  512  512          /*
  513  513           * Install the newly-allocated lwp directory and lwpid hash table
  514  514           * and insert the current thread into the new hash table.
  515  515           */
  516  516          if (lwpdir != NULL) {
  517  517                  old_lwpdir = p->p_lwpdir;
  518  518                  old_lwpdir_sz = p->p_lwpdir_sz;
  519  519                  old_tidhash = p->p_tidhash;
  520  520                  old_tidhash_sz = p->p_tidhash_sz;
  521  521                  p->p_lwpdir = p->p_lwpfree = lwpdir;
  522  522                  p->p_lwpdir_sz = 2;
  523  523                  lep->le_thread = curthread;
  524  524                  lep->le_lwpid = curthread->t_tid;
  525  525                  lep->le_start = curthread->t_start;
  526  526                  lwp_hash_in(p, lep, tidhash, 2, 0);
  527  527                  p->p_tidhash = tidhash;
  528  528                  p->p_tidhash_sz = 2;
  529  529          }
                    /* Detach the retired tid hash list so it can be freed unlocked. */
  530  530          ret_tidhash = p->p_ret_tidhash;
  531  531          p->p_ret_tidhash = NULL;
  532  532  
  533  533          /*
  534  534           * Restore the saved signal mask and
  535  535           * inform /proc that the exec() has finished.
  536  536           */
  537  537          curthread->t_hold = savedmask;
  538  538          prexecend();
  539  539          mutex_exit(&p->p_lock);
                    /*
                     * The replaced directory, hash table and retired hash tables are
                     * freed only now that p_lock has been dropped.
                     */
  540  540          if (old_lwpdir) {
  541  541                  kmem_free(old_lwpdir, old_lwpdir_sz * sizeof (lwpdir_t));
  542  542                  kmem_free(old_tidhash, old_tidhash_sz * sizeof (tidhash_t));
  543  543          }
  544  544          while (ret_tidhash != NULL) {
  545  545                  ret_tidhash_t *next = ret_tidhash->rth_next;
  546  546                  kmem_free(ret_tidhash->rth_tidhash,
  547  547                      ret_tidhash->rth_tidhash_sz * sizeof (tidhash_t));
  548  548                  kmem_free(ret_tidhash, sizeof (*ret_tidhash));
  549  549                  ret_tidhash = next;
  550  550          }
  551  551  
  552  552          ASSERT(error == 0);
  553  553          DTRACE_PROC(exec__success);
  554  554          return (0);
  555  555  
                    /*
                     * Two error exits: "fail" fires the exec__failure probe and then
                     * falls through to "out".  The lookup, policy and pfexec failures
                     * earlier in the function jump straight to "out" and so skip the
                     * probe; the branding and gexec() failures use "fail".
                     */
  556  556  fail:
  557  557          DTRACE_PROC1(exec__failure, int, error);
  558  558  out:            /* error return */
  559  559          mutex_enter(&p->p_lock);
  560  560          curthread->t_hold = savedmask;
  561  561          prexecend();
  562  562          mutex_exit(&p->p_lock);
  563  563          ASSERT(error != 0);
  564  564          return (error);
  565  565  }
 566  566  
 567  567  
 568  568  /*
 569  569   * Perform generic exec duties and switchout to object-file specific
 570  570   * handler.
 571  571   */
 572  572  int
 573  573  gexec(
 574  574          struct vnode **vpp,
 575  575          struct execa *uap,
 576  576          struct uarg *args,
 577  577          struct intpdata *idatap,
 578  578          int level,
 579  579          long *execsz,
 580  580          caddr_t exec_file,
 581  581          struct cred *cred,
 582  582          int *brand_action)
 583  583  {
 584  584          struct vnode *vp, *execvp = NULL;
 585  585          proc_t *pp = ttoproc(curthread);
 586  586          struct execsw *eswp;
 587  587          int error = 0;
 588  588          int suidflags = 0;
 589  589          ssize_t resid;
 590  590          uid_t uid, gid;
 591  591          struct vattr vattr;
 592  592          char magbuf[MAGIC_BYTES];
 593  593          int setid;
 594  594          cred_t *oldcred, *newcred = NULL;
 595  595          int privflags = 0;
 596  596          int setidfl;
 597  597          priv_set_t fset;
 598  598  
 599  599          /*
 600  600           * If the SNOCD or SUGID flag is set, turn it off and remember the
 601  601           * previous setting so we can restore it if we encounter an error.
 602  602           */
 603  603          if (level == 0 && (pp->p_flag & PSUIDFLAGS)) {
 604  604                  mutex_enter(&pp->p_lock);
 605  605                  suidflags = pp->p_flag & PSUIDFLAGS;
 606  606                  pp->p_flag &= ~PSUIDFLAGS;
 607  607                  mutex_exit(&pp->p_lock);
 608  608          }
 609  609  
 610  610          if ((error = execpermissions(*vpp, &vattr, args)) != 0)
 611  611                  goto bad_noclose;
 612  612  
 613  613          /* need to open vnode for stateful file systems */
 614  614          if ((error = VOP_OPEN(vpp, FREAD, CRED(), NULL)) != 0)
 615  615                  goto bad_noclose;
 616  616          vp = *vpp;
 617  617  
 618  618          /*
 619  619           * Note: to support binary compatibility with SunOS a.out
 620  620           * executables, we read in the first four bytes, as the
 621  621           * magic number is in bytes 2-3.
 622  622           */
 623  623          if (error = vn_rdwr(UIO_READ, vp, magbuf, sizeof (magbuf),
 624  624              (offset_t)0, UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid))
 625  625                  goto bad;
 626  626          if (resid != 0)
 627  627                  goto bad;
 628  628  
 629  629          if ((eswp = findexec_by_hdr(magbuf)) == NULL)
 630  630                  goto bad;
 631  631  
 632  632          if (level == 0 &&
 633  633              (privflags = execsetid(vp, &vattr, &uid, &gid, &fset,
 634  634              args->pfcred == NULL ? cred : args->pfcred, args->pathname)) != 0) {
 635  635  
 636  636                  /* Pfcred is a credential with a ref count of 1 */
 637  637  
 638  638                  if (args->pfcred != NULL) {
 639  639                          privflags |= PRIV_INCREASE|PRIV_RESET;
 640  640                          newcred = cred = args->pfcred;
 641  641                  } else {
 642  642                          newcred = cred = crdup(cred);
 643  643                  }
 644  644  
 645  645                  /* If we can, drop the PA bit */
 646  646                  if ((privflags & PRIV_RESET) != 0)
 647  647                          priv_adjust_PA(cred);
 648  648  
 649  649                  if (privflags & PRIV_SETID) {
 650  650                          cred->cr_uid = uid;
 651  651                          cred->cr_gid = gid;
 652  652                          cred->cr_suid = uid;
 653  653                          cred->cr_sgid = gid;
 654  654                  }
 655  655  
 656  656                  if (privflags & MAC_FLAGS) {
 657  657                          if (!(CR_FLAGS(cred) & NET_MAC_AWARE_INHERIT))
 658  658                                  CR_FLAGS(cred) &= ~NET_MAC_AWARE;
 659  659                          CR_FLAGS(cred) &= ~NET_MAC_AWARE_INHERIT;
 660  660                  }
 661  661  
 662  662                  /*
 663  663                   * Implement the privilege updates:
 664  664                   *
 665  665                   * Restrict with L:
 666  666                   *
 667  667                   *      I' = I & L
 668  668                   *
 669  669                   *      E' = P' = (I' + F) & A
 670  670                   *
 671  671                   * But if running under ptrace, we cap I and F with P.
 672  672                   */
 673  673                  if ((privflags & (PRIV_RESET|PRIV_FORCED)) != 0) {
 674  674                          if ((privflags & PRIV_INCREASE) != 0 &&
 675  675                              (pp->p_proc_flag & P_PR_PTRACE) != 0) {
 676  676                                  priv_intersect(&CR_OPPRIV(cred),
 677  677                                      &CR_IPRIV(cred));
 678  678                                  priv_intersect(&CR_OPPRIV(cred), &fset);
 679  679                          }
 680  680                          priv_intersect(&CR_LPRIV(cred), &CR_IPRIV(cred));
 681  681                          CR_EPRIV(cred) = CR_PPRIV(cred) = CR_IPRIV(cred);
 682  682                          if (privflags & PRIV_FORCED) {
 683  683                                  priv_set_PA(cred);
 684  684                                  priv_union(&fset, &CR_EPRIV(cred));
 685  685                                  priv_union(&fset, &CR_PPRIV(cred));
 686  686                          }
 687  687                          priv_adjust_PA(cred);
 688  688                  }
 689  689          } else if (level == 0 && args->pfcred != NULL) {
 690  690                  newcred = cred = args->pfcred;
 691  691                  privflags |= PRIV_INCREASE;
 692  692                  /* pfcred is not forced to adhere to these settings */
 693  693                  priv_intersect(&CR_LPRIV(cred), &CR_IPRIV(cred));
 694  694                  CR_EPRIV(cred) = CR_PPRIV(cred) = CR_IPRIV(cred);
 695  695                  priv_adjust_PA(cred);
 696  696          }
 697  697  
 698  698          /* SunOS 4.x buy-back */
 699  699          if ((vp->v_vfsp->vfs_flag & VFS_NOSETUID) &&
 700  700              (vattr.va_mode & (VSUID|VSGID))) {
 701  701                  char path[MAXNAMELEN];
 702  702                  refstr_t *mntpt = NULL;
 703  703                  int ret = -1;
 704  704  
 705  705                  bzero(path, sizeof (path));
 706  706                  zone_hold(pp->p_zone);
 707  707  
 708  708                  ret = vnodetopath(pp->p_zone->zone_rootvp, vp, path,
 709  709                      sizeof (path), cred);
 710  710  
 711  711                  /* fallback to mountpoint if a path can't be found */
 712  712                  if ((ret != 0) || (ret == 0 && path[0] == '\0'))
 713  713                          mntpt = vfs_getmntpoint(vp->v_vfsp);
 714  714  
 715  715                  if (mntpt == NULL)
 716  716                          zcmn_err(pp->p_zone->zone_id, CE_NOTE,
 717  717                              "!uid %d: setuid execution not allowed, "
 718  718                              "file=%s", cred->cr_uid, path);
 719  719                  else
 720  720                          zcmn_err(pp->p_zone->zone_id, CE_NOTE,
 721  721                              "!uid %d: setuid execution not allowed, "
 722  722                              "fs=%s, file=%s", cred->cr_uid,
 723  723                              ZONE_PATH_TRANSLATE(refstr_value(mntpt),
 724  724                              pp->p_zone), exec_file);
 725  725  
 726  726                  if (!INGLOBALZONE(pp)) {
 727  727                          /* zone_rootpath always has trailing / */
 728  728                          if (mntpt == NULL)
 729  729                                  cmn_err(CE_NOTE, "!zone: %s, uid: %d "
 730  730                                      "setuid execution not allowed, file=%s%s",
 731  731                                      pp->p_zone->zone_name, cred->cr_uid,
 732  732                                      pp->p_zone->zone_rootpath, path + 1);
 733  733                          else
 734  734                                  cmn_err(CE_NOTE, "!zone: %s, uid: %d "
 735  735                                      "setuid execution not allowed, fs=%s, "
 736  736                                      "file=%s", pp->p_zone->zone_name,
 737  737                                      cred->cr_uid, refstr_value(mntpt),
 738  738                                      exec_file);
 739  739                  }
 740  740  
 741  741                  if (mntpt != NULL)
 742  742                          refstr_rele(mntpt);
 743  743  
 744  744                  zone_rele(pp->p_zone);
 745  745          }
 746  746  
 747  747          /*
 748  748           * execsetid() told us whether or not we had to change the
 749  749           * credentials of the process.  In privflags, it told us
 750  750           * whether we gained any privileges or executed a set-uid executable.
 751  751           */
 752  752          setid = (privflags & (PRIV_SETUGID|PRIV_INCREASE|PRIV_FORCED));
 753  753  
 754  754          /*
 755  755           * Use /etc/system variable to determine if the stack
 756  756           * should be marked as executable by default.
 757  757           */
 758  758          if (noexec_user_stack)
 759  759                  args->stk_prot &= ~PROT_EXEC;
 760  760  
 761  761          args->execswp = eswp; /* Save execsw pointer in uarg for exec_func */
 762  762          args->ex_vp = vp;
 763  763  
 764  764          /*
 765  765           * Traditionally, the setid flags told the sub processes whether
 766  766           * the file just executed was set-uid or set-gid; this caused
 767  767           * some confusion as the 'setid' flag did not match the SUGID
 768  768           * process flag which is only set when the uids/gids do not match.
 769  769           * A script set-gid/set-uid to the real uid/gid would start with
 770  770           * /dev/fd/X but an executable would happily trust LD_LIBRARY_PATH.
 771  771           * Now we flag those cases where the calling process cannot
 772  772           * be trusted to influence the newly exec'ed process, either
 773  773           * because it runs with more privileges or when the uids/gids
 774  774           * do in fact not match.
 775  775           * This also makes the runtime linker agree with the on exec
 776  776           * values of SNOCD and SUGID.
 777  777           */
 778  778          setidfl = 0;
 779  779          if (cred->cr_uid != cred->cr_ruid || (cred->cr_rgid != cred->cr_gid &&
 780  780              !supgroupmember(cred->cr_gid, cred))) {
 781  781                  setidfl |= EXECSETID_UGIDS;
 782  782          }
 783  783          if (setid & PRIV_SETUGID)
 784  784                  setidfl |= EXECSETID_SETID;
 785  785          if (setid & PRIV_FORCED)
 786  786                  setidfl |= EXECSETID_PRIVS;
 787  787  
 788  788          execvp = pp->p_exec;
 789  789          if (execvp)
 790  790                  VN_HOLD(execvp);
 791  791  
 792  792          error = (*eswp->exec_func)(vp, uap, args, idatap, level, execsz,
 793  793              setidfl, exec_file, cred, brand_action);
 794  794          rw_exit(eswp->exec_lock);
 795  795          if (error != 0) {
 796  796                  if (execvp)
 797  797                          VN_RELE(execvp);
 798  798                  /*
 799  799                   * If this process's p_exec has been set to the vp of
 800  800                   * the executable by exec_func, we will return without
 801  801                   * calling VOP_CLOSE because proc_exit will close it
 802  802                   * on exit.
 803  803                   */
 804  804                  if (pp->p_exec == vp)
 805  805                          goto bad_noclose;
 806  806                  else
 807  807                          goto bad;
 808  808          }
 809  809  
 810  810          if (level == 0) {
 811  811                  uid_t oruid;
 812  812  
 813  813                  if (execvp != NULL) {
 814  814                          /*
 815  815                           * Close the previous executable only if we are
 816  816                           * at level 0.
 817  817                           */
 818  818                          (void) VOP_CLOSE(execvp, FREAD, 1, (offset_t)0,
 819  819                              cred, NULL);
 820  820                  }
 821  821  
 822  822                  mutex_enter(&pp->p_crlock);
 823  823  
 824  824                  oruid = pp->p_cred->cr_ruid;
 825  825  
 826  826                  if (newcred != NULL) {
 827  827                          /*
 828  828                           * Free the old credentials, and set the new ones.
 829  829                           * Do this for both the process and the (single) thread.
 830  830                           */
 831  831                          crfree(pp->p_cred);
 832  832                          pp->p_cred = cred;      /* cred already held for proc */
 833  833                          crhold(cred);           /* hold new cred for thread */
 834  834                          /*
 835  835                           * DTrace accesses t_cred in probe context.  t_cred
 836  836                           * must always be either NULL, or point to a valid,
 837  837                           * allocated cred structure.
 838  838                           */
 839  839                          oldcred = curthread->t_cred;
 840  840                          curthread->t_cred = cred;
 841  841                          crfree(oldcred);
 842  842  
 843  843                          if (priv_basic_test >= 0 &&
 844  844                              !PRIV_ISASSERT(&CR_IPRIV(newcred),
 845  845                              priv_basic_test)) {
 846  846                                  pid_t pid = pp->p_pid;
 847  847                                  char *fn = PTOU(pp)->u_comm;
 848  848  
 849  849                                  cmn_err(CE_WARN, "%s[%d]: exec: basic_test "
 850  850                                      "privilege removed from E/I", fn, pid);
 851  851                          }
 852  852                  }
 853  853                  /*
 854  854                   * On emerging from a successful exec(), the saved
 855  855                   * uid and gid equal the effective uid and gid.
 856  856                   */
 857  857                  cred->cr_suid = cred->cr_uid;
 858  858                  cred->cr_sgid = cred->cr_gid;
 859  859  
 860  860                  /*
 861  861                   * If the real and effective ids do not match, this
 862  862                   * is a setuid process that should not dump core.
 863  863                   * The group comparison is tricky; we prevent the code
 864  864                   * from flagging SNOCD when executing with an effective gid
 865  865                   * which is a supplementary group.
 866  866                   */
 867  867                  if (cred->cr_ruid != cred->cr_uid ||
 868  868                      (cred->cr_rgid != cred->cr_gid &&
 869  869                      !supgroupmember(cred->cr_gid, cred)) ||
 870  870                      (privflags & PRIV_INCREASE) != 0)
 871  871                          suidflags = PSUIDFLAGS;
 872  872                  else
 873  873                          suidflags = 0;
 874  874  
 875  875                  mutex_exit(&pp->p_crlock);
 876  876                  if (newcred != NULL && oruid != newcred->cr_ruid) {
 877  877                          /* Note that the process remains in the same zone. */
 878  878                          mutex_enter(&pidlock);
 879  879                          upcount_dec(oruid, crgetzoneid(newcred));
 880  880                          upcount_inc(newcred->cr_ruid, crgetzoneid(newcred));
 881  881                          mutex_exit(&pidlock);
 882  882                  }
 883  883                  if (suidflags) {
 884  884                          mutex_enter(&pp->p_lock);
 885  885                          pp->p_flag |= suidflags;
 886  886                          mutex_exit(&pp->p_lock);
 887  887                  }
 888  888                  if (setid && (pp->p_proc_flag & P_PR_PTRACE) == 0) {
 889  889                          /*
 890  890                           * If process is traced via /proc, arrange to
 891  891                           * invalidate the associated /proc vnode.
 892  892                           */
 893  893                          if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE))
 894  894                                  args->traceinval = 1;
 895  895                  }
 896  896  
 897  897                  /*
 898  898                   * If legacy ptrace is enabled, generate the SIGTRAP.
 899  899                   */
 900  900                  if (pp->p_proc_flag & P_PR_PTRACE) {
 901  901                          psignal(pp, SIGTRAP);
 902  902                  }
 903  903  
 904  904                  if (args->traceinval)
 905  905                          prinvalidate(&pp->p_user);
 906  906          }
 907  907          if (execvp)
 908  908                  VN_RELE(execvp);
 909  909          return (0);
 910  910  
 911  911  bad:
 912  912          (void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, cred, NULL);
 913  913  
 914  914  bad_noclose:
 915  915          if (newcred != NULL)
 916  916                  crfree(newcred);
 917  917          if (error == 0)
 918  918                  error = ENOEXEC;
 919  919  
 920  920          if (suidflags) {
 921  921                  mutex_enter(&pp->p_lock);
 922  922                  pp->p_flag |= suidflags;
 923  923                  mutex_exit(&pp->p_lock);
 924  924          }
 925  925          return (error);
 926  926  }
 927  927  
 928  928  extern char *execswnames[];
 929  929  
 930  930  struct execsw *
 931  931  allocate_execsw(char *name, char *magic, size_t magic_size)
 932  932  {
 933  933          int i, j;
 934  934          char *ename;
 935  935          char *magicp;
 936  936  
 937  937          mutex_enter(&execsw_lock);
 938  938          for (i = 0; i < nexectype; i++) {
 939  939                  if (execswnames[i] == NULL) {
 940  940                          ename = kmem_alloc(strlen(name) + 1, KM_SLEEP);
 941  941                          (void) strcpy(ename, name);
 942  942                          execswnames[i] = ename;
 943  943                          /*
 944  944                           * Set the magic number last so that we
 945  945                           * don't need to hold the execsw_lock in
 946  946                           * findexectype().
 947  947                           */
 948  948                          magicp = kmem_alloc(magic_size, KM_SLEEP);
 949  949                          for (j = 0; j < magic_size; j++)
 950  950                                  magicp[j] = magic[j];
 951  951                          execsw[i].exec_magic = magicp;
 952  952                          mutex_exit(&execsw_lock);
 953  953                          return (&execsw[i]);
 954  954                  }
 955  955          }
 956  956          mutex_exit(&execsw_lock);
 957  957          return (NULL);
 958  958  }
 959  959  
 960  960  /*
 961  961   * Find the exec switch table entry with the corresponding magic string.
 962  962   */
 963  963  struct execsw *
 964  964  findexecsw(char *magic)
 965  965  {
 966  966          struct execsw *eswp;
 967  967  
 968  968          for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
 969  969                  ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
 970  970                  if (magic && eswp->exec_maglen != 0 &&
 971  971                      bcmp(magic, eswp->exec_magic, eswp->exec_maglen) == 0)
 972  972                          return (eswp);
 973  973          }
 974  974          return (NULL);
 975  975  }
 976  976  
 977  977  /*
 978  978   * Find the execsw[] index for the given exec header string by looking for the
 979  979   * magic string at a specified offset and length for each kind of executable
 980  980   * file format until one matches.  If no execsw[] entry is found, try to
 981  981   * autoload a module for this magic string.
 982  982   */
 983  983  struct execsw *
 984  984  findexec_by_hdr(char *header)
 985  985  {
 986  986          struct execsw *eswp;
 987  987  
 988  988          for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
 989  989                  ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
 990  990                  if (header && eswp->exec_maglen != 0 &&
 991  991                      bcmp(&header[eswp->exec_magoff], eswp->exec_magic,
 992  992                      eswp->exec_maglen) == 0) {
 993  993                          if (hold_execsw(eswp) != 0)
 994  994                                  return (NULL);
 995  995                          return (eswp);
 996  996                  }
 997  997          }
 998  998          return (NULL);  /* couldn't find the type */
 999  999  }
1000 1000  
1001 1001  /*
1002 1002   * Find the execsw[] index for the given magic string.  If no execsw[] entry
1003 1003   * is found, try to autoload a module for this magic string.
1004 1004   */
1005 1005  struct execsw *
1006 1006  findexec_by_magic(char *magic)
1007 1007  {
1008 1008          struct execsw *eswp;
1009 1009  
1010 1010          for (eswp = execsw; eswp < &execsw[nexectype]; eswp++) {
1011 1011                  ASSERT(eswp->exec_maglen <= MAGIC_BYTES);
1012 1012                  if (magic && eswp->exec_maglen != 0 &&
1013 1013                      bcmp(magic, eswp->exec_magic, eswp->exec_maglen) == 0) {
1014 1014                          if (hold_execsw(eswp) != 0)
1015 1015                                  return (NULL);
1016 1016                          return (eswp);
1017 1017                  }
1018 1018          }
1019 1019          return (NULL);  /* couldn't find the type */
1020 1020  }
1021 1021  
1022 1022  static int
1023 1023  hold_execsw(struct execsw *eswp)
1024 1024  {
1025 1025          char *name;
1026 1026  
1027 1027          rw_enter(eswp->exec_lock, RW_READER);
1028 1028          while (!LOADED_EXEC(eswp)) {
1029 1029                  rw_exit(eswp->exec_lock);
1030 1030                  name = execswnames[eswp-execsw];
1031 1031                  ASSERT(name);
1032 1032                  if (modload("exec", name) == -1)
1033 1033                          return (-1);
1034 1034                  rw_enter(eswp->exec_lock, RW_READER);
1035 1035          }
1036 1036          return (0);
1037 1037  }
1038 1038  
1039 1039  static int
1040 1040  execsetid(struct vnode *vp, struct vattr *vattrp, uid_t *uidp, uid_t *gidp,
1041 1041      priv_set_t *fset, cred_t *cr, const char *pathname)
1042 1042  {
1043 1043          proc_t *pp = ttoproc(curthread);
1044 1044          uid_t uid, gid;
1045 1045          int privflags = 0;
1046 1046  
1047 1047          /*
1048 1048           * Remember credentials.
1049 1049           */
1050 1050          uid = cr->cr_uid;
1051 1051          gid = cr->cr_gid;
1052 1052  
1053 1053          /* Will try to reset the PRIV_AWARE bit later. */
1054 1054          if ((CR_FLAGS(cr) & (PRIV_AWARE|PRIV_AWARE_INHERIT)) == PRIV_AWARE)
1055 1055                  privflags |= PRIV_RESET;
1056 1056  
1057 1057          if ((vp->v_vfsp->vfs_flag & VFS_NOSETUID) == 0) {
1058 1058                  /*
1059 1059                   * If it's a set-uid root program we perform the
1060 1060                   * forced privilege look-aside. This has three possible
1061 1061                   * outcomes:
1062 1062                   *      no look aside information -> treat as before
1063 1063                   *      look aside in Limit set -> apply forced privs
1064 1064                   *      look aside not in Limit set -> ignore set-uid root
1065 1065                   *
1066 1066                   * Ordinary set-uid root execution only allowed if the limit
1067 1067                   * set holds all unsafe privileges.
1068 1068                   */
1069 1069                  if (vattrp->va_mode & VSUID) {
1070 1070                          if (vattrp->va_uid == 0) {
1071 1071                                  int res = get_forced_privs(cr, pathname, fset);
1072 1072  
1073 1073                                  switch (res) {
1074 1074                                  case -1:
1075 1075                                          if (priv_issubset(&priv_unsafe,
1076 1076                                              &CR_LPRIV(cr))) {
1077 1077                                                  uid = vattrp->va_uid;
1078 1078                                                  privflags |= PRIV_SETUGID;
1079 1079                                          }
1080 1080                                          break;
1081 1081                                  case 0:
1082 1082                                          privflags |= PRIV_FORCED|PRIV_INCREASE;
1083 1083                                          break;
1084 1084                                  default:
1085 1085                                          break;
1086 1086                                  }
1087 1087                          } else {
1088 1088                                  uid = vattrp->va_uid;
1089 1089                                  privflags |= PRIV_SETUGID;
1090 1090                          }
1091 1091                  }
1092 1092                  if (vattrp->va_mode & VSGID) {
1093 1093                          gid = vattrp->va_gid;
1094 1094                          privflags |= PRIV_SETUGID;
1095 1095                  }
1096 1096          }
1097 1097  
1098 1098          /*
1099 1099           * Do we need to change our credential anyway?
1100 1100           * This is the case when E != I or P != I, as
1101 1101           * we need to do the assignments (with F empty and A full)
1102 1102           * Or when I is not a subset of L; in that case we need to
1103 1103           * enforce L.
1104 1104           *
1105 1105           *              I' = L & I
1106 1106           *
1107 1107           *              E' = P' = (I' + F) & A
1108 1108           * or
1109 1109           *              E' = P' = I'
1110 1110           */
1111 1111          if (!priv_isequalset(&CR_EPRIV(cr), &CR_IPRIV(cr)) ||
1112 1112              !priv_issubset(&CR_IPRIV(cr), &CR_LPRIV(cr)) ||
1113 1113              !priv_isequalset(&CR_PPRIV(cr), &CR_IPRIV(cr)))
1114 1114                  privflags |= PRIV_RESET;
1115 1115  
1116 1116          /* Child has more privileges than parent */
1117 1117          if (!priv_issubset(&CR_IPRIV(cr), &CR_PPRIV(cr)))
1118 1118                  privflags |= PRIV_INCREASE;
1119 1119  
1120 1120          /* If MAC-aware flag(s) are on, need to update cred to remove. */
1121 1121          if ((CR_FLAGS(cr) & NET_MAC_AWARE) ||
1122 1122              (CR_FLAGS(cr) & NET_MAC_AWARE_INHERIT))
1123 1123                  privflags |= MAC_FLAGS;
1124 1124          /*
1125 1125           * Set setuid/setgid protections if no ptrace() compatibility.
1126 1126           * For privileged processes, honor setuid/setgid even in
1127 1127           * the presence of ptrace() compatibility.
1128 1128           */
1129 1129          if (((pp->p_proc_flag & P_PR_PTRACE) == 0 ||
1130 1130              PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, (uid == 0))) &&
1131 1131              (cr->cr_uid != uid ||
1132 1132              cr->cr_gid != gid ||
1133 1133              cr->cr_suid != uid ||
1134 1134              cr->cr_sgid != gid)) {
1135 1135                  *uidp = uid;
1136 1136                  *gidp = gid;
1137 1137                  privflags |= PRIV_SETID;
1138 1138          }
1139 1139          return (privflags);
1140 1140  }
1141 1141  
1142 1142  int
1143 1143  execpermissions(struct vnode *vp, struct vattr *vattrp, struct uarg *args)
1144 1144  {
1145 1145          int error;
1146 1146          proc_t *p = ttoproc(curthread);
1147 1147  
1148 1148          vattrp->va_mask = AT_MODE | AT_UID | AT_GID | AT_SIZE;
1149 1149          if (error = VOP_GETATTR(vp, vattrp, ATTR_EXEC, p->p_cred, NULL))
1150 1150                  return (error);
1151 1151          /*
1152 1152           * Check the access mode.
1153 1153           * If VPROC, ask /proc if the file is an object file.
1154 1154           */
1155 1155          if ((error = VOP_ACCESS(vp, VEXEC, 0, p->p_cred, NULL)) != 0 ||
1156 1156              !(vp->v_type == VREG || (vp->v_type == VPROC && pr_isobject(vp))) ||
1157 1157              (vp->v_vfsp->vfs_flag & VFS_NOEXEC) != 0 ||
1158 1158              (vattrp->va_mode & (VEXEC|(VEXEC>>3)|(VEXEC>>6))) == 0) {
1159 1159                  if (error == 0)
1160 1160                          error = EACCES;
1161 1161                  return (error);
1162 1162          }
1163 1163  
1164 1164          if ((p->p_plist || (p->p_proc_flag & (P_PR_PTRACE|P_PR_TRACE))) &&
1165 1165              (error = VOP_ACCESS(vp, VREAD, 0, p->p_cred, NULL))) {
1166 1166                  /*
1167 1167                   * If process is under ptrace(2) compatibility,
1168 1168                   * fail the exec(2).
1169 1169                   */
1170 1170                  if (p->p_proc_flag & P_PR_PTRACE)
1171 1171                          goto bad;
1172 1172                  /*
1173 1173                   * Process is traced via /proc.
1174 1174                   * Arrange to invalidate the /proc vnode.
1175 1175                   */
1176 1176                  args->traceinval = 1;
1177 1177          }
1178 1178          return (0);
1179 1179  bad:
1180 1180          if (error == 0)
1181 1181                  error = ENOEXEC;
1182 1182          return (error);
1183 1183  }
1184 1184  
1185 1185  /*
1186 1186   * Map a section of an executable file into the user's
1187 1187   * address space.
1188 1188   */
1189 1189  int
1190 1190  execmap(struct vnode *vp, caddr_t addr, size_t len, size_t zfodlen,
1191 1191      off_t offset, int prot, int page, uint_t szc)
1192 1192  {
1193 1193          int error = 0;
1194 1194          off_t oldoffset;
1195 1195          caddr_t zfodbase, oldaddr;
1196 1196          size_t end, oldlen;
1197 1197          size_t zfoddiff;
1198 1198          label_t ljb;
1199 1199          proc_t *p = ttoproc(curthread);
1200 1200  
1201 1201          oldaddr = addr;
1202 1202          addr = (caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK);
1203 1203          if (len) {
1204 1204                  oldlen = len;
1205 1205                  len += ((size_t)oldaddr - (size_t)addr);
1206 1206                  oldoffset = offset;
1207 1207                  offset = (off_t)((uintptr_t)offset & PAGEMASK);
1208 1208                  if (page) {
1209 1209                          spgcnt_t  prefltmem, availm, npages;
1210 1210                          int preread;
1211 1211                          uint_t mflag = MAP_PRIVATE | MAP_FIXED;
1212 1212  
1213 1213                          if ((prot & (PROT_WRITE | PROT_EXEC)) == PROT_EXEC) {
1214 1214                                  mflag |= MAP_TEXT;
1215 1215                          } else {
1216 1216                                  mflag |= MAP_INITDATA;
1217 1217                          }
1218 1218  
1219 1219                          if (valid_usr_range(addr, len, prot, p->p_as,
1220 1220                              p->p_as->a_userlimit) != RANGE_OKAY) {
1221 1221                                  error = ENOMEM;
1222 1222                                  goto bad;
1223 1223                          }
1224 1224                          if (error = VOP_MAP(vp, (offset_t)offset,
1225 1225                              p->p_as, &addr, len, prot, PROT_ALL,
1226 1226                              mflag, CRED(), NULL))
1227 1227                                  goto bad;
1228 1228  
1229 1229                          /*
1230 1230                           * If the segment can fit, then we prefault
1231 1231                           * the entire segment in.  This is based on the
1232 1232                           * model that says the best working set of a
1233 1233                           * small program is all of its pages.
1234 1234                           */
1235 1235                          npages = (spgcnt_t)btopr(len);
1236 1236                          prefltmem = freemem - desfree;
1237 1237                          preread =
1238 1238                              (npages < prefltmem && len < PGTHRESH) ? 1 : 0;
1239 1239  
1240 1240                          /*
1241 1241                           * If we aren't prefaulting the segment,
1242 1242                           * increment "deficit", if necessary to ensure
1243 1243                           * that pages will become available when this
1244 1244                           * process starts executing.
1245 1245                           */
1246 1246                          availm = freemem - lotsfree;
1247 1247                          if (preread == 0 && npages > availm &&
1248 1248                              deficit < lotsfree) {
1249 1249                                  deficit += MIN((pgcnt_t)(npages - availm),
1250 1250                                      lotsfree - deficit);
1251 1251                          }
1252 1252  
1253 1253                          if (preread) {
1254 1254                                  TRACE_2(TR_FAC_PROC, TR_EXECMAP_PREREAD,
1255 1255                                      "execmap preread:freemem %d size %lu",
1256 1256                                      freemem, len);
1257 1257                                  (void) as_fault(p->p_as->a_hat, p->p_as,
1258 1258                                      (caddr_t)addr, len, F_INVAL, S_READ);
1259 1259                          }
1260 1260                  } else {
1261 1261                          if (valid_usr_range(addr, len, prot, p->p_as,
1262 1262                              p->p_as->a_userlimit) != RANGE_OKAY) {
1263 1263                                  error = ENOMEM;
1264 1264                                  goto bad;
1265 1265                          }
1266 1266  
1267 1267                          if (error = as_map(p->p_as, addr, len,
1268 1268                              segvn_create, zfod_argsp))
1269 1269                                  goto bad;
1270 1270                          /*
1271 1271                           * Read in the segment in one big chunk.
1272 1272                           */
1273 1273                          if (error = vn_rdwr(UIO_READ, vp, (caddr_t)oldaddr,
1274 1274                              oldlen, (offset_t)oldoffset, UIO_USERSPACE, 0,
1275 1275                              (rlim64_t)0, CRED(), (ssize_t *)0))
1276 1276                                  goto bad;
1277 1277                          /*
1278 1278                           * Now set protections.
1279 1279                           */
1280 1280                          if (prot != PROT_ZFOD) {
1281 1281                                  (void) as_setprot(p->p_as, (caddr_t)addr,
1282 1282                                      len, prot);
1283 1283                          }
1284 1284                  }
1285 1285          }
1286 1286  
1287 1287          if (zfodlen) {
1288 1288                  struct as *as = curproc->p_as;
1289 1289                  struct seg *seg;
1290 1290                  uint_t zprot = 0;
1291 1291  
1292 1292                  end = (size_t)addr + len;
1293 1293                  zfodbase = (caddr_t)roundup(end, PAGESIZE);
1294 1294                  zfoddiff = (uintptr_t)zfodbase - end;
1295 1295                  if (zfoddiff) {
1296 1296                          /*
1297 1297                           * Before we go to zero the remaining space on the last
1298 1298                           * page, make sure we have write permission.
1299 1299                           *
1300 1300                           * Normal illumos binaries don't even hit the case
1301 1301                           * where we have to change permission on the last page
1302 1302                           * since their protection is typically either
1303 1303                           *    PROT_USER | PROT_WRITE | PROT_READ
1304 1304                           * or
1305 1305                           *    PROT_ZFOD (same as PROT_ALL).
1306 1306                           *
1307 1307                           * We need to be careful how we zero-fill the last page
1308 1308                           * if the segment protection does not include
1309 1309                           * PROT_WRITE. Using as_setprot() can cause the VM
1310 1310                           * segment code to call segvn_vpage(), which must
1311 1311                           * allocate a page struct for each page in the segment.
1312 1312                           * If we have a very large segment, this may fail, so
1313 1313                           * we have to check for that, even though we ignore
1314 1314                           * other return values from as_setprot.
1315 1315                           */
1316 1316  
1317 1317                          AS_LOCK_ENTER(as, RW_READER);
1318 1318                          seg = as_segat(curproc->p_as, (caddr_t)end);
1319 1319                          if (seg != NULL)
1320 1320                                  SEGOP_GETPROT(seg, (caddr_t)end, zfoddiff - 1,
1321 1321                                      &zprot);
1322 1322                          AS_LOCK_EXIT(as);
1323 1323  
1324 1324                          if (seg != NULL && (zprot & PROT_WRITE) == 0) {
1325 1325                                  if (as_setprot(as, (caddr_t)end, zfoddiff - 1,
1326 1326                                      zprot | PROT_WRITE) == ENOMEM) {
1327 1327                                          error = ENOMEM;
1328 1328                                          goto bad;
1329 1329                                  }
1330 1330                          }
1331 1331  
1332 1332                          if (on_fault(&ljb)) {
1333 1333                                  no_fault();
1334 1334                                  if (seg != NULL && (zprot & PROT_WRITE) == 0)
1335 1335                                          (void) as_setprot(as, (caddr_t)end,
1336 1336                                              zfoddiff - 1, zprot);
1337 1337                                  error = EFAULT;
1338 1338                                  goto bad;
1339 1339                          }
1340 1340                          uzero((void *)end, zfoddiff);
1341 1341                          no_fault();
1342 1342                          if (seg != NULL && (zprot & PROT_WRITE) == 0)
1343 1343                                  (void) as_setprot(as, (caddr_t)end,
1344 1344                                      zfoddiff - 1, zprot);
1345 1345                  }
1346 1346                  if (zfodlen > zfoddiff) {
1347 1347                          struct segvn_crargs crargs =
1348 1348                              SEGVN_ZFOD_ARGS(PROT_ZFOD, PROT_ALL);
1349 1349  
1350 1350                          zfodlen -= zfoddiff;
1351 1351                          if (valid_usr_range(zfodbase, zfodlen, prot, p->p_as,
1352 1352                              p->p_as->a_userlimit) != RANGE_OKAY) {
1353 1353                                  error = ENOMEM;
1354 1354                                  goto bad;
1355 1355                          }
1356 1356                          if (szc > 0) {
1357 1357                                  /*
1358 1358                                   * ASSERT alignment because the mapelfexec()
1359 1359                                   * caller for the szc > 0 case extended zfod
1360 1360                                   * so its end is pgsz aligned.
1361 1361                                   */
1362 1362                                  size_t pgsz = page_get_pagesize(szc);
1363 1363                                  ASSERT(IS_P2ALIGNED(zfodbase + zfodlen, pgsz));
1364 1364  
1365 1365                                  if (IS_P2ALIGNED(zfodbase, pgsz)) {
1366 1366                                          crargs.szc = szc;
1367 1367                                  } else {
1368 1368                                          crargs.szc = AS_MAP_HEAP;
1369 1369                                  }
1370 1370                          } else {
1371 1371                                  crargs.szc = AS_MAP_NO_LPOOB;
1372 1372                          }
1373 1373                          if (error = as_map(p->p_as, (caddr_t)zfodbase,
1374 1374                              zfodlen, segvn_create, &crargs))
1375 1375                                  goto bad;
1376 1376                          if (prot != PROT_ZFOD) {
1377 1377                                  (void) as_setprot(p->p_as, (caddr_t)zfodbase,
1378 1378                                      zfodlen, prot);
1379 1379                          }
1380 1380                  }
1381 1381          }
1382 1382          return (0);
1383 1383  bad:
1384 1384          return (error);
1385 1385  }
1386 1386  
1387 1387  void
1388 1388  setexecenv(struct execenv *ep)
1389 1389  {
1390 1390          proc_t *p = ttoproc(curthread);
1391 1391          klwp_t *lwp = ttolwp(curthread);
1392 1392          struct vnode *vp;
1393 1393  
1394 1394          p->p_bssbase = ep->ex_bssbase;
1395 1395          p->p_brkbase = ep->ex_brkbase;
1396 1396          p->p_brksize = ep->ex_brksize;
1397 1397          if (p->p_exec)
1398 1398                  VN_RELE(p->p_exec);     /* out with the old */
1399 1399          vp = p->p_exec = ep->ex_vp;
1400 1400          if (vp != NULL)
1401 1401                  VN_HOLD(vp);            /* in with the new */
1402 1402  
1403 1403          lwp->lwp_sigaltstack.ss_sp = 0;
1404 1404          lwp->lwp_sigaltstack.ss_size = 0;
1405 1405          lwp->lwp_sigaltstack.ss_flags = SS_DISABLE;
1406 1406  }
1407 1407  
1408 1408  int
1409 1409  execopen(struct vnode **vpp, int *fdp)
1410 1410  {
1411 1411          struct vnode *vp = *vpp;
1412 1412          file_t *fp;
1413 1413          int error = 0;
1414 1414          int filemode = FREAD;
1415 1415  
1416 1416          VN_HOLD(vp);            /* open reference */
1417 1417          if (error = falloc(NULL, filemode, &fp, fdp)) {
1418 1418                  VN_RELE(vp);
1419 1419                  *fdp = -1;      /* just in case falloc changed value */
1420 1420                  return (error);
1421 1421          }
1422 1422          if (error = VOP_OPEN(&vp, filemode, CRED(), NULL)) {
1423 1423                  VN_RELE(vp);
1424 1424                  setf(*fdp, NULL);
1425 1425                  unfalloc(fp);
1426 1426                  *fdp = -1;
1427 1427                  return (error);
1428 1428          }
1429 1429          *vpp = vp;              /* vnode should not have changed */
1430 1430          fp->f_vnode = vp;
1431 1431          mutex_exit(&fp->f_tlock);
1432 1432          setf(*fdp, fp);
1433 1433          return (0);
1434 1434  }
1435 1435  
1436 1436  int
1437 1437  execclose(int fd)
1438 1438  {
1439 1439          return (closeandsetf(fd, NULL));
1440 1440  }
1441 1441  
1442 1442  
1443 1443  /*
1444 1444   * noexec stub function.
1445 1445   */
1446 1446  /*ARGSUSED*/
1447 1447  int
1448 1448  noexec(
1449 1449      struct vnode *vp,
1450 1450      struct execa *uap,
1451 1451      struct uarg *args,
1452 1452      struct intpdata *idatap,
1453 1453      int level,
1454 1454      long *execsz,
1455 1455      int setid,
1456 1456      caddr_t exec_file,
1457 1457      struct cred *cred)
1458 1458  {
1459 1459          cmn_err(CE_WARN, "missing exec capability for %s", uap->fname);
1460 1460          return (ENOEXEC);
1461 1461  }
1462 1462  
1463 1463  /*
1464 1464   * Support routines for building a user stack.
1465 1465   *
1466 1466   * execve(path, argv, envp) must construct a new stack with the specified
1467 1467   * arguments and environment variables (see exec_args() for a description
1468 1468   * of the user stack layout).  To do this, we copy the arguments and
1469 1469   * environment variables from the old user address space into the kernel,
1470 1470   * free the old as, create the new as, and copy our buffered information
1471 1471   * to the new stack.  Our kernel buffer has the following structure:
1472 1472   *
1473 1473   *      +-----------------------+ <--- stk_base + stk_size
1474 1474   *      | string offsets        |
1475 1475   *      +-----------------------+ <--- stk_offp
1476 1476   *      |                       |
1477 1477   *      | STK_AVAIL() space     |
1478 1478   *      |                       |
1479 1479   *      +-----------------------+ <--- stk_strp
1480 1480   *      | strings               |
1481 1481   *      +-----------------------+ <--- stk_base
1482 1482   *
1483 1483   * When we add a string, we store the string's contents (including the null
1484 1484   * terminator) at stk_strp, and we store the offset of the string relative to
1485 1485   * stk_base at --stk_offp.  As strings are added, stk_strp increases and
1486 1486   * stk_offp decreases.  The amount of space remaining, STK_AVAIL(), is just
1487 1487   * the difference between these pointers.  If we run out of space, we return
1488 1488   * an error and exec_args() starts all over again with a buffer twice as large.
1489 1489   * When we're all done, the kernel buffer looks like this:
1490 1490   *
1491 1491   *      +-----------------------+ <--- stk_base + stk_size
1492 1492   *      | argv[0] offset        |
1493 1493   *      +-----------------------+
1494 1494   *      | ...                   |
1495 1495   *      +-----------------------+
1496 1496   *      | argv[argc-1] offset   |
1497 1497   *      +-----------------------+
1498 1498   *      | envp[0] offset        |
1499 1499   *      +-----------------------+
1500 1500   *      | ...                   |
1501 1501   *      +-----------------------+
1502 1502   *      | envp[envc-1] offset   |
1503 1503   *      +-----------------------+
1504 1504   *      | AT_SUN_PLATFORM offset|
1505 1505   *      +-----------------------+
1506 1506   *      | AT_SUN_EXECNAME offset|
1507 1507   *      +-----------------------+ <--- stk_offp
1508 1508   *      |                       |
1509 1509   *      | STK_AVAIL() space     |
1510 1510   *      |                       |
1511 1511   *      +-----------------------+ <--- stk_strp
1512 1512   *      | AT_SUN_EXECNAME string|
1513 1513   *      +-----------------------+
1514 1514   *      | AT_SUN_PLATFORM string|
1515 1515   *      +-----------------------+
1516 1516   *      | envp[envc-1] string   |
1517 1517   *      +-----------------------+
1518 1518   *      | ...                   |
1519 1519   *      +-----------------------+
1520 1520   *      | envp[0] string        |
1521 1521   *      +-----------------------+
1522 1522   *      | argv[argc-1] string   |
1523 1523   *      +-----------------------+
1524 1524   *      | ...                   |
1525 1525   *      +-----------------------+
1526 1526   *      | argv[0] string        |
1527 1527   *      +-----------------------+ <--- stk_base
1528 1528   */
1529 1529  
1530 1530  #define STK_AVAIL(args)         ((char *)(args)->stk_offp - (args)->stk_strp)
1531 1531  
1532 1532  /*
1533 1533   * Add a string to the stack.
1534 1534   */
1535 1535  static int
1536 1536  stk_add(uarg_t *args, const char *sp, enum uio_seg segflg)
1537 1537  {
1538 1538          int error;
1539 1539          size_t len;
1540 1540  
1541 1541          if (STK_AVAIL(args) < sizeof (int))
1542 1542                  return (E2BIG);
1543 1543          *--args->stk_offp = args->stk_strp - args->stk_base;
1544 1544  
1545 1545          if (segflg == UIO_USERSPACE) {
1546 1546                  error = copyinstr(sp, args->stk_strp, STK_AVAIL(args), &len);
1547 1547                  if (error != 0)
1548 1548                          return (error);
1549 1549          } else {
1550 1550                  len = strlen(sp) + 1;
1551 1551                  if (len > STK_AVAIL(args))
1552 1552                          return (E2BIG);
1553 1553                  bcopy(sp, args->stk_strp, len);
1554 1554          }
1555 1555  
1556 1556          args->stk_strp += len;
  
    | 
      ↓ open down ↓ | 
    1556 lines elided | 
    
      ↑ open up ↑ | 
  
1557 1557  
1558 1558          return (0);
1559 1559  }
1560 1560  
1561 1561  /*
1562 1562   * Add a fixed size byte array to the stack (only from kernel space).
1563 1563   */
1564 1564  static int
1565 1565  stk_byte_add(uarg_t *args, const uint8_t *sp, size_t len)
1566 1566  {
1567      -        int error;
1568      -
1569 1567          if (STK_AVAIL(args) < sizeof (int))
1570 1568                  return (E2BIG);
1571 1569          *--args->stk_offp = args->stk_strp - args->stk_base;
1572 1570  
1573 1571          if (len > STK_AVAIL(args))
1574 1572                  return (E2BIG);
1575 1573          bcopy(sp, args->stk_strp, len);
1576 1574  
1577 1575          args->stk_strp += len;
1578 1576  
1579 1577          return (0);
1580 1578  }
1581 1579  
1582 1580  static int
1583 1581  stk_getptr(uarg_t *args, char *src, char **dst)
1584 1582  {
1585 1583          int error;
1586 1584  
1587 1585          if (args->from_model == DATAMODEL_NATIVE) {
1588 1586                  ulong_t ptr;
1589 1587                  error = fulword(src, &ptr);
1590 1588                  *dst = (caddr_t)ptr;
1591 1589          } else {
1592 1590                  uint32_t ptr;
1593 1591                  error = fuword32(src, &ptr);
1594 1592                  *dst = (caddr_t)(uintptr_t)ptr;
1595 1593          }
1596 1594          return (error);
1597 1595  }
1598 1596  
1599 1597  static int
1600 1598  stk_putptr(uarg_t *args, char *addr, char *value)
1601 1599  {
1602 1600          if (args->to_model == DATAMODEL_NATIVE)
1603 1601                  return (sulword(addr, (ulong_t)value));
1604 1602          else
1605 1603                  return (suword32(addr, (uint32_t)(uintptr_t)value));
1606 1604  }
1607 1605  
1608 1606  static int
1609 1607  stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
1610 1608  {
1611 1609          char *sp;
1612 1610          int argc, error;
1613 1611          int argv_empty = 0;
1614 1612          size_t ptrsize = args->from_ptrsize;
1615 1613          size_t size, pad;
1616 1614          char *argv = (char *)uap->argp;
1617 1615          char *envp = (char *)uap->envp;
1618 1616          uint8_t rdata[RANDOM_LEN];
1619 1617  
1620 1618          /*
1621 1619           * Copy interpreter's name and argument to argv[0] and argv[1].
1622 1620           * In the rare case that we have nested interpreters then those names
1623 1621           * and arguments are also copied to the subsequent slots in argv.
1624 1622           */
1625 1623          if (intp != NULL && intp->intp_name[0] != NULL) {
1626 1624                  int i;
1627 1625  
1628 1626                  for (i = 0; i < INTP_MAXDEPTH; i++) {
1629 1627                          if (intp->intp_name[i] == NULL)
1630 1628                                  break;
1631 1629                          error = stk_add(args, intp->intp_name[i], UIO_SYSSPACE);
1632 1630                          if (error != 0)
1633 1631                                  return (error);
1634 1632                          if (intp->intp_arg[i] != NULL) {
1635 1633                                  error = stk_add(args, intp->intp_arg[i],
1636 1634                                      UIO_SYSSPACE);
1637 1635                                  if (error != 0)
1638 1636                                          return (error);
1639 1637                          }
1640 1638                  }
1641 1639  
1642 1640                  if (args->fname != NULL)
1643 1641                          error = stk_add(args, args->fname, UIO_SYSSPACE);
1644 1642                  else
1645 1643                          error = stk_add(args, uap->fname, UIO_USERSPACE);
1646 1644                  if (error)
1647 1645                          return (error);
1648 1646  
1649 1647                  /*
1650 1648                   * Check for an empty argv[].
1651 1649                   */
1652 1650                  if (stk_getptr(args, argv, &sp))
1653 1651                          return (EFAULT);
1654 1652                  if (sp == NULL)
1655 1653                          argv_empty = 1;
1656 1654  
1657 1655                  argv += ptrsize;                /* ignore original argv[0] */
1658 1656          }
1659 1657  
1660 1658          if (argv_empty == 0) {
1661 1659                  /*
1662 1660                   * Add argv[] strings to the stack.
1663 1661                   */
1664 1662                  for (;;) {
1665 1663                          if (stk_getptr(args, argv, &sp))
1666 1664                                  return (EFAULT);
1667 1665                          if (sp == NULL)
1668 1666                                  break;
1669 1667                          if ((error = stk_add(args, sp, UIO_USERSPACE)) != 0)
1670 1668                                  return (error);
1671 1669                          argv += ptrsize;
1672 1670                  }
1673 1671          }
1674 1672          argc = (int *)(args->stk_base + args->stk_size) - args->stk_offp;
1675 1673          args->arglen = args->stk_strp - args->stk_base;
1676 1674  
1677 1675          /*
1678 1676           * Add environ[] strings to the stack.
1679 1677           */
1680 1678          if (envp != NULL) {
1681 1679                  for (;;) {
1682 1680                          char *tmp = args->stk_strp;
1683 1681                          if (stk_getptr(args, envp, &sp))
1684 1682                                  return (EFAULT);
1685 1683                          if (sp == NULL)
1686 1684                                  break;
1687 1685                          if ((error = stk_add(args, sp, UIO_USERSPACE)) != 0)
1688 1686                                  return (error);
1689 1687                          if (args->scrubenv && strncmp(tmp, "LD_", 3) == 0) {
1690 1688                                  /* Undo the copied string */
1691 1689                                  args->stk_strp = tmp;
1692 1690                                  *(args->stk_offp++) = NULL;
1693 1691                          }
1694 1692                          envp += ptrsize;
1695 1693                  }
1696 1694          }
1697 1695          args->na = (int *)(args->stk_base + args->stk_size) - args->stk_offp;
1698 1696          args->ne = args->na - argc;
1699 1697  
1700 1698          /*
1701 1699           * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME,
1702 1700           * AT_SUN_BRAND_NROOT, and AT_SUN_EMULATOR strings, as well as AT_RANDOM
1703 1701           * array, to the stack.
1704 1702           */
1705 1703          if (auxvpp != NULL && *auxvpp != NULL) {
1706 1704                  if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0)
1707 1705                          return (error);
1708 1706                  if ((error = stk_add(args, args->pathname, UIO_SYSSPACE)) != 0)
1709 1707                          return (error);
1710 1708                  if (args->brandname != NULL &&
1711 1709                      (error = stk_add(args, args->brandname, UIO_SYSSPACE)) != 0)
1712 1710                          return (error);
1713 1711                  if (args->emulator != NULL &&
1714 1712                      (error = stk_add(args, args->emulator, UIO_SYSSPACE)) != 0)
1715 1713                          return (error);
1716 1714  
1717 1715                  /*
1718 1716                   * For the AT_RANDOM aux vector we provide 16 bytes of random
1719 1717                   * data.
1720 1718                   */
1721 1719                  (void) random_get_pseudo_bytes(rdata, sizeof (rdata));
1722 1720  
1723 1721                  if ((error = stk_byte_add(args, rdata, sizeof (rdata))) != 0)
1724 1722                          return (error);
1725 1723  
1726 1724                  if (args->brand_nroot != NULL &&
1727 1725                      (error = stk_add(args, args->brand_nroot,
1728 1726                      UIO_SYSSPACE)) != 0)
1729 1727                          return (error);
1730 1728          }
1731 1729  
1732 1730          /*
1733 1731           * Compute the size of the stack.  This includes all the pointers,
1734 1732           * the space reserved for the aux vector, and all the strings.
1735 1733           * The total number of pointers is args->na (which is argc + envc)
1736 1734           * plus 4 more: (1) a pointer's worth of space for argc; (2) the NULL
1737 1735           * after the last argument (i.e. argv[argc]); (3) the NULL after the
1738 1736           * last environment variable (i.e. envp[envc]); and (4) the NULL after
1739 1737           * all the strings, at the very top of the stack.
1740 1738           */
1741 1739          size = (args->na + 4) * args->to_ptrsize + args->auxsize +
1742 1740              (args->stk_strp - args->stk_base);
1743 1741  
1744 1742          /*
1745 1743           * Pad the string section with zeroes to align the stack size.
1746 1744           */
1747 1745          pad = P2NPHASE(size, args->stk_align);
1748 1746  
1749 1747          if (STK_AVAIL(args) < pad)
1750 1748                  return (E2BIG);
1751 1749  
1752 1750          args->usrstack_size = size + pad;
1753 1751  
1754 1752          while (pad-- != 0)
1755 1753                  *args->stk_strp++ = 0;
1756 1754  
1757 1755          args->nc = args->stk_strp - args->stk_base;
1758 1756  
1759 1757          return (0);
1760 1758  }
1761 1759  
1762 1760  static int
1763 1761  stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up)
1764 1762  {
1765 1763          size_t ptrsize = args->to_ptrsize;
1766 1764          ssize_t pslen;
1767 1765          char *kstrp = args->stk_base;
1768 1766          char *ustrp = usrstack - args->nc - ptrsize;
1769 1767          char *usp = usrstack - args->usrstack_size;
1770 1768          int *offp = (int *)(args->stk_base + args->stk_size);
1771 1769          int envc = args->ne;
1772 1770          int argc = args->na - envc;
1773 1771          int i;
1774 1772  
1775 1773          /*
1776 1774           * Record argc for /proc.
1777 1775           */
1778 1776          up->u_argc = argc;
1779 1777  
1780 1778          /*
1781 1779           * Put argc on the stack.  Note that even though it's an int,
1782 1780           * it always consumes ptrsize bytes (for alignment).
1783 1781           */
1784 1782          if (stk_putptr(args, usp, (char *)(uintptr_t)argc))
1785 1783                  return (-1);
1786 1784  
1787 1785          /*
1788 1786           * Add argc space (ptrsize) to usp and record argv for /proc.
1789 1787           */
1790 1788          up->u_argv = (uintptr_t)(usp += ptrsize);
1791 1789  
1792 1790          /*
1793 1791           * Put the argv[] pointers on the stack.
1794 1792           */
1795 1793          for (i = 0; i < argc; i++, usp += ptrsize)
1796 1794                  if (stk_putptr(args, usp, &ustrp[*--offp]))
1797 1795                          return (-1);
1798 1796  
1799 1797          /*
1800 1798           * Copy arguments to u_psargs.
1801 1799           */
1802 1800          pslen = MIN(args->arglen, PSARGSZ) - 1;
1803 1801          for (i = 0; i < pslen; i++)
1804 1802                  up->u_psargs[i] = (kstrp[i] == '\0' ? ' ' : kstrp[i]);
1805 1803          while (i < PSARGSZ)
1806 1804                  up->u_psargs[i++] = '\0';
1807 1805  
1808 1806          /*
1809 1807           * Add space for argv[]'s NULL terminator (ptrsize) to usp and
1810 1808           * record envp for /proc.
1811 1809           */
1812 1810          up->u_envp = (uintptr_t)(usp += ptrsize);
1813 1811  
1814 1812          /*
1815 1813           * Put the envp[] pointers on the stack.
1816 1814           */
1817 1815          for (i = 0; i < envc; i++, usp += ptrsize)
1818 1816                  if (stk_putptr(args, usp, &ustrp[*--offp]))
1819 1817                          return (-1);
1820 1818  
1821 1819          /*
1822 1820           * Add space for envp[]'s NULL terminator (ptrsize) to usp and
1823 1821           * remember where the stack ends, which is also where auxv begins.
1824 1822           */
1825 1823          args->stackend = usp += ptrsize;
1826 1824  
1827 1825          /*
1828 1826           * Put all the argv[], envp[], and auxv strings on the stack.
1829 1827           */
1830 1828          if (copyout(args->stk_base, ustrp, args->nc))
1831 1829                  return (-1);
1832 1830  
1833 1831          /*
1834 1832           * Fill in the aux vector now that we know the user stack addresses
1835 1833           * for the AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME,
1836 1834           * AT_SUN_EMULATOR and AT_SUN_BRAND_NROOT strings, and AT_RANDOM array.
1837 1835           */
1838 1836          if (auxvpp != NULL && *auxvpp != NULL) {
1839 1837                  if (args->to_model == DATAMODEL_NATIVE) {
1840 1838                          auxv_t **a = (auxv_t **)auxvpp;
1841 1839                          ADDAUX(*a, AT_SUN_PLATFORM, (long)&ustrp[*--offp])
1842 1840                          ADDAUX(*a, AT_SUN_EXECNAME, (long)&ustrp[*--offp])
1843 1841                          if (args->brandname != NULL)
1844 1842                                  ADDAUX(*a,
1845 1843                                      AT_SUN_BRANDNAME, (long)&ustrp[*--offp])
1846 1844                          if (args->emulator != NULL)
1847 1845                                  ADDAUX(*a,
1848 1846                                      AT_SUN_EMULATOR, (long)&ustrp[*--offp])
1849 1847                          ADDAUX(*a, AT_RANDOM, (long)&ustrp[*--offp])
1850 1848                          if (args->brand_nroot != NULL) {
1851 1849                                  ADDAUX(*a,
1852 1850                                      AT_SUN_BRAND_NROOT, (long)&ustrp[*--offp])
1853 1851                          }
1854 1852                  } else {
1855 1853                          auxv32_t **a = (auxv32_t **)auxvpp;
1856 1854                          ADDAUX(*a,
1857 1855                              AT_SUN_PLATFORM, (int)(uintptr_t)&ustrp[*--offp])
1858 1856                          ADDAUX(*a,
1859 1857                              AT_SUN_EXECNAME, (int)(uintptr_t)&ustrp[*--offp])
1860 1858                          if (args->brandname != NULL)
1861 1859                                  ADDAUX(*a, AT_SUN_BRANDNAME,
1862 1860                                      (int)(uintptr_t)&ustrp[*--offp])
1863 1861                          if (args->emulator != NULL)
1864 1862                                  ADDAUX(*a, AT_SUN_EMULATOR,
1865 1863                                      (int)(uintptr_t)&ustrp[*--offp])
1866 1864                          ADDAUX(*a, AT_RANDOM, (int)(uintptr_t)&ustrp[*--offp])
1867 1865                          if (args->brand_nroot != NULL) {
1868 1866                                  ADDAUX(*a, AT_SUN_BRAND_NROOT,
1869 1867                                      (int)(uintptr_t)&ustrp[*--offp])
1870 1868                          }
1871 1869                  }
1872 1870          }
1873 1871  
1874 1872          return (0);
1875 1873  }
1876 1874  
1877 1875  /*
1878 1876   * Initialize a new user stack with the specified arguments and environment.
1879 1877   * The initial user stack layout is as follows:
1880 1878   *
1881 1879   *      User Stack
1882 1880   *      +---------------+ <--- curproc->p_usrstack
1883 1881   *      |               |
1884 1882   *      | slew          |
1885 1883   *      |               |
1886 1884   *      +---------------+
1887 1885   *      | NULL          |
1888 1886   *      +---------------+
1889 1887   *      |               |
1890 1888   *      | auxv strings  |
1891 1889   *      |               |
1892 1890   *      +---------------+
1893 1891   *      |               |
1894 1892   *      | envp strings  |
1895 1893   *      |               |
1896 1894   *      +---------------+
1897 1895   *      |               |
1898 1896   *      | argv strings  |
1899 1897   *      |               |
1900 1898   *      +---------------+ <--- ustrp
1901 1899   *      |               |
1902 1900   *      | aux vector    |
1903 1901   *      |               |
1904 1902   *      +---------------+ <--- auxv
1905 1903   *      | NULL          |
1906 1904   *      +---------------+
1907 1905   *      | envp[envc-1]  |
1908 1906   *      +---------------+
1909 1907   *      | ...           |
1910 1908   *      +---------------+
1911 1909   *      | envp[0]       |
1912 1910   *      +---------------+ <--- envp[]
1913 1911   *      | NULL          |
1914 1912   *      +---------------+
1915 1913   *      | argv[argc-1]  |
1916 1914   *      +---------------+
1917 1915   *      | ...           |
1918 1916   *      +---------------+
1919 1917   *      | argv[0]       |
1920 1918   *      +---------------+ <--- argv[]
1921 1919   *      | argc          |
1922 1920   *      +---------------+ <--- stack base
1923 1921   */
1924 1922  int
1925 1923  exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp)
1926 1924  {
1927 1925          size_t size;
1928 1926          int error;
1929 1927          proc_t *p = ttoproc(curthread);
1930 1928          user_t *up = PTOU(p);
1931 1929          char *usrstack;
1932 1930          rctl_entity_p_t e;
1933 1931          struct as *as;
1934 1932          extern int use_stk_lpg;
1935 1933          size_t sp_slew;
1936 1934  
1937 1935          args->from_model = p->p_model;
1938 1936          if (p->p_model == DATAMODEL_NATIVE) {
1939 1937                  args->from_ptrsize = sizeof (long);
1940 1938          } else {
1941 1939                  args->from_ptrsize = sizeof (int32_t);
1942 1940          }
1943 1941  
1944 1942          if (args->to_model == DATAMODEL_NATIVE) {
1945 1943                  args->to_ptrsize = sizeof (long);
1946 1944                  args->ncargs = NCARGS;
1947 1945                  args->stk_align = STACK_ALIGN;
1948 1946                  if (args->addr32)
1949 1947                          usrstack = (char *)USRSTACK64_32;
1950 1948                  else
1951 1949                          usrstack = (char *)USRSTACK;
1952 1950          } else {
1953 1951                  args->to_ptrsize = sizeof (int32_t);
1954 1952                  args->ncargs = NCARGS32;
1955 1953                  args->stk_align = STACK_ALIGN32;
1956 1954                  usrstack = (char *)USRSTACK32;
1957 1955          }
1958 1956  
1959 1957          if (args->maxstack != 0 && (uintptr_t)usrstack > args->maxstack)
1960 1958                  usrstack = (char *)args->maxstack;
1961 1959  
1962 1960          ASSERT(P2PHASE((uintptr_t)usrstack, args->stk_align) == 0);
1963 1961  
1964 1962  #if defined(__sparc)
1965 1963          /*
1966 1964           * Make sure user register windows are empty before
1967 1965           * attempting to make a new stack.
1968 1966           */
1969 1967          (void) flush_user_windows_to_stack(NULL);
1970 1968  #endif
1971 1969  
1972 1970          for (size = PAGESIZE; ; size *= 2) {
1973 1971                  args->stk_size = size;
1974 1972                  args->stk_base = kmem_alloc(size, KM_SLEEP);
1975 1973                  args->stk_strp = args->stk_base;
1976 1974                  args->stk_offp = (int *)(args->stk_base + size);
1977 1975                  error = stk_copyin(uap, args, intp, auxvpp);
1978 1976                  if (error == 0)
1979 1977                          break;
1980 1978                  kmem_free(args->stk_base, size);
1981 1979                  if (error != E2BIG && error != ENAMETOOLONG)
1982 1980                          return (error);
1983 1981                  if (size >= args->ncargs)
1984 1982                          return (E2BIG);
1985 1983          }
1986 1984  
1987 1985          size = args->usrstack_size;
1988 1986  
1989 1987          ASSERT(error == 0);
1990 1988          ASSERT(P2PHASE(size, args->stk_align) == 0);
1991 1989          ASSERT((ssize_t)STK_AVAIL(args) >= 0);
1992 1990  
1993 1991          if (size > args->ncargs) {
1994 1992                  kmem_free(args->stk_base, args->stk_size);
1995 1993                  return (E2BIG);
1996 1994          }
1997 1995  
1998 1996          /*
1999 1997           * Leave only the current lwp and force the other lwps to exit.
2000 1998           * If another lwp beat us to the punch by calling exit(), bail out.
2001 1999           */
2002 2000          if ((error = exitlwps(0)) != 0) {
2003 2001                  kmem_free(args->stk_base, args->stk_size);
2004 2002                  return (error);
2005 2003          }
2006 2004  
2007 2005          /*
2008 2006           * Revoke any doors created by the process.
2009 2007           */
2010 2008          if (p->p_door_list)
2011 2009                  door_exit();
2012 2010  
2013 2011          /*
2014 2012           * Release schedctl data structures.
2015 2013           */
2016 2014          if (p->p_pagep)
2017 2015                  schedctl_proc_cleanup();
2018 2016  
2019 2017          /*
2020 2018           * Clean up any DTrace helpers for the process.
2021 2019           */
2022 2020          if (p->p_dtrace_helpers != NULL) {
2023 2021                  ASSERT(dtrace_helpers_cleanup != NULL);
2024 2022                  (*dtrace_helpers_cleanup)();
2025 2023          }
2026 2024  
2027 2025          mutex_enter(&p->p_lock);
2028 2026          /*
2029 2027           * Cleanup the DTrace provider associated with this process.
2030 2028           */
2031 2029          if (p->p_dtrace_probes) {
2032 2030                  ASSERT(dtrace_fasttrap_exec_ptr != NULL);
2033 2031                  dtrace_fasttrap_exec_ptr(p);
2034 2032          }
2035 2033          mutex_exit(&p->p_lock);
2036 2034  
2037 2035          /*
2038 2036           * discard the lwpchan cache.
2039 2037           */
2040 2038          if (p->p_lcp != NULL)
2041 2039                  lwpchan_destroy_cache(1);
2042 2040  
2043 2041          /*
2044 2042           * Delete the POSIX timers.
2045 2043           */
2046 2044          if (p->p_itimer != NULL)
2047 2045                  timer_exit();
2048 2046  
2049 2047          /*
2050 2048           * Delete the ITIMER_REALPROF interval timer.
2051 2049           * The other ITIMER_* interval timers are specified
2052 2050           * to be inherited across exec().
2053 2051           */
2054 2052          delete_itimer_realprof();
2055 2053  
2056 2054          if (AU_AUDITING())
2057 2055                  audit_exec(args->stk_base, args->stk_base + args->arglen,
2058 2056                      args->na - args->ne, args->ne, args->pfcred);
2059 2057  
2060 2058          /*
2061 2059           * Ensure that we don't change resource associations while we
2062 2060           * change address spaces.
2063 2061           */
2064 2062          mutex_enter(&p->p_lock);
2065 2063          pool_barrier_enter();
2066 2064          mutex_exit(&p->p_lock);
2067 2065  
2068 2066          /*
2069 2067           * Destroy the old address space and create a new one.
2070 2068           * From here on, any errors are fatal to the exec()ing process.
2071 2069           * On error we return -1, which means the caller must SIGKILL
2072 2070           * the process.
2073 2071           */
2074 2072          relvm();
2075 2073  
2076 2074          mutex_enter(&p->p_lock);
2077 2075          pool_barrier_exit();
2078 2076          mutex_exit(&p->p_lock);
2079 2077  
2080 2078          up->u_execsw = args->execswp;
2081 2079  
2082 2080          p->p_brkbase = NULL;
2083 2081          p->p_brksize = 0;
2084 2082          p->p_brkpageszc = 0;
2085 2083          p->p_stksize = 0;
2086 2084          p->p_stkpageszc = 0;
2087 2085          p->p_model = args->to_model;
2088 2086          p->p_usrstack = usrstack;
2089 2087          p->p_stkprot = args->stk_prot;
2090 2088          p->p_datprot = args->dat_prot;
2091 2089  
2092 2090          /*
2093 2091           * Reset resource controls such that all controls are again active as
2094 2092           * well as appropriate to the potentially new address model for the
2095 2093           * process.
2096 2094           */
2097 2095          e.rcep_p.proc = p;
2098 2096          e.rcep_t = RCENTITY_PROCESS;
2099 2097          rctl_set_reset(p->p_rctls, p, &e);
2100 2098  
2101 2099          /* Too early to call map_pgsz for the heap */
2102 2100          if (use_stk_lpg) {
2103 2101                  p->p_stkpageszc = page_szc(map_pgsz(MAPPGSZ_STK, p, 0, 0, 0));
2104 2102          }
2105 2103  
2106 2104          mutex_enter(&p->p_lock);
2107 2105          p->p_flag |= SAUTOLPG;  /* kernel controls page sizes */
2108 2106          mutex_exit(&p->p_lock);
2109 2107  
2110 2108          /*
2111 2109           * Some platforms may choose to randomize real stack start by adding a
2112 2110           * small slew (not more than a few hundred bytes) to the top of the
2113 2111           * stack. This helps avoid cache thrashing when identical processes
2114 2112           * simultaneously share caches that don't provide enough associativity
2115 2113           * (e.g. sun4v systems). In this case stack slewing makes the same hot
2116 2114           * stack variables in different processes live in different cache
2117 2115           * sets, increasing effective associativity.
2118 2116           */
2119 2117          sp_slew = exec_get_spslew();
2120 2118          ASSERT(P2PHASE(sp_slew, args->stk_align) == 0);
2121 2119          exec_set_sp(size + sp_slew);
2122 2120  
2123 2121          as = as_alloc();
2124 2122          p->p_as = as;
2125 2123          as->a_proc = p;
2126 2124          if (p->p_model == DATAMODEL_ILP32 || args->addr32)
2127 2125                  as->a_userlimit = (caddr_t)USERLIMIT32;
2128 2126          (void) hat_setup(as->a_hat, HAT_ALLOC);
2129 2127          hat_join_srd(as->a_hat, args->ex_vp);
2130 2128  
2131 2129          /*
2132 2130           * Finally, write out the contents of the new stack.
2133 2131           */
2134 2132          error = stk_copyout(args, usrstack - sp_slew, auxvpp, up);
2135 2133          kmem_free(args->stk_base, args->stk_size);
2136 2134          return (error);
2137 2135  }
  
    | 
      ↓ open down ↓ | 
    559 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX