Print this page
OS-5015 PT_INTERP headers should be permitted after PT_LOAD headers
OS-5451 comm page should not break i86xpv
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-5192 need faster clock_gettime
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Joshua M. Clulow <jmc@joyent.com>
Reviewed by: Ryan Zezeski <ryan@zinascii.com>
OS-5293 lx brand: prelink(8)'d binaries core dump before main()
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-5072 lxbrand support PT_GNU_STACK
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-5202 Support AT_SECURE & AT_*ID in LX
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
OS-4824 Unlike Linux, nested interpreters don't work
(LX changes only, the rest were upstreamed...)
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Cody Mello <cody.mello@joyent.com>
OS-3735 modstubs MAXNARG is too low.
OS-3733 Verify b_native_exec exists before calling it
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-4119 lxbrand panic when running native perl inside lx zone
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-4128 programs that lack PT_PHDR are not properly loaded
OS-4141 freeing phdrs induces bad kmem_free() in elfexec()
backout OS-4141: needs more work
backout OS-4128: needs more work
OS-4141 freeing phdrs induces bad kmem_free() in elfexec()
OS-4128 programs that lack PT_PHDR are not properly loaded
OS-3696 lx brand: G-Portugol programs core dump
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-3517 lx brand: branded zones don't interpret .interp section
OS-3405 lx brand: socket() fails for PF_INET6
OS-3382 lxbrand 64bit gettimeofday depends on vsyscall or vdso
OS-3280 need a way to specify the root of a native system in the lx brand
OS-3279 lx brand should allow delegated datasets
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-2949 add support for AT_RANDOM aux vector entry
OS-2877 lx_librtld_db fails to load due to NULL DT_DEBUG
        
*** 24,34 ****
   */
  
  /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  /*        All Rights Reserved   */
  /*
!  * Copyright (c) 2013, Joyent, Inc.  All rights reserved.
   */
  
  #include <sys/types.h>
  #include <sys/param.h>
  #include <sys/thread.h>
--- 24,34 ----
   */
  
  /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  /*        All Rights Reserved   */
  /*
!  * Copyright 2016 Joyent, Inc.
   */
  
  #include <sys/types.h>
  #include <sys/param.h>
  #include <sys/thread.h>
*** 64,73 ****
--- 64,78 ----
  #include <sys/brand.h>
  #include "elf_impl.h"
  #include <sys/sdt.h>
  #include <sys/siginfo.h>
  
+ #if defined(__x86)
+ #include <sys/comm_page_util.h>
+ #endif /* defined(__x86) */
+ 
+ 
  extern int at_flags;
  
  #define ORIGIN_STR      "ORIGIN"
  #define ORIGIN_STR_SIZE 6
  
*** 161,176 ****
  
          return (0);
  }
  
  /*
!  * Map in the executable pointed to by vp. Returns 0 on success.
   */
  int
  mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr,
!     intptr_t *voffset, caddr_t exec_file, int *interp, caddr_t *bssbase,
!     caddr_t *brkbase, size_t *brksize, uintptr_t *lddatap)
  {
          size_t          len;
          struct vattr    vat;
          caddr_t         phdrbase = NULL;
          ssize_t         phdrsize;
--- 166,185 ----
  
          return (0);
  }
  
  /*
!  * Map in the executable pointed to by vp. Returns 0 on success.  Note that
!  * this function currently has the maximum number of arguments allowed by
!  * modstubs on x86 (MAXNARG)!  Do _not_ add to this function signature without
!  * adding to MAXNARG.  (Better yet, do not add to this monster of a function
!  * signature!)
   */
  int
  mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr,
!     intptr_t *voffset, caddr_t exec_file, char **interpp, caddr_t *bssbase,
!     caddr_t *brkbase, size_t *brksize, uintptr_t *lddatap, uintptr_t *minaddrp)
  {
          size_t          len;
          struct vattr    vat;
          caddr_t         phdrbase = NULL;
          ssize_t         phdrsize;
*** 178,194 ****
--- 187,207 ----
          int             error = 0;
          Phdr            *uphdr = NULL;
          Phdr            *junk = NULL;
          Phdr            *dynphdr = NULL;
          Phdr            *dtrphdr = NULL;
+         char            *interp = NULL;
          uintptr_t       lddata;
          long            execsz;
          intptr_t        minaddr;
  
          if (lddatap != NULL)
                  *lddatap = NULL;
  
+         if (minaddrp != NULL)
+                 *minaddrp = NULL;
+ 
          if (error = execpermissions(vp, &vat, args)) {
                  uprintf("%s: Cannot execute %s\n", exec_file, args->pathname);
                  return (error);
          }
  
*** 210,238 ****
  
          if (error = mapelfexec(vp, ehdr, nphdrs, phdrbase, &uphdr, &dynphdr,
              &junk, &dtrphdr, NULL, bssbase, brkbase, voffset, &minaddr,
              len, &execsz, brksize)) {
                  uprintf("%s: Cannot map %s\n", exec_file, args->pathname);
                  kmem_free(phdrbase, phdrsize);
                  return (error);
          }
  
          /*
!          * Inform our caller if the executable needs an interpreter.
           */
!         *interp = (dynphdr == NULL) ? 0 : 1;
  
          /*
           * If this is a statically linked executable, voffset should indicate
           * the address of the executable itself (it normally holds the address
           * of the interpreter).
           */
!         if (ehdr->e_type == ET_EXEC && *interp == 0)
                  *voffset = minaddr;
  
          if (uphdr != NULL) {
                  *uphdr_vaddr = uphdr->p_vaddr;
          } else {
                  *uphdr_vaddr = (Addr)-1;
          }
  
          kmem_free(phdrbase, phdrsize);
--- 223,315 ----
  
          if (error = mapelfexec(vp, ehdr, nphdrs, phdrbase, &uphdr, &dynphdr,
              &junk, &dtrphdr, NULL, bssbase, brkbase, voffset, &minaddr,
              len, &execsz, brksize)) {
                  uprintf("%s: Cannot map %s\n", exec_file, args->pathname);
+                 if (uphdr != NULL && uphdr->p_flags == 0)
+                         kmem_free(uphdr, sizeof (Phdr));
                  kmem_free(phdrbase, phdrsize);
                  return (error);
          }
  
+         if (minaddrp != NULL)
+                 *minaddrp = minaddr;
+ 
          /*
!          * If the executable requires an interpreter, determine its name.
           */
!         if (dynphdr != NULL) {
!                 ssize_t resid;
  
+                 if (dynphdr->p_filesz > MAXPATHLEN || dynphdr->p_filesz == 0) {
+                         uprintf("%s: Invalid interpreter\n", exec_file);
+                         kmem_free(phdrbase, phdrsize);
+                         return (ENOEXEC);
+                 }
+ 
+                 interp = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ 
+                 if ((error = vn_rdwr(UIO_READ, vp, interp, dynphdr->p_filesz,
+                     (offset_t)dynphdr->p_offset, UIO_SYSSPACE, 0,
+                     (rlim64_t)0, CRED(), &resid)) != 0 || resid != 0 ||
+                     interp[dynphdr->p_filesz - 1] != '\0') {
+                         uprintf("%s: Cannot obtain interpreter pathname\n",
+                             exec_file);
+                         kmem_free(interp, MAXPATHLEN);
+                         kmem_free(phdrbase, phdrsize);
+                         return (error != 0 ? error : ENOEXEC);
+                 }
+         }
+ 
          /*
           * If this is a statically linked executable, voffset should indicate
           * the address of the executable itself (it normally holds the address
           * of the interpreter).
           */
!         if (ehdr->e_type == ET_EXEC && interp == NULL)
                  *voffset = minaddr;
  
+         /*
+          * If the caller has asked for the interpreter name, return it (it's
+          * up to the caller to free it); if the caller hasn't asked for it,
+          * free it ourselves.
+          */
+         if (interpp != NULL) {
+                 *interpp = interp;
+         } else if (interp != NULL) {
+                 kmem_free(interp, MAXPATHLEN);
+         }
+ 
          if (uphdr != NULL) {
                  *uphdr_vaddr = uphdr->p_vaddr;
+ 
+                 if (uphdr->p_flags == 0)
+                         kmem_free(uphdr, sizeof (Phdr));
+         } else if (ehdr->e_type == ET_DYN) {
+                 /*
+                  * If we don't have a uphdr, we'll apply the logic found
+                  * in mapelfexec() and use the p_vaddr of the first PT_LOAD
+                  * section as the base address of the object.
+                  */
+                 Phdr *phdr = (Phdr *)phdrbase;
+                 int i, hsize = ehdr->e_phentsize;
+ 
+                 for (i = nphdrs; i > 0; i--) {
+                         if (phdr->p_type == PT_LOAD) {
+                                 *uphdr_vaddr = (uintptr_t)phdr->p_vaddr +
+                                     ehdr->e_phoff;
+                                 break;
+                         }
+ 
+                         phdr = (Phdr *)((caddr_t)phdr + hsize);
+                 }
+ 
+                 /*
+                  * If we don't have a PT_LOAD segment, we should have returned
+                  * ENOEXEC when elfsize() returned 0, above.
+                  */
+                 VERIFY(i > 0);
          } else {
                  *uphdr_vaddr = (Addr)-1;
          }
  
          kmem_free(phdrbase, phdrsize);
*** 241,257 ****
  
  /*ARGSUSED*/
  int
  elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
      int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred,
!     int brand_action)
  {
          caddr_t         phdrbase = NULL;
          caddr_t         bssbase = 0;
          caddr_t         brkbase = 0;
          size_t          brksize = 0;
!         ssize_t         dlnsize;
          aux_entry_t     *aux;
          int             error;
          ssize_t         resid;
          int             fd = -1;
          intptr_t        voffset;
--- 318,334 ----
  
  /*ARGSUSED*/
  int
  elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
      int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred,
!     int *brand_action)
  {
          caddr_t         phdrbase = NULL;
          caddr_t         bssbase = 0;
          caddr_t         brkbase = 0;
          size_t          brksize = 0;
!         ssize_t         dlnsize, nsize = 0;
          aux_entry_t     *aux;
          int             error;
          ssize_t         resid;
          int             fd = -1;
          intptr_t        voffset;
*** 271,280 ****
--- 348,358 ----
          ssize_t         capsize;
          int             hasu = 0;
          int             hasauxv = 0;
          int             hasdy = 0;
          int             branded = 0;
+         int             dynuphdr = 0;
  
          struct proc *p = ttoproc(curthread);
          struct user *up = PTOU(p);
          struct bigwad {
                  Ehdr    ehdr;
*** 325,335 ****
--- 403,415 ----
          if (ehdrp->e_ident[EI_CLASS] == ELFCLASS32) {
                  args->to_model = DATAMODEL_ILP32;
                  *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS32-1);
          } else {
                  args->to_model = DATAMODEL_LP64;
+                 if (!args->stk_prot_override) {
                          args->stk_prot &= ~PROT_EXEC;
+                 }
  #if defined(__i386) || defined(__amd64)
                  args->dat_prot &= ~PROT_EXEC;
  #endif
                  *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS64-1);
          }
*** 337,360 ****
          args->to_model = DATAMODEL_ILP32;
          *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS-1);
  #endif  /* _LP64 */
  
          /*
!          * We delay invoking the brand callback until we've figured out
!          * what kind of elf binary we're trying to run, 32-bit or 64-bit.
!          * We do this because now the brand library can just check
!          * args->to_model to see if the target is 32-bit or 64-bit without
!          * having do duplicate all the code above.
           *
           * The level checks associated with brand handling below are used to
           * prevent a loop since the brand elfexec function typically comes back
           * through this function. We must check <= here since the nested
           * handling in the #! interpreter code will increment the level before
           * calling gexec to run the final elfexec interpreter.
           */
          if ((level <= INTP_MAXDEPTH) &&
!             (brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
                  error = BROP(p)->b_elfexec(vp, uap, args,
                      idatap, level + 1, execsz, setid, exec_file, cred,
                      brand_action);
                  goto out;
          }
--- 417,466 ----
          args->to_model = DATAMODEL_ILP32;
          *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS-1);
  #endif  /* _LP64 */
  
          /*
!          * We delay invoking the brand callback until we've figured out what
!          * kind of elf binary we're trying to run, 32-bit or 64-bit.  We do this
!          * because now the brand library can just check args->to_model to see if
!          * the target is 32-bit or 64-bit without having to duplicate all the
!          * code above.
           *
+          * We also give the brand a chance to indicate that based on the ELF
+          * OSABI of the target binary it should become unbranded and optionally
+          * indicate that it should be treated as existing in a specific prefix.
+          *
+          * Note that if a brand opts to go down this route it does not actually
+          * end up being debranded. In other words, future programs that exec
+          * will still be considered for branding unless this escape hatch is
+          * used. Consider the case of lx brand for example. If a user runs
+          * /native/usr/sbin/dtrace -c /bin/ls, the isaexec and normal executable
+          * of DTrace that's in /native will take this escape hatch and be run
+          * and interpreted using the normal system call table; however, the
+          * execution of a non-illumos binary in the form of /bin/ls will still
+          * be branded and be subject to all of the normal actions of the brand.
+          *
           * The level checks associated with brand handling below are used to
           * prevent a loop since the brand elfexec function typically comes back
           * through this function. We must check <= here since the nested
           * handling in the #! interpreter code will increment the level before
           * calling gexec to run the final elfexec interpreter.
           */
+         if ((level <= INTP_MAXDEPTH) && (*brand_action != EBA_NATIVE) &&
+             (PROC_IS_BRANDED(p)) && (BROP(p)->b_native_exec != NULL)) {
+                 if (BROP(p)->b_native_exec(ehdrp->e_ident[EI_OSABI],
+                     &args->brand_nroot) == B_TRUE) {
+                         ASSERT(ehdrp->e_ident[EI_OSABI]);
+                         *brand_action = EBA_NATIVE;
+                         /* Add one for the trailing '/' in the path */
+                         if (args->brand_nroot != NULL)
+                                 nsize = strlen(args->brand_nroot) + 1;
+                 }
+         }
+ 
          if ((level <= INTP_MAXDEPTH) &&
!             (*brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
                  error = BROP(p)->b_elfexec(vp, uap, args,
                      idatap, level + 1, execsz, setid, exec_file, cred,
                      brand_action);
                  goto out;
          }
*** 421,438 ****
                   * entries are:
                   *
                   *      AT_BASE
                   *      AT_FLAGS
                   *      AT_PAGESZ
                   *      AT_SUN_AUXFLAGS
                   *      AT_SUN_HWCAP
                   *      AT_SUN_HWCAP2
                   *      AT_SUN_PLATFORM (added in stk_copyout)
                   *      AT_SUN_EXECNAME (added in stk_copyout)
                   *      AT_NULL
                   *
!                  * total == 9
                   */
                  if (hasdy && hasu) {
                          /*
                           * Has PT_INTERP & PT_PHDR - the auxvectors that
                           * will be built are:
--- 527,545 ----
                   * entries are:
                   *
                   *      AT_BASE
                   *      AT_FLAGS
                   *      AT_PAGESZ
+                  *      AT_RANDOM
                   *      AT_SUN_AUXFLAGS
                   *      AT_SUN_HWCAP
                   *      AT_SUN_HWCAP2
                   *      AT_SUN_PLATFORM (added in stk_copyout)
                   *      AT_SUN_EXECNAME (added in stk_copyout)
                   *      AT_NULL
                   *
!                  * total == 10
                   */
                  if (hasdy && hasu) {
                          /*
                           * Has PT_INTERP & PT_PHDR - the auxvectors that
                           * will be built are:
*** 443,465 ****
                           *      AT_ENTRY
                           *      AT_LDDATA
                           *
                           * total = 5
                           */
!                         args->auxsize = (9 + 5) * sizeof (aux_entry_t);
                  } else if (hasdy) {
                          /*
                           * Has PT_INTERP but no PT_PHDR
                           *
                           *      AT_EXECFD
                           *      AT_LDDATA
                           *
                           * total = 2
                           */
!                         args->auxsize = (9 + 2) * sizeof (aux_entry_t);
                  } else {
!                         args->auxsize = 9 * sizeof (aux_entry_t);
                  }
          } else {
                  args->auxsize = 0;
          }
  
--- 550,572 ----
                           *      AT_ENTRY
                           *      AT_LDDATA
                           *
                           * total = 5
                           */
!                         args->auxsize = (10 + 5) * sizeof (aux_entry_t);
                  } else if (hasdy) {
                          /*
                           * Has PT_INTERP but no PT_PHDR
                           *
                           *      AT_EXECFD
                           *      AT_LDDATA
                           *
                           * total = 2
                           */
!                         args->auxsize = (10 + 2) * sizeof (aux_entry_t);
                  } else {
!                         args->auxsize = 10 * sizeof (aux_entry_t);
                  }
          } else {
                  args->auxsize = 0;
          }
  
*** 468,486 ****
           * AT_SUN_EMULATOR aux entry.
           */
          if (args->emulator != NULL)
                  args->auxsize += sizeof (aux_entry_t);
  
-         if ((brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
-                 branded = 1;
                  /*
!                  * We will be adding 4 entries to the aux vectors.  One for
!                  * the the brandname and 3 for the brand specific aux vectors.
                   */
                  args->auxsize += 4 * sizeof (aux_entry_t);
          }
  
          /* Hardware/Software capabilities */
          if (capphdr != NULL &&
              (capsize = capphdr->p_filesz) > 0 &&
              capsize <= 16 * sizeof (*cap)) {
                  int ncaps = capsize / sizeof (*cap);
--- 575,622 ----
           * AT_SUN_EMULATOR aux entry.
           */
          if (args->emulator != NULL)
                  args->auxsize += sizeof (aux_entry_t);
  
          /*
!          * If this is a native binary that's been given a modified interpreter
!          * root, inform it that the native system exists at that root.
           */
+         if (args->brand_nroot != NULL) {
+                 args->auxsize += sizeof (aux_entry_t);
+         }
+ 
+ 
+         /*
+          * On supported kernels (x86_64) make room in the auxv for the
+          * AT_SUN_COMMPAGE entry.  This will go unpopulated on i86xpv systems
+          * which do not provide such functionality.
+          */
+ #if defined(__amd64)
+         args->auxsize += sizeof (aux_entry_t);
+ #endif /* defined(__amd64) */
+ 
+         /*
+          * If we have user credentials, we'll supply the following entries:
+          *      AT_SUN_UID
+          *      AT_SUN_RUID
+          *      AT_SUN_GID
+          *      AT_SUN_RGID
+          */
+         if (cred != NULL) {
                  args->auxsize += 4 * sizeof (aux_entry_t);
          }
  
+         if ((*brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
+                 branded = 1;
+                 /*
+                  * We will be adding 5 entries to the aux vectors.  One for
+                  * the brandname and 4 for the brand specific aux vectors.
+                  */
+                 args->auxsize += 5 * sizeof (aux_entry_t);
+         }
+ 
          /* Hardware/Software capabilities */
          if (capphdr != NULL &&
              (capsize = capphdr->p_filesz) > 0 &&
              capsize <= 16 * sizeof (*cap)) {
                  int ncaps = capsize / sizeof (*cap);
*** 532,541 ****
--- 668,685 ----
          if ((error = mapelfexec(vp, ehdrp, nphdrs, phdrbase, &uphdr, &dyphdr,
              &stphdr, &dtrphdr, dataphdrp, &bssbase, &brkbase, &voffset, NULL,
              len, execsz, &brksize)) != 0)
                  goto bad;
  
+         if (uphdr != NULL) {
+                 /*
+                  * Our uphdr has been dynamically allocated if (and only if)
+                  * its program header flags are clear.
+                  */
+                 dynuphdr = (uphdr->p_flags == 0);
+         }
+ 
          if (uphdr != NULL && dyphdr == NULL)
                  goto bad;
  
          if (dtrphdr != NULL && dtrace_safe_phdr(dtrphdr, args, voffset) != 0) {
                  uprintf("%s: Bad DTrace phdr in %s\n", exec_file, exec_file);
*** 546,566 ****
                  size_t          len;
                  uintptr_t       lddata;
                  char            *p;
                  struct vnode    *nvp;
  
!                 dlnsize = dyphdr->p_filesz;
  
                  if (dlnsize > MAXPATHLEN || dlnsize <= 0)
                          goto bad;
  
                  /*
                   * Read in "interpreter" pathname.
                   */
!                 if ((error = vn_rdwr(UIO_READ, vp, dlnp, dyphdr->p_filesz,
!                     (offset_t)dyphdr->p_offset, UIO_SYSSPACE, 0, (rlim64_t)0,
!                     CRED(), &resid)) != 0) {
                          uprintf("%s: Cannot obtain interpreter pathname\n",
                              exec_file);
                          goto bad;
                  }
  
--- 690,715 ----
                  size_t          len;
                  uintptr_t       lddata;
                  char            *p;
                  struct vnode    *nvp;
  
!                 dlnsize = dyphdr->p_filesz + nsize;
  
                  if (dlnsize > MAXPATHLEN || dlnsize <= 0)
                          goto bad;
  
+                 if (nsize != 0) {
+                         bcopy(args->brand_nroot, dlnp, nsize - 1);
+                         dlnp[nsize - 1] = '/';
+                 }
+ 
                  /*
                   * Read in "interpreter" pathname.
                   */
!                 if ((error = vn_rdwr(UIO_READ, vp, dlnp + nsize,
!                     dyphdr->p_filesz, (offset_t)dyphdr->p_offset, UIO_SYSSPACE,
!                     0, (rlim64_t)0, CRED(), &resid)) != 0) {
                          uprintf("%s: Cannot obtain interpreter pathname\n",
                              exec_file);
                          goto bad;
                  }
  
*** 701,713 ****
                          goto bad;
                  }
  
                  dtrphdr = NULL;
  
!                 error = mapelfexec(nvp, ehdrp, nphdrs, phdrbase, &junk, &junk,
                      &junk, &dtrphdr, NULL, NULL, NULL, &voffset, NULL, len,
                      execsz, NULL);
                  if (error || junk != NULL) {
                          VN_RELE(nvp);
                          uprintf("%s: Cannot map %s\n", exec_file, dlnp);
                          goto bad;
                  }
--- 850,863 ----
                          goto bad;
                  }
  
                  dtrphdr = NULL;
  
!                 error = mapelfexec(nvp, ehdrp, nphdrs, phdrbase, NULL, &junk,
                      &junk, &dtrphdr, NULL, NULL, NULL, &voffset, NULL, len,
                      execsz, NULL);
+ 
                  if (error || junk != NULL) {
                          VN_RELE(nvp);
                          uprintf("%s: Cannot map %s\n", exec_file, dlnp);
                          goto bad;
                  }
*** 730,741 ****
                  ADDAUX(aux, AT_SUN_LDDATA, voffset + lddata)
          }
  
          if (hasauxv) {
                  int auxf = AF_SUN_HWCAPVERIFY;
                  /*
!                  * Note: AT_SUN_PLATFORM and AT_SUN_EXECNAME were filled in via
                   * exec_args()
                   */
                  ADDAUX(aux, AT_BASE, voffset)
                  ADDAUX(aux, AT_FLAGS, at_flags)
                  ADDAUX(aux, AT_PAGESZ, PAGESIZE)
--- 880,892 ----
                  ADDAUX(aux, AT_SUN_LDDATA, voffset + lddata)
          }
  
          if (hasauxv) {
                  int auxf = AF_SUN_HWCAPVERIFY;
+ 
                  /*
!                  * Note: AT_SUN_PLATFORM and AT_RANDOM were filled in via
                   * exec_args()
                   */
                  ADDAUX(aux, AT_BASE, voffset)
                  ADDAUX(aux, AT_FLAGS, at_flags)
                  ADDAUX(aux, AT_PAGESZ, PAGESIZE)
*** 760,770 ****
                   * which we trust because we see they are already running
                   * under pfexec (where uid != euid).  This prevents a
                   * malicious user within the zone from crafting a wrapper to
                   * run native suid commands with unsecure libraries interposed.
                   */
!                 if ((brand_action == EBA_NATIVE) && (PROC_IS_BRANDED(p) &&
                      (setid &= ~EXECSETID_SETID) != 0))
                          auxf &= ~AF_SUN_SETUGID;
  
                  /*
                   * Record the user addr of the auxflags aux vector entry
--- 911,921 ----
                   * which we trust because we see they are already running
                   * under pfexec (where uid != euid).  This prevents a
                   * malicious user within the zone from crafting a wrapper to
                   * run native suid commands with unsecure libraries interposed.
                   */
!                 if ((*brand_action == EBA_NATIVE) && (PROC_IS_BRANDED(p) &&
                      (setid &= ~EXECSETID_SETID) != 0))
                          auxf &= ~AF_SUN_SETUGID;
  
                  /*
                   * Record the user addr of the auxflags aux vector entry
*** 773,783 ****
--- 924,946 ----
                  args->auxp_auxflags =
                      (char *)((char *)args->stackend +
                      ((char *)&aux->a_type -
                      (char *)bigwad->elfargs));
                  ADDAUX(aux, AT_SUN_AUXFLAGS, auxf);
+ 
                  /*
+                  * Record information about the real and effective user and
+                  * group IDs.
+                  */
+                 if (cred != NULL) {
+                         ADDAUX(aux, AT_SUN_UID, crgetuid(cred));
+                         ADDAUX(aux, AT_SUN_RUID, crgetruid(cred));
+                         ADDAUX(aux, AT_SUN_GID, crgetgid(cred));
+                         ADDAUX(aux, AT_SUN_RGID, crgetrgid(cred));
+                 }
+ 
+                 /*
                   * Hardware capability flag word (performance hints)
                   * Used for choosing faster library routines.
                   * (Potentially different between 32-bit and 64-bit ABIs)
                   */
  #if defined(_LP64)
*** 802,814 ****
--- 965,994 ----
                              ((char *)&aux->a_type -
                              (char *)bigwad->elfargs));
                          ADDAUX(aux, AT_SUN_BRAND_AUX1, 0)
                          ADDAUX(aux, AT_SUN_BRAND_AUX2, 0)
                          ADDAUX(aux, AT_SUN_BRAND_AUX3, 0)
+                         ADDAUX(aux, AT_SUN_BRAND_AUX4, 0)
                  }
  
+                 /*
+                  * Add the comm page auxv entry, mapping it in if needed.
+                  */
+ #if defined(__amd64)
+                 if (args->commpage != NULL ||
+                     (args->commpage = (uintptr_t)comm_page_mapin()) != NULL) {
+                         ADDAUX(aux, AT_SUN_COMMPAGE, args->commpage)
+                 } else {
+                         /*
+                          * If the comm page cannot be mapped, pad out the auxv
+                          * to satisfy later size checks.
+                          */
                          ADDAUX(aux, AT_NULL, 0)
+                 }
+ #endif /* defined(__amd64) */
+ 
+                 ADDAUX(aux, AT_NULL, 0)
                  postfixsize = (char *)aux - (char *)bigwad->elfargs;
  
                  /*
                   * We make assumptions above when we determine how many aux
                   * vector entries we will be adding. However, if we have an
*** 843,852 ****
--- 1023,1033 ----
                  error = ENOMEM;
                  goto bad;
          }
  
          bzero(up->u_auxv, sizeof (up->u_auxv));
+         up->u_commpagep = args->commpage;
          if (postfixsize) {
                  int num_auxv;
  
                  /*
                   * Copy the aux vector to the user stack.
*** 909,918 ****
--- 1090,1101 ----
          psignal(p, SIGKILL);
  
          if (error == 0)
                  error = ENOEXEC;
  out:
+         if (dynuphdr)
+                 kmem_free(uphdr, sizeof (Phdr));
          if (phdrbase != NULL)
                  kmem_free(phdrbase, phdrsize);
          if (cap != NULL)
                  kmem_free(cap, capsize);
          kmem_free(bigwad, sizeof (struct bigwad));
*** 1175,1184 ****
--- 1358,1390 ----
          (*shstrbasep)[*shstrsizep - 1] = '\0';
  
          return (0);
  }
  
+ 
+ #ifdef _ELF32_COMPAT
+ int
+ elf32readhdr(vnode_t *vp, cred_t *credp, Ehdr *ehdrp, int *nphdrs,
+     caddr_t *phbasep, ssize_t *phsizep)
+ #else
+ int
+ elfreadhdr(vnode_t *vp, cred_t *credp, Ehdr *ehdrp, int *nphdrs,
+     caddr_t *phbasep, ssize_t *phsizep)
+ #endif
+ {
+         int error, nshdrs, shstrndx;
+ 
+         if ((error = getelfhead(vp, credp, ehdrp, &nshdrs, &shstrndx,
+             nphdrs)) != 0 ||
+             (error = getelfphdr(vp, credp, ehdrp, *nphdrs, phbasep,
+             phsizep)) != 0) {
+                 return (error);
+         }
+         return (0);
+ }
+ 
+ 
  static int
  mapelfexec(
          vnode_t *vp,
          Ehdr *ehdr,
          int nphdrs,
*** 1195,1249 ****
          size_t len,
          long *execsz,
          size_t *brksize)
  {
          Phdr *phdr;
!         int i, prot, error;
          caddr_t addr = NULL;
          size_t zfodsz;
          int ptload = 0;
          int page;
          off_t offset;
          int hsize = ehdr->e_phentsize;
          caddr_t mintmp = (caddr_t)-1;
          extern int use_brk_lpg;
  
          if (ehdr->e_type == ET_DYN) {
!                 /*
!                  * Obtain the virtual address of a hole in the
!                  * address space to map the "interpreter".
!                  */
!                 map_addr(&addr, len, (offset_t)0, 1, 0);
!                 if (addr == NULL)
!                         return (ENOMEM);
!                 *voffset = (intptr_t)addr;
  
                  /*
!                  * Calculate the minimum vaddr so it can be subtracted out.
!                  * According to the ELF specification, since PT_LOAD sections
!                  * must be sorted by increasing p_vaddr values, this is
!                  * guaranteed to be the first PT_LOAD section.
                   */
                  phdr = (Phdr *)phdrbase;
                  for (i = nphdrs; i > 0; i--) {
                          if (phdr->p_type == PT_LOAD) {
!                                 *voffset -= (uintptr_t)phdr->p_vaddr;
                                  break;
                          }
                          phdr = (Phdr *)((caddr_t)phdr + hsize);
                  }
  
          } else {
                  *voffset = 0;
          }
          phdr = (Phdr *)phdrbase;
          for (i = nphdrs; i > 0; i--) {
                  switch (phdr->p_type) {
                  case PT_LOAD:
-                         if ((*dyphdr != NULL) && (*uphdr == NULL))
-                                 return (0);
- 
                          ptload = 1;
                          prot = PROT_USER;
                          if (phdr->p_flags & PF_R)
                                  prot |= PROT_READ;
                          if (phdr->p_flags & PF_W)
--- 1401,1490 ----
          size_t len,
          long *execsz,
          size_t *brksize)
  {
          Phdr *phdr;
!         int i, prot, error, lastprot = 0;
          caddr_t addr = NULL;
          size_t zfodsz;
          int ptload = 0;
          int page;
          off_t offset;
          int hsize = ehdr->e_phentsize;
          caddr_t mintmp = (caddr_t)-1;
+         uintptr_t lastaddr = NULL;
          extern int use_brk_lpg;
  
          if (ehdr->e_type == ET_DYN) {
!                 caddr_t vaddr;
  
                  /*
!                  * Despite the fact that mmapobj(2) refuses to load them, we
!                  * need to support executing ET_DYN objects that have a
!                  * non-NULL p_vaddr.  When found in the wild, these objects
!                  * are likely to be due to an old (and largely obviated) Linux
!                  * facility, prelink(8), that rewrites shared objects to
!                  * prefer specific (disjoint) virtual address ranges.  (Yes,
!                  * this is putatively for performance -- and yes, it has
!                  * limited applicability, many edge conditions and grisly
!                  * failure modes; even for Linux, it's insane.)  As ELF
!                  * mandates that the PT_LOAD segments be in p_vaddr order, we
!                  * find the lowest p_vaddr by finding the first PT_LOAD
!                  * segment.
                   */
                  phdr = (Phdr *)phdrbase;
                  for (i = nphdrs; i > 0; i--) {
                          if (phdr->p_type == PT_LOAD) {
!                                 addr = (caddr_t)(uintptr_t)phdr->p_vaddr;
                                  break;
                          }
                          phdr = (Phdr *)((caddr_t)phdr + hsize);
                  }
  
+                 /*
+                  * We have a non-zero p_vaddr in the first PT_LOAD segment --
+                  * presumably because we're directly executing a prelink(8)'d
+                  * ld-linux.so.  While we could correctly execute such an
+                  * object without locating it at its desired p_vaddr (it is,
+                  * after all, still relocatable), our inner antiquarian
+                  * derives a perverse pleasure in accommodating the steampunk
+                  * prelink(8) contraption -- goggles on!
+                  */
+                 if ((vaddr = addr) != NULL) {
+                         if (as_gap(curproc->p_as, len,
+                             &addr, &len, AH_LO, NULL) == -1 || addr != vaddr) {
+                                 addr = NULL;
+                         }
+                 }
+ 
+                 if (addr == NULL) {
+                         /*
+                          * We either have a NULL p_vaddr (the common case, by
+                          * many orders of magnitude) or we have a non-NULL
+                          * p_vaddr and we were unable to obtain the specified
+                          * VA range (presumably because it's an illegal
+                          * address).  Either way, obtain an address in which
+                          * to map the interpreter.
+                          */
+                         map_addr(&addr, len, (offset_t)0, 1, 0);
+                         if (addr == NULL)
+                                 return (ENOMEM);
+                 }
+ 
+                 /*
+                  * Our voffset is the difference between where we landed and
+                  * where we wanted to be.
+                  */
+                 *voffset = (uintptr_t)addr - (uintptr_t)vaddr;
          } else {
                  *voffset = 0;
          }
+ 
          phdr = (Phdr *)phdrbase;
          for (i = nphdrs; i > 0; i--) {
                  switch (phdr->p_type) {
                  case PT_LOAD:
                          ptload = 1;
                          prot = PROT_USER;
                          if (phdr->p_flags & PF_R)
                                  prot |= PROT_READ;
                          if (phdr->p_flags & PF_W)
*** 1251,1267 ****
--- 1492,1571 ----
                          if (phdr->p_flags & PF_X)
                                  prot |= PROT_EXEC;
  
                          addr = (caddr_t)((uintptr_t)phdr->p_vaddr + *voffset);
  
+                         if ((*dyphdr != NULL) && uphdr != NULL &&
+                             (*uphdr == NULL)) {
                                  /*
+                                  * The PT_PHDR program header is, strictly
+                                  * speaking, optional.  If we find that this
+                                  * is missing, we will determine the location
+                                  * of the program headers based on the address
+                                  * of the lowest PT_LOAD segment (namely, this
+                                  * one):  we subtract the p_offset to get to
+                                  * the ELF header and then add back the program
+                                  * header offset to get to the program headers.
+                                  * We then cons up a Phdr that corresponds to
+                                  * the (missing) PT_PHDR, setting the flags
+                                  * to 0 to denote that this is artificial and
+                                  * should (must) be freed by the caller.
+                                  */
+                                 Phdr *cons;
+ 
+                                 cons = kmem_zalloc(sizeof (Phdr), KM_SLEEP);
+ 
+                                 cons->p_flags = 0;
+                                 cons->p_type = PT_PHDR;
+                                 cons->p_vaddr = ((uintptr_t)addr -
+                                     phdr->p_offset) + ehdr->e_phoff;
+ 
+                                 *uphdr = cons;
+                         }
+ 
+                         /*
                           * Keep track of the segment with the lowest starting
                           * address.
                           */
                          if (addr < mintmp)
                                  mintmp = addr;
  
+                         /*
+                          * Segments need not correspond to page boundaries:
+                          * they are permitted to share a page.  If two PT_LOAD
+                          * segments share the same page, and the permissions
+                          * of the segments differ, the behavior is historically
+                          * that the permissions of the latter segment are used
+                          * for the page that the two segments share.  This is
+                          * also historically a non-issue:  binaries generated
+                          * by most anything will make sure that two PT_LOAD
+                          * segments with differing permissions don't actually
+                          * share any pages.  However, there exist some crazy
+                          * things out there (including at least an obscure
+                          * Portuguese teaching language called G-Portugol) that
+                          * actually do the wrong thing and expect it to work:
+                          * they have a segment with execute permission share
+                          * a page with a subsequent segment that does not
+                          * have execute permissions and expect the resulting
+                          * shared page to in fact be executable.  To accommodate
+                          * such broken link editors, we take advantage of a
+                          * latitude explicitly granted to the loader:  it is
+                          * permitted to make _any_ PT_LOAD segment executable
+                          * (provided that it is readable or writable).  If we
+                          * see that we're sharing a page and that the previous
+                          * page was executable, we will add execute permissions
+                          * to our segment.
+                          */
+                         if (btop(lastaddr) == btop((uintptr_t)addr) &&
+                             (phdr->p_flags & (PF_R | PF_W)) &&
+                             (lastprot & PROT_EXEC)) {
+                                 prot |= PROT_EXEC;
+                         }
+ 
+                         lastaddr = (uintptr_t)addr + phdr->p_filesz;
+                         lastprot = prot;
+ 
                          zfodsz = (size_t)phdr->p_memsz - phdr->p_filesz;
  
                          offset = phdr->p_offset;
                          if (((uintptr_t)offset & PAGEOFFSET) ==
                              ((uintptr_t)addr & PAGEOFFSET) &&
*** 1322,1344 ****
  
                          *execsz += btopr(phdr->p_memsz);
                          break;
  
                  case PT_INTERP:
!                         if (ptload)
!                                 goto bad;
                          *dyphdr = phdr;
                          break;
  
                  case PT_SHLIB:
                          *stphdr = phdr;
                          break;
  
                  case PT_PHDR:
!                         if (ptload)
                                  goto bad;
                          *uphdr = phdr;
                          break;
  
                  case PT_NULL:
                  case PT_DYNAMIC:
                  case PT_NOTE:
--- 1626,1665 ----
  
                          *execsz += btopr(phdr->p_memsz);
                          break;
  
                  case PT_INTERP:
!                         /*
!                          * The ELF specification is unequivocal about the
!                          * PT_INTERP program header with respect to any PT_LOAD
!                          * program header:  "If it is present, it must precede
!                          * any loadable segment entry." Linux, however, makes
!                          * no attempt to enforce this -- which has allowed some
!                          * binary editing tools to get away with generating
!                          * invalid ELF binaries in the respect that PT_INTERP
!                          * occurs after the first PT_LOAD program header.  This
!                          * is unfortunate (and of course, disappointing) but
!                          * it's no worse than that: there is no reason that we
!                          * can't process the PT_INTERP entry (if present) after
!                          * one or more PT_LOAD entries.  We therefore
!                          * deliberately do not check ptload here and always
!                          * store dyphdr to be the PT_INTERP program header.
!                          */
                          *dyphdr = phdr;
                          break;
  
                  case PT_SHLIB:
                          *stphdr = phdr;
                          break;
  
                  case PT_PHDR:
!                         if (ptload || phdr->p_flags == 0)
                                  goto bad;
+ 
+                         if (uphdr != NULL)
                                  *uphdr = phdr;
+ 
                          break;
  
                  case PT_NULL:
                  case PT_DYNAMIC:
                  case PT_NOTE:
*** 2183,2193 ****
  
  #ifdef  _LP64
  extern int elf32exec(vnode_t *vp, execa_t *uap, uarg_t *args,
                          intpdata_t *idatap, int level, long *execsz,
                          int setid, caddr_t exec_file, cred_t *cred,
!                         int brand_action);
  extern int elf32core(vnode_t *vp, proc_t *p, cred_t *credp,
                          rlim64_t rlimit, int sig, core_content_t content);
  
  static struct execsw esw32 = {
          elf32magicstr,
--- 2504,2514 ----
  
  #ifdef  _LP64
  extern int elf32exec(vnode_t *vp, execa_t *uap, uarg_t *args,
                          intpdata_t *idatap, int level, long *execsz,
                          int setid, caddr_t exec_file, cred_t *cred,
!                         int *brand_action);
  extern int elf32core(vnode_t *vp, proc_t *p, cred_t *credp,
                          rlim64_t rlimit, int sig, core_content_t content);
  
  static struct execsw esw32 = {
          elf32magicstr,