OS-5015 PT_INTERP headers should be permitted after PT_LOAD headers
OS-5451 comm page should not break i86xpv
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-5192 need faster clock_gettime
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Joshua M. Clulow <jmc@joyent.com>
Reviewed by: Ryan Zezeski <ryan@zinascii.com>
OS-5293 lx brand: prelink(8)'d binaries core dump before main()
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-5072 lxbrand support PT_GNU_STACK
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-5202 Support AT_SECURE & AT_*ID in LX
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
OS-4824 Unlike Linux, nested interpreters don't work
(LX changes only, the rest were upstreamed...)
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Cody Mello <cody.mello@joyent.com>
OS-3735 modstubs MAXNARG is too low.
OS-3733 Verify b_native_exec exists before calling it
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-4119 lxbrand panic when running native perl inside lx zone
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-4128 programs that lack PT_PHDR are not properly loaded
OS-4141 freeing phdrs induces bad kmem_free() in elfexec()
backout OS-4141: needs more work
backout OS-4128: needs more work
OS-4141 freeing phdrs induces bad kmem_free() in elfexec()
OS-4128 programs that lack PT_PHDR are not properly loaded
OS-3696 lx brand: G-Portugol programs core dump
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-3517 lx brand: branded zones don't interpret .interp section
OS-3405 lx brand: socket() fails for PF_INET6
OS-3382 lxbrand 64bit gettimeofday depends on vsyscall or vdso
OS-3280 need a way to specify the root of a native system in the lx brand
OS-3279 lx brand should allow delegated datasets
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-2949 add support for AT_RANDOM aux vector entry
OS-2877 lx_librtld_db fails to load due to NULL DT_DEBUG
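
For illustration only (this program is not part of the change): OS-5192 and
OS-5451 concern the comm page.  The diff below reserves and fills a new
AT_SUN_COMMPAGE aux vector entry on amd64 (left unpopulated on i86xpv, which
does not provide the facility), the idea being that libc can answer time
queries from that shared mapping rather than making a system call.  Callers
need no changes; something like the following simply gets faster:

    #include <stdio.h>
    #include <time.h>

    int
    main(void)
    {
            struct timespec ts;

            /*
             * With the comm page mapped, this lookup may be satisfied
             * without entering the kernel (the intent of OS-5192).
             */
            if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0)
                    return (1);
            (void) printf("%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
            return (0);
    }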
        
@@ -24,11 +24,11 @@
  */
 
 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
 /*        All Rights Reserved   */
 /*
- * Copyright (c) 2013, Joyent, Inc.  All rights reserved.
+ * Copyright 2016 Joyent, Inc.
  */
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/thread.h>
@@ -64,10 +64,15 @@
 #include <sys/brand.h>
 #include "elf_impl.h"
 #include <sys/sdt.h>
 #include <sys/siginfo.h>
 
+#if defined(__x86)
+#include <sys/comm_page_util.h>
+#endif /* defined(__x86) */
+
+
 extern int at_flags;
 
 #define ORIGIN_STR      "ORIGIN"
 #define ORIGIN_STR_SIZE 6
 
@@ -161,16 +166,20 @@
 
         return (0);
 }
 
 /*
- * Map in the executable pointed to by vp. Returns 0 on success.
+ * Map in the executable pointed to by vp. Returns 0 on success.  Note that
+ * this function currently has the maximum number of arguments allowed by
+ * modstubs on x86 (MAXNARG)!  Do _not_ add to this function signature without
+ * adding to MAXNARG.  (Better yet, do not add to this monster of a function
+ * signature!)
  */
 int
 mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr,
-    intptr_t *voffset, caddr_t exec_file, int *interp, caddr_t *bssbase,
-    caddr_t *brkbase, size_t *brksize, uintptr_t *lddatap)
+    intptr_t *voffset, caddr_t exec_file, char **interpp, caddr_t *bssbase,
+    caddr_t *brkbase, size_t *brksize, uintptr_t *lddatap, uintptr_t *minaddrp)
 {
         size_t          len;
         struct vattr    vat;
         caddr_t         phdrbase = NULL;
         ssize_t         phdrsize;
@@ -178,17 +187,21 @@
         int             error = 0;
         Phdr            *uphdr = NULL;
         Phdr            *junk = NULL;
         Phdr            *dynphdr = NULL;
         Phdr            *dtrphdr = NULL;
+        char            *interp = NULL;
         uintptr_t       lddata;
         long            execsz;
         intptr_t        minaddr;
 
         if (lddatap != NULL)
                 *lddatap = NULL;
 
+        if (minaddrp != NULL)
+                *minaddrp = NULL;
+
         if (error = execpermissions(vp, &vat, args)) {
                 uprintf("%s: Cannot execute %s\n", exec_file, args->pathname);
                 return (error);
         }
 
@@ -210,29 +223,93 @@
 
         if (error = mapelfexec(vp, ehdr, nphdrs, phdrbase, &uphdr, &dynphdr,
             &junk, &dtrphdr, NULL, bssbase, brkbase, voffset, &minaddr,
             len, &execsz, brksize)) {
                 uprintf("%s: Cannot map %s\n", exec_file, args->pathname);
+                if (uphdr != NULL && uphdr->p_flags == 0)
+                        kmem_free(uphdr, sizeof (Phdr));
                 kmem_free(phdrbase, phdrsize);
                 return (error);
         }
 
+        if (minaddrp != NULL)
+                *minaddrp = minaddr;
+
         /*
-         * Inform our caller if the executable needs an interpreter.
+         * If the executable requires an interpreter, determine its name.
          */
-        *interp = (dynphdr == NULL) ? 0 : 1;
+        if (dynphdr != NULL) {
+                ssize_t resid;
 
+                if (dynphdr->p_filesz > MAXPATHLEN || dynphdr->p_filesz == 0) {
+                        uprintf("%s: Invalid interpreter\n", exec_file);
+                        kmem_free(phdrbase, phdrsize);
+                        return (ENOEXEC);
+                }
+
+                interp = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+                if ((error = vn_rdwr(UIO_READ, vp, interp, dynphdr->p_filesz,
+                    (offset_t)dynphdr->p_offset, UIO_SYSSPACE, 0,
+                    (rlim64_t)0, CRED(), &resid)) != 0 || resid != 0 ||
+                    interp[dynphdr->p_filesz - 1] != '\0') {
+                        uprintf("%s: Cannot obtain interpreter pathname\n",
+                            exec_file);
+                        kmem_free(interp, MAXPATHLEN);
+                        kmem_free(phdrbase, phdrsize);
+                        return (error != 0 ? error : ENOEXEC);
+                }
+        }
+
         /*
          * If this is a statically linked executable, voffset should indicate
          * the address of the executable itself (it normally holds the address
          * of the interpreter).
          */
-        if (ehdr->e_type == ET_EXEC && *interp == 0)
+        if (ehdr->e_type == ET_EXEC && interp == NULL)
                 *voffset = minaddr;
 
+        /*
+         * If the caller has asked for the interpreter name, return it (it's
+         * up to the caller to free it); if the caller hasn't asked for it,
+         * free it ourselves.
+         */
+        if (interpp != NULL) {
+                *interpp = interp;
+        } else if (interp != NULL) {
+                kmem_free(interp, MAXPATHLEN);
+        }
+
         if (uphdr != NULL) {
                 *uphdr_vaddr = uphdr->p_vaddr;
+
+                if (uphdr->p_flags == 0)
+                        kmem_free(uphdr, sizeof (Phdr));
+        } else if (ehdr->e_type == ET_DYN) {
+                /*
+                 * If we don't have a uphdr, we'll apply the logic found
+                 * in mapelfexec() and use the p_vaddr of the first PT_LOAD
+                 * section as the base address of the object.
+                 */
+                Phdr *phdr = (Phdr *)phdrbase;
+                int i, hsize = ehdr->e_phentsize;
+
+                for (i = nphdrs; i > 0; i--) {
+                        if (phdr->p_type == PT_LOAD) {
+                                *uphdr_vaddr = (uintptr_t)phdr->p_vaddr +
+                                    ehdr->e_phoff;
+                                break;
+                        }
+
+                        phdr = (Phdr *)((caddr_t)phdr + hsize);
+                }
+
+                /*
+                 * If we don't have a PT_LOAD segment, we should have returned
+                 * ENOEXEC when elfsize() returned 0, above.
+                 */
+                VERIFY(i > 0);
         } else {
                 *uphdr_vaddr = (Addr)-1;
         }
 
         kmem_free(phdrbase, phdrsize);
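
When the object has no PT_PHDR (OS-4128), mapexec_brand() now derives the
program header location from the first PT_LOAD segment plus e_phoff -- the
same rule mapelfexec() applies further down when it conses up a synthetic
header.  A user-space sketch of that rule, illustrative only, assuming the
common layout in which the first PT_LOAD maps the ELF header itself
(p_offset of 0):

    #include <sys/types.h>
    #include <sys/elf.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int
    main(int argc, char **argv)
    {
            Elf64_Ehdr e;
            Elf64_Phdr p;
            int fd, i;

            if (argc != 2 || (fd = open(argv[1], O_RDONLY)) < 0)
                    return (1);
            if (pread(fd, &e, sizeof (e), 0) != sizeof (e))
                    return (1);
            for (i = 0; i < e.e_phnum; i++) {
                    if (pread(fd, &p, sizeof (p), e.e_phoff +
                        (off_t)i * e.e_phentsize) != sizeof (p))
                            return (1);
                    if (p.p_type == PT_LOAD) {
                            /* Unbiased address of the program headers. */
                            (void) printf("phdrs at 0x%llx (p_vaddr) + "
                                "0x%llx (e_phoff)\n",
                                (unsigned long long)p.p_vaddr,
                                (unsigned long long)e.e_phoff);
                            break;
                    }
            }
            (void) close(fd);
            return (0);
    }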
@@ -241,17 +318,17 @@
 
 /*ARGSUSED*/
 int
 elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap,
     int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred,
-    int brand_action)
+    int *brand_action)
 {
         caddr_t         phdrbase = NULL;
         caddr_t         bssbase = 0;
         caddr_t         brkbase = 0;
         size_t          brksize = 0;
-        ssize_t         dlnsize;
+        ssize_t         dlnsize, nsize = 0;
         aux_entry_t     *aux;
         int             error;
         ssize_t         resid;
         int             fd = -1;
         intptr_t        voffset;
@@ -271,10 +348,11 @@
         ssize_t         capsize;
         int             hasu = 0;
         int             hasauxv = 0;
         int             hasdy = 0;
         int             branded = 0;
+        int             dynuphdr = 0;
 
         struct proc *p = ttoproc(curthread);
         struct user *up = PTOU(p);
         struct bigwad {
                 Ehdr    ehdr;
@@ -325,11 +403,13 @@
         if (ehdrp->e_ident[EI_CLASS] == ELFCLASS32) {
                 args->to_model = DATAMODEL_ILP32;
                 *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS32-1);
         } else {
                 args->to_model = DATAMODEL_LP64;
+                if (!args->stk_prot_override) {
-                args->stk_prot &= ~PROT_EXEC;
+                        args->stk_prot &= ~PROT_EXEC;
+                }
 #if defined(__i386) || defined(__amd64)
                 args->dat_prot &= ~PROT_EXEC;
 #endif
                 *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS64-1);
         }
@@ -337,24 +417,50 @@
         args->to_model = DATAMODEL_ILP32;
         *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS-1);
 #endif  /* _LP64 */
 
         /*
-         * We delay invoking the brand callback until we've figured out
-         * what kind of elf binary we're trying to run, 32-bit or 64-bit.
-         * We do this because now the brand library can just check
-         * args->to_model to see if the target is 32-bit or 64-bit without
-         * having do duplicate all the code above.
+         * We delay invoking the brand callback until we've figured out what
+         * kind of elf binary we're trying to run, 32-bit or 64-bit.  We do this
+         * because now the brand library can just check args->to_model to see if
+         * the target is 32-bit or 64-bit without having to duplicate all the
+         * code above.
          *
+         * We also give the brand a chance to indicate that based on the ELF
+         * OSABI of the target binary it should become unbranded and optionally
+         * indicate that it should be treated as existing in a specific prefix.
+         *
+         * Note that if a brand opts to go down this route it does not actually
+         * end up being debranded. In other words, future programs that exec
+         * will still be considered for branding unless this escape hatch is
+         * used. Consider the case of lx brand for example. If a user runs
+         * /native/usr/sbin/dtrace -c /bin/ls, the isaexec and normal executable
+         * of DTrace that's in /native will take this escape hatch and be run
+         * and interpreted using the normal system call table; however, the
+         * execution of a non-illumos binary in the form of /bin/ls will still
+         * be branded and be subject to all of the normal actions of the brand.
+         *
          * The level checks associated with brand handling below are used to
          * prevent a loop since the brand elfexec function typically comes back
          * through this function. We must check <= here since the nested
          * handling in the #! interpreter code will increment the level before
          * calling gexec to run the final elfexec interpreter.
          */
+        if ((level <= INTP_MAXDEPTH) && (*brand_action != EBA_NATIVE) &&
+            (PROC_IS_BRANDED(p)) && (BROP(p)->b_native_exec != NULL)) {
+                if (BROP(p)->b_native_exec(ehdrp->e_ident[EI_OSABI],
+                    &args->brand_nroot) == B_TRUE) {
+                        ASSERT(ehdrp->e_ident[EI_OSABI]);
+                        *brand_action = EBA_NATIVE;
+                        /* Add one for the trailing '/' in the path */
+                        if (args->brand_nroot != NULL)
+                                nsize = strlen(args->brand_nroot) + 1;
+                }
+        }
+
         if ((level <= INTP_MAXDEPTH) &&
-            (brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
+            (*brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
                 error = BROP(p)->b_elfexec(vp, uap, args,
                     idatap, level + 1, execsz, setid, exec_file, cred,
                     brand_action);
                 goto out;
         }
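
The escape hatch above consults a new brand operation, b_native_exec(),
passing the binary's EI_OSABI byte and a place to return an optional
native-root prefix.  The hook below is only a shape sketch inferred from this
call site and from the /native example in the comment; it is not lx's actual
implementation, and the ELFOSABI_SOLARIS test and "/native" string are
illustrative assumptions:

    /*
     * Hypothetical brand hook matching the call site above: returning
     * B_TRUE asks for the binary to run unbranded, optionally rooted at
     * the returned prefix (elfexec() appends the trailing '/').
     */
    static boolean_t
    example_native_exec(uchar_t osabi, char **nrootp)
    {
            if (osabi == ELFOSABI_SOLARIS) {
                    *nrootp = "/native";    /* illustrative prefix */
                    return (B_TRUE);
            }
            return (B_FALSE);
    }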
@@ -421,18 +527,19 @@
                  * entries are:
                  *
                  *      AT_BASE
                  *      AT_FLAGS
                  *      AT_PAGESZ
+                 *      AT_RANDOM
                  *      AT_SUN_AUXFLAGS
                  *      AT_SUN_HWCAP
                  *      AT_SUN_HWCAP2
                  *      AT_SUN_PLATFORM (added in stk_copyout)
                  *      AT_SUN_EXECNAME (added in stk_copyout)
                  *      AT_NULL
                  *
-                 * total == 9
+                 * total == 10
                  */
                 if (hasdy && hasu) {
                         /*
                          * Has PT_INTERP & PT_PHDR - the auxvectors that
                          * will be built are:
@@ -443,23 +550,23 @@
                          *      AT_ENTRY
                          *      AT_LDDATA
                          *
                          * total = 5
                          */
-                        args->auxsize = (9 + 5) * sizeof (aux_entry_t);
+                        args->auxsize = (10 + 5) * sizeof (aux_entry_t);
                 } else if (hasdy) {
                         /*
                          * Has PT_INTERP but no PT_PHDR
                          *
                          *      AT_EXECFD
                          *      AT_LDDATA
                          *
                          * total = 2
                          */
-                        args->auxsize = (9 + 2) * sizeof (aux_entry_t);
+                        args->auxsize = (10 + 2) * sizeof (aux_entry_t);
                 } else {
-                        args->auxsize = 9 * sizeof (aux_entry_t);
+                        args->auxsize = 10 * sizeof (aux_entry_t);
                 }
         } else {
                 args->auxsize = 0;
         }
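
To make the bookkeeping concrete (a worked example, combining the reservations
in this hunk with those added in the next): a branded, dynamically linked
64-bit binary with both PT_INTERP and PT_PHDR, executed with credentials on
amd64 and with no emulator or native-root prefix, reserves 10 + 5 = 15 base
slots plus 5 brand, 4 ID and 1 comm page slots -- 25 aux_entry_t entries in
all.  The comm page slot is the one entry that may go unfilled, which is why
the else branch further down pads it with AT_NULL to keep the later size
check happy.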
 
@@ -468,19 +575,48 @@
          * AT_SUN_EMULATOR aux entry.
          */
         if (args->emulator != NULL)
                 args->auxsize += sizeof (aux_entry_t);
 
-        if ((brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
-                branded = 1;
-                /*
-                 * We will be adding 4 entries to the aux vectors.  One for
-                 * the the brandname and 3 for the brand specific aux vectors.
-                 */
+        /*
+         * If this is a native binary that's been given a modified interpreter
+         * root, inform it that the native system exists at that root.
+         */
+        if (args->brand_nroot != NULL) {
+                args->auxsize += sizeof (aux_entry_t);
+        }
+
+
+        /*
+         * On supported kernels (x86_64) make room in the auxv for the
+         * AT_SUN_COMMPAGE entry.  This will go unpopulated on i86xpv systems
+         * which do not provide such functionality.
+         */
+#if defined(__amd64)
+        args->auxsize += sizeof (aux_entry_t);
+#endif /* defined(__amd64) */
+
+        /*
+         * If we have user credentials, we'll supply the following entries:
+         *      AT_SUN_UID
+         *      AT_SUN_RUID
+         *      AT_SUN_GID
+         *      AT_SUN_RGID
+         */
+        if (cred != NULL) {
                 args->auxsize += 4 * sizeof (aux_entry_t);
         }
 
+        if ((*brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) {
+                branded = 1;
+                /*
+                 * We will be adding 5 entries to the aux vectors.  One for
+                 * the brandname and 4 for the brand specific aux vectors.
+                 */
+                args->auxsize += 5 * sizeof (aux_entry_t);
+        }
+
         /* Hardware/Software capabilities */
         if (capphdr != NULL &&
             (capsize = capphdr->p_filesz) > 0 &&
             capsize <= 16 * sizeof (*cap)) {
                 int ncaps = capsize / sizeof (*cap);
@@ -532,10 +668,18 @@
         if ((error = mapelfexec(vp, ehdrp, nphdrs, phdrbase, &uphdr, &dyphdr,
             &stphdr, &dtrphdr, dataphdrp, &bssbase, &brkbase, &voffset, NULL,
             len, execsz, &brksize)) != 0)
                 goto bad;
 
+        if (uphdr != NULL) {
+                /*
+                 * Our uphdr has been dynamically allocated if (and only if)
+                 * its program header flags are clear.
+                 */
+                dynuphdr = (uphdr->p_flags == 0);
+        }
+
         if (uphdr != NULL && dyphdr == NULL)
                 goto bad;
 
         if (dtrphdr != NULL && dtrace_safe_phdr(dtrphdr, args, voffset) != 0) {
                 uprintf("%s: Bad DTrace phdr in %s\n", exec_file, exec_file);
@@ -546,21 +690,26 @@
                 size_t          len;
                 uintptr_t       lddata;
                 char            *p;
                 struct vnode    *nvp;
 
-                dlnsize = dyphdr->p_filesz;
+                dlnsize = dyphdr->p_filesz + nsize;
 
                 if (dlnsize > MAXPATHLEN || dlnsize <= 0)
                         goto bad;
 
+                if (nsize != 0) {
+                        bcopy(args->brand_nroot, dlnp, nsize - 1);
+                        dlnp[nsize - 1] = '/';
+                }
+
                 /*
                  * Read in "interpreter" pathname.
                  */
-                if ((error = vn_rdwr(UIO_READ, vp, dlnp, dyphdr->p_filesz,
-                    (offset_t)dyphdr->p_offset, UIO_SYSSPACE, 0, (rlim64_t)0,
-                    CRED(), &resid)) != 0) {
+                if ((error = vn_rdwr(UIO_READ, vp, dlnp + nsize,
+                    dyphdr->p_filesz, (offset_t)dyphdr->p_offset, UIO_SYSSPACE,
+                    0, (rlim64_t)0, CRED(), &resid)) != 0) {
                         uprintf("%s: Cannot obtain interpreter pathname\n",
                             exec_file);
                         goto bad;
                 }
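
The prefixing above is easiest to see with concrete values (illustrative only:
the interpreter path is an example, and /native is the prefix the earlier
comment uses for lx).  With a brand_nroot of "/native", nsize is
strlen("/native") + 1 = 8, dlnp begins with "/native/", and the PT_INTERP
string is then read in at dlnp + 8.  A user-space rendition of the same string
assembly:

    #include <sys/param.h>
    #include <stdio.h>
    #include <string.h>
    #include <strings.h>

    int
    main(void)
    {
            char dlnp[MAXPATHLEN];
            const char *nroot = "/native";                  /* example */
            const char *interp = "/usr/lib/amd64/ld.so.1";  /* example */
            size_t nsize = strlen(nroot) + 1;       /* + 1 for the '/' */

            bcopy(nroot, dlnp, nsize - 1);
            dlnp[nsize - 1] = '/';
            (void) strlcpy(dlnp + nsize, interp, sizeof (dlnp) - nsize);
            /* Prints "/native//usr/lib/amd64/ld.so.1"; the "//" is benign. */
            (void) printf("%s\n", dlnp);
            return (0);
    }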
 
@@ -701,13 +850,14 @@
                         goto bad;
                 }
 
                 dtrphdr = NULL;
 
-                error = mapelfexec(nvp, ehdrp, nphdrs, phdrbase, &junk, &junk,
+                error = mapelfexec(nvp, ehdrp, nphdrs, phdrbase, NULL, &junk,
                     &junk, &dtrphdr, NULL, NULL, NULL, &voffset, NULL, len,
                     execsz, NULL);
+
                 if (error || junk != NULL) {
                         VN_RELE(nvp);
                         uprintf("%s: Cannot map %s\n", exec_file, dlnp);
                         goto bad;
                 }
@@ -730,12 +880,13 @@
                 ADDAUX(aux, AT_SUN_LDDATA, voffset + lddata)
         }
 
         if (hasauxv) {
                 int auxf = AF_SUN_HWCAPVERIFY;
+
                 /*
-                 * Note: AT_SUN_PLATFORM and AT_SUN_EXECNAME were filled in via
+                 * Note: AT_SUN_PLATFORM and AT_RANDOM were filled in via
                  * exec_args()
                  */
                 ADDAUX(aux, AT_BASE, voffset)
                 ADDAUX(aux, AT_FLAGS, at_flags)
                 ADDAUX(aux, AT_PAGESZ, PAGESIZE)
@@ -760,11 +911,11 @@
                  * which we trust because we see they are already running
                  * under pfexec (where uid != euid).  This prevents a
                  * malicious user within the zone from crafting a wrapper to
                  * run native suid commands with unsecure libraries interposed.
                  */
-                if ((brand_action == EBA_NATIVE) && (PROC_IS_BRANDED(p) &&
+                if ((*brand_action == EBA_NATIVE) && (PROC_IS_BRANDED(p) &&
                     (setid &= ~EXECSETID_SETID) != 0))
                         auxf &= ~AF_SUN_SETUGID;
 
                 /*
                  * Record the user addr of the auxflags aux vector entry
@@ -773,11 +924,23 @@
                 args->auxp_auxflags =
                     (char *)((char *)args->stackend +
                     ((char *)&aux->a_type -
                     (char *)bigwad->elfargs));
                 ADDAUX(aux, AT_SUN_AUXFLAGS, auxf);
+
                 /*
+                 * Record information about the real and effective user and
+                 * group IDs.
+                 */
+                if (cred != NULL) {
+                        ADDAUX(aux, AT_SUN_UID, crgetuid(cred));
+                        ADDAUX(aux, AT_SUN_RUID, crgetruid(cred));
+                        ADDAUX(aux, AT_SUN_GID, crgetgid(cred));
+                        ADDAUX(aux, AT_SUN_RGID, crgetrgid(cred));
+                }
+
+                /*
                  * Hardware capability flag word (performance hints)
                  * Used for choosing faster library routines.
                  * (Potentially different between 32-bit and 64-bit ABIs)
                  */
 #if defined(_LP64)
@@ -802,13 +965,30 @@
                             ((char *)&aux->a_type -
                             (char *)bigwad->elfargs));
                         ADDAUX(aux, AT_SUN_BRAND_AUX1, 0)
                         ADDAUX(aux, AT_SUN_BRAND_AUX2, 0)
                         ADDAUX(aux, AT_SUN_BRAND_AUX3, 0)
+                        ADDAUX(aux, AT_SUN_BRAND_AUX4, 0)
                 }
 
+                /*
+                 * Add the comm page auxv entry, mapping it in if needed.
+                 */
+#if defined(__amd64)
+                if (args->commpage != NULL ||
+                    (args->commpage = (uintptr_t)comm_page_mapin()) != NULL) {
+                        ADDAUX(aux, AT_SUN_COMMPAGE, args->commpage)
+                } else {
+                        /*
+                         * If the comm page cannot be mapped, pad out the auxv
+                         * to satisfy later size checks.
+                         */
-                ADDAUX(aux, AT_NULL, 0)
+                        ADDAUX(aux, AT_NULL, 0)
+                }
+#endif /* defined(__amd64) */
+
+                ADDAUX(aux, AT_NULL, 0)
                 postfixsize = (char *)aux - (char *)bigwad->elfargs;
 
                 /*
                  * We make assumptions above when we determine how many aux
                  * vector entries we will be adding. However, if we have an
@@ -843,10 +1023,11 @@
                 error = ENOMEM;
                 goto bad;
         }
 
         bzero(up->u_auxv, sizeof (up->u_auxv));
+        up->u_commpagep = args->commpage;
         if (postfixsize) {
                 int num_auxv;
 
                 /*
                  * Copy the aux vector to the user stack.
@@ -909,10 +1090,12 @@
         psignal(p, SIGKILL);
 
         if (error == 0)
                 error = ENOEXEC;
 out:
+        if (dynuphdr)
+                kmem_free(uphdr, sizeof (Phdr));
         if (phdrbase != NULL)
                 kmem_free(phdrbase, phdrsize);
         if (cap != NULL)
                 kmem_free(cap, capsize);
         kmem_free(bigwad, sizeof (struct bigwad));
@@ -1175,10 +1358,33 @@
         (*shstrbasep)[*shstrsizep - 1] = '\0';
 
         return (0);
 }
 
+
+#ifdef _ELF32_COMPAT
+int
+elf32readhdr(vnode_t *vp, cred_t *credp, Ehdr *ehdrp, int *nphdrs,
+    caddr_t *phbasep, ssize_t *phsizep)
+#else
+int
+elfreadhdr(vnode_t *vp, cred_t *credp, Ehdr *ehdrp, int *nphdrs,
+    caddr_t *phbasep, ssize_t *phsizep)
+#endif
+{
+        int error, nshdrs, shstrndx;
+
+        if ((error = getelfhead(vp, credp, ehdrp, &nshdrs, &shstrndx,
+            nphdrs)) != 0 ||
+            (error = getelfphdr(vp, credp, ehdrp, *nphdrs, phbasep,
+            phsizep)) != 0) {
+                return (error);
+        }
+        return (0);
+}
+
+
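
elfreadhdr() (and elf32readhdr() under _ELF32_COMPAT) simply packages the
existing getelfhead()/getelfphdr() pair for callers outside this file.  A
hypothetical caller is sketched below purely to show the allocation
convention: the program header array comes back allocated, and is released
with kmem_free() exactly as elfexec() releases phdrbase.

    /* Hypothetical in-kernel caller; a sketch, not code from this change. */
    static int
    inspect_elf(vnode_t *vp)
    {
            Ehdr ehdr;
            caddr_t phdrbase = NULL;
            ssize_t phdrsize = 0;
            int nphdrs, error;

            if ((error = elfreadhdr(vp, CRED(), &ehdr, &nphdrs, &phdrbase,
                &phdrsize)) != 0)
                    return (error);

            /* ... examine the nphdrs program headers at phdrbase ... */

            kmem_free(phdrbase, phdrsize);
            return (0);
    }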
 static int
 mapelfexec(
         vnode_t *vp,
         Ehdr *ehdr,
         int nphdrs,
@@ -1195,55 +1401,90 @@
         size_t len,
         long *execsz,
         size_t *brksize)
 {
         Phdr *phdr;
-        int i, prot, error;
+        int i, prot, error, lastprot = 0;
         caddr_t addr = NULL;
         size_t zfodsz;
         int ptload = 0;
         int page;
         off_t offset;
         int hsize = ehdr->e_phentsize;
         caddr_t mintmp = (caddr_t)-1;
+        uintptr_t lastaddr = NULL;
         extern int use_brk_lpg;
 
         if (ehdr->e_type == ET_DYN) {
-                /*
-                 * Obtain the virtual address of a hole in the
-                 * address space to map the "interpreter".
-                 */
-                map_addr(&addr, len, (offset_t)0, 1, 0);
-                if (addr == NULL)
-                        return (ENOMEM);
-                *voffset = (intptr_t)addr;
+                caddr_t vaddr;
 
                 /*
-                 * Calculate the minimum vaddr so it can be subtracted out.
-                 * According to the ELF specification, since PT_LOAD sections
-                 * must be sorted by increasing p_vaddr values, this is
-                 * guaranteed to be the first PT_LOAD section.
+                 * Despite the fact that mmapobj(2) refuses to load them, we
+                 * need to support executing ET_DYN objects that have a
+                 * non-NULL p_vaddr.  When found in the wild, these objects
+                 * are likely to be due to an old (and largely obviated) Linux
+                 * facility, prelink(8), that rewrites shared objects to
+                 * prefer specific (disjoint) virtual address ranges.  (Yes,
+                 * this is putatively for performance -- and yes, it has
+                 * limited applicability, many edge conditions and grisly
+                 * failure modes; even for Linux, it's insane.)  As ELF
+                 * mandates that the PT_LOAD segments be in p_vaddr order, we
+                 * find the lowest p_vaddr by finding the first PT_LOAD
+                 * segment.
                  */
                 phdr = (Phdr *)phdrbase;
                 for (i = nphdrs; i > 0; i--) {
                         if (phdr->p_type == PT_LOAD) {
-                                *voffset -= (uintptr_t)phdr->p_vaddr;
+                                addr = (caddr_t)(uintptr_t)phdr->p_vaddr;
                                 break;
                         }
                         phdr = (Phdr *)((caddr_t)phdr + hsize);
                 }
 
+                /*
+                 * We have a non-zero p_vaddr in the first PT_LOAD segment --
+                 * presumably because we're directly executing a prelink(8)'d
+                 * ld-linux.so.  While we could correctly execute such an
+                 * object without locating it at its desired p_vaddr (it is,
+                 * after all, still relocatable), our inner antiquarian
+                 * derives a perverse pleasure in accommodating the steampunk
+                 * prelink(8) contraption -- goggles on!
+                 */
+                if ((vaddr = addr) != NULL) {
+                        if (as_gap(curproc->p_as, len,
+                            &addr, &len, AH_LO, NULL) == -1 || addr != vaddr) {
+                                addr = NULL;
+                        }
+                }
+
+                if (addr == NULL) {
+                        /*
+                         * We either have a NULL p_vaddr (the common case, by
+                         * many orders of magnitude) or we have a non-NULL
+                         * p_vaddr and we were unable to obtain the specified
+                         * VA range (presumably because it's an illegal
+                         * address).  Either way, obtain an address in which
+                         * to map the interpreter.
+                         */
+                        map_addr(&addr, len, (offset_t)0, 1, 0);
+                        if (addr == NULL)
+                                return (ENOMEM);
+                }
+
+                /*
+                 * Our voffset is the difference between where we landed and
+                 * where we wanted to be.
+                 */
+                *voffset = (uintptr_t)addr - (uintptr_t)vaddr;
         } else {
                 *voffset = 0;
         }
+
         phdr = (Phdr *)phdrbase;
         for (i = nphdrs; i > 0; i--) {
                 switch (phdr->p_type) {
                 case PT_LOAD:
-                        if ((*dyphdr != NULL) && (*uphdr == NULL))
-                                return (0);
-
                         ptload = 1;
                         prot = PROT_USER;
                         if (phdr->p_flags & PF_R)
                                 prot |= PROT_READ;
                         if (phdr->p_flags & PF_W)
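
Two concrete cases make the voffset arithmetic above easier to follow
(addresses illustrative).  For an ordinary interpreter whose first PT_LOAD has
a p_vaddr of 0, vaddr stays NULL, map_addr() chooses the hole as before, and
*voffset is simply the chosen base.  For a prelink(8)'d object whose first
PT_LOAD asks for, say, 0x7000000, as_gap() is asked for exactly that range:
if it is available, addr == vaddr, *voffset works out to 0, and every
subsequent phdr->p_vaddr + *voffset lands precisely where prelink placed it;
if it is not, the code falls back to map_addr() and the object is relocated
as usual.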
@@ -1251,17 +1492,80 @@
                         if (phdr->p_flags & PF_X)
                                 prot |= PROT_EXEC;
 
                         addr = (caddr_t)((uintptr_t)phdr->p_vaddr + *voffset);
 
+                        if ((*dyphdr != NULL) && uphdr != NULL &&
+                            (*uphdr == NULL)) {
-                        /*
+                                /*
+                                 * The PT_PHDR program header is, strictly
+                                 * speaking, optional.  If we find that this
+                                 * is missing, we will determine the location
+                                 * of the program headers based on the address
+                                 * of the lowest PT_LOAD segment (namely, this
+                                 * one):  we subtract the p_offset to get to
+                                 * the ELF header and then add back the program
+                                 * header offset to get to the program headers.
+                                 * We then cons up a Phdr that corresponds to
+                                 * the (missing) PT_PHDR, setting the flags
+                                 * to 0 to denote that this is artificial and
+                                 * should (must) be freed by the caller.
+                                 */
+                                Phdr *cons;
+
+                                cons = kmem_zalloc(sizeof (Phdr), KM_SLEEP);
+
+                                cons->p_flags = 0;
+                                cons->p_type = PT_PHDR;
+                                cons->p_vaddr = ((uintptr_t)addr -
+                                    phdr->p_offset) + ehdr->e_phoff;
+
+                                *uphdr = cons;
+                        }
+
+                        /*
                          * Keep track of the segment with the lowest starting
                          * address.
                          */
                         if (addr < mintmp)
                                 mintmp = addr;
 
+                        /*
+                         * Segments need not correspond to page boundaries:
+                         * they are permitted to share a page.  If two PT_LOAD
+                         * segments share the same page, and the permissions
+                         * of the segments differ, the behavior is historically
+                         * that the permissions of the latter segment are used
+                         * for the page that the two segments share.  This is
+                         * also historically a non-issue:  binaries generated
+                         * by most anything will make sure that two PT_LOAD
+                         * segments with differing permissions don't actually
+                         * share any pages.  However, there exist some crazy
+                         * things out there (including at least an obscure
+                         * Portuguese teaching language called G-Portugol) that
+                         * actually do the wrong thing and expect it to work:
+                         * they have a segment with execute permission share
+                         * a page with a subsequent segment that does not
+                         * have execute permissions and expect the resulting
+                         * shared page to in fact be executable.  To accommodate
+                         * such broken link editors, we take advantage of a
+                         * latitude explicitly granted to the loader:  it is
+                         * permitted to make _any_ PT_LOAD segment executable
+                         * (provided that it is readable or writable).  If we
+                         * see that we're sharing a page and that the previous
+                         * page was executable, we will add execute permissions
+                         * to our segment.
+                         */
+                        if (btop(lastaddr) == btop((uintptr_t)addr) &&
+                            (phdr->p_flags & (PF_R | PF_W)) &&
+                            (lastprot & PROT_EXEC)) {
+                                prot |= PROT_EXEC;
+                        }
+
+                        lastaddr = (uintptr_t)addr + phdr->p_filesz;
+                        lastprot = prot;
+
                         zfodsz = (size_t)phdr->p_memsz - phdr->p_filesz;
 
                         offset = phdr->p_offset;
                         if (((uintptr_t)offset & PAGEOFFSET) ==
                             ((uintptr_t)addr & PAGEOFFSET) &&
@@ -1322,23 +1626,40 @@
 
                         *execsz += btopr(phdr->p_memsz);
                         break;
 
                 case PT_INTERP:
-                        if (ptload)
-                                goto bad;
+                        /*
+                         * The ELF specification is unequivocal about the
+                         * PT_INTERP program header with respect to any PT_LOAD
+                         * program header:  "If it is present, it must precede
+                         * any loadable segment entry." Linux, however, makes
+                         * no attempt to enforce this -- which has allowed some
+                         * binary editing tools to get away with generating
+                         * invalid ELF binaries in the respect that PT_INTERP
+                         * occurs after the first PT_LOAD program header.  This
+                         * is unfortunate (and of course, disappointing) but
+                         * it's no worse than that: there is no reason that we
+                         * can't process the PT_INTERP entry (if present) after
+                         * one or more PT_LOAD entries.  We therefore
+                         * deliberately do not check ptload here and always
+                         * store dyphdr to be the PT_INTERP program header.
+                         */
                         *dyphdr = phdr;
                         break;
 
                 case PT_SHLIB:
                         *stphdr = phdr;
                         break;
 
                 case PT_PHDR:
-                        if (ptload)
+                        if (ptload || phdr->p_flags == 0)
                                 goto bad;
+
+                        if (uphdr != NULL)
-                        *uphdr = phdr;
+                                *uphdr = phdr;
+
                         break;
 
                 case PT_NULL:
                 case PT_DYNAMIC:
                 case PT_NOTE:
@@ -2183,11 +2504,11 @@
 
 #ifdef  _LP64
 extern int elf32exec(vnode_t *vp, execa_t *uap, uarg_t *args,
                         intpdata_t *idatap, int level, long *execsz,
                         int setid, caddr_t exec_file, cred_t *cred,
-                        int brand_action);
+                        int *brand_action);
 extern int elf32core(vnode_t *vp, proc_t *p, cred_t *credp,
                         rlim64_t rlimit, int sig, core_content_t content);
 
 static struct execsw esw32 = {
         elf32magicstr,