OS-5510 remove lwp_brand_syscall_fast handler
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Lint-clean syscall_asm_amd64.s
OS-4961 lxbrand want fasttrap-like brand hook
Reviewed by: Joshua M. Clulow <jmc@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-3937 lxbrand incorrect stack alignment for lx_syscall_enter()
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-3561 lxbrand emulation library should execute on alternate stack
OS-3558 lxbrand add support for full in-kernel syscall handling
OS-3545 lx_syscall_regs should not walk stack
OS-3868 many LTP testcases now hang
OS-3901 lxbrand lx_recvmsg fails to translate control messages when 64-bit
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Bryan Cantrill <bryan@joyent.com>
OS-3215 32bit syscalls with more than 6 parameters re-use arg0/arg1 as arg6/arg7
OS-3223 Passing arg6 and arg7 can't clobber the stack for ap-style calls
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Keith M Wesolowski <wesolows@foobazco.org>
back out OS-3215: causes OS-3223
OS-3215 32bit syscalls with more than 6 parameters re-use arg0/arg1 as arg6/arg7
OS-2834 ship lx brand
        
@@ -18,28 +18,28 @@
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2015 Joyent, Inc.
+ * Copyright 2016 Joyent, Inc.
  */
 
 #include <sys/asm_linkage.h>
 #include <sys/asm_misc.h>
 #include <sys/regset.h>
 #include <sys/privregs.h>
 #include <sys/psw.h>
-#include <sys/machbrand.h>
 
 #if defined(__lint)
 
 #include <sys/types.h>
 #include <sys/thread.h>
 #include <sys/systm.h>
 
 #else   /* __lint */
 
+#include <sys/machbrand.h>
 #include <sys/segments.h>
 #include <sys/pcb.h>
 #include <sys/trap.h>
 #include <sys/ftrace.h>
 #include <sys/traptrace.h>
@@ -501,10 +501,11 @@
 
         movq    %rsp, %rbp
         
         movq    T_LWP(%r15), %r14
         ASSERT_NO_RUPDATE_PENDING(%r14)
+
         ENABLE_INTR_FLAGS
 
         MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
         movl    REGOFF_RAX(%rsp), %eax  /* (%rax damaged by mstate call) */
 
@@ -514,10 +515,32 @@
         incq    LWP_RU_SYSC(%r14)
         movb    $NORMALRETURN, LWP_EOSYS(%r14)
 
         incq    %gs:CPU_STATS_SYS_SYSCALL
 
+        /*
+         * If our LWP has an alternate system call handler, run that instead of
+         * the regular system call path.
+         */
+        movq    LWP_BRAND_SYSCALL(%r14), %rdi
+        testq   %rdi, %rdi
+        jz      _syscall_no_brand
+
+        pushq   %rax
+        subq    $8, %rsp        /* align stack for call to C */
+        call    *%rdi
+        addq    $8, %rsp
+
+        /*
+         * If the alternate handler returns 0, we skip straight to the return to
+         * usermode.  Otherwise, we resume regular system call processing.
+         */
+        testl   %eax, %eax
+        popq    %rax
+        jz      _syscall_after_brand
+
+_syscall_no_brand:
         movw    %ax, T_SYSNUM(%r15)
         movzbl  T_PRE_SYS(%r15), %ebx
         ORL_SYSCALLTRACE(%ebx)
         testl   %ebx, %ebx
         jne     _syscall_pre
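
The hunk above wires a per-lwp hook into the 64-bit syscall path: if LWP_BRAND_SYSCALL is non-NULL it is called first, and a zero return short-circuits normal dispatch; the pushq %rax / subq $8, %rsp pair around the indirect call presumably keeps %rsp 16-byte aligned at the call, as the SysV AMD64 ABI expects (compare OS-3937 in the commit log). The following standalone C sketch only illustrates that control flow; it is not kernel code and every name in it is hypothetical.

/*
 * Illustrative sketch only -- not kernel code.  Models the short-circuit
 * behaviour of the LWP_BRAND_SYSCALL check in the hunk above.
 */
#include <stdio.h>

typedef int (*brand_syscall_t)(void);

static brand_syscall_t lwp_brand_syscall;	/* NULL: no alternate handler */

static int
branded_handler(void)
{
	printf("brand handler emulated the system call\n");
	return (0);			/* 0: skip the regular syscall path */
}

static void
dispatch(void)
{
	if (lwp_brand_syscall != NULL && lwp_brand_syscall() == 0) {
		/* analogous to jumping to _syscall_after_brand */
		return;
	}
	/* analogous to falling through at _syscall_no_brand */
	printf("regular system call dispatch\n");
}

int
main(void)
{
	dispatch();				/* no handler installed */
	lwp_brand_syscall = branded_handler;
	dispatch();				/* handler short-circuits */
	return (0);
}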
@@ -548,10 +571,12 @@
         je      5f
         movq    %r12, %r13
         shrq    $32, %r13       /* upper 32-bits into %edx */
         movl    %r12d, %r12d    /* lower 32-bits into %eax */
 5:
+
+_syscall_after_brand:
         /*
          * Optimistically assume that there's no post-syscall
          * work to do.  (This is to avoid having to call syscall_mstate()
          * with interrupts disabled)
          */
@@ -793,15 +818,41 @@
         ASSERT_LWPTOREGS(%r14, %rsp)
 
         incq     %gs:CPU_STATS_SYS_SYSCALL
 
         /*
+         * If our lwp has an alternate system call handler, run that instead
+         * of the regular system call path.
+         */
+        movq    LWP_BRAND_SYSCALL(%r14), %rax
+        testq   %rax, %rax
+        jz      _syscall32_no_brand
+
+        movb    $LWP_SYS, LWP_STATE(%r14)
+        call    *%rax
+
+        /*
+         * If the alternate handler returns 0, we skip straight to the return
+         * to usermode.  Otherwise, we resume regular system call processing.
+         */
+        testl   %eax, %eax
+        jz      _syscall32_after_brand
+
+_syscall32_no_brand:
+        /*
          * Make some space for MAXSYSARGS (currently 8) 32-bit args placed
          * into 64-bit (long) arg slots, maintaining 16 byte alignment.  Or
          * more succinctly:
          *
          *      SA(MAXSYSARGS * sizeof (long)) == 64
+         *
+         * Note, this space is used both to copy in the arguments from
+         * userland and as part of the old UNIX-style syscall_ap() method.
+         * syscall_entry expects that we do not change the values in this
+         * space once we hand it over.  However, this means that when we
+         * instead pass the arguments per the newer calling-convention
+         * model, we need to save an additional 16 bytes of stack.
          */
 #define SYS_DROP        64                      /* drop for args */
         subq    $SYS_DROP, %rsp
         movb    $LWP_SYS, LWP_STATE(%r14)
         movq    %r15, %rdi
@@ -825,16 +876,20 @@
          * Lots of ideas here, but they won't really help with bringup B-)
          * Correctness can't wait, performance can wait a little longer ..
          */
 
         movq    %rax, %rbx
-        movl    0(%rsp), %edi
-        movl    8(%rsp), %esi
-        movl    0x10(%rsp), %edx
-        movl    0x18(%rsp), %ecx
-        movl    0x20(%rsp), %r8d
-        movl    0x28(%rsp), %r9d
+        movl    0x0(%rsp), %edi         /* arg0 */
+        movl    0x8(%rsp), %esi         /* arg1 */
+        movl    0x10(%rsp), %edx        /* arg2 */
+        movl    0x38(%rsp), %eax        /* arg7 load */
+        movl    0x18(%rsp), %ecx        /* arg3 */
+        pushq   %rax                    /* arg7 saved to stack */
+        movl    0x28(%rsp), %r8d        /* arg4 */
+        movl    0x38(%rsp), %eax        /* arg6 load */
+        movl    0x30(%rsp), %r9d        /* arg5 */
+        pushq   %rax                    /* arg6 saved to stack */
 
         call    *SY_CALLC(%rbx)
 
         movq    %rbp, %rsp      /* pop the args */
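
The arg6/arg7 handling above follows from the SysV AMD64 calling convention: only the first six integer arguments travel in registers (%rdi, %rsi, %rdx, %rcx, %r8, %r9), so the seventh and eighth must go on the stack, and they are pushed separately because syscall_entry expects the copied-in argument area to stay intact (the OS-3215/OS-3223 problem in the commit log). The tiny C program below only illustrates the register/stack split; its names are made up.

/*
 * Illustrative only: with the SysV AMD64 ABI, a0..a5 arrive in registers
 * while a6 and a7 are read from the caller's stack -- which is why the
 * assembly above pushes arg6 and arg7 before the indirect call.
 */
#include <stdio.h>

static long
handler8(long a0, long a1, long a2, long a3,
    long a4, long a5, long a6, long a7)
{
	return (a0 + a1 + a2 + a3 + a4 + a5 + a6 + a7);
}

int
main(void)
{
	printf("%ld\n", handler8(0, 1, 2, 3, 4, 5, 6, 7));
	return (0);
}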
 
@@ -848,10 +903,12 @@
          */
         movq    %rax, %r13
         shrq    $32, %r13       /* upper 32-bits into %edx */
         movl    %eax, %r12d     /* lower 32-bits into %eax */
 
+_syscall32_after_brand:
+
         /*
          * Optimistically assume that there's no post-syscall
          * work to do.  (This is to avoid having to call syscall_mstate()
          * with interrupts disabled)
          */
@@ -1077,19 +1134,24 @@
         call    syscall_entry
 
         /*
          * Fetch the arguments copied onto the kernel stack and put
          * them in the right registers to invoke a C-style syscall handler.
-         * %rax contains the handler address.
+         * %rax contains the handler address.  The last two arguments are
+         * pushed onto the stack, since we must not clobber the copied-in
+         * arguments.
          */
         movq    %rax, %rbx
-        movl    0(%rsp), %edi
-        movl    8(%rsp), %esi
-        movl    0x10(%rsp), %edx
-        movl    0x18(%rsp), %ecx
-        movl    0x20(%rsp), %r8d
-        movl    0x28(%rsp), %r9d
+        movl    0x0(%rsp), %edi         /* arg0 */
+        movl    0x8(%rsp), %esi         /* arg1 */
+        movl    0x10(%rsp), %edx        /* arg2 */
+        movl    0x38(%rsp), %eax        /* arg7 load */
+        movl    0x18(%rsp), %ecx        /* arg3 */
+        pushq   %rax                    /* arg7 saved to stack */
+        movl    0x28(%rsp), %r8d        /* arg4 */
+        movl    0x38(%rsp), %eax        /* arg6 load */
+        movl    0x30(%rsp), %r9d        /* arg5 */
+        pushq   %rax                    /* arg6 saved to stack */
 
         call    *SY_CALLC(%rbx)
 
         movq    %rbp, %rsp      /* pop the args */
 
@@ -1158,11 +1220,77 @@
         SET_SIZE(_sys_sysenter_post_swapgs)
         SET_SIZE(brand_sys_sysenter)
 
 #endif  /* __lint */
 
+#if defined(__lint)
 /*
+ * System call via an int80.  This entry point is only used by the Linux
+ * application environment.  Unlike the other entry points, there is no
+ * default action to take if no callback is registered for this process.
+ */
+void
+sys_int80()
+{}
+
+#else   /* __lint */
+
+        ENTRY_NP(brand_sys_int80)
+        SWAPGS                          /* kernel gsbase */
+        XPV_TRAP_POP
+
+        /*
+         * We first attempt to call the "b_int80" handler from the "struct
+         * brand_mach_ops" for this brand.  If no handler function is installed
+         * for this brand, the BRAND_CALLBACK() macro returns here and we
+         * check the lwp for a "lwp_brand_syscall" handler.
+         */
+        BRAND_CALLBACK(BRAND_CB_INT80, BRAND_URET_FROM_INTR_STACK())
+
+        /*
+         * Check to see if this lwp provides "lwp_brand_syscall".  If so, we
+         * will route this int80 through the regular system call handling path.
+         */
+        movq    %r15, %gs:CPU_RTMP_R15
+        movq    %gs:CPU_THREAD, %r15
+        movq    T_LWP(%r15), %r15
+        movq    LWP_BRAND_SYSCALL(%r15), %r15
+        testq   %r15, %r15
+        movq    %gs:CPU_RTMP_R15, %r15
+        jnz     nopop_syscall_int
+
+        /*
+         * The brand provided neither a "b_int80", nor a "lwp_brand_syscall"
+         * function, and has thus opted out of handling this trap.
+         */
+        SWAPGS                          /* user gsbase */
+        jmp     nopop_int80
+
+        ENTRY_NP(sys_int80)
+        /*
+         * We hit an int80, but this process isn't of a brand with an int80
+         * handler.  Bad process!  Make it look as if the INT failed.
+         * Modify %rip to point before the INT, push the expected error
+         * code and fake a GP fault. Note on 64-bit hypervisor we need
+         * to undo the XPV_TRAP_POP and push rcx and r11 back on the stack
+         * because gptrap will pop them again with its own XPV_TRAP_POP.
+         */
+        XPV_TRAP_POP
+nopop_int80:
+        subq    $2, (%rsp)      /* int insn 2-bytes */
+        pushq   $_CONST(_MUL(T_INT80, GATE_DESC_SIZE) + 2)
+#if defined(__xpv)
+        push    %r11
+        push    %rcx
+#endif
+        jmp     gptrap                  / GP fault
+        SET_SIZE(sys_int80)
+        SET_SIZE(brand_sys_int80)
+#endif  /* __lint */
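+
+/*
+ * As a rough outline, the two int80 entry points above make a three-way
+ * decision: a brand-level b_int80 callback wins if installed (via
+ * BRAND_CALLBACK), otherwise a per-lwp lwp_brand_syscall handler routes
+ * the trap through the common syscall path, and if neither exists the
+ * trap is turned into a fake #GP so the process sees the INT as having
+ * failed.  The sketch below is a user-space illustration of that decision
+ * only; the structure and function names are hypothetical, not the
+ * kernel's.
+ *
+ *	#include <stdio.h>
+ *	#include <stddef.h>
+ *
+ *	typedef struct fake_brand_ops {
+ *		void	(*b_int80)(void);	// brand_mach_ops callback
+ *	} fake_brand_ops_t;
+ *
+ *	typedef struct fake_lwp {
+ *		int	(*lwp_brand_syscall)(void);  // per-lwp handler
+ *	} fake_lwp_t;
+ *
+ *	static void
+ *	route_int80(const fake_brand_ops_t *bops, const fake_lwp_t *lwp)
+ *	{
+ *		if (bops != NULL && bops->b_int80 != NULL) {
+ *			bops->b_int80();  // BRAND_CALLBACK(BRAND_CB_INT80)
+ *			return;
+ *		}
+ *		if (lwp->lwp_brand_syscall != NULL) {
+ *			// like jumping to nopop_syscall_int
+ *			printf("routing int80 through the syscall path\n");
+ *			return;
+ *		}
+ *		// neither hook installed: like nopop_int80 faking a #GP
+ *		printf("faking a general protection fault\n");
+ *	}
+ *
+ *	int
+ *	main(void)
+ *	{
+ *		fake_lwp_t lwp = { NULL };
+ *		fake_brand_ops_t bops = { NULL };
+ *
+ *		route_int80(&bops, &lwp);	// unbranded: #GP
+ *		return (0);
+ *	}
+ */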
+
+
+/*
  * This is the destination of the "int $T_SYSCALLINT" interrupt gate, used by
  * the generic i386 libc to do system calls. We do a small amount of setup
  * before jumping into the existing sys_syscall32 path.
  */
 #if defined(__lint)