OS-2834 ship lx brand
    
      
    
          --- old/usr/src/uts/i86pc/ml/syscall_asm.s
          +++ new/usr/src/uts/i86pc/ml/syscall_asm.s
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   */
  24   24  
  25   25  /*      Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
  26   26  /*      Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T   */
  27   27  /*        All Rights Reserved                                   */
  28   28  
  29   29  /*      Copyright (c) 1987, 1988 Microsoft Corporation          */
  30   30  /*        All Rights Reserved                                   */
  31   31  
  32   32  #include <sys/asm_linkage.h>
  33   33  #include <sys/asm_misc.h>
  34   34  #include <sys/regset.h>
  35   35  #include <sys/psw.h>
  36   36  #include <sys/x86_archext.h>
  37   37  #include <sys/machbrand.h>
  38   38  #include <sys/privregs.h>
  39   39  
  40   40  #if defined(__lint)
  41   41  
  42   42  #include <sys/types.h>
  43   43  #include <sys/thread.h>
  44   44  #include <sys/systm.h>
  45   45  
  46   46  #else   /* __lint */
  47   47  
  48   48  #include <sys/segments.h>
  49   49  #include <sys/pcb.h>
  50   50  #include <sys/trap.h>
  51   51  #include <sys/ftrace.h>
  52   52  #include <sys/traptrace.h>
  53   53  #include <sys/clock.h>
  54   54  #include <sys/panic.h>
  55   55  #include "assym.h"
  56   56  
  57   57  #endif  /* __lint */
  58   58  
  59   59  /*
  60   60   * We implement two flavours of system call entry points
  61   61   *
  62   62   * -    {int,lcall}/iret        (i386)
  63   63   * -    sysenter/sysexit        (Pentium II and beyond)
  64   64   *
  65   65   * The basic pattern used in the handlers is to check to see if we can
  66   66   * do fast (simple) version of the system call; if we can't we use various
  67   67   * C routines that handle corner cases and debugging.
  68   68   *
  69   69   * To reduce the amount of assembler replication, yet keep the system call
  70   70   * implementations vaguely comprehensible, the common code in the body
  71   71   * of the handlers is broken up into a set of preprocessor definitions
  72   72   * below.
  73   73   */
  74   74  
  75   75  /*
  76   76   * When we have SYSCALLTRACE defined, we sneak an extra
  77   77   * predicate into a couple of tests.
  78   78   */
  79   79  #if defined(SYSCALLTRACE)
  80   80  #define ORL_SYSCALLTRACE(r32)   \
  81   81          orl     syscalltrace, r32
  82   82  #else
  83   83  #define ORL_SYSCALLTRACE(r32)
  84   84  #endif
  85   85  
  86   86  /*
  87   87   * This check is false whenever we want to go fast i.e.
  88   88   *
  89   89   *      if (code >= NSYSCALL ||
  90   90   *          t->t_pre_sys || (t->t_proc_flag & TP_WATCHPT) != 0)
  91   91   *              do full version
  92   92   * #ifdef SYSCALLTRACE
  93   93   *      if (syscalltrace)
  94   94   *              do full version
  95   95   * #endif
  96   96   *
  97   97   * Preconditions:
  98   98   * -    t       curthread
  99   99   * -    code    contains the syscall number
 100  100   * Postconditions:
 101  101   * -    %ecx and %edi are smashed
 102  102   * -    condition code flag ZF is cleared if pre-sys is too complex
 103  103   */
 104  104  #define CHECK_PRESYS_NE(t, code)                \
 105  105          movzbl  T_PRE_SYS(t), %edi;             \
 106  106          movzwl  T_PROC_FLAG(t), %ecx;           \
 107  107          andl    $TP_WATCHPT, %ecx;              \
 108  108          orl     %ecx, %edi;                     \
 109  109          cmpl    $NSYSCALL, code;                \
 110  110          setae   %cl;                            \
 111  111          movzbl  %cl, %ecx;                      \
 112  112          orl     %ecx, %edi;                     \
 113  113          ORL_SYSCALLTRACE(%edi)
 114  114  
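
Review note: CHECK_PRESYS_NE is the assembler form of the check sketched in the block
comment above.  A minimal C rendering (field names taken from the offsets the macro
uses, so treat the exact spellings as assumptions) would be:

        /* sketch only: nonzero means the full (slow) syscall path is needed */
        static int
        check_presys(kthread_t *t, uint_t code)
        {
                return (code >= NSYSCALL || t->t_pre_sys != 0 ||
                    (t->t_proc_flag & TP_WATCHPT) != 0);
        }
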
 115  115  /*
 116  116   * Check if a brand_mach_ops callback is defined for the specified callback_id
 117  117   * type.  If so invoke it with the user's %gs value loaded and the following
 118  118   * data on the stack:
 119  119   *         --------------------------------------
 120  120   *         | user's %ss                         |
 121  121   *    |    | user's %esp                        |
 122  122   *    |    | EFLAGS register                    |
 123  123   *    |    | user's %cs                         |
 124  124   *    |    | user's %eip (user return address)  |
 125  125   *    |    | 'scratch space'                    |
 126  126   *    |    | user's %ebx                        |
 127  127   *    |    | user's %gs selector                |
 128  128   *    v    | lwp pointer                        |
 129  129   *         | callback wrapper return addr       |
 130  130   *         --------------------------------------
 131  131   *
 132  132   * If the brand code returns, we assume that we are meant to execute the
 133  133   * normal system call path.
 134  134   *
 135  135   * The interface to the brand callbacks on the 32-bit kernel assumes %ebx
 136  136   * is available as a scratch register within the callback.  If the callback
 137  137   * returns within the kernel then this macro will restore %ebx.  If the
 138  138   * callback is going to return directly to userland then it should restore
 139  139   * %ebx before returning to userland.
 140  140   */
 141  141  #define BRAND_CALLBACK(callback_id)                                         \
 142  142          subl    $4, %esp                /* save some scratch space      */ ;\
 143  143          pushl   %ebx                    /* save %ebx to use for scratch */ ;\
 144  144          pushl   %gs                     /* save the user %gs            */ ;\
 145  145          movl    $KGS_SEL, %ebx                                             ;\
 146  146          movw    %bx, %gs                /* switch to the kernel's %gs   */ ;\
 147  147          movl    %gs:CPU_THREAD, %ebx    /* load the thread pointer      */ ;\
 148  148          movl    T_LWP(%ebx), %ebx       /* load the lwp pointer         */ ;\
 149  149          pushl   %ebx                    /* push the lwp pointer         */ ;\
 150  150          movl    LWP_PROCP(%ebx), %ebx   /* load the proc pointer        */ ;\
 151  151          movl    P_BRAND(%ebx), %ebx     /* load the brand pointer       */ ;\
 152  152          movl    B_MACHOPS(%ebx), %ebx   /* load the machops pointer     */ ;\
 153  153          movl    _CONST(_MUL(callback_id, CPTRSIZE))(%ebx), %ebx            ;\
 154  154          cmpl    $0, %ebx                                                   ;\
 155  155          je      1f                                                         ;\
 156  156          movl    %ebx, 12(%esp)          /* save callback to scratch     */ ;\
 157  157          movl    4(%esp), %ebx           /* grab the user %gs            */ ;\
 158  158          movw    %bx, %gs                /* restore the user %gs         */ ;\
 159  159          call    *12(%esp)               /* call callback in scratch     */ ;\
 160  160  1:      movl    4(%esp), %ebx           /* restore user %gs (re-do if   */ ;\
 161  161          movw    %bx, %gs                /* branch due to no callback)   */ ;\
 162  162          movl    8(%esp), %ebx           /* restore user's %ebx          */ ;\
 163  163          addl    $16, %esp               /* restore stack ptr            */ 
 164  164  
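
Review note: stripped of the %gs switching and stack bookkeeping, the lookup that
BRAND_CALLBACK performs is roughly the following C.  The field names mirror the
T_LWP/LWP_PROCP/P_BRAND/B_MACHOPS offsets used above and are assumptions made for
the purpose of the sketch:

        void (**cb)(void);

        /* curthread -> lwp -> proc -> brand -> mach-ops table, indexed by callback_id */
        cb = (void (**)(void))curthread->t_lwp->lwp_procp->p_brand->b_machops;
        if (cb[callback_id] != NULL)
                cb[callback_id]();      /* runs with the user's %gs; may return straight to userland */
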
 165  165  #define MSTATE_TRANSITION(from, to)             \
 166  166          pushl   $to;                            \
 167  167          pushl   $from;                          \
 168  168          call    syscall_mstate;                 \
 169  169          addl    $0x8, %esp
 170  170  
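
Review note: MSTATE_TRANSITION is just the cdecl expansion of a call to
syscall_mstate(); 'from' is pushed last so it becomes the first argument, i.e. the
C equivalent is simply:

        syscall_mstate(from, to);       /* e.g. syscall_mstate(LMS_USER, LMS_SYSTEM) */
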
 171  171  /*
 172  172   * aka CPU_STATS_ADDQ(CPU, sys.syscall, 1)
 173  173   * This must be called with interrupts or preemption disabled.
 174  174   */
 175  175  #define CPU_STATS_SYS_SYSCALL_INC                       \
 176  176          addl    $1, %gs:CPU_STATS_SYS_SYSCALL;          \
 177  177          adcl    $0, %gs:CPU_STATS_SYS_SYSCALL+4;
 178  178  
 179  179  #if !defined(__lint)
 180  180  
 181  181  /*
 182  182   * ASSERT(lwptoregs(lwp) == rp);
 183  183   *
 184  184   * this may seem obvious, but very odd things happen if this
 185  185   * assertion is false
 186  186   *
 187  187   * Preconditions:
 188  188   *      -none-
 189  189   * Postconditions (if assertion is true):
 190  190   *      %esi and %edi are smashed
 191  191   */
 192  192  #if defined(DEBUG)
 193  193  
 194  194  __lwptoregs_msg:
 195  195          .string "syscall_asm.s:%d lwptoregs(%p) [%p] != rp [%p]"
 196  196  
 197  197  #define ASSERT_LWPTOREGS(t, rp)                         \
 198  198          movl    T_LWP(t), %esi;                         \
 199  199          movl    LWP_REGS(%esi), %edi;                   \
 200  200          cmpl    rp, %edi;                               \
 201  201          je      7f;                                     \
 202  202          pushl   rp;                                     \
 203  203          pushl   %edi;                                   \
 204  204          pushl   %esi;                                   \
 205  205          pushl   $__LINE__;                              \
 206  206          pushl   $__lwptoregs_msg;                       \
 207  207          call    panic;                                  \
 208  208  7:
 209  209  #else
 210  210  #define ASSERT_LWPTOREGS(t, rp)
 211  211  #endif
 212  212  
 213  213  #endif  /* __lint */
 214  214  
 215  215  /*
 216  216   * This is an assembler version of this fragment:
 217  217   *
 218  218   * lwp->lwp_state = LWP_SYS;
 219  219   * lwp->lwp_ru.sysc++;
 220  220   * lwp->lwp_eosys = NORMALRETURN;
 221  221   * lwp->lwp_ap = argp;
 222  222   *
 223  223   * Preconditions:
 224  224   *      -none-
 225  225   * Postconditions:
 226  226   *      -none-
 227  227   */
 228  228  #define SET_LWP(lwp, argp)                              \
 229  229          movb    $LWP_SYS, LWP_STATE(lwp);               \
 230  230          addl    $1, LWP_RU_SYSC(lwp);                   \
 231  231          adcl    $0, LWP_RU_SYSC+4(lwp);                 \
 232  232          movb    $NORMALRETURN, LWP_EOSYS(lwp);          \
 233  233          movl    argp, LWP_AP(lwp)
 234  234  
 235  235  /*
 236  236   * Set up the thread, lwp, find the handler, and copy
 237  237   * in the arguments from userland to the kernel stack.
 238  238   *
 239  239   * Preconditions:
 240  240   * -    %eax contains the syscall number
 241  241   * Postconditions:
 242  242   * -    %eax contains a pointer to the sysent structure
 243  243   * -    %ecx is zeroed
 244  244   * -    %esi, %edi are smashed
 245  245   * -    %esp is SYS_DROPped ready for the syscall
 246  246   */
 247  247  #define SIMPLE_SYSCALL_PRESYS(t, faultlabel)            \
 248  248          movl    T_LWP(t), %esi;                         \
 249  249          movw    %ax, T_SYSNUM(t);                       \
 250  250          subl    $SYS_DROP, %esp;                        \
 251  251          shll    $SYSENT_SIZE_SHIFT, %eax;                       \
 252  252          SET_LWP(%esi, %esp);                            \
 253  253          leal    sysent(%eax), %eax;                     \
 254  254          movzbl  SY_NARG(%eax), %ecx;                    \
 255  255          testl   %ecx, %ecx;                             \
 256  256          jz      4f;                                     \
 257  257          movl    %esp, %edi;                             \
 258  258          movl    SYS_DROP + REGOFF_UESP(%esp), %esi;     \
 259  259          movl    $faultlabel, T_LOFAULT(t);              \
 260  260          addl    $4, %esi;                               \
 261  261          rep;                                            \
 262  262            smovl;                                        \
 263  263          movl    %ecx, T_LOFAULT(t);                     \
 264  264  4:
 265  265  
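
Review note: a rough C rendering of the fast-path prologue above.  The rep/smovl
argument copy is shown as a plain bcopy from the user stack (it is protected by the
t_lofault handoff rather than a copyin call), and the field names are inferred from
the offsets, so treat the details as assumptions:

        struct sysent *callp;

        t->t_sysnum = (short)code;
        lwp->lwp_state = LWP_SYS;               /* SET_LWP() body, as documented above */
        lwp->lwp_ru.sysc++;
        lwp->lwp_eosys = NORMALRETURN;
        lwp->lwp_ap = argp;
        callp = &sysent[code];
        if (callp->sy_narg != 0) {
                t->t_lofault = (uintptr_t)faultlabel;   /* catch faults during the copy */
                /* arguments live just above the user's return address */
                bcopy((caddr_t)rp->r_uesp + 4, argp, callp->sy_narg * sizeof (int));
                t->t_lofault = 0;
        }
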
 266  266  /*
 267  267   * Check to see if a simple return is possible i.e.
 268  268   *
 269  269   *      if ((t->t_post_sys_ast | syscalltrace) != 0)
 270  270   *              do full version;
 271  271   *
 272  272   * Preconditions:
 273  273   * -    t is curthread
 274  274   * Postconditions:
 275  275   * -    condition code NE is set if post-sys is too complex
 276  276   * -    rtmp is zeroed if it isn't (we rely on this!)
 277  277   */
 278  278  #define CHECK_POSTSYS_NE(t, rtmp)                       \
 279  279          xorl    rtmp, rtmp;                             \
 280  280          ORL_SYSCALLTRACE(rtmp);                         \
 281  281          orl     T_POST_SYS_AST(t), rtmp;                \
 282  282          cmpl    $0, rtmp
 283  283  
 284  284  /*
 285  285   * Fix up the lwp, thread, and eflags for a successful return
 286  286   *
 287  287   * Preconditions:
 288  288   * -    zwreg contains zero
 289  289   * Postconditions:
 290  290   * -    %esp has been unSYS_DROPped
 291  291   * -    %esi is smashed (points to lwp)
 292  292   */
 293  293  #define SIMPLE_SYSCALL_POSTSYS(t, zwreg)                \
 294  294          movl    T_LWP(t), %esi;                         \
 295  295          addl    $SYS_DROP, %esp;                        \
 296  296          movw    zwreg, T_SYSNUM(t);                     \
 297  297          movb    $LWP_USER, LWP_STATE(%esi);             \
 298  298          andb    $_CONST(0xffff - PS_C), REGOFF_EFL(%esp)
 299  299  
 300  300  /*
 301  301   * System call handler.  This is the destination of both the call
 302  302   * gate (lcall 0x27) _and_ the interrupt gate (int 0x91). For our purposes,
 303  303   * there are two significant differences between an interrupt gate and a call
 304  304   * gate:
 305  305   *
 306  306   * 1) An interrupt gate runs the handler with interrupts disabled, whereas a
 307  307   * call gate runs the handler with whatever EFLAGS settings were in effect at
 308  308   * the time of the call.
 309  309   *
 310  310   * 2) An interrupt gate pushes the contents of the EFLAGS register at the time
 311  311   * of the interrupt onto the stack, whereas a call gate does not.
 312  312   *
 313  313   * Because we use the following code sequence to handle system calls made from
 314  314   * _both_ a call gate _and_ an interrupt gate, these two differences must be
 315  315   * respected. In regards to number 1) above, the handler must ensure that a sane
 316  316   * EFLAGS snapshot is stored on the stack so that when the kernel returns back
 317  317   * to the user via iret (which returns to user with the EFLAGS value saved on
 318  318   * the stack), interrupts are re-enabled.
 319  319   *
 320  320   * In regards to number 2) above, the handler must always put a current snapshot
 321  321   * of EFLAGS onto the stack in the appropriate place. If we came in via an
 322  322   * interrupt gate, we will be clobbering the EFLAGS value that was pushed by
 323  323   * the interrupt gate. This is OK, as the only bit that was changed by the
 324  324   * hardware was the IE (interrupt enable) bit, which for an interrupt gate is
 325  325   * now off. If we were to do nothing, the stack would contain an EFLAGS with
 326  326   * IE off, resulting in us eventually returning back to the user with interrupts
 327  327   * disabled. The solution is to turn on the IE bit in the EFLAGS value saved on
 328  328   * the stack.
 329  329   *
 330  330   * Another subtlety which deserves mention is the difference between the two
 331  331   * descriptors. The call gate descriptor is set to instruct the hardware to copy
 332  332   * one parameter from the user stack to the kernel stack, whereas the interrupt
 333  333   * gate descriptor doesn't use the parameter passing mechanism at all. The
 334  334   * kernel doesn't actually use the parameter that is copied by the hardware; the
 335  335   * only reason it does this is so that there is a space on the stack large
 336  336   * enough to hold an EFLAGS register value, which happens to be in the correct
 337  337   * place for use by iret when we go back to userland. How convenient.
 338  338   *
 339  339   * Stack frame description in syscall() and callees.
 340  340   *
 341  341   * |------------|
 342  342   * | regs       | +(8*4)+4      registers
 343  343   * |------------|
 344  344   * | 8 args     | <- %esp       MAXSYSARGS (currently 8) arguments
 345  345   * |------------|
 346  346   * 
 347  347   */
 348  348  #define SYS_DROP        _CONST(_MUL(MAXSYSARGS, 4))
 349  349  
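
Review note: with MAXSYSARGS currently 8, SYS_DROP works out to 32 bytes of argument
space carved out below the saved registers.  The "turn IE back on" fix described in
the comment above amounts to a single OR into the saved frame before iret, i.e. in C
terms (sketch):

        rp->r_efl |= PS_IE;     /* interrupts come back on when iret restores EFLAGS */
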
 350  350  #if defined(__lint)
 351  351  
 352  352  /*ARGSUSED*/
 353  353  void
 354  354  sys_call()
 355  355  {}
 356  356  
 357  357  void
 358  358  _allsyscalls()
 359  359  {}
 360  360  
 361  361  size_t _allsyscalls_size;
 362  362  
 363  363  #else   /* __lint */
 364  364  
 365  365          ENTRY_NP2(brand_sys_call, _allsyscalls)
 366  366          BRAND_CALLBACK(BRAND_CB_SYSCALL)
 367  367  
 368  368          ALTENTRY(sys_call)
 369  369          / on entry      eax = system call number
 370  370  
 371  371          / set up the stack to look as in reg.h
 372  372          subl    $8, %esp        / pad the stack with ERRCODE and TRAPNO
 373  373  
 374  374          SYSCALL_PUSH
 375  375  
 376  376  #ifdef TRAPTRACE
 377  377          TRACE_PTR(%edi, %ebx, %ebx, %ecx, $TT_SYSCALL) / Uses labels "8" and "9"
 378  378          TRACE_REGS(%edi, %esp, %ebx, %ecx)      / Uses label "9"
 379  379          pushl   %eax
 380  380          TRACE_STAMP(%edi)               / Clobbers %eax, %edx, uses "9"
 381  381          popl    %eax
 382  382          movl    %eax, TTR_SYSNUM(%edi)
 383  383  #endif
 384  384  
 385  385  _watch_do_syscall:
 386  386          movl    %esp, %ebp
 387  387  
 388  388          / Interrupts may be enabled here, so we must make sure this thread
 389  389          / doesn't migrate off the CPU while it updates the CPU stats.
 390  390          /
 391  391          / XXX This is only true if we got here via call gate thru the LDT for
 392  392          / old style syscalls. Perhaps this preempt++-- will go away soon?
 393  393          movl    %gs:CPU_THREAD, %ebx
 394  394          addb    $1, T_PREEMPT(%ebx)
 395  395          CPU_STATS_SYS_SYSCALL_INC
 396  396          subb    $1, T_PREEMPT(%ebx)
 397  397  
 398  398          ENABLE_INTR_FLAGS
 399  399  
 400  400          pushl   %eax                            / preserve across mstate call
 401  401          MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
 402  402          popl    %eax
 403  403  
 404  404          movl    %gs:CPU_THREAD, %ebx
 405  405          
 406  406          ASSERT_LWPTOREGS(%ebx, %esp)
 407  407  
 408  408          CHECK_PRESYS_NE(%ebx, %eax)
 409  409          jne     _full_syscall_presys
 410  410          SIMPLE_SYSCALL_PRESYS(%ebx, _syscall_fault)
 411  411  
 412  412  _syslcall_call:
 413  413          call    *SY_CALLC(%eax)
 414  414  
 415  415  _syslcall_done:
 416  416          CHECK_POSTSYS_NE(%ebx, %ecx)
 417  417          jne     _full_syscall_postsys
 418  418          SIMPLE_SYSCALL_POSTSYS(%ebx, %cx)
 419  419          movl    %eax, REGOFF_EAX(%esp)
 420  420          movl    %edx, REGOFF_EDX(%esp)
 421  421  
 422  422          MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
 423  423  
 424  424          /
 425  425          / get back via iret
 426  426          /
 427  427          CLI(%edx)
 428  428          jmp     sys_rtt_syscall
 429  429  
 430  430  _full_syscall_presys:
 431  431          movl    T_LWP(%ebx), %esi
 432  432          subl    $SYS_DROP, %esp
 433  433          movb    $LWP_SYS, LWP_STATE(%esi)
 434  434          pushl   %esp
 435  435          pushl   %ebx
 436  436          call    syscall_entry
 437  437          addl    $8, %esp
 438  438          jmp     _syslcall_call
 439  439  
 440  440  _full_syscall_postsys:
 441  441          addl    $SYS_DROP, %esp
 442  442          pushl   %edx
 443  443          pushl   %eax
 444  444          pushl   %ebx
 445  445          call    syscall_exit
 446  446          addl    $12, %esp
 447  447          MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
 448  448          jmp     _sys_rtt
 449  449  
 450  450  _syscall_fault:
 451  451          push    $0xe                    / EFAULT
 452  452          call    set_errno
 453  453          addl    $4, %esp
 454  454          xorl    %eax, %eax              / fake syscall_err()
 455  455          xorl    %edx, %edx
 456  456          jmp     _syslcall_done
 457  457          SET_SIZE(sys_call)
 458  458          SET_SIZE(brand_sys_call)
 459  459  
 460  460  #endif  /* __lint */
 461  461  
 462  462  /*
 463  463   * System call handler via the sysenter instruction
 464  464   *
 465  465   * Here's how syscall entry usually works (see sys_call for details).
 466  466   *
 467  467   * There, the caller (lcall or int) in userland has arranged that:
 468  468   *
 469  469   * -    %eax contains the syscall number
 470  470   * -    the user stack contains the args to the syscall
 471  471   *
 472  472   * Normally the lcall instruction into the call gate causes the processor
 473  473   * to push %ss, %esp, <top-of-stack>, %cs, %eip onto the kernel stack.
 474  474   * The sys_call handler then leaves space for r_trapno and r_err, and
 475  475   * pusha's {%eax, %ecx, %edx, %ebx, %esp, %ebp, %esi, %edi}, followed
 476  476   * by %ds, %es, %fs and %gs to capture a 'struct regs' on the stack.
 477  477   * Then the kernel sets %ds, %es and %gs to kernel selectors, and finally
 478  478   * extracts %efl and puts it into r_efl (which happens to live at the offset
 479  479   * that <top-of-stack> was copied into). Note that the value in r_efl has
 480  480   * the IF (interrupt enable) flag turned on. (The int instruction into the
 481  481   * interrupt gate does essentially the same thing, only instead of
 482  482   * <top-of-stack> we get eflags - see comment above.)
 483  483   *
 484  484   * In the sysenter case, things are a lot more primitive.
 485  485   *
 486  486   * The caller in userland has arranged that:
 487  487   *
 488  488   * -    %eax contains the syscall number
 489  489   * -    %ecx contains the user %esp
 490  490   * -    %edx contains the return %eip
 491  491   * -    the user stack contains the args to the syscall
 492  492   *
 493  493   * e.g.
 494  494   *      <args on the stack>
 495  495   *      mov     $SYS_callnum, %eax
 496  496   *      mov     $1f, %edx       / return %eip
 497  497   *      mov     %esp, %ecx      / return %esp
 498  498   *      sysenter
 499  499   * 1:   
 500  500   *
 501  501   * Hardware and (privileged) initialization code have arranged that by
 502  502   * the time the sysenter instructions completes:
 503  503   *
 504  504   * - %eip is pointing to sys_sysenter (below).
 505  505   * - %cs and %ss are set to kernel text and stack (data) selectors.
 506  506   * - %esp is pointing at the lwp's stack
 507  507   * - Interrupts have been disabled.
 508  508   *
 509  509   * The task for the sysenter handler is:
 510  510   *
 511  511   * -    recreate the same regs structure on the stack and the same
 512  512   *      kernel state as if we'd come in on an lcall
 513  513   * -    do the normal work of a syscall
 514  514   * -    execute the system call epilogue, use sysexit to return to userland.
 515  515   *
 516  516   * Note that we are unable to return both "rvals" to userland with this
 517  517   * call, as %edx is used by the sysexit instruction.
 518  518   *
 519  519   * One final complication in this routine is its interaction with
 520  520   * single-stepping in a debugger.  For most of the system call mechanisms,
 521  521   * the CPU automatically clears the single-step flag before we enter the
 522  522   * kernel.  The sysenter mechanism does not clear the flag, so a user
 523  523   * single-stepping through a libc routine may suddenly find him/herself
 524  524   * single-stepping through the kernel.  To detect this, kmdb compares the
 525  525   * trap %pc to the [brand_]sys_enter addresses on each single-step trap.
 526  526   * If it finds that we have single-stepped to a sysenter entry point, it
 527  527   * explicitly clears the flag and executes the sys_sysenter routine.
 528  528   *
 529  529   * One final complication in this final complication is the fact that we
 530  530   * have two different entry points for sysenter: brand_sys_sysenter and
 531  531   * sys_sysenter.  If we enter at brand_sys_sysenter and start single-stepping
 532  532   * through the kernel with kmdb, we will eventually hit the instruction at
 533  533   * sys_sysenter.  kmdb cannot distinguish between that valid single-step
 534  534   * and the undesirable one mentioned above.  To avoid this situation, we
 535  535   * simply add a jump over the instruction at sys_sysenter to make it
 536  536   * impossible to single-step to it.
 537  537   */
 538  538  #if defined(__lint)
 539  539  
 540  540  void
 541  541  sys_sysenter()
 542  542  {}
 543  543  
 544  544  #else   /* __lint */
 545  545  
 546  546          ENTRY_NP(brand_sys_sysenter)
 547  547          pushl   %edx
 548  548          BRAND_CALLBACK(BRAND_CB_SYSENTER)
 549  549          popl    %edx
 550  550          /*
 551  551           * Jump over sys_sysenter to allow single-stepping as described
 552  552           * above.
 553  553           */
 554  554          ja      1f
 555  555  
 556  556          ALTENTRY(sys_sysenter)
 557  557          nop
 558  558  1:
 559  559          /
 560  560          / do what the call gate would've done to the stack ..
 561  561          /
 562  562          pushl   $UDS_SEL        / (really %ss, but it's the same ..)
 563  563          pushl   %ecx            / userland makes this a copy of %esp
 564  564          pushfl
 565  565          orl     $PS_IE, (%esp)  / turn interrupts on when we return to user
 566  566          pushl   $UCS_SEL
 567  567          pushl   %edx            / userland makes this a copy of %eip
 568  568          /
 569  569          / done.  finish building the stack frame
 570  570          /
 571  571          subl    $8, %esp        / leave space for ERR and TRAPNO
 572  572  
 573  573          SYSENTER_PUSH
 574  574  
 575  575  #ifdef TRAPTRACE
 576  576          TRACE_PTR(%edi, %ebx, %ebx, %ecx, $TT_SYSENTER) / uses labels 8 and 9
 577  577          TRACE_REGS(%edi, %esp, %ebx, %ecx)              / uses label 9
 578  578          pushl   %eax
 579  579          TRACE_STAMP(%edi)               / clobbers %eax, %edx, uses label 9
 580  580          popl    %eax
 581  581          movl    %eax, TTR_SYSNUM(%edi)
 582  582  #endif
 583  583          movl    %esp, %ebp
 584  584  
 585  585          CPU_STATS_SYS_SYSCALL_INC
 586  586  
 587  587          ENABLE_INTR_FLAGS
 588  588  
 589  589          pushl   %eax                            / preserve across mstate call
 590  590          MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
 591  591          popl    %eax
 592  592  
 593  593          movl    %gs:CPU_THREAD, %ebx
 594  594  
 595  595          ASSERT_LWPTOREGS(%ebx, %esp)
 596  596  
 597  597          CHECK_PRESYS_NE(%ebx, %eax)
 598  598          jne     _full_syscall_presys
 599  599          SIMPLE_SYSCALL_PRESYS(%ebx, _syscall_fault)
 600  600  
 601  601  _sysenter_call:
 602  602          call    *SY_CALLC(%eax)
 603  603  
 604  604  _sysenter_done:
 605  605          CHECK_POSTSYS_NE(%ebx, %ecx)
 606  606          jne     _full_syscall_postsys
 607  607          SIMPLE_SYSCALL_POSTSYS(%ebx, %cx)
 608  608          /
 609  609          / sysexit uses %edx to restore %eip, so we can't use it
 610  610          / to return a value, sigh.
 611  611          / 
 612  612          movl    %eax, REGOFF_EAX(%esp)
 613  613          / movl  %edx, REGOFF_EDX(%esp)
 614  614  
 615  615          / Interrupts will be turned on by the 'sti' executed just before
 616  616          / sysexit. The following ensures that restoring the user's EFLAGS
 617  617          / doesn't enable interrupts too soon.
 618  618          andl    $_BITNOT(PS_IE), REGOFF_EFL(%esp)
 619  619  
 620  620          MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
 621  621  
 622  622          cli
 623  623  
  
 624  624          SYSCALL_POP
 625  625  
 626  626          popl    %edx                    / sysexit: %edx -> %eip
 627  627          addl    $4, %esp                / get CS off the stack
 628  628          popfl                           / EFL
 629  629          popl    %ecx                    / sysexit: %ecx -> %esp
 630  630          sti
 631  631          sysexit
 632  632          SET_SIZE(sys_sysenter)
 633  633          SET_SIZE(brand_sys_sysenter)
      634 +#endif  /* __lint */
 634  635  
      636 +#if defined(__lint)
 635  637  /*
      638 + * System call via an int80.  This entry point is only used by the Linux
      639 + * application environment.  Unlike the sysenter path, there is no default
      640 + * action to take if no callback is registered for this process.
      641 + */
      642 +void
      643 +sys_int80()
      644 +{}
      645 +
      646 +#else   /* __lint */
      647 +
      648 +        ENTRY_NP(brand_sys_int80)
      649 +        BRAND_CALLBACK(BRAND_CB_INT80)
      650 +
      651 +        ALTENTRY(sys_int80)
      652 +        /*
      653 +         * We hit an int80, but this process isn't of a brand with an int80
      654 +         * handler.  Bad process!  Make it look as if the INT failed.
      655 +         * Modify %eip to point before the INT, push the expected error
      656 +         * code and fake a GP fault.
      657 +         * 
      658 +         */
      659 +        subl    $2, (%esp)      /* int insn 2-bytes */
      660 +        pushl   $_CONST(_MUL(T_INT80, GATE_DESC_SIZE) + 2)
      661 +        jmp     gptrap                  / GP fault
      662 +        SET_SIZE(sys_int80)
      663 +        SET_SIZE(brand_sys_int80)
      664 +
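
Review note on the new int80 entry point: the pushed value mimics the error code the
CPU would report for a #GP through an unusable IDT entry, (vector * GATE_DESC_SIZE)
with the IDT bit (bit 1) set, after winding %eip back over the two-byte int
instruction.  In C terms the fixup looks roughly like this (names follow the struct
regs convention used elsewhere in the file; treat it as a sketch):

        rp->r_eip -= 2;                         /* back up over the 2-byte "int $0x80" */
        err = T_INT80 * GATE_DESC_SIZE + 2;     /* 0x80 * 8 + 2 = 0x402 */
        /* then vector to the #GP handler as if the gate reference itself had faulted */
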
      665 +/*
 636  666   * Declare a uintptr_t which covers the entire pc range of syscall
 637  667   * handlers for the stack walkers that need this.
 638  668   */
 639  669          .align  CPTRSIZE
 640  670          .globl  _allsyscalls_size
 641  671          .type   _allsyscalls_size, @object
 642  672  _allsyscalls_size:
 643  673          .NWORD  . - _allsyscalls
 644  674          SET_SIZE(_allsyscalls_size)
 645  675  
 646  676  #endif  /* __lint */
 647  677  
 648  678  /*
 649  679   * These are the thread context handlers for lwps using sysenter/sysexit.
 650  680   */
 651  681  
 652  682  #if defined(__lint)
 653  683  
 654  684  /*ARGSUSED*/
 655  685  void
 656  686  sep_save(void *ksp)
 657  687  {}
 658  688  
 659  689  /*ARGSUSED*/                    
 660  690  void
 661  691  sep_restore(void *ksp)
 662  692  {}
 663  693  
 664  694  #else   /* __lint */
 665  695  
 666  696          /*
 667  697           * setting this value to zero as we switch away causes the
 668  698           * stack-pointer-on-sysenter to be NULL, ensuring that we
 669  699           * don't silently corrupt another (preempted) thread stack
 670  700           * when running an lwp that (somehow) didn't get sep_restore'd
 671  701           */
 672  702          ENTRY_NP(sep_save)
 673  703          xorl    %edx, %edx
 674  704          xorl    %eax, %eax
 675  705          movl    $MSR_INTC_SEP_ESP, %ecx
 676  706          wrmsr
 677  707          ret
 678  708          SET_SIZE(sep_save)
 679  709  
 680  710          /*
 681  711           * Update the kernel stack pointer as we resume onto this cpu.
 682  712           */
 683  713          ENTRY_NP(sep_restore)
 684  714          movl    4(%esp), %eax                   /* per-lwp kernel sp */
 685  715          xorl    %edx, %edx
 686  716          movl    $MSR_INTC_SEP_ESP, %ecx
 687  717          wrmsr
 688  718          ret
 689  719          SET_SIZE(sep_restore)
 690  720  
 691  721  #endif  /* __lint */
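
Review note: both context handlers are thin wrappers around wrmsr, with %edx:%eax
forming the 64-bit MSR value.  Assuming the usual wrmsr(msr, value) helper, the C
shape is:

        /*ARGSUSED*/
        void
        sep_save(void *ksp)
        {
                wrmsr(MSR_INTC_SEP_ESP, 0);     /* NULL sysenter %esp while switched out */
        }

        void
        sep_restore(void *ksp)
        {
                wrmsr(MSR_INTC_SEP_ESP, (uintptr_t)ksp);        /* this lwp's kernel stack */
        }
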
 692  722  
 693  723  /*
 694  724   * Call syscall().  Called from trap() on watchpoint at lcall 0,7
 695  725   */
 696  726  
 697  727  #if defined(__lint)
 698  728  
 699  729  void
 700  730  watch_syscall(void)
 701  731  {}
 702  732  
 703  733  #else   /* __lint */
 704  734  
 705  735          ENTRY_NP(watch_syscall)
 706  736          CLI(%eax)
 707  737          movl    %gs:CPU_THREAD, %ebx
 708  738          movl    T_STACK(%ebx), %esp             / switch to the thread stack
 709  739          movl    REGOFF_EAX(%esp), %eax          / recover original syscall#
 710  740          jmp     _watch_do_syscall
 711  741          SET_SIZE(watch_syscall)
 712  742  
 713  743  #endif  /* __lint */
  
      68 lines elided