OS-3825 lxbrand rsyslogd abort on centos6
OS-4047 lxbrand vsyscall while SIGSEGV? on next trap we're handler-free!
Reviewed by: Bryan Cantrill <bryan@joyent.com>
    
      
          --- old/usr/src/uts/i86pc/os/trap.c
          +++ new/usr/src/uts/i86pc/os/trap.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   */
  
  25   25  
  26   26  /*      Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
  27   27  /*      Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T   */
  28   28  /*              All Rights Reserved                             */
  29   29  /*                                                              */
  30   30  /*      Copyright (c) 1987, 1988 Microsoft Corporation          */
  31   31  /*              All Rights Reserved                             */
  32   32  /*                                                              */
  33   33  
  34   34  /*
  35      - * Copyright 2012 Joyent, Inc. All rights reserved.
       35 + * Copyright 2015 Joyent, Inc.
  36   36   */
  37   37  
  38   38  #include <sys/types.h>
  39   39  #include <sys/sysmacros.h>
  40   40  #include <sys/param.h>
  41   41  #include <sys/signal.h>
  42   42  #include <sys/systm.h>
  43   43  #include <sys/user.h>
  44   44  #include <sys/proc.h>
  45   45  #include <sys/disp.h>
  46   46  #include <sys/class.h>
  47   47  #include <sys/core.h>
  48   48  #include <sys/syscall.h>
  49   49  #include <sys/cpuvar.h>
  50   50  #include <sys/vm.h>
  51   51  #include <sys/sysinfo.h>
  52   52  #include <sys/fault.h>
  53   53  #include <sys/stack.h>
  54   54  #include <sys/psw.h>
  55   55  #include <sys/regset.h>
  56   56  #include <sys/fp.h>
  57   57  #include <sys/trap.h>
  58   58  #include <sys/kmem.h>
  59   59  #include <sys/vtrace.h>
  60   60  #include <sys/cmn_err.h>
  61   61  #include <sys/prsystm.h>
  62   62  #include <sys/mutex_impl.h>
  63   63  #include <sys/machsystm.h>
  64   64  #include <sys/archsystm.h>
  65   65  #include <sys/sdt.h>
  66   66  #include <sys/avintr.h>
  67   67  #include <sys/kobj.h>
  68   68  
  69   69  #include <vm/hat.h>
  70   70  
  71   71  #include <vm/seg_kmem.h>
  72   72  #include <vm/as.h>
  73   73  #include <vm/seg.h>
  74   74  #include <vm/hat_pte.h>
  75   75  #include <vm/hat_i86.h>
  76   76  
  77   77  #include <sys/procfs.h>
  78   78  
  79   79  #include <sys/reboot.h>
  80   80  #include <sys/debug.h>
  81   81  #include <sys/debugreg.h>
  82   82  #include <sys/modctl.h>
  83   83  #include <sys/aio_impl.h>
  84   84  #include <sys/tnf.h>
  85   85  #include <sys/tnf_probe.h>
  86   86  #include <sys/cred.h>
  87   87  #include <sys/mman.h>
  88   88  #include <sys/x86_archext.h>
  89   89  #include <sys/copyops.h>
  90   90  #include <c2/audit.h>
  91   91  #include <sys/ftrace.h>
  92   92  #include <sys/panic.h>
  93   93  #include <sys/traptrace.h>
  
  94   94  #include <sys/ontrap.h>
  95   95  #include <sys/cpc_impl.h>
  96   96  #include <sys/bootconf.h>
  97   97  #include <sys/bootinfo.h>
  98   98  #include <sys/promif.h>
  99   99  #include <sys/mach_mmu.h>
 100  100  #if defined(__xpv)
 101  101  #include <sys/hypervisor.h>
 102  102  #endif
 103  103  #include <sys/contract/process_impl.h>
      104 +#include <sys/brand.h>
 104  105  
 105  106  #define USER    0x10000         /* user-mode flag added to trap type */
 106  107  
 107  108  static const char *trap_type_mnemonic[] = {
 108  109          "de",   "db",   "2",    "bp",
 109  110          "of",   "br",   "ud",   "nm",
 110  111          "df",   "9",    "ts",   "np",
 111  112          "ss",   "gp",   "pf",   "15",
 112  113          "mf",   "ac",   "mc",   "xf"
 113  114  };
 114  115  
 115  116  static const char *trap_type[] = {
 116  117          "Divide error",                         /* trap id 0    */
 117  118          "Debug",                                /* trap id 1    */
 118  119          "NMI interrupt",                        /* trap id 2    */
 119  120          "Breakpoint",                           /* trap id 3    */
 120  121          "Overflow",                             /* trap id 4    */
 121  122          "BOUND range exceeded",                 /* trap id 5    */
 122  123          "Invalid opcode",                       /* trap id 6    */
 123  124          "Device not available",                 /* trap id 7    */
 124  125          "Double fault",                         /* trap id 8    */
 125  126          "Coprocessor segment overrun",          /* trap id 9    */
 126  127          "Invalid TSS",                          /* trap id 10   */
 127  128          "Segment not present",                  /* trap id 11   */
 128  129          "Stack segment fault",                  /* trap id 12   */
 129  130          "General protection",                   /* trap id 13   */
 130  131          "Page fault",                           /* trap id 14   */
 131  132          "Reserved",                             /* trap id 15   */
 132  133          "x87 floating point error",             /* trap id 16   */
 133  134          "Alignment check",                      /* trap id 17   */
 134  135          "Machine check",                        /* trap id 18   */
 135  136          "SIMD floating point exception",        /* trap id 19   */
 136  137  };
 137  138  
 138  139  #define TRAP_TYPES      (sizeof (trap_type) / sizeof (trap_type[0]))
 139  140  
 140  141  #define SLOW_SCALL_SIZE 2
 141  142  #define FAST_SCALL_SIZE 2
 142  143  
 143  144  int tudebug = 0;
 144  145  int tudebugbpt = 0;
 145  146  int tudebugfpe = 0;
 146  147  int tudebugsse = 0;
 147  148  
 148  149  #if defined(TRAPDEBUG) || defined(lint)
 149  150  int tdebug = 0;
 150  151  int lodebug = 0;
 151  152  int faultdebug = 0;
 152  153  #else
 153  154  #define tdebug  0
 154  155  #define lodebug 0
 155  156  #define faultdebug      0
 156  157  #endif /* defined(TRAPDEBUG) || defined(lint) */
 157  158  
 158  159  #if defined(TRAPTRACE)
 159  160  /*
 160  161   * trap trace record for cpu0 is allocated here.
 161  162   * trap trace records for non-boot cpus are allocated in mp_startup_init().
 162  163   */
 163  164  static trap_trace_rec_t trap_tr0[TRAPTR_NENT];
 164  165  trap_trace_ctl_t trap_trace_ctl[NCPU] = {
 165  166          {
 166  167              (uintptr_t)trap_tr0,                        /* next record */
 167  168              (uintptr_t)trap_tr0,                        /* first record */
 168  169              (uintptr_t)(trap_tr0 + TRAPTR_NENT),        /* limit */
 169  170              (uintptr_t)0                                /* current */
 170  171          },
 171  172  };
 172  173  
 173  174  /*
 174  175   * default trap buffer size
 175  176   */
 176  177  size_t trap_trace_bufsize = TRAPTR_NENT * sizeof (trap_trace_rec_t);
 177  178  int trap_trace_freeze = 0;
 178  179  int trap_trace_off = 0;
 179  180  
 180  181  /*
 181  182   * A dummy TRAPTRACE entry to use after death.
 182  183   */
 183  184  trap_trace_rec_t trap_trace_postmort;
 184  185  
 185  186  static void dump_ttrace(void);
 186  187  #endif  /* TRAPTRACE */
 187  188  static void dumpregs(struct regs *);
 188  189  static void showregs(uint_t, struct regs *, caddr_t);
 189  190  static int kern_gpfault(struct regs *);
 190  191  
 191  192  /*ARGSUSED*/
 192  193  static int
 193  194  die(uint_t type, struct regs *rp, caddr_t addr, processorid_t cpuid)
 194  195  {
 195  196          struct panic_trap_info ti;
 196  197          const char *trap_name, *trap_mnemonic;
 197  198  
 198  199          if (type < TRAP_TYPES) {
 199  200                  trap_name = trap_type[type];
 200  201                  trap_mnemonic = trap_type_mnemonic[type];
 201  202          } else {
 202  203                  trap_name = "trap";
 203  204                  trap_mnemonic = "-";
 204  205          }
 205  206  
 206  207  #ifdef TRAPTRACE
 207  208          TRAPTRACE_FREEZE;
 208  209  #endif
 209  210  
 210  211          ti.trap_regs = rp;
 211  212          ti.trap_type = type & ~USER;
 212  213          ti.trap_addr = addr;
 213  214  
 214  215          curthread->t_panic_trap = &ti;
 215  216  
 216  217          if (type == T_PGFLT && addr < (caddr_t)KERNELBASE) {
 217  218                  panic("BAD TRAP: type=%x (#%s %s) rp=%p addr=%p "
 218  219                      "occurred in module \"%s\" due to %s",
 219  220                      type, trap_mnemonic, trap_name, (void *)rp, (void *)addr,
 220  221                      mod_containing_pc((caddr_t)rp->r_pc),
 221  222                      addr < (caddr_t)PAGESIZE ?
 222  223                      "a NULL pointer dereference" :
 223  224                      "an illegal access to a user address");
 224  225          } else
 225  226                  panic("BAD TRAP: type=%x (#%s %s) rp=%p addr=%p",
 226  227                      type, trap_mnemonic, trap_name, (void *)rp, (void *)addr);
 227  228          return (0);
 228  229  }
 229  230  
 230  231  /*
 231  232   * Rewrite the instruction at pc to be an int $T_SYSCALLINT instruction.
 232  233   *
 233  234   * int <vector> is two bytes: 0xCD <vector>
 234  235   */
 235  236  
 236  237  static int
 237  238  rewrite_syscall(caddr_t pc)
 238  239  {
 239  240          uchar_t instr[SLOW_SCALL_SIZE] = { 0xCD, T_SYSCALLINT };
 240  241  
 241  242          if (uwrite(curthread->t_procp, instr, SLOW_SCALL_SIZE,
 242  243              (uintptr_t)pc) != 0)
 243  244                  return (1);
 244  245  
 245  246          return (0);
 246  247  }
 247  248  
 248  249  /*
 249  250   * Test to see if the instruction at pc is sysenter or syscall. The second
 250  251   * argument should be the x86 feature flag corresponding to the expected
 251  252   * instruction.
 252  253   *
 253  254   * sysenter is two bytes: 0x0F 0x34
 254  255   * syscall is two bytes:  0x0F 0x05
 255  256   * int $T_SYSCALLINT is two bytes: 0xCD 0x91
 256  257   */
 257  258  
 258  259  static int
 259  260  instr_is_other_syscall(caddr_t pc, int which)
 260  261  {
 261  262          uchar_t instr[FAST_SCALL_SIZE];
 262  263  
 263  264          ASSERT(which == X86FSET_SEP || which == X86FSET_ASYSC || which == 0xCD);
 264  265  
 265  266          if (copyin_nowatch(pc, (caddr_t)instr, FAST_SCALL_SIZE) != 0)
 266  267                  return (0);
 267  268  
 268  269          switch (which) {
 269  270          case X86FSET_SEP:
 270  271                  if (instr[0] == 0x0F && instr[1] == 0x34)
 271  272                          return (1);
 272  273                  break;
 273  274          case X86FSET_ASYSC:
 274  275                  if (instr[0] == 0x0F && instr[1] == 0x05)
 275  276                          return (1);
 276  277                  break;
 277  278          case 0xCD:
 278  279                  if (instr[0] == 0xCD && instr[1] == T_SYSCALLINT)
 279  280                          return (1);
 280  281                  break;
 281  282          }
 282  283  
 283  284          return (0);
 284  285  }
 285  286  
 286  287  static const char *
 287  288  syscall_insn_string(int syscall_insn)
 288  289  {
 289  290          switch (syscall_insn) {
 290  291          case X86FSET_SEP:
 291  292                  return ("sysenter");
 292  293          case X86FSET_ASYSC:
 293  294                  return ("syscall");
 294  295          case 0xCD:
 295  296                  return ("int");
 296  297          default:
 297  298                  return ("Unknown");
 298  299          }
 299  300  }
 300  301  
 301  302  static int
 302  303  ldt_rewrite_syscall(struct regs *rp, proc_t *p, int syscall_insn)
 303  304  {
 304  305          caddr_t linearpc;
 305  306          int return_code = 0;
 306  307  
 307  308          mutex_enter(&p->p_ldtlock);     /* Must be held across linear_pc() */
 308  309  
 309  310          if (linear_pc(rp, p, &linearpc) == 0) {
 310  311  
 311  312                  /*
 312  313                   * If another thread beat us here, it already changed
 313  314                   * this site to the slower (int) syscall instruction.
 314  315                   */
 315  316                  if (instr_is_other_syscall(linearpc, 0xCD)) {
 316  317                          return_code = 1;
 317  318                  } else if (instr_is_other_syscall(linearpc, syscall_insn)) {
 318  319  
 319  320                          if (rewrite_syscall(linearpc) == 0) {
 320  321                                  return_code = 1;
 321  322                          }
 322  323  #ifdef DEBUG
 323  324                          else
 324  325                                  cmn_err(CE_WARN, "failed to rewrite %s "
 325  326                                      "instruction in process %d",
 326  327                                      syscall_insn_string(syscall_insn),
 327  328                                      p->p_pid);
 328  329  #endif /* DEBUG */
 329  330                  }
 330  331          }
 331  332  
 332  333          mutex_exit(&p->p_ldtlock);      /* Must be held across linear_pc() */
 333  334  
 334  335          return (return_code);
 335  336  }
 336  337  
 337  338  /*
 338  339   * Test to see if the instruction at pc is a system call instruction.
 339  340   *
 340  341   * The bytes of an lcall instruction used for the syscall trap.
 341  342   * static uchar_t lcall[7] = { 0x9a, 0, 0, 0, 0, 0x7, 0 };
 342  343   * static uchar_t lcallalt[7] = { 0x9a, 0, 0, 0, 0, 0x27, 0 };
 343  344   */
 344  345  
 345  346  #define LCALLSIZE       7
 346  347  
 347  348  static int
 348  349  instr_is_lcall_syscall(caddr_t pc)
 349  350  {
 350  351          uchar_t instr[LCALLSIZE];
 351  352  
 352  353          if (copyin_nowatch(pc, (caddr_t)instr, LCALLSIZE) == 0 &&
 353  354              instr[0] == 0x9a &&
 354  355              instr[1] == 0 &&
 355  356              instr[2] == 0 &&
 356  357              instr[3] == 0 &&
 357  358              instr[4] == 0 &&
 358  359              (instr[5] == 0x7 || instr[5] == 0x27) &&
 359  360              instr[6] == 0)
 360  361                  return (1);
 361  362  
 362  363          return (0);
 363  364  }
 364  365  
 365  366  #ifdef __amd64
 366  367  
 367  368  /*
 368  369   * In the first revisions of amd64 CPUs produced by AMD, the LAHF and
 369  370   * SAHF instructions were not implemented in 64-bit mode. Later revisions
 370  371   * did implement these instructions. An extension to the cpuid instruction
 371  372   * was added to check for the capability of executing these instructions
 372  373   * in 64-bit mode.
 373  374   *
 374  375   * Intel originally did not implement these instructions in EM64T either,
 375  376   * but added them in later revisions.
 376  377   *
 377  378   * So, there are different chip revisions by both vendors out there that
 378  379   * may or may not implement these instructions. The easy solution is to
 379  380   * just always emulate these instructions on demand.
 380  381   *
 381  382   * SAHF == store %ah in the lower 8 bits of %rflags (opcode 0x9e)
 382  383   * LAHF == load the lower 8 bits of %rflags into %ah (opcode 0x9f)
 383  384   */
 384  385  
 385  386  #define LSAHFSIZE 1
 386  387  
 387  388  static int
 388  389  instr_is_lsahf(caddr_t pc, uchar_t *instr)
 389  390  {
 390  391          if (copyin_nowatch(pc, (caddr_t)instr, LSAHFSIZE) == 0 &&
 391  392              (*instr == 0x9e || *instr == 0x9f))
 392  393                  return (1);
 393  394          return (0);
 394  395  }
 395  396  
 396  397  /*
 397  398   * Emulate the LAHF and SAHF instructions. The reference manuals define
 398  399   * these instructions to always load/store bit 1 as a 1, and bits 3 and 5
 399  400   * as a 0. The other, defined, bits are copied (the PS_ICC bits and PS_P).
 400  401   *
 401  402   * Note that %ah is bits 8-15 of %rax.
 402  403   */
 403  404  static void
 404  405  emulate_lsahf(struct regs *rp, uchar_t instr)
 405  406  {
 406  407          if (instr == 0x9e) {
 407  408                  /* sahf. Copy bits from %ah to flags. */
 408  409                  rp->r_ps = (rp->r_ps & ~0xff) |
 409  410                      ((rp->r_rax >> 8) & PSL_LSAHFMASK) | PS_MB1;
 410  411          } else {
 411  412                  /* lahf. Copy bits from flags to %ah. */
 412  413                  rp->r_rax = (rp->r_rax & ~0xff00) |
 413  414                      (((rp->r_ps & PSL_LSAHFMASK) | PS_MB1) << 8);
 414  415          }
 415  416          rp->r_pc += LSAHFSIZE;
 416  417  }
 417  418  #endif /* __amd64 */
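
    As a concrete check of the masking above: a minimal userland sketch of
    the same transformation.  The PSL_LSAHFMASK and PS_MB1 values below are
    assumed stand-ins for the <sys/psw.h> definitions (PS_ICC | PS_P, and
    the must-be-one bit 1); only the bit manipulation is the point.

        #include <stdio.h>
        #include <stdint.h>

        #define PSL_LSAHFMASK   0xd5    /* assumed stand-in: PS_ICC | PS_P */
        #define PS_MB1          0x02    /* assumed stand-in: must-be-one bit 1 */

        int
        main(void)
        {
                uint64_t rax = 0xffULL << 8;    /* %ah = 0xff */
                uint64_t ps = 0;

                /* sahf: %ah -> flags; bits 3 and 5 masked off, bit 1 forced on */
                ps = (ps & ~0xffULL) | ((rax >> 8) & PSL_LSAHFMASK) | PS_MB1;
                printf("sahf: flags = 0x%02llx\n", (unsigned long long)ps);

                /* lahf: flags -> %ah, with the same fixed bits */
                rax = (rax & ~0xff00ULL) | (((ps & PSL_LSAHFMASK) | PS_MB1) << 8);
                printf("lahf: %%ah  = 0x%02llx\n",
                    (unsigned long long)((rax >> 8) & 0xff));
                return (0);
        }

    Both lines print 0xd7: 0xff with bits 3 and 5 cleared and bit 1 kept set,
    exactly the fixed-bit behavior the comment above describes.
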
 418  419  
 419  420  #ifdef OPTERON_ERRATUM_91
 420  421  
 421  422  /*
 422  423   * Test to see if the instruction at pc is a prefetch instruction.
 423  424   *
 424  425   * The first byte of prefetch instructions is always 0x0F.
 425  426   * The second byte is 0x18 for regular prefetch or 0x0D for AMD 3dnow prefetch.
 426  427   * The third byte (ModRM) contains the register field bits (bits 3-5).
 427  428   * These bits must be between 0 and 3 inclusive for regular prefetch and
 428  429   * 0 and 1 inclusive for AMD 3dnow prefetch.
 429  430   *
  430  431   * In 64-bit mode, there may be a one-byte REX prefix (0x40-0x4F).
 431  432   */
 432  433  
 433  434  static int
 434  435  cmp_to_prefetch(uchar_t *p)
 435  436  {
 436  437  #ifdef _LP64
 437  438          if ((p[0] & 0xF0) == 0x40)      /* 64-bit REX prefix */
 438  439                  p++;
 439  440  #endif
 440  441          return ((p[0] == 0x0F && p[1] == 0x18 && ((p[2] >> 3) & 7) <= 3) ||
 441  442              (p[0] == 0x0F && p[1] == 0x0D && ((p[2] >> 3) & 7) <= 1));
 442  443  }
 443  444  
 444  445  static int
 445  446  instr_is_prefetch(caddr_t pc)
 446  447  {
 447  448          uchar_t instr[4];       /* optional REX prefix plus 3-byte opcode */
 448  449  
 449  450          return (copyin_nowatch(pc, instr, sizeof (instr)) == 0 &&
 450  451              cmp_to_prefetch(instr));
 451  452  }
 452  453  
 453  454  #endif /* OPTERON_ERRATUM_91 */
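
    For reference, sample byte sequences against the matcher above: a minimal
    userland harness restating the cmp_to_prefetch() logic.  The opcode
    patterns come from the comment above; the ModRM operands chosen here are
    arbitrary illustrations.

        #include <stdio.h>

        /* restatement of cmp_to_prefetch() for illustration */
        static int
        is_prefetch(unsigned char *p)
        {
                if ((p[0] & 0xF0) == 0x40)      /* skip a 64-bit REX prefix */
                        p++;
                return ((p[0] == 0x0F && p[1] == 0x18 && ((p[2] >> 3) & 7) <= 3) ||
                    (p[0] == 0x0F && p[1] == 0x0D && ((p[2] >> 3) & 7) <= 1));
        }

        int
        main(void)
        {
                unsigned char t0[] = { 0x0F, 0x18, 0x0D, 0 };    /* prefetcht0: reg = 1 */
                unsigned char rex[] = { 0x48, 0x0F, 0x18, 0x06 }; /* REX + prefetchnta */
                unsigned char bad[] = { 0x0F, 0x18, 0x28, 0 };   /* reg = 5: rejected */

                /* prints "1 1 0" */
                printf("%d %d %d\n", is_prefetch(t0), is_prefetch(rex),
                    is_prefetch(bad));
                return (0);
        }
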
 454  455  
 455  456  /*
 456  457   * Called from the trap handler when a processor trap occurs.
 457  458   *
 458  459   * Note: All user-level traps that might call stop() must exit
 459  460   * trap() by 'goto out' or by falling through.
 460  461   * Note Also: trap() is usually called with interrupts enabled, (PS_IE == 1)
 461  462   * however, there are paths that arrive here with PS_IE == 0 so special care
 462  463   * must be taken in those cases.
 463  464   */
 464  465  void
 465  466  trap(struct regs *rp, caddr_t addr, processorid_t cpuid)
 466  467  {
 467  468          kthread_t *ct = curthread;
 468  469          enum seg_rw rw;
 469  470          unsigned type;
 470  471          proc_t *p = ttoproc(ct);
 471  472          klwp_t *lwp = ttolwp(ct);
 472  473          uintptr_t lofault;
 473  474          label_t *onfault;
 474  475          faultcode_t pagefault(), res, errcode;
 475  476          enum fault_type fault_type;
 476  477          k_siginfo_t siginfo;
 477  478          uint_t fault = 0;
 478  479          int mstate;
 479  480          int sicode = 0;
 480  481          int watchcode;
 481  482          int watchpage;
 482  483          caddr_t vaddr;
 483  484          int singlestep_twiddle;
 484  485          size_t sz;
 485  486          int ta;
 486  487  #ifdef __amd64
 487  488          uchar_t instr;
 488  489  #endif
 489  490  
 490  491          ASSERT_STACK_ALIGNED();
 491  492  
 492  493          type = rp->r_trapno;
 493  494          CPU_STATS_ADDQ(CPU, sys, trap, 1);
 494  495          ASSERT(ct->t_schedflag & TS_DONT_SWAP);
 495  496  
 496  497          if (type == T_PGFLT) {
 497  498  
 498  499                  errcode = rp->r_err;
 499  500                  if (errcode & PF_ERR_WRITE)
 500  501                          rw = S_WRITE;
 501  502                  else if ((caddr_t)rp->r_pc == addr ||
 502  503                      (mmu.pt_nx != 0 && (errcode & PF_ERR_EXEC)))
 503  504                          rw = S_EXEC;
 504  505                  else
 505  506                          rw = S_READ;
 506  507  
 507  508  #if defined(__i386)
 508  509                  /*
 509  510                   * Pentium Pro work-around
 510  511                   */
 511  512                  if ((errcode & PF_ERR_PROT) && pentiumpro_bug4046376) {
 512  513                          uint_t  attr;
 513  514                          uint_t  priv_violation;
 514  515                          uint_t  access_violation;
 515  516  
 516  517                          if (hat_getattr(addr < (caddr_t)kernelbase ?
 517  518                              curproc->p_as->a_hat : kas.a_hat, addr, &attr)
 518  519                              == -1) {
 519  520                                  errcode &= ~PF_ERR_PROT;
 520  521                          } else {
 521  522                                  priv_violation = (errcode & PF_ERR_USER) &&
 522  523                                      !(attr & PROT_USER);
 523  524                                  access_violation = (errcode & PF_ERR_WRITE) &&
 524  525                                      !(attr & PROT_WRITE);
 525  526                                  if (!priv_violation && !access_violation)
 526  527                                          goto cleanup;
 527  528                          }
 528  529                  }
 529  530  #endif /* __i386 */
 530  531  
 531  532          } else if (type == T_SGLSTP && lwp != NULL)
 532  533                  lwp->lwp_pcb.pcb_drstat = (uintptr_t)addr;
 533  534  
 534  535          if (tdebug)
 535  536                  showregs(type, rp, addr);
 536  537  
 537  538          if (USERMODE(rp->r_cs)) {
 538  539                  /*
 539  540                   * Set up the current cred to use during this trap. u_cred
 540  541                   * no longer exists.  t_cred is used instead.
 541  542                   * The current process credential applies to the thread for
 542  543                   * the entire trap.  If trapping from the kernel, this
 543  544                   * should already be set up.
 544  545                   */
 545  546                  if (ct->t_cred != p->p_cred) {
 546  547                          cred_t *oldcred = ct->t_cred;
 547  548                          /*
 548  549                           * DTrace accesses t_cred in probe context.  t_cred
 549  550                           * must always be either NULL, or point to a valid,
 550  551                           * allocated cred structure.
 551  552                           */
 552  553                          ct->t_cred = crgetcred();
 553  554                          crfree(oldcred);
 554  555                  }
 555  556                  ASSERT(lwp != NULL);
 556  557                  type |= USER;
 557  558                  ASSERT(lwptoregs(lwp) == rp);
 558  559                  lwp->lwp_state = LWP_SYS;
 559  560  
 560  561                  switch (type) {
 561  562                  case T_PGFLT + USER:
 562  563                          if ((caddr_t)rp->r_pc == addr)
 563  564                                  mstate = LMS_TFAULT;
 564  565                          else
 565  566                                  mstate = LMS_DFAULT;
 566  567                          break;
 567  568                  default:
 568  569                          mstate = LMS_TRAP;
 569  570                          break;
 570  571                  }
 571  572                  /* Kernel probe */
 572  573                  TNF_PROBE_1(thread_state, "thread", /* CSTYLED */,
 573  574                      tnf_microstate, state, mstate);
 574  575                  mstate = new_mstate(ct, mstate);
 575  576  
 576  577                  bzero(&siginfo, sizeof (siginfo));
 577  578          }
 578  579  
 579  580          switch (type) {
 580  581          case T_PGFLT + USER:
 581  582          case T_SGLSTP:
 582  583          case T_SGLSTP + USER:
 583  584          case T_BPTFLT + USER:
 584  585                  break;
 585  586  
 586  587          default:
 587  588                  FTRACE_2("trap(): type=0x%lx, regs=0x%lx",
 588  589                      (ulong_t)type, (ulong_t)rp);
 589  590                  break;
 590  591          }
 591  592  
 592  593          switch (type) {
 593  594          case T_SIMDFPE:
 594  595                  /* Make sure we enable interrupts before die()ing */
 595  596                  sti();  /* The SIMD exception comes in via cmninttrap */
 596  597                  /*FALLTHROUGH*/
 597  598          default:
 598  599                  if (type & USER) {
 599  600                          if (tudebug)
 600  601                                  showregs(type, rp, (caddr_t)0);
 601  602                          printf("trap: Unknown trap type %d in user mode\n",
 602  603                              type & ~USER);
 603  604                          siginfo.si_signo = SIGILL;
 604  605                          siginfo.si_code  = ILL_ILLTRP;
 605  606                          siginfo.si_addr  = (caddr_t)rp->r_pc;
 606  607                          siginfo.si_trapno = type & ~USER;
 607  608                          fault = FLTILL;
 608  609                          break;
 609  610                  } else {
 610  611                          (void) die(type, rp, addr, cpuid);
 611  612                          /*NOTREACHED*/
 612  613                  }
 613  614  
 614  615          case T_PGFLT:           /* system page fault */
 615  616                  /*
 616  617                   * If we're under on_trap() protection (see <sys/ontrap.h>),
 617  618                   * set ot_trap and bounce back to the on_trap() call site
 618  619                   * via the installed trampoline.
 619  620                   */
 620  621                  if ((ct->t_ontrap != NULL) &&
 621  622                      (ct->t_ontrap->ot_prot & OT_DATA_ACCESS)) {
 622  623                          ct->t_ontrap->ot_trap |= OT_DATA_ACCESS;
 623  624                          rp->r_pc = ct->t_ontrap->ot_trampoline;
 624  625                          goto cleanup;
 625  626                  }
 626  627  
 627  628                  /*
 628  629                   * If we have an Instruction fault in kernel mode, then that
 629  630                   * means we've tried to execute a user page (SMEP) or both of
 630  631                   * PAE and NXE are enabled. In either case, given that it's a
 631  632                   * kernel fault, we should panic immediately and not try to make
 632  633                   * any more forward progress. This indicates a bug in the
 633  634                   * kernel, which if execution continued, could be exploited to
 634  635                   * wreak havoc on the system.
 635  636                   */
 636  637                  if (errcode & PF_ERR_EXEC) {
 637  638                          (void) die(type, rp, addr, cpuid);
 638  639                  }
 639  640  
 640  641                  /*
 641  642                   * We need to check if SMAP is in play. If SMAP is in play, then
 642  643                   * any access to a user page will show up as a protection
 643  644                   * violation. To see if SMAP is enabled we first check if it's a
 644  645                   * user address and whether we have the feature flag set. If we
 645  646                   * do and the interrupted registers do not allow for user
 646  647                   * accesses (PS_ACHK is not enabled), then we need to die
 647  648                   * immediately.
 648  649                   */
 649  650                  if (addr < (caddr_t)kernelbase &&
 650  651                      is_x86_feature(x86_featureset, X86FSET_SMAP) == B_TRUE &&
 651  652                      (rp->r_ps & PS_ACHK) == 0) {
 652  653                          (void) die(type, rp, addr, cpuid);
 653  654                  }
 654  655  
 655  656                  /*
 656  657                   * See if we can handle as pagefault. Save lofault and onfault
 657  658                   * across this. Here we assume that an address less than
 658  659                   * KERNELBASE is a user fault.  We can do this as copy.s
 659  660                   * routines verify that the starting address is less than
 660  661                   * KERNELBASE before starting and because we know that we
 661  662                   * always have KERNELBASE mapped as invalid to serve as a
 662  663                   * "barrier".
 663  664                   */
 664  665                  lofault = ct->t_lofault;
 665  666                  onfault = ct->t_onfault;
 666  667                  ct->t_lofault = 0;
 667  668  
 668  669                  mstate = new_mstate(ct, LMS_KFAULT);
 669  670  
 670  671                  if (addr < (caddr_t)kernelbase) {
 671  672                          res = pagefault(addr,
 672  673                              (errcode & PF_ERR_PROT)? F_PROT: F_INVAL, rw, 0);
 673  674                          if (res == FC_NOMAP &&
 674  675                              addr < p->p_usrstack &&
 675  676                              grow(addr))
 676  677                                  res = 0;
 677  678                  } else {
 678  679                          res = pagefault(addr,
 679  680                              (errcode & PF_ERR_PROT)? F_PROT: F_INVAL, rw, 1);
 680  681                  }
 681  682                  (void) new_mstate(ct, mstate);
 682  683  
 683  684                  /*
 684  685                   * Restore lofault and onfault. If we resolved the fault, exit.
 685  686                   * If we didn't and lofault wasn't set, die.
 686  687                   */
 687  688                  ct->t_lofault = lofault;
 688  689                  ct->t_onfault = onfault;
 689  690                  if (res == 0)
 690  691                          goto cleanup;
 691  692  
 692  693  #if defined(OPTERON_ERRATUM_93) && defined(_LP64)
 693  694                  if (lofault == 0 && opteron_erratum_93) {
 694  695                          /*
 695  696                           * Workaround for Opteron Erratum 93. On return from
  696  697                           * a System Management Interrupt at a HLT instruction
 697  698                           * the %rip might be truncated to a 32 bit value.
 698  699                           * BIOS is supposed to fix this, but some don't.
 699  700                           * If this occurs we simply restore the high order bits.
 700  701                           * The HLT instruction is 1 byte of 0xf4.
 701  702                           */
 702  703                          uintptr_t       rip = rp->r_pc;
 703  704  
 704  705                          if ((rip & 0xfffffffful) == rip) {
 705  706                                  rip |= 0xfffffffful << 32;
 706  707                                  if (hat_getpfnum(kas.a_hat, (caddr_t)rip) !=
 707  708                                      PFN_INVALID &&
 708  709                                      (*(uchar_t *)rip == 0xf4 ||
 709  710                                      *(uchar_t *)(rip - 1) == 0xf4)) {
 710  711                                          rp->r_pc = rip;
 711  712                                          goto cleanup;
 712  713                                  }
 713  714                          }
 714  715                  }
 715  716  #endif /* OPTERON_ERRATUM_93 && _LP64 */
 716  717  
 717  718  #ifdef OPTERON_ERRATUM_91
 718  719                  if (lofault == 0 && opteron_erratum_91) {
 719  720                          /*
 720  721                           * Workaround for Opteron Erratum 91. Prefetches may
 721  722                           * generate a page fault (they're not supposed to do
 722  723                           * that!). If this occurs we simply return back to the
 723  724                           * instruction.
 724  725                           */
 725  726                          caddr_t         pc = (caddr_t)rp->r_pc;
 726  727  
 727  728                          /*
 728  729                           * If the faulting PC is not mapped, this is a
 729  730                           * legitimate kernel page fault that must result in a
 730  731                           * panic. If the faulting PC is mapped, it could contain
 731  732                           * a prefetch instruction. Check for that here.
 732  733                           */
 733  734                          if (hat_getpfnum(kas.a_hat, pc) != PFN_INVALID) {
 734  735                                  if (cmp_to_prefetch((uchar_t *)pc)) {
 735  736  #ifdef DEBUG
 736  737                                          cmn_err(CE_WARN, "Opteron erratum 91 "
 737  738                                              "occurred: kernel prefetch"
 738  739                                              " at %p generated a page fault!",
 739  740                                              (void *)rp->r_pc);
 740  741  #endif /* DEBUG */
 741  742                                          goto cleanup;
 742  743                                  }
 743  744                          }
 744  745                          (void) die(type, rp, addr, cpuid);
 745  746                  }
 746  747  #endif /* OPTERON_ERRATUM_91 */
 747  748  
 748  749                  if (lofault == 0)
 749  750                          (void) die(type, rp, addr, cpuid);
 750  751  
 751  752                  /*
 752  753                   * Cannot resolve fault.  Return to lofault.
 753  754                   */
 754  755                  if (lodebug) {
 755  756                          showregs(type, rp, addr);
 756  757                          traceregs(rp);
 757  758                  }
 758  759                  if (FC_CODE(res) == FC_OBJERR)
 759  760                          res = FC_ERRNO(res);
 760  761                  else
 761  762                          res = EFAULT;
 762  763                  rp->r_r0 = res;
 763  764                  rp->r_pc = ct->t_lofault;
 764  765                  goto cleanup;
 765  766  
 766  767          case T_PGFLT + USER:    /* user page fault */
 767  768                  if (faultdebug) {
 768  769                          char *fault_str;
 769  770  
 770  771                          switch (rw) {
 771  772                          case S_READ:
 772  773                                  fault_str = "read";
 773  774                                  break;
 774  775                          case S_WRITE:
 775  776                                  fault_str = "write";
 776  777                                  break;
 777  778                          case S_EXEC:
 778  779                                  fault_str = "exec";
 779  780                                  break;
 780  781                          default:
 781  782                                  fault_str = "";
 782  783                                  break;
 783  784                          }
 784  785                          printf("user %s fault:  addr=0x%lx errcode=0x%x\n",
 785  786                              fault_str, (uintptr_t)addr, errcode);
 786  787                  }
 787  788  
 788  789  #if defined(OPTERON_ERRATUM_100) && defined(_LP64)
 789  790                  /*
 790  791                   * Workaround for AMD erratum 100
 791  792                   *
 792  793                   * A 32-bit process may receive a page fault on a non
 793  794                   * 32-bit address by mistake. The range of the faulting
 794  795                   * address will be
 795  796                   *
 796  797                   *      0xffffffff80000000 .. 0xffffffffffffffff or
 797  798                   *      0x0000000100000000 .. 0x000000017fffffff
 798  799                   *
 799  800                   * The fault is always due to an instruction fetch, however
 800  801                   * the value of r_pc should be correct (in 32 bit range),
 801  802                   * so we ignore the page fault on the bogus address.
 802  803                   */
 803  804                  if (p->p_model == DATAMODEL_ILP32 &&
 804  805                      (0xffffffff80000000 <= (uintptr_t)addr ||
 805  806                      (0x100000000 <= (uintptr_t)addr &&
 806  807                      (uintptr_t)addr <= 0x17fffffff))) {
 807  808                          if (!opteron_erratum_100)
 808  809                                  panic("unexpected erratum #100");
 809  810                          if (rp->r_pc <= 0xffffffff)
 810  811                                  goto out;
 811  812                  }
 812  813  #endif /* OPTERON_ERRATUM_100 && _LP64 */
 813  814  
 814  815                  ASSERT(!(curthread->t_flag & T_WATCHPT));
 815  816                  watchpage = (pr_watch_active(p) && pr_is_watchpage(addr, rw));
 816  817  #ifdef __i386
 817  818                  /*
 818  819                   * In 32-bit mode, the lcall (system call) instruction fetches
 819  820                   * one word from the stack, at the stack pointer, because of the
 820  821                   * way the call gate is constructed.  This is a bogus
 821  822                   * read and should not be counted as a read watchpoint.
 822  823                   * We work around the problem here by testing to see if
 823  824                   * this situation applies and, if so, simply jumping to
 824  825                   * the code in locore.s that fields the system call trap.
 825  826                   * The registers on the stack are already set up properly
 826  827                   * due to the match between the call gate sequence and the
 827  828                   * trap gate sequence.  We just have to adjust the pc.
 828  829                   */
 829  830                  if (watchpage && addr == (caddr_t)rp->r_sp &&
 830  831                      rw == S_READ && instr_is_lcall_syscall((caddr_t)rp->r_pc)) {
 831  832                          extern void watch_syscall(void);
 832  833  
 833  834                          rp->r_pc += LCALLSIZE;
 834  835                          watch_syscall();        /* never returns */
 835  836                          /* NOTREACHED */
 836  837                  }
 837  838  #endif /* __i386 */
 838  839                  vaddr = addr;
 839  840                  if (!watchpage || (sz = instr_size(rp, &vaddr, rw)) <= 0)
 840  841                          fault_type = (errcode & PF_ERR_PROT)? F_PROT: F_INVAL;
 841  842                  else if ((watchcode = pr_is_watchpoint(&vaddr, &ta,
 842  843                      sz, NULL, rw)) != 0) {
 843  844                          if (ta) {
 844  845                                  do_watch_step(vaddr, sz, rw,
 845  846                                      watchcode, rp->r_pc);
 846  847                                  fault_type = F_INVAL;
 847  848                          } else {
 848  849                                  bzero(&siginfo, sizeof (siginfo));
 849  850                                  siginfo.si_signo = SIGTRAP;
 850  851                                  siginfo.si_code = watchcode;
 851  852                                  siginfo.si_addr = vaddr;
 852  853                                  siginfo.si_trapafter = 0;
 853  854                                  siginfo.si_pc = (caddr_t)rp->r_pc;
 854  855                                  fault = FLTWATCH;
  
 855  856                                  break;
 856  857                          }
 857  858                  } else {
 858  859                          /* XXX pr_watch_emul() never succeeds (for now) */
 859  860                          if (rw != S_EXEC && pr_watch_emul(rp, vaddr, rw))
 860  861                                  goto out;
 861  862                          do_watch_step(vaddr, sz, rw, 0, 0);
 862  863                          fault_type = F_INVAL;
 863  864                  }
 864  865  
      866 +                /*
      867 +                 * Allow the brand to interpose on invalid memory accesses
      868 +                 * prior to running the native pagefault handler.  If this
      869 +                 * brand hook returns zero, it was able to handle the fault
      870 +                 * completely.  Otherwise, drive on and call pagefault().
      871 +                 */
      872 +                if (PROC_IS_BRANDED(p) && BROP(p)->b_pagefault != NULL &&
      873 +                    BROP(p)->b_pagefault(p, lwp, addr, fault_type, rw) == 0) {
      874 +                        goto out;
      875 +                }
      876 +
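
    The new hook gives a brand first crack at an invalid user access before
    it becomes a pagefault()/SIGSEGV, which is the sort of interposition the
    lx fixes in this change rely on (e.g. emulating accesses to addresses
    that deliberately have no mapping).  A hypothetical sketch of the shape
    such a handler takes; the xx_* names are illustrative, not the actual
    lx brand code:

        /*
         * Installed in the brand's brand_ops as b_pagefault.  Returning 0
         * tells trap() the fault was consumed entirely; any other value
         * falls through to the normal pagefault() path below.
         */
        static int
        xx_brand_pagefault(proc_t *p, klwp_t *lwp, caddr_t addr,
            enum fault_type type, enum seg_rw rw)
        {
                /*
                 * Illustrative: emulate an instruction fetch from a page
                 * that intentionally has no real mapping.
                 */
                if (rw == S_EXEC && xx_is_emulated_addr(p, addr)) {
                        xx_emulate_access(lwp, addr);
                        return (0);             /* fault fully handled */
                }
                return (-1);                    /* not ours; run pagefault() */
        }
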
 865  877                  res = pagefault(addr, fault_type, rw, 0);
 866  878  
 867  879                  /*
 868  880                   * If pagefault() succeeded, ok.
 869  881                   * Otherwise attempt to grow the stack.
 870  882                   */
 871  883                  if (res == 0 ||
 872  884                      (res == FC_NOMAP &&
 873  885                      addr < p->p_usrstack &&
 874  886                      grow(addr))) {
 875  887                          lwp->lwp_lastfault = FLTPAGE;
 876  888                          lwp->lwp_lastfaddr = addr;
 877  889                          if (prismember(&p->p_fltmask, FLTPAGE)) {
 878  890                                  bzero(&siginfo, sizeof (siginfo));
 879  891                                  siginfo.si_addr = addr;
 880  892                                  (void) stop_on_fault(FLTPAGE, &siginfo);
 881  893                          }
 882  894                          goto out;
 883  895                  } else if (res == FC_PROT && addr < p->p_usrstack &&
 884  896                      (mmu.pt_nx != 0 && (errcode & PF_ERR_EXEC))) {
 885  897                          report_stack_exec(p, addr);
 886  898                  }
 887  899  
 888  900  #ifdef OPTERON_ERRATUM_91
 889  901                  /*
 890  902                   * Workaround for Opteron Erratum 91. Prefetches may generate a
 891  903                   * page fault (they're not supposed to do that!). If this
 892  904                   * occurs we simply return back to the instruction.
 893  905                   *
 894  906                   * We rely on copyin to properly fault in the page with r_pc.
 895  907                   */
 896  908                  if (opteron_erratum_91 &&
 897  909                      addr != (caddr_t)rp->r_pc &&
 898  910                      instr_is_prefetch((caddr_t)rp->r_pc)) {
 899  911  #ifdef DEBUG
 900  912                          cmn_err(CE_WARN, "Opteron erratum 91 occurred: "
 901  913                              "prefetch at %p in pid %d generated a trap!",
 902  914                              (void *)rp->r_pc, p->p_pid);
 903  915  #endif /* DEBUG */
 904  916                          goto out;
 905  917                  }
 906  918  #endif /* OPTERON_ERRATUM_91 */
 907  919  
 908  920                  if (tudebug)
 909  921                          showregs(type, rp, addr);
 910  922                  /*
 911  923                   * In the case where both pagefault and grow fail,
 912  924                   * set the code to the value provided by pagefault.
 913  925                   * We map all errors returned from pagefault() to SIGSEGV.
 914  926                   */
 915  927                  bzero(&siginfo, sizeof (siginfo));
 916  928                  siginfo.si_addr = addr;
 917  929                  switch (FC_CODE(res)) {
 918  930                  case FC_HWERR:
 919  931                  case FC_NOSUPPORT:
 920  932                          siginfo.si_signo = SIGBUS;
 921  933                          siginfo.si_code = BUS_ADRERR;
 922  934                          fault = FLTACCESS;
 923  935                          break;
 924  936                  case FC_ALIGN:
 925  937                          siginfo.si_signo = SIGBUS;
 926  938                          siginfo.si_code = BUS_ADRALN;
 927  939                          fault = FLTACCESS;
 928  940                          break;
 929  941                  case FC_OBJERR:
 930  942                          if ((siginfo.si_errno = FC_ERRNO(res)) != EINTR) {
 931  943                                  siginfo.si_signo = SIGBUS;
 932  944                                  siginfo.si_code = BUS_OBJERR;
 933  945                                  fault = FLTACCESS;
 934  946                          }
 935  947                          break;
 936  948                  default:        /* FC_NOMAP or FC_PROT */
 937  949                          siginfo.si_signo = SIGSEGV;
 938  950                          siginfo.si_code =
 939  951                              (res == FC_NOMAP)? SEGV_MAPERR : SEGV_ACCERR;
 940  952                          fault = FLTBOUNDS;
 941  953                          break;
 942  954                  }
 943  955                  break;
 944  956  
 945  957          case T_ILLINST + USER:  /* invalid opcode fault */
 946  958                  /*
 947  959                   * If the syscall instruction is disabled due to LDT usage, a
 948  960                   * user program that attempts to execute it will trigger a #ud
 949  961                   * trap. Check for that case here. If this occurs on a CPU which
 950  962                   * doesn't even support syscall, the result of all of this will
 951  963                   * be to emulate that particular instruction.
 952  964                   */
 953  965                  if (p->p_ldt != NULL &&
 954  966                      ldt_rewrite_syscall(rp, p, X86FSET_ASYSC))
 955  967                          goto out;
 956  968  
 957  969  #ifdef __amd64
 958  970                  /*
 959  971                   * Emulate the LAHF and SAHF instructions if needed.
 960  972                   * See the instr_is_lsahf function for details.
 961  973                   */
 962  974                  if (p->p_model == DATAMODEL_LP64 &&
 963  975                      instr_is_lsahf((caddr_t)rp->r_pc, &instr)) {
 964  976                          emulate_lsahf(rp, instr);
 965  977                          goto out;
 966  978                  }
 967  979  #endif
 968  980  
 969  981                  /*FALLTHROUGH*/
 970  982  
 971  983                  if (tudebug)
 972  984                          showregs(type, rp, (caddr_t)0);
 973  985                  siginfo.si_signo = SIGILL;
 974  986                  siginfo.si_code  = ILL_ILLOPC;
 975  987                  siginfo.si_addr  = (caddr_t)rp->r_pc;
 976  988                  fault = FLTILL;
 977  989                  break;
 978  990  
 979  991          case T_ZERODIV + USER:          /* integer divide by zero */
 980  992                  if (tudebug && tudebugfpe)
 981  993                          showregs(type, rp, (caddr_t)0);
 982  994                  siginfo.si_signo = SIGFPE;
 983  995                  siginfo.si_code  = FPE_INTDIV;
 984  996                  siginfo.si_addr  = (caddr_t)rp->r_pc;
 985  997                  fault = FLTIZDIV;
 986  998                  break;
 987  999  
 988 1000          case T_OVFLW + USER:    /* integer overflow */
 989 1001                  if (tudebug && tudebugfpe)
 990 1002                          showregs(type, rp, (caddr_t)0);
 991 1003                  siginfo.si_signo = SIGFPE;
 992 1004                  siginfo.si_code  = FPE_INTOVF;
 993 1005                  siginfo.si_addr  = (caddr_t)rp->r_pc;
 994 1006                  fault = FLTIOVF;
 995 1007                  break;
 996 1008  
 997 1009          case T_NOEXTFLT + USER: /* math coprocessor not available */
 998 1010                  if (tudebug && tudebugfpe)
 999 1011                          showregs(type, rp, addr);
1000 1012                  if (fpnoextflt(rp)) {
1001 1013                          siginfo.si_signo = SIGILL;
1002 1014                          siginfo.si_code  = ILL_ILLOPC;
1003 1015                          siginfo.si_addr  = (caddr_t)rp->r_pc;
1004 1016                          fault = FLTILL;
1005 1017                  }
1006 1018                  break;
1007 1019  
1008 1020          case T_EXTOVRFLT:       /* extension overrun fault */
1009 1021                  /* check if we took a kernel trap on behalf of user */
1010 1022                  {
1011 1023                          extern  void ndptrap_frstor(void);
1012 1024                          if (rp->r_pc != (uintptr_t)ndptrap_frstor) {
1013 1025                                  sti(); /* T_EXTOVRFLT comes in via cmninttrap */
1014 1026                                  (void) die(type, rp, addr, cpuid);
1015 1027                          }
1016 1028                          type |= USER;
1017 1029                  }
1018 1030                  /*FALLTHROUGH*/
1019 1031          case T_EXTOVRFLT + USER:        /* extension overrun fault */
1020 1032                  if (tudebug && tudebugfpe)
1021 1033                          showregs(type, rp, addr);
1022 1034                  if (fpextovrflt(rp)) {
1023 1035                          siginfo.si_signo = SIGSEGV;
1024 1036                          siginfo.si_code  = SEGV_MAPERR;
1025 1037                          siginfo.si_addr  = (caddr_t)rp->r_pc;
1026 1038                          fault = FLTBOUNDS;
1027 1039                  }
1028 1040                  break;
1029 1041  
1030 1042          case T_EXTERRFLT:       /* x87 floating point exception pending */
1031 1043                  /* check if we took a kernel trap on behalf of user */
1032 1044                  {
1033 1045                          extern  void ndptrap_frstor(void);
1034 1046                          if (rp->r_pc != (uintptr_t)ndptrap_frstor) {
1035 1047                                  sti(); /* T_EXTERRFLT comes in via cmninttrap */
1036 1048                                  (void) die(type, rp, addr, cpuid);
1037 1049                          }
1038 1050                          type |= USER;
1039 1051                  }
1040 1052                  /*FALLTHROUGH*/
1041 1053  
1042 1054          case T_EXTERRFLT + USER: /* x87 floating point exception pending */
1043 1055                  if (tudebug && tudebugfpe)
1044 1056                          showregs(type, rp, addr);
1045 1057                  if (sicode = fpexterrflt(rp)) {
1046 1058                          siginfo.si_signo = SIGFPE;
1047 1059                          siginfo.si_code  = sicode;
1048 1060                          siginfo.si_addr  = (caddr_t)rp->r_pc;
1049 1061                          fault = FLTFPE;
1050 1062                  }
1051 1063                  break;
1052 1064  
1053 1065          case T_SIMDFPE + USER:          /* SSE and SSE2 exceptions */
1054 1066                  if (tudebug && tudebugsse)
1055 1067                          showregs(type, rp, addr);
1056 1068                  if (!is_x86_feature(x86_featureset, X86FSET_SSE) &&
1057 1069                      !is_x86_feature(x86_featureset, X86FSET_SSE2)) {
1058 1070                          /*
1059 1071                           * There are rumours that some user instructions
1060 1072                           * on older CPUs can cause this trap to occur; in
1061 1073                           * which case send a SIGILL instead of a SIGFPE.
1062 1074                           */
1063 1075                          siginfo.si_signo = SIGILL;
1064 1076                          siginfo.si_code  = ILL_ILLTRP;
1065 1077                          siginfo.si_addr  = (caddr_t)rp->r_pc;
1066 1078                          siginfo.si_trapno = type & ~USER;
1067 1079                          fault = FLTILL;
1068 1080                  } else if ((sicode = fpsimderrflt(rp)) != 0) {
1069 1081                          siginfo.si_signo = SIGFPE;
1070 1082                          siginfo.si_code = sicode;
1071 1083                          siginfo.si_addr = (caddr_t)rp->r_pc;
1072 1084                          fault = FLTFPE;
1073 1085                  }
1074 1086  
1075 1087                  sti();  /* The SIMD exception comes in via cmninttrap */
1076 1088                  break;
1077 1089  
1078 1090          case T_BPTFLT:  /* breakpoint trap */
1079 1091                  /*
1080 1092                   * Kernel breakpoint traps should only happen when kmdb is
1081 1093                   * active, and even then, it'll have interposed on the IDT, so
1082 1094                   * control won't get here.  If it does, we've hit a breakpoint
1083 1095                   * without the debugger, which is very strange, and very
1084 1096                   * fatal.
1085 1097                   */
1086 1098                  if (tudebug && tudebugbpt)
1087 1099                          showregs(type, rp, (caddr_t)0);
1088 1100  
1089 1101                  (void) die(type, rp, addr, cpuid);
1090 1102                  break;
1091 1103  
1092 1104          case T_SGLSTP: /* single step/hw breakpoint exception */
1093 1105  
1094 1106                  /* Now evaluate how we got here */
1095 1107                  if (lwp != NULL && (lwp->lwp_pcb.pcb_drstat & DR_SINGLESTEP)) {
1096 1108                          /*
1097 1109                           * i386 single-steps even through lcalls which
1098 1110                           * change the privilege level. So we take a trap at
1099 1111                           * the first instruction in privileged mode.
1100 1112                           *
1101 1113                           * Set a flag to indicate that upon completion of
1102 1114                           * the system call, deal with the single-step trap.
1103 1115                           *
1104 1116                           * The same thing happens for sysenter, too.
1105 1117                           */
1106 1118                          singlestep_twiddle = 0;
1107 1119                          if (rp->r_pc == (uintptr_t)sys_sysenter ||
1108 1120                              rp->r_pc == (uintptr_t)brand_sys_sysenter) {
1109 1121                                  singlestep_twiddle = 1;
1110 1122  #if defined(__amd64)
1111 1123                                  /*
1112 1124                                   * Since we are already on the kernel's
1113 1125                                   * %gs, on 64-bit systems the sysenter case
1114 1126                                   * needs to adjust the pc to avoid
1115 1127                                   * executing the swapgs instruction at the
1116 1128                                   * top of the handler.
1117 1129                                   */
1118 1130                                  if (rp->r_pc == (uintptr_t)sys_sysenter)
1119 1131                                          rp->r_pc = (uintptr_t)
1120 1132                                              _sys_sysenter_post_swapgs;
1121 1133                                  else
1122 1134                                          rp->r_pc = (uintptr_t)
1123 1135                                              _brand_sys_sysenter_post_swapgs;
1124 1136  #endif
1125 1137                          }
1126 1138  #if defined(__i386)
1127 1139                          else if (rp->r_pc == (uintptr_t)sys_call ||
1128 1140                              rp->r_pc == (uintptr_t)brand_sys_call) {
1129 1141                                  singlestep_twiddle = 1;
1130 1142                          }
1131 1143  #endif
1132 1144                          else {
1133 1145                                  /* not on sysenter/syscall; uregs available */
1134 1146                                  if (tudebug && tudebugbpt)
1135 1147                                          showregs(type, rp, (caddr_t)0);
1136 1148                          }
1137 1149                          if (singlestep_twiddle) {
1138 1150                                  rp->r_ps &= ~PS_T; /* turn off trace */
1139 1151                                  lwp->lwp_pcb.pcb_flags |= DEBUG_PENDING;
1140 1152                                  ct->t_post_sys = 1;
1141 1153                                  aston(curthread);
1142 1154                                  goto cleanup;
1143 1155                          }
1144 1156                  }
1145 1157                  /* XXX - needs review on debugger interface? */
1146 1158                  if (boothowto & RB_DEBUG)
1147 1159                          debug_enter((char *)NULL);
1148 1160                  else
1149 1161                          (void) die(type, rp, addr, cpuid);
1150 1162                  break;
1151 1163  
1152 1164          case T_NMIFLT:  /* NMI interrupt */
1153 1165                  printf("Unexpected NMI in system mode\n");
1154 1166                  goto cleanup;
1155 1167  
1156 1168          case T_NMIFLT + USER:   /* NMI interrupt */
1157 1169                  printf("Unexpected NMI in user mode\n");
1158 1170                  break;
1159 1171  
1160 1172          case T_GPFLT:   /* general protection violation */
1161 1173                  /*
1162 1174                   * Any #GP that occurs during an on_trap .. no_trap bracket
1163 1175                   * with OT_DATA_ACCESS or OT_SEGMENT_ACCESS protection,
1164 1176                   * or in a on_fault .. no_fault bracket, is forgiven
1165 1177                   * and we trampoline.  This protection is given regardless
1166 1178                   * of whether we are 32/64 bit etc - if a distinction is
1167 1179                   * required then define new on_trap protection types.
1168 1180                   *
1169 1181                   * On amd64, we can get a #gp from referencing addresses
1170 1182                   * in the virtual address hole e.g. from a copyin or in
1171 1183                   * update_sregs while updating user segment registers.
1172 1184                   *
1173 1185                   * On the 32-bit hypervisor we could also generate one in
1174 1186                   * mfn_to_pfn by reaching around or into where the hypervisor
1175 1187                   * lives which is protected by segmentation.
1176 1188                   */
1177 1189  
1178 1190                  /*
1179 1191                   * If we're under on_trap() protection (see <sys/ontrap.h>),
1180 1192                   * set ot_trap and trampoline back to the on_trap() call site
1181 1193                   * for OT_DATA_ACCESS or OT_SEGMENT_ACCESS.
1182 1194                   */
1183 1195                  if (ct->t_ontrap != NULL) {
1184 1196                          int ttype =  ct->t_ontrap->ot_prot &
1185 1197                              (OT_DATA_ACCESS | OT_SEGMENT_ACCESS);
1186 1198  
1187 1199                          if (ttype != 0) {
1188 1200                                  ct->t_ontrap->ot_trap |= ttype;
1189 1201                                  if (tudebug)
1190 1202                                          showregs(type, rp, (caddr_t)0);
1191 1203                                  rp->r_pc = ct->t_ontrap->ot_trampoline;
1192 1204                                  goto cleanup;
1193 1205                          }
1194 1206                  }
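
For context, the caller side of the on_trap()/no_trap() bracket that this
trampoline serves looks roughly like the sketch below.  This is a hedged
illustration, not code from this file: on_trap() behaves like setjmp(),
returning zero when the protection is armed and non-zero when a protected
trap trampolines back, and bad_ptr is a hypothetical pointer.

        on_trap_data_t otd;
        uint64_t val = 0;

        if (!on_trap(&otd, OT_DATA_ACCESS)) {
                /* protected: a #gp in here trampolines back to on_trap() */
                val = *(volatile uint64_t *)bad_ptr;
        }
        no_trap();      /* tear down the protection */
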
1195 1207  
1196 1208                  /*
1197 1209                   * If we're under lofault protection (copyin etc.),
1198 1210                   * longjmp back to lofault with an EFAULT.
1199 1211                   */
1200 1212                  if (ct->t_lofault) {
1201 1213                          /*
1202 1214                           * Fault is not resolvable, so just return to lofault
1203 1215                           */
1204 1216                          if (lodebug) {
1205 1217                                  showregs(type, rp, addr);
1206 1218                                  traceregs(rp);
1207 1219                          }
1208 1220                          rp->r_r0 = EFAULT;
1209 1221                          rp->r_pc = ct->t_lofault;
1210 1222                          goto cleanup;
1211 1223                  }
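
On the lofault side, copyin() and copyout() set t_lofault directly in
assembler; C code typically gets the same protection through the
on_fault()/no_fault() wrappers.  A minimal sketch (hedged; src, dst and
len are hypothetical):

        label_t ljb;
        int err = 0;

        if (!on_fault(&ljb)) {
                /* a fault in here returns control to on_fault(), non-zero */
                bcopy(src, dst, len);
        } else {
                err = EFAULT;
        }
        no_fault();
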
1212 1224  
1213 1225                  /*
1214 1226                   * We fall through to the next case, which repeats
1215 1227                   * the OT_SEGMENT_ACCESS check that we've already
1216 1228                   * done, so we'll always fall through to the
1217 1229                   * T_STKFLT case.
1218 1230                   */
1219 1231                  /*FALLTHROUGH*/
1220 1232          case T_SEGFLT:  /* segment not present fault */
1221 1233                  /*
1222 1234                   * One example of this is #NP in update_sregs while
1223 1235                   * attempting to update a user segment register
1224 1236                   * that points to a descriptor that is marked not
1225 1237                   * present.
1226 1238                   */
1227 1239                  if (ct->t_ontrap != NULL &&
1228 1240                      ct->t_ontrap->ot_prot & OT_SEGMENT_ACCESS) {
1229 1241                          ct->t_ontrap->ot_trap |= OT_SEGMENT_ACCESS;
1230 1242                          if (tudebug)
1231 1243                                  showregs(type, rp, (caddr_t)0);
1232 1244                          rp->r_pc = ct->t_ontrap->ot_trampoline;
1233 1245                          goto cleanup;
1234 1246                  }
1235 1247                  /*FALLTHROUGH*/
1236 1248          case T_STKFLT:  /* stack fault */
1237 1249          case T_TSSFLT:  /* invalid TSS fault */
1238 1250                  if (tudebug)
1239 1251                          showregs(type, rp, (caddr_t)0);
1240 1252                  if (kern_gpfault(rp))
1241 1253                          (void) die(type, rp, addr, cpuid);
1242 1254                  goto cleanup;
1243 1255  
1244 1256          /*
1245 1257           * ONLY 32-bit PROCESSES can USE a PRIVATE LDT! 64-bit apps
1246 1258           * should have no need for them, so we put a stop to it here.
1247 1259           *
1248 1260           * So: not-present fault is ONLY valid for 32-bit processes with
1249 1261           * a private LDT trying to do a system call. Emulate it.
1250 1262           *
1251 1263           * #gp fault is ONLY valid for 32-bit processes also, which DO NOT
1252 1264           * have a private LDT, and are trying to do a system call. Emulate it.
1253 1265           */
1254 1266  
1255 1267          case T_SEGFLT + USER:   /* segment not present fault */
1256 1268          case T_GPFLT + USER:    /* general protection violation */
1257 1269  #ifdef _SYSCALL32_IMPL
1258 1270                  if (p->p_model != DATAMODEL_NATIVE) {
1259 1271  #endif /* _SYSCALL32_IMPL */
1260 1272                  if (instr_is_lcall_syscall((caddr_t)rp->r_pc)) {
1261 1273                          if (type == T_SEGFLT + USER)
1262 1274                                  ASSERT(p->p_ldt != NULL);
1263 1275  
1264 1276                          if ((p->p_ldt == NULL && type == T_GPFLT + USER) ||
1265 1277                              type == T_SEGFLT + USER) {
1266 1278  
1267 1279                          /*
1268 1280                           * The user attempted a system call via the obsolete
1269 1281                           * call gate mechanism. Because the process doesn't have
1270 1282                           * an LDT (i.e. the ldtr contains 0), a #gp results.
1271 1283                           * Emulate the syscall here, just as we do above for a
1272 1284                           * #np trap.
1273 1285                           */
1274 1286  
1275 1287                          /*
1276 1288                           * Since this is a not-present trap, rp->r_pc points to
1277 1289                           * the trapping lcall instruction. We need to bump it
1278 1290                           * to the next insn so the app can continue on.
1279 1291                           */
1280 1292                          rp->r_pc += LCALLSIZE;
1281 1293                          lwp->lwp_regs = rp;
1282 1294  
1283 1295                          /*
1284 1296                           * Normally the microstate of the LWP is forced back to
1285 1297                           * LMS_USER by the syscall handlers. Emulate that
1286 1298                           * behavior here.
1287 1299                           */
1288 1300                          mstate = LMS_USER;
1289 1301  
1290 1302                          dosyscall();
1291 1303                          goto out;
1292 1304                          }
1293 1305                  }
1294 1306  #ifdef _SYSCALL32_IMPL
1295 1307                  }
1296 1308  #endif /* _SYSCALL32_IMPL */
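
The obsolete mechanism being emulated above is a far call through a call
gate.  The direct far-call encoding is seven bytes -- opcode 0x9a, a
32-bit offset, a 16-bit selector -- which is what advancing r_pc past the
instruction accounts for.  A hedged illustration of the byte sequence;
the gate selector shown is the historical Solaris/x86 value and is
illustrative only:

        /* lcall $0x27, $0x0 -- legacy Solaris/x86 system call sequence */
        static const uint8_t lcall_syscall[7] = {
                0x9a,                           /* lcall ptr16:32 opcode */
                0x00, 0x00, 0x00, 0x00,         /* 32-bit offset: 0 */
                0x27, 0x00                      /* 16-bit gate selector */
        };
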
1297 1309                  /*
1298 1310                   * If the current process is using a private LDT and the
1299 1311                   * trapping instruction is sysenter, the sysenter instruction
1300 1312                   * has been disabled on the CPU because it destroys segment
1301 1313                   * registers. If this is the case, rewrite the instruction to
1302 1314                   * be a safe system call and retry it. If this occurs on a CPU
1303 1315                   * which doesn't even support sysenter, the result of all of
1304 1316                   * this will be to emulate that particular instruction.
1305 1317                   */
1306 1318                  if (p->p_ldt != NULL &&
1307 1319                      ldt_rewrite_syscall(rp, p, X86FSET_SEP))
1308 1320                          goto out;
1309 1321  
1310 1322                  /*FALLTHROUGH*/
1311 1323  
1312 1324          case T_BOUNDFLT + USER: /* bound fault */
1313 1325          case T_STKFLT + USER:   /* stack fault */
1314 1326          case T_TSSFLT + USER:   /* invalid TSS fault */
1315 1327                  if (tudebug)
1316 1328                          showregs(type, rp, (caddr_t)0);
1317 1329                  siginfo.si_signo = SIGSEGV;
1318 1330                  siginfo.si_code  = SEGV_MAPERR;
1319 1331                  siginfo.si_addr  = (caddr_t)rp->r_pc;
1320 1332                  fault = FLTBOUNDS;
1321 1333                  break;
1322 1334  
1323 1335          case T_ALIGNMENT + USER:        /* user alignment error (486) */
1324 1336                  if (tudebug)
1325 1337                          showregs(type, rp, (caddr_t)0);
1326 1338                  bzero(&siginfo, sizeof (siginfo));
1327 1339                  siginfo.si_signo = SIGBUS;
1328 1340                  siginfo.si_code = BUS_ADRALN;
1329 1341                  siginfo.si_addr = (caddr_t)rp->r_pc;
1330 1342                  fault = FLTACCESS;
1331 1343                  break;
1332 1344  
1333 1345          case T_SGLSTP + USER: /* single step/hw breakpoint exception */
1334 1346                  if (tudebug && tudebugbpt)
1335 1347                          showregs(type, rp, (caddr_t)0);
1336 1348  
1337 1349                  /* Was it single-stepping? */
1338 1350                  if (lwp->lwp_pcb.pcb_drstat & DR_SINGLESTEP) {
1339 1351                          pcb_t *pcb = &lwp->lwp_pcb;
1340 1352  
1341 1353                          rp->r_ps &= ~PS_T;
1342 1354                          /*
1343 1355                           * If both NORMAL_STEP and WATCH_STEP are in effect,
1344 1356                           * give precedence to WATCH_STEP.  If neither is set,
1345 1357                           * user must have set the PS_T bit in %efl; treat this
1346 1358                           * as NORMAL_STEP.
1347 1359                           */
1348 1360                          if ((fault = undo_watch_step(&siginfo)) == 0 &&
1349 1361                              ((pcb->pcb_flags & NORMAL_STEP) ||
1350 1362                              !(pcb->pcb_flags & WATCH_STEP))) {
1351 1363                                  siginfo.si_signo = SIGTRAP;
1352 1364                                  siginfo.si_code = TRAP_TRACE;
1353 1365                                  siginfo.si_addr = (caddr_t)rp->r_pc;
1354 1366                                  fault = FLTTRACE;
1355 1367                          }
1356 1368                          pcb->pcb_flags &= ~(NORMAL_STEP|WATCH_STEP);
1357 1369                  }
1358 1370                  break;
1359 1371  
1360 1372          case T_BPTFLT + USER:   /* breakpoint trap */
1361 1373                  if (tudebug && tudebugbpt)
1362 1374                          showregs(type, rp, (caddr_t)0);
1363 1375                  /*
1364 1376                   * int 3 (the breakpoint instruction) leaves the pc referring
1365 1377                   * to the address one byte after the breakpointed address.
1366 1378                   * If the P_PR_BPTADJ flag has been set via /proc, we adjust
1367 1379                   * it back so it refers to the breakpointed address.
1368 1380                   */
1369 1381                  if (p->p_proc_flag & P_PR_BPTADJ)
1370 1382                          rp->r_pc--;
1371 1383                  siginfo.si_signo = SIGTRAP;
1372 1384                  siginfo.si_code  = TRAP_BRKPT;
1373 1385                  siginfo.si_addr  = (caddr_t)rp->r_pc;
1374 1386                  fault = FLTBPT;
1375 1387                  break;
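
A debugger opts into this adjustment through /proc by setting the
PR_BPTADJ mode bit on the process, after which the kernel backs r_pc up
over the one-byte int 3 as above.  A sketch of the control write
(hedged; ctlfd is a hypothetical descriptor open on /proc/<pid>/ctl):

        #include <procfs.h>
        #include <unistd.h>

        long ctl[2];

        ctl[0] = PCSET;         /* set a mode flag ... */
        ctl[1] = PR_BPTADJ;     /* ... namely breakpoint pc adjustment */
        (void) write(ctlfd, ctl, sizeof (ctl));
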
1376 1388  
1377 1389          case T_AST:
1378 1390                  /*
1379 1391                   * This occurs only after the cs register has been made to
1380 1392                   * look like a kernel selector, either through debugging or
1381 1393                   * possibly by functions like setcontext().  The thread is
1382 1394                   * about to cause a general protection fault at common_iret()
1383 1395                   * in locore.  We let that happen immediately instead of
1384 1396                   * doing the T_AST processing.
1385 1397                   */
1386 1398                  goto cleanup;
1387 1399  
1388 1400          case T_AST + USER:      /* profiling, resched, h/w error pseudo trap */
1389 1401                  if (lwp->lwp_pcb.pcb_flags & ASYNC_HWERR) {
1390 1402                          proc_t *p = ttoproc(curthread);
1391 1403                          extern void print_msg_hwerr(ctid_t ct_id, proc_t *p);
1392 1404  
1393 1405                          lwp->lwp_pcb.pcb_flags &= ~ASYNC_HWERR;
1394 1406                          print_msg_hwerr(p->p_ct_process->conp_contract.ct_id,
1395 1407                              p);
1396 1408                          contract_process_hwerr(p->p_ct_process, p);
1397 1409                          siginfo.si_signo = SIGKILL;
1398 1410                          siginfo.si_code = SI_NOINFO;
1399 1411                  } else if (lwp->lwp_pcb.pcb_flags & CPC_OVERFLOW) {
1400 1412                          lwp->lwp_pcb.pcb_flags &= ~CPC_OVERFLOW;
1401 1413                          if (kcpc_overflow_ast()) {
1402 1414                                  /*
1403 1415                                   * Signal performance counter overflow
1404 1416                                   */
1405 1417                                  if (tudebug)
1406 1418                                          showregs(type, rp, (caddr_t)0);
1407 1419                                  bzero(&siginfo, sizeof (siginfo));
1408 1420                                  siginfo.si_signo = SIGEMT;
1409 1421                                  siginfo.si_code = EMT_CPCOVF;
1410 1422                                  siginfo.si_addr = (caddr_t)rp->r_pc;
1411 1423                                  fault = FLTCPCOVF;
1412 1424                          }
1413 1425                  }
1414 1426  
1415 1427                  break;
1416 1428          }
1417 1429  
1418 1430          /*
1419 1431           * We can't get here from a system trap
1420 1432           */
1421 1433          ASSERT(type & USER);
1422 1434  
1423 1435          if (fault) {
1424 1436                  /* We took a fault so abort single step. */
1425 1437                  lwp->lwp_pcb.pcb_flags &= ~(NORMAL_STEP|WATCH_STEP);
1426 1438                  /*
1427 1439                   * Remember the fault and fault address
1428 1440                   * for real-time (SIGPROF) profiling.
1429 1441                   */
1430 1442                  lwp->lwp_lastfault = fault;
1431 1443                  lwp->lwp_lastfaddr = siginfo.si_addr;
1432 1444  
1433 1445                  DTRACE_PROC2(fault, int, fault, ksiginfo_t *, &siginfo);
1434 1446  
1435 1447                  /*
1436 1448                   * If a debugger has declared this fault to be an
1437 1449                   * event of interest, stop the lwp.  Otherwise just
1438 1450                   * deliver the associated signal.
1439 1451                   */
1440 1452                  if (siginfo.si_signo != SIGKILL &&
1441 1453                      prismember(&p->p_fltmask, fault) &&
1442 1454                      stop_on_fault(fault, &siginfo) == 0)
1443 1455                          siginfo.si_signo = 0;
1444 1456          }
1445 1457  
1446 1458          if (siginfo.si_signo)
1447 1459                  trapsig(&siginfo, (fault != FLTFPE && fault != FLTCPCOVF));
1448 1460  
1449 1461          if (lwp->lwp_oweupc)
1450 1462                  profil_tick(rp->r_pc);
1451 1463  
1452 1464          if (ct->t_astflag | ct->t_sig_check) {
1453 1465                  /*
1454 1466                   * Turn off the AST flag before checking all the conditions that
1455 1467                   * may have caused an AST.  This flag is on whenever a signal or
1456 1468                   * unusual condition should be handled after the next trap or
1457 1469                   * syscall.
1458 1470                   */
1459 1471                  astoff(ct);
1460 1472                  /*
1461 1473                   * If a single-step trap occurred on a syscall (see above)
1462 1474                   * recognize it now.  Do this before checking for signals
1463 1475                   * because deferred_singlestep_trap() may generate a SIGTRAP to
1464 1476                   * the LWP or may otherwise mark the LWP to call issig(FORREAL).
1465 1477                   */
1466 1478                  if (lwp->lwp_pcb.pcb_flags & DEBUG_PENDING)
1467 1479                          deferred_singlestep_trap((caddr_t)rp->r_pc);
1468 1480  
1469 1481                  ct->t_sig_check = 0;
1470 1482  
1471 1483                  /*
1472 1484                   * As in other code paths that check against TP_CHANGEBIND,
1473 1485                   * we perform the check first without p_lock held -- only
1474 1486                   * acquiring p_lock in the unlikely event that it is indeed
1475 1487                   * set.  This is safe because we are doing this after the
1476 1488                   * astoff(); if we are racing another thread setting
1477 1489                   * TP_CHANGEBIND on us, we will pick it up on a subsequent
1478 1490                   * lap through.
1479 1491                   */
1480 1492                  if (curthread->t_proc_flag & TP_CHANGEBIND) {
1481 1493                          mutex_enter(&p->p_lock);
1482 1494                          if (curthread->t_proc_flag & TP_CHANGEBIND) {
1483 1495                                  timer_lwpbind();
1484 1496                                  curthread->t_proc_flag &= ~TP_CHANGEBIND;
1485 1497                          }
1486 1498                          mutex_exit(&p->p_lock);
1487 1499                  }
1488 1500  
1489 1501                  /*
1490 1502                   * For kaio requests on the per-process poll queue
1491 1503                   * (aiop->aio_pollq), the AIO_POLL bit is set and the kernel
1492 1504                   * should copy out their result_t to user memory.  By copying
1493 1505                   * out the result_t, the user can poll on memory waiting
1494 1506                   * for the kaio request to complete.
1495 1507                   */
1496 1508                  if (p->p_aio)
1497 1509                          aio_cleanup(0);
1498 1510                  /*
1499 1511                   * If this LWP was asked to hold, call holdlwp(), which will
1500 1512                   * stop.  holdlwps() sets this up and calls pokelwps() which
1501 1513                   * sets the AST flag.
1502 1514                   *
1503 1515                   * Also check TP_EXITLWP, since this is used by fresh new LWPs
1504 1516                   * through lwp_rtt().  That flag is set if the lwp_create(2)
1505 1517                   * syscall failed after creating the LWP.
1506 1518                   */
1507 1519                  if (ISHOLD(p))
1508 1520                          holdlwp();
1509 1521  
1510 1522                  /*
1511 1523                   * All code that sets signals and makes ISSIG evaluate true must
1512 1524                   * set t_astflag afterwards.
1513 1525                   */
1514 1526                  if (ISSIG_PENDING(ct, lwp, p)) {
1515 1527                          if (issig(FORREAL))
1516 1528                                  psig();
1517 1529                          ct->t_sig_check = 1;
1518 1530                  }
1519 1531  
1520 1532                  if (ct->t_rprof != NULL) {
1521 1533                          realsigprof(0, 0, 0);
1522 1534                          ct->t_sig_check = 1;
1523 1535                  }
1524 1536  
1525 1537                  /*
1526 1538                   * /proc can't enable/disable the trace bit itself
1527 1539                   * because that could race with the call gate used by
1528 1540                   * system calls via "lcall". If that happened, an
1529 1541                   * invalid EFLAGS would result. prstep()/prnostep()
1530 1542                   * therefore schedule an AST for the purpose.
1531 1543                   */
1532 1544                  if (lwp->lwp_pcb.pcb_flags & REQUEST_STEP) {
1533 1545                          lwp->lwp_pcb.pcb_flags &= ~REQUEST_STEP;
1534 1546                          rp->r_ps |= PS_T;
1535 1547                  }
1536 1548                  if (lwp->lwp_pcb.pcb_flags & REQUEST_NOSTEP) {
1537 1549                          lwp->lwp_pcb.pcb_flags &= ~REQUEST_NOSTEP;
1538 1550                          rp->r_ps &= ~PS_T;
1539 1551                  }
1540 1552          }
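
The REQUEST_STEP handling above is the tail end of a /proc single-step
request: prstep() sets the flag and schedules the AST, and a debugger
typically gets there with a PCRUN control message carrying the PRSTEP
flag.  A sketch (hedged; ctlfd is a hypothetical descriptor open on the
target lwp's /proc control file):

        long ctl[2];

        ctl[0] = PCRUN;         /* set the stopped lwp running ... */
        ctl[1] = PRSTEP;        /* ... for a single instruction */
        (void) write(ctlfd, ctl, sizeof (ctl));
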
1541 1553  
1542 1554  out:    /* We can't get here from a system trap */
1543 1555          ASSERT(type & USER);
1544 1556  
1545 1557          if (ISHOLD(p))
1546 1558                  holdlwp();
1547 1559  
1548 1560          /*
1549 1561           * Set state to LWP_USER here so preempt won't give us a kernel
1550 1562           * priority if it occurs after this point.  Call CL_TRAPRET() to
1551 1563           * restore the user-level priority.
1552 1564           *
1553 1565           * It is important that no locks (other than spinlocks) be entered
1554 1566           * after this point before returning to user mode (unless lwp_state
1555 1567           * is set back to LWP_SYS).
1556 1568           */
1557 1569          lwp->lwp_state = LWP_USER;
1558 1570  
1559 1571          if (ct->t_trapret) {
1560 1572                  ct->t_trapret = 0;
1561 1573                  thread_lock(ct);
1562 1574                  CL_TRAPRET(ct);
1563 1575                  thread_unlock(ct);
1564 1576          }
1565 1577          if (CPU->cpu_runrun || curthread->t_schedflag & TS_ANYWAITQ)
1566 1578                  preempt();
1567 1579          prunstop();
1568 1580          (void) new_mstate(ct, mstate);
1569 1581  
1570 1582          /* Kernel probe */
1571 1583          TNF_PROBE_1(thread_state, "thread", /* CSTYLED */,
1572 1584              tnf_microstate, state, LMS_USER);
1573 1585  
1574 1586          return;
1575 1587  
1576 1588  cleanup:        /* system traps end up here */
1577 1589          ASSERT(!(type & USER));
1578 1590  }
1579 1591  
1580 1592  /*
1581 1593   * Patch non-zero to disable preemption of threads in the kernel.
1582 1594   */
1583 1595  int IGNORE_KERNEL_PREEMPTION = 0;       /* XXX - delete this someday */
1584 1596  
1585 1597  struct kpreempt_cnts {          /* kernel preemption statistics */
1586 1598          int     kpc_idle;       /* executing idle thread */
1587 1599          int     kpc_intr;       /* executing interrupt thread */
1588 1600          int     kpc_clock;      /* executing clock thread */
1589 1601          int     kpc_blocked;    /* thread has blocked preemption (t_preempt) */
1590 1602          int     kpc_notonproc;  /* thread is surrendering processor */
1591 1603          int     kpc_inswtch;    /* thread has ratified scheduling decision */
1592 1604          int     kpc_prilevel;   /* processor interrupt level is too high */
1593 1605          int     kpc_apreempt;   /* asynchronous preemption */
1594 1606          int     kpc_spreempt;   /* synchronous preemption */
1595 1607  } kpreempt_cnts;
1596 1608  
1597 1609  /*
1598 1610   * kernel preemption: forced rescheduling, preempt the running kernel thread.
1599 1611   *      the argument is old PIL for an interrupt,
1600 1612   *      or the distinguished value KPREEMPT_SYNC.
1601 1613   */
1602 1614  void
1603 1615  kpreempt(int asyncspl)
1604 1616  {
1605 1617          kthread_t *ct = curthread;
1606 1618  
1607 1619          if (IGNORE_KERNEL_PREEMPTION) {
1608 1620                  aston(CPU->cpu_dispthread);
1609 1621                  return;
1610 1622          }
1611 1623  
1612 1624          /*
1613 1625           * Check that conditions are right for kernel preemption
1614 1626           */
1615 1627          do {
1616 1628                  if (ct->t_preempt) {
1617 1629                          /*
1618 1630                           * This is either a privileged thread (idle,
1619 1631                           * panic, interrupt), or a thread that will
1620 1632                           * check again when t_preempt is lowered.
1621 1633                           * We need to specifically handle the case where
1622 1634                           * the thread is in the middle of swtch() (resume
1623 1635                           * has been called) with its t_preempt set --
1624 1636                           * e.g. the idle thread, or a thread already in
1625 1637                           * kpreempt -- and a high-priority thread then
1626 1638                           * becomes available on the local dispatch queue.
1627 1639                           * In that case the resumed thread needs to take
1628 1640                           * a trap so that it can call kpreempt; we
1629 1641                           * achieve this by using siron().
1630 1642                           * We detect this condition as follows: the idle
1631 1643                           * thread is running and is in the midst of resume
1632 1644                           * (curthread->t_pri == -1 && CPU->cpu_dispthread
1633 1645                           * != CPU->cpu_thread).  This can only happen at
1634 1646                           * high pil: resume is called at high pil, and
1635 1647                           * only resume_from_idle changes the pil.
1636 1648                           */
1637 1649                          if (ct->t_pri < 0) {
1638 1650                                  kpreempt_cnts.kpc_idle++;
1639 1651                                  if (CPU->cpu_dispthread != CPU->cpu_thread)
1640 1652                                          siron();
1641 1653                          } else if (ct->t_flag & T_INTR_THREAD) {
1642 1654                                  kpreempt_cnts.kpc_intr++;
1643 1655                                  if (ct->t_pil == CLOCK_LEVEL)
1644 1656                                          kpreempt_cnts.kpc_clock++;
1645 1657                          } else {
1646 1658                                  kpreempt_cnts.kpc_blocked++;
1647 1659                                  if (CPU->cpu_dispthread != CPU->cpu_thread)
1648 1660                                          siron();
1649 1661                          }
1650 1662                          aston(CPU->cpu_dispthread);
1651 1663                          return;
1652 1664                  }
1653 1665                  if (ct->t_state != TS_ONPROC ||
1654 1666                      ct->t_disp_queue != CPU->cpu_disp) {
1655 1667                          /* this thread will be calling swtch() shortly */
1656 1668                          kpreempt_cnts.kpc_notonproc++;
1657 1669                          if (CPU->cpu_thread != CPU->cpu_dispthread) {
1658 1670                                  /* already in swtch(), force another */
1659 1671                                  kpreempt_cnts.kpc_inswtch++;
1660 1672                                  siron();
1661 1673                          }
1662 1674                          return;
1663 1675                  }
1664 1676                  if (getpil() >= DISP_LEVEL) {
1665 1677                          /*
1666 1678                           * We can't preempt this thread if it is at
1667 1679                           * a PIL >= DISP_LEVEL since it may be holding
1668 1680                           * a spin lock (like sched_lock).
1669 1681                           */
1670 1682                          siron();        /* check back later */
1671 1683                          kpreempt_cnts.kpc_prilevel++;
1672 1684                          return;
1673 1685                  }
1674 1686                  if (!interrupts_enabled()) {
1675 1687                          /*
1676 1688                           * Can't preempt while running with ints disabled
1677 1689                           */
1678 1690                          kpreempt_cnts.kpc_prilevel++;
1679 1691                          return;
1680 1692                  }
1681 1693                  if (asyncspl != KPREEMPT_SYNC)
1682 1694                          kpreempt_cnts.kpc_apreempt++;
1683 1695                  else
1684 1696                          kpreempt_cnts.kpc_spreempt++;
1685 1697  
1686 1698                  ct->t_preempt++;
1687 1699                  preempt();
1688 1700                  ct->t_preempt--;
1689 1701          } while (CPU->cpu_kprunrun);
1690 1702  }
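
The t_preempt counter tested at the top of kpreempt() is what the
kpreempt_disable()/kpreempt_enable() pair manipulates.  A sketch of the
usual call-site pattern (hedged; the macro behavior is paraphrased in
the comments):

        kpreempt_disable();     /* bumps curthread->t_preempt */

        /*
         * Short region that must not be preempted or migrate off
         * this CPU, e.g. manipulation of per-CPU state.
         */

        kpreempt_enable();      /* drops t_preempt; if a preemption was
                                   deferred meanwhile (cpu_kprunrun set),
                                   this winds up calling
                                   kpreempt(KPREEMPT_SYNC) */
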
1691 1703  
1692 1704  /*
1693 1705   * Print out debugging info.
1694 1706   */
1695 1707  static void
1696 1708  showregs(uint_t type, struct regs *rp, caddr_t addr)
1697 1709  {
1698 1710          int s;
1699 1711  
1700 1712          s = spl7();
1701 1713          type &= ~USER;
1702 1714          if (PTOU(curproc)->u_comm[0])
1703 1715                  printf("%s: ", PTOU(curproc)->u_comm);
1704 1716          if (type < TRAP_TYPES)
1705 1717                  printf("#%s %s\n", trap_type_mnemonic[type], trap_type[type]);
1706 1718          else
1707 1719                  switch (type) {
1708 1720                  case T_SYSCALL:
1709 1721                          printf("Syscall Trap:\n");
1710 1722                          break;
1711 1723                  case T_AST:
1712 1724                          printf("AST\n");
1713 1725                          break;
1714 1726                  default:
1715 1727                          printf("Bad Trap = %d\n", type);
1716 1728                          break;
1717 1729                  }
1718 1730          if (type == T_PGFLT) {
1719 1731                  printf("Bad %s fault at addr=0x%lx\n",
1720 1732                      USERMODE(rp->r_cs) ? "user": "kernel", (uintptr_t)addr);
1721 1733          } else if (addr) {
1722 1734                  printf("addr=0x%lx\n", (uintptr_t)addr);
1723 1735          }
1724 1736  
1725 1737          printf("pid=%d, pc=0x%lx, sp=0x%lx, eflags=0x%lx\n",
1726 1738              (ttoproc(curthread) && ttoproc(curthread)->p_pidp) ?
1727 1739              ttoproc(curthread)->p_pid : 0, rp->r_pc, rp->r_sp, rp->r_ps);
1728 1740  
1729 1741  #if defined(__lint)
1730 1742          /*
1731 1743           * this clause can be deleted when lint bug 4870403 is fixed
1732 1744           * (lint thinks that bit 32 is illegal in a %b format string)
1733 1745           */
1734 1746          printf("cr0: %x cr4: %b\n",
1735 1747              (uint_t)getcr0(), (uint_t)getcr4(), FMT_CR4);
1736 1748  #else
1737 1749          printf("cr0: %b cr4: %b\n",
1738 1750              (uint_t)getcr0(), FMT_CR0, (uint_t)getcr4(), FMT_CR4);
1739 1751  #endif  /* __lint */
1740 1752  
1741 1753          printf("cr2: %lx", getcr2());
1742 1754  #if !defined(__xpv)
1743 1755          printf("cr3: %lx", getcr3());
1744 1756  #if defined(__amd64)
1745 1757          printf("cr8: %lx\n", getcr8());
1746 1758  #endif
1747 1759  #endif
1748 1760          printf("\n");
1749 1761  
1750 1762          dumpregs(rp);
1751 1763          splx(s);
1752 1764  }
1753 1765  
1754 1766  static void
1755 1767  dumpregs(struct regs *rp)
1756 1768  {
1757 1769  #if defined(__amd64)
1758 1770          const char fmt[] = "\t%3s: %16lx %3s: %16lx %3s: %16lx\n";
1759 1771  
1760 1772          printf(fmt, "rdi", rp->r_rdi, "rsi", rp->r_rsi, "rdx", rp->r_rdx);
1761 1773          printf(fmt, "rcx", rp->r_rcx, " r8", rp->r_r8, " r9", rp->r_r9);
1762 1774          printf(fmt, "rax", rp->r_rax, "rbx", rp->r_rbx, "rbp", rp->r_rbp);
1763 1775          printf(fmt, "r10", rp->r_r10, "r11", rp->r_r11, "r12", rp->r_r12);
1764 1776          printf(fmt, "r13", rp->r_r13, "r14", rp->r_r14, "r15", rp->r_r15);
1765 1777  
1766 1778          printf(fmt, "fsb", rdmsr(MSR_AMD_FSBASE), "gsb", rdmsr(MSR_AMD_GSBASE),
1767 1779              " ds", rp->r_ds);
1768 1780          printf(fmt, " es", rp->r_es, " fs", rp->r_fs, " gs", rp->r_gs);
1769 1781  
1770 1782          printf(fmt, "trp", rp->r_trapno, "err", rp->r_err, "rip", rp->r_rip);
1771 1783          printf(fmt, " cs", rp->r_cs, "rfl", rp->r_rfl, "rsp", rp->r_rsp);
1772 1784  
1773 1785          printf("\t%3s: %16lx\n", " ss", rp->r_ss);
1774 1786  
1775 1787  #elif defined(__i386)
1776 1788          const char fmt[] = "\t%3s: %8lx %3s: %8lx %3s: %8lx %3s: %8lx\n";
1777 1789  
1778 1790          printf(fmt, " gs", rp->r_gs, " fs", rp->r_fs,
1779 1791              " es", rp->r_es, " ds", rp->r_ds);
1780 1792          printf(fmt, "edi", rp->r_edi, "esi", rp->r_esi,
1781 1793              "ebp", rp->r_ebp, "esp", rp->r_esp);
1782 1794          printf(fmt, "ebx", rp->r_ebx, "edx", rp->r_edx,
1783 1795              "ecx", rp->r_ecx, "eax", rp->r_eax);
1784 1796          printf(fmt, "trp", rp->r_trapno, "err", rp->r_err,
1785 1797              "eip", rp->r_eip, " cs", rp->r_cs);
1786 1798          printf("\t%3s: %8lx %3s: %8lx %3s: %8lx\n",
1787 1799              "efl", rp->r_efl, "usp", rp->r_uesp, " ss", rp->r_ss);
1788 1800  
1789 1801  #endif  /* __i386 */
1790 1802  }
1791 1803  
1792 1804  /*
1793 1805   * Test to see if the instruction is iret on i386 or iretq on amd64.
1794 1806   *
1795 1807   * On the hypervisor we can only test for nopop_sys_rtt_syscall. If true
1796 1808   * then we are in the context of hypervisor's failsafe handler because it
1797 1809   * tried to iret and failed due to a bad selector. See xen_failsafe_callback.
1798 1810   */
1799 1811  static int
1800 1812  instr_is_iret(caddr_t pc)
1801 1813  {
1802 1814  
1803 1815  #if defined(__xpv)
1804 1816          extern void nopop_sys_rtt_syscall(void);
1805 1817          return ((pc == (caddr_t)nopop_sys_rtt_syscall) ? 1 : 0);
1806 1818  
1807 1819  #else
1808 1820  
1809 1821  #if defined(__amd64)
1810 1822          static const uint8_t iret_insn[2] = { 0x48, 0xcf };     /* iretq */
1811 1823  
1812 1824  #elif defined(__i386)
1813 1825          static const uint8_t iret_insn[1] = { 0xcf };           /* iret */
1814 1826  #endif  /* __i386 */
1815 1827          return (bcmp(pc, iret_insn, sizeof (iret_insn)) == 0);
1816 1828  
1817 1829  #endif  /* __xpv */
1818 1830  }
1819 1831  
1820 1832  #if defined(__i386)
1821 1833  
1822 1834  /*
1823 1835   * Test to see if the instruction is part of __SEGREGS_POP
1824 1836   *
1825 1837   * Note carefully the appallingly awful dependency between
1826 1838   * the instruction sequence used in __SEGREGS_POP and these
1827 1839   * instructions encoded here.
1828 1840   */
1829 1841  static int
1830 1842  instr_is_segregs_pop(caddr_t pc)
1831 1843  {
1832 1844          static const uint8_t movw_0_esp_gs[4] = { 0x8e, 0x6c, 0x24, 0x0 };
1833 1845          static const uint8_t movw_4_esp_fs[4] = { 0x8e, 0x64, 0x24, 0x4 };
1834 1846          static const uint8_t movw_8_esp_es[4] = { 0x8e, 0x44, 0x24, 0x8 };
1835 1847          static const uint8_t movw_c_esp_ds[4] = { 0x8e, 0x5c, 0x24, 0xc };
1836 1848  
1837 1849          if (bcmp(pc, movw_0_esp_gs, sizeof (movw_0_esp_gs)) == 0 ||
1838 1850              bcmp(pc, movw_4_esp_fs, sizeof (movw_4_esp_fs)) == 0 ||
1839 1851              bcmp(pc, movw_8_esp_es, sizeof (movw_8_esp_es)) == 0 ||
1840 1852              bcmp(pc, movw_c_esp_ds, sizeof (movw_c_esp_ds)) == 0)
1841 1853                  return (1);
1842 1854  
1843 1855          return (0);
1844 1856  }
1845 1857  
1846 1858  #endif  /* __i386 */
1847 1859  
1848 1860  /*
1849 1861   * Test to see if the instruction is part of _sys_rtt.
1850 1862   *
1851 1863   * Again, on the hypervisor, if we try to IRET to user land with a bad
1852 1864   * code or stack selector, we will get vectored through
1853 1865   * xen_failsafe_callback, in which case we assume we got here via
1854 1866   * _sys_rtt, since that is the only place we allow an IRET to user land.
1855 1867   */
1856 1868  static int
1857 1869  instr_is_sys_rtt(caddr_t pc)
1858 1870  {
1859 1871          extern void _sys_rtt(), _sys_rtt_end();
1860 1872  
1861 1873          if ((uintptr_t)pc < (uintptr_t)_sys_rtt ||
1862 1874              (uintptr_t)pc > (uintptr_t)_sys_rtt_end)
1863 1875                  return (0);
1864 1876  
1865 1877          return (1);
1866 1878  }
1867 1879  
1868 1880  /*
1869 1881   * Handle #gp faults in kernel mode.
1870 1882   *
1871 1883   * One legitimate way this can happen is if we attempt to update segment
1872 1884   * registers to naughty values on the way out of the kernel.
1873 1885   *
1874 1886   * This can happen in a couple of ways: someone - either accidentally or
1875 1887   * on purpose - creates (setcontext(2), lwp_create(2)) or modifies
1876 1888   * (signal(2)) a ucontext that contains silly segment register values.
1877 1889   * Or someone - either accidentally or on purpose - modifies the prgregset_t
1878 1890   * of a subject process via /proc to contain silly segment register values.
1879 1891   *
1880 1892   * (The unfortunate part is that we can end up discovering the bad segment
1881 1893   * register value in the middle of an 'iret' after we've popped most of the
1882 1894   * stack.  So it becomes quite difficult to associate an accurate ucontext
1883 1895   * with the lwp, because the act of taking the #gp trap overwrites most of
1884 1896   * what we were going to send the lwp.)
1885 1897   *
1886 1898   * OTOH if it turns out that's -not- the problem, and we're -not- an lwp
1887 1899   * trying to return to user mode and we get a #gp fault, then we need
1888 1900   * to die() -- which will happen if we return non-zero from this routine.
1889 1901   */
1890 1902  static int
1891 1903  kern_gpfault(struct regs *rp)
1892 1904  {
1893 1905          kthread_t *t = curthread;
1894 1906          proc_t *p = ttoproc(t);
1895 1907          klwp_t *lwp = ttolwp(t);
1896 1908          struct regs tmpregs, *trp = NULL;
1897 1909          caddr_t pc = (caddr_t)rp->r_pc;
1898 1910          int v;
1899 1911          uint32_t auditing = AU_AUDITING();
1900 1912  
1901 1913          /*
1902 1914           * If we're not an lwp, or if (in the case of running native)
1903 1915           * the pc is outside _sys_rtt, then we should immediately
1904 1916           * be die()ing horribly.
1905 1917           */
1906 1918          if (lwp == NULL || !instr_is_sys_rtt(pc))
1907 1919                  return (1);
1908 1920  
1909 1921          /*
1910 1922           * So at least we're in the right part of the kernel.
1911 1923           *
1912 1924           * Disassemble the instruction at the faulting pc.
1913 1925           * Once we know what it is, we carefully reconstruct the stack
1914 1926           * based on the order in which the stack is deconstructed in
1915 1927           * _sys_rtt. Ew.
1916 1928           */
1917 1929          if (instr_is_iret(pc)) {
1918 1930                  /*
1919 1931                   * We took the #gp while trying to perform the IRET.
1920 1932                   * This means that either %cs or %ss are bad.
1921 1933                   * All we know for sure is that most of the general
1922 1934                   * registers have been restored, including the
1923 1935                   * segment registers, and all we have left on the
1924 1936                   * topmost part of the lwp's stack are the
1925 1937                   * registers that the iretq was unable to consume.
1926 1938                   *
1927 1939                   * All the rest of the state was crushed by the #gp
1928 1940                   * which pushed -its- registers atop our old save area
1929 1941                   * (because we had to decrement the stack pointer, sigh) so
1930 1942                   * all that we can try and do is to reconstruct the
1931 1943                   * crushed frame from the #gp trap frame itself.
1932 1944                   */
1933 1945                  trp = &tmpregs;
1934 1946                  trp->r_ss = lwptoregs(lwp)->r_ss;
1935 1947                  trp->r_sp = lwptoregs(lwp)->r_sp;
1936 1948                  trp->r_ps = lwptoregs(lwp)->r_ps;
1937 1949                  trp->r_cs = lwptoregs(lwp)->r_cs;
1938 1950                  trp->r_pc = lwptoregs(lwp)->r_pc;
1939 1951                  bcopy(rp, trp, offsetof(struct regs, r_pc));
1940 1952  
1941 1953                  /*
1942 1954                   * Validate simple math
1943 1955                   */
1944 1956                  ASSERT(trp->r_pc == lwptoregs(lwp)->r_pc);
1945 1957                  ASSERT(trp->r_err == rp->r_err);
1946 1958  
1947 1959  
1948 1960  
1949 1961          }
1950 1962  
1951 1963  #if defined(__amd64)
1952 1964          if (trp == NULL && lwp->lwp_pcb.pcb_rupdate != 0) {
1953 1965  
1954 1966                  /*
1955 1967                   * This is the common case -- we're trying to load
1956 1968                   * a bad segment register value in the only section
1957 1969                   * of kernel code that ever loads segment registers.
1958 1970                   *
1959 1971                   * We don't need to do anything at this point because
1960 1972                   * the pcb contains all the pending segment register
1961 1973                   * state, and the regs are still intact because we
1962 1974                   * didn't adjust the stack pointer yet.  Given the fidelity
1963 1975                   * of all this, we could conceivably send a signal
1964 1976                   * to the lwp, rather than core-ing.
1965 1977                   */
1966 1978                  trp = lwptoregs(lwp);
1967 1979                  ASSERT((caddr_t)trp == (caddr_t)rp->r_sp);
1968 1980          }
1969 1981  
1970 1982  #elif defined(__i386)
1971 1983  
1972 1984          if (trp == NULL && instr_is_segregs_pop(pc))
1973 1985                  trp = lwptoregs(lwp);
1974 1986  
1975 1987  #endif  /* __i386 */
1976 1988  
1977 1989          if (trp == NULL)
1978 1990                  return (1);
1979 1991  
1980 1992          /*
1981 1993           * If we get to here, we're reasonably confident that we've
1982 1994           * correctly decoded what happened on the way out of the kernel.
1983 1995           * Rewrite the lwp's registers so that we can create a core dump
1984 1996           * that (at least vaguely) represents the mcontext we were
1985 1997           * being asked to restore when things went so terribly wrong.
1986 1998           */
1987 1999  
1988 2000          /*
1989 2001           * Make sure that we have a meaningful %trapno and %err.
1990 2002           */
1991 2003          trp->r_trapno = rp->r_trapno;
1992 2004          trp->r_err = rp->r_err;
1993 2005  
1994 2006          if ((caddr_t)trp != (caddr_t)lwptoregs(lwp))
1995 2007                  bcopy(trp, lwptoregs(lwp), sizeof (*trp));
1996 2008  
1997 2009  
1998 2010          mutex_enter(&p->p_lock);
1999 2011          lwp->lwp_cursig = SIGSEGV;
2000 2012          mutex_exit(&p->p_lock);
2001 2013  
2002 2014          /*
2003 2015           * Terminate all LWPs but don't discard them.  If another lwp beat
2004 2016           * us to the punch by calling exit(), evaporate now.
2005 2017           */
2006 2018          proc_is_exiting(p);
2007 2019          if (exitlwps(1) != 0) {
2008 2020                  mutex_enter(&p->p_lock);
2009 2021                  lwp_exit();
2010 2022          }
2011 2023  
2012 2024          if (auditing)           /* audit core dump */
2013 2025                  audit_core_start(SIGSEGV);
2014 2026          v = core(SIGSEGV, B_FALSE);
2015 2027          if (auditing)           /* audit core dump */
2016 2028                  audit_core_finish(v ? CLD_KILLED : CLD_DUMPED);
2017 2029          exit(v ? CLD_KILLED : CLD_DUMPED, SIGSEGV);
2018 2030          return (0);
2019 2031  }
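
One user-land way to land in kern_gpfault() is exactly the setcontext(2)
scenario described above: restoring a ucontext whose %cs is garbage makes
the final iret in _sys_rtt fault, and this routine turns that into a
SIGSEGV core dump.  A hedged illustration (the selector value is
arbitrary junk):

        #include <ucontext.h>

        int
        main(void)
        {
                ucontext_t uc;

                (void) getcontext(&uc);
                uc.uc_mcontext.gregs[REG_CS] = 0x48;    /* silly selector */
                (void) setcontext(&uc);  /* kernel takes #gp on the iret */
                return (0);              /* not reached */
        }
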
2020 2032  
2021 2033  /*
2022 2034   * dump_tss() - Display the TSS structure
2023 2035   */
2024 2036  
2025 2037  #if !defined(__xpv)
2026 2038  #if defined(__amd64)
2027 2039  
2028 2040  static void
2029 2041  dump_tss(void)
2030 2042  {
2031 2043          const char tss_fmt[] = "tss.%s:\t0x%p\n";  /* Format string */
2032 2044          tss_t *tss = CPU->cpu_tss;
2033 2045  
2034 2046          printf(tss_fmt, "tss_rsp0", (void *)tss->tss_rsp0);
2035 2047          printf(tss_fmt, "tss_rsp1", (void *)tss->tss_rsp1);
2036 2048          printf(tss_fmt, "tss_rsp2", (void *)tss->tss_rsp2);
2037 2049  
2038 2050          printf(tss_fmt, "tss_ist1", (void *)tss->tss_ist1);
2039 2051          printf(tss_fmt, "tss_ist2", (void *)tss->tss_ist2);
2040 2052          printf(tss_fmt, "tss_ist3", (void *)tss->tss_ist3);
2041 2053          printf(tss_fmt, "tss_ist4", (void *)tss->tss_ist4);
2042 2054          printf(tss_fmt, "tss_ist5", (void *)tss->tss_ist5);
2043 2055          printf(tss_fmt, "tss_ist6", (void *)tss->tss_ist6);
2044 2056          printf(tss_fmt, "tss_ist7", (void *)tss->tss_ist7);
2045 2057  }
2046 2058  
2047 2059  #elif defined(__i386)
2048 2060  
2049 2061  static void
2050 2062  dump_tss(void)
2051 2063  {
2052 2064          const char tss_fmt[] = "tss.%s:\t0x%p\n";  /* Format string */
2053 2065          tss_t *tss = CPU->cpu_tss;
2054 2066  
2055 2067          printf(tss_fmt, "tss_link", (void *)(uintptr_t)tss->tss_link);
2056 2068          printf(tss_fmt, "tss_esp0", (void *)(uintptr_t)tss->tss_esp0);
2057 2069          printf(tss_fmt, "tss_ss0", (void *)(uintptr_t)tss->tss_ss0);
2058 2070          printf(tss_fmt, "tss_esp1", (void *)(uintptr_t)tss->tss_esp1);
2059 2071          printf(tss_fmt, "tss_ss1", (void *)(uintptr_t)tss->tss_ss1);
2060 2072          printf(tss_fmt, "tss_esp2", (void *)(uintptr_t)tss->tss_esp2);
2061 2073          printf(tss_fmt, "tss_ss2", (void *)(uintptr_t)tss->tss_ss2);
2062 2074          printf(tss_fmt, "tss_cr3", (void *)(uintptr_t)tss->tss_cr3);
2063 2075          printf(tss_fmt, "tss_eip", (void *)(uintptr_t)tss->tss_eip);
2064 2076          printf(tss_fmt, "tss_eflags", (void *)(uintptr_t)tss->tss_eflags);
2065 2077          printf(tss_fmt, "tss_eax", (void *)(uintptr_t)tss->tss_eax);
2066 2078          printf(tss_fmt, "tss_ebx", (void *)(uintptr_t)tss->tss_ebx);
2067 2079          printf(tss_fmt, "tss_ecx", (void *)(uintptr_t)tss->tss_ecx);
2068 2080          printf(tss_fmt, "tss_edx", (void *)(uintptr_t)tss->tss_edx);
2069 2081          printf(tss_fmt, "tss_esp", (void *)(uintptr_t)tss->tss_esp);
2070 2082  }
2071 2083  
2072 2084  #endif  /* __amd64 */
2073 2085  #endif  /* !__xpv */
2074 2086  
2075 2087  #if defined(TRAPTRACE)
2076 2088  
2077 2089  int ttrace_nrec = 10;           /* number of records to dump out */
2078 2090  int ttrace_dump_nregs = 0;      /* dump out this many records with regs too */
2079 2091  
2080 2092  /*
2081 2093   * Dump out the last ttrace_nrec traptrace records on each CPU
2082 2094   */
2083 2095  static void
2084 2096  dump_ttrace(void)
2085 2097  {
2086 2098          trap_trace_ctl_t *ttc;
2087 2099          trap_trace_rec_t *rec;
2088 2100          uintptr_t current;
2089 2101          int i, j, k;
2090 2102          int n = NCPU;
2091 2103  #if defined(__amd64)
2092 2104          const char banner[] =
2093 2105              "\ncpu          address    timestamp "
2094 2106              "type  vc  handler   pc\n";
2095 2107          const char fmt1[] = "%3d %016lx %12llx ";
2096 2108  #elif defined(__i386)
2097 2109          const char banner[] =
2098 2110              "\ncpu  address     timestamp type  vc  handler   pc\n";
2099 2111          const char fmt1[] = "%3d %08lx %12llx ";
2100 2112  #endif
2101 2113          const char fmt2[] = "%4s %3x ";
2102 2114          const char fmt3[] = "%8s ";
2103 2115  
2104 2116          if (ttrace_nrec == 0)
2105 2117                  return;
2106 2118  
2107 2119          printf(banner);
2108 2120  
2109 2121          for (i = 0; i < n; i++) {
2110 2122                  ttc = &trap_trace_ctl[i];
2111 2123                  if (ttc->ttc_first == NULL)
2112 2124                          continue;
2113 2125  
2114 2126                  current = ttc->ttc_next - sizeof (trap_trace_rec_t);
2115 2127                  for (j = 0; j < ttrace_nrec; j++) {
2116 2128                          struct sysent   *sys;
2117 2129                          struct autovec  *vec;
2118 2130                          extern struct av_head autovect[];
2119 2131                          int type;
2120 2132                          ulong_t off;
2121 2133                          char *sym, *stype;
2122 2134  
2123 2135                          if (current < ttc->ttc_first)
2124 2136                                  current =
2125 2137                                      ttc->ttc_limit - sizeof (trap_trace_rec_t);
2126 2138  
2127 2139                          if (current == NULL)
2128 2140                                  continue;
2129 2141  
2130 2142                          rec = (trap_trace_rec_t *)current;
2131 2143  
2132 2144                          if (rec->ttr_stamp == 0)
2133 2145                                  break;
2134 2146  
2135 2147                          printf(fmt1, i, (uintptr_t)rec, rec->ttr_stamp);
2136 2148  
2137 2149                          switch (rec->ttr_marker) {
2138 2150                          case TT_SYSCALL:
2139 2151                          case TT_SYSENTER:
2140 2152                          case TT_SYSC:
2141 2153                          case TT_SYSC64:
2142 2154  #if defined(__amd64)
2143 2155                                  sys = &sysent32[rec->ttr_sysnum];
2144 2156                                  switch (rec->ttr_marker) {
2145 2157                                  case TT_SYSC64:
2146 2158                                          sys = &sysent[rec->ttr_sysnum];
2147 2159                                          /*FALLTHROUGH*/
2148 2160  #elif defined(__i386)
2149 2161                                  sys = &sysent[rec->ttr_sysnum];
2150 2162                                  switch (rec->ttr_marker) {
2151 2163                                  case TT_SYSC64:
2152 2164  #endif
2153 2165                                  case TT_SYSC:
2154 2166                                          stype = "sysc"; /* syscall */
2155 2167                                          break;
2156 2168                                  case TT_SYSCALL:
2157 2169                                          stype = "lcal"; /* lcall */
2158 2170                                          break;
2159 2171                                  case TT_SYSENTER:
2160 2172                                          stype = "syse"; /* sysenter */
2161 2173                                          break;
2162 2174                                  default:
2163 2175                                          break;
2164 2176                                  }
2165 2177                                  printf(fmt2, "sysc", rec->ttr_sysnum);
2166 2178                                  if (sys != NULL) {
2167 2179                                          sym = kobj_getsymname(
2168 2180                                              (uintptr_t)sys->sy_callc,
2169 2181                                              &off);
2170 2182                                          if (sym != NULL)
2171 2183                                                  printf(fmt3, sym);
2172 2184                                          else
2173 2185                                                  printf("%p ", sys->sy_callc);
2174 2186                                  } else {
2175 2187                                          printf(fmt3, "unknown");
2176 2188                                  }
2177 2189                                  break;
2178 2190  
2179 2191                          case TT_INTERRUPT:
2180 2192                                  printf(fmt2, "intr", rec->ttr_vector);
2181 2193                                  if (get_intr_handler != NULL)
2182 2194                                          vec = (struct autovec *)
2183 2195                                              (*get_intr_handler)
2184 2196                                              (rec->ttr_cpuid, rec->ttr_vector);
2185 2197                                  else
2186 2198                                          vec =
2187 2199                                              autovect[rec->ttr_vector].avh_link;
2188 2200  
2189 2201                                  if (vec != NULL) {
2190 2202                                          sym = kobj_getsymname(
2191 2203                                              (uintptr_t)vec->av_vector, &off);
2192 2204                                          if (sym != NULL)
2193 2205                                                  printf(fmt3, sym);
2194 2206                                          else
2195 2207                                                  printf("%p ", vec->av_vector);
2196 2208                                  } else {
2197 2209                                          printf(fmt3, "unknown ");
2198 2210                                  }
2199 2211                                  break;
2200 2212  
2201 2213                          case TT_TRAP:
2202 2214                          case TT_EVENT:
2203 2215                                  type = rec->ttr_regs.r_trapno;
2204 2216                                  printf(fmt2, "trap", type);
2205 2217                                  if (type < TRAP_TYPES)
2206 2218                                          printf("     #%s ",
2207 2219                                              trap_type_mnemonic[type]);
2208 2220                                  else
2209 2221                                          switch (type) {
2210 2222                                          case T_AST:
2211 2223                                                  printf(fmt3, "ast");
2212 2224                                                  break;
2213 2225                                          default:
2214 2226                                                  printf(fmt3, "");
2215 2227                                                  break;
2216 2228                                          }
2217 2229                                  break;
2218 2230  
2219 2231                          default:
2220 2232                                  break;
2221 2233                          }
2222 2234  
2223 2235                          sym = kobj_getsymname(rec->ttr_regs.r_pc, &off);
2224 2236                          if (sym != NULL)
2225 2237                                  printf("%s+%lx\n", sym, off);
2226 2238                          else
2227 2239                                  printf("%lx\n", rec->ttr_regs.r_pc);
2228 2240  
2229 2241                          if (ttrace_dump_nregs-- > 0) {
2230 2242                                  int s;
2231 2243  
2232 2244                                  if (rec->ttr_marker == TT_INTERRUPT)
2233 2245                                          printf(
2234 2246                                              "\t\tipl %x spl %x pri %x\n",
2235 2247                                              rec->ttr_ipl,
2236 2248                                              rec->ttr_spl,
2237 2249                                              rec->ttr_pri);
2238 2250  
2239 2251                                  dumpregs(&rec->ttr_regs);
2240 2252  
2241 2253                                  printf("\t%3s: %p\n\n", " ct",
2242 2254                                      (void *)rec->ttr_curthread);
2243 2255  
2244 2256                                  /*
2245 2257                                   * print out the pc stack that we recorded
2246 2258                                   * at trap time (if any)
2247 2259                                   */
2248 2260                                  for (s = 0; s < rec->ttr_sdepth; s++) {
2249 2261                                          uintptr_t fullpc;
2250 2262  
2251 2263                                          if (s >= TTR_STACK_DEPTH) {
2252 2264                                                  printf("ttr_sdepth corrupt\n");
2253 2265                                                  break;
2254 2266                                          }
2255 2267  
2256 2268                                          fullpc = (uintptr_t)rec->ttr_stack[s];
2257 2269  
2258 2270                                          sym = kobj_getsymname(fullpc, &off);
2259 2271                                          if (sym != NULL)
2260 2272                                                  printf("-> %s+0x%lx()\n",
2261 2273                                                      sym, off);
2262 2274                                          else
2263 2275                                                  printf("-> 0x%lx()\n", fullpc);
2264 2276                                  }
2265 2277                                  printf("\n");
2266 2278                          }
2267 2279                          current -= sizeof (trap_trace_rec_t);
2268 2280                  }
2269 2281          }
2270 2282  }
2271 2283  
2272 2284  #endif  /* TRAPTRACE */
2273 2285  
2274 2286  void
2275 2287  panic_showtrap(struct panic_trap_info *tip)
2276 2288  {
2277 2289          showregs(tip->trap_type, tip->trap_regs, tip->trap_addr);
2278 2290  
2279 2291  #if defined(TRAPTRACE)
2280 2292          dump_ttrace();
2281 2293  #endif
2282 2294  
2283 2295  #if !defined(__xpv)
2284 2296          if (tip->trap_type == T_DBLFLT)
2285 2297                  dump_tss();
2286 2298  #endif
2287 2299  }
2288 2300  
2289 2301  void
2290 2302  panic_savetrap(panic_data_t *pdp, struct panic_trap_info *tip)
2291 2303  {
2292 2304          panic_saveregs(pdp, tip->trap_regs);
2293 2305  }
  