          --- old/usr/src/uts/i86pc/os/trap.c
          +++ new/usr/src/uts/i86pc/os/trap.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   */
  25   25  
  26   26  /*      Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
  27   27  /*      Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T   */
  28   28  /*              All Rights Reserved                             */
  29   29  /*                                                              */
  30   30  /*      Copyright (c) 1987, 1988 Microsoft Corporation          */
  31   31  /*              All Rights Reserved                             */
  32   32  /*                                                              */
  33   33  
  34   34  /*
  35   35   * Copyright 2015 Joyent, Inc.
  36   36   */
  37   37  
  38   38  #include <sys/types.h>
  39   39  #include <sys/sysmacros.h>
  40   40  #include <sys/param.h>
  41   41  #include <sys/signal.h>
  42   42  #include <sys/systm.h>
  43   43  #include <sys/user.h>
  44   44  #include <sys/proc.h>
  45   45  #include <sys/disp.h>
  46   46  #include <sys/class.h>
  47   47  #include <sys/core.h>
  48   48  #include <sys/syscall.h>
  49   49  #include <sys/cpuvar.h>
  50   50  #include <sys/vm.h>
  51   51  #include <sys/sysinfo.h>
  52   52  #include <sys/fault.h>
  53   53  #include <sys/stack.h>
  54   54  #include <sys/psw.h>
  55   55  #include <sys/regset.h>
  56   56  #include <sys/fp.h>
  57   57  #include <sys/trap.h>
  58   58  #include <sys/kmem.h>
  59   59  #include <sys/vtrace.h>
  60   60  #include <sys/cmn_err.h>
  61   61  #include <sys/prsystm.h>
  62   62  #include <sys/mutex_impl.h>
  63   63  #include <sys/machsystm.h>
  64   64  #include <sys/archsystm.h>
  65   65  #include <sys/sdt.h>
  66   66  #include <sys/avintr.h>
  67   67  #include <sys/kobj.h>
  68   68  
  69   69  #include <vm/hat.h>
  70   70  
  71   71  #include <vm/seg_kmem.h>
  72   72  #include <vm/as.h>
  73   73  #include <vm/seg.h>
  74   74  #include <vm/hat_pte.h>
  75   75  #include <vm/hat_i86.h>
  76   76  
  77   77  #include <sys/procfs.h>
  78   78  
  79   79  #include <sys/reboot.h>
  80   80  #include <sys/debug.h>
  81   81  #include <sys/debugreg.h>
  82   82  #include <sys/modctl.h>
  83   83  #include <sys/aio_impl.h>
  84   84  #include <sys/tnf.h>
  85   85  #include <sys/tnf_probe.h>
  86   86  #include <sys/cred.h>
  87   87  #include <sys/mman.h>
  88   88  #include <sys/x86_archext.h>
  89   89  #include <sys/copyops.h>
  90   90  #include <c2/audit.h>
  91   91  #include <sys/ftrace.h>
  92   92  #include <sys/panic.h>
  93   93  #include <sys/traptrace.h>
  94   94  #include <sys/ontrap.h>
  95   95  #include <sys/cpc_impl.h>
  96   96  #include <sys/bootconf.h>
  97   97  #include <sys/bootinfo.h>
  98   98  #include <sys/promif.h>
  99   99  #include <sys/mach_mmu.h>
 100  100  #if defined(__xpv)
 101  101  #include <sys/hypervisor.h>
 102  102  #endif
 103  103  #include <sys/contract/process_impl.h>
 104  104  #include <sys/brand.h>
 105  105  
 106  106  #define USER    0x10000         /* user-mode flag added to trap type */
 107  107  
 108  108  static const char *trap_type_mnemonic[] = {
 109  109          "de",   "db",   "2",    "bp",
 110  110          "of",   "br",   "ud",   "nm",
 111  111          "df",   "9",    "ts",   "np",
 112  112          "ss",   "gp",   "pf",   "15",
 113  113          "mf",   "ac",   "mc",   "xf"
 114  114  };
 115  115  
 116  116  static const char *trap_type[] = {
 117  117          "Divide error",                         /* trap id 0    */
 118  118          "Debug",                                /* trap id 1    */
 119  119          "NMI interrupt",                        /* trap id 2    */
 120  120          "Breakpoint",                           /* trap id 3    */
 121  121          "Overflow",                             /* trap id 4    */
 122  122          "BOUND range exceeded",                 /* trap id 5    */
 123  123          "Invalid opcode",                       /* trap id 6    */
 124  124          "Device not available",                 /* trap id 7    */
 125  125          "Double fault",                         /* trap id 8    */
 126  126          "Coprocessor segment overrun",          /* trap id 9    */
 127  127          "Invalid TSS",                          /* trap id 10   */
 128  128          "Segment not present",                  /* trap id 11   */
 129  129          "Stack segment fault",                  /* trap id 12   */
 130  130          "General protection",                   /* trap id 13   */
 131  131          "Page fault",                           /* trap id 14   */
 132  132          "Reserved",                             /* trap id 15   */
 133  133          "x87 floating point error",             /* trap id 16   */
 134  134          "Alignment check",                      /* trap id 17   */
 135  135          "Machine check",                        /* trap id 18   */
 136  136          "SIMD floating point exception",        /* trap id 19   */
 137  137  };
 138  138  
 139  139  #define TRAP_TYPES      (sizeof (trap_type) / sizeof (trap_type[0]))
 140  140  
 141  141  #define SLOW_SCALL_SIZE 2
 142  142  #define FAST_SCALL_SIZE 2
 143  143  
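            /*
             * Debug tunables: when nonzero, showregs() dumps registers for
             * various user-level traps (tudebug), further gated for
             * breakpoints (tudebugbpt), divide/overflow and x87 faults
             * (tudebugfpe), and SSE faults (tudebugsse).
             */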
 144  144  int tudebug = 0;
 145  145  int tudebugbpt = 0;
 146  146  int tudebugfpe = 0;
 147  147  int tudebugsse = 0;
 148  148  
 149  149  #if defined(TRAPDEBUG) || defined(lint)
 150  150  int tdebug = 0;
 151  151  int lodebug = 0;
 152  152  int faultdebug = 0;
 153  153  #else
 154  154  #define tdebug  0
 155  155  #define lodebug 0
 156  156  #define faultdebug      0
 157  157  #endif /* defined(TRAPDEBUG) || defined(lint) */
 158  158  
 159  159  #if defined(TRAPTRACE)
 160  160  /*
 161  161   * trap trace record for cpu0 is allocated here.
 162  162   * trap trace records for non-boot cpus are allocated in mp_startup_init().
 163  163   */
 164  164  static trap_trace_rec_t trap_tr0[TRAPTR_NENT];
 165  165  trap_trace_ctl_t trap_trace_ctl[NCPU] = {
 166  166          {
 167  167              (uintptr_t)trap_tr0,                        /* next record */
 168  168              (uintptr_t)trap_tr0,                        /* first record */
 169  169              (uintptr_t)(trap_tr0 + TRAPTR_NENT),        /* limit */
 170  170              (uintptr_t)0                                /* current */
 171  171          },
 172  172  };
 173  173  
 174  174  /*
 175  175   * default trap buffer size
 176  176   */
 177  177  size_t trap_trace_bufsize = TRAPTR_NENT * sizeof (trap_trace_rec_t);
 178  178  int trap_trace_freeze = 0;
 179  179  int trap_trace_off = 0;
 180  180  
 181  181  /*
 182  182   * A dummy TRAPTRACE entry to use after death.
 183  183   */
 184  184  trap_trace_rec_t trap_trace_postmort;
 185  185  
 186  186  static void dump_ttrace(void);
 187  187  #endif  /* TRAPTRACE */
 188  188  static void dumpregs(struct regs *);
 189  189  static void showregs(uint_t, struct regs *, caddr_t);
 190  190  static int kern_gpfault(struct regs *);
 191  191  
 192  192  /*ARGSUSED*/
 193  193  static int
 194  194  die(uint_t type, struct regs *rp, caddr_t addr, processorid_t cpuid)
 195  195  {
 196  196          struct panic_trap_info ti;
 197  197          const char *trap_name, *trap_mnemonic;
 198  198  
 199  199          if (type < TRAP_TYPES) {
 200  200                  trap_name = trap_type[type];
 201  201                  trap_mnemonic = trap_type_mnemonic[type];
 202  202          } else {
 203  203                  trap_name = "trap";
 204  204                  trap_mnemonic = "-";
 205  205          }
 206  206  
 207  207  #ifdef TRAPTRACE
 208  208          TRAPTRACE_FREEZE;
 209  209  #endif
 210  210  
 211  211          ti.trap_regs = rp;
 212  212          ti.trap_type = type & ~USER;
 213  213          ti.trap_addr = addr;
 214  214  
 215  215          curthread->t_panic_trap = &ti;
 216  216  
 217  217          if (type == T_PGFLT && addr < (caddr_t)KERNELBASE) {
 218  218                  panic("BAD TRAP: type=%x (#%s %s) rp=%p addr=%p "
 219  219                      "occurred in module \"%s\" due to %s",
 220  220                      type, trap_mnemonic, trap_name, (void *)rp, (void *)addr,
 221  221                      mod_containing_pc((caddr_t)rp->r_pc),
 222  222                      addr < (caddr_t)PAGESIZE ?
 223  223                      "a NULL pointer dereference" :
 224  224                      "an illegal access to a user address");
 225  225          } else
 226  226                  panic("BAD TRAP: type=%x (#%s %s) rp=%p addr=%p",
 227  227                      type, trap_mnemonic, trap_name, (void *)rp, (void *)addr);
 228  228          return (0);
 229  229  }
 230  230  
 231  231  /*
 232  232   * Rewrite the instruction at pc to be an int $T_SYSCALLINT instruction.
 233  233   *
 234  234   * int <vector> is two bytes: 0xCD <vector>
 235  235   */
 236  236  
 237  237  static int
 238  238  rewrite_syscall(caddr_t pc)
 239  239  {
 240  240          uchar_t instr[SLOW_SCALL_SIZE] = { 0xCD, T_SYSCALLINT };
 241  241  
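                    /*
                     * Patch the two instruction bytes in the target process's
                     * address space; uwrite() returns nonzero on failure.
                     */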
 242  242          if (uwrite(curthread->t_procp, instr, SLOW_SCALL_SIZE,
 243  243              (uintptr_t)pc) != 0)
 244  244                  return (1);
 245  245  
 246  246          return (0);
 247  247  }
 248  248  
 249  249  /*
 250  250   * Test to see if the instruction at pc is sysenter or syscall. The second
 251  251   * argument should be the x86 feature flag corresponding to the expected
 252  252   * instruction.
 253  253   *
 254  254   * sysenter is two bytes: 0x0F 0x34
 255  255   * syscall is two bytes:  0x0F 0x05
 256  256   * int $T_SYSCALLINT is two bytes: 0xCD 0x91
 257  257   */
 258  258  
 259  259  static int
 260  260  instr_is_other_syscall(caddr_t pc, int which)
 261  261  {
 262  262          uchar_t instr[FAST_SCALL_SIZE];
 263  263  
 264  264          ASSERT(which == X86FSET_SEP || which == X86FSET_ASYSC || which == 0xCD);
 265  265  
 266  266          if (copyin_nowatch(pc, (caddr_t)instr, FAST_SCALL_SIZE) != 0)
 267  267                  return (0);
 268  268  
 269  269          switch (which) {
 270  270          case X86FSET_SEP:
 271  271                  if (instr[0] == 0x0F && instr[1] == 0x34)
 272  272                          return (1);
 273  273                  break;
 274  274          case X86FSET_ASYSC:
 275  275                  if (instr[0] == 0x0F && instr[1] == 0x05)
 276  276                          return (1);
 277  277                  break;
 278  278          case 0xCD:
 279  279                  if (instr[0] == 0xCD && instr[1] == T_SYSCALLINT)
 280  280                          return (1);
 281  281                  break;
 282  282          }
 283  283  
 284  284          return (0);
 285  285  }
 286  286  
 287  287  static const char *
 288  288  syscall_insn_string(int syscall_insn)
 289  289  {
 290  290          switch (syscall_insn) {
 291  291          case X86FSET_SEP:
 292  292                  return ("sysenter");
 293  293          case X86FSET_ASYSC:
 294  294                  return ("syscall");
 295  295          case 0xCD:
 296  296                  return ("int");
 297  297          default:
 298  298                  return ("Unknown");
 299  299          }
 300  300  }
 301  301  
 302  302  static int
 303  303  ldt_rewrite_syscall(struct regs *rp, proc_t *p, int syscall_insn)
 304  304  {
 305  305          caddr_t linearpc;
 306  306          int return_code = 0;
 307  307  
 308  308          mutex_enter(&p->p_ldtlock);     /* Must be held across linear_pc() */
 309  309  
 310  310          if (linear_pc(rp, p, &linearpc) == 0) {
 311  311  
 312  312                  /*
 313  313                   * If another thread beat us here, it already changed
 314  314                   * this site to the slower (int) syscall instruction.
 315  315                   */
 316  316                  if (instr_is_other_syscall(linearpc, 0xCD)) {
 317  317                          return_code = 1;
 318  318                  } else if (instr_is_other_syscall(linearpc, syscall_insn)) {
 319  319  
 320  320                          if (rewrite_syscall(linearpc) == 0) {
 321  321                                  return_code = 1;
 322  322                          }
 323  323  #ifdef DEBUG
 324  324                          else
 325  325                                  cmn_err(CE_WARN, "failed to rewrite %s "
 326  326                                      "instruction in process %d",
 327  327                                      syscall_insn_string(syscall_insn),
 328  328                                      p->p_pid);
 329  329  #endif /* DEBUG */
 330  330                  }
 331  331          }
 332  332  
 333  333          mutex_exit(&p->p_ldtlock);      /* Must be held across linear_pc() */
 334  334  
 335  335          return (return_code);
 336  336  }
 337  337  
 338  338  /*
 339  339   * Test to see if the instruction at pc is a system call instruction.
 340  340   *
 341  341   * The bytes of an lcall instruction used for the syscall trap.
 342  342   * static uchar_t lcall[7] = { 0x9a, 0, 0, 0, 0, 0x7, 0 };
 343  343   * static uchar_t lcallalt[7] = { 0x9a, 0, 0, 0, 0, 0x27, 0 };
 344  344   */
 345  345  
 346  346  #define LCALLSIZE       7
 347  347  
 348  348  static int
 349  349  instr_is_lcall_syscall(caddr_t pc)
 350  350  {
 351  351          uchar_t instr[LCALLSIZE];
 352  352  
 353  353          if (copyin_nowatch(pc, (caddr_t)instr, LCALLSIZE) == 0 &&
 354  354              instr[0] == 0x9a &&
 355  355              instr[1] == 0 &&
 356  356              instr[2] == 0 &&
 357  357              instr[3] == 0 &&
 358  358              instr[4] == 0 &&
 359  359              (instr[5] == 0x7 || instr[5] == 0x27) &&
 360  360              instr[6] == 0)
 361  361                  return (1);
 362  362  
 363  363          return (0);
 364  364  }
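
            /*
             * Both accepted patterns decode as an lcall through a call gate
             * with a zero offset: bytes 5 and 6 hold the little-endian
             * selector, so the match is selector 0x0007 or 0x0027.
             */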
 365  365  
 366  366  #ifdef __amd64
 367  367  
 368  368  /*
 369  369   * In the first revisions of amd64 CPUs produced by AMD, the LAHF and
 370  370   * SAHF instructions were not implemented in 64-bit mode. Later revisions
 371  371   * did implement these instructions. An extension to the cpuid instruction
 372  372   * was added to check for the capability of executing these instructions
 373  373   * in 64-bit mode.
 374  374   *
 375  375   * Intel originally did not implement these instructions in EM64T either,
 376  376   * but added them in later revisions.
 377  377   *
 378  378   * So, there are different chip revisions by both vendors out there that
 379  379   * may or may not implement these instructions. The easy solution is to
 380  380   * just always emulate these instructions on demand.
 381  381   *
 382  382   * SAHF == store %ah in the lower 8 bits of %rflags (opcode 0x9e)
 383  383   * LAHF == load the lower 8 bits of %rflags into %ah (opcode 0x9f)
 384  384   */
 385  385  
 386  386  #define LSAHFSIZE 1
 387  387  
 388  388  static int
 389  389  instr_is_lsahf(caddr_t pc, uchar_t *instr)
 390  390  {
 391  391          if (copyin_nowatch(pc, (caddr_t)instr, LSAHFSIZE) == 0 &&
 392  392              (*instr == 0x9e || *instr == 0x9f))
 393  393                  return (1);
 394  394          return (0);
 395  395  }
 396  396  
 397  397  /*
 398  398   * Emulate the LAHF and SAHF instructions. The reference manuals define
 399  399   * these instructions to always load/store bit 1 as a 1, and bits 3 and 5
 400  400   * as a 0. The other, defined, bits are copied (the PS_ICC bits and PS_P).
 401  401   *
 402  402   * Note that %ah is bits 8-15 of %rax.
 403  403   */
 404  404  static void
 405  405  emulate_lsahf(struct regs *rp, uchar_t instr)
 406  406  {
 407  407          if (instr == 0x9e) {
 408  408                  /* sahf. Copy bits from %ah to flags. */
 409  409                  rp->r_ps = (rp->r_ps & ~0xff) |
 410  410                      ((rp->r_rax >> 8) & PSL_LSAHFMASK) | PS_MB1;
 411  411          } else {
 412  412                  /* lahf. Copy bits from flags to %ah. */
 413  413                  rp->r_rax = (rp->r_rax & ~0xff00) |
 414  414                      (((rp->r_ps & PSL_LSAHFMASK) | PS_MB1) << 8);
 415  415          }
 416  416          rp->r_pc += LSAHFSIZE;
 417  417  }
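
            /*
             * For example, with %rflags = 0x246 (IF, ZF, PF set), lahf loads
             * %ah with 0x46: ZF and PF are copied, bit 1 is forced on, and
             * bits 3 and 5 read as zero; sahf with %ah = 0x46 restores the
             * same low flag byte.
             */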
 418  418  #endif /* __amd64 */
 419  419  
 420  420  #ifdef OPTERON_ERRATUM_91
 421  421  
 422  422  /*
 423  423   * Test to see if the instruction at pc is a prefetch instruction.
 424  424   *
 425  425   * The first byte of prefetch instructions is always 0x0F.
 426  426   * The second byte is 0x18 for regular prefetch or 0x0D for AMD 3dnow prefetch.
 427  427   * The third byte (ModRM) contains the register field bits (bits 3-5).
 428  428   * These bits must be between 0 and 3 inclusive for regular prefetch and
 429  429   * 0 and 1 inclusive for AMD 3dnow prefetch.
 430  430   *
  431  431   * In 64-bit mode, there may be a one-byte REX prefix (0x40-0x4F).
 432  432   */
 433  433  
 434  434  static int
 435  435  cmp_to_prefetch(uchar_t *p)
 436  436  {
 437  437  #ifdef _LP64
 438  438          if ((p[0] & 0xF0) == 0x40)      /* 64-bit REX prefix */
 439  439                  p++;
 440  440  #endif
 441  441          return ((p[0] == 0x0F && p[1] == 0x18 && ((p[2] >> 3) & 7) <= 3) ||
 442  442              (p[0] == 0x0F && p[1] == 0x0D && ((p[2] >> 3) & 7) <= 1));
 443  443  }
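
            /*
             * For example, "prefetcht0 (%rax)" assembles to 0x0F 0x18 0x08
             * (ModRM reg field 1) and the 3dnow "prefetch (%rax)" to
             * 0x0F 0x0D 0x00 (reg field 0); both satisfy the tests above.
             */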
 444  444  
 445  445  static int
 446  446  instr_is_prefetch(caddr_t pc)
 447  447  {
 448  448          uchar_t instr[4];       /* optional REX prefix plus 3-byte opcode */
 449  449  
 450  450          return (copyin_nowatch(pc, instr, sizeof (instr)) == 0 &&
 451  451              cmp_to_prefetch(instr));
 452  452  }
 453  453  
 454  454  #endif /* OPTERON_ERRATUM_91 */
 455  455  
 456  456  /*
 457  457   * Called from the trap handler when a processor trap occurs.
 458  458   *
 459  459   * Note: All user-level traps that might call stop() must exit
 460  460   * trap() by 'goto out' or by falling through.
  461  461   * Note also: trap() is usually called with interrupts enabled (PS_IE == 1);
  462  462   * however, there are paths that arrive here with PS_IE == 0, so special care
  463  463   * must be taken in those cases.
 464  464   */
 465  465  void
 466  466  trap(struct regs *rp, caddr_t addr, processorid_t cpuid)
 467  467  {
 468  468          kthread_t *ct = curthread;
 469  469          enum seg_rw rw;
 470  470          unsigned type;
 471  471          proc_t *p = ttoproc(ct);
 472  472          klwp_t *lwp = ttolwp(ct);
 473  473          uintptr_t lofault;
 474  474          label_t *onfault;
 475  475          faultcode_t pagefault(), res, errcode;
 476  476          enum fault_type fault_type;
 477  477          k_siginfo_t siginfo;
 478  478          uint_t fault = 0;
 479  479          int mstate;
 480  480          int sicode = 0;
 481  481          int watchcode;
 482  482          int watchpage;
 483  483          caddr_t vaddr;
 484  484          int singlestep_twiddle;
 485  485          size_t sz;
 486  486          int ta;
 487  487  #ifdef __amd64
 488  488          uchar_t instr;
 489  489  #endif
 490  490  
 491  491          ASSERT_STACK_ALIGNED();
 492  492  
 493  493          type = rp->r_trapno;
 494  494          CPU_STATS_ADDQ(CPU, sys, trap, 1);
 495  495          ASSERT(ct->t_schedflag & TS_DONT_SWAP);
 496  496  
 497  497          if (type == T_PGFLT) {
 498  498  
 499  499                  errcode = rp->r_err;
 500  500                  if (errcode & PF_ERR_WRITE)
 501  501                          rw = S_WRITE;
 502  502                  else if ((caddr_t)rp->r_pc == addr ||
 503  503                      (mmu.pt_nx != 0 && (errcode & PF_ERR_EXEC)))
 504  504                          rw = S_EXEC;
 505  505                  else
 506  506                          rw = S_READ;
 507  507  
 508  508  #if defined(__i386)
 509  509                  /*
 510  510                   * Pentium Pro work-around
 511  511                   */
 512  512                  if ((errcode & PF_ERR_PROT) && pentiumpro_bug4046376) {
 513  513                          uint_t  attr;
 514  514                          uint_t  priv_violation;
 515  515                          uint_t  access_violation;
 516  516  
 517  517                          if (hat_getattr(addr < (caddr_t)kernelbase ?
 518  518                              curproc->p_as->a_hat : kas.a_hat, addr, &attr)
 519  519                              == -1) {
 520  520                                  errcode &= ~PF_ERR_PROT;
 521  521                          } else {
 522  522                                  priv_violation = (errcode & PF_ERR_USER) &&
 523  523                                      !(attr & PROT_USER);
 524  524                                  access_violation = (errcode & PF_ERR_WRITE) &&
 525  525                                      !(attr & PROT_WRITE);
 526  526                                  if (!priv_violation && !access_violation)
 527  527                                          goto cleanup;
 528  528                          }
 529  529                  }
 530  530  #endif /* __i386 */
 531  531  
 532  532          } else if (type == T_SGLSTP && lwp != NULL)
 533  533                  lwp->lwp_pcb.pcb_drstat = (uintptr_t)addr;
 534  534  
 535  535          if (tdebug)
 536  536                  showregs(type, rp, addr);
 537  537  
 538  538          if (USERMODE(rp->r_cs)) {
 539  539                  /*
 540  540                   * Set up the current cred to use during this trap. u_cred
 541  541                   * no longer exists.  t_cred is used instead.
 542  542                   * The current process credential applies to the thread for
 543  543                   * the entire trap.  If trapping from the kernel, this
 544  544                   * should already be set up.
 545  545                   */
 546  546                  if (ct->t_cred != p->p_cred) {
 547  547                          cred_t *oldcred = ct->t_cred;
 548  548                          /*
 549  549                           * DTrace accesses t_cred in probe context.  t_cred
 550  550                           * must always be either NULL, or point to a valid,
 551  551                           * allocated cred structure.
 552  552                           */
 553  553                          ct->t_cred = crgetcred();
 554  554                          crfree(oldcred);
 555  555                  }
 556  556                  ASSERT(lwp != NULL);
 557  557                  type |= USER;
 558  558                  ASSERT(lwptoregs(lwp) == rp);
 559  559                  lwp->lwp_state = LWP_SYS;
 560  560  
 561  561                  switch (type) {
 562  562                  case T_PGFLT + USER:
 563  563                          if ((caddr_t)rp->r_pc == addr)
 564  564                                  mstate = LMS_TFAULT;
 565  565                          else
 566  566                                  mstate = LMS_DFAULT;
 567  567                          break;
 568  568                  default:
 569  569                          mstate = LMS_TRAP;
 570  570                          break;
 571  571                  }
 572  572                  /* Kernel probe */
 573  573                  TNF_PROBE_1(thread_state, "thread", /* CSTYLED */,
 574  574                      tnf_microstate, state, mstate);
 575  575                  mstate = new_mstate(ct, mstate);
 576  576  
 577  577                  bzero(&siginfo, sizeof (siginfo));
 578  578          }
 579  579  
 580  580          switch (type) {
 581  581          case T_PGFLT + USER:
 582  582          case T_SGLSTP:
 583  583          case T_SGLSTP + USER:
 584  584          case T_BPTFLT + USER:
 585  585                  break;
 586  586  
 587  587          default:
 588  588                  FTRACE_2("trap(): type=0x%lx, regs=0x%lx",
 589  589                      (ulong_t)type, (ulong_t)rp);
 590  590                  break;
 591  591          }
 592  592  
 593  593          switch (type) {
 594  594          case T_SIMDFPE:
 595  595                  /* Make sure we enable interrupts before die()ing */
 596  596                  sti();  /* The SIMD exception comes in via cmninttrap */
 597  597                  /*FALLTHROUGH*/
 598  598          default:
 599  599                  if (type & USER) {
 600  600                          if (tudebug)
 601  601                                  showregs(type, rp, (caddr_t)0);
 602  602                          printf("trap: Unknown trap type %d in user mode\n",
 603  603                              type & ~USER);
 604  604                          siginfo.si_signo = SIGILL;
 605  605                          siginfo.si_code  = ILL_ILLTRP;
 606  606                          siginfo.si_addr  = (caddr_t)rp->r_pc;
 607  607                          siginfo.si_trapno = type & ~USER;
 608  608                          fault = FLTILL;
 609  609                          break;
 610  610                  } else {
 611  611                          (void) die(type, rp, addr, cpuid);
 612  612                          /*NOTREACHED*/
 613  613                  }
 614  614  
 615  615          case T_PGFLT:           /* system page fault */
 616  616                  /*
 617  617                   * If we're under on_trap() protection (see <sys/ontrap.h>),
 618  618                   * set ot_trap and bounce back to the on_trap() call site
 619  619                   * via the installed trampoline.
 620  620                   */
 621  621                  if ((ct->t_ontrap != NULL) &&
 622  622                      (ct->t_ontrap->ot_prot & OT_DATA_ACCESS)) {
 623  623                          ct->t_ontrap->ot_trap |= OT_DATA_ACCESS;
 624  624                          rp->r_pc = ct->t_ontrap->ot_trampoline;
 625  625                          goto cleanup;
 626  626                  }
 627  627  
 628  628                  /*
 629  629                   * If we have an Instruction fault in kernel mode, then that
  630  630                   * means we've tried to execute a user page (SMEP) or both
  631  631                   * PAE and NXE are enabled. In either case, given that it's a
  632  632                   * kernel fault, we should panic immediately and not try to make
  633  633                   * any more forward progress. This indicates a bug in the
  634  634                   * kernel which, if execution continued, could be exploited to
  635  635                   * wreak havoc on the system.
 636  636                   */
 637  637                  if (errcode & PF_ERR_EXEC) {
 638  638                          (void) die(type, rp, addr, cpuid);
 639  639                  }
 640  640  
 641  641                  /*
 642  642                   * We need to check if SMAP is in play. If SMAP is in play, then
 643  643                   * any access to a user page will show up as a protection
 644  644                   * violation. To see if SMAP is enabled we first check if it's a
 645  645                   * user address and whether we have the feature flag set. If we
 646  646                   * do and the interrupted registers do not allow for user
 647  647                   * accesses (PS_ACHK is not enabled), then we need to die
 648  648                   * immediately.
 649  649                   */
 650  650                  if (addr < (caddr_t)kernelbase &&
 651  651                      is_x86_feature(x86_featureset, X86FSET_SMAP) == B_TRUE &&
 652  652                      (rp->r_ps & PS_ACHK) == 0) {
 653  653                          (void) die(type, rp, addr, cpuid);
 654  654                  }
 655  655  
 656  656                  /*
 657  657                   * See if we can handle as pagefault. Save lofault and onfault
 658  658                   * across this. Here we assume that an address less than
 659  659                   * KERNELBASE is a user fault.  We can do this as copy.s
 660  660                   * routines verify that the starting address is less than
 661  661                   * KERNELBASE before starting and because we know that we
 662  662                   * always have KERNELBASE mapped as invalid to serve as a
 663  663                   * "barrier".
 664  664                   */
 665  665                  lofault = ct->t_lofault;
 666  666                  onfault = ct->t_onfault;
 667  667                  ct->t_lofault = 0;
 668  668  
 669  669                  mstate = new_mstate(ct, LMS_KFAULT);
 670  670  
 671  671                  if (addr < (caddr_t)kernelbase) {
 672  672                          res = pagefault(addr,
 673  673                              (errcode & PF_ERR_PROT)? F_PROT: F_INVAL, rw, 0);
 674  674                          if (res == FC_NOMAP &&
 675  675                              addr < p->p_usrstack &&
 676  676                              grow(addr))
 677  677                                  res = 0;
 678  678                  } else {
 679  679                          res = pagefault(addr,
 680  680                              (errcode & PF_ERR_PROT)? F_PROT: F_INVAL, rw, 1);
 681  681                  }
 682  682                  (void) new_mstate(ct, mstate);
 683  683  
 684  684                  /*
 685  685                   * Restore lofault and onfault. If we resolved the fault, exit.
 686  686                   * If we didn't and lofault wasn't set, die.
 687  687                   */
 688  688                  ct->t_lofault = lofault;
 689  689                  ct->t_onfault = onfault;
 690  690                  if (res == 0)
 691  691                          goto cleanup;
 692  692  
 693  693  #if defined(OPTERON_ERRATUM_93) && defined(_LP64)
 694  694                  if (lofault == 0 && opteron_erratum_93) {
 695  695                          /*
 696  696                           * Workaround for Opteron Erratum 93. On return from
  697  697                           * a System Management Interrupt at an HLT instruction,
  698  698                           * the %rip might be truncated to a 32-bit value.
 699  699                           * BIOS is supposed to fix this, but some don't.
 700  700                           * If this occurs we simply restore the high order bits.
 701  701                           * The HLT instruction is 1 byte of 0xf4.
 702  702                           */
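                        /*
                         * Illustration: a kernel text address such as
                         * 0xfffffffffb845610, truncated to 0xfb845610, is
                         * repaired by or-ing the high 32 bits back in.
                         */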
 703  703                          uintptr_t       rip = rp->r_pc;
 704  704  
 705  705                          if ((rip & 0xfffffffful) == rip) {
 706  706                                  rip |= 0xfffffffful << 32;
 707  707                                  if (hat_getpfnum(kas.a_hat, (caddr_t)rip) !=
 708  708                                      PFN_INVALID &&
 709  709                                      (*(uchar_t *)rip == 0xf4 ||
 710  710                                      *(uchar_t *)(rip - 1) == 0xf4)) {
 711  711                                          rp->r_pc = rip;
 712  712                                          goto cleanup;
 713  713                                  }
 714  714                          }
 715  715                  }
 716  716  #endif /* OPTERON_ERRATUM_93 && _LP64 */
 717  717  
 718  718  #ifdef OPTERON_ERRATUM_91
 719  719                  if (lofault == 0 && opteron_erratum_91) {
 720  720                          /*
 721  721                           * Workaround for Opteron Erratum 91. Prefetches may
 722  722                           * generate a page fault (they're not supposed to do
 723  723                           * that!). If this occurs we simply return back to the
 724  724                           * instruction.
 725  725                           */
 726  726                          caddr_t         pc = (caddr_t)rp->r_pc;
 727  727  
 728  728                          /*
 729  729                           * If the faulting PC is not mapped, this is a
 730  730                           * legitimate kernel page fault that must result in a
 731  731                           * panic. If the faulting PC is mapped, it could contain
 732  732                           * a prefetch instruction. Check for that here.
 733  733                           */
 734  734                          if (hat_getpfnum(kas.a_hat, pc) != PFN_INVALID) {
 735  735                                  if (cmp_to_prefetch((uchar_t *)pc)) {
 736  736  #ifdef DEBUG
 737  737                                          cmn_err(CE_WARN, "Opteron erratum 91 "
 738  738                                              "occurred: kernel prefetch"
 739  739                                              " at %p generated a page fault!",
 740  740                                              (void *)rp->r_pc);
 741  741  #endif /* DEBUG */
 742  742                                          goto cleanup;
 743  743                                  }
 744  744                          }
 745  745                          (void) die(type, rp, addr, cpuid);
 746  746                  }
 747  747  #endif /* OPTERON_ERRATUM_91 */
 748  748  
 749  749                  if (lofault == 0)
 750  750                          (void) die(type, rp, addr, cpuid);
 751  751  
 752  752                  /*
 753  753                   * Cannot resolve fault.  Return to lofault.
 754  754                   */
 755  755                  if (lodebug) {
 756  756                          showregs(type, rp, addr);
 757  757                          traceregs(rp);
 758  758                  }
 759  759                  if (FC_CODE(res) == FC_OBJERR)
 760  760                          res = FC_ERRNO(res);
 761  761                  else
 762  762                          res = EFAULT;
 763  763                  rp->r_r0 = res;
 764  764                  rp->r_pc = ct->t_lofault;
 765  765                  goto cleanup;
 766  766  
 767  767          case T_PGFLT + USER:    /* user page fault */
 768  768                  if (faultdebug) {
 769  769                          char *fault_str;
 770  770  
 771  771                          switch (rw) {
 772  772                          case S_READ:
 773  773                                  fault_str = "read";
 774  774                                  break;
 775  775                          case S_WRITE:
 776  776                                  fault_str = "write";
 777  777                                  break;
 778  778                          case S_EXEC:
 779  779                                  fault_str = "exec";
 780  780                                  break;
 781  781                          default:
 782  782                                  fault_str = "";
 783  783                                  break;
 784  784                          }
 785  785                          printf("user %s fault:  addr=0x%lx errcode=0x%x\n",
 786  786                              fault_str, (uintptr_t)addr, errcode);
 787  787                  }
 788  788  
 789  789  #if defined(OPTERON_ERRATUM_100) && defined(_LP64)
 790  790                  /*
 791  791                   * Workaround for AMD erratum 100
 792  792                   *
  793  793                   * A 32-bit process may mistakenly receive a page fault on a
  794  794                   * non-32-bit address. The range of the faulting
 795  795                   * address will be
 796  796                   *
 797  797                   *      0xffffffff80000000 .. 0xffffffffffffffff or
 798  798                   *      0x0000000100000000 .. 0x000000017fffffff
 799  799                   *
  800  800                   * The fault is always due to an instruction fetch; however,
  801  801                   * the value of r_pc should be correct (in the 32-bit range),
 802  802                   * so we ignore the page fault on the bogus address.
 803  803                   */
 804  804                  if (p->p_model == DATAMODEL_ILP32 &&
 805  805                      (0xffffffff80000000 <= (uintptr_t)addr ||
 806  806                      (0x100000000 <= (uintptr_t)addr &&
 807  807                      (uintptr_t)addr <= 0x17fffffff))) {
 808  808                          if (!opteron_erratum_100)
 809  809                                  panic("unexpected erratum #100");
 810  810                          if (rp->r_pc <= 0xffffffff)
 811  811                                  goto out;
 812  812                  }
 813  813  #endif /* OPTERON_ERRATUM_100 && _LP64 */
 814  814  
 815  815                  ASSERT(!(curthread->t_flag & T_WATCHPT));
 816  816                  watchpage = (pr_watch_active(p) && pr_is_watchpage(addr, rw));
 817  817  #ifdef __i386
 818  818                  /*
 819  819                   * In 32-bit mode, the lcall (system call) instruction fetches
 820  820                   * one word from the stack, at the stack pointer, because of the
 821  821                   * way the call gate is constructed.  This is a bogus
 822  822                   * read and should not be counted as a read watchpoint.
 823  823                   * We work around the problem here by testing to see if
 824  824                   * this situation applies and, if so, simply jumping to
 825  825                   * the code in locore.s that fields the system call trap.
 826  826                   * The registers on the stack are already set up properly
 827  827                   * due to the match between the call gate sequence and the
 828  828                   * trap gate sequence.  We just have to adjust the pc.
 829  829                   */
 830  830                  if (watchpage && addr == (caddr_t)rp->r_sp &&
 831  831                      rw == S_READ && instr_is_lcall_syscall((caddr_t)rp->r_pc)) {
 832  832                          extern void watch_syscall(void);
 833  833  
 834  834                          rp->r_pc += LCALLSIZE;
 835  835                          watch_syscall();        /* never returns */
 836  836                          /* NOTREACHED */
 837  837                  }
 838  838  #endif /* __i386 */
 839  839                  vaddr = addr;
 840  840                  if (!watchpage || (sz = instr_size(rp, &vaddr, rw)) <= 0)
 841  841                          fault_type = (errcode & PF_ERR_PROT)? F_PROT: F_INVAL;
 842  842                  else if ((watchcode = pr_is_watchpoint(&vaddr, &ta,
 843  843                      sz, NULL, rw)) != 0) {
 844  844                          if (ta) {
 845  845                                  do_watch_step(vaddr, sz, rw,
 846  846                                      watchcode, rp->r_pc);
 847  847                                  fault_type = F_INVAL;
 848  848                          } else {
 849  849                                  bzero(&siginfo, sizeof (siginfo));
 850  850                                  siginfo.si_signo = SIGTRAP;
 851  851                                  siginfo.si_code = watchcode;
 852  852                                  siginfo.si_addr = vaddr;
 853  853                                  siginfo.si_trapafter = 0;
 854  854                                  siginfo.si_pc = (caddr_t)rp->r_pc;
 855  855                                  fault = FLTWATCH;
 856  856                                  break;
 857  857                          }
 858  858                  } else {
 859  859                          /* XXX pr_watch_emul() never succeeds (for now) */
 860  860                          if (rw != S_EXEC && pr_watch_emul(rp, vaddr, rw))
 861  861                                  goto out;
 862  862                          do_watch_step(vaddr, sz, rw, 0, 0);
 863  863                          fault_type = F_INVAL;
 864  864                  }
 865  865  
 866  866                  /*
 867  867                   * Allow the brand to interpose on invalid memory accesses
 868  868                   * prior to running the native pagefault handler.  If this
 869  869                   * brand hook returns zero, it was able to handle the fault
 870  870                   * completely.  Otherwise, drive on and call pagefault().
 871  871                   */
 872  872                  if (PROC_IS_BRANDED(p) && BROP(p)->b_pagefault != NULL &&
 873  873                      BROP(p)->b_pagefault(p, lwp, addr, fault_type, rw) == 0) {
 874  874                          goto out;
 875  875                  }
 876  876  
 877  877                  res = pagefault(addr, fault_type, rw, 0);
 878  878  
 879  879                  /*
 880  880                   * If pagefault() succeeded, ok.
 881  881                   * Otherwise attempt to grow the stack.
 882  882                   */
 883  883                  if (res == 0 ||
 884  884                      (res == FC_NOMAP &&
 885  885                      addr < p->p_usrstack &&
 886  886                      grow(addr))) {
 887  887                          lwp->lwp_lastfault = FLTPAGE;
 888  888                          lwp->lwp_lastfaddr = addr;
 889  889                          if (prismember(&p->p_fltmask, FLTPAGE)) {
 890  890                                  bzero(&siginfo, sizeof (siginfo));
 891  891                                  siginfo.si_addr = addr;
 892  892                                  (void) stop_on_fault(FLTPAGE, &siginfo);
 893  893                          }
 894  894                          goto out;
 895  895                  } else if (res == FC_PROT && addr < p->p_usrstack &&
 896  896                      (mmu.pt_nx != 0 && (errcode & PF_ERR_EXEC))) {
 897  897                          report_stack_exec(p, addr);
 898  898                  }
 899  899  
 900  900  #ifdef OPTERON_ERRATUM_91
 901  901                  /*
 902  902                   * Workaround for Opteron Erratum 91. Prefetches may generate a
 903  903                   * page fault (they're not supposed to do that!). If this
 904  904                   * occurs we simply return back to the instruction.
 905  905                   *
 906  906                   * We rely on copyin to properly fault in the page with r_pc.
 907  907                   */
 908  908                  if (opteron_erratum_91 &&
 909  909                      addr != (caddr_t)rp->r_pc &&
 910  910                      instr_is_prefetch((caddr_t)rp->r_pc)) {
 911  911  #ifdef DEBUG
 912  912                          cmn_err(CE_WARN, "Opteron erratum 91 occurred: "
 913  913                              "prefetch at %p in pid %d generated a trap!",
 914  914                              (void *)rp->r_pc, p->p_pid);
 915  915  #endif /* DEBUG */
 916  916                          goto out;
 917  917                  }
 918  918  #endif /* OPTERON_ERRATUM_91 */
 919  919  
 920  920                  if (tudebug)
 921  921                          showregs(type, rp, addr);
 922  922                  /*
 923  923                   * In the case where both pagefault and grow fail,
 924  924                   * set the code to the value provided by pagefault.
 925  925                   * We map all errors returned from pagefault() to SIGSEGV.
 926  926                   */
 927  927                  bzero(&siginfo, sizeof (siginfo));
 928  928                  siginfo.si_addr = addr;
 929  929                  switch (FC_CODE(res)) {
 930  930                  case FC_HWERR:
 931  931                  case FC_NOSUPPORT:
 932  932                          siginfo.si_signo = SIGBUS;
 933  933                          siginfo.si_code = BUS_ADRERR;
 934  934                          fault = FLTACCESS;
 935  935                          break;
 936  936                  case FC_ALIGN:
 937  937                          siginfo.si_signo = SIGBUS;
 938  938                          siginfo.si_code = BUS_ADRALN;
 939  939                          fault = FLTACCESS;
 940  940                          break;
 941  941                  case FC_OBJERR:
 942  942                          if ((siginfo.si_errno = FC_ERRNO(res)) != EINTR) {
 943  943                                  siginfo.si_signo = SIGBUS;
 944  944                                  siginfo.si_code = BUS_OBJERR;
 945  945                                  fault = FLTACCESS;
 946  946                          }
 947  947                          break;
 948  948                  default:        /* FC_NOMAP or FC_PROT */
 949  949                          siginfo.si_signo = SIGSEGV;
 950  950                          siginfo.si_code =
 951  951                              (res == FC_NOMAP)? SEGV_MAPERR : SEGV_ACCERR;
 952  952                          fault = FLTBOUNDS;
 953  953                          break;
 954  954                  }
 955  955                  break;
 956  956  
 957  957          case T_ILLINST + USER:  /* invalid opcode fault */
 958  958                  /*
 959  959                   * If the syscall instruction is disabled due to LDT usage, a
 960  960                   * user program that attempts to execute it will trigger a #ud
 961  961                   * trap. Check for that case here. If this occurs on a CPU which
 962  962                   * doesn't even support syscall, the result of all of this will
 963  963                   * be to emulate that particular instruction.
 964  964                   */
 965  965                  if (p->p_ldt != NULL &&
 966  966                      ldt_rewrite_syscall(rp, p, X86FSET_ASYSC))
 967  967                          goto out;
 968  968  
 969  969  #ifdef __amd64
 970  970                  /*
 971  971                   * Emulate the LAHF and SAHF instructions if needed.
 972  972                   * See the instr_is_lsahf function for details.
 973  973                   */
 974  974                  if (p->p_model == DATAMODEL_LP64 &&
 975  975                      instr_is_lsahf((caddr_t)rp->r_pc, &instr)) {
 976  976                          emulate_lsahf(rp, instr);
 977  977                          goto out;
 978  978                  }
 979  979  #endif
 980  980  
  981  981  
 982  982  
 983  983                  if (tudebug)
 984  984                          showregs(type, rp, (caddr_t)0);
 985  985                  siginfo.si_signo = SIGILL;
 986  986                  siginfo.si_code  = ILL_ILLOPC;
 987  987                  siginfo.si_addr  = (caddr_t)rp->r_pc;
 988  988                  fault = FLTILL;
 989  989                  break;
 990  990  
 991  991          case T_ZERODIV + USER:          /* integer divide by zero */
 992  992                  if (tudebug && tudebugfpe)
 993  993                          showregs(type, rp, (caddr_t)0);
 994  994                  siginfo.si_signo = SIGFPE;
 995  995                  siginfo.si_code  = FPE_INTDIV;
 996  996                  siginfo.si_addr  = (caddr_t)rp->r_pc;
 997  997                  fault = FLTIZDIV;
 998  998                  break;
 999  999  
1000 1000          case T_OVFLW + USER:    /* integer overflow */
1001 1001                  if (tudebug && tudebugfpe)
1002 1002                          showregs(type, rp, (caddr_t)0);
1003 1003                  siginfo.si_signo = SIGFPE;
1004 1004                  siginfo.si_code  = FPE_INTOVF;
1005 1005                  siginfo.si_addr  = (caddr_t)rp->r_pc;
1006 1006                  fault = FLTIOVF;
1007 1007                  break;
1008 1008  
1009 1009          case T_NOEXTFLT + USER: /* math coprocessor not available */
1010 1010                  if (tudebug && tudebugfpe)
1011 1011                          showregs(type, rp, addr);
1012 1012                  if (fpnoextflt(rp)) {
1013 1013                          siginfo.si_signo = SIGILL;
1014 1014                          siginfo.si_code  = ILL_ILLOPC;
1015 1015                          siginfo.si_addr  = (caddr_t)rp->r_pc;
1016 1016                          fault = FLTILL;
1017 1017                  }
1018 1018                  break;
1019 1019  
1020 1020          case T_EXTOVRFLT:       /* extension overrun fault */
1021 1021                  /* check if we took a kernel trap on behalf of user */
1022 1022                  {
1023 1023                          extern  void ndptrap_frstor(void);
1024 1024                          if (rp->r_pc != (uintptr_t)ndptrap_frstor) {
1025 1025                                  sti(); /* T_EXTOVRFLT comes in via cmninttrap */
1026 1026                                  (void) die(type, rp, addr, cpuid);
1027 1027                          }
1028 1028                          type |= USER;
1029 1029                  }
1030 1030                  /*FALLTHROUGH*/
1031 1031          case T_EXTOVRFLT + USER:        /* extension overrun fault */
1032 1032                  if (tudebug && tudebugfpe)
1033 1033                          showregs(type, rp, addr);
1034 1034                  if (fpextovrflt(rp)) {
1035 1035                          siginfo.si_signo = SIGSEGV;
1036 1036                          siginfo.si_code  = SEGV_MAPERR;
1037 1037                          siginfo.si_addr  = (caddr_t)rp->r_pc;
1038 1038                          fault = FLTBOUNDS;
1039 1039                  }
1040 1040                  break;
1041 1041  
1042 1042          case T_EXTERRFLT:       /* x87 floating point exception pending */
1043 1043                  /* check if we took a kernel trap on behalf of user */
1044 1044                  {
1045 1045                          extern  void ndptrap_frstor(void);
1046 1046                          if (rp->r_pc != (uintptr_t)ndptrap_frstor) {
1047 1047                                  sti(); /* T_EXTERRFLT comes in via cmninttrap */
1048 1048                                  (void) die(type, rp, addr, cpuid);
1049 1049                          }
1050 1050                          type |= USER;
1051 1051                  }
1052 1052                  /*FALLTHROUGH*/
1053 1053  
1054 1054          case T_EXTERRFLT + USER: /* x87 floating point exception pending */
1055 1055                  if (tudebug && tudebugfpe)
1056 1056                          showregs(type, rp, addr);
 1057 1057                  if ((sicode = fpexterrflt(rp)) != 0) {
1058 1058                          siginfo.si_signo = SIGFPE;
1059 1059                          siginfo.si_code  = sicode;
1060 1060                          siginfo.si_addr  = (caddr_t)rp->r_pc;
1061 1061                          fault = FLTFPE;
1062 1062                  }
1063 1063                  break;
1064 1064  
1065 1065          case T_SIMDFPE + USER:          /* SSE and SSE2 exceptions */
1066 1066                  if (tudebug && tudebugsse)
1067 1067                          showregs(type, rp, addr);
1068 1068                  if (!is_x86_feature(x86_featureset, X86FSET_SSE) &&
1069 1069                      !is_x86_feature(x86_featureset, X86FSET_SSE2)) {
1070 1070                          /*
1071 1071                           * There are rumours that some user instructions
1072 1072                           * on older CPUs can cause this trap to occur; in
1073 1073                           * which case send a SIGILL instead of a SIGFPE.
1074 1074                           */
1075 1075                          siginfo.si_signo = SIGILL;
1076 1076                          siginfo.si_code  = ILL_ILLTRP;
1077 1077                          siginfo.si_addr  = (caddr_t)rp->r_pc;
1078 1078                          siginfo.si_trapno = type & ~USER;
1079 1079                          fault = FLTILL;
1080 1080                  } else if ((sicode = fpsimderrflt(rp)) != 0) {
1081 1081                          siginfo.si_signo = SIGFPE;
1082 1082                          siginfo.si_code = sicode;
1083 1083                          siginfo.si_addr = (caddr_t)rp->r_pc;
1084 1084                          fault = FLTFPE;
1085 1085                  }
1086 1086  
1087 1087                  sti();  /* The SIMD exception comes in via cmninttrap */
1088 1088                  break;
1089 1089  
1090 1090          case T_BPTFLT:  /* breakpoint trap */
1091 1091                  /*
1092 1092                   * Kernel breakpoint traps should only happen when kmdb is
1093 1093                   * active, and even then, it'll have interposed on the IDT, so
1094 1094                   * control won't get here.  If it does, we've hit a breakpoint
1095 1095                   * without the debugger, which is very strange, and very
1096 1096                   * fatal.
1097 1097                   */
1098 1098                  if (tudebug && tudebugbpt)
1099 1099                          showregs(type, rp, (caddr_t)0);
1100 1100  
1101 1101                  (void) die(type, rp, addr, cpuid);
1102 1102                  break;
1103 1103  
1104 1104          case T_SGLSTP: /* single step/hw breakpoint exception */
1105 1105  
1106 1106                  /* Now evaluate how we got here */
1107 1107                  if (lwp != NULL && (lwp->lwp_pcb.pcb_drstat & DR_SINGLESTEP)) {
1108 1108                          /*
1109 1109                           * i386 single-steps even through lcalls which
1110 1110                           * change the privilege level. So we take a trap at
1111 1111                           * the first instruction in privileged mode.
1112 1112                           *
1113 1113                           * Set a flag to indicate that upon completion of
1114 1114                           * the system call, deal with the single-step trap.
1115 1115                           *
1116 1116                           * The same thing happens for sysenter, too.
1117 1117                           */
1118 1118                          singlestep_twiddle = 0;
1119 1119                          if (rp->r_pc == (uintptr_t)sys_sysenter ||
1120 1120                              rp->r_pc == (uintptr_t)brand_sys_sysenter) {
1121 1121                                  singlestep_twiddle = 1;
1122 1122  #if defined(__amd64)
1123 1123                                  /*
1124 1124                                   * Since we are already on the kernel's
1125 1125                                   * %gs, on 64-bit systems the sysenter case
1126 1126                                   * needs to adjust the pc to avoid
1127 1127                                   * executing the swapgs instruction at the
1128 1128                                   * top of the handler.
1129 1129                                   */
1130 1130                                  if (rp->r_pc == (uintptr_t)sys_sysenter)
1131 1131                                          rp->r_pc = (uintptr_t)
1132 1132                                              _sys_sysenter_post_swapgs;
1133 1133                                  else
1134 1134                                          rp->r_pc = (uintptr_t)
1135 1135                                              _brand_sys_sysenter_post_swapgs;
1136 1136  #endif
1137 1137                          }
1138 1138  #if defined(__i386)
1139 1139                          else if (rp->r_pc == (uintptr_t)sys_call ||
1140 1140                              rp->r_pc == (uintptr_t)brand_sys_call) {
1141 1141                                  singlestep_twiddle = 1;
1142 1142                          }
1143 1143  #endif
1144 1144                          else {
1145 1145                                  /* not on sysenter/syscall; uregs available */
1146 1146                                  if (tudebug && tudebugbpt)
1147 1147                                          showregs(type, rp, (caddr_t)0);
1148 1148                          }
1149 1149                          if (singlestep_twiddle) {
1150 1150                                  rp->r_ps &= ~PS_T; /* turn off trace */
1151 1151                                  lwp->lwp_pcb.pcb_flags |= DEBUG_PENDING;
1152 1152                                  ct->t_post_sys = 1;
1153 1153                                  aston(curthread);
1154 1154                                  goto cleanup;
1155 1155                          }
1156 1156                  }
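                           /*
                            * The DEBUG_PENDING flag set above is consumed on
                            * the way back out: once the syscall completes,
                            * the AST handling below notices it and calls
                            * deferred_singlestep_trap() to deliver the
                            * deferred trap:
                            *
                            *    if (lwp->lwp_pcb.pcb_flags & DEBUG_PENDING)
                            *        deferred_singlestep_trap((caddr_t)rp->r_pc);
                            */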
1157 1157                  /* XXX - needs review on debugger interface? */
1158 1158                  if (boothowto & RB_DEBUG)
1159 1159                          debug_enter((char *)NULL);
1160 1160                  else
1161 1161                          (void) die(type, rp, addr, cpuid);
1162 1162                  break;
1163 1163  
1164 1164          case T_NMIFLT:  /* NMI interrupt */
1165 1165                  printf("Unexpected NMI in system mode\n");
1166 1166                  goto cleanup;
1167 1167  
1168 1168          case T_NMIFLT + USER:   /* NMI interrupt */
1169 1169                  printf("Unexpected NMI in user mode\n");
1170 1170                  break;
1171 1171  
1172 1172          case T_GPFLT:   /* general protection violation */
1173 1173                  /*
1174 1174                   * Any #GP that occurs during an on_trap .. no_trap bracket
1175 1175                   * with OT_DATA_ACCESS or OT_SEGMENT_ACCESS protection,
1176 1176                   * or in an on_fault .. no_fault bracket, is forgiven
1177 1177                   * and we trampoline.  This protection is given regardless
1178 1178                   * of whether we are 32/64 bit etc - if a distinction is
1179 1179                   * required then define new on_trap protection types.
1180 1180                   *
1181 1181                   * On amd64, we can get a #gp from referencing addresses
1182 1182                   * in the virtual address hole, e.g. from a copyin or in
1183 1183                   * update_sregs while updating user segment registers.
1184 1184                   *
1185 1185                   * On the 32-bit hypervisor we could also generate one in
1186 1186                   * mfn_to_pfn by reaching around or into where the hypervisor
1187 1187                   * lives, which is protected by segmentation.
1188 1188                   */
1189 1189  
1190 1190                  /*
1191 1191                   * If we're under on_trap() protection (see <sys/ontrap.h>),
1192 1192                   * set ot_trap and trampoline back to the on_trap() call site
1193 1193                   * for OT_DATA_ACCESS or OT_SEGMENT_ACCESS.
1194 1194                   */
1195 1195                  if (ct->t_ontrap != NULL) {
1196 1196                          int ttype = ct->t_ontrap->ot_prot &
1197 1197                              (OT_DATA_ACCESS | OT_SEGMENT_ACCESS);
1198 1198  
1199 1199                          if (ttype != 0) {
1200 1200                                  ct->t_ontrap->ot_trap |= ttype;
1201 1201                                  if (tudebug)
1202 1202                                          showregs(type, rp, (caddr_t)0);
1203 1203                                  rp->r_pc = ct->t_ontrap->ot_trampoline;
1204 1204                                  goto cleanup;
1205 1205                          }
1206 1206                  }
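                           /*
                            * For reference, an on_trap() consumer brackets the
                            * risky access roughly like this (a minimal sketch;
                            * on_trap() returns non-zero when control comes back
                            * via the trampoline, and "p" is just an
                            * illustrative pointer):
                            *
                            *    on_trap_data_t otd;
                            *
                            *    if (on_trap(&otd, OT_DATA_ACCESS) == 0)
                            *        val = *(volatile uint8_t *)p;
                            *    no_trap();
                            */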
1207 1207  
1208 1208                  /*
1209 1209                   * If we're under lofault protection (copyin etc.),
1210 1210                   * longjmp back to lofault with an EFAULT.
1211 1211                   */
1212 1212                  if (ct->t_lofault) {
1213 1213                          /*
1214 1214                           * Fault is not resolvable, so just return to lofault
1215 1215                           */
1216 1216                          if (lodebug) {
1217 1217                                  showregs(type, rp, addr);
1218 1218                                  traceregs(rp);
1219 1219                          }
1220 1220                          rp->r_r0 = EFAULT;
1221 1221                          rp->r_pc = ct->t_lofault;
1222 1222                          goto cleanup;
1223 1223                  }
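                           /*
                            * t_lofault is typically established via on_fault(),
                            * in a setjmp-like pattern (a minimal sketch):
                            *
                            *    label_t ljb;
                            *
                            *    if (on_fault(&ljb) == 0) {
                            *        ... access the suspect memory ...
                            *    }
                            *    no_fault();
                            *
                            * copyin() and friends manage t_lofault themselves.
                            */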
1224 1224  
1225 1225                  /*
1226 1226                   * We fall through to the next case, which repeats
1227 1227                   * the OT_SEGMENT_ACCESS check which we've already
1228 1228                   * done, so we'll always fall through to the
1229 1229                   * T_STKFLT case.
1230 1230                   */
1231 1231                  /*FALLTHROUGH*/
1232 1232          case T_SEGFLT:  /* segment not present fault */
1233 1233                  /*
1234 1234                   * One example of this is #NP in update_sregs while
1235 1235                   * attempting to update a user segment register
1236 1236                   * that points to a descriptor that is marked not
1237 1237                   * present.
1238 1238                   */
1239 1239                  if (ct->t_ontrap != NULL &&
1240 1240                      ct->t_ontrap->ot_prot & OT_SEGMENT_ACCESS) {
1241 1241                          ct->t_ontrap->ot_trap |= OT_SEGMENT_ACCESS;
1242 1242                          if (tudebug)
1243 1243                                  showregs(type, rp, (caddr_t)0);
1244 1244                          rp->r_pc = ct->t_ontrap->ot_trampoline;
1245 1245                          goto cleanup;
1246 1246                  }
1247 1247                  /*FALLTHROUGH*/
1248 1248          case T_STKFLT:  /* stack fault */
1249 1249          case T_TSSFLT:  /* invalid TSS fault */
1250 1250                  if (tudebug)
1251 1251                          showregs(type, rp, (caddr_t)0);
1252 1252                  if (kern_gpfault(rp))
1253 1253                          (void) die(type, rp, addr, cpuid);
1254 1254                  goto cleanup;
1255 1255  
1256 1256          /*
1257 1257           * ONLY 32-bit PROCESSES can USE a PRIVATE LDT! 64-bit apps
1258 1258           * should have no need for them, so we put a stop to it here.
1259 1259           *
1260 1260           * So: not-present fault is ONLY valid for 32-bit processes with
1261 1261           * a private LDT trying to do a system call. Emulate it.
1262 1262           *
1263 1263           * #gp fault is ONLY valid for 32-bit processes also, which DO NOT
1264 1264           * have a private LDT, and are trying to do a system call. Emulate it.
1265 1265           */
1266 1266  
1267 1267          case T_SEGFLT + USER:   /* segment not present fault */
1268 1268          case T_GPFLT + USER:    /* general protection violation */
1269 1269  #ifdef _SYSCALL32_IMPL
1270 1270                  if (p->p_model != DATAMODEL_NATIVE) {
1271 1271  #endif /* _SYSCALL32_IMPL */
1272 1272                  if (instr_is_lcall_syscall((caddr_t)rp->r_pc)) {
1273 1273                          if (type == T_SEGFLT + USER)
1274 1274                                  ASSERT(p->p_ldt != NULL);
1275 1275  
1276 1276                          if ((p->p_ldt == NULL && type == T_GPFLT + USER) ||
1277 1277                              type == T_SEGFLT + USER) {
1278 1278  
1279 1279                          /*
1280 1280                           * The user attempted a system call via the obsolete
1281 1281                           * call gate mechanism. Because the process doesn't have
1282 1282                           * an LDT (i.e. the ldtr contains 0), a #gp results.
1283 1283                           * Emulate the syscall here, just as we do above for a
1284 1284                           * #np trap.
1285 1285                           */
1286 1286  
1287 1287                          /*
1288 1288                           * In either case, rp->r_pc points to the
1289 1289                           * trapping lcall instruction.  We need to bump
1290 1290                           * it to the next insn so the app can continue.
1291 1291                           */
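                                   /*
                                    * (LCALLSIZE is the encoded length of the
                                    * far "lcall $sel, $off" instruction used
                                    * by the old call gate interface, 7 bytes.)
                                    */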
1292 1292                          rp->r_pc += LCALLSIZE;
1293 1293                          lwp->lwp_regs = rp;
1294 1294  
1295 1295                          /*
1296 1296                           * Normally the microstate of the LWP is forced back to
1297 1297                           * LMS_USER by the syscall handlers. Emulate that
1298 1298                           * behavior here.
1299 1299                           */
1300 1300                          mstate = LMS_USER;
1301 1301  
1302 1302                          dosyscall();
1303 1303                          goto out;
1304 1304                          }
1305 1305                  }
1306 1306  #ifdef _SYSCALL32_IMPL
1307 1307                  }
1308 1308  #endif /* _SYSCALL32_IMPL */
1309 1309                  /*
1310 1310                   * If the current process is using a private LDT and the
1311 1311                   * trapping instruction is sysenter, the sysenter instruction
1312 1312                   * has been disabled on the CPU because it destroys segment
1313 1313                   * registers. If this is the case, rewrite the instruction to
1314 1314                   * be a safe system call and retry it. If this occurs on a CPU
1315 1315                   * which doesn't even support sysenter, the result of all of
1316 1316                   * this will be to emulate that particular instruction.
1317 1317                   */
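                           /*
                            * (X86FSET_SEP is the cpuid feature bit indicating
                            * sysenter/sysexit support.)
                            */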
1318 1318                  if (p->p_ldt != NULL &&
1319 1319                      ldt_rewrite_syscall(rp, p, X86FSET_SEP))
1320 1320                          goto out;
1321 1321  
1322 1322                  /*FALLTHROUGH*/
1323 1323  
1324 1324          case T_BOUNDFLT + USER: /* bound fault */
1325 1325          case T_STKFLT + USER:   /* stack fault */
1326 1326          case T_TSSFLT + USER:   /* invalid TSS fault */
1327 1327                  if (tudebug)
1328 1328                          showregs(type, rp, (caddr_t)0);
1329 1329                  siginfo.si_signo = SIGSEGV;
1330 1330                  siginfo.si_code  = SEGV_MAPERR;
1331 1331                  siginfo.si_addr  = (caddr_t)rp->r_pc;
1332 1332                  fault = FLTBOUNDS;
1333 1333                  break;
1334 1334  
1335 1335          case T_ALIGNMENT + USER:        /* user alignment error (486) */
1336 1336                  if (tudebug)
1337 1337                          showregs(type, rp, (caddr_t)0);
1338 1338                  bzero(&siginfo, sizeof (siginfo));
1339 1339                  siginfo.si_signo = SIGBUS;
1340 1340                  siginfo.si_code = BUS_ADRALN;
1341 1341                  siginfo.si_addr = (caddr_t)rp->r_pc;
1342 1342                  fault = FLTACCESS;
1343 1343                  break;
1344 1344  
1345 1345          case T_SGLSTP + USER: /* single step/hw breakpoint exception */
1346 1346                  if (tudebug && tudebugbpt)
1347 1347                          showregs(type, rp, (caddr_t)0);
1348 1348  
1349 1349                  /* Was it single-stepping? */
1350 1350                  if (lwp->lwp_pcb.pcb_drstat & DR_SINGLESTEP) {
1351 1351                          pcb_t *pcb = &lwp->lwp_pcb;
1352 1352  
1353 1353                          rp->r_ps &= ~PS_T;
1354 1354                          /*
1355 1355                           * If both NORMAL_STEP and WATCH_STEP are in effect,
1356 1356                           * give precedence to WATCH_STEP.  If neither is set,
1357 1357                           * the user must have set the PS_T bit in %efl;
1358 1358                           * treat this as NORMAL_STEP.
1359 1359                           */
1360 1360                          if ((fault = undo_watch_step(&siginfo)) == 0 &&
1361 1361                              ((pcb->pcb_flags & NORMAL_STEP) ||
1362 1362                              !(pcb->pcb_flags & WATCH_STEP))) {
1363 1363                                  siginfo.si_signo = SIGTRAP;
1364 1364                                  siginfo.si_code = TRAP_TRACE;
1365 1365                                  siginfo.si_addr = (caddr_t)rp->r_pc;
1366 1366                                  fault = FLTTRACE;
1367 1367                          }
1368 1368                          pcb->pcb_flags &= ~(NORMAL_STEP|WATCH_STEP);
1369 1369                  }
1370 1370                  break;
1371 1371  
1372 1372          case T_BPTFLT + USER:   /* breakpoint trap */
1373 1373                  if (tudebug && tudebugbpt)
1374 1374                          showregs(type, rp, (caddr_t)0);
1375 1375                  /*
1376 1376                   * int 3 (the breakpoint instruction) leaves the pc referring
1377 1377                   * to the address one byte after the breakpointed address.
1378 1378                   * If the P_PR_BPTADJ flag has been set via /proc, we adjust
1379 1379                   * it back so it refers to the breakpointed address.
1380 1380                   */
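                           /*
                            * A debugger requests this adjustment through the
                            * /proc control interface, e.g. (a sketch; see
                            * proc(4), with "ctlfd" an open /proc ctl file):
                            *
                            *    long cmd[2] = { PCSET, PR_BPTADJ };
                            *    (void) write(ctlfd, cmd, sizeof (cmd));
                            */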
1381 1381                  if (p->p_proc_flag & P_PR_BPTADJ)
1382 1382                          rp->r_pc--;
1383 1383                  siginfo.si_signo = SIGTRAP;
1384 1384                  siginfo.si_code  = TRAP_BRKPT;
1385 1385                  siginfo.si_addr  = (caddr_t)rp->r_pc;
1386 1386                  fault = FLTBPT;
1387 1387                  break;
1388 1388  
1389 1389          case T_AST:
1390 1390                  /*
1391 1391                   * This occurs only after the cs register has been made to
1392 1392                   * look like a kernel selector, either through debugging or
1393 1393                   * possibly by functions like setcontext().  The thread is
1394 1394                   * about to cause a general protection fault at common_iret()
1395 1395                   * in locore.  We let that happen immediately instead of
1396 1396                   * doing the T_AST processing.
1397 1397                   */
1398 1398                  goto cleanup;
1399 1399  
1400 1400          case T_AST + USER:      /* profiling, resched, h/w error pseudo trap */
1401 1401                  if (lwp->lwp_pcb.pcb_flags & ASYNC_HWERR) {
1402 1402                          proc_t *p = ttoproc(curthread);
1403 1403                          extern void print_msg_hwerr(ctid_t ct_id, proc_t *p);
1404 1404  
1405 1405                          lwp->lwp_pcb.pcb_flags &= ~ASYNC_HWERR;
1406 1406                          print_msg_hwerr(p->p_ct_process->conp_contract.ct_id,
1407 1407                              p);
1408 1408                          contract_process_hwerr(p->p_ct_process, p);
1409 1409                          siginfo.si_signo = SIGKILL;
1410 1410                          siginfo.si_code = SI_NOINFO;
1411 1411                  } else if (lwp->lwp_pcb.pcb_flags & CPC_OVERFLOW) {
1412 1412                          lwp->lwp_pcb.pcb_flags &= ~CPC_OVERFLOW;
1413 1413                          if (kcpc_overflow_ast()) {
1414 1414                                  /*
1415 1415                                   * Signal performance counter overflow
1416 1416                                   */
1417 1417                                  if (tudebug)
1418 1418                                          showregs(type, rp, (caddr_t)0);
1419 1419                                  bzero(&siginfo, sizeof (siginfo));
1420 1420                                  siginfo.si_signo = SIGEMT;
1421 1421                                  siginfo.si_code = EMT_CPCOVF;
1422 1422                                  siginfo.si_addr = (caddr_t)rp->r_pc;
1423 1423                                  fault = FLTCPCOVF;
1424 1424                          }
1425 1425                  }
1426 1426  
1427 1427                  break;
1428 1428          }
1429 1429  
1430 1430          /*
1431 1431           * We can't get here from a system trap
1432 1432           */
1433 1433          ASSERT(type & USER);
1434 1434  
1435 1435          if (fault) {
1436 1436                  /* We took a fault so abort single step. */
1437 1437                  lwp->lwp_pcb.pcb_flags &= ~(NORMAL_STEP|WATCH_STEP);
1438 1438                  /*
1439 1439                   * Remember the fault and fault address
1440 1440                   * for real-time (SIGPROF) profiling.
1441 1441                   */
1442 1442                  lwp->lwp_lastfault = fault;
1443 1443                  lwp->lwp_lastfaddr = siginfo.si_addr;
1444 1444  
1445 1445                  DTRACE_PROC2(fault, int, fault, ksiginfo_t *, &siginfo);
1446 1446  
1447 1447                  /*
1448 1448                   * If a debugger has declared this fault to be an
1449 1449                   * event of interest, stop the lwp.  Otherwise just
1450 1450                   * deliver the associated signal.
1451 1451                   */
1452 1452                  if (siginfo.si_signo != SIGKILL &&
1453 1453                      prismember(&p->p_fltmask, fault) &&
1454 1454                      stop_on_fault(fault, &siginfo) == 0)
1455 1455                          siginfo.si_signo = 0;
1456 1456          }
1457 1457  
1458 1458          if (siginfo.si_signo)
1459 1459                  trapsig(&siginfo, (fault != FLTFPE && fault != FLTCPCOVF));
1460 1460  
1461 1461          if (lwp->lwp_oweupc)
1462 1462                  profil_tick(rp->r_pc);
1463 1463  
1464 1464          if (ct->t_astflag | ct->t_sig_check) {
1465 1465                  /*
1466 1466                   * Turn off the AST flag before checking all the conditions that
1467 1467                   * may have caused an AST.  This flag is on whenever a signal or
1468 1468                   * unusual condition should be handled after the next trap or
1469 1469                   * syscall.
1470 1470                   */
1471 1471                  astoff(ct);
1472 1472                  /*
1473 1473                   * If a single-step trap occurred on a syscall (see above)
1474 1474                   * recognize it now.  Do this before checking for signals
1475 1475                   * because deferred_singlestep_trap() may generate a SIGTRAP to
1476 1476                   * the LWP or may otherwise mark the LWP to call issig(FORREAL).
1477 1477                   */
1478 1478                  if (lwp->lwp_pcb.pcb_flags & DEBUG_PENDING)
1479 1479                          deferred_singlestep_trap((caddr_t)rp->r_pc);
1480 1480  
1481 1481                  ct->t_sig_check = 0;
1482 1482  
1483 1483                  /*
1484 1484                   * As in other code paths that check against TP_CHANGEBIND,
1485 1485                   * we perform the check first without p_lock held -- only
1486 1486                   * acquiring p_lock in the unlikely event that it is indeed
1487 1487                   * set.  This is safe because we are doing this after the
1488 1488                   * astoff(); if we are racing another thread setting
1489 1489                   * TP_CHANGEBIND on us, we will pick it up on a subsequent
1490 1490                   * lap through.
1491 1491                   */
1492 1492                  if (curthread->t_proc_flag & TP_CHANGEBIND) {
1493 1493                          mutex_enter(&p->p_lock);
1494 1494                          if (curthread->t_proc_flag & TP_CHANGEBIND) {
1495 1495                                  timer_lwpbind();
1496 1496                                  curthread->t_proc_flag &= ~TP_CHANGEBIND;
1497 1497                          }
1498 1498                          mutex_exit(&p->p_lock);
1499 1499                  }
1500 1500  
1501 1501                  /*
1502 1502                   * For kaio requests on the per-process poll queue
1503 1503                   * (aiop->aio_pollq), the AIO_POLL bit is set, and the
1504 1504                   * kernel should copy their result_t out to user memory.
1505 1505                   * By copying out the result_t, the user can poll on
1506 1506                   * memory waiting for the kaio request to complete.
1507 1507                   */
1508 1508                  if (p->p_aio)
1509 1509                          aio_cleanup(0);
1510 1510                  /*
1511 1511                   * If this LWP was asked to hold, call holdlwp(), which will
1512 1512                   * stop.  holdlwps() sets this up and calls pokelwps() which
1513 1513                   * sets the AST flag.
1514 1514                   *
1515 1515                   * Also check TP_EXITLWP, since this is used by fresh new LWPs
1516 1516                   * through lwp_rtt().  That flag is set if the lwp_create(2)
1517 1517                   * syscall failed after creating the LWP.
1518 1518                   */
1519 1519                  if (ISHOLD(p))
1520 1520                          holdlwp();
1521 1521  
1522 1522                  /*
1523 1523                   * All code that sets signals and makes ISSIG evaluate true must
1524 1524                   * set t_astflag afterwards.
1525 1525                   */
1526 1526                  if (ISSIG_PENDING(ct, lwp, p)) {
1527 1527                          if (issig(FORREAL))
1528 1528                                  psig();
1529 1529                          ct->t_sig_check = 1;
1530 1530                  }
1531 1531  
1532 1532                  if (ct->t_rprof != NULL) {
1533 1533                          realsigprof(0, 0, 0);
1534 1534                          ct->t_sig_check = 1;
1535 1535                  }
1536 1536  
1537 1537                  /*
1538 1538                   * /proc can't enable/disable the trace bit itself
1539 1539                   * because that could race with the call gate used by
1540 1540                   * system calls via "lcall". If that happened, an
1541 1541                   * invalid EFLAGS would result. prstep()/prnostep()
1542 1542                   * therefore schedule an AST for the purpose.
1543 1543                   */
1544 1544                  if (lwp->lwp_pcb.pcb_flags & REQUEST_STEP) {
1545 1545                          lwp->lwp_pcb.pcb_flags &= ~REQUEST_STEP;
1546 1546                          rp->r_ps |= PS_T;
1547 1547                  }
1548 1548                  if (lwp->lwp_pcb.pcb_flags & REQUEST_NOSTEP) {
1549 1549                          lwp->lwp_pcb.pcb_flags &= ~REQUEST_NOSTEP;
1550 1550                          rp->r_ps &= ~PS_T;
1551 1551                  }
1552 1552          }
1553 1553  
1554 1554  out:    /* We can't get here from a system trap */
1555 1555          ASSERT(type & USER);
1556 1556  
1557 1557          if (ISHOLD(p))
1558 1558                  holdlwp();
1559 1559  
1560 1560          /*
1561 1561           * Set state to LWP_USER here so preempt won't give us a kernel
1562 1562           * priority if it occurs after this point.  Call CL_TRAPRET() to
1563 1563           * restore the user-level priority.
1564 1564           *
1565 1565           * It is important that no locks (other than spinlocks) be entered
1566 1566           * after this point before returning to user mode (unless lwp_state
1567 1567           * is set back to LWP_SYS).
1568 1568           */
1569 1569          lwp->lwp_state = LWP_USER;
1570 1570  
1571 1571          if (ct->t_trapret) {
1572 1572                  ct->t_trapret = 0;
1573 1573                  thread_lock(ct);
1574 1574                  CL_TRAPRET(ct);
1575 1575                  thread_unlock(ct);
1576 1576          }
1577 1577          if (CPU->cpu_runrun || curthread->t_schedflag & TS_ANYWAITQ)
1578 1578                  preempt();
1579 1579          prunstop();
1580 1580          (void) new_mstate(ct, mstate);
1581 1581  
1582 1582          /* Kernel probe */
1583 1583          TNF_PROBE_1(thread_state, "thread", /* CSTYLED */,
1584 1584              tnf_microstate, state, LMS_USER);
1585 1585  
1586 1586          return;
1587 1587  
1588 1588  cleanup:        /* system traps end up here */
1589 1589          ASSERT(!(type & USER));
1590 1590  }
1591 1591  
1592 1592  /*
1593 1593   * Patch non-zero to disable preemption of threads in the kernel.
1594 1594   */
1595 1595  int IGNORE_KERNEL_PREEMPTION = 0;       /* XXX - delete this someday */
1596 1596  
1597 1597  struct kpreempt_cnts {          /* kernel preemption statistics */
1598 1598          int     kpc_idle;       /* executing idle thread */
1599 1599          int     kpc_intr;       /* executing interrupt thread */
1600 1600          int     kpc_clock;      /* executing clock thread */
1601 1601          int     kpc_blocked;    /* thread has blocked preemption (t_preempt) */
1602 1602          int     kpc_notonproc;  /* thread is surrendering processor */
1603 1603          int     kpc_inswtch;    /* thread has ratified scheduling decision */
1604 1604          int     kpc_prilevel;   /* processor interrupt level is too high */
1605 1605          int     kpc_apreempt;   /* asynchronous preemption */
1606 1606          int     kpc_spreempt;   /* synchronous preemption */
1607 1607  } kpreempt_cnts;
1608 1608  
1609 1609  /*
1610 1610   * kernel preemption: forced rescheduling, preempt the running kernel thread.
1611 1611   *      the argument is the old PIL for an interrupt,
1612 1612   *      or the distinguished value KPREEMPT_SYNC.
1613 1613   */
1614 1614  void
1615 1615  kpreempt(int asyncspl)
1616 1616  {
1617 1617          kthread_t *ct = curthread;
1618 1618  
1619 1619          if (IGNORE_KERNEL_PREEMPTION) {
1620 1620                  aston(CPU->cpu_dispthread);
1621 1621                  return;
1622 1622          }
1623 1623  
1624 1624          /*
1625 1625           * Check that conditions are right for kernel preemption
1626 1626           */
1627 1627          do {
1628 1628                  if (ct->t_preempt) {
1629 1629                          /*
1630 1630                           * Either a privileged thread (idle, panic,
1631 1631                           * interrupt) or one that will check for
1632 1632                           * preemption when t_preempt is lowered.
1633 1633                           *
1634 1634                           * We specifically need to handle the case where
1635 1635                           * the thread is in the middle of swtch (resume
1636 1636                           * has been called) with its t_preempt set [the
1637 1637                           * idle thread, or a thread already in kpreempt]
1638 1638                           * and a high priority thread becomes available
1639 1639                           * in the local dispatch queue.  In that case
1640 1640                           * the resumed thread needs to take a trap so
1641 1641                           * that it can call kpreempt; we achieve this by
1642 1642                           * using siron().
1643 1643                           *
1644 1644                           * We detect this condition when the idle thread
1645 1645                           * is in the midst of resume:
1646 1646                           * curthread->t_pri == -1 && CPU->cpu_dispthread
1647 1647                           * != CPU->cpu_thread.  This can happen only at
1648 1648                           * high pil: resume is called at high pil, and
1649 1648                           * only resume_from_idle changes the pil.
1650 1648                           */
1649 1649                          if (ct->t_pri < 0) {
1650 1650                                  kpreempt_cnts.kpc_idle++;
1651 1651                                  if (CPU->cpu_dispthread != CPU->cpu_thread)
1652 1652                                          siron();
1653 1653                          } else if (ct->t_flag & T_INTR_THREAD) {
1654 1654                                  kpreempt_cnts.kpc_intr++;
1655 1655                                  if (ct->t_pil == CLOCK_LEVEL)
1656 1656                                          kpreempt_cnts.kpc_clock++;
1657 1657                          } else {
1658 1658                                  kpreempt_cnts.kpc_blocked++;
1659 1659                                  if (CPU->cpu_dispthread != CPU->cpu_thread)
1660 1660                                          siron();
1661 1661                          }
1662 1662                          aston(CPU->cpu_dispthread);
1663 1663                          return;
1664 1664                  }
1665 1665                  if (ct->t_state != TS_ONPROC ||
1666 1666                      ct->t_disp_queue != CPU->cpu_disp) {
1667 1667                          /* this thread will be calling swtch() shortly */
1668 1668                          kpreempt_cnts.kpc_notonproc++;
1669 1669                          if (CPU->cpu_thread != CPU->cpu_dispthread) {
1670 1670                                  /* already in swtch(), force another */
1671 1671                                  kpreempt_cnts.kpc_inswtch++;
1672 1672                                  siron();
1673 1673                          }
1674 1674                          return;
1675 1675                  }
1676 1676                  if (getpil() >= DISP_LEVEL) {
1677 1677                          /*
1678 1678                           * We can't preempt this thread if it is at
1679 1679                           * a PIL >= DISP_LEVEL since it may be holding
1680 1680                           * a spin lock (like sched_lock).
1681 1681                           */
1682 1682                          siron();        /* check back later */
1683 1683                          kpreempt_cnts.kpc_prilevel++;
1684 1684                          return;
1685 1685                  }
1686 1686                  if (!interrupts_enabled()) {
1687 1687                          /*
1688 1688                           * Can't preempt while running with ints disabled
1689 1689                           */
1690 1690                          kpreempt_cnts.kpc_prilevel++;
1691 1691                          return;
1692 1692                  }
1693 1693                  if (asyncspl != KPREEMPT_SYNC)
1694 1694                          kpreempt_cnts.kpc_apreempt++;
1695 1695                  else
1696 1696                          kpreempt_cnts.kpc_spreempt++;
1697 1697  
1698 1698                  ct->t_preempt++;
1699 1699                  preempt();
1700 1700                  ct->t_preempt--;
1701 1701          } while (CPU->cpu_kprunrun);
1702 1702  }
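           /*
            * The synchronous entry comes from kpreempt_enable(), which is
            * roughly (a sketch of the macro in <sys/disp.h>):
            *
            *    if (--curthread->t_preempt == 0 && CPU->cpu_kprunrun)
            *        kpreempt(KPREEMPT_SYNC);
            *
            * The asynchronous entry passes the old PIL from the interrupt
            * return path instead.
            */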
1703 1703  
1704 1704  /*
1705 1705   * Print out debugging info.
1706 1706   */
1707 1707  static void
1708 1708  showregs(uint_t type, struct regs *rp, caddr_t addr)
1709 1709  {
1710 1710          int s;
1711 1711  
1712 1712          s = spl7();
1713 1713          type &= ~USER;
1714 1714          if (PTOU(curproc)->u_comm[0])
1715 1715                  printf("%s: ", PTOU(curproc)->u_comm);
1716 1716          if (type < TRAP_TYPES)
1717 1717                  printf("#%s %s\n", trap_type_mnemonic[type], trap_type[type]);
1718 1718          else
1719 1719                  switch (type) {
1720 1720                  case T_SYSCALL:
1721 1721                          printf("Syscall Trap:\n");
1722 1722                          break;
1723 1723                  case T_AST:
1724 1724                          printf("AST\n");
1725 1725                          break;
1726 1726                  default:
1727 1727                          printf("Bad Trap = %d\n", type);
1728 1728                          break;
1729 1729                  }
1730 1730          if (type == T_PGFLT) {
1731 1731                  printf("Bad %s fault at addr=0x%lx\n",
1732 1732                      USERMODE(rp->r_cs) ? "user": "kernel", (uintptr_t)addr);
1733 1733          } else if (addr) {
1734 1734                  printf("addr=0x%lx\n", (uintptr_t)addr);
1735 1735          }
1736 1736  
1737 1737          printf("pid=%d, pc=0x%lx, sp=0x%lx, eflags=0x%lx\n",
1738 1738              (ttoproc(curthread) && ttoproc(curthread)->p_pidp) ?
1739 1739              ttoproc(curthread)->p_pid : 0, rp->r_pc, rp->r_sp, rp->r_ps);
1740 1740  
1741 1741  #if defined(__lint)
1742 1742          /*
1743 1743           * this clause can be deleted when lint bug 4870403 is fixed
1744 1744           * (lint thinks that bit 32 is illegal in a %b format string)
1745 1745           */
1746 1746          printf("cr0: %x cr4: %b\n",
1747 1747              (uint_t)getcr0(), (uint_t)getcr4(), FMT_CR4);
1748 1748  #else
1749 1749          printf("cr0: %b cr4: %b\n",
1750 1750              (uint_t)getcr0(), FMT_CR0, (uint_t)getcr4(), FMT_CR4);
1751 1751  #endif  /* __lint */
1752 1752  
1753 1753          printf("cr2: %lx ", getcr2());
1754 1754  #if !defined(__xpv)
1755 1755          printf("cr3: %lx ", getcr3());
1756 1756  #if defined(__amd64)
1757 1757          printf("cr8: %lx\n", getcr8());
1758 1758  #endif
1759 1759  #endif
1760 1760          printf("\n");
1761 1761  
1762 1762          dumpregs(rp);
1763 1763          splx(s);
1764 1764  }
1765 1765  
1766 1766  static void
1767 1767  dumpregs(struct regs *rp)
1768 1768  {
1769 1769  #if defined(__amd64)
1770 1770          const char fmt[] = "\t%3s: %16lx %3s: %16lx %3s: %16lx\n";
1771 1771  
1772 1772          printf(fmt, "rdi", rp->r_rdi, "rsi", rp->r_rsi, "rdx", rp->r_rdx);
1773 1773          printf(fmt, "rcx", rp->r_rcx, " r8", rp->r_r8, " r9", rp->r_r9);
1774 1774          printf(fmt, "rax", rp->r_rax, "rbx", rp->r_rbx, "rbp", rp->r_rbp);
1775 1775          printf(fmt, "r10", rp->r_r10, "r11", rp->r_r11, "r12", rp->r_r12);
1776 1776          printf(fmt, "r13", rp->r_r13, "r14", rp->r_r14, "r15", rp->r_r15);
1777 1777  
1778 1778          printf(fmt, "fsb", rdmsr(MSR_AMD_FSBASE), "gsb", rdmsr(MSR_AMD_GSBASE),
1779 1779              " ds", rp->r_ds);
1780 1780          printf(fmt, " es", rp->r_es, " fs", rp->r_fs, " gs", rp->r_gs);
1781 1781  
1782 1782          printf(fmt, "trp", rp->r_trapno, "err", rp->r_err, "rip", rp->r_rip);
1783 1783          printf(fmt, " cs", rp->r_cs, "rfl", rp->r_rfl, "rsp", rp->r_rsp);
1784 1784  
1785 1785          printf("\t%3s: %16lx\n", " ss", rp->r_ss);
1786 1786  
1787 1787  #elif defined(__i386)
1788 1788          const char fmt[] = "\t%3s: %8lx %3s: %8lx %3s: %8lx %3s: %8lx\n";
1789 1789  
1790 1790          printf(fmt, " gs", rp->r_gs, " fs", rp->r_fs,
1791 1791              " es", rp->r_es, " ds", rp->r_ds);
1792 1792          printf(fmt, "edi", rp->r_edi, "esi", rp->r_esi,
1793 1793              "ebp", rp->r_ebp, "esp", rp->r_esp);
1794 1794          printf(fmt, "ebx", rp->r_ebx, "edx", rp->r_edx,
1795 1795              "ecx", rp->r_ecx, "eax", rp->r_eax);
1796 1796          printf(fmt, "trp", rp->r_trapno, "err", rp->r_err,
1797 1797              "eip", rp->r_eip, " cs", rp->r_cs);
1798 1798          printf("\t%3s: %8lx %3s: %8lx %3s: %8lx\n",
1799 1799              "efl", rp->r_efl, "usp", rp->r_uesp, " ss", rp->r_ss);
1800 1800  
1801 1801  #endif  /* __i386 */
1802 1802  }
1803 1803  
1804 1804  /*
1805 1805   * Test to see if the instruction is iret on i386 or iretq on amd64.
1806 1806   *
1807 1807   * On the hypervisor we can only test for nopop_sys_rtt_syscall. If true
1808 1808   * then we are in the context of the hypervisor's failsafe handler because it
1809 1809   * tried to iret and failed due to a bad selector. See xen_failsafe_callback.
1810 1810   */
1811 1811  static int
1812 1812  instr_is_iret(caddr_t pc)
1813 1813  {
1814 1814  
1815 1815  #if defined(__xpv)
1816 1816          extern void nopop_sys_rtt_syscall(void);
1817 1817          return ((pc == (caddr_t)nopop_sys_rtt_syscall) ? 1 : 0);
1818 1818  
1819 1819  #else
1820 1820  
1821 1821  #if defined(__amd64)
1822 1822          static const uint8_t iret_insn[2] = { 0x48, 0xcf };     /* iretq */
1823 1823  
1824 1824  #elif defined(__i386)
1825 1825          static const uint8_t iret_insn[1] = { 0xcf };           /* iret */
1826 1826  #endif  /* __i386 */
1827 1827          return (bcmp(pc, iret_insn, sizeof (iret_insn)) == 0);
1828 1828  
1829 1829  #endif  /* __xpv */
1830 1830  }
1831 1831  
1832 1832  #if defined(__i386)
1833 1833  
1834 1834  /*
1835 1835   * Test to see if the instruction is part of __SEGREGS_POP
1836 1836   *
1837 1837   * Note carefully the appallingly awful dependency between
1838 1838   * the instruction sequence used in __SEGREGS_POP and these
1839 1839   * instructions encoded here.
1840 1840   */
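           /*
            * Each sequence encodes "movw disp8(%esp), %sreg": opcode 0x8e
            * (mov r/m16 to segment register), a ModRM byte whose reg field
            * selects the segment register (0 = %es, 3 = %ds, 4 = %fs,
            * 5 = %gs) with an %esp-based SIB byte (0x24), and a one-byte
            * displacement.
            */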
1841 1841  static int
1842 1842  instr_is_segregs_pop(caddr_t pc)
1843 1843  {
1844 1844          static const uint8_t movw_0_esp_gs[4] = { 0x8e, 0x6c, 0x24, 0x0 };
1845 1845          static const uint8_t movw_4_esp_fs[4] = { 0x8e, 0x64, 0x24, 0x4 };
1846 1846          static const uint8_t movw_8_esp_es[4] = { 0x8e, 0x44, 0x24, 0x8 };
1847 1847          static const uint8_t movw_c_esp_ds[4] = { 0x8e, 0x5c, 0x24, 0xc };
1848 1848  
1849 1849          if (bcmp(pc, movw_0_esp_gs, sizeof (movw_0_esp_gs)) == 0 ||
1850 1850              bcmp(pc, movw_4_esp_fs, sizeof (movw_4_esp_fs)) == 0 ||
1851 1851              bcmp(pc, movw_8_esp_es, sizeof (movw_8_esp_es)) == 0 ||
1852 1852              bcmp(pc, movw_c_esp_ds, sizeof (movw_c_esp_ds)) == 0)
1853 1853                  return (1);
1854 1854  
1855 1855          return (0);
1856 1856  }
1857 1857  
1858 1858  #endif  /* __i386 */
1859 1859  
1860 1860  /*
1861 1861   * Test to see if the instruction is part of _sys_rtt.
1862 1862   *
1863 1863   * Again, on the hypervisor, if we try to IRET to user land with a bad code
1864 1864   * or stack selector we will get vectored through xen_failsafe_callback,
1865 1865   * in which case we assume we got here via _sys_rtt, since we only allow
1866 1866   * IRET to user land to take place in _sys_rtt.
1867 1867   */
1868 1868  static int
1869 1869  instr_is_sys_rtt(caddr_t pc)
1870 1870  {
1871 1871          extern void _sys_rtt(), _sys_rtt_end();
1872 1872  
1873 1873          if ((uintptr_t)pc < (uintptr_t)_sys_rtt ||
1874 1874              (uintptr_t)pc > (uintptr_t)_sys_rtt_end)
1875 1875                  return (0);
1876 1876  
1877 1877          return (1);
1878 1878  }
1879 1879  
1880 1880  /*
1881 1881   * Handle #gp faults in kernel mode.
1882 1882   *
1883 1883   * One legitimate way this can happen is if we attempt to update segment
1884 1884   * registers to naughty values on the way out of the kernel.
1885 1885   *
1886 1886   * This can happen in a couple of ways: someone - either accidentally or
1887 1887   * on purpose - creates (setcontext(2), lwp_create(2)) or modifies
1888 1888   * (signal(2)) a ucontext that contains silly segment register values.
1889 1889   * Or someone - either accidentally or on purpose - modifies the prgregset_t
1890 1890   * of a subject process via /proc to contain silly segment register values.
1891 1891   *
1892 1892   * (The unfortunate part is that we can end up discovering the bad segment
1893 1893   * register value in the middle of an 'iret' after we've popped most of the
1894 1894   * stack.  So it becomes quite difficult to associate an accurate ucontext
1895 1895   * with the lwp, because the act of taking the #gp trap overwrites most of
1896 1896   * what we were going to send the lwp.)
1897 1897   *
1898 1898   * OTOH if it turns out that's -not- the problem, and we're -not- an lwp
1899 1899   * trying to return to user mode and we get a #gp fault, then we need
1900 1900   * to die() -- which will happen if we return non-zero from this routine.
1901 1901   */
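           /*
            * For concreteness, a user program could (hypothetically) provoke
            * this path with something like:
            *
            *    ucontext_t uc;
            *
            *    (void) getcontext(&uc);
            *    uc.uc_mcontext.gregs[REG_CS] = 0;    <- a silly selector
            *    (void) setcontext(&uc);    <- kernel takes the #gp at iret
            */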
1902 1902  static int
1903 1903  kern_gpfault(struct regs *rp)
1904 1904  {
1905 1905          kthread_t *t = curthread;
1906 1906          proc_t *p = ttoproc(t);
1907 1907          klwp_t *lwp = ttolwp(t);
1908 1908          struct regs tmpregs, *trp = NULL;
1909 1909          caddr_t pc = (caddr_t)rp->r_pc;
1910 1910          int v;
1911 1911          uint32_t auditing = AU_AUDITING();
1912 1912  
1913 1913          /*
1914 1914           * If we're not an lwp, or (when running native) the pc is
1915 1915           * outside the _sys_rtt range, then we should immediately be
1916 1916           * die()ing horribly.
1917 1917           */
1918 1918          if (lwp == NULL || !instr_is_sys_rtt(pc))
1919 1919                  return (1);
1920 1920  
1921 1921          /*
1922 1922           * So at least we're in the right part of the kernel.
1923 1923           *
1924 1924           * Disassemble the instruction at the faulting pc.
1925 1925           * Once we know what it is, we carefully reconstruct the stack
1926 1926           * based on the order in which the stack is deconstructed in
1927 1927           * _sys_rtt. Ew.
1928 1928           */
1929 1929          if (instr_is_iret(pc)) {
1930 1930                  /*
1931 1931                   * We took the #gp while trying to perform the IRET.
1932 1932                   * This means that either %cs or %ss are bad.
1933 1933                   * All we know for sure is that most of the general
1934 1934                   * registers have been restored, including the
1935 1935                   * segment registers, and all we have left on the
1936 1936                   * topmost part of the lwp's stack are the
1937 1937                   * registers that the iretq was unable to consume.
1938 1938                   *
1939 1939                   * All the rest of the state was crushed by the #gp
1940 1940                   * which pushed -its- registers atop our old save area
1941 1941                   * (because we had to decrement the stack pointer, sigh) so
1942 1942                   * all that we can try and do is to reconstruct the
1943 1943                   * crushed frame from the #gp trap frame itself.
1944 1944                   */
1945 1945                  trp = &tmpregs;
1946 1946                  trp->r_ss = lwptoregs(lwp)->r_ss;
1947 1947                  trp->r_sp = lwptoregs(lwp)->r_sp;
1948 1948                  trp->r_ps = lwptoregs(lwp)->r_ps;
1949 1949                  trp->r_cs = lwptoregs(lwp)->r_cs;
1950 1950                  trp->r_pc = lwptoregs(lwp)->r_pc;
1951 1951                  bcopy(rp, trp, offsetof(struct regs, r_pc));
1952 1952  
1953 1953                  /*
1954 1954                   * Validate simple math
1955 1955                   */
1956 1956                  ASSERT(trp->r_pc == lwptoregs(lwp)->r_pc);
1957 1957                  ASSERT(trp->r_err == rp->r_err);
1958 1958  
1961 1961          }
1962 1962  
1963 1963  #if defined(__amd64)
1964 1964          if (trp == NULL && lwp->lwp_pcb.pcb_rupdate != 0) {
1965 1965  
1966 1966                  /*
1967 1967                   * This is the common case -- we're trying to load
1968 1968                   * a bad segment register value in the only section
1969 1969                   * of kernel code that ever loads segment registers.
1970 1970                   *
1971 1971                   * We don't need to do anything at this point because
1972 1972                   * the pcb contains all the pending segment register
1973 1973                   * state, and the regs are still intact because we
1974 1974                   * didn't adjust the stack pointer yet.  Given the fidelity
1975 1975                   * of all this, we could conceivably send a signal
1976 1976                   * to the lwp, rather than core-ing.
1977 1977                   */
1978 1978                  trp = lwptoregs(lwp);
1979 1979                  ASSERT((caddr_t)trp == (caddr_t)rp->r_sp);
1980 1980          }
1981 1981  
1982 1982  #elif defined(__i386)
1983 1983  
1984 1984          if (trp == NULL && instr_is_segregs_pop(pc))
1985 1985                  trp = lwptoregs(lwp);
1986 1986  
1987 1987  #endif  /* __i386 */
1988 1988  
1989 1989          if (trp == NULL)
1990 1990                  return (1);
1991 1991  
1992 1992          /*
1993 1993           * If we get to here, we're reasonably confident that we've
1994 1994           * correctly decoded what happened on the way out of the kernel.
1995 1995           * Rewrite the lwp's registers so that we can create a core dump
1996 1996           * that (at least vaguely) represents the mcontext we were
1997 1997           * being asked to restore when things went so terribly wrong.
1998 1998           */
1999 1999  
2000 2000          /*
2001 2001           * Make sure that we have a meaningful %trapno and %err.
2002 2002           */
2003 2003          trp->r_trapno = rp->r_trapno;
2004 2004          trp->r_err = rp->r_err;
2005 2005  
2006 2006          if ((caddr_t)trp != (caddr_t)lwptoregs(lwp))
2007 2007                  bcopy(trp, lwptoregs(lwp), sizeof (*trp));
2008 2008  
2010 2010          mutex_enter(&p->p_lock);
2011 2011          lwp->lwp_cursig = SIGSEGV;
2012 2012          mutex_exit(&p->p_lock);
2013 2013  
2014 2014          /*
2015 2015           * Terminate all LWPs but don't discard them.  If another lwp beat
2016 2016           * us to the punch by calling exit(), evaporate now.
2017 2017           */
2018 2018          proc_is_exiting(p);
2019 2019          if (exitlwps(1) != 0) {
2020 2020                  mutex_enter(&p->p_lock);
2021 2021                  lwp_exit();
2022 2022          }
2023 2023  
2024 2024          if (auditing)           /* audit core dump */
2025 2025                  audit_core_start(SIGSEGV);
2026 2026          v = core(SIGSEGV, B_FALSE);
2027 2027          if (auditing)           /* audit core dump */
2028 2028                  audit_core_finish(v ? CLD_KILLED : CLD_DUMPED);
2029 2029          exit(v ? CLD_KILLED : CLD_DUMPED, SIGSEGV);
2030 2030          return (0);
2031 2031  }
2032 2032  
2033 2033  /*
2034 2034   * dump_tss() - Display the TSS structure
2035 2035   */
2036 2036  
2037 2037  #if !defined(__xpv)
2038 2038  #if defined(__amd64)
2039 2039  
2040 2040  static void
2041 2041  dump_tss(void)
2042 2042  {
2043 2043          const char tss_fmt[] = "tss.%s:\t0x%p\n";  /* Format string */
2044 2044          tss_t *tss = CPU->cpu_tss;
2045 2045  
2046 2046          printf(tss_fmt, "tss_rsp0", (void *)tss->tss_rsp0);
2047 2047          printf(tss_fmt, "tss_rsp1", (void *)tss->tss_rsp1);
2048 2048          printf(tss_fmt, "tss_rsp2", (void *)tss->tss_rsp2);
2049 2049  
2050 2050          printf(tss_fmt, "tss_ist1", (void *)tss->tss_ist1);
2051 2051          printf(tss_fmt, "tss_ist2", (void *)tss->tss_ist2);
2052 2052          printf(tss_fmt, "tss_ist3", (void *)tss->tss_ist3);
2053 2053          printf(tss_fmt, "tss_ist4", (void *)tss->tss_ist4);
2054 2054          printf(tss_fmt, "tss_ist5", (void *)tss->tss_ist5);
2055 2055          printf(tss_fmt, "tss_ist6", (void *)tss->tss_ist6);
2056 2056          printf(tss_fmt, "tss_ist7", (void *)tss->tss_ist7);
2057 2057  }
2058 2058  
2059 2059  #elif defined(__i386)
2060 2060  
2061 2061  static void
2062 2062  dump_tss(void)
2063 2063  {
2064 2064          const char tss_fmt[] = "tss.%s:\t0x%p\n";  /* Format string */
2065 2065          tss_t *tss = CPU->cpu_tss;
2066 2066  
2067 2067          printf(tss_fmt, "tss_link", (void *)(uintptr_t)tss->tss_link);
2068 2068          printf(tss_fmt, "tss_esp0", (void *)(uintptr_t)tss->tss_esp0);
2069 2069          printf(tss_fmt, "tss_ss0", (void *)(uintptr_t)tss->tss_ss0);
2070 2070          printf(tss_fmt, "tss_esp1", (void *)(uintptr_t)tss->tss_esp1);
2071 2071          printf(tss_fmt, "tss_ss1", (void *)(uintptr_t)tss->tss_ss1);
2072 2072          printf(tss_fmt, "tss_esp2", (void *)(uintptr_t)tss->tss_esp2);
2073 2073          printf(tss_fmt, "tss_ss2", (void *)(uintptr_t)tss->tss_ss2);
2074 2074          printf(tss_fmt, "tss_cr3", (void *)(uintptr_t)tss->tss_cr3);
2075 2075          printf(tss_fmt, "tss_eip", (void *)(uintptr_t)tss->tss_eip);
2076 2076          printf(tss_fmt, "tss_eflags", (void *)(uintptr_t)tss->tss_eflags);
2077 2077          printf(tss_fmt, "tss_eax", (void *)(uintptr_t)tss->tss_eax);
2078 2078          printf(tss_fmt, "tss_ebx", (void *)(uintptr_t)tss->tss_ebx);
2079 2079          printf(tss_fmt, "tss_ecx", (void *)(uintptr_t)tss->tss_ecx);
2080 2080          printf(tss_fmt, "tss_edx", (void *)(uintptr_t)tss->tss_edx);
2081 2081          printf(tss_fmt, "tss_esp", (void *)(uintptr_t)tss->tss_esp);
2082 2082  }
2083 2083  
2084 2084  #endif  /* __amd64 */
2085 2085  #endif  /* !__xpv */
2086 2086  
2087 2087  #if defined(TRAPTRACE)
2088 2088  
2089 2089  int ttrace_nrec = 10;           /* number of records to dump out */
2090 2090  int ttrace_dump_nregs = 0;      /* dump out this many records with regs too */
2091 2091  
2092 2092  /*
2093 2093   * Dump out the last ttrace_nrec traptrace records on each CPU
2094 2094   */
2095 2095  static void
2096 2096  dump_ttrace(void)
2097 2097  {
2098 2098          trap_trace_ctl_t *ttc;
2099 2099          trap_trace_rec_t *rec;
2100 2100          uintptr_t current;
2101 2101          int i, j, k;
2102 2102          int n = NCPU;
2103 2103  #if defined(__amd64)
2104 2104          const char banner[] =
2105 2105              "\ncpu          address    timestamp "
2106 2106              "type  vc  handler   pc\n";
2107 2107          const char fmt1[] = "%3d %016lx %12llx ";
2108 2108  #elif defined(__i386)
2109 2109          const char banner[] =
2110 2110              "\ncpu  address     timestamp type  vc  handler   pc\n";
2111 2111          const char fmt1[] = "%3d %08lx %12llx ";
2112 2112  #endif
2113 2113          const char fmt2[] = "%4s %3x ";
2114 2114          const char fmt3[] = "%8s ";
2115 2115  
2116 2116          if (ttrace_nrec == 0)
2117 2117                  return;
2118 2118  
2119 2119          printf(banner);
2120 2120  
2121 2121          for (i = 0; i < n; i++) {
2122 2122                  ttc = &trap_trace_ctl[i];
2123 2123                  if (ttc->ttc_first == NULL)
2124 2124                          continue;
2125 2125  
2126 2126                  current = ttc->ttc_next - sizeof (trap_trace_rec_t);
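                           /*
                            * ttc_next points at the slot the next record will
                            * use, so the most recent record sits just below
                            * it; walk backwards, wrapping from ttc_first back
                            * up to the last record below ttc_limit.
                            */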
2127 2127                  for (j = 0; j < ttrace_nrec; j++) {
2128 2128                          struct sysent   *sys;
2129 2129                          struct autovec  *vec;
2130 2130                          extern struct av_head autovect[];
2131 2131                          int type;
2132 2132                          ulong_t off;
2133 2133                          char *sym, *stype;
2134 2134  
2135 2135                          if (current < ttc->ttc_first)
2136 2136                                  current =
2137 2137                                      ttc->ttc_limit - sizeof (trap_trace_rec_t);
2138 2138  
2139 2139                          if (current == NULL)
2140 2140                                  continue;
2141 2141  
2142 2142                          rec = (trap_trace_rec_t *)current;
2143 2143  
2144 2144                          if (rec->ttr_stamp == 0)
2145 2145                                  break;
2146 2146  
2147 2147                          printf(fmt1, i, (uintptr_t)rec, rec->ttr_stamp);
2148 2148  
2149 2149                          switch (rec->ttr_marker) {
2150 2150                          case TT_SYSCALL:
2151 2151                          case TT_SYSENTER:
2152 2152                          case TT_SYSC:
2153 2153                          case TT_SYSC64:
2154 2154  #if defined(__amd64)
2155 2155                                  sys = &sysent32[rec->ttr_sysnum];
2156 2156                                  switch (rec->ttr_marker) {
2157 2157                                  case TT_SYSC64:
2158 2158                                          sys = &sysent[rec->ttr_sysnum];
2159 2159                                          /*FALLTHROUGH*/
2160 2160  #elif defined(__i386)
2161 2161                                  sys = &sysent[rec->ttr_sysnum];
2162 2162                                  switch (rec->ttr_marker) {
2163 2163                                  case TT_SYSC64:
2164 2164  #endif
2165 2165                                  case TT_SYSC:
2166 2166                                          stype = "sysc"; /* syscall */
2167 2167                                          break;
2168 2168                                  case TT_SYSCALL:
2169 2169                                          stype = "lcal"; /* lcall */
2170 2170                                          break;
2171 2171                                  case TT_SYSENTER:
2172 2172                                          stype = "syse"; /* sysenter */
2173 2173                                          break;
2174 2174                                  default:
2175 2175                                          break;
2176 2176                                  }
2177 2177                                  printf(fmt2, "sysc", rec->ttr_sysnum);
2178 2178                                  if (sys != NULL) {
2179 2179                                          sym = kobj_getsymname(
2180 2180                                              (uintptr_t)sys->sy_callc,
2181 2181                                              &off);
2182 2182                                          if (sym != NULL)
2183 2183                                                  printf(fmt3, sym);
2184 2184                                          else
2185 2185                                                  printf("%p ", sys->sy_callc);
2186 2186                                  } else {
2187 2187                                          printf(fmt3, "unknown");
2188 2188                                  }
2189 2189                                  break;
2190 2190  
2191 2191                          case TT_INTERRUPT:
2192 2192                                  printf(fmt2, "intr", rec->ttr_vector);
2193 2193                                  if (get_intr_handler != NULL)
2194 2194                                          vec = (struct autovec *)
2195 2195                                              (*get_intr_handler)
2196 2196                                              (rec->ttr_cpuid, rec->ttr_vector);
2197 2197                                  else
2198 2198                                          vec =
2199 2199                                              autovect[rec->ttr_vector].avh_link;
2200 2200  
2201 2201                                  if (vec != NULL) {
2202 2202                                          sym = kobj_getsymname(
2203 2203                                              (uintptr_t)vec->av_vector, &off);
2204 2204                                          if (sym != NULL)
2205 2205                                                  printf(fmt3, sym);
2206 2206                                          else
2207 2207                                                  printf("%p ", vec->av_vector);
2208 2208                                  } else {
2209 2209                                          printf(fmt3, "unknown ");
2210 2210                                  }
2211 2211                                  break;
2212 2212  
2213 2213                          case TT_TRAP:
2214 2214                          case TT_EVENT:
2215 2215                                  type = rec->ttr_regs.r_trapno;
2216 2216                                  printf(fmt2, "trap", type);
2217 2217                                  if (type < TRAP_TYPES)
2218 2218                                          printf("     #%s ",
2219 2219                                              trap_type_mnemonic[type]);
2220 2220                                  else
2221 2221                                          switch (type) {
2222 2222                                          case T_AST:
2223 2223                                                  printf(fmt3, "ast");
2224 2224                                                  break;
2225 2225                                          default:
2226 2226                                                  printf(fmt3, "");
2227 2227                                                  break;
2228 2228                                          }
2229 2229                                  break;
2230 2230  
2231 2231                          default:
2232 2232                                  break;
2233 2233                          }
2234 2234  
2235 2235                          sym = kobj_getsymname(rec->ttr_regs.r_pc, &off);
2236 2236                          if (sym != NULL)
2237 2237                                  printf("%s+%lx\n", sym, off);
2238 2238                          else
2239 2239                                  printf("%lx\n", rec->ttr_regs.r_pc);
2240 2240  
2241 2241                          if (ttrace_dump_nregs-- > 0) {
2242 2242                                  int s;
2243 2243  
2244 2244                                  if (rec->ttr_marker == TT_INTERRUPT)
2245 2245                                          printf(
2246 2246                                              "\t\tipl %x spl %x pri %x\n",
2247 2247                                              rec->ttr_ipl,
2248 2248                                              rec->ttr_spl,
2249 2249                                              rec->ttr_pri);
2250 2250  
2251 2251                                  dumpregs(&rec->ttr_regs);
2252 2252  
2253 2253                                  printf("\t%3s: %p\n\n", " ct",
2254 2254                                      (void *)rec->ttr_curthread);
2255 2255  
2256 2256                                  /*
2257 2257                                   * print out the pc stack that we recorded
2258 2258                                   * at trap time (if any)
2259 2259                                   */
2260 2260                                  for (s = 0; s < rec->ttr_sdepth; s++) {
2261 2261                                          uintptr_t fullpc;
2262 2262  
2263 2263                                          if (s >= TTR_STACK_DEPTH) {
2264 2264                                                  printf("ttr_sdepth corrupt\n");
2265 2265                                                  break;
2266 2266                                          }
2267 2267  
2268 2268                                          fullpc = (uintptr_t)rec->ttr_stack[s];
2269 2269  
2270 2270                                          sym = kobj_getsymname(fullpc, &off);
2271 2271                                          if (sym != NULL)
2272 2272                                                  printf("-> %s+0x%lx()\n",
2273 2273                                                      sym, off);
2274 2274                                          else
2275 2275                                                  printf("-> 0x%lx()\n", fullpc);
2276 2276                                  }
2277 2277                                  printf("\n");
2278 2278                          }
2279 2279                          current -= sizeof (trap_trace_rec_t);
2280 2280                  }
2281 2281          }
2282 2282  }
2283 2283  
2284 2284  #endif  /* TRAPTRACE */
2285 2285  
2286 2286  void
2287 2287  panic_showtrap(struct panic_trap_info *tip)
2288 2288  {
2289 2289          showregs(tip->trap_type, tip->trap_regs, tip->trap_addr);
2290 2290  
2291 2291  #if defined(TRAPTRACE)
2292 2292          dump_ttrace();
2293 2293  #endif
2294 2294  
2295 2295  #if !defined(__xpv)
2296 2296          if (tip->trap_type == T_DBLFLT)
2297 2297                  dump_tss();
2298 2298  #endif
2299 2299  }
2300 2300  
2301 2301  void
2302 2302  panic_savetrap(panic_data_t *pdp, struct panic_trap_info *tip)
2303 2303  {
2304 2304          panic_saveregs(pdp, tip->trap_regs);
2305 2305  }
  